diff --git a/.github/workflows/nlu_test_flow.yaml b/.github/workflows/nlu_test_flow.yaml
index f57df5ae..a4c4ceb8 100644
--- a/.github/workflows/nlu_test_flow.yaml
+++ b/.github/workflows/nlu_test_flow.yaml
@@ -10,15 +10,15 @@ jobs:
   build:
     runs-on: ubuntu-latest
     env:
-      JSL_LICENSE: ${{ secrets.JSL_LICENSE }}
+      JOHNSNOWLABS_LICENSE_JSON: ${{ secrets.JOHNSNOWLABS_LICENSE_JSON }}
     strategy:
       matrix:
         python-version: [3.7]
     steps:
       - uses: actions/setup-java@v1
         with:
-          java-version: '1.8.0' # The JDK version to make available on the path.
-          java-package: jdk # (jre, jdk, or jdk+fx) - defaults to jd
+          java-version: '1.8.0'
+          java-package: jdk
       - uses: actions/checkout@v2
       - name: Set up Python ${{ matrix.python-version }}
         uses: actions/setup-python@v2
@@ -27,173 +27,17 @@ jobs:
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
-          pip install pypandoc scikit-learn
-          pip install wheel dataclasses pandas numpy pytest modin[ray] pyspark==3.2.0 spark-nlp
-          java -version
-          if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
-          ! echo 2 | update-alternatives --config java
-      - name: NLU Chunker Component tests
-        if: always()
-        run: |
-          python -m unittest discover -s './tests/nlu_core_tests/component_tests/chunker_tests' -p '*tests.py'
-      - name: NLU Classifier Cyber tests
-
-        if: always()
-        run: |
-          python -m unittest 'tests/nlu_core_tests/component_tests/classifier_tests/cyber_tests.py'
-      - name: NLU Classifier E2E tests
-
-        if: always()
-        run: |
-          python -m unittest 'tests/nlu_core_tests/component_tests/classifier_tests/e2e_tests.py'
-      - name: NLU Classifier Emotion tests
-
-        if: always()
-        run: |
-          python -m unittest 'tests/nlu_core_tests/component_tests/classifier_tests/emotion_tests.py'
-      - name: NLU Classifier Language tests
-
-        if: always()
-        run: |
-          python -m unittest 'tests/nlu_core_tests/component_tests/classifier_tests/language_tests.py'
-      - name: NLU Classifier NER tests
-
-        if: always()
-        run: |
-          python -m unittest 'tests/nlu_core_tests/component_tests/classifier_tests/ner_tests.py'
-      - name: NLU Classifier POS tests
-
-        if: always()
-        run: |
-          python -m unittest 'tests/nlu_core_tests/component_tests/classifier_tests/pos_tests.py'
-      - name: NLU Classifier Question tests
-
-        if: always()
-        run: |
-          python -m unittest 'tests/nlu_core_tests/component_tests/classifier_tests/question_tests.py'
-      - name: NLU Classifier Sarcasm tests
-
-        if: always()
-        run: |
-          python -m unittest 'tests/nlu_core_tests/component_tests/classifier_tests/sarcasm_tests.py'
-      - name: NLU Classifier Sentiment tests
-
-        if: always()
-        run: |
-          python -m unittest 'tests/nlu_core_tests/component_tests/classifier_tests/sentiment_tests.py'
-      - name: NLU Classifier Sentiment test Imdb
-
-        if: always()
-        run: |
-          python -m unittest 'tests/nlu_core_tests/component_tests/classifier_tests/sentiment_test_imdb.py'
-      - name: NLU Classifier Sentiment test Twitter
-
-        if: always()
-        run: |
-          python -m unittest 'tests/nlu_core_tests/component_tests/classifier_tests/sentiment_test_twitter.py'
-      - name: NLU Classifier Spam tests
-
-        if: always()
-        run: |
-          python -m unittest 'tests/nlu_core_tests/component_tests/classifier_tests/spam_tests.py'
-      - name: NLU Classifier Toxic tests
-
-        if: always()
-        run: |
-          python -m unittest 'tests/nlu_core_tests/component_tests/classifier_tests/toxic_tests.py'
-      - name: NLU Classifier YAKE tests
-
-        if: always()
-        run: |
-          python -m unittest 'tests/nlu_core_tests/component_tests/classifier_tests/yake_tests.py'
-# - name: NLU Embed Component tests
-# if: always()
-# run: |
-# python -m unittest discover -s './tests/nlu_core_tests/component_tests/embed_tests' -p '*tests.py'
-      - name: NLU Text Cleaner Component tests
-
-        if: always()
-        run: |
-          python -m unittest discover -s './tests/nlu_core_tests/component_tests/pre_processing_tests' -p '*tests.py'
-# - name: NLU Matcher Component tests # not yet converted in Spark 3
-# if: always()
-# run: |
-# python -m unittest discover -s 'tests/nlu_core_tests/component_tests/matcher_tests' -p '*tests.py'
-      - name: NLU Typed Dependency Component tests
-
-        if: always()
-        run: |
-          python -m unittest discover -s './tests/nlu_core_tests/component_tests/typed_dependency_tests' -p '*tests.py'
-      - name: NLU Untyped Dependency Component tests
-
-        if: always()
-        run: |
-          python -m unittest discover -s './tests/nlu_core_tests/component_tests/untyped_dependency_tests' -p '*tests.py'
-      - name: NLU Pipe tests
-        if: always()
+          pip install wheel dataclasses pandas numpy pytest modin[ray] pytest-xdist pytest-forked nbformat librosa johnsnowlabs==5.3.4rc1
+          pip uninstall nlu -y
+      - name: Install Licensed Libs
+        if:
         run: |
-          python -m unittest discover -s './tests/nlu_core_tests/pipeline_tests' -p '*tests.py'
-      - name: NLU Training sentiment tests
+          python -c 'from johnsnowlabs import nlp;nlp.install(browser_login = False, force_browser=False,visual=True)'
+      - name: Run one test per lib
         if: always()
         run: |
-          python -m unittest discover -s './tests/nlu_core_tests/training_tests/classifiers' -p '*sentiment_dl*.py'
- # Too hevy for Github actions
- # - name: NLU Training classifier tests
- # if: always()
- # run: |
- # python -m unittest discover -s './tests/nlu_core_tests/training_tests/classifiers' -p '*classifier_dl*.py'
-# - name: NLU Training multi classifier tests
-# if: always()
-# run: |
-# python -m unittest discover -s './tests/nlu_core_tests/training_tests/classifiers' -p '*multi*.py'
-# - name: NLU Training NER tests
-# if: always()
-# run: |
-# python -m unittest discover -s './tests/nlu_core_tests/training_tests/classifiers' -p '*ner*.py'
-      - name: NLU Training POS tests
+          python tests/run_tests.py one_per_lib
+      - name: Run all tests
         if: always()
         run: |
-          python -m unittest discover -s './tests/nlu_core_tests/training_tests/classifiers' -p '*pos*.py'
-# - name: NLU Healthcare Verification tests
-# if: always()
-# run: |
-# python -m unittest discover -s ./tests/nlu_hc_tests -p 'verification_tests.py'
-# - name: NLU OCR tests
-# if: always()
-# run: |
-# python -m unittest discover -s ./tests/nlu_ocr_tests -p '*tests.py'
-# - name: NLU Healthcare Assertion DL tests
-# if: always()
-# run: |
-# python -m unittest discover -s ./tests/nlu_hc_tests/component_tests/assertion_dl -p '*tests.py'
-## - name: NLU Healthcare Contextual Parser tests
-## if: always()
-## run: |
-## python -m unittest discover -s ./tests/nlu_hc_tests/component_tests/contextual_parser -p '*tests.py'
-# - name: NLU Healthcare De Identification tests
-# if: always()
-# run: |
-# python -m unittest discover -s ./tests/nlu_hc_tests/component_tests/de_identification -p '*tests.py'
-# - name: NLU Healthcare Drug Normalizer tests
-# if: always()
-# run: |
-# python -m unittest 'tests/nlu_hc_tests/component_tests/drug_normalizer/drug_normalizer_test.py'
-# - name: NLU Healthcare Generic Classifier tests
-# if: always()
-# run: |
-# python -m unittest discover -s ./tests/nlu_hc_tests/component_tests/generic_classifier -p '*tests.py'
-# - name: NLU Healthcare Licensed Classifier tests
-# if: always()
-# run: |
-# python -m unittest discover -s ./tests/nlu_hc_tests/component_tests/licensed_classifier -p '*tests.py'
-# - name: NLU Healthcare Relation Extraction tests
-# if: always()
-# run: |
-# python -m unittest discover -s ./tests/nlu_hc_tests/component_tests/relation_extraction -p '*tests.py'
-# - name: NLU Healthcare Sentence Entity Resolver tests
-# if: always()
-# run: |
-# python -m unittest discover -s ./tests/nlu_hc_tests/component_tests/sentence_entity_resolver -p '*tests.py'
-## - name: NLU Healthcare Pipe tests
-## if: always()
-## run: |
-## python -m unittest discover -s ./tests/nlu_hc_tests/pipe_tests -p '*tests.py'
-## - name: NLU Healthcare Training Chunk Resolution tests
-## if: always()
-## run: |
-## python -m unittest discover -s ./tests/nlu_hc_tests/training_tests/chunk_resolution -p '*tests.py'
-# - name: NLU Healthcare Training Sentence Resolution tests
-# if: always()
-# run: |
-# python -m unittest discover -s ./tests/nlu_hc_tests/training_tests/sentence_resolution -p '*tests.py'
-## - name: NLU Saving and Loading tests
-## if: always()
-## run: |
-## python -m unittest discover -s './tests/nlu_core_tests/training_tests/trained_pipe_tests' -p '*tests.py'
-## - name: NLU Modin tests
-## if: always()
-## run: |
-## python -m unittest discover -s './tests/modin' -p '*tests.py'
+          python tests/run_tests.py all
diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 00000000..b1fc69e0
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1 @@
+include VERSION
\ No newline at end of file
diff --git a/README.md b/README.md
index 23bb7625..f0c2742c 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-
+
 
 # NLU: The Power of Spark NLP, the Simplicity of Python
 John Snow Labs' NLU is a Python library for applying state-of-the-art text mining, directly on any dataframe, with a single line of code. As a facade of the award-winning Spark NLP library, it comes with **1000+** pretrained models in **100+** languages, all production-grade, scalable, and trainable, with **everything in 1 line of code.**
diff --git a/VERSION b/VERSION
new file mode 100644
index 00000000..d7f1518a
--- /dev/null
+++ b/VERSION
@@ -0,0 +1 @@
+1.1.12
\ No newline at end of file
diff --git a/conda/meta.yaml b/conda/meta.yaml
new file mode 100644
index 00000000..a6f5d6e1
--- /dev/null
+++ b/conda/meta.yaml
@@ -0,0 +1,34 @@
+package:
+  name: nlu
+  version: {{ environ.get('CODE_VERSION', '') }}
+
+app:
+  entry: nlu
+  summary: The power of 15000+ State-of-the-art pre-trained NLP models in 300 languages with 1 line of Python code.
+
+source:
+  path: ../conda_src
+
+build:
+  noarch: generic
+  number: 0
+  script: "python3 -m pip install . --no-deps -vv"
+
+requirements:
+  build:
+    - python
+  run:
+    - python
+    - pyspark==3.0.1
+    - spark-nlp >=5.2.0
+    - numpy
+    - pyarrow >=0.16.0
+    - pandas >=1.3.5
+    - dataclasses
+about:
+  home: https://nlu.johnsnowlabs.com/
+  license: Apache License 2.0
+  license_family: APACHE
+  license_url: https://github.com/JohnSnowLabs/nlu/blob/master/LICENSE
+  description: John Snow Labs' NLU is a Python library for applying state-of-the-art text mining, directly on any dataframe, with a single line of code. As a facade of the award-winning Spark NLP library, it comes with hundreds of pretrained models in tens of languages - all production-grade, scalable, and trainable.
+  summary: The power of 15000+ State-of-the-art pre-trained NLP models in 300 languages with 1 line of Python code.
\ No newline at end of file
diff --git a/pytest.ini b/pytest.ini
new file mode 100644
index 00000000..af175995
--- /dev/null
+++ b/pytest.ini
@@ -0,0 +1,5 @@
+[pytest]
+;markers =
+; db_cloud_node_params: marker for parameterizing databricks tests with cloud credentials and node types (azure,aws, gcp)
+; db_cloud_params: marker for parameterizing databricks tests over all cloud credentials (azure,aws, gcp)
+addopts = -s --capture=no
diff --git a/setup.py b/setup.py
index e428c650..90ac7c09 100644
--- a/setup.py
+++ b/setup.py
@@ -1,5 +1,3 @@
-import nlu
-
 from codecs import open
 from os import path
 
@@ -10,6 +8,10 @@
 with open(path.join(here, 'README.md'), encoding='utf-8') as f:
     long_description = f.read()
 
+# Get a version from file
+with open(path.join(here, 'VERSION')) as version_file:
+    version = f"{version_file.read().strip()}"
+
 REQUIRED_PKGS = [
     'spark-nlp>=5.0.2',
     'numpy',
@@ -22,7 +24,7 @@
     name='nlu',
-    version=nlu.version(),
+    version=version,
     description='John Snow Labs NLU provides state of the art algorithms for NLP&NLU with 20000+ of pretrained models in 200+ languages. It enables swift and simple development and research with its powerful Pythonic and Keras inspired API. It is powerd by John Snow Labs powerful Spark NLP library.',
diff --git a/tests/base_model_test.py b/tests/base_model_test.py
new file mode 100644
index 00000000..eee4fa45
--- /dev/null
+++ b/tests/base_model_test.py
@@ -0,0 +1,43 @@
+import pytest
+
+from tests.utils import all_tests, one_per_lib, NluTest, model_and_output_levels_test
+
+
+def model_id(model_to_test: NluTest) -> str:
+    return f"{model_to_test.test_group}_{model_to_test.nlu_ref}"
+
+
+def all_annotator_tests():
+    return all_tests
+
+
+def one_test_per_lib():
+    return one_per_lib
+
+
+@pytest.mark.skip(reason="Use run_tests.py instead until pytest-xdist issue is fixed")
+@pytest.mark.parametrize("model_to_test", all_annotator_tests(), ids=model_id)
+def test_model_all_annotators(model_to_test: NluTest):
+    model_and_output_levels_test(
+        nlu_ref=model_to_test.nlu_ref,
+        lang=model_to_test.lang,
+        test_group=model_to_test.test_group,
+        output_levels=model_to_test.output_levels,
+        input_data_type=model_to_test.input_data_type,
+        library=model_to_test.library,
+        pipe_params=model_to_test.pipe_params
+    )
+
+
+@pytest.mark.skip(reason="Local testing")
+@pytest.mark.parametrize("model_to_test", one_test_per_lib(), ids=model_id)
+def test_one_per_lib(model_to_test: NluTest):
+    model_and_output_levels_test(
+        nlu_ref=model_to_test.nlu_ref,
+        lang=model_to_test.lang,
+        test_group=model_to_test.test_group,
+        output_levels=model_to_test.output_levels,
+        input_data_type=model_to_test.input_data_type,
+        library=model_to_test.library,
+        pipe_params=model_to_test.pipe_params
+    )
diff --git a/tests/modin/modin_tests.py b/tests/modin_tests.py
similarity index 100%
rename from tests/modin/modin_tests.py
rename to tests/modin_tests.py
diff --git a/tests/nlu_core_tests/component_info_tests.py b/tests/nlu_core_tests/component_info_tests.py
index 2f1b4eee..e9835286 100644
--- a/tests/nlu_core_tests/component_info_tests.py
+++ b/tests/nlu_core_tests/component_info_tests.py
@@ -9,7 +9,6 @@ class TestComponentInfo(unittest.TestCase):
     def test_list_all_names(self):
         a = nlu.AllComponentsInfo()
         a.list_all_components()
-        a.DEBUG_list_all_components()
 
     def test_print_all_default_components_as_markdown(self):
         d = nlu.Spellbook.component_alias_references
diff --git a/tests/nlu_core_tests/component_parameterization_tests.py
b/tests/nlu_core_tests/component_parameterization_tests.py deleted file mode 100644 index 6f2d4039..00000000 --- a/tests/nlu_core_tests/component_parameterization_tests.py +++ /dev/null @@ -1,15 +0,0 @@ -import unittest - -import nlu - - -class TestParameterization(unittest.TestCase): - def test_set_parameters(self): - - pipe = nlu.load("sentiment") - print(pipe.keys()) - pipe.generate_class_metadata_table() - - -if __name__ == "__main__": - TestParameterization().test_entities_config() diff --git a/tests/nlu_core_tests/component_tests/albert_for_question_answering _tests.py b/tests/nlu_core_tests/component_tests/albert_for_question_answering _tests.py new file mode 100644 index 00000000..f8fdd7cd --- /dev/null +++ b/tests/nlu_core_tests/component_tests/albert_for_question_answering _tests.py @@ -0,0 +1,15 @@ +import unittest +import nlu + + + +def test_albert_for_question_answering(): + pipe = nlu.load("en.answer_question.squadv2.albert.xxl.by_sultan", verbose=True) + data = "What is my name?|||My name is CKL" + df = pipe.predict( + data, + ) + for c in df.columns: + print(df[c]) + + diff --git a/tests/nlu_hc_tests/component_tests/chunkmapper/chunk_mapper_tests.py b/tests/nlu_core_tests/component_tests/chunk_mapper_tests.py similarity index 100% rename from tests/nlu_hc_tests/component_tests/chunkmapper/chunk_mapper_tests.py rename to tests/nlu_core_tests/component_tests/chunk_mapper_tests.py diff --git a/tests/nlu_core_tests/component_tests/chunker_tests/__init__.py b/tests/nlu_core_tests/component_tests/chunker_tests/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/nlu_core_tests/component_tests/chunker_tests/chunk_tests.py b/tests/nlu_core_tests/component_tests/chunker_tests/chunk_tests.py deleted file mode 100644 index 31164361..00000000 --- a/tests/nlu_core_tests/component_tests/chunker_tests/chunk_tests.py +++ /dev/null @@ -1,29 +0,0 @@ -import unittest - -from nlu import * - - -class TestChunks(unittest.TestCase): - def test_chunker(self): - example_text = [ - "A person like Jim or Joe", - "An organisation like Microsoft or PETA", - "A location like Germany", - "Anything else like Playstation", - "Person consisting of multiple tokens like Angela Merkel or Donald Trump", - "Organisations consisting of multiple tokens like JP Morgan", - "Locations consisting of multiple tokens like Los Angeles", - "Anything else made up of multiple tokens like Super Nintendo", - ] - res = nlu.load("chunk").predict( - example_text, - output_level="sentence", - drop_irrelevant_cols=False, - metadata=True, - ) - for c in res.columns: - print(res[c]) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/nlu_core_tests/component_tests/chunker_tests/ngram_tests.py b/tests/nlu_core_tests/component_tests/chunker_tests/ngram_tests.py deleted file mode 100644 index 486fcbce..00000000 --- a/tests/nlu_core_tests/component_tests/chunker_tests/ngram_tests.py +++ /dev/null @@ -1,50 +0,0 @@ -import unittest - -from nlu import * - - -class TestNGram(unittest.TestCase): - def test_ngram(self): - example_text = [ - "A person like Jim or Joe", - "An organisation like Microsoft or PETA", - "A location like Germany", - "Anything else like Playstation", - "Person consisting of multiple tokens like Angela Merkel or Donald Trump", - "Organisations consisting of multiple tokens like JP Morgan", - "Locations consiting of multiple tokens like Los Angeles", - "Anything else made up of multiple tokens like Super Nintendo", - ] - - print("OUTPUT LEVEL TOKEN") - df = nlu.load("ngram", 
verbose=True).predict( - example_text, - output_level="sentence", - drop_irrelevant_cols=False, - metadata=True, - ) - for c in df.columns: - print(df[c]) - - print("OUTPUT LEVEL CHUNK") - df = nlu.load("ngram", verbose=True).predict(example_text, output_level="chunk") - for c in df.columns: - print(df[c]) - - print("OUTPUT LEVEL SENTENCE") - df = nlu.load("ngram", verbose=True).predict( - example_text, output_level="sentence" - ) - for c in df.columns: - print(df[c]) - - print("OUTPUT LEVEL DOCUMENT") - df = nlu.load("ngram", verbose=True).predict( - example_text, output_level="document" - ) - for c in df.columns: - print(df[c]) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/nlu_core_tests/component_tests/classifier_tests/__init__.py b/tests/nlu_core_tests/component_tests/classifier_tests/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/nlu_core_tests/component_tests/classifier_tests/albert_sequence_classifier_tests.py b/tests/nlu_core_tests/component_tests/classifier_tests/albert_sequence_classifier_tests.py deleted file mode 100644 index b476ef7a..00000000 --- a/tests/nlu_core_tests/component_tests/classifier_tests/albert_sequence_classifier_tests.py +++ /dev/null @@ -1,23 +0,0 @@ -import unittest - -import nlu - - -class TestAlbertSequenceClassifier(unittest.TestCase): - - def test_albert_sequence_classifier(self): - pipe = nlu.load("en.classify.albert.ag_news", verbose=True) - data = "Disney Comics was a comic book publishing company operated by The Walt Disney Company which ran from 1990 to 1993." - df = pipe.predict([data], output_level="document") - for c in df.columns: - print(df[c]) - - pipe = nlu.load("en.classify.albert.imdb", verbose=True) - data = "I really liked that movie!" - df = pipe.predict([data], output_level="document") - for c in df.columns: - print((df[c])) - - -if __name__ == "__main__": - unittest.main() \ No newline at end of file diff --git a/tests/nlu_core_tests/component_tests/classifier_tests/asr_tests.py b/tests/nlu_core_tests/component_tests/classifier_tests/asr_tests.py deleted file mode 100644 index b11cd53f..00000000 --- a/tests/nlu_core_tests/component_tests/classifier_tests/asr_tests.py +++ /dev/null @@ -1,45 +0,0 @@ -import unittest -import os - - -#os.environ['PYSPARK_PYTHON'] = '/home/ckl/anaconda3/bin/python3' -#os.environ['PYSPARK_DRIVER_PYTHON'] = '/home/ckl/anaconda3/bin/python3' - - - -class AsrTestCase(unittest.TestCase): - def test_wav2vec(self): - import nlu - p = nlu.load('en.speech2text.wav2vec2.v2_base_960h',verbose=True) - FILE_PATH = os.path.normpath(r"tests/datasets/audio/asr/ngm_12484_01067234848.wav") - - print("Got p ",p) - df = p.predict(FILE_PATH) - print(df) - df = p.predict([FILE_PATH,FILE_PATH]) - print(df) - - - def test_hubert(self): - import nlu - p = nlu.load('en.speech2text.hubert.large_ls960',verbose=True) - FILE_PATH = os.path.normpath(r"tests/datasets/audio/asr/ngm_12484_01067234848.wav") - - print("Got p ",p) - df = p.predict(FILE_PATH) - print(df) - df = p.predict([FILE_PATH,FILE_PATH]) - print(df) - def test_whisper(self): - import nlu - p = nlu.load('xx.speech2text.whisper.tiny',verbose=True) - FILE_PATH = os.path.normpath(r"tests/datasets/audio/asr/ngm_12484_01067234848.wav") - - print("Got p ",p) - df = p.predict(FILE_PATH) - print(df) - df = p.predict([FILE_PATH,FILE_PATH]) - print(df) - -if __name__ == '__main__': - unittest.main() diff --git a/tests/nlu_core_tests/component_tests/classifier_tests/bert_sequence_tests.py 
b/tests/nlu_core_tests/component_tests/classifier_tests/bert_sequence_tests.py deleted file mode 100644 index 916a9b42..00000000 --- a/tests/nlu_core_tests/component_tests/classifier_tests/bert_sequence_tests.py +++ /dev/null @@ -1,31 +0,0 @@ -import unittest - -import nlu - - -class TestBertSeqClassifier(unittest.TestCase): - def test_bert_seq_classifier(self): - - te = [ - # - # 'en.classify.bert_sequence.imdb_large', - # 'en.classify.bert_sequence.imdb', - # 'en.classify.bert_sequence.ag_news', - # 'en.classify.bert_sequence.dbpedia_14', - # 'en.classify.bert_sequence.finbert', - "en.classify.bert_sequence.dehatebert_mono", - ] - - for t in te: - pipe = nlu.load(t, verbose=True) - df = pipe.predict( - ["Peter love pancaces. I hate Mondays", "I love Fridays"], - output_level="document", - drop_irrelevant_cols=False, - ) - for c in df.columns: - print(df[c]) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/nlu_core_tests/component_tests/classifier_tests/bert_zero_shot_tests.py b/tests/nlu_core_tests/component_tests/classifier_tests/bert_zero_shot_tests.py deleted file mode 100644 index 37717748..00000000 --- a/tests/nlu_core_tests/component_tests/classifier_tests/bert_zero_shot_tests.py +++ /dev/null @@ -1,18 +0,0 @@ -import unittest - -from nlu import * - - -class TestBertZeroShotClassifier(unittest.TestCase): - def test_bert_zero_shot_classifier(self): - pipe = nlu.load("en.bert.zero_shot_classifier", verbose=True) - df = pipe.predict( - ["I loved this movie when I was a child."], - output_level="sentence" - ) - for c in df.columns: - print(df[c]) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/nlu_core_tests/component_tests/classifier_tests/camembert_sequence_tests.py b/tests/nlu_core_tests/component_tests/classifier_tests/camembert_sequence_tests.py deleted file mode 100644 index b1ed00aa..00000000 --- a/tests/nlu_core_tests/component_tests/classifier_tests/camembert_sequence_tests.py +++ /dev/null @@ -1,15 +0,0 @@ -import unittest - -import nlu - -class TestBertSeqClassifier(unittest.TestCase): - - def test_camembert_seq(self): - pipe = nlu.load("fr.classify.camembert.allocine.base", verbose=True) - data = "Bill Gates and Steve Jobs are good friends" - df = pipe.predict([data], output_level="document") - for c in df.columns: - print(df[c]) - -if __name__ == "__main__": - unittest.main() diff --git a/tests/nlu_core_tests/component_tests/classifier_tests/cyber_tests.py b/tests/nlu_core_tests/component_tests/classifier_tests/cyber_tests.py deleted file mode 100644 index 35f69862..00000000 --- a/tests/nlu_core_tests/component_tests/classifier_tests/cyber_tests.py +++ /dev/null @@ -1,20 +0,0 @@ -import unittest - -from nlu import * - - -class TestCyber(unittest.TestCase): - def test_cyber_model(self): - pipe = nlu.load("cyberbullying", verbose=True) - df = pipe.predict( - ["Peter love pancaces. 
I hate Mondays", "I love Fridays"], - output_level="token", - drop_irrelevant_cols=False, - metadata=True, - ) - for c in df.columns: - print(df[c]) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/nlu_core_tests/component_tests/classifier_tests/deberta_sequence_classifier_tests.py b/tests/nlu_core_tests/component_tests/classifier_tests/deberta_sequence_classifier_tests.py deleted file mode 100644 index aeec58d9..00000000 --- a/tests/nlu_core_tests/component_tests/classifier_tests/deberta_sequence_classifier_tests.py +++ /dev/null @@ -1,33 +0,0 @@ -import unittest - -import nlu - - -class TestDebertaSeqClassifier(unittest.TestCase): - def test_deberta_seq_classifier(self): - - models = [ - #"en.classify.sentiment.imdb.deberta.base", - #"en.classify.sentiment.imdb.deberta.large", - "en.classify.news.deberta.small", - #"en.classify.dbpedia", - #"en.classify.sentiment.imdb.deberta.small", - #"en.classify.news.deberta", - #"en.classify.sentiment.imdb.deberta", - #"fr.classify.allocine", - #"ur.classify.sentiment.imdb" - ] - - for model in models: - pipe = nlu.load(model, verbose=True) - df = pipe.predict( - ["I really liked that movie!"], - output_level="document", - drop_irrelevant_cols=False, - ) - for c in df.columns: - print(df[c]) - - -if __name__ == "__main__": - unittest.main() \ No newline at end of file diff --git a/tests/nlu_core_tests/component_tests/classifier_tests/distilbert_zero_shot_tests.py b/tests/nlu_core_tests/component_tests/classifier_tests/distilbert_zero_shot_tests.py deleted file mode 100644 index 7c099e7c..00000000 --- a/tests/nlu_core_tests/component_tests/classifier_tests/distilbert_zero_shot_tests.py +++ /dev/null @@ -1,40 +0,0 @@ -import unittest - -import nlu -from nlu import * - - -class TestDistilBertZeroShotClassifier(unittest.TestCase): - def test_distil_bert_zero_shot_classifier(self): - - pipe = nlu.load("en.distilbert.zero_shot_classifier", verbose=True) - df = pipe.predict(["I have a problem with my iphone that needs to be resolved asap!!"], - output_level="sentence", - ) - for c in df.columns: - print(df[c]) - - # Turkish Models and difference examples. 
- - pipe = nlu.load("tr.distilbert.zero_shot_classifier.multinli", verbose=True) - df = pipe.predict(['Dolar yükselmeye devam ediyor.'], output_level="sentence", ) - for c in df.columns: - print(df[c]) - - pipe = nlu.load("tr.distilbert.zero_shot_classifier.allnli", verbose=True) - df = pipe.predict(['Senaryo çok saçmaydı, beğendim diyemem.'], output_level="sentence", ) - for c in df.columns: - print(df[c]) - - pipe = nlu.load("tr.distilbert.zero_shot_classifier.snli", verbose=True) - df = pipe.predict( - ['Senaryo çok saçmaydı, beğendim diyemem.'], - output_level="sentence", - ) - - for c in df.columns: - print(df[c]) - - -if __name__ == "__main__": - unittest.main() \ No newline at end of file diff --git a/tests/nlu_core_tests/component_tests/classifier_tests/e2e_tests.py b/tests/nlu_core_tests/component_tests/classifier_tests/e2e_tests.py deleted file mode 100644 index 037833c7..00000000 --- a/tests/nlu_core_tests/component_tests/classifier_tests/e2e_tests.py +++ /dev/null @@ -1,23 +0,0 @@ -import unittest - -from nlu import * - - -class TestE2E(unittest.TestCase): - def test_e2e_model(self): - df = nlu.load("en.classify.e2e", verbose=True).predict( - "You are so stupid", output_level="document" - ) - - for c in df.columns: - print(df[c]) - - df = nlu.load("e2e", verbose=True).predict( - "You are so stupid", output_level="sentence" - ) - for c in df.columns: - print(df[c]) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/nlu_core_tests/component_tests/classifier_tests/emotion_tests.py b/tests/nlu_core_tests/component_tests/classifier_tests/emotion_tests.py deleted file mode 100644 index aadae423..00000000 --- a/tests/nlu_core_tests/component_tests/classifier_tests/emotion_tests.py +++ /dev/null @@ -1,22 +0,0 @@ -import unittest - -from nlu import * - - -class TestEmotion(unittest.TestCase): - def test_emotion_model(self): - # NLU will predict both as happy. If you reverse order both become sad - - pipe = nlu.load("emotion", verbose=True) - # df = component_list.predict(['I love pancaces. I hate Mondays', 'I love Fridays'], output_level='sentence',drop_irrelevant_cols=False, metadata=True, ) - # for os_components in df.columns: print(df[os_components]) - df = pipe.predict( - ["I love pancaces. 
I hate Mondays", "I love Fridays"], - output_level="document", - ) - for c in df.columns: - print(df[c]) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/nlu_core_tests/component_tests/classifier_tests/img_classifier_tests.py b/tests/nlu_core_tests/component_tests/classifier_tests/img_classifier_tests.py deleted file mode 100644 index 7a795931..00000000 --- a/tests/nlu_core_tests/component_tests/classifier_tests/img_classifier_tests.py +++ /dev/null @@ -1,20 +0,0 @@ -import unittest - -from nlu import * - - -class VitTest(unittest.TestCase): - def test_vit_model(self): - df = nlu.load("en.classify_image.base_patch16_224").predict([r'/media/ckl/dump/Documents/freelance/MOST_RECENT/jsl/nlu/nlu4realgit3/tests/datasets/ocr/vit/general_images/images/']) - df = nlu.load("en.classify_image.base_patch16_224").predict([r'/media/ckl/dump/Documents/freelance/MOST_RECENT/jsl/nlu/nlu4realgit3/tests/datasets/ocr/vit/general_images/images/']) - print(df) - def test_swin_model(self): - pipe = nlu.load("en.classify_image.swin.tiny").predict([r'/home/cll/Documents/jsl/nlu4realgit3/tests/datasets/ocr/vit/ox.jpg']) - print(pipe) - - -if __name__ == "__main__": - unittest.main() - - - diff --git a/tests/nlu_core_tests/component_tests/classifier_tests/language_tests.py b/tests/nlu_core_tests/component_tests/classifier_tests/language_tests.py deleted file mode 100644 index 48b358d2..00000000 --- a/tests/nlu_core_tests/component_tests/classifier_tests/language_tests.py +++ /dev/null @@ -1,29 +0,0 @@ -import unittest - -from nlu import * - - -class TestLanguage(unittest.TestCase): - def test_language_model(self): - pipe = nlu.load("lang", verbose=True) - data = [ - "NLU is an open-source text processing library for advanced natural language processing for the Python language.", - "NLU est une bibliothèque de traitement de texte open source pour le traitement avancé du langage naturel pour les langages de programmation Python.", - "NLU ist eine Open-Source Text verarbeitungs Software fuer fortgeschrittene natuerlich sprachliche Textverarbeitung in der Python Sprache ", - ] - df = pipe.predict( - data, - output_level="sentence", - drop_irrelevant_cols=False, - metadata=True, - multithread=False, - ) - for c in df.columns: - print(df[c]) - df = pipe.predict(data, output_level="document") - for c in df.columns: - print(df[c]) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/nlu_core_tests/component_tests/classifier_tests/longformer_sequence_classifier_tests.py b/tests/nlu_core_tests/component_tests/classifier_tests/longformer_sequence_classifier_tests.py deleted file mode 100644 index c86bf884..00000000 --- a/tests/nlu_core_tests/component_tests/classifier_tests/longformer_sequence_classifier_tests.py +++ /dev/null @@ -1,23 +0,0 @@ -import unittest - -import nlu - - -class TestLongformerSequenceClassifier(unittest.TestCase): - - def test_longformer_sequence_classifier(self): - pipe = nlu.load("en.classify.ag_news.longformer", verbose=True) - data = "Disney Comics was a comic book publishing company operated by The Walt Disney Company which ran from 1990 to 1993." - df = pipe.predict([data], output_level="document") - for c in df.columns: - print(df[c]) - - pipe = nlu.load("en.classify.imdb.longformer", verbose=True) - data = "I really liked that movie!" 
- df = pipe.predict([data], output_level="document") - for c in df.columns: - print((df[c])) - - -if __name__ == "__main__": - unittest.main() \ No newline at end of file diff --git a/tests/nlu_core_tests/component_tests/classifier_tests/ner_tests.py b/tests/nlu_core_tests/component_tests/classifier_tests/ner_tests.py deleted file mode 100644 index 304f785e..00000000 --- a/tests/nlu_core_tests/component_tests/classifier_tests/ner_tests.py +++ /dev/null @@ -1,32 +0,0 @@ -import unittest - -from nlu import * - - -class TestNer(unittest.TestCase): - def test_zh_ner(self): - pipe = nlu.load("zh.ner", verbose=True) - data = "您的生活就是矩阵编程固有的不平衡方程的剩余部分之和。您是异常的最终结果,尽管做出了我最大的努力,但我仍然无法消除数学精度的和谐。尽管仍然不遗余力地避免了负担,但这并不意外,因此也不超出控制范围。这无可避免地将您引向了这里。" - df = pipe.predict([data], output_level="document") - for c in df.columns: - print(df[c]) - - def test_aspect_ner(self): - pipe = nlu.load("en.ner.aspect_sentiment", verbose=True) - data = "We loved our Thai-style main which amazing with lots of flavours very impressive for vegetarian. But the service was below average and the chips were too terrible to finish." - df = pipe.predict([data], output_level="document") - for c in df.columns: - print(df[c]) - - def test_ner_pipe_confidences(self): - df = nlu.load("en.ner.onto.glove.6B_100d", verbose=True).predict( - "Donald Trump from America and Angela Merkel from Germany dont share many opinions.", - output_level="token", - metadata=True, - ) - for c in df.columns: - print(df[c]) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/nlu_core_tests/component_tests/classifier_tests/pos_tests.py b/tests/nlu_core_tests/component_tests/classifier_tests/pos_tests.py deleted file mode 100644 index dbbca42f..00000000 --- a/tests/nlu_core_tests/component_tests/classifier_tests/pos_tests.py +++ /dev/null @@ -1,16 +0,0 @@ -import unittest - -from nlu import * - - -class TestPOS(unittest.TestCase): - def test_pos_model(self): - df = nlu.load("pos", verbose=True).predict( - "Part of Speech Tags identify each token in a sentence with a grammatical label" - ) - for c in df.columns: - print(df[c]) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/nlu_core_tests/component_tests/classifier_tests/question_tests.py b/tests/nlu_core_tests/component_tests/classifier_tests/question_tests.py deleted file mode 100644 index b9bb9ceb..00000000 --- a/tests/nlu_core_tests/component_tests/classifier_tests/question_tests.py +++ /dev/null @@ -1,22 +0,0 @@ -import unittest - -from nlu import * - - -class TestQuestions(unittest.TestCase): - def test_questions_model(self): - pipe = nlu.load("questions", verbose=True) - data = ["I love pancaces. I hate Mondays", "I love Fridays"] - df = pipe.predict(data, output_level="sentence") - for c in df.columns: - print(df[c]) - df = pipe.predict( - ["I love pancaces. 
I hate Mondays", "I love Fridays"], - output_level="document", - ) - for c in df.columns: - print(df[c]) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/nlu_core_tests/component_tests/classifier_tests/roberta_zero_shot_tests.py b/tests/nlu_core_tests/component_tests/classifier_tests/roberta_zero_shot_tests.py deleted file mode 100644 index 7e2eae6a..00000000 --- a/tests/nlu_core_tests/component_tests/classifier_tests/roberta_zero_shot_tests.py +++ /dev/null @@ -1,18 +0,0 @@ -import unittest - -from nlu import * - - -class TestRobertaZeroShotClassifier(unittest.TestCase): - def test_roberta_zero_shot_classifier(self): - pipe = nlu.load("en.roberta.zero_shot_classifier", verbose=True) - df = pipe.predict( - ["I loved this movie when I was a child."], - output_level="sentence", - ) - for c in df.columns: - print(df[c]) - - -if __name__ == "__main__": - unittest.main() \ No newline at end of file diff --git a/tests/nlu_core_tests/component_tests/classifier_tests/sarcasm_tests.py b/tests/nlu_core_tests/component_tests/classifier_tests/sarcasm_tests.py deleted file mode 100644 index 9990130f..00000000 --- a/tests/nlu_core_tests/component_tests/classifier_tests/sarcasm_tests.py +++ /dev/null @@ -1,24 +0,0 @@ -import unittest - -from nlu import * - - -class TestSarcasm(unittest.TestCase): - def test_sarcasm_model(self): - pipe = nlu.load("sarcasm", verbose=True) - df = pipe.predict( - ["I love pancaces. I hate Mondays", "I love Fridays"], - output_level="sentence", - ) - for c in df.columns: - print(df[c]) - df = pipe.predict( - ["I love pancaces. I hate Mondays", "I love Fridays"], - output_level="document", - ) - for c in df.columns: - print(df[c]) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/nlu_core_tests/component_tests/classifier_tests/sentiment_test_imdb.py b/tests/nlu_core_tests/component_tests/classifier_tests/sentiment_test_imdb.py deleted file mode 100644 index dedab1d2..00000000 --- a/tests/nlu_core_tests/component_tests/classifier_tests/sentiment_test_imdb.py +++ /dev/null @@ -1,23 +0,0 @@ -import unittest - -from nlu import * - - -class TestSentimentImdb(unittest.TestCase): - def test_sentiment_imdb_model(self): - pipe = nlu.load("sentiment.imdb", verbose=True) - df = pipe.predict(["I love pancaces. I hate Mondays", "I love Fridays"]) - print(df.columns) - for c in df.columns: - print(df[c]) - df = pipe.predict( - ["I love pancaces. I hate Mondays", "I love Fridays"], - output_level="document", - ) - print(df.columns) - for c in df.columns: - print(df[c]) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/nlu_core_tests/component_tests/classifier_tests/sentiment_test_twitter.py b/tests/nlu_core_tests/component_tests/classifier_tests/sentiment_test_twitter.py deleted file mode 100644 index 1c55b7ae..00000000 --- a/tests/nlu_core_tests/component_tests/classifier_tests/sentiment_test_twitter.py +++ /dev/null @@ -1,26 +0,0 @@ -import unittest - -from nlu import * - - -class TestSentimentTwitter(unittest.TestCase): - def test_sentiment_twitter_model(self): - pipe = nlu.load("sentiment.twitter", verbose=True) - df = pipe.predict( - ["I love pancaces. I hate Mondays", "I love Fridays"], - output_level="sentence", - ) - print(df.columns) - for c in df.columns: - print(df[c]) - df = pipe.predict( - ["I love pancaces. 
I hate Mondays", "I love Fridays"], - output_level="document", - ) - print(df.columns) - for c in df.columns: - print(df[c]) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/nlu_core_tests/component_tests/classifier_tests/sentiment_tests.py b/tests/nlu_core_tests/component_tests/classifier_tests/sentiment_tests.py deleted file mode 100644 index d0ba4a93..00000000 --- a/tests/nlu_core_tests/component_tests/classifier_tests/sentiment_tests.py +++ /dev/null @@ -1,51 +0,0 @@ -import unittest - -from nlu import * - - -class TestSentiment(unittest.TestCase): - def test_sentiment_model(self): - pipe = nlu.load("sentiment", verbose=True) - df = pipe.predict( - ["I love pancaces. I hate Mondays", "I love Fridays"], - output_level="document", - drop_irrelevant_cols=False, - metadata=True, - ) - for c in df.columns: - print(df[c]) - - def test_sentiment_detector_model(self): - pipe = nlu.load("sentiment.imdb", verbose=True) - df = pipe.predict( - ["I love pancaces. I hate Mondays", "I love Fridays"], - output_level="sentence", - ) - for c in df.columns: - print(df[c]) - df = pipe.predict( - ["I love pancaces. I hate Mondays", "I love Fridays"], - output_level="document", - ) - for c in df.columns: - print(df[c]) - - def test_sentiment_vivk_model(self): - pipe = nlu.load("sentiment.vivekn", verbose=True) - df = pipe.predict( - ["I love pancaces. I hate Mondays", "I love Fridays"], - output_level="sentence", - ) - print(df.columns) - for c in df.columns: - print(df[c]) - df = pipe.predict( - ["I love pancaces. I hate Mondays", "I love Fridays"], - output_level="document", - ) - for c in df.columns: - print(df[c]) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/nlu_core_tests/component_tests/classifier_tests/seq_classifier_tests.py b/tests/nlu_core_tests/component_tests/classifier_tests/seq_classifier_tests.py deleted file mode 100644 index b84fc394..00000000 --- a/tests/nlu_core_tests/component_tests/classifier_tests/seq_classifier_tests.py +++ /dev/null @@ -1,15 +0,0 @@ -import unittest - -from nlu import * - - -class TestSeqClassifier(unittest.TestCase): - def test_sentiment_model(self): - seq_pipe = nlu.load("en.classify.roberta.imdb") - df = seq_pipe.predict("This movie was fucking asweomse!") - for c in df.columns: - print(df[c]) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/nlu_core_tests/component_tests/classifier_tests/snips.py b/tests/nlu_core_tests/component_tests/classifier_tests/snips.py deleted file mode 100644 index 11357f62..00000000 --- a/tests/nlu_core_tests/component_tests/classifier_tests/snips.py +++ /dev/null @@ -1,23 +0,0 @@ -import unittest - -from nlu import * - - -class TestCyber(unittest.TestCase): - def test_snips_classifer_model(self): - pipe = nlu.load("en.classify.snips", verbose=True) - df = pipe.predict(["I love pancaces. I hate Mondays", "I love Fridays"]) - print(df.columns) - for c in df.columns: - print(c, df[c]) - - def test_snips_ner_model(self): - pipe = nlu.load("en.ner.snips", verbose=True) - df = pipe.predict(["I love pancaces. 
I hate Mondays", "I love Fridays"]) - print(df.columns) - for c in df.columns: - print(c, df[c]) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/nlu_core_tests/component_tests/classifier_tests/spam_tests.py b/tests/nlu_core_tests/component_tests/classifier_tests/spam_tests.py deleted file mode 100644 index 98bc6e18..00000000 --- a/tests/nlu_core_tests/component_tests/classifier_tests/spam_tests.py +++ /dev/null @@ -1,24 +0,0 @@ -import unittest - -from nlu import * - - -class TestSpam(unittest.TestCase): - def test_spam_model(self): - pipe = nlu.load("spam", verbose=True) - df = pipe.predict( - ["I love pancaces. I hate Mondays", "I love Fridays"], - output_level="sentence", - ) - for c in df.columns: - print(df[c]) - df = pipe.predict( - ["I love pancaces. I hate Mondays", "I love Fridays"], - output_level="document", - ) - for c in df.columns: - print(df[c]) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/nlu_core_tests/component_tests/classifier_tests/tapas_tests.py b/tests/nlu_core_tests/component_tests/classifier_tests/tapas_tests.py deleted file mode 100644 index 40f42223..00000000 --- a/tests/nlu_core_tests/component_tests/classifier_tests/tapas_tests.py +++ /dev/null @@ -1,153 +0,0 @@ -import unittest -import sparknlp -import librosa as librosa -from sparknlp.base import * -from sparknlp.annotator import * -import pandas as pd -import pyspark.sql.functions as F -from pyspark.sql.types import * -import pyspark.sql.functions as F -import sparknlp -import sparknlp -from pyspark.ml import Pipeline -from sparknlp.annotator import * -from sparknlp.base import * -import os - - -os.environ['PYSPARK_PYTHON'] = '/home/ckl/anaconda3/bin/python3' -os.environ['PYSPARK_DRIVER_PYTHON'] = '/home/ckl/anaconda3/bin/python3' - - - -class TapasCase(unittest.TestCase): - def test_tapas(self): - """ - json_path = 'my/json_file.json' - 1. pipe.predict(json_path) - 2. pipe.predict([json_path,json_path,json_path]) - - json_string = "chungos muxos" - 3. pipe.predict(json_string) - 4. 
pipe.predict([json_string,json_string,]) - - - :return: - """ - import nlu - # p = nlu.load('en.tapas.wip',verbose=True) - spark = sparknlp.start() - json_data = """ -{ - "header": ["name", "money", "age"], - "rows": [ - ["Donald Trump", "$100,000,000", "75"], - ["Elon Musk", "$20,000,000,000,000", "55"] - ] -} -""" - - queries = [ - "Who earns less than 200,000,000?", - "Who earns 100,000,000?", - "How much money has Donald Trump?", - "How old are they?", -] - data = spark.createDataFrame([ - [json_data, " ".join(queries)] - ]).toDF("table_json", "questions") - csv_path = '/media/ckl/dump/Documents/freelance/MOST_RECENT/jsl/nlu/nlu4realgit3/tests/datasets/healthcare/sample_ADE_dataset.csv' - csv_data = pd.read_csv(csv_path) - document_assembler = MultiDocumentAssembler() \ - .setInputCols("table_json", "questions") \ - .setOutputCols("document_table", "document_questions") - sentence_detector = SentenceDetector() \ - .setInputCols(["document_questions"]) \ - .setOutputCol("questions") - table_assembler = TableAssembler() \ - .setInputCols(["document_table"]) \ - .setOutputCol("table") \ - .setInputFormat('csv') - # tapas = TapasForQuestionAnswering \ - # .pretrained("table_qa_tapas_base_finetuned_wtq", "en") \ - # .setInputCols(["questions", "table"]) \ - # .setOutputCol("answers") - - pipeline = Pipeline(stages=[ - document_assembler, - sentence_detector, - table_assembler, - # tapas - ]) - - - model = pipeline.fit(data) - model \ - .transform(data) \ - .selectExpr("explode(answers) AS answer") \ - .select("answer") \ - .show(truncate=False) - - - - def test_tapas_nlu_json_string(self): - """ - Like QA DataFormat for Question Answering . - Take in 1 CSV-Ish data-object + 1 Question-Store-Object. - Question-Store-Object is either Str, or array of Str where each element is a question to be asked on the CSV object - nlu.load(tapas).predict((tabular_data, question_data)) - tabular_data may be Pandas DF or Tabular Data String (JSON/CSV) - question_data may be a string or a list of Strings - nlu.load(tapas).predict((tabular_data, 'How old is the average employee?')) - nlu.load(tapas).predict((company_df, ['How old is the average employee?', 'How many people work in the IT deparment?'])) - - - - - - - - - # One column must be question, everything else is context. - input = /TupleIterable - with len(input) == 2 - input[0] = table_like - input[0] = str, Iterable[str] - p.predict((tabular_data, question(s))) - p.predict((tabular_data, question(s))) - - - # One Key must be question, ewverything else is context - p.predict(json_string) - p.predict(csv_string,q) - - p.predict(json_pat,qh) - p.predict(csv_path,q) - - - p.predict('Hello World') # NOT SUPPORTED! 
- Metadata Keys : question, aggregation, cell_positions cell_scores - :return: - """ - spark = sparknlp.start() - data_df = pd.DataFrame({'name':['Donald Trump','Elon Musk'], 'money': ['$100,000,000','$20,000,000,000,000'], 'age' : ['75','55'] }) - # {"header": ["name","money","age"], "rows": [["Donald Trump","$100,000,000","75"],["Elon Musk", "$20,000,000,000,000", "55"]]} - - questions = [ - "Who earns less than 200,000,000?", - "Who earns 100,000,000?", - "How much money has Donald Trump?", - "How old are they?", - ] - - tapas_data = (data_df, questions) - import nlu - p = nlu.load('en.answer_question.tapas.wikisql.base_finetuned') - res = p.predict(tapas_data) - print(p) - for c in res.columns: - print(res[c]) - -if __name__ == '__main__': - unittest.main() - diff --git a/tests/nlu_core_tests/component_tests/classifier_tests/token_classifier_tests.py b/tests/nlu_core_tests/component_tests/classifier_tests/token_classifier_tests.py deleted file mode 100644 index 304f785e..00000000 --- a/tests/nlu_core_tests/component_tests/classifier_tests/token_classifier_tests.py +++ /dev/null @@ -1,32 +0,0 @@ -import unittest - -from nlu import * - - -class TestNer(unittest.TestCase): - def test_zh_ner(self): - pipe = nlu.load("zh.ner", verbose=True) - data = "您的生活就是矩阵编程固有的不平衡方程的剩余部分之和。您是异常的最终结果,尽管做出了我最大的努力,但我仍然无法消除数学精度的和谐。尽管仍然不遗余力地避免了负担,但这并不意外,因此也不超出控制范围。这无可避免地将您引向了这里。" - df = pipe.predict([data], output_level="document") - for c in df.columns: - print(df[c]) - - def test_aspect_ner(self): - pipe = nlu.load("en.ner.aspect_sentiment", verbose=True) - data = "We loved our Thai-style main which amazing with lots of flavours very impressive for vegetarian. But the service was below average and the chips were too terrible to finish." - df = pipe.predict([data], output_level="document") - for c in df.columns: - print(df[c]) - - def test_ner_pipe_confidences(self): - df = nlu.load("en.ner.onto.glove.6B_100d", verbose=True).predict( - "Donald Trump from America and Angela Merkel from Germany dont share many opinions.", - output_level="token", - metadata=True, - ) - for c in df.columns: - print(df[c]) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/nlu_core_tests/component_tests/classifier_tests/toxic_tests.py b/tests/nlu_core_tests/component_tests/classifier_tests/toxic_tests.py deleted file mode 100644 index 8eb00fd4..00000000 --- a/tests/nlu_core_tests/component_tests/classifier_tests/toxic_tests.py +++ /dev/null @@ -1,24 +0,0 @@ -import unittest - -from nlu import * - - -class TestToxic(unittest.TestCase): - def test_toxic_model(self): - # nlu.load('en.ner.dl.bert').predict("I like Angela Merkel") - pipe = nlu.load("toxic", verbose=True) - data = [ - "You are so dumb you goofy dummy", - "You stupid person with an identity that shall remain unnamed, such a filthy identity that you have go to a bad place you person!", - ] - df = pipe.predict(data, output_level="sentence") - for c in df.columns: - print(df[c]) - - df = pipe.predict(data, output_level="document", metadata=True) - for c in df.columns: - print(df[c]) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/nlu_core_tests/component_tests/classifier_tests/transformer_ner_tests.py b/tests/nlu_core_tests/component_tests/classifier_tests/transformer_ner_tests.py deleted file mode 100644 index 9217052c..00000000 --- a/tests/nlu_core_tests/component_tests/classifier_tests/transformer_ner_tests.py +++ /dev/null @@ -1,24 +0,0 @@ -import unittest - -from nlu import * - - -class TestNer(unittest.TestCase): - def 
test_camembert_ner(self): - pipe = nlu.load("en.ner.camembert_TEST", verbose=True) - data = "Bill Gates and Steve Jobs are good friends" - df = pipe.predict([data], output_level="document") - for c in df.columns: - print(df[c]) - def test_camembert_seq(self): - pipe = nlu.load("fr.camembert_base_sequence_classifier_allocine_TEST", verbose=True) - data = "Bill Gates and Steve Jobs are good friends" - df = pipe.predict([data], output_level="document") - for c in df.columns: - print(df[c]) - - - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/nlu_core_tests/component_tests/classifier_tests/xlnet_sequence_classifier_tests.py b/tests/nlu_core_tests/component_tests/classifier_tests/xlnet_sequence_classifier_tests.py deleted file mode 100644 index 81b5e3ba..00000000 --- a/tests/nlu_core_tests/component_tests/classifier_tests/xlnet_sequence_classifier_tests.py +++ /dev/null @@ -1,23 +0,0 @@ -import unittest - -import nlu - - -class TestXlnetSequenceClassifier(unittest.TestCase): - - def test_xlnet_sequence_classifier(self): - pipe = nlu.load("en.classify.ag_news.xlnet", verbose=True) - data = "Disney Comics was a comic book publishing company operated by The Walt Disney Company which ran from 1990 to 1993." - df = pipe.predict([data], output_level="document") - for c in df.columns: - print(df[c]) - - pipe = nlu.load("en.classify.imdb.xlnet", verbose=True) - data = "I really liked that movie!" - df = pipe.predict([data], output_level="document") - for c in df.columns: - print((df[c])) - - -if __name__ == "__main__": - unittest.main() \ No newline at end of file diff --git a/tests/nlu_core_tests/component_tests/classifier_tests/yake_tests.py b/tests/nlu_core_tests/component_tests/classifier_tests/yake_tests.py deleted file mode 100644 index 81168421..00000000 --- a/tests/nlu_core_tests/component_tests/classifier_tests/yake_tests.py +++ /dev/null @@ -1,45 +0,0 @@ -import unittest - -from nlu import * - - -class TestYake(unittest.TestCase): - def test_yake_model(self): - # setting meta to true will output scores for keywords. Lower scores are better - df = nlu.load("yake", verbose=True).predict( - "What a wonderful day! Arnold schwanenegger is the Terminator and he wants to get to the American chopper", - output_level="sentence", - drop_irrelevant_cols=False, - metadata=True, - ) - for c in df.columns: - print(df[c]) - df = nlu.load("yake", verbose=True).predict( - "What a wonderful day! Arnold schwanenegger is the Terminator and he wants to get to the American chopper", - metadata=False, - ) - for c in df.columns: - print(df[c]) - df = nlu.load("yake", verbose=True).predict( - "What a wonderful day! Arnold schwanenegger is the Terminator and he wants to get to the American chopper", - output_level="token", - ) - for c in df.columns: - print(df[c]) - df = nlu.load("yake", verbose=True).predict( - "What a wonderful day! Arnold schwanenegger is the Terminator and he wants to get to the American chopper", - output_level="chunk", - ) - for c in df.columns: - print(df[c]) - # Column name of confidence changed if yake at same or not at same output level! - df = nlu.load("yake", verbose=True).predict( - "What a wonderful day! 
Arnold schwanenegger is the Terminator and he wants to get to the American chopper", - output_level="document", - ) - for c in df.columns: - print(df[c]) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/nlu_hc_tests/component_tests/contextual_parser/context_parser_tests.py b/tests/nlu_core_tests/component_tests/context_parser_tests.py similarity index 100% rename from tests/nlu_hc_tests/component_tests/contextual_parser/context_parser_tests.py rename to tests/nlu_core_tests/component_tests/context_parser_tests.py diff --git a/tests/nlu_core_tests/component_tests/embed_tests/embeddings_to_np_conversion_tests.py b/tests/nlu_core_tests/component_tests/embed_tests/embeddings_to_np_conversion_tests.py deleted file mode 100644 index 0949dd37..00000000 --- a/tests/nlu_core_tests/component_tests/embed_tests/embeddings_to_np_conversion_tests.py +++ /dev/null @@ -1,19 +0,0 @@ -import unittest - -from nlu import * - - -class TestEmbeddingsConversion(unittest.TestCase): - def test_word_embeddings_conversion(self): - df = nlu.load("bert", verbose=True).predict("How are you today") - for c in df.columns: - print(df[c]) - - def test_sentence_embeddings_conversion(self): - df = nlu.load("embed_sentence.bert", verbose=True).predict("How are you today") - for c in df.columns: - print(df[c]) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/nlu_core_tests/component_tests/embed_tests/sentence_embeddings/sentence_bert_tests.py b/tests/nlu_core_tests/component_tests/embed_tests/sentence_embeddings/sentence_bert_tests.py deleted file mode 100644 index 17e0ec8d..00000000 --- a/tests/nlu_core_tests/component_tests/embed_tests/sentence_embeddings/sentence_bert_tests.py +++ /dev/null @@ -1,26 +0,0 @@ -import unittest - -from nlu import * - - -class TestBertSentenceEmbeddings(unittest.TestCase): - def test_bert_sentence_embeds(self): - df = nlu.load("embed_sentence.bert", verbose=True).predict( - "Am I the muppet or are you the muppet?", - output_level="sentence", - drop_irrelevant_cols=False, - metadata=True, - ) - # df = nlu.load('en.classify.sarcasm',verbose=True).predict(sarcasm_df['text']) - for c in df.columns: - print(df[c]) - - df = nlu.load("en.embed.bert.small_L4_128", verbose=True).predict( - "No you are the muppet!" - ) - for c in df.columns: - print(df[c]) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/nlu_core_tests/component_tests/embed_tests/sentence_embeddings/sentence_electra_tests.py b/tests/nlu_core_tests/component_tests/embed_tests/sentence_embeddings/sentence_electra_tests.py deleted file mode 100644 index ca6f7529..00000000 --- a/tests/nlu_core_tests/component_tests/embed_tests/sentence_embeddings/sentence_electra_tests.py +++ /dev/null @@ -1,29 +0,0 @@ -import unittest - -from nlu import * - - -class TestElectraSentenceEmbeddings(unittest.TestCase): - def test_electra_sentence_embeds(self): - res = nlu.load("embed_sentence.electra", verbose=True).predict( - "Am I the muppet or are you the muppet?" - ) - # df = nlu.load('en.classify.sarcasm',verbose=True).predict(sarcasm_df['text']) - for c in res: - print(res[c]) - - res = nlu.load("en.embed_sentence.electra", verbose=True).predict( - "Am I the muppet or are you the muppet?" 
- ) - # df = nlu.load('en.classify.sarcasm',verbose=True).predict(sarcasm_df['text']) - for c in res: - print(res[c]) - - # df = nlu.load('en.embed.bert.small_L4_128', verbose=True).predict("No you are the muppet!") - # print(df.columns) - # print(df) - # print(df['bert_embeddings']) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/nlu_core_tests/component_tests/embed_tests/token_embeddings_tests/bert_tests.py b/tests/nlu_core_tests/component_tests/embed_tests/token_embeddings_tests/bert_tests.py deleted file mode 100644 index 5e2ab90b..00000000 --- a/tests/nlu_core_tests/component_tests/embed_tests/token_embeddings_tests/bert_tests.py +++ /dev/null @@ -1,24 +0,0 @@ -import unittest - -from nlu import * - - -class TestBertTokenEmbeddings(unittest.TestCase): - def test_bert_model(self): - df = nlu.load("bert", verbose=True).predict( - "Am I the muppet or are you the muppet?" - ) - # df = nlu.load('en.classify.sarcasm',verbose=True).predict(sarcasm_df['text']) - for c in df.columns: - print(df[c]) - - def test_multiple_bert_models(self): - df = nlu.load( - "en.embed.bert.small_L4_128 en.embed.bert.small_L2_256", verbose=True - ).predict("No you are the muppet!") - for c in df.columns: - print(df[c]) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/nlu_core_tests/component_tests/embed_tests/token_embeddings_tests/distilbert_tests.py b/tests/nlu_core_tests/component_tests/embed_tests/token_embeddings_tests/distilbert_tests.py deleted file mode 100644 index 1401c6f8..00000000 --- a/tests/nlu_core_tests/component_tests/embed_tests/token_embeddings_tests/distilbert_tests.py +++ /dev/null @@ -1,23 +0,0 @@ -import unittest - -from nlu import * - - -class TestdistilbertEmbeddings(unittest.TestCase): - def test_distilbert(self): - df = nlu.load("xx.embed.distilbert", verbose=True).predict( - "Am I the muppet or are you the muppet?", output_level="token" - ) - for c in df.columns: - print(df[c]) - - def test_NER(self): - df = nlu.load("ner", verbose=True).predict( - "Donald Trump from America and Angela Merkel from Germany are BFF" - ) - for c in df.columns: - print(df[c]) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/nlu_core_tests/component_tests/embed_tests/token_embeddings_tests/electra_tests.py b/tests/nlu_core_tests/component_tests/embed_tests/token_embeddings_tests/electra_tests.py deleted file mode 100644 index a8922461..00000000 --- a/tests/nlu_core_tests/component_tests/embed_tests/token_embeddings_tests/electra_tests.py +++ /dev/null @@ -1,18 +0,0 @@ -import unittest - -from nlu import * - - -class TestElectraTokenEmbeddings(unittest.TestCase): - def test_electra_model(self): - - df = nlu.load("bert electra ", verbose=True).predict( - "Am I the muppet or are you the muppet?" 
- ) - # df = nlu.load('en.classify.sarcasm',verbose=True).predict(sarcasm_df['text']) - for c in df.columns: - print(df[c]) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/nlu_core_tests/component_tests/embed_tests/token_embeddings_tests/glove_tests.py b/tests/nlu_core_tests/component_tests/embed_tests/token_embeddings_tests/glove_tests.py deleted file mode 100644 index b3a0d88b..00000000 --- a/tests/nlu_core_tests/component_tests/embed_tests/token_embeddings_tests/glove_tests.py +++ /dev/null @@ -1,16 +0,0 @@ -import unittest - -from nlu import * - - -class TestGloveTokenEmbeddings(unittest.TestCase): - def test_glove_model(self): - df = nlu.load("glove", verbose=True).predict( - "Am I the muppet or are you the muppet?", output_level="token" - ) - for c in df.columns: - print(df[c]) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/nlu_core_tests/component_tests/embed_tests/token_embeddings_tests/multiple_embeddings.py b/tests/nlu_core_tests/component_tests/embed_tests/token_embeddings_tests/multiple_embeddings.py deleted file mode 100644 index 2272a6a8..00000000 --- a/tests/nlu_core_tests/component_tests/embed_tests/token_embeddings_tests/multiple_embeddings.py +++ /dev/null @@ -1,17 +0,0 @@ -import unittest - -from nlu import * - - -class TestMultipleEmbeddings(unittest.TestCase): - def test_multiple_embeddings(self): - df = nlu.load( - "bert en.embed.bert.small_L8_512 en.embed.bert.small_L8_512 en.embed.bert.small_L8_128 electra en.embed.bert.small_L10_128 en.embed.bert.small_L4_128", - verbose=True, - ).predict("Am I the muppet or are you the muppet?") - for c in df.columns: - print(df[c]) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/nlu_core_tests/component_tests/embed_tests/token_embeddings_tests/roberta_tests.py b/tests/nlu_core_tests/component_tests/embed_tests/token_embeddings_tests/roberta_tests.py deleted file mode 100644 index 8e8ad496..00000000 --- a/tests/nlu_core_tests/component_tests/embed_tests/token_embeddings_tests/roberta_tests.py +++ /dev/null @@ -1,55 +0,0 @@ -import unittest - -from nlu import * - - -class TestRobertaEmbeddings(unittest.TestCase): - def test_roberta(self): - embeds = [ - "en.embed.distilbert", - "en.embed.distilbert.base", - "en.embed.distilbert.base.uncased", - "en.embed.distilroberta", - "en.embed.roberta", - "en.embed.roberta.base", - "en.embed.roberta.large", - "xx.embed.distilbert.", - "xx.embed.xlm", - "xx.embed.xlm.base", - "xx.embed.xlm.twitter", - ] - for e in embeds: - print( - f"+++++++++++++++++++++++++++++++++++++++++++++++++{e}+++++++++++++++++++++++++++++++++++++++++++++++++" - ) - p = nlu.load("en.embed.roberta") - df = p.predict("I love new embeds baby", output_level="token") - for c in df.columns: - print(df[c]) - - def test_new_embeds(self): - embeds = [ - "en.embed.distilbert", - "en.embed.distilbert.base", - "en.embed.distilbert.base.uncased", - "en.embed.distilroberta", - "en.embed.roberta", - "en.embed.roberta.base", - "en.embed.roberta.large", - "xx.embed.distilbert.", - "xx.embed.xlm", - "xx.embed.xlm.base", - "xx.embed.xlm.twitter", - ] - for e in embeds: - print( - f"+++++++++++++++++++++++++++++++++++++++++++++++++{e}+++++++++++++++++++++++++++++++++++++++++++++++++" - ) - p = nlu.load(e, verbose=True) - df = p.predict("I love new embeds baby", output_level="token") - for c in df.columns: - print(df[c]) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/nlu_core_tests/component_tests/embed_tests/token_embeddings_tests/xlm_tests.py 
b/tests/nlu_core_tests/component_tests/embed_tests/token_embeddings_tests/xlm_tests.py deleted file mode 100644 index 6060ccdb..00000000 --- a/tests/nlu_core_tests/component_tests/embed_tests/token_embeddings_tests/xlm_tests.py +++ /dev/null @@ -1,15 +0,0 @@ -import unittest - -from nlu import * - - -class TestxlmEmbeddings(unittest.TestCase): - def test_xlm(self): - p = nlu.load("xx.embed.xlm", verbose=True) - df = p.predict("I love new embeds baby", output_level="token") - for c in df.columns: - print(df[c]) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/nlu_core_tests/component_tests/matcher_tests/__init__.py b/tests/nlu_core_tests/component_tests/matcher_tests/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/nlu_core_tests/component_tests/matcher_tests/matchers_tests.py b/tests/nlu_core_tests/component_tests/matcher_tests/matchers_tests.py deleted file mode 100644 index 5b2c32a4..00000000 --- a/tests/nlu_core_tests/component_tests/matcher_tests/matchers_tests.py +++ /dev/null @@ -1,55 +0,0 @@ -import unittest - -from nlu import * - - -class MatchTests(unittest.TestCase): - def test_pattern_matcher(self): - pass - pipe = nlu.load("match.pattern", verbose=True) - df = pipe.predict("2020 was a crazy year but wait for October 1. 2020") - for c in df.columns: - print(df[c]) - - def test_chunk_matcher(self): - pass - pipe = nlu.load("match.chunks", verbose=True) - df = pipe.predict("2020 was a crazy year but wait for October 1. 2020") - for c in df.columns: - print(df[c]) - - def download_entities_files(self): - import urllib2 - - response = urllib2.urlopen("https://wordpress.org/plugins/about/readme.txt") - data = response.read() - filename = "readme.txt" - file_ = open(filename, "w", encoding="utf8") - file_.write(data) - file_.close() - - def test_text_matcher(self): - p = "/home/ckl/Documents/freelance/jsl/nlu/nlu4realgit2/tmp/trasgh/entities.txt" - pipe = nlu.load("match.text", verbose=True) - pipe["text_matcher"].setEntities(p) - df = pipe.predict("2020 was a crazy year but wait for October 1. 2020") - for c in df.columns: - print(df[c]) - - def test_regex_matcher(self): - p = "/home/ckl/Documents/freelance/jsl/nlu/nlu4realgit2/tmp/trasgh/rulesd.txt" - pipe = nlu.load("match.regex", verbose=True) - pipe["regex_matcher"].setExternalRules(path=p, delimiter=",") - df = pipe.predict("2020 was a crazy year but wait for October 1. 2020") - for c in df.columns: - print(df[c]) - - def test_date_matcher(self): - pipe = nlu.load("match.date", verbose=True) - df = pipe.predict("2020 was a crazy year but wait for October 1. 
2020") - for c in df.columns: - print(df[c]) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/nlu_core_tests/partially_implemented_tests.py b/tests/nlu_core_tests/component_tests/partially_implemented_tests.py similarity index 100% rename from tests/nlu_core_tests/partially_implemented_tests.py rename to tests/nlu_core_tests/component_tests/partially_implemented_tests.py diff --git a/tests/nlu_core_tests/component_tests/pre_processing_tests/__init__.py b/tests/nlu_core_tests/component_tests/pre_processing_tests/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/nlu_core_tests/component_tests/pre_processing_tests/document_normalizer_tests.py b/tests/nlu_core_tests/component_tests/pre_processing_tests/document_normalizer_tests.py deleted file mode 100644 index 1cdf7e51..00000000 --- a/tests/nlu_core_tests/component_tests/pre_processing_tests/document_normalizer_tests.py +++ /dev/null @@ -1,21 +0,0 @@ -import unittest - -from nlu import * - - -class TestNormalize(unittest.TestCase): - def test_document_normalizer_pipe(self): - pipe = nlu.load("norm_document", verbose=True) - data = " Example
</title> </head> <body> <p>This is an example of a simple HTML page with one paragraph.</p> </body> </html>
" - df = pipe.predict( - data, - output_level="sentence", - drop_irrelevant_cols=False, - metadata=True, - ) - for c in df.columns: - print(df[c]) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/nlu_core_tests/component_tests/pre_processing_tests/lem_tests.py b/tests/nlu_core_tests/component_tests/pre_processing_tests/lem_tests.py deleted file mode 100644 index bde3c388..00000000 --- a/tests/nlu_core_tests/component_tests/pre_processing_tests/lem_tests.py +++ /dev/null @@ -1,20 +0,0 @@ -import unittest - -from nlu import * - - -class TestLem(unittest.TestCase): - def test_stem_pipe(self): - pipe = nlu.load("lemma", verbose=True) - df = pipe.predict( - "HELLO WORLD! How are YOU!?!@", - output_level="sentence", - drop_irrelevant_cols=False, - metadata=True, - ) - for c in df.columns: - print(df[c]) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/nlu_core_tests/component_tests/pre_processing_tests/normalizer_tests.py b/tests/nlu_core_tests/component_tests/pre_processing_tests/normalizer_tests.py deleted file mode 100644 index ec642d33..00000000 --- a/tests/nlu_core_tests/component_tests/pre_processing_tests/normalizer_tests.py +++ /dev/null @@ -1,26 +0,0 @@ -import unittest - -from nlu import * - - -class TestNormalize(unittest.TestCase): - def test_norm_pipe(self): - pipe = nlu.load("norm", verbose=True) - df = pipe.predict( - "HELLO WORLD! How are YOU!?!@", - output_level="sentence", - drop_irrelevant_cols=False, - metadata=True, - ) - for c in df.columns: - print(df[c]) - - pipe["normalizer"].setLowercase(True) - - df = pipe.predict("HELLO WORLD! How are YOU!@>?!@") - for c in df.columns: - print(df[c]) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/nlu_core_tests/component_tests/pre_processing_tests/sentence_detector_tests.py b/tests/nlu_core_tests/component_tests/pre_processing_tests/sentence_detector_tests.py deleted file mode 100644 index 1599f8d7..00000000 --- a/tests/nlu_core_tests/component_tests/pre_processing_tests/sentence_detector_tests.py +++ /dev/null @@ -1,51 +0,0 @@ -import unittest - -from nlu import * - - -class TestSentenceDetector(unittest.TestCase): - def test_sentence_detector(self): - pipe = nlu.load( - "sentence_detector", - verbose=True, - ) - df = pipe.predict( - "I like my sentences detected. Some like their sentences warm. Warm is also good.", - output_level="sentence", - drop_irrelevant_cols=False, - metadata=True, - ) - for c in df.columns: - print(df[c]) - - def test_sentence_detector_multi_lang(self): - pipe = nlu.load( - "xx.sentence_detector", - verbose=True, - ) - df = pipe.predict( - "I like my sentences detected. Some like their sentences warm. Warm is also good.", - output_level="sentence", - drop_irrelevant_cols=False, - metadata=True, - ) - for c in df.columns: - print(df[c]) - - def test_sentence_detector_pragmatic(self): - pipe = nlu.load( - "sentence_detector.pragmatic", - verbose=True, - ) - df = pipe.predict( - "I like my sentences detected. Some like their sentences warm. 
Warm is also good.", - output_level="sentence", - drop_irrelevant_cols=False, - metadata=True, - ) - for c in df.columns: - print(df[c]) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/nlu_core_tests/component_tests/pre_processing_tests/spell_tests.py b/tests/nlu_core_tests/component_tests/pre_processing_tests/spell_tests.py deleted file mode 100644 index f51fd4b7..00000000 --- a/tests/nlu_core_tests/component_tests/pre_processing_tests/spell_tests.py +++ /dev/null @@ -1,24 +0,0 @@ -import unittest - - -class TestSpellCheckers(unittest.TestCase): - # - # def test_spell_context(self): - # pipe = nlu.load('en.spell', verbose=True) - # df = pipe.predict('I liek penut buttr and jelly', drop_irrelevant_cols=False, metadata=True, ) - # for c in df.columns: print(df[c]) - # - # def test_spell_sym(self): - # component_list = nlu.load('spell.symmetric', verbose=True) - # df = component_list.predict('I liek penut buttr and jelly', drop_irrelevant_cols=False, metadata=True, ) - # for os_components in df.columns: print(df[os_components]) - # - # def test_spell_norvig(self): - # component_list = nlu.load('spell.norvig', verbose=True) - # df = component_list.predict('I liek penut buttr and jelly', drop_irrelevant_cols=False, metadata=True, ) - # for os_components in df.columns: print(df[os_components]) - pass - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/nlu_core_tests/component_tests/pre_processing_tests/stem_tests.py b/tests/nlu_core_tests/component_tests/pre_processing_tests/stem_tests.py deleted file mode 100644 index f0979aee..00000000 --- a/tests/nlu_core_tests/component_tests/pre_processing_tests/stem_tests.py +++ /dev/null @@ -1,20 +0,0 @@ -import unittest - -from nlu import * - - -class TestStem(unittest.TestCase): - def test_stem_pipe(self): - pipe = nlu.load("stem", verbose=True) - df = pipe.predict( - "HELLO WORLD! How are YOU!?!@", - output_level="sentence", - drop_irrelevant_cols=False, - metadata=True, - ) - for c in df.columns: - print(df[c]) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/nlu_core_tests/component_tests/pre_processing_tests/stopword_tests.py b/tests/nlu_core_tests/component_tests/pre_processing_tests/stopword_tests.py deleted file mode 100644 index 6152b05d..00000000 --- a/tests/nlu_core_tests/component_tests/pre_processing_tests/stopword_tests.py +++ /dev/null @@ -1,20 +0,0 @@ -import unittest - -from nlu import * - - -class TestLem(unittest.TestCase): - def test_stopwords_pipe(self): - pipe = nlu.load("stopwords", verbose=True) - df = pipe.predict( - "HELLO WORLD! 
How are YOU!?!@", - output_level="sentence", - drop_irrelevant_cols=False, - metadata=True, - ) - for c in df.columns: - print(df[c]) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/nlu_core_tests/component_tests/seq2seq/__init__.py b/tests/nlu_core_tests/component_tests/seq2seq/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/nlu_core_tests/component_tests/seq2seq/bart_transformer.py b/tests/nlu_core_tests/component_tests/seq2seq/bart_transformer.py deleted file mode 100644 index e3011a7f..00000000 --- a/tests/nlu_core_tests/component_tests/seq2seq/bart_transformer.py +++ /dev/null @@ -1,29 +0,0 @@ -import os -import sys -import unittest -from nlu import * - - -summarizer_spells = [ - 'en.seq2seq.distilbart_cnn_6_6', - 'en.seq2seq.distilbart_xsum_12_6' -] - -class BartTransformerTests(unittest.TestCase): - def test_bart_transformer(self): - - - for s in summarizer_spells: - pipe = nlu.load(s) - # Configure relations to extract - print("TESTING: ", s) - df = pipe.predict("Paracetamol can alleviate headache or") - print(df.columns) - for c in df: - print(c) - print(df[c]) - - print(df) - -if __name__ == "__main__": - unittest.main() diff --git a/tests/nlu_core_tests/component_tests/seq2seq/marian_tests.py b/tests/nlu_core_tests/component_tests/seq2seq/marian_tests.py deleted file mode 100644 index 189e77ea..00000000 --- a/tests/nlu_core_tests/component_tests/seq2seq/marian_tests.py +++ /dev/null @@ -1,70 +0,0 @@ -import unittest - -from nlu import * - - -class TestMarian(unittest.TestCase): - def test_marian_en_to_de(self): - pipe = nlu.load("en.translate_to.de", verbose=True) - data = [ - "Who is president of germany", - "Who is donald trump ?", - "What is NLP?", - "How to make tea?", - ] - df = pipe.predict( - data, - output_level="sentence", - drop_irrelevant_cols=False, - metadata=True, - ) - print(df.columns) - - print(df["translation"]) - print(df.columns) - - def test_marian_de_to_en(self): - pipe = nlu.load("de.translate_to.en", verbose=True) - # test for each tasks - data = [ - "Wer ist Praesident von Deutschland", - "Wer ist donald trump ?", - "Was ist NLP?", - "Wie macht man Tee?", - ] - df = pipe.predict(data) - print(df.columns) - print(df["translation"]) - print(df.columns) - - def test_marian_de_to_en_pipe(self): - pipe = nlu.load("de.marian.translate_to.en", verbose=True) - # test for each tasks - data = [ - "Wer ist Praesident von Deutschland", - "Wer ist donald trump ?", - "Was ist NLP?", - "Wie macht man Tee?", - ] - df = pipe.predict(data) - print(df.columns) - print(df["marian"]) - print(df.columns) - - def test_marian_model(self): - pipe = nlu.load("en.marian.translate_to.umb", verbose=True) - # test for each tasks - data = [ - "Wer ist Praesident von Deutschland", - "Wer ist donald trump ?", - "Was ist NLP?", - "Wie macht man Tee?", - ] - df = pipe.predict(data) - print(df.columns) - print(df["marian"]) - print(df.columns) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/nlu_core_tests/component_tests/seq2seq/t5_tests.py b/tests/nlu_core_tests/component_tests/seq2seq/t5_tests.py deleted file mode 100644 index fed48cdd..00000000 --- a/tests/nlu_core_tests/component_tests/seq2seq/t5_tests.py +++ /dev/null @@ -1,379 +0,0 @@ -import unittest - -from nlu import * - - -class TestT5(unittest.TestCase): - def test_t5(self): - pipe = nlu.load("en.t5.small", verbose=True) - data = [ - "The first matrix I designed was quite naturally perfect. It was a work of art. Flawless. Sublime. 
A triumph only equaled by its monumental failure.", - "I love peanut butter and jelly", - "Who is president of America", - "Who is president of Germany", - "What is your favorite food", - ] - pipe["t5"].setTask("translate English to French") - df = pipe.predict(data, output_level="document") - print("French:") - print(df["T5"]) - print(df.columns) - - pipe["t5"].setTask("translate English to German") - df = pipe.predict(data, output_level="document") - print("German:") - print(df["T5"]) - print(df.columns) - - pipe["t5"].setTask("Question") - df = pipe.predict(data, output_level="document") - print("Question:") - print(df["T5"]) - print(df.columns) - - pipe["t5"].setTask("Make it sad") - df = pipe.predict(data, output_level="document") - print("SAD:") - print(df["T5"]) - print(df.columns) - - pipe["t5"].setTask("Make it stupid") - df = pipe.predict(data, output_level="document") - print("STUPID:") - print(df["T5"]) - print(df.columns) - - pipe["t5"].setTask("Make it angry") - df = pipe.predict(data, output_level="document") - print("ANGRY:") - print(df["T5"]) - print(df.columns) - - pipe["t5"].setTask("Translate English to German") - df = pipe.predict(data, output_level="document") - print("GERMAN:") - print(df["T5"]) - print(df.columns) - - pipe["t5"].setTask("cola sentence:") - df = pipe.predict(data, output_level="document") - print("COLA:") - print(df["T5"]) - print(df.columns) - - pipe["t5"].setTask("translate English to Spanish") - df = pipe.predict(data, output_level="document") - print("Spanish:") - print(df["T5"]) - print(df.columns) - - def test_task1_cola(self): - pipe = nlu.load("en.t5.base", verbose=True) - data = [ - "John made Bill master of himself", - "Anna and Mike is going skiing and they is liked is", - "Anna and Mike like to dance", - ] - - pipe["t5"].setTask("cola sentence:") - res = pipe.predict( - data, - output_level="document", - drop_irrelevant_cols=False, - metadata=True, - ) - print("TEST Task 1 : Sentence acceptability judgment,CoLA") - print(res["T5"]) - - def test_task2_RTE(self): - pipe = nlu.load("en.t5.base", verbose=True) - data = [ - "Recent report say Johnny makes he alot of money, he earned 10 million USD each year for the last 5 years. sentence2: Johnny is a millionare.", - "Recent report say Johnny makes he alot of money, he earned 10 million USD each year for the last 5 years. sentence2: Johnny is a poor man.", - "It was raining in England for the last 4 weeks. sentence2: Yesterday, England was very wet. ", - "I live in italy. sentence2: I live in Europe", - "Peter loves New York, it is his favorite city. sentence2: Peter loves new York.", - ] - - pipe["t5"].setTask("rte sentence1:") - res = pipe.predict(data, output_level="document") - print("TEST Task 2 : Natural language inference,RTE") - print(res.columns) - print(res[["T5", "document"]]) - - def test_task3_MNLI(self): - pipe = nlu.load("en.t5.base", verbose=True) - data = [ - "At 8:34, the Boston Center controller received a third, transmission from American 11. premise: The Boston Center controller got a third transmission from American 11." - ] - - pipe["t5"].setTask("mnli hypothesis:") - res = pipe.predict(data, output_level="document") - print("TEST Task 3 : Natural language inference, MNLI") - print(res.columns) - print(res[["T5", "document"]]) - - def test_task4_MRPC(self): - pipe = nlu.load("en.t5.base", verbose=True) - data = [ - 'We acted because we saw the existing evidence in a new light , through the prism of our experience on 11 September , " Rumsfeld said . 
sentence2: Rather , the US acted because the administration saw " existing evidence in a new light , through the prism of our experience on September 11"', - 'We acted because we saw the existing evidence in a new light , through the prism of our experience on 11 September , " Rumsfeld said . Rather , the US acted because the administration saw " existing evidence in a new light , through the prism of our experience on September 11"' - " It is raining hot dogs! I like ice cream", - " It was 40 degrees all day. It was pretty hot today", - " It is raining hot dogs! sentence2: I like ice cream", - " It was 40 degrees all day", - " sentence2: It was pretty hot today", - ] - - pipe["t5"].setTask("mrpc sentence1:") - res = pipe.predict(data, output_level="document") - print("TEST Task 4 : Natural language inference, MNLI") - print(res.columns) - print(res[["T5", "document"]]) - - def test_task5_QNLI(self): - pipe = nlu.load("en.t5.base", verbose=True) - data = [ - "Where did Jebe die? sentence: Ghenkis Khan recalled Subtai back to Mongolia soon afterwards, and Jebe died on the road back to Samarkand ", - "What does Steve like to eat? sentence: Steve watches TV all day", - ] - - pipe["t5"].setTask("qnli question:") - res = pipe.predict(data, output_level="document") - print("TEST Task 5 : Natural language inference, QNLI") - print(res.columns) - print(res[["T5", "document"]]) - - def test_task6_QQP(self): - data = [ - "What attributes would have made you highly desirable in ancient Rome? question2: How I GET OPPERTINUTY TO JOIN IT COMPANY AS A FRESHER?", - "What was it like in Ancient rome? question2: What was Ancient rome like?", - ] - - pipe = nlu.load("en.t5.base", verbose=True) - pipe["t5"].setTask("qqp question1:") - res = pipe.predict(data, output_level="document") - print("TEST Task 6 : Natural language inference, QQP") - print(res.columns) - print(res[["T5", "document"]]) - - def test_task7_SST2(self): - data = [ - "it confirms fincher ’s status as a film maker who artfully bends technical know-how to the service of psychological insight", - "I hated that movie", - ] - - pipe = nlu.load("en.t5.base", verbose=True) - pipe["t5"].setTask("sst2 sentence: ") - res = pipe.predict(data, output_level="document") - print("TEST Task 7 : BINARY SENTIMENT, SST2") - print(res.columns) - print(res[["T5", "document"]]) - - def test_task8_STSB(self): - data = [ - "What attributes would have made you highly desirable in ancient Rome? sentence2: How I GET OPPERTINUTY TO JOIN IT COMPANY AS A FRESHER?", - "What was it like in Ancient rome? sentence2: What was live like as a King in Ancient Rome?", - ] - - pipe = nlu.load("en.t5.base", verbose=True) - pipe["t5"].setTask("stsb sentence1: ") - res = pipe.predict(data, output_level="document") - print("TEST Task 8 : Regressive Sentence Similarity , STSB") - print(res.columns) - print(res[["T5", "document"]]) - - def test_task9_CB(self): - data = [ - "Valence was helping premise: Valence the void-brain, Valence the virtuous valet. Why couldn’t the figger choose his own portion of titanic anatomy to shaft? Did he think he was helping", - "What attributes would have made you highly desirable in ancient Rome? premise: How I GET OPPERTINUTY TO JOIN IT COMPANY AS A FRESHER?", - "What was it like in Ancient rome? 
premise: What was live like as a King in Ancient Rome?", - "Peter lov", - ] - - pipe = nlu.load("en.t5.base", verbose=True) - pipe["t5"].setTask("cb hypothesis: ") - res = pipe.predict(data, output_level="document") - print("TEST Task 9 : CB ") - print(res.columns) - print(res[["T5", "document"]]) - - def test_task10_COPA(self): - - data = [ - "Many citizens relocated to the capitol. choice2: Many citizens took refuge in other territories. premise: Political violence broke out in the nation. question: effect", - " He fell off the ladder. choice2: He climbed up the lader premise: The man lost his balance on the ladder. question: what happend was result?", - " He fell off the ladder. choice2: He climbed up the lader premise: The man lost his balance on the ladder. question: effect", - " He fell off the ladder. choice2: He climbed up the lader premise: The man lost his balance on the ladder. question: correct", - " many citizens relocated to the capitol. choice2: Many citizens took refuge in other territories premise : Politcal Violence broke out in the nation. question: effect", - " many citizens relocated to the capitol. choice2: Many citizens took refuge in other territories premise : Politcal Violence broke out in the nation. question: correct", - " many citizens relocated to the capitol. choice2: Many citizens took refuge in other territories premise : Politcal Violence broke out in the nation. question: bananas?", - " The assailant struck the man in the head. choice2: The assailant took the man’s wallet. premise: The man fell unconscious. question: What was the cause if this?", - " The assailant struck the man in the head. choice2: The assailant took the man’s wallet. premise: The man fell unconscious. question: effect", - " The assailant struck the man in the head. choice2: The assailant took the man’s wallet. premise: The man fell unconscious. question: correct", - " The assailant struck the man in the head. choice2: The assailant took the man’s wallet. premise: The man fell unconscious.", - " He was in the kitchen cooking choice2: He was at work choice3: He went to the mooon choice4: He went to the gym and worked out premise : The man ate a peanut butter sandwich", - " He went tasdasdasdo the gym and worked dasdaout choice2: He was at work choice3: He went to the mooon choice4: He was in the kitchen cooking premise : The man ate a peanut butter sandwich", - " He went to theasdasdas gdasym dasand dwasdsorked out choice2: He was at work choice3: He went to the mooon choice4: He was in the kitchen cooking premise : The man ate a peanut butter sandwich question: correct", - ] - - pipe = nlu.load("en.t5.base", verbose=True) - pipe["t5"].setTask("copa choice1: ") - res = pipe.predict(data, output_level="document") - print("TEST Task 10 : COPA ") - print(res.columns) - print(res[["T5", "document"]]) - - def test_task11_MultiRc(self): - - data = [ - # paragraph: - """ - Why was Joey surprised the morning he woke up for breakfast? - answer: There was a T-REX in his garden. - paragraph: - Sent 1: Once upon a time, there was a squirrel named Joey. - Sent 2: Joey loved to go outside and play with his cousin Jimmy. - Sent 3: Joey and Jimmy played silly games together, and were always laughing. - Sent 4: One day, Joey and Jimmy went swimming together 50 at their Aunt Julie’s pond. - Sent 5: Joey woke up early in the morning to eat some food before they left. - Sent 6: He couldn’t find anything to eat except for pie! 
- Sent 7: Usually, Joey would eat cereal, fruit (a pear), or oatmeal for breakfast. - Sent 8: After he ate, he and Jimmy went to the pond. - Sent 9: On their way there they saw their friend Jack Rabbit. - Sent 10: They dove into the water and swam for several hours. - Sent 11: The sun was out, but the breeze was cold. - Sent 12: Joey and Jimmy got out of the water and started walking home. - Sent 13: Their fur was wet, and the breeze chilled them. - Sent 14: When they got home, they dried off, and Jimmy put on his favorite purple shirt. - Sent 15: Joey put on a blue shirt with red and green dots. - Sent 16: The two squirrels ate some food that Joey’s mom, Jasmine, made and went off to bed. - """, - """ - Why was Joey surprised the morning he woke up for breakfast? - answer: There was only pie to eat. - paragraph: - Sent 1: Once upon a time, there was a squirrel named Joey. - Sent 2: Joey loved to go outside and play with his cousin Jimmy. - Sent 3: Joey and Jimmy played silly games together, and were always laughing. - Sent 4: One day, Joey and Jimmy went swimming together 50 at their Aunt Julie’s pond. - Sent 5: Joey woke up early in the morning to eat some food before they left. - Sent 6: He couldn’t find anything to eat except for pie! - Sent 7: Usually, Joey would eat cereal, fruit (a pear), or oatmeal for breakfast. - Sent 8: After he ate, he and Jimmy went to the pond. - Sent 9: On their way there they saw their friend Jack Rabbit. - Sent 10: They dove into the water and swam for several hours. - Sent 11: The sun was out, but the breeze was cold. - Sent 12: Joey and Jimmy got out of the water and started walking home. - Sent 13: Their fur was wet, and the breeze chilled them. - Sent 14: When they got home, they dried off, and Jimmy put on his favorite purple shirt. - Sent 15: Joey put on a blue shirt with red and green dots. - Sent 16: The two squirrels ate some food that Joey’s mom, Jasmine, made and went off to bed.""", - ] - - pipe = nlu.load("en.t5.base", verbose=True) - pipe["t5"].setTask("multirc question: ") - res = pipe.predict( - data, - output_level="document", - drop_irrelevant_cols=False, - metadata=True, - ) - print("TEST Task 11 : MultiRC - Question Answering ") - print(res.columns) - print(res[["T5", "document"]]) - - def test_task12_WiC(self): - data = [ - """ - sentence1: The airplane crash killed his family. - sentence2: He totally killed that rock show!. - word : kill - """, - """ - sentence1: The expanded window will give us time to catch the thieves. - sentence2: You have a two-hour window of turning in your homework. - word : window - """, - """ - sentence1: He jumped out the window. - sentence2: You have a two-hour window of turning in your homework. 
- word : window - """, - ] - - pipe = nlu.load("en.t5.base", verbose=True) - pipe["t5"].setTask("wic pos: ") # wic pos: - res = pipe.predict(data, output_level="document") - print("TEST Task 12 : WiC - Word sense disambiguation ") - print(res.columns) - print(res[["T5", "document"]]) - - def test_task13_WSC_DPR(self): - # todo - data = [ - "wsc: The stable was very roomy, with four good stalls; a large swinging window opened into the yard , which made *it* pleasant and airy.", - "wsc : The party was really crazy and a lot of people had fun until *it* ended.", - "wsc : The party was really crazy but the the car killed everybody, *it* was going so fast!.", - ] - - pipe = nlu.load("en.t5.base", verbose=True) - pipe["t5"].setTask("") # wsc: - res = pipe.predict(data, output_level="document") - print("TEST Task 13 : WSC - Coreference resolution ") - print(res.columns) - print(res[["T5", "document"]]) - - def test_task14_text_summarization(self): - - data = [ - "the belgian duo took to the dance floor on monday night with some friends . manchester united face newcastle in the premier league on wednesday . red devils will be looking for just their second league away win in seven . louis van gaal’s side currently sit two points clear of liverpool in fourth ." - ] - - pipe = nlu.load("en.t5.base", verbose=True) - pipe["t5"].setTask("summarize: ") # wsc: - res = pipe.predict(data, output_level="document") - print("TEST Task 14 : Summarization ") - print(res.columns) - print(res[["T5", "document"]]) - - def test_task15_SQuAD_question_answering(self): - - data = [ - "What does increased oxygen concentrations in the patient’s lungs displace? context: Hyperbaric (high-pressure) medicine uses special oxygen chambers to increase the partial pressure of O 2 around the patient and, when needed, the medical staff. Carbon monoxide poisoning, gas gangrene, and decompression sickness (the ’bends’) are sometimes treated using these devices. Increased O 2 concentration in the lungs helps to displace carbon monoxide from the heme group of hemoglobin. Oxygen gas is poisonous to the anaerobic bacteria that cause gas gangrene, so increasing its partial pressure helps kill them. Decompression sickness occurs in divers who decompress too quickly after a dive, resulting in bubbles of inert gas, mostly nitrogen and helium, forming in their blood. Increasing the pressure of O 2 as soon as possible is part of the treatment.", - "What did Joey eat for breakfast ? context : Once upon a time, there was a squirrel named Joey. Joey loved to go outside and play with his cousin Jimmy. Joey and Jimmy played silly games together, and were always laughing. One day, Joey and Jimmy went swimming together 50 at their Aunt Julie’s pond. Joey woke up early in the morning to eat some food before they left. for breakfast. After he ate, he and Jimmy went to the pond. On their way there they saw their friend Jack Rabbit. They dove into the water and swam for several hours. The sun was out, but the breeze was cold. Joey and Jimmy got out of the water and started walking home. Their fur was wet, and the breeze chilled them. When they got home, they dried off, and Jimmy put on his favorite purple shirt. Joey put on a blue shirt with red and green dots. 
The two squirrels ate some food that Joey’s mom, Jasmine, made and went off to bed,", - ] - - pipe = nlu.load("en.t5.base", verbose=True) - pipe["t5"].setTask("question: ") # wsc: - res = pipe.predict(data, output_level="document") - print("TEST Task 15 : SQuAD question answering ") - print(res.columns) - print(res[["T5", "document"]]) - - def test_pre_config_t5_summarize(self): - data = [ - "the belgian duo took to the dance floor on monday night with some friends . manchester united face newcastle in the premier league on wednesday . red devils will be looking for just their second league away win in seven . louis van gaal’s side currently sit two points clear of liverpool in fourth ." - ] - - pipe = nlu.load("en.t5.summarize", verbose=True) - - res = pipe.predict(data) - print(res.columns) - print(res[["T5", "document"]]) - - def test_pre_config_t5_summarize_alias(self): - data = [ - "the belgian duo took to the dance floor on monday night with some friends . manchester united face newcastle in the premier league on wednesday . red devils will be looking for just their second league away win in seven . louis van gaal’s side currently sit two points clear of liverpool in fourth ." - ] - - pipe = nlu.load("summarize", verbose=True) - - res = pipe.predict(data) - print(res.columns) - print(res[["T5", "document"]]) - pipe.print_info() - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/nlu_core_tests/component_tests/span_bert_coref_tests/__init__.py b/tests/nlu_core_tests/component_tests/span_bert_coref_tests/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/nlu_core_tests/component_tests/span_bert_coref_tests/span_bert_coref.py b/tests/nlu_core_tests/component_tests/span_bert_coref_tests/span_bert_coref.py deleted file mode 100644 index a4a64ffd..00000000 --- a/tests/nlu_core_tests/component_tests/span_bert_coref_tests/span_bert_coref.py +++ /dev/null @@ -1,16 +0,0 @@ -import unittest -import nlu - - -class SpanBertCorefCase(unittest.TestCase): - def test_coref_model(self): - data = 'John told Mary he would like to borrow a book' - p = nlu.load('en.coreference.spanbert') - res = p.predict(data) - - for c in res.columns: - print(res[c]) - - -if __name__ == '__main__': - unittest.main() diff --git a/tests/nlu_core_tests/component_tests/span_classifier_tests/albert_for_question_answering _tests.py b/tests/nlu_core_tests/component_tests/span_classifier_tests/albert_for_question_answering _tests.py deleted file mode 100644 index 89dcffa3..00000000 --- a/tests/nlu_core_tests/component_tests/span_classifier_tests/albert_for_question_answering _tests.py +++ /dev/null @@ -1,18 +0,0 @@ -import unittest -import nlu - - -class AlbertForQuestionAnsweringTestCase (unittest.TestCase): - def test_albert_for_question_answering(self): - pipe = nlu.load("en.answer_question.squadv2.albert.xxl.by_sultan", verbose=True) - data = "What is my name?|||My name is CKL" - df = pipe.predict( - data, - ) - for c in df.columns: - print(df[c]) - - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/nlu_core_tests/component_tests/span_classifier_tests/bert_for_question_answering.py b/tests/nlu_core_tests/component_tests/span_classifier_tests/bert_for_question_answering.py deleted file mode 100644 index 413bbf56..00000000 --- a/tests/nlu_core_tests/component_tests/span_classifier_tests/bert_for_question_answering.py +++ /dev/null @@ -1,18 +0,0 @@ -import unittest -import nlu - - -class BertForQuestionAnsweringTestCase (unittest.TestCase): - def 
test_bert_for_question_answering(self): - pipe = nlu.load("en.answer_question.squadv2.bert.base_cased.by_deepset", verbose=True) - data = "What is my name?|||My name is CKL" - df = pipe.predict( - data, - ) - for c in df.columns: - print(df[c]) - - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/nlu_core_tests/component_tests/span_classifier_tests/camembert_for_question_answering _tests.py b/tests/nlu_core_tests/component_tests/span_classifier_tests/camembert_for_question_answering _tests.py deleted file mode 100644 index 4544369c..00000000 --- a/tests/nlu_core_tests/component_tests/span_classifier_tests/camembert_for_question_answering _tests.py +++ /dev/null @@ -1,18 +0,0 @@ -import unittest -import nlu - - -class CamemBertForQuestionAnsweringTestCase (unittest.TestCase): - def test_albert_for_question_answering(self): - pipe = nlu.load("fr.answer_question.camembert.fquad", verbose=True) - data = "What is my name?|||My name is CKL" - df = pipe.predict( - data, - ) - for c in df.columns: - print(df[c]) - - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/nlu_core_tests/component_tests/span_classifier_tests/deberta_for_question_answering.py b/tests/nlu_core_tests/component_tests/span_classifier_tests/deberta_for_question_answering.py deleted file mode 100644 index d6b255ad..00000000 --- a/tests/nlu_core_tests/component_tests/span_classifier_tests/deberta_for_question_answering.py +++ /dev/null @@ -1,18 +0,0 @@ -import unittest -import nlu - - -class DeBertaForQuestionAnsweringTestCase (unittest.TestCase): - def test_deberta_for_question_answering(self): - pipe = nlu.load("en.answer_question.squadv2.deberta", verbose=True) - data = "What is my name?|||My name is CKL" - df = pipe.predict( - data, - ) - for c in df.columns: - print(df[c]) - - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/nlu_core_tests/component_tests/span_classifier_tests/distilbert_for_question_answering.py b/tests/nlu_core_tests/component_tests/span_classifier_tests/distilbert_for_question_answering.py deleted file mode 100644 index a086db76..00000000 --- a/tests/nlu_core_tests/component_tests/span_classifier_tests/distilbert_for_question_answering.py +++ /dev/null @@ -1,18 +0,0 @@ -import unittest -import nlu - - -class DistilBertForQuestionAnsweringTestCase (unittest.TestCase): - def test_distilbert_for_question_answering(self): - pipe = nlu.load("en.answer_question.squadv2.distil_bert.base_cased", verbose=True) - data = "What is my name?|||My name is CKL" - df = pipe.predict( - data, - ) - for c in df.columns: - print(df[c]) - - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/nlu_core_tests/component_tests/span_classifier_tests/longformer_for_question_answering.py b/tests/nlu_core_tests/component_tests/span_classifier_tests/longformer_for_question_answering.py deleted file mode 100644 index 990ef870..00000000 --- a/tests/nlu_core_tests/component_tests/span_classifier_tests/longformer_for_question_answering.py +++ /dev/null @@ -1,18 +0,0 @@ -import unittest -import nlu - - -class LongformerForQuestionAnsweringTestCase (unittest.TestCase): - def test_longformer_for_question_answering(self): - pipe = nlu.load("en.answer_question.squadv2.longformer.base", verbose=True) - data = "What is my name?|||My name is CKL" - df = pipe.predict( - data, - ) - for c in df.columns: - print(df[c]) - - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/nlu_core_tests/component_tests/span_classifier_tests/roberta_for_question_answering.py 
b/tests/nlu_core_tests/component_tests/span_classifier_tests/roberta_for_question_answering.py deleted file mode 100644 index d5c6e54c..00000000 --- a/tests/nlu_core_tests/component_tests/span_classifier_tests/roberta_for_question_answering.py +++ /dev/null @@ -1,18 +0,0 @@ -import unittest -import nlu - - -class RoBertaForQuestionAnsweringTestCase (unittest.TestCase): - def test_roberta_for_question_answering(self): - pipe = nlu.load("en.answer_question.squadv2.roberta.base.by_deepset", verbose=True) - data = "What is my name?|||My name is CKL" - df = pipe.predict( - data, - ) - for c in df.columns: - print(df[c]) - - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/nlu_core_tests/component_tests/span_classifier_tests/xlmroberta_for_question_answering.py b/tests/nlu_core_tests/component_tests/span_classifier_tests/xlmroberta_for_question_answering.py deleted file mode 100644 index 24374141..00000000 --- a/tests/nlu_core_tests/component_tests/span_classifier_tests/xlmroberta_for_question_answering.py +++ /dev/null @@ -1,18 +0,0 @@ -import unittest -import nlu - - -class XlmRoBertaForQuestionAnsweringTestCase (unittest.TestCase): - def test_xlmroberta_for_question_answering(self): - pipe = nlu.load("en.answer_question.squadv2.xlm_roberta.base", verbose=True) - data = "What is my name?|||My name is CKL" - df = pipe.predict( - data, - ) - for c in df.columns: - print(df[c]) - - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/nlu_core_tests/component_tests/span_classifier_tests/span_question_data_conversion_tests.py b/tests/nlu_core_tests/component_tests/span_question_data_conversion_tests.py similarity index 100% rename from tests/nlu_core_tests/component_tests/span_classifier_tests/span_question_data_conversion_tests.py rename to tests/nlu_core_tests/component_tests/span_question_data_conversion_tests.py diff --git a/tests/nlu_core_tests/component_tests/tokenizer_tests/word_segmenter_tests.py b/tests/nlu_core_tests/component_tests/tokenizer_tests/word_segmenter_tests.py deleted file mode 100644 index 4c741ecf..00000000 --- a/tests/nlu_core_tests/component_tests/tokenizer_tests/word_segmenter_tests.py +++ /dev/null @@ -1,32 +0,0 @@ -import unittest - -from nlu import * - - -class TestWordSegmenter(unittest.TestCase): - def test_word_segmenter(self): - pipe = nlu.load("zh.segment_words", verbose=True) - data = "您的生活就是矩阵编程固有的不平衡方程的剩余部分之和。您是异常的最终结果,尽管做出了我最大的努力,但我仍然无法消除数学精度的和谐。尽管仍然不遗余力地避免了负担,但这并不意外,因此也不超出控制范围。这无可避免地将您引向了这里。" - df = pipe.predict( - data, - output_level="token", - drop_irrelevant_cols=False, - metadata=True, - ) - for c in df.columns: - print(df[c]) - - pipe = nlu.load("zh.tokenize", verbose=True) - data = "您的生活就是矩阵编程固有的不平衡方程的剩余部分之和。您是异常的最终结果,尽管做出了我最大的努力,但我仍然无法消除数学精度的和谐。尽管仍然不遗余力地避免了负担,但这并不意外,因此也不超出控制范围。这无可避免地将您引向了这里。" - df = pipe.predict( - [data], - output_level="sentence", - drop_irrelevant_cols=False, - metadata=True, - ) - for c in df.columns: - print(df[c]) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/nlu_core_tests/component_tests/typed_dependency_tests/__init__.py b/tests/nlu_core_tests/component_tests/typed_dependency_tests/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/nlu_core_tests/component_tests/typed_dependency_tests/typed_dependency_tests_tests.py b/tests/nlu_core_tests/component_tests/typed_dependency_tests/typed_dependency_tests_tests.py deleted file mode 100644 index 18af210d..00000000 --- 
a/tests/nlu_core_tests/component_tests/typed_dependency_tests/typed_dependency_tests_tests.py +++ /dev/null @@ -1,35 +0,0 @@ -import unittest - -from nlu import * - - -class TestDepTyped(unittest.TestCase): - def test_dependency_typed_model(self): - # This test takes too much ram on standard github actions machine - return - df = nlu.load("dep.typed", verbose=True).predict( - "I love peanutbutter and jelly", - output_level="sentence", - drop_irrelevant_cols=False, - metadata=True, - ) - for c in df.columns: - print(df[c]) - - print("SENTENCE") - df = nlu.load("dep.typed", verbose=True).predict( - "I love peanutbutter and jelly", output_level="sentence" - ) - for c in df.columns: - print(df[c]) - - print("TOKEN") - df = nlu.load("dep.typed", verbose=True).predict( - "I love peanutbutter and jelly", output_level="token" - ) - for c in df.columns: - print(df[c]) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/nlu_core_tests/component_tests/untyped_dependency_tests/__init__.py b/tests/nlu_core_tests/component_tests/untyped_dependency_tests/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/nlu_core_tests/component_tests/untyped_dependency_tests/untyped_dependency_tests_tests.py b/tests/nlu_core_tests/component_tests/untyped_dependency_tests/untyped_dependency_tests_tests.py deleted file mode 100644 index a4380f76..00000000 --- a/tests/nlu_core_tests/component_tests/untyped_dependency_tests/untyped_dependency_tests_tests.py +++ /dev/null @@ -1,29 +0,0 @@ -import unittest - -from nlu import * - - -class TestDepUntyped(unittest.TestCase): - def test_dependency_untyped_model(self): - # This test takes too much ram on standard github actions machine - df = nlu.load("dep.untyped", verbose=True).predict( - "I love peanutbutter and jelly", output_level="document" - ) - for c in df.columns: - print(df[c]) - - df = nlu.load("dep.untyped", verbose=True).predict( - "I love peanutbutter and jelly", output_level="sentence" - ) - for c in df.columns: - print(df[c]) - - df = nlu.load("dep.untyped", verbose=True).predict( - "I love peanutbutter and jelly", output_level="token" - ) - for c in df.columns: - print(df[c]) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/nlu_core_tests/training_tests/trained_pipe_tests/model_load_tests.py b/tests/nlu_core_tests/model_load_tests.py similarity index 100% rename from tests/nlu_core_tests/training_tests/trained_pipe_tests/model_load_tests.py rename to tests/nlu_core_tests/model_load_tests.py diff --git a/tests/nlu_core_tests/training_tests/trained_pipe_tests/model_save_tests.py b/tests/nlu_core_tests/model_save_tests.py similarity index 100% rename from tests/nlu_core_tests/training_tests/trained_pipe_tests/model_save_tests.py rename to tests/nlu_core_tests/model_save_tests.py diff --git a/tests/nlu_core_tests/namespace_tests.py b/tests/nlu_core_tests/namespace_tests.py deleted file mode 100644 index dfc13883..00000000 --- a/tests/nlu_core_tests/namespace_tests.py +++ /dev/null @@ -1,388 +0,0 @@ -import unittest - -from nlu import * -from tests.test_utils import get_sample_pdf - - -class TestNameSpace(unittest.TestCase): - def test_tokenize(self): - df = nlu.load("en.tokenize").predict("What a wonderful day!") - - print(df) - - df = nlu.load("tokenize").predict("What a wonderful day!") - print(df) - - def test_pos(self): - - df = nlu.load("pos", verbose=True).predict("What a wonderful day!") - print(df) - - # - # def test_embed(self): - # # df = nlu.load('en.embed').predict('What a wonderful day!') 
- # # - # # print(df) - # - # df = nlu.load('embed').predict('What a wonderful day!') - # print(df) - # - # - # def test_embed_glove(self): - # df = nlu.load('en.embed.glove').predict('What a wonderful day!') - # - # print(df) - # - # df = nlu.load('embed.glove').predict('What a wonderful day!') - # print(df) - # df = nlu.load('glove').predict('What a wonderful day!') - # print(df) - # - - def test_sentiment_twitter_out(self): - # res=nlu.load('en.sentiment.twitter',verbose=True).predict('@elonmusk Tesla stock price is too high imo') # ifninite loop ?? - res = nlu.load("en.sentiment.imdb", verbose=True).predict( - "The Matrix was a pretty good movie" - ) - - print(res) - print(res.columns) - - def test_output_levels(self): - print("token test") - df = nlu.load("sentiment", verbose=True).predict( - "What a wonderful day!", output_level="token" - ) - print(df) - print("document test") - df = nlu.load("sentiment", verbose=True).predict( - "What a wonderful day!", output_level="document" - ) - print(df) - - print("sentence test") - df = nlu.load("sentiment", verbose=True).predict( - "What a wonderful day!", output_level="sentence" - ) - print(df) - - print("chunk test") - df = nlu.load("sentiment", verbose=True).predict( - "I like peanut butter and jelly!", output_level="chunk" - ) - print(df) - - def test_ner_multilingual(self): - df = nlu.load("ner", verbose=True).predict( - "New York is a great place and America aswell" - ) - - print(df) - - def test_sentiment(self): - df = nlu.load("en.sentiment").predict("What a wonderful day!") - - def test_emotion(self): - df = nlu.load("en.classify.emotion").predict("What a wonderful day!") - - print(df) - - def test_spell(self): - - df = nlu.load("spell").predict("What a wonderful day!") - print(df) - - # - def test_dependency(self): - df = nlu.load("dep", verbose=True).predict("What a wonderful day!") - print(df) - - def test_dependency_untyped(self): - df = nlu.load("dep.untyped", verbose=True).predict("What a wonderful day!") - - print(df) - - def test_bert(self): - df = nlu.load("bert").predict("What a wonderful day!") - - print(df) - - def test_lang(self): - df = nlu.load("lang", verbose=True).predict("What a wonderful day!") - print(df) - print(df.columns) - print(df["language_de"]) - print(df["language_fr"]) - print(len(df["language_de"][0])) - # df = nlu.load('xx.classify.lang').predict('What a wonderful day!') - # print(df) - # df = nlu.load('classify.lang').predict('What a wonderful day!') - # print(df) - # print(df) - - def test_explain(self): - df = nlu.load("en.explain").predict("What a wonderful day!") - print(df) - df = nlu.load("explain").predict("What a wonderful day!") - print(df) - - def test_match(self): - df = nlu.load("match.date", verbose=True).predict("What a wonderful day!") - print(df) - # df = nlu.load('en.match.date').predict('What a wonderful day!') - # print(df) - - def test_clean_stop(self): - # df = nlu.load('clean.stop').predict('What a wonderful day!') - # print(df) - df = nlu.load("en.clean.stop").predict("What a wonderful day!") - print(df) - - def test_spell(self): - df = nlu.load("spell").predict("What a wonderful day!") - - print(df) - - df = nlu.load("en.spell").predict("What a wonderful day!") - - print(df) - - # def test_all_spell(self): - # df = nlu.load('en.spell.symmetric').predict('What a wonderful day!') - # - # print(df) - # - # df = nlu.load('en.spell.context').predict('What a wonderful day!') - - # print(df) - # df = nlu.load('en.spell.norvig').predict('What a wonderful day!') - # - # print(df) - # df 
= nlu.load('spell').predict('What a wonderful day!') - # - # print(df) - # - # df = nlu.load('en.spell').predict('What a wonderful day!') - # - # print(df) - - # def test_biobert(self): - # df = nlu.load('biobert').predict('What a wonderful day!') - # - # print(df) - # - # df = nlu.load('en.embed.biobert').predict('What a wonderful day!') - # print(df) - # - # def test_elmo(self): - # df = nlu.load('en.embed.elmo').predict('What a wonderful day!') - # print(df) - # df = nlu.load('elmo').predict('What a wonderful day!') - # print(df) - # - # def test_use(self): - # df = nlu.load('en.embed.use').predict('What a wonderful day!') - # - # print(df) - # - # df = nlu.load('use').predict('What a wonderful day!') - # print(df) - # - # def test_albert(self): - # df = nlu.load('en.embed.albert').predict('What a wonderful day!') - # - # print(df) - # - # df = nlu.load('albert').predict('What a wonderful day!') - # print(df) - # - # def test_xlnet(self): - # df = nlu.load('en.embed.xlnet').predict('What a wonderful day!') - # - # print(df) - # - # df = nlu.load('xlnet').predict('What a wonderful day!') - # print(df) - - def test_lemma(self): - df = nlu.load("lemma").predict("What a wonderful day!") - - print(df) - df = nlu.load("en.lemma").predict("What a wonderful day!") - - print(df) - - # def test_norm(self): - # df = nlu.load('lemma').predict('What a wonderful day!') - # - # print(df) - # df = nlu.load('en.lemma').predict('What a wonderful day!') - # - # print(df) - # - # def test_use(self): - # df = nlu.load('en.embed_sentence.use').predict('What a wonderful day!') - # print(df) - # - # def test_glove(self): - # df = nlu.load('nl.ner.wikiner.glove_6B_300').predict('What a wonderful day!') - # - # print(df) - - def test_sentence_detector(self): - df = nlu.load("sentence_detector", verbose=True).predict( - "What a wonderful day! Tomorrow will be even better!" - ) - - print(df) - - def test_stopwords(self): - df = nlu.load("match.chunk").predict("What a wonderful day!") - print(df) - - def test_classify_lang(self): - df = nlu.load("xx.classify.wiki_7").predict("What a wonderful day!") - print(df) - - def test_sentiment_on_datasets(self): - df = nlu.load("sentiment.twitter").predict("What a wonderful day!") - print(df) - # df = nlu.load('sentiment.imdb').predict('What a wonderful day!') - # print(df) - - def test_multiple_nlu_references(self): - # df = nlu.load('elmo bert').predict('What a wonderful day!') - df = nlu.load("elmo").predict("What a wonderful day!") - - print(df) - # df = nlu.load('sentiment.imdb').predict('What a wonderful day!') - # print(df) - - def test_sentiment_output(self): - res = nlu.load("sentiment", verbose=True).predict( - "Your life is the sum of a remainder of an unbalanced equation inherent to the programming of the matrix. You are the eventuality of an anomaly, which despite my sincerest efforts I have been unable to eliminate from what is otherwise a harmony of mathematical precision. While it remains a burden assiduously avoided, it is not unexpected, and thus not beyond a measure of control. Which has led you, inexorably, here.", - output_level="sentence", - ) - # res = nlu.load('bert',verbose=True).predict('@Your life is the sum of a remainder of an unbalanced equation inherent to the programming of the matrix. You are the eventuality of an anomaly, which despite my sincerest efforts I have been unable to eliminate from what is otherwise a harmony of mathematical precision. 
While it remains a burden assiduously avoided, it is not unexpected, and thus not beyond a measure of control. Which has led you, inexorably, here.', output_level='sentence') - - print(res) - print(res["sentiment"]) - - print(res.dtypes) - - def test_stem(self): - pdf = get_sample_pdf() - res = nlu.load("stem", verbose=True).predict(pdf) - print(res) - res = nlu.load("en.stem", verbose=True).predict(pdf) - print(res) - - def test_norm(self): - pdf = get_sample_pdf() - res = nlu.load("norm", verbose=True).predict(pdf, output_positions=True) - print(res) - # res = nlu.load('en.norm',verbose=True).predict(pdf) - # print(res) - - def test_chunk(self): - res = nlu.load("chunk", verbose=True).predict("I like peanut butter and jelly!") - print(res) - - def test_ngram(self): - pdf = get_sample_pdf() - # res = nlu.load('ngram',verbose=True).predict(pdf ) - pipe = nlu.load("ngram", verbose=True) - # print(res['ngrams']) - print("PIPE", pipe) - res = nlu.load("en.ngram", verbose=True).predict(pdf) - print(res["ngrams"]) - - def test_chunk_embeds(self): - pdf = get_sample_pdf() - res = nlu.load("embed_chunk", verbose=True).predict("What a wondful day!") - print(res) - res = nlu.load("en.embed_chunk", verbose=True).predict(pdf) - print(res) - - def test_regex_matcher(self): - pdf = get_sample_pdf() - res = nlu.load("match.regex", verbose=True).predict(pdf) - print(res) - - def test_text_matcher(self): - pdf = get_sample_pdf() - res = nlu.load("match.text", verbose=True).predict(pdf) - print(res) - - def test_auto_sentence_embed_bert(self): # TODO WIP - pdf = get_sample_pdf() - res = nlu.load("embed_sentence.bert", verbose=True).predict(pdf) - print(res) - - def test_auto_sentence_embed_elmo(self): # TODO WIP - pdf = get_sample_pdf() - res = nlu.load("embed_sentence.elmo", verbose=True).predict(pdf) - print(res) - - # def test_bad_pandas_column_datatype(self): - # sdf = get_sample_spark_dataframe() - # res = nlu.load('asdasj.asdas',verbose=True).predict(sdf, output_level='sentence') - # # res = nlu.load('bert',verbose=True).predict('@Your life is the sum of a remainder of an unbalanced equation inherent to the programming of the matrix. You are the eventuality of an anomaly, which despite my sincerest efforts I have been unable to eliminate from what is otherwise a harmony of mathematical precision. While it remains a burden assiduously avoided, it is not unexpected, and thus not beyond a measure of control. Which has led you, inexorably, here.', output_level='sentence') - # - # print(res) - # - # def test_bad_pandas_dataframe_datatype(self): - # sdf = get_sample_spark_dataframe() - # res = nlu.load('asdasj.asdas',verbose=True).predict(sdf, output_level='sentence') - # # res = nlu.load('bert',verbose=True).predict('@Your life is the sum of a remainder of an unbalanced equation inherent to the programming of the matrix. You are the eventuality of an anomaly, which despite my sincerest efforts I have been unable to eliminate from what is otherwise a harmony of mathematical precision. While it remains a burden assiduously avoided, it is not unexpected, and thus not beyond a measure of control. 
Which has led you, inexorably, here.', output_level='sentence') - # - # print(res) - - # 2.6 test - - def test_electra(self): - pdf = get_sample_pdf() - res = nlu.load("en.embed.electra", verbose=True).predict(pdf) - print(res) - - def test_embed_sentence_bert(self): - pdf = get_sample_pdf() - res = nlu.load("en.embed_sentence.small_bert_L2_128", verbose=True).predict(pdf) - print(res) - - def test_embed_sentence_bert(self): - pdf = get_sample_pdf() - res = nlu.load( - "en.embed_sentence.biobert.pubmed_base_cased", verbose=True - ).predict(pdf) - print(res) - - def test_toxic(self): - pdf = get_sample_pdf() - res = nlu.load("en.classify.toxic", verbose=True).predict(pdf) - print(res) - - def test_e2e(self): - pdf = get_sample_pdf() - res = nlu.load("en.classify.e2e", verbose=True).predict(pdf) - print(res) - - def test_labse(self): - pdf = get_sample_pdf() - res = nlu.load("xx.embed_sentence.labse", verbose=True).predict(pdf) - print(res) - - def test_xx_bert(self): - pdf = get_sample_pdf() - res = nlu.load("xx.embed_sentence", verbose=True).predict(pdf) - print(res) - - def test_26_bert(self): - res = nlu.load("en.ner.bert", verbose=True).predict( - "The NLU library is a machine learning library, simmilar to Tensorflow and Keras" - ) - print(res) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/pandas/pandas_tests.py b/tests/nlu_core_tests/pandas_tests.py similarity index 100% rename from tests/pandas/pandas_tests.py rename to tests/nlu_core_tests/pandas_tests.py diff --git a/tests/nlu_core_tests/pipeline_tests/__init__.py b/tests/nlu_core_tests/pipeline_tests/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/nlu_core_tests/pipeline_tests/simple_pretrained_pipe_tests.py b/tests/nlu_core_tests/pipeline_tests/simple_pretrained_pipe_tests.py deleted file mode 100644 index a039f4e5..00000000 --- a/tests/nlu_core_tests/pipeline_tests/simple_pretrained_pipe_tests.py +++ /dev/null @@ -1,24 +0,0 @@ -import unittest - -from nlu import * - - -class PretrainedPipeTests(unittest.TestCase): - def simple_pretrained_pipe_tests(self): - df = nlu.load("ner.onto", verbose=True).predict("I love peanutbutter and jelly") - for c in df.columns: - print(df[c]) - - # def test_offline_load_pipe(self): - # pipe_path ='/home/ckl/cache_pretrained/analyze_sentimentdl_use_imdb_en_2.7.1_2.4_1610723836151' - # df = nlu.load(path = pipe_path,verbose=True).predict('I love peanutbutter and jelly') - # for c in df.columns: print(df[c]) - # def test_offline_load_model(self): - # model_path ='/home/ckl/cache_pretrained/stopwords_hi_hi_2.5.4_2.4_1594742439035' - # model_path = '/home/ckl/cache_pretrained/bert_token_classifier_ner_ud_gsd_ja_3.2.2_3.0_1631279615344' - # df = nlu.load(path = model_path,verbose=True).predict('I love peanutbutter and jelly') - # for c in df.columns: print(df[c]) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/nlu_core_tests/component_tests/__init__.py b/tests/nlu_core_tests/training_tests/__init__.py similarity index 100% rename from tests/nlu_core_tests/component_tests/__init__.py rename to tests/nlu_core_tests/training_tests/__init__.py diff --git a/tests/nlu_core_tests/training_tests/classifiers/classifier_dl_tests.py b/tests/nlu_core_tests/training_tests/classifier_dl_tests.py similarity index 100% rename from tests/nlu_core_tests/training_tests/classifiers/classifier_dl_tests.py rename to tests/nlu_core_tests/training_tests/classifier_dl_tests.py diff --git a/tests/nlu_core_tests/training_tests/classifiers/__init__.py 
b/tests/nlu_core_tests/training_tests/classifiers/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/nlu_core_tests/training_tests/dependency/__init__.py b/tests/nlu_core_tests/training_tests/dependency/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/nlu_core_tests/training_tests/classifiers/multi_classifier_dl_tests.py b/tests/nlu_core_tests/training_tests/multi_classifier_dl_tests.py similarity index 100% rename from tests/nlu_core_tests/training_tests/classifiers/multi_classifier_dl_tests.py rename to tests/nlu_core_tests/training_tests/multi_classifier_dl_tests.py diff --git a/tests/nlu_core_tests/training_tests/classifiers/ner_tests.py b/tests/nlu_core_tests/training_tests/ner_tests.py similarity index 100% rename from tests/nlu_core_tests/training_tests/classifiers/ner_tests.py rename to tests/nlu_core_tests/training_tests/ner_tests.py diff --git a/tests/nlu_core_tests/training_tests/classifiers/pos_tests.py b/tests/nlu_core_tests/training_tests/pos_tests.py similarity index 100% rename from tests/nlu_core_tests/training_tests/classifiers/pos_tests.py rename to tests/nlu_core_tests/training_tests/pos_tests.py diff --git a/tests/nlu_hc_tests/training_tests/sentence_resolution/sentence_resolver_tests.py b/tests/nlu_core_tests/training_tests/sentence_resolver_tests.py similarity index 100% rename from tests/nlu_hc_tests/training_tests/sentence_resolution/sentence_resolver_tests.py rename to tests/nlu_core_tests/training_tests/sentence_resolver_tests.py diff --git a/tests/nlu_core_tests/training_tests/classifiers/sentiment_dl_tests.py b/tests/nlu_core_tests/training_tests/sentiment_dl_tests.py similarity index 100% rename from tests/nlu_core_tests/training_tests/classifiers/sentiment_dl_tests.py rename to tests/nlu_core_tests/training_tests/sentiment_dl_tests.py diff --git a/tests/nlu_core_tests/training_tests/spell_checkers/__init__.py b/tests/nlu_core_tests/training_tests/spell_checkers/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/nlu_viz_tests/viz_tests.py b/tests/nlu_core_tests/viz_tests.py similarity index 100% rename from tests/nlu_viz_tests/viz_tests.py rename to tests/nlu_core_tests/viz_tests.py diff --git a/tests/nlu_hc_tests/__init__.py b/tests/nlu_hc_tests/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/nlu_hc_tests/component_tests/__init__.py b/tests/nlu_hc_tests/component_tests/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/nlu_hc_tests/component_tests/assertion_dl/assertion_tests.py b/tests/nlu_hc_tests/component_tests/assertion_dl/assertion_tests.py deleted file mode 100644 index 063fec4b..00000000 --- a/tests/nlu_hc_tests/component_tests/assertion_dl/assertion_tests.py +++ /dev/null @@ -1,41 +0,0 @@ -import unittest - -import nlu -import tests.secrets as sct - - -class AssertionTests(unittest.TestCase): - def test_assertion_dl_model(self): - SPARK_NLP_LICENSE = sct.SPARK_NLP_LICENSE - AWS_ACCESS_KEY_ID = sct.AWS_ACCESS_KEY_ID - AWS_SECRET_ACCESS_KEY = sct.AWS_SECRET_ACCESS_KEY - JSL_SECRET = sct.JSL_SECRET - nlu.auth( - SPARK_NLP_LICENSE, AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, JSL_SECRET - ) - - # data = 'Patient has a headache for the last 2 weeks and appears anxious when she walks fast. No alopecia noted. She denies pain' - # data = """Miss M. is a 67-year-old lady, with past history of COPD and Hypertension, presents with a 3-weeks history of a lump in her right Breast. 
The lump appeared suddenly, also painful. 5 days ago, another lump appeared in her right axilla. On examination a 2 x 3 cm swelling was seen in the right Breast. It was firm and also non-tender and immobile. There was no discharge. Another 1x1 cm circumferential swelling was found in the right Axilla, which was freely mobile and also tender. Her family history is remarkable for Breast cancer (mother), cervical cancer (maternal grandmother), heart disease (father), COPD (Brother), dementia (Grandfather), diabetes (Grandfather), and CHF (Grandfather).""" - # res = nlu.load('en.assert.healthcare', verbose=True).predict(data, metadata=True) # .predict(data) - data = ( - "Miss M. is a 67-year-old lady, with past history of COPD and Hypertension, " - "presents with a 3-weeks history of a lump in her right Breast. " - "The lump appeared suddenly, also painful. 5 days ago, another lump appeared in her right axilla." - " On examination a 2 x 3 cm swelling was seen in the right Breast." - " It was firm and also non-tender and immobile. There was no discharge. " - "Another 1x1 cm circumferential swelling was found in the right Axilla, " - "which was freely mobile and also tender." - " Her family history is remarkable for Breast cancer (mother), " - "cervical cancer (maternal grandmother), heart disease (father), " - "COPD (Brother), dementia (Grandfather), diabetes (Grandfather), and CHF (Grandfather)." - ) - - res = nlu.load("en.assert.biobert", verbose=True).predict(data, metadata=True) - print(res.columns) - for c in res: - print(res[c]) - print(res) - - -if __name__ == "__main__": - AssertionTests().test_assertion_dl_model() diff --git a/tests/nlu_hc_tests/component_tests/chunkmapper/__init__.py b/tests/nlu_hc_tests/component_tests/chunkmapper/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/nlu_hc_tests/component_tests/de_identification/de_identification_tests.py b/tests/nlu_hc_tests/component_tests/de_identification/de_identification_tests.py index fcef8094..e69de29b 100644 --- a/tests/nlu_hc_tests/component_tests/de_identification/de_identification_tests.py +++ b/tests/nlu_hc_tests/component_tests/de_identification/de_identification_tests.py @@ -1,37 +0,0 @@ -import os -import sys -import unittest -sys.path.append(os.getcwd()) - -import nlu - -os.environ["PYTHONPATH"] = "F:/Work/repos/nlu" -os.environ['PYSPARK_PYTHON'] = sys.executable -os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable -from johnsnowlabs import nlp, visual -# nlp.settings.enforce_versions=False -# nlp.install(json_license_path='license.json',visual=True) -spark = nlp.start() -class DeidentificationTests(unittest.TestCase): - def test_deidentification(self): - # b = BertSentenceEmbeddings.pretrained('sbiobert_base_cased_mli','en','clinical/models') - # m = RelationExtractionModel().pretrained("posology_re") - # - # res = nlu.load('en.ner.deid.augmented en.de_identify', verbose=True).predict('DR Johnson administerd to the patient Peter Parker last week 30 MG of penicilin', return_spark_df=True) - - res = nlu.load("en.de_identify").predict( - "DR Johnson administerd to the patient Peter Parker last week 30 MG of penicilin", - drop_irrelevant_cols=False, - metadata=True, - ) - # res = nlu.load('zh.segment_words pos', verbose=True)#.predict('DR Johnson administerd to the patient Peter Parker last week 30 MG of penicilin', return_spark_df=True) - - for c in res: - print(c) - print(res[c]) - - # print(res) - - -if __name__ == "__main__": - DeidentificationTests().test_deidentification() diff --git 
a/tests/nlu_hc_tests/component_tests/drug_normalizer/drug_normalizer_test.py b/tests/nlu_hc_tests/component_tests/drug_normalizer/drug_normalizer_test.py deleted file mode 100644 index 844ec818..00000000 --- a/tests/nlu_hc_tests/component_tests/drug_normalizer/drug_normalizer_test.py +++ /dev/null @@ -1,37 +0,0 @@ -import unittest - -import nlu -import tests.secrets as sct - - -class DrugNormalizerTests(unittest.TestCase): - def test_drug_normalizer(self): - - SPARK_NLP_LICENSE = sct.SPARK_NLP_LICENSE - AWS_ACCESS_KEY_ID = sct.AWS_ACCESS_KEY_ID - AWS_SECRET_ACCESS_KEY = sct.AWS_SECRET_ACCESS_KEY - JSL_SECRET = sct.JSL_SECRET - nlu.auth( - SPARK_NLP_LICENSE, AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, JSL_SECRET - ) - - data = [ - "Agnogenic one half cup", - "adalimumab 54.5 + 43.2 gm", - "aspirin 10 meq/ 5 ml oral sol", - "interferon alfa-2b 10 million unit ( 1 ml ) injec", - "Sodium Chloride/Potassium Chloride 13bag", - ] - res = nlu.load("norm_drugs").predict( - data, output_level="document" - ) # .predict(data) - - print(res.columns) - for c in res: - print(res[c]) - - print(res) - - -if __name__ == "__main__": - DrugNormalizerTests().test_drug_normalizer() diff --git a/tests/nlu_hc_tests/component_tests/generic_classifier/generic_classifier_tests.py b/tests/nlu_hc_tests/component_tests/generic_classifier/generic_classifier_tests.py index bb4f82ea..e69de29b 100644 --- a/tests/nlu_hc_tests/component_tests/generic_classifier/generic_classifier_tests.py +++ b/tests/nlu_hc_tests/component_tests/generic_classifier/generic_classifier_tests.py @@ -1,39 +0,0 @@ -import os -import sys -import unittest -sys.path.append(os.getcwd()) - -import nlu - -os.environ["PYTHONPATH"] = "F:/Work/repos/nlu" -os.environ['PYSPARK_PYTHON'] = sys.executable -os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable -from johnsnowlabs import nlp, visual -# nlp.settings.enforce_versions=False -# nlp.install(json_license_path='license.json',visual=True) -nlp.start(visual=True) - -class DeidentificationTests(unittest.TestCase): - def test_generic_classifier(self): - - res = nlu.load("bert elmo", verbose=True).predict( - "DR Johnson administered to the patient Peter Parker last week 30 MG of penicillin" - ) - - # elmo_embeddings and bert_embeddings are what should be passed to the feature assembler / generic classifier - - # res.show() - # for os_components in res.columns: - # print(os_components) - # res.select(os_components).show(truncate=False) - # res = nlu.load('en.extract_relation', verbose=True).predict('The patient got cancer in my foot and damage in his brain') - - for c in res: - print(c) - print(res[c]) - - # print(res) - - -if __name__ == "__main__": - DeidentificationTests().test_generic_classifier() diff --git a/tests/nlu_hc_tests/component_tests/licensed_classifier/licensed_classifier_tests.py b/tests/nlu_hc_tests/component_tests/licensed_classifier/licensed_classifier_tests.py deleted file mode 100644 index 7dc6b0ff..00000000 --- a/tests/nlu_hc_tests/component_tests/licensed_classifier/licensed_classifier_tests.py +++ /dev/null @@ -1,34 +0,0 @@ -import unittest - -import nlu -import tests.secrets as sct - - -class LicensedClassifierTests(unittest.TestCase): - def test_LicensedClassifier(self): - - SPARK_NLP_LICENSE = sct.SPARK_NLP_LICENSE - AWS_ACCESS_KEY_ID = sct.AWS_ACCESS_KEY_ID - AWS_SECRET_ACCESS_KEY = sct.AWS_SECRET_ACCESS_KEY - JSL_SECRET = sct.JSL_SECRET - - nlu.auth( - SPARK_NLP_LICENSE, AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, JSL_SECRET ) - # b = 
BertSentenceEmbeddings.pretrained('sbiobert_base_cased_mli','en','clinical/models') - # m = RelationExtractionModel().pretrained("posology_re") - # - # res = nlu.load('en.ner.deid.augmented en.de_identify', verbose=True).predict('DR Johnson administerd to the patient Peter Parker last week 30 MG of penicilin', return_spark_df=True) - - res = nlu.load("en.classify.ade.conversational", verbose=True).predict( - "DR Johnson administerd to the patient Peter Parker last week 30 MG of penicilin" - ) - - print(res) - for c in res: - print(c) - print(res[c]) - - -if __name__ == "__main__": - LicensedClassifierTests().test_LicensedClassifier() diff --git a/tests/nlu_hc_tests/component_tests/pipe_tests/__init__.py b/tests/nlu_hc_tests/component_tests/pipe_tests/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/nlu_hc_tests/component_tests/pipe_tests/multi_ner_tests.py b/tests/nlu_hc_tests/component_tests/pipe_tests/multi_ner_tests.py deleted file mode 100644 index d4bcd5b1..00000000 --- a/tests/nlu_hc_tests/component_tests/pipe_tests/multi_ner_tests.py +++ /dev/null @@ -1,35 +0,0 @@ -import unittest - -import nlu -import tests.secrets as sct - - -class MultiNerTests(unittest.TestCase): - def test_multi_ner_pipe(self): - - SPARK_NLP_LICENSE = sct.SPARK_NLP_LICENSE - AWS_ACCESS_KEY_ID = sct.AWS_ACCESS_KEY_ID - AWS_SECRET_ACCESS_KEY = sct.AWS_SECRET_ACCESS_KEY - JSL_SECRET = sct.JSL_SECRET - nlu.auth( - SPARK_NLP_LICENSE, AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, JSL_SECRET - ) - # res = nlu.load('en.ner.diseases en.resolve_chunk.snomed.findings', verbose=True).predict(['The patient has cancer and high fever and will die next week.', ' She had a seizure.'], drop_irrelevant_cols=False, metadata=True, ) - - data = [ - "The patient has cancer and high fever and will die next week.", - " She had a seizure.", - ] - res = nlu.load( - "en.med_ner.tumour en.med_ner.radiology en.med_ner.diseases en.ner.onto ", - verbose=True, - ).predict(data) - - for c in res: - print(res[c]) - - print(res) - - -if __name__ == "__main__": - MultiNerTests().test_entities_config() diff --git a/tests/nlu_hc_tests/component_tests/pipe_tests/pretrained_pipe_tests.py b/tests/nlu_hc_tests/component_tests/pipe_tests/pretrained_pipe_tests.py deleted file mode 100644 index 696197e4..00000000 --- a/tests/nlu_hc_tests/component_tests/pipe_tests/pretrained_pipe_tests.py +++ /dev/null @@ -1,30 +0,0 @@ -import unittest - -import nlu -import tests.secrets as sct - - -class TestPretrainedPipe(unittest.TestCase): - def test_pretrained_pipe(self): - - SPARK_NLP_LICENSE = sct.SPARK_NLP_LICENSE - AWS_ACCESS_KEY_ID = sct.AWS_ACCESS_KEY_ID - AWS_SECRET_ACCESS_KEY = sct.AWS_SECRET_ACCESS_KEY - JSL_SECRET = sct.JSL_SECRET - nlu.auth( - SPARK_NLP_LICENSE, AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, JSL_SECRET - ) - data = [ - "The patient has cancer and high fever and will die next week.", - " She had a seizure.", - ] - res = nlu.load("en.explain_doc.era", verbose=True).predict(data) - - for c in res: - print(res[c]) - - print(res) - - -if __name__ == "__main__": - TestPretrainedPipe().test_pretrained_pipe() diff --git a/tests/nlu_hc_tests/component_tests/relation_extraction/relation_extraction_tests.py b/tests/nlu_hc_tests/component_tests/relation_extraction/relation_extraction_tests.py deleted file mode 100644 index 0901c11c..00000000 --- a/tests/nlu_hc_tests/component_tests/relation_extraction/relation_extraction_tests.py +++ /dev/null @@ -1,36 +0,0 @@ -import unittest - -import nlu -import tests.secrets as sct - - -class 
RelationExtractionTests(unittest.TestCase): - def test_relation_extraction(self): - - SPARK_NLP_LICENSE = sct.SPARK_NLP_LICENSE - AWS_ACCESS_KEY_ID = sct.AWS_ACCESS_KEY_ID - AWS_SECRET_ACCESS_KEY = sct.AWS_SECRET_ACCESS_KEY - JSL_SECRET = sct.JSL_SECRET - - nlu.auth( - SPARK_NLP_LICENSE, AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, JSL_SECRET - ) - # res = nlu.load('en.ner.posology en.extract_relation.drug_drug_interaction', verbose=True).predict('The patient got cancer in my foot and damage in his brain but we gave him 50G of and 50mg Penicilin and this helped is brain injury after 6 hours. 1 Hour after the penicilin, 3mg Morphium was administred which had no problems with the Penicilin', return_spark_df=True) - s1 = "The patient was prescribed 1 unit of Advil for 5 days after meals. The patient was also given 1 unit of Metformin daily. He was seen by the endocrinology service and she was discharged on 40 units of insulin glargine at night , 12 units of insulin lispro with meals , and metformin 1000 mg two times a day." - data = [s1] - # res = nlu.load('med_ner.posology relation.drug_drug_interaction', verbose=True).predict(data, drop_irrelevant_cols=False, metadata=True, ) - res = nlu.load("relation.drug_drug_interaction", verbose=True).predict( - data, - drop_irrelevant_cols=False, - metadata=True, - ) - - for c in res: - print(c) - print(res[c]) - - # print(res) - - -if __name__ == "__main__": - RelationExtractionTests().test_entities_config() diff --git a/tests/nlu_hc_tests/component_tests/sentence_entity_resolver/sentence_entity_resolution_tests.py b/tests/nlu_hc_tests/component_tests/sentence_entity_resolver/sentence_entity_resolution_tests.py deleted file mode 100644 index 7618b9c9..00000000 --- a/tests/nlu_hc_tests/component_tests/sentence_entity_resolver/sentence_entity_resolution_tests.py +++ /dev/null @@ -1,42 +0,0 @@ -import unittest - -import nlu -import tests.secrets as sct - - -class SentenceResolutionTests(unittest.TestCase): - def test_assertion_dl_model(self): - - SPARK_NLP_LICENSE = sct.SPARK_NLP_LICENSE - AWS_ACCESS_KEY_ID = sct.AWS_ACCESS_KEY_ID - AWS_SECRET_ACCESS_KEY = sct.AWS_SECRET_ACCESS_KEY - JSL_SECRET = sct.JSL_SECRET - nlu.auth( - SPARK_NLP_LICENSE, AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, JSL_SECRET - ) - # b = BertSentenceEmbeddings.pretrained('sbiobert_base_cased_mli','en','clinical/models') - - # todo en.ner.ade Error not accessable in 2.7.6?? - s1 = "The patient has COVID. He got very sick with it." - s2 = "Peter got the Corona Virus!" - s3 = "COVID 21 has been diagnosed on the patient" - data = [s1, s2, s3] - # en.resolve_sentence.icd10cm - resolver_ref = "en.resolve.icd10cm.augmented_billable" - resolver_ref = 'en.resolve.umls' - res = nlu.load(f"en.med_ner.diseases {resolver_ref}", verbose=True).predict( - data, drop_irrelevant_cols=False, metadata=True, - positions=True, - ) - - # res = nlu.load('en.ner.anatomy', verbose=True).predict(['The patient has cancer and a tumor and high fever and will die next week. 
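The licensed healthcare tests removed in this block share one setup: authenticate via nlu.auth with the credentials from tests/secrets, then load a clinical spell, optionally chaining an NER model with a resolver. A condensed sketch of that flow, reusing the spells, flags, and sample sentences from the removed sentence-resolution test above:

import nlu
import tests.secrets as sct

# Authenticate against the licensed healthcare models, then chain a clinical
# NER spell with a sentence entity resolver, as in the removed test above.
nlu.auth(sct.SPARK_NLP_LICENSE, sct.AWS_ACCESS_KEY_ID, sct.AWS_SECRET_ACCESS_KEY, sct.JSL_SECRET)
pipe = nlu.load("en.med_ner.diseases en.resolve.umls", verbose=True)
res = pipe.predict(
    ["The patient has COVID. He got very sick with it.", "Peter got the Corona Virus!"],
    drop_irrelevant_cols=False,
    metadata=True,
    positions=True,
)
for c in res:
    print(c)
    print(res[c])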
He has pain in his left food and right upper brain', ' She had a seizure.'], drop_irrelevant_cols=False, metadata=True) - print(res.columns) - for c in res: - print(c) - print(res[c]) - - print(res) - - -if __name__ == "__main__": - SentenceResolutionTests().test_entities_config() diff --git a/tests/nlu_hc_tests/component_tests/summarizer_tests.py b/tests/nlu_hc_tests/component_tests/summarizer_tests.py deleted file mode 100644 index edcb847e..00000000 --- a/tests/nlu_hc_tests/component_tests/summarizer_tests.py +++ /dev/null @@ -1,49 +0,0 @@ -import os -import sys -import unittest - -import nlu -import tests.secrets as sct - -os.environ['PYSPARK_PYTHON'] = sys.executable -os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable - -summarizer_spells = [ - 'en.summarize.clinical_jsl', - 'en.summarize.clinical_jsl_augmented', - 'en.summarize.biomedical_pubmed', - 'en.summarize.generic_jsl', - 'en.summarize.clinical_questions', - 'en.summarize.radiology', - 'en.summarize.clinical_guidelines_large', - 'en.summarize.clinical_laymen', -] - - -class MedicalSummarizerTests(unittest.TestCase): - def test_medical_summarizer(self): - SPARK_NLP_LICENSE = sct.SPARK_NLP_LICENSE - AWS_ACCESS_KEY_ID = sct.AWS_ACCESS_KEY_ID - AWS_SECRET_ACCESS_KEY = sct.AWS_SECRET_ACCESS_KEY - JSL_SECRET = sct.JSL_SECRET - nlu.auth( - SPARK_NLP_LICENSE, AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, JSL_SECRET - ) - # b = BertSentenceEmbeddings.pretrained('sbiobert_base_cased_mli','en','clinical/models') - - - for s in summarizer_spells: - pipe = nlu.load(s) - # Configure relations to extract - print("TESTING: ", s) - df = pipe.predict("Paracetamol can alleviate headache or sickness. An MRI test can be used to find cancer.") - print(df.columns) - for c in df: - print(c) - print(df[c]) - - print(df) - - -if __name__ == "__main__": - MedicalSummarizerTests().test_medical_summarizer() diff --git a/tests/nlu_hc_tests/component_tests/text_generator_tests.py b/tests/nlu_hc_tests/component_tests/text_generator_tests.py deleted file mode 100644 index 5c7166f0..00000000 --- a/tests/nlu_hc_tests/component_tests/text_generator_tests.py +++ /dev/null @@ -1,42 +0,0 @@ -import os -import sys -import unittest - -import nlu -import secrets as sct - -os.environ['PYSPARK_PYTHON'] = sys.executable -os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable - -text_generator_spells = [ - 'en.generate.biomedical_biogpt_base', - 'en.generate.biogpt_chat_jsl_conversational' -] - - -class MedicalTextGeneratorTests(unittest.TestCase): - def test_medical_text_generator(self): - SPARK_NLP_LICENSE = sct.SPARK_NLP_LICENSE - AWS_ACCESS_KEY_ID = sct.AWS_ACCESS_KEY_ID - AWS_SECRET_ACCESS_KEY = sct.AWS_SECRET_ACCESS_KEY - JSL_SECRET = sct.JSL_SECRET - nlu.auth( - SPARK_NLP_LICENSE, AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, JSL_SECRET - ) - - - for s in text_generator_spells: - pipe = nlu.load(s) - # Configure relations to extract - print("TESTING: ", s) - df = pipe.predict("Covid 19 is",output_level='chunk') - print(df.columns) - for c in df: - print(c) - print(df[c]) - - print(df) - - -if __name__ == "__main__": - MedicalTextGeneratorTests().test_medical_text_generator() diff --git a/tests/nlu_hc_tests/component_tests/zero_shot_ner_tests.py b/tests/nlu_hc_tests/component_tests/zero_shot_ner_tests.py deleted file mode 100644 index ed5e2cc7..00000000 --- a/tests/nlu_hc_tests/component_tests/zero_shot_ner_tests.py +++ /dev/null @@ -1,67 +0,0 @@ -import unittest - -from johnsnowlabs import nlp - - -import nlu -import tests.secrets as sct - - -class 
ZeroShotNerTests(unittest.TestCase): - def test_zero_shot_relation_model(self): - SPARK_NLP_LICENSE = sct.SPARK_NLP_LICENSE - AWS_ACCESS_KEY_ID = sct.AWS_ACCESS_KEY_ID - AWS_SECRET_ACCESS_KEY = sct.AWS_SECRET_ACCESS_KEY - JSL_SECRET = sct.JSL_SECRET - nlu.auth( - SPARK_NLP_LICENSE, AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, JSL_SECRET - ) - - pipe = nlu.load('en.zero_shot.ner_roberta') - print(pipe) - pipe['zero_shot_ner'].setEntityDefinitions( - { - "PROBLEM": [ - "What is the disease?", - "What is his symptom?", - "What is her disease?", - "What is his disease?", - "What is the problem?", - "What does a patient suffer", - "What was the reason that the patient is admitted to the clinic?", - ], - "DRUG": [ - "Which drug?", - "Which is the drug?", - "What is the drug?", - "Which drug does he use?", - "Which drug does she use?", - "Which drug do I use?", - "Which drug is prescribed for a symptom?", - ], - "ADMISSION_DATE": ["When did patient admitted to a clinic?"], - "PATIENT_AGE": [ - "How old is the patient?", - "What is the gae of the patient?", - ], - } - ) - - df = pipe.predict( - [ - "The doctor pescribed Majezik for my severe headache.", - "The patient was admitted to the hospital for his colon cancer.", - "27 years old patient was admitted to clinic on Sep 1st by Dr. X for a right-sided pleural effusion for thoracentesis.", - ] - ) - # Configure relationsz to extract - print(df.columns) - for c in df: - print(c) - print(df[c]) - - print(df) - - -if __name__ == "__main__": - ZeroShotNerTests().test_zero_shot_relation_model() diff --git a/tests/nlu_hc_tests/component_tests/zero_shot_relation_tests.py b/tests/nlu_hc_tests/component_tests/zero_shot_relation_tests.py deleted file mode 100644 index 1ca012f6..00000000 --- a/tests/nlu_hc_tests/component_tests/zero_shot_relation_tests.py +++ /dev/null @@ -1,36 +0,0 @@ -import unittest - -import nlu -import tests.secrets as sct - - -class ZeroShotRelationTests(unittest.TestCase): - def test_zero_shot_relation_model(self): - - SPARK_NLP_LICENSE = sct.SPARK_NLP_LICENSE - AWS_ACCESS_KEY_ID = sct.AWS_ACCESS_KEY_ID - AWS_SECRET_ACCESS_KEY = sct.AWS_SECRET_ACCESS_KEY - JSL_SECRET = sct.JSL_SECRET - nlu.auth( - SPARK_NLP_LICENSE, AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, JSL_SECRET - ) - # b = BertSentenceEmbeddings.pretrained('sbiobert_base_cased_mli','en','clinical/models') - - pipe = nlu.load('med_ner.clinical relation.zeroshot_biobert') - # Configure relations to extract - pipe['zero_shot_relation_extraction'].setRelationalCategories({ - "CURE": ["{{TREATMENT}} cures {{PROBLEM}}."], - "IMPROVE": ["{{TREATMENT}} improves {{PROBLEM}}.", "{{TREATMENT}} cures {{PROBLEM}}."], - "REVEAL": ["{{TEST}} reveals {{PROBLEM}}."]}).setMultiLabel(False) - df = pipe.predict("Paracetamol can alleviate headache or sickness. An MRI test can be used to find cancer.") - # res = nlu.load('en.ner.anatomy', verbose=True).predict(['The patient has cancer and a tumor and high fever and will die next week. 
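Both zero-shot tests removed above configure the loaded pipeline's annotator directly by component name before predicting. A condensed sketch of the relation-extraction variant; the spell, relation templates, and sample text are the ones from the removed test, and a prior nlu.auth call with valid healthcare credentials is assumed:

import nlu

# Zero-shot relation extraction: define candidate relations as natural-language
# templates on the pipeline component, then predict.
# Assumes nlu.auth(...) has already been called, as in the removed tests.
pipe = nlu.load("med_ner.clinical relation.zeroshot_biobert")
pipe["zero_shot_relation_extraction"].setRelationalCategories({
    "CURE": ["{{TREATMENT}} cures {{PROBLEM}}."],
    "IMPROVE": ["{{TREATMENT}} improves {{PROBLEM}}.", "{{TREATMENT}} cures {{PROBLEM}}."],
    "REVEAL": ["{{TEST}} reveals {{PROBLEM}}."],
}).setMultiLabel(False)
df = pipe.predict(
    "Paracetamol can alleviate headache or sickness. An MRI test can be used to find cancer."
)
print(df.columns)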
He has pain in his left food and right upper brain', ' She had a seizure.'], drop_irrelevant_cols=False, metadata=True) - print(df.columns) - for c in df: - print(c) - print(df[c]) - - print(df) - - -if __name__ == "__main__": - ZeroShotRelationTests().test_zero_shot_relation_model() diff --git a/tests/nlu_hc_tests/verification_tests.py b/tests/nlu_hc_tests/verification_tests.py deleted file mode 100644 index 42c474de..00000000 --- a/tests/nlu_hc_tests/verification_tests.py +++ /dev/null @@ -1,58 +0,0 @@ -import json -import os -import unittest - -import nlu -import tests.secrets as sct - - -class TestAuthentification(unittest.TestCase): - def test_auth(self): - SPARK_NLP_LICENSE = sct.SPARK_NLP_LICENSE - AWS_ACCESS_KEY_ID = sct.AWS_ACCESS_KEY_ID - AWS_SECRET_ACCESS_KEY = sct.AWS_SECRET_ACCESS_KEY - # JSL_SECRET = sct.JSL_SECRET_3_4_2 - JSL_SECRET = sct.JSL_SECRET - res = ( - nlu.auth( - SPARK_NLP_LICENSE, AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, JSL_SECRET - ) - .load("en.med_ner.diseases") - .predict("He has cancer") - ) - for c in res.columns: - print(res[c]) - - def test_auth_miss_match(self): - SPARK_NLP_LICENSE = "wrong_license" - AWS_ACCESS_KEY_ID = sct.AWS_ACCESS_KEY_ID - AWS_SECRET_ACCESS_KEY = sct.AWS_SECRET_ACCESS_KEY - # JSL_SECRET = sct.JSL_SECRET_3_4_2 - JSL_SECRET = sct.JSL_SECRET - res = ( - nlu.auth( - SPARK_NLP_LICENSE, AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, JSL_SECRET - ) - .load("en.med_ner.diseases") - .predict("He has cancer") - ) - for c in res.columns: - print(res[c]) - - def test_auth_via_file(self): - secrets_json_path = os.path.join(os.path.abspath("./"), "license.json") - print("license path:", secrets_json_path) - with open(secrets_json_path, "w", encoding="utf8") as file: - json.dump(sct.license_dict, file) - res = ( - nlu.auth(secrets_json_path) - .load("en.med_ner.diseases", verbose=True) - .predict("He has cancer") - ) - # res = nlu.load('en.med_ner.diseases',verbose=True).predict("He has cancer") - for c in res.columns: - print(res[c]) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/nlu_ocr_tests/__init__.py b/tests/nlu_ocr_tests/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/nlu_ocr_tests/convnext_image_classifier.py b/tests/nlu_ocr_tests/convnext_image_classifier.py deleted file mode 100644 index f1803f98..00000000 --- a/tests/nlu_ocr_tests/convnext_image_classifier.py +++ /dev/null @@ -1,12 +0,0 @@ -import unittest -import nlu -class ConvNextOcrTest(unittest.TestCase): - def test_img_classification(self): - img_path = 'tests\\datasets\\ocr\\images\\teapot.jpg' - p = nlu.load('en.classify_image.convnext.tiny',verbose=True) - dfs = p.predict(img_path) - print(dfs['classified_image_results']) - -if __name__ == '__main__': - unittest.main() - diff --git a/tests/nlu_ocr_tests/ocr_pdf_builder_tests.py b/tests/nlu_ocr_tests/ocr_pdf_builder_tests.py deleted file mode 100644 index 9e5db019..00000000 --- a/tests/nlu_ocr_tests/ocr_pdf_builder_tests.py +++ /dev/null @@ -1,37 +0,0 @@ -import tests.secrets as sct -import unittest -import nlu - -SPARK_NLP_LICENSE = sct.SPARK_NLP_LICENSE -AWS_ACCESS_KEY_ID = sct.AWS_ACCESS_KEY_ID -AWS_SECRET_ACCESS_KEY = sct.AWS_SECRET_ACCESS_KEY -JSL_SECRET = sct.JSL_SECRET -OCR_SECRET = sct.OCR_SECRET -OCR_LICENSE = sct.OCR_LICENSE -# nlu.auth(SPARK_NLP_LICENSE,AWS_ACCESS_KEY_ID,AWS_SECRET_ACCESS_KEY,JSL_SECRET, OCR_LICENSE, OCR_SECRET) - -class OcrTest(unittest.TestCase): - - def test_text_to_pdf(self): - 
nlu.auth(SPARK_NLP_LICENSE,AWS_ACCESS_KEY_ID,AWS_SECRET_ACCESS_KEY,JSL_SECRET, OCR_LICENSE, OCR_SECRET) - # the text we generate a PDF from has to come from an image struct! - # We need to convert the text to an image struct first! - - p = nlu.load('ppt2table',verbose=True) - dfs = p.predict([f1,f2]) - for df in dfs : - print(df) - - def test_DOC_table_extraction(self): - nlu.auth(SPARK_NLP_LICENSE,AWS_ACCESS_KEY_ID,AWS_SECRET_ACCESS_KEY,JSL_SECRET, OCR_LICENSE, OCR_SECRET) - f1 = '/home/ckl/Documents/freelance/jsl/nlu/nlu4realgit2/tests/datasets/ocr/table_DOCX/doc2.docx' - p = nlu.load('doc2table',verbose=True) - dfs = p.predict([f1]) - for df in dfs : - print(df) - - - -if __name__ == '__main__': - unittest.main() - diff --git a/tests/nlu_ocr_tests/ocr_table_extraction_tests.py b/tests/nlu_ocr_tests/ocr_table_extraction_tests.py deleted file mode 100644 index 4bd86178..00000000 --- a/tests/nlu_ocr_tests/ocr_table_extraction_tests.py +++ /dev/null @@ -1,54 +0,0 @@ -import os -import sys - -sys.path.append(os.getcwd()) -import unittest -import nlu -nlu.auth(sct.SPARK_NLP_LICENSE,sct.AWS_ACCESS_KEY_ID,sct.AWS_SECRET_ACCESS_KEY,sct.JSL_SECRET, sct.OCR_LICENSE, sct.OCR_SECRET) -# nlu.auth(SPARK_NLP_LICENSE,AWS_ACCESS_KEY_ID,AWS_SECRET_ACCESS_KEY,JSL_SECRET, OCR_LICENSE, OCR_SECRET) - -class OcrTest(unittest.TestCase): - - def test_PDF_table_extraction(self): - """ - 1. PdfToTextTable - 2. DocToTextTable - 3. PptToTextTable - 4.1 ImageTableDetector --> Find Locations of Tables - 4.2 ImageTableCellDetector ---> Find Location of CELLS on the table - 4.3 ImageCellsToTextTable ----> Find TEXT inside of the Cells on the table - - """ """ - What's the difference between the DocToTextTable transformer and - using ImageTableDetector + ImageTableCellDetector + ImageCellsToTextTable? - The first is pragmatic and the second one is DL based. - When to use which annotator? - ---> for NON-SELECTABLE text: ImageTableDetector + ImageTableCellDetector + ImageCellsToTextTable - ---> for text which is selectable: DocToTextTable. 
- """ - img_path = 'tests/datasets/ocr/table_pdf_highlightable_text/data.pdf' - p = nlu.load('pdf2table',verbose=True) - dfs = p.predict(img_path) - for df in dfs : - print(df) - - def test_PPT_table_extraction(self): - f1 = 'tests/datasets/ocr/table_PPT/54111.ppt' - f2 ='tests/datasets/ocr/table_PPT/mytable.ppt' - p = nlu.load('ppt2table',verbose=True) - dfs = p.predict([f1 ]) - for df in dfs : - print(df) - - def test_DOC_table_extraction(self): - f1 = 'tests/datasets/ocr/docx_with_table/doc2.docx' - p = nlu.load('doc2table',verbose=True) - dfs = p.predict([f1]) - for df in dfs : - print(df) - - -if __name__ == '__main__': - unittest.main() - diff --git a/tests/nlu_ocr_tests/ocr_visual_doc_classifier_tests.py b/tests/nlu_ocr_tests/ocr_visual_doc_classifier_tests.py deleted file mode 100644 index ec25389a..00000000 --- a/tests/nlu_ocr_tests/ocr_visual_doc_classifier_tests.py +++ /dev/null @@ -1,34 +0,0 @@ - -import os -import sys - -sys.path.append(os.getcwd()) -import unittest -import nlu - -os.environ["PYTHONPATH"] = "F:/Work/repos/nlu" -os.environ['PYSPARK_PYTHON'] = sys.executable -os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable -from johnsnowlabs import nlp, visual - -# nlp.install(json_license_path='license.json',visual=True) -nlp.start(visual=True) - -# print('hi') -class OcrTest(unittest.TestCase): - - def test_classify_document(self): - # nlu.auth(SPARK_NLP_LICENSE,AWS_ACCESS_KEY_ID,AWS_SECRET_ACCESS_KEY,JSL_SECRET, OCR_LICENSE, OCR_SECRET) - # text that we generate PDF to has to come from an image struct! - # We need convert text to img struct! - p = nlu.load('en.classify_image.tabacco',verbose=True) - res = p.predict('cv_test.png') - for i,j in res.iterrows(): - print(i,j) - print(res) - # for r in res.columns: - # print(r[res]) - -if __name__ == '__main__': - unittest.main() - diff --git a/tests/nlu_viz_tests/__init__.py b/tests/nlu_viz_tests/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/release_tests/341_release/341_tests.py b/tests/release_tests/341_release/341_tests.py deleted file mode 100644 index 171f7912..00000000 --- a/tests/release_tests/341_release/341_tests.py +++ /dev/null @@ -1,108 +0,0 @@ -import sys -import unittest - -import nlu - - -class Test341(unittest.TestCase): - def test_341_models(self): - te = [ - "en.embed.longformer.clinical", - "en.classify.emotion.bert", - "en.classify.typos.distilbert", - "de.classify.news_sentiment.bert", - "xx.embed.albert.indic", - "xx.ner.masakhaner", - "fr.embed.word2vec_wiki_1000", - "fr.embed.word2vec_wac_200", - "fr.embed.w2v_cc_300d", - "vi.embed.distilbert.cased", - ] - - fails = [] - for t in te: - try: - print(f"Testing spell = {t}") - pipe = nlu.load(t, verbose=True) - df = pipe.predict( - ["Peter love pancaces. 
I hate Mondays", "I love Fridays"] - ) - for c in df.columns: - print(df[c]) - except Exception as err: - print(f"Failure for spell = {t} ", err) - e = sys.exc_info() - print(e[0]) - print(e[1]) - fails.append(t) - - fail_string = "\n".join(fails) - print(f"Done testing, failures = {fail_string}") - if len(fails) > 0: - raise Exception("Not all new spells completed successfully") - - def test_341_HC_models(self): - import tests.secrets as sct - - SPARK_NLP_LICENSE = sct.SPARK_NLP_LICENSE - AWS_ACCESS_KEY_ID = sct.AWS_ACCESS_KEY_ID - AWS_SECRET_ACCESS_KEY = sct.AWS_SECRET_ACCESS_KEY - JSL_SECRET = sct.JSL_SECRET - nlu.auth( - SPARK_NLP_LICENSE, AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, JSL_SECRET - ) - te = [ - "en.med_ner.supplement_clinical", - "en.resolve.rxnorm.augmented_re", - "en.classify.gender.seq_biobert", - "es.embed.sciwiki_300d", - "en.classify.ade.seq_biobert", - "en.classify.pico.seq_biobert", - "en.classify.ade.seq_distilbert", - "es.med_ner.deid.generic", - "es.med_ner.deid.subentity", - "en.relation.temporal_events_clinical", - "en.relation.adverse_drug_events.clinical", - "en.relation.adverse_drug_events.clinical.biobert", - ] - sample_texts = [ - """ - Antonio Pérez Juan, nacido en Cadiz, España. Aún no estaba vacunado, se infectó con Covid-19 el dia 14/03/2020 y tuvo que ir al Hospital. Fue tratado con anticuerpos monoclonales en la Clinica San Carlos.. - """, - """ - Datos del paciente. Nombre: Jose . Apellidos: Aranda Martinez. NHC: 2748903. NASS: 26 37482910. - """, - """The patient was given metformin 500 mg, 2.5 mg of coumadin and then ibuprofen""", - """he patient was given metformin 400 mg, coumadin 5 mg, coumadin, amlodipine 10 MG""", - """To compare the results of recording enamel opacities using the TF and modified DDE indices.""", - """I felt a bit drowsy and had blurred vision after taking Aspirin.""", - ] - fails = [] - succs = [] - for t in te: - try: - print(f"Testing spell = {t}") - pipe = nlu.load(t, verbose=True) - df = pipe.predict( - sample_texts, - drop_irrelevant_cols=False, - metadata=True, - ) - print(df.columns) - for c in df.columns: - print(df[c]) - succs.append(t) - except Exception as err: - print(f"Failure for spell = {t} ", err) - fails.append(t) - break - fail_string = "\n".join(fails) - succ_string = "\n".join(succs) - print(f"Done testing, failures = {fail_string}") - print(f"Done testing, successes = {succ_string}") - if len(fails) > 0: - raise Exception("Not all new spells completed successfully") - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/release_tests/341_release/__init__.py b/tests/release_tests/341_release/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/release_tests/342_release/342_tests.py b/tests/release_tests/342_release/342_tests.py deleted file mode 100644 index 313f0e1a..00000000 --- a/tests/release_tests/342_release/342_tests.py +++ /dev/null @@ -1,126 +0,0 @@ -import sys -import unittest - -import nlu - - -class Test341(unittest.TestCase): - def test_341_models(self): - te = [ - "en.embed.deberta_v3_xsmall", - "en.embed.deberta_v3_small", - "en.embed.deberta_v3_base", - "en.embed.deberta_v3_large", - "xx.embed.mdeberta_v3_bas", - ] - - fails = [] - for t in te: - try: - print(f"Testing spell = {t}") - pipe = nlu.load(t, verbose=True) - df = pipe.predict( - ["Peter love pancaces. 
I hate Mondays", "I love Fridays"] - ) - for c in df.columns: - print(df[c]) - except Exception as err: - print(f"Failure for spell = {t} ", err) - e = sys.exc_info() - print(e[0]) - print(e[1]) - fails.append(t) - - fail_string = "\n".join(fails) - print(f"Done testing, failures = {fail_string}") - if len(fails) > 0: - raise Exception("Not all new spells completed successfully") - - def test_341_HC_models(self): - import tests.secrets as sct - - SPARK_NLP_LICENSE = sct.SPARK_NLP_LICENSE - AWS_ACCESS_KEY_ID = sct.AWS_ACCESS_KEY_ID - AWS_SECRET_ACCESS_KEY = sct.AWS_SECRET_ACCESS_KEY - JSL_SECRET = sct.JSL_SECRET - nlu.auth( - SPARK_NLP_LICENSE, AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, JSL_SECRET - ) - te = [ - "en.med_ner.clinical_trials", - "es.med_ner.deid.generic.roberta", - "es.med_ner.deid.subentity.roberta", - "en.med_ner.deid.generic_augmented", - "en.med_ner.deid.subentity_augmented", - ] - sample_texts = [ - """ - Antonio Pérez Juan, nacido en Cadiz, España. Aún no estaba vacunado, se infectó con Covid-19 el dia 14/03/2020 y tuvo que ir al Hospital. Fue tratado con anticuerpos monoclonales en la Clinica San Carlos.. - """, - """ - Datos del paciente. Nombre: Jose . Apellidos: Aranda Martinez. NHC: 2748903. NASS: 26 37482910. - """, - """The patient was given metformin 500 mg, 2.5 mg of coumadin and then ibuprofen""", - """he patient was given metformin 400 mg, coumadin 5 mg, coumadin, amlodipine 10 MG""", - """To compare the results of recording enamel opacities using the TF and modified DDE indices.""", - """I felt a bit drowsy and had blurred vision after taking Aspirin.""", - ] - fails = [] - succs = [] - for t in te: - try: - print(f"Testing spell = {t}") - pipe = nlu.load(t, verbose=True) - df = pipe.predict( - sample_texts, - drop_irrelevant_cols=False, - metadata=True, - ) - print(df.columns) - for c in df.columns: - print(df[c]) - succs.append(t) - except Exception as err: - print(f"Failure for spell = {t} ", err) - fails.append(t) - break - fail_string = "\n".join(fails) - succ_string = "\n".join(succs) - print(f"Done testing, failures = {fail_string}") - print(f"Done testing, successes = {succ_string}") - if len(fails) > 0: - raise Exception("Not all new spells completed successfully") - - def test_quick(self): - import tests.secrets as sct - - SPARK_NLP_LICENSE = sct.SPARK_NLP_LICENSE - AWS_ACCESS_KEY_ID = sct.AWS_ACCESS_KEY_ID - AWS_SECRET_ACCESS_KEY = sct.AWS_SECRET_ACCESS_KEY - JSL_SECRET = sct.JSL_SECRET - nlu.auth( - SPARK_NLP_LICENSE, AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, JSL_SECRET - ) - te = [ - "sentiment", - ] - sample_texts = ["""Billy loves Soda. 
Sarah said so"""] - succs = [] - for t in te: - print(f"Testing spell = {t}") - pipe = nlu.load(t, verbose=True) - df = pipe.predict( - sample_texts, - drop_irrelevant_cols=False, - metadata=True, - output_level="document", - ) - - print(df.columns) - for c in df.columns: - print(df[c]) - succs.append(t) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/release_tests/release_344/__init__.py b/tests/release_tests/release_344/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/release_tests/release_344/tests_344.py b/tests/release_tests/release_344/tests_344.py deleted file mode 100644 index 0b24ea49..00000000 --- a/tests/release_tests/release_344/tests_344.py +++ /dev/null @@ -1,741 +0,0 @@ -import unittest -import nlu -import sys - - -class Test344(unittest.TestCase): - - def test_344_models(self): - import pandas as pd - te = [ - # 'en.ner.debertav3_large.conll03', - # 'en.ner.debertav3_base.conll03', - # 'en.ner.debertav3_small.conll03', - # 'en.ner.debertav3_xsmall.conll03', - # 'en.ner.debertav3_large.ontonotes', - # 'en.ner.debertav3_base.ontonotes', - # 'en.ner.debertav3_small.ontonotes', - # 'en.ner.debertav3_xsmall.ontonotes', - # 'fr.embed.camembert_large', - # 'fr.embed.camembert_base', - # 'fr.embed.camembert_ccnet4g', - # 'fr.embed.camembert_base_ccnet', - # 'fr.embed.camembert_oscar_4g', - # 'fr.embed.camembert_wiki_4g', - # 'fr.embed.albert', - # 'fr.embed.distilbert', - # 'mr.embed.distilbert', - # 'mr.embed.albert', - # 'id.embed.distilbert', - # 'ar.embed.distilbert', - # 'ar.embed.albert', - # 'fa.embed.albert', - # 'mr.embed.albert', - # 'jv.embed.distilbert', - # 'ms.embed.albert', - # 'ms.embed.distilbert', - # - # - # - # 'en.med_ner.biomedical_bc2gm', # OK? - # 'en.resolve.rxnorm_action_treatment', # OK? 
- # 'pt.med_ner.deid.subentity', - # 'pt.med_ner.deid.generic', - # 'pt.med_ner.deid', - # BATCH 2 - # 'bn.embed.indic_transformers_bn_distilbert' , - # 'de.embed.distilbert_base_de_cased' , - # 'de.embed.distilbert_base_german_cased' , - # 'de.embed.albert_german_ner' , - # 'en.embed.albert_xlarge_v1' , - # 'en.embed.albert_base_v1' , - # 'en.embed.albert_xxlarge_v1' , - # 'en.embed.distilbert_base_en_cased' , - # 'en.embed.distilbert_base_uncased_sparse_90_unstructured_pruneofa' , - # 'en.embed.distilbert_base_uncased_sparse_85_unstructured_pruneofa' , - # 'es.embed.distilbert_base_es_multilingual_cased' , - # 'es.embed.distilbert_base_es_cased' , - # 'fa.embed.distilbert_fa_zwnj_base' , - # 'hi.embed.distilbert_base_hi_cased' , - # 'hi.embed.indic_transformers_hi_distilbert' , - # 'it.embed.distilbert_base_it_cased' , - # 'it.embed.BERTino' , - # 'ja.embed.distilbert_base_ja_cased' , - # 'ja.embed.albert_base_japanese_v1' , - # 'nl.embed.distilbert_base_cased' , - # 'pl.embed.distilbert_base_cased' , - # 'ro.embed.distilbert_base_cased' , - # 'ro.embed.ALR_BERT' , - # 'th.embed.distilbert_base_cased' , - # 'tr.embed.distilbert_base_cased' , - # 'uk.embed.distilbert_base_cased' , - # 'ur.embed.distilbert_base_cased' , - # 'zh.embed.distilbert_base_cased' , - # 'ar.embed.albert_xlarge_arabic' , - # 'ar.embed.albert_large_arabic' , - # 'fa.embed.albert_fa_zwnj_base_v2' , - # 'mr.embed.marathi_albert_v2' , - # 'ms.embed.albert_tiny_bahasa_cased' , - # 'ms.embed.albert_base_bahasa_cased' , - # 'pt.embed.distilbert_base_cased' , - # 'jv.embed.javanese_distilbert_small_imdb' , - # 'ru.embed.distilbert_base_cased' , - - # 'mr.embed.albert_v2', - - # 'en.classify.questionpair', - # 'en.classify.question_vs_statement', - # 'en.classify.song_lyrics', - # 'af.embed.w2v_cc_300d', - # 'af.stopwords', - # 'als.embed.w2v_cc_300d', - # 'am.embed.w2v_cc_300d', - # 'am.embed.am_roberta', - # 'am.stopwords', - # 'an.embed.w2v_cc_300d', - # 'ar.pos.arabic_camelbert_msa_pos_msa', - # 'ar.pos.arabic_camelbert_mix_pos_egy', - # 'ar.pos.arabic_camelbert_da_pos_glf', - # 'ar.pos.arabic_camelbert_ca_pos_glf', - # 'ar.pos.arabic_camelbert_msa_pos_egy', - # 'ar.pos.arabic_camelbert_ca_pos_egy', - # 'ar.pos.arabic_camelbert_msa_pos_glf', - # 'ar.pos.arabic_camelbert_mix_pos_glf', - # 'ar.pos.arabic_camelbert_da_pos_egy', - # 'ar.stopwords', - # 'ar.embed.multi_dialect_bert_base_arabic', - # 'ar.ner.arabic_camelbert_da_ner', - # 'ar.ner.arabic_camelbert_mix_ner', - # 'ar.pos', - # 'ar.ner.multilingual_cased_ner_hrl', - # 'ar.ner.arabic_camelbert_msa_ner', - # 'ar.ner.ANER', - # 'ar.ner.arabert_ner', - # 'ar.lemma', - # 'ar.pos.arabic_camelbert_mix_pos_msa', - # 'ar.embed.mbert_ar_c19', - # 'ar.embed.bert_base_arabic_camelbert_msa_half', - # 'ar.embed.bert_large_arabertv02', - # 'ar.embed.AraBertMo_base_V1', - # 'ar.embed.DarijaBERT', - # 'ar.embed.bert_base_arabertv02', - # 'ar.embed.arabert_c19', - # 'ar.embed.bert_base_arabic_camelbert_msa', - # 'ar.embed.bert_base_arabertv2', - # 'ar.embed.bert_base_arabic', - # 'ar.embed.Ara_DialectBERT', - # 'ar.embed.MARBERT', - # 'ar.embed.bert_base_arabic_camelbert_msa_eighth', - # 'ar.embed.MARBERTv2', - # 'ar.embed.bert_large_arabertv2', - # 'ar.embed.bert_base_arabert', - # 'ar.embed.bert_base_arabertv01', - # 'ar.embed.bert_mini_arabic', - # 'ar.embed.bert_large_arabic', - # 'ar.embed.bert_large_arabertv02_twitter', - # 'ar.embed.dziribert', - # 'ar.embed.bert_base_arabertv02_twitter', - # 'ar.embed.bert_medium_arabic', - # 'ar.pos.arabic_camelbert_da_pos_msa', - # 
'ar.embed.bert_base_qarib', - # 'ar.embed.bert_base_qarib60_860k', - # 'ar.embed.bert_base_qarib60_1790k', - # 'ar.embed.bert_base_arabic_camelbert_msa_sixteenth', - # 'ar.embed.bert_base_arabic_camelbert_mix', - # 'ar.embed.bert_base_arabic_camelbert_msa_quarter', - # 'arz.embed.w2v_cc_300d', - # 'as.embed.w2v_cc_300d', - # 'ast.embed.w2v_cc_300d', - # 'az.embed.w2v_cc_300d', - # 'az.stopwords', - # 'ba.embed.w2v_cc_300d', - # 'bar.embed.w2v_cc_300d', - # 'bcl.embed.w2v_cc_300d', - # 'be.embed.w2v_cc_300d', - # 'be.lemma', - # 'bg.embed.w2v_cc_300d', - # 'bg.stopwords', - # 'bh.embed.w2v_cc_300d', - # 'bn.embed.w2v_cc_300d', - # 'bn.embed.indic_transformers_bn_bert', - # 'bn.embed.muril_adapted_local', - # 'bn.embed.bangla_bert', - # 'bn.stopwords', - # 'bpy.embed.w2v_cc_300d', - # 'br.embed.w2v_cc_300d', - # 'ca.lemma', - # 'ca.embed.w2v_cc_300d', - # 'ca.stopwords', - # 'ce.embed.w2v_cc_300d', - # 'ceb.embed.w2v_cc_300d', - # 'co.embed.w2v_cc_300d', - # 'cop.pos', - # 'cs.stopwords', - # 'cs.embed.w2v_cc_300d', - # 'cs.pos', - # 'cs.lemma', - # 'cs.lemma', - # 'cs.lemma', - # 'cu.pos', - # 'cv.embed.w2v_cc_300d', - # 'da.embed.w2v_cc_300d', - # 'da.lemma', - # 'da.stopwords', - # 'de.embed.bert_base_historical_german_rw_cased', - # 'de.embed.gbert_base', - # 'de.embed.german_financial_statements_bert', - # 'de.stopwords', - # 'de.lemma', - # 'de.embed.bert_base_german_dbmdz_uncased', - # 'de.embed.roberta_base_wechsel_german', - # 'de.embed.gbert_large', - # 'de.embed.bert_base_5lang_cased', - # 'de.embed.bert_base_german_cased_oldvocab', - # 'de.embed.bert_base_de_cased', - # 'de.embed.bert_base_german_uncased', - # 'de.embed.bert_base_german_dbmdz_cased', - # 'dv.embed.w2v_cc_300d', - # 'el.stopwords', - # 'eml.embed.w2v_cc_300d', - # 'en.embed.muppet_roberta_base', - # 'en.embed.muppet_roberta_large', - # 'en.embed.fairlex_ecthr_minilm', - # 'en.embed.distilroberta_base_finetuned_jira_qt_issue_titles_and_bodies', - # 'en.embed.legal_roberta_base', - # 'en.embed.distilroberta_base', - # 'en.embed.pmc_med_bio_mlm_roberta_large', - # 'en.lemma', - # 'en.lemma', - # 'en.lemma', - # 'en.embed.roberta_pubmed', - # 'en.embed.fairlex_scotus_minilm', - # 'en.embed.distilroberta_base_finetuned_jira_qt_issue_title', - # 'en.embed.chEMBL26_smiles_v2', - # 'en.embed.SecRoBERTa', - # 'en.embed.distilroberta_base_climate_d_s', - # 'en.embed.chEMBL_smiles_v1', - # 'en.embed.distilroberta_base_climate_f', - # 'en.embed.distilroberta_base_climate_d', - # 'en.embed.Bible_roberta_base', - # 'en.embed.w2v_cc_300d', - # 'en.pos', - # 'en.ner.ner_chemical_bionlp_bc5cdr_pubmed', - # 'en.pos.roberta_large_english_upos', - # 'en.ner.roberta_ticker', - # 'en.embed.bert_political_election2020_twitter_mlm', - # 'en.embed.bert_base_uncased_mnli_sparse_70_unstructured_no_classifier', - # 'en.embed.crosloengual_bert', - # 'en.embed.chemical_bert_uncased', - # 'en.embed.deberta_base_uncased', - # 'en.embed.bert_base_en_cased', - # 'en.embed.bert_for_patents', - # 'en.embed.SecBERT', - # 'en.embed.bert_base_5lang_cased', - # 'en.embed.DiLBERT', - # 'en.embed.FinancialBERT', - # 'en.embed.false_positives_scancode_bert_base_uncased_L8_1', - # 'en.embed.legal_bert_small_uncased', - # 'en.embed.legal_bert_base_uncased', - # 'en.embed.COVID_SciBERT', - # 'en.embed.e', - # 'en.embed.danbert_small_cased', - # 'en.embed.bert_base_uncased_dstc9', - # 'en.embed.hateBERT', - # 'en.embed.childes_bert', - # 'en.embed.clinical_pubmed_bert_base_512', - # 'en.embed.netbert', - # 'en.embed.psych_search', - # 
'en.embed.muril_adapted_local', - # 'en.embed.finbert_pretrain_yiyanghkust', - # 'en.embed.lic_class_scancode_bert_base_cased_L32_1', - # 'en.embed.sec_bert_sh', - # 'en.embed.sec_bert_num', - # 'en.embed.finest_bert', - # 'en.embed.bert_large_cased_whole_word_masking', - # 'en.embed.clinical_pubmed_bert_base_128', - # 'en.embed.bert_base_uncased_sparse_70_unstructured', - # 'en.embed.sec_bert_base', - # 'en.stopwords', - # 'en.embed.agriculture_bert_uncased', - # 'en.embed.bert_large_uncased_whole_word_masking', - # 'en.embed.ge', - # 'en.ner.roberta_large_finetuned_abbr', - # 'en.ner.roberta_classics_ner', - # 'en.pos.roberta_base_english_upos', - # 'en.ner.roberta_large_ner_english', - # 'en.ner.ner_gene_dna_rna_jnlpba_pubmed', - # 'en.ner.ner_disease_ncbi_bionlp_bc5cdr_pubmed', - # 'eo.embed.w2v_cc_300d', - # 'es.embed.bertin_base_gaussian', - # 'es.embed.bertin_roberta_base_spanish', - # 'es.embed.bertin_roberta_large_spanish', - # 'es.embed.bertin_base_stepwise', - # 'es.embed.dpr_spanish_passage_encoder_allqa_base', - # 'es.embed.dpr_spanish_question_encoder_allqa_base', - # 'es.embed.beto_gn_base_cased', - # 'es.embed.dpr_spanish_passage_encoder_squades_base', - # 'es.embed.dpr_spanish_question_encoder_squades_base', - # 'es.embed.bert_base_es_cased', - # 'es.embed.bert_base_5lang_cased', - # 'es.embed.alberti_bert_base_multilingual_cased', - # 'es.embed.roberta_base_bne', - # 'es.embed.jurisbert', - # 'es.embed.mlm_spanish_roberta_base', - # 'es.embed.roberta_large_bne', - # 'es.pos', - # 'es.embed.bertin_base_random_exp_512seqlen', - # 'es.embed.bertin_base_gaussian_exp_512seqlen', - # 'es.ner.roberta_base_bne_capitel_ner_plus', - # 'es.ner.roberta_base_bne_capitel_ner', - # 'es.ner.RuPERTa_base_finetuned_ner', - # 'es.pos.roberta_base_bne_capitel_pos', - # 'es.ner.NER_LAW_MONEY4', - # 'es.pos.roberta_large_bne_capitel_pos', - # 'es.ner.bsc_bio_ehr_es_pharmaconer', - # 'es.embed.RoBERTalex', - # 'es.ner.roberta_large_bne_capitel_ner', - # 'es.embed.RuPERTa_base', - # 'es.embed.bertin_base_random', - # 'es.lemma', - # 'es.stopwords', - # 'es.pos.RuPERTa_base_finetuned_pos', - # 'es.embed.bertin_base_stepwise_exp_512seqlen', - # 'es.ner.bsc_bio_ehr_es_cantemist', - # 'et.stopwords', - # 'et.pos', - # 'et.embed.w2v_cc_300d', - # 'et.lemma', - # 'et.lemma', - # 'eu.stopwords', - # 'eu.embed.w2v_cc_300d', - # 'eu.lemma', - # 'fa.embed.roberta_fa_zwnj_base', - # 'fa.ner.roberta_fa_zwnj_base_ner', - # 'fa.pos', - # 'fa.stopwords', - # 'fi.embed.w2v_cc_300d', - # 'fi.pos', - # 'fi.lemma', - # 'fi.stopwords', - # 'fi.lemma', - # 'fo.pos', - # 'fr.embed.bert_base_fr_cased', - # 'fr.pos', - # 'fr.pos', - # 'fr.embed.french_roberta', - # 'fr.lemma', - # 'fr.lemma', - # 'fr.stopwords', - # 'fr.embed.roberta_base_wechsel_french', - # 'frr.embed.w2v_cc_300d', - # 'fy.embed.w2v_cc_300d', - # 'ga.pos', - # 'ga.stopwords', - # 'gd.embed.w2v_cc_300d', - # 'gl.embed.w2v_cc_300d', - # 'gl.lemma', - # 'gom.embed.w2v_cc_300d', - # 'grc.lemma', - # 'grc.stopwords', - # 'grc.lemma', - # 'grc.pos', - # 'gu.embed.RoBERTa_hindi_guj_san', - # 'gu.stopwords', - # 'gv.embed.w2v_cc_300d', - # 'he.stopwords', - # 'hi.stopwords', - # 'hi.embed.RoBERTa_hindi_guj_san', - # 'hi.embed.indic_transformers_hi_roberta', - # 'hi.embed.muril_adapted_local', - # 'hi.embed.indic_transformers_hi_bert', - # 'hr.embed.w2v_cc_300d', - # 'hr.stopwords', - # 'hr.lemma', - # 'hsb.embed.w2v_cc_300d', - # 'hu.lemma', - # 'hu.stopwords', - # 'hy.stopwords', - # 'hy.lemma', - # 'hy.embed.w2v_cc_300d', - # 'hyw.pos', - # 'hyw.lemma', - 
# 'id.pos', - # 'id.embed.indo_roberta_small', - # 'id.embed.indonesian_roberta_base', - # 'id.pos.indonesian_roberta_base_posp_tagger', - # 'id.lemma', - # 'id.lemma', - # 'id.embed.roberta_base_indonesian_522M', - # 'id.stopwords', - # 'id.embed.indonesian_roberta_large', - # 'is.lemma', - # 'is.stopwords', - # 'it.stopwords', - # 'it.pos', - # 'it.embed.bert_base_italian_xxl_cased', - # 'it.embed.bert_base_italian_xxl_uncased', - # 'it.embed.chefberto_italian_cased', - # 'it.embed.hseBert_it_cased', - # 'it.embed.wineberto_italian_cased', - # 'it.pos', - # 'it.lemma', - # 'it.lemma', - # 'it.lemma', - # 'ja.embed.bert_base_ja_cased', - # 'ja.embed.bert_base_japanese_char_v2', - # 'ja.embed.bert_base_japanese_char_extended', - # 'ja.embed.bert_large_japanese_char', - # 'ja.embed.bert_large_japanese', - # 'ja.embed.bert_small_japanese', - # 'ja.embed.bert_large_japanese_char_extended', - # 'ja.pos', - # 'ja.embed.bert_small_japanese_fin', - # 'ja.embed.bert_base_japanese_basic_char_v2', - # 'ja.stopwords', - # 'ja.embed.bert_base_japanese_char_whole_word_masking', - # 'ja.embed.bert_base_japanese_char', - # 'ja.embed.bert_base_japanese_whole_word_masking', - # 'ja.embed.bert_base_japanese_v2', - # 'jv.embed.javanese_roberta_small', - # 'jv.embed.javanese_roberta_small_imdb', - # 'jv.embed.javanese_bert_small_imdb', - # 'jv.embed.javanese_bert_small', - # 'ka.embed.w2v_cc_300d', - # 'kn.embed.KNUBert', - # 'kn.embed.KanBERTo', - # 'kn.stopwords', - # 'ko.lemma', - # 'ko.stopwords', - # 'ko.embed.roberta_ko_small', - # 'ko.pos', - # 'ko.embed.bert_kor_base', - # 'ko.embed.dbert', - # 'ko.embed.KR_FinBert', - # 'ko.embed.bert_base_v1_sports', - # 'ko.lemma', - # 'ky.stopwords', - # 'la.lemma', - # 'la.lemma', - # 'la.pos', - # 'la.pos', - # 'lb.stopwords', - # 'lb.lemma', - # 'lb.embed.w2v_cc_300d', - # 'lij.stopwords', - # 'lmo.embed.w2v_cc_300d', - # 'lt.embed.w2v_cc_300d', - # 'lt.lemma', - # 'lt.stopwords', - # 'lv.stopwords', - # 'lv.pos', - # 'mai.embed.w2v_cc_300d', - # 'mg.embed.w2v_cc_300d', - # 'min.embed.w2v_cc_300d', - # 'mk.stopwords', - # 'mk.lemma', - # 'mk.embed.w2v_cc_300d', - # 'ml.stopwords', - # 'ml.embed.w2v_cc_300d', - # 'mn.embed.w2v_cc_300d', - # 'mr.lemma', - # 'mr.stopwords', - # 'mr.embed.marathi_bert', - # 'mr.embed.muril_adapted_local', - # 'mr.pos', - # 'ms.embed.w2v_cc_300d', - # 'mt.lemma', - # 'mt.pos', - # 'mt.embed.w2v_cc_300d', - # 'mwl.embed.w2v_cc_300d', - # 'my.embed.w2v_cc_300d', - # 'myv.embed.w2v_cc_300d', - # 'mzn.embed.w2v_cc_300d', - # 'nah.embed.w2v_cc_300d', - # 'nap.embed.w2v_cc_300d', - # 'nb.stopwords', - # 'nb.lemma', - # 'nds.embed.w2v_cc_300d', - # 'ne.embed.w2v_cc_300d', - # 'ne.stopwords', - # 'new.embed.w2v_cc_300d', - # 'nl.pos.fullstop_dutch_punctuation_prediction', - # 'nl.stopwords', - # 'nl.embed.robbert_v2_dutch_base', - # 'nl.embed.robbertje_1_gb_bort', - # 'nl.embed.robbertje_1_gb_shuffled', - # 'nl.embed.robbertje_1_gb_non_shuffled', - # 'nl.embed.robbertje_1_gb_merged', - # 'nl.embed.w2v_cc_300d', - # 'nl.lemma', - # 'nn.embed.w2v_cc_300d', - # 'no.lemma', - # 'no.pos', - # 'no.pos', - # 'no.pos', - # 'no.embed.w2v_cc_300d', - # 'no.lemma', - # 'nso.embed.w2v_cc_300d', - # 'oc.embed.w2v_cc_300d', - # 'or.embed.w2v_cc_300d', - # 'orv.lemma', - # 'os.embed.w2v_cc_300d', - # 'pa.embed.w2v_cc_300d', - # 'pa.embed.muril_adapted_local', - # 'pfl.embed.w2v_cc_300d', - # 'pl.stopwords', - # 'pl.embed.w2v_cc_300d', - # 'pl.lemma', - # 'pms.embed.w2v_cc_300d', - # 'pnb.embed.w2v_cc_300d', - # 'ps.embed.w2v_cc_300d', - # 
'pt.embed.BR_BERTo', - # 'pt.embed.gs_all', - # 'pt.stopwords', - # 'pt.embed.gs_clinical', - # 'pt.embed.gs_biomedical', - # 'pt.lemma', - # 'pt.lemma', - # 'pt.embed.bert_base_portuguese_cased_finetuned_tcu_acordaos', - # 'pt.ner.satellite_instrument_roberta_NER', - # 'pt.embed.bert_small_gl_cased', - # 'pt.embed.bert_large_cased_pt_lenerbr', - # 'pt.embed.bert_large_portuguese_cased', - # 'pt.embed.bert_base_cased_pt_lenerbr', - # 'pt.embed.bert_base_portuguese_cased_finetuned_peticoes', - # 'pt.embed.bert_base_portuguese_cased', - # 'pt.embed.bert_base_pt_cased', - # 'pt.embed.bert_base_gl_cased', - # 'qhe.lemma', - # 'qtd.pos', - # 'qu.embed.w2v_cc_300d', - # 'rm.embed.w2v_cc_300d', - # 'ro.embed.w2v_cc_300d', - # 'ro.stopwords', - # 'ro.pos', - # 'ro.lemma', - # 'ru.pos', - # 'ru.lemma', - # 'ru.lemma', - # 'ru.embed.ruRoberta_large', - # 'ru.pos', - # 'ru.stopwords', - # 'ru.embed.roberta_base_russian_v0', - # 'ru.embed.bert_base_ru_cased', - # 'ru.embed.w2v_cc_300d', - # 'sa.embed.w2v_cc_300d', - # 'sa.lemma', - # 'sa.pos', - # 'sa.stopwords', - # 'sah.embed.w2v_cc_300d', - # 'sc.embed.w2v_cc_300d', - # 'scn.embed.w2v_cc_300d', - # 'sco.embed.w2v_cc_300d', - # 'sd.embed.w2v_cc_300d', - # 'sh.embed.w2v_cc_300d', - # 'si.stopwords', - # 'si.embed.w2v_cc_300d', - # 'sk.stopwords', - # 'sk.lemma', - # 'sk.embed.w2v_cc_300d', - # 'sl.lemma', - # 'sl.stopwords', - # 'sl.pos', - # 'sl.embed.w2v_cc_300d', - # 'sme.lemma', - # 'sme.pos', - # 'so.embed.w2v_cc_300d', - # 'sq.stopwords', - # 'sq.embed.w2v_cc_300d', - # 'sr.lemma', - # 'sr.embed.w2v_cc_300d', - # 'sr.lemma', - # 'sr.stopwords', - # 'su.embed.w2v_cc_300d', - # 'su.embed.sundanese_roberta_base', - # 'sv.stopwords', - # 'sv.embed.w2v_cc_300d', - # 'sv.lemma', - # 'sv.lemma', - # 'sw.embed.w2v_cc_300d', - # 'ta.stopwords', - # 'ta.embed.w2v_cc_300d', - # 'ta.embed.muril_adapted_local', - # 'te.embed.indic_transformers_te_bert', - # 'te.embed.telugu_bertu', - # 'te.embed.muril_adapted_local', - # 'te.embed.indic_transformers_te_roberta', - # 'te.stopwords', - # 'te.lemma', - # 'te.embed.w2v_cc_300d', - # 'tg.embed.w2v_cc_300d', - # 'th.stopwords', - # 'th.embed.w2v_cc_300d', - # 'ti.stopwords', - # 'tk.embed.w2v_cc_300d', - # 'tl.lemma', - # 'tl.embed.w2v_cc_300d', - # 'tl.stopwords', - # 'tl.embed.roberta_tagalog_large', - # 'tl.embed.roberta_tagalog_base', - # 'tn.stopwords', - # 'tr.lemma', - # 'tr.stopwords', - # 'tr.lemma', - # 'tr.pos', - # 'tr.embed.w2v_cc_300d', - # 'tr.lemma', - # 'tr.pos', - # 'tr.pos', - # 'tr.lemma', - # 'tr.lemma', - # 'tt.stopwords', - # 'tt.embed.w2v_cc_300d', - # 'ug.embed.w2v_cc_300d', - # 'uk.embed.ukr_roberta_base', - # 'uk.stopwords', - # 'uk.embed.w2v_cc_300d', - # 'uk.pos.bert_large_slavic_cyrillic_upos', - # 'uk.pos.bert_base_slavic_cyrillic_upos', - # 'ur.embed.muril_adapted_local', - # 'ur.embed.roberta_urdu_small', - # 'ur.lemma', - # 'ur.lemma', - # 'ur.pos', - # 'ur.embed.w2v_cc_300d', - # 'ur.stopwords', - # 'uz.embed.w2v_cc_300d', - # 'vec.embed.w2v_cc_300d', - # 'vi.stopwords', - # 'vi.embed.w2v_cc_300d', - # 'vls.embed.w2v_cc_300d', - # 'vo.embed.w2v_cc_300d', - # 'wa.embed.w2v_cc_300d', - # 'war.embed.w2v_cc_300d', - # 'wo.pos', - # 'xmf.embed.w2v_cc_300d', - # 'yi.embed.w2v_cc_300d', - # 'yo.embed.w2v_cc_300d', - # 'zea.embed.w2v_cc_300d', - # 'zh.embed.wobert_chinese_plus_base', - # 'zh.embed.bert_base_chinese_jinyong', - # 'zh.embed.rbt3', - # 'zh.embed.jdt_fin_roberta_wwm', - # 'zh.embed.mengzi_oscar_base', - # 'zh.embed.roberta_base_wechsel_chinese', - # 'zh.embed.sikubert', - # 
'zh.embed.jdt_fin_roberta_wwm_large', - # 'zh.embed.rbtl3', - # 'zh.embed.macbert4csc_base_chinese', - # 'zh.pos.chinese_roberta_large_upos', - # 'zh.pos.chinese_roberta_base_upos', - # 'zh.pos.chinese_bert_wwm_ext_upos', - # 'zh.pos', - # 'zh.stopwords', - # 'zh.pos.bert_base_chinese_pos', - # 'zh.embed.rbt6', - # 'zh.embed.sikuroberta', - # 'zh.embed.uer_large', - # 'zh.embed.env_bert_chinese', - # 'zh.embed.chinese_roberta_wwm_ext', - # 'zh.embed.chinese_macbert_base', - # 'zh.embed.bert_base_zh_cased', - # 'zh.embed.bert_large_chinese', - # 'zh.embed.chinese_roberta_wwm_large_ext_fix_mlm', - # 'zh.embed.chinese_roberta_wwm_ext_large', - # 'zh.embed.chinese_bert_wwm_ext', - # 'zh.embed.chinese_macbert_large', - # 'zh.embed.mengzi_oscar_base_retrieval', - # 'zh.embed.mengzi_bert_base_fin', - # 'zh.embed.wobert_chinese_base', - # 'zh.embed.wobert_chinese_plus', - # 'zh.embed.rbt4', - # 'zh.embed.mengzi_oscar_base_caption', - # 'zh.embed.mengzi_bert_base', - # 'zh.embed.w2v_cc_300d', - - - # 'bs.embed.w2v_cc_300d', # TODO BAD?!?! - # 'nl.embed.MedRoBERTa' # BAD! - - ] - - fails = [] - fail_insta = True - for t in te: - try: - print(f'Testing spell = {t}') - pipe = nlu.load(t, verbose=True) - df = pipe.predict(['Peter love pancaces. I hate Mondays', 'I love Fridays']) - for c in df.columns: print(df[c]) - except Exception as err: - print(f'Failure for spell = {t} ', err) - e = sys.exc_info() - print(e[0]) - print(e[1]) - fails.append(t) - if fail_insta : - raise Exception(err) - fail_string = "\n".join(fails) - print(f'Done testing, failures = {fail_string}') - if len(fails) > 0: - raise Exception("Not all new spells completed successfully") - - def test_344_HC_models(self): - import tests.secrets as sct - - SPARK_NLP_LICENSE = sct.SPARK_NLP_LICENSE - AWS_ACCESS_KEY_ID = sct.AWS_ACCESS_KEY_ID - AWS_SECRET_ACCESS_KEY = sct.AWS_SECRET_ACCESS_KEY - JSL_SECRET = sct.JSL_SECRET - nlu.auth(SPARK_NLP_LICENSE, AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, JSL_SECRET) - te = [ - 'en.med_ner.biomedical_bc2gm', # OK? - 'en.resolve.rxnorm_action_treatment', # OK? - - # 'en.classify.rct_binary.use', # BAD - # 'en.classify.rct_binary.biobert', # BAD - 'pt.med_ner.deid.subentity', - 'pt.med_ner.deid.generic', - 'pt.med_ner.deid', - ] - sample_texts = [""" - Antonio Pérez Juan, nacido en Cadiz, España. Aún no estaba vacunado, se infectó con Covid-19 el dia 14/03/2020 y tuvo que ir al Hospital. Fue tratado con anticuerpos monoclonales en la Clinica San Carlos.. - """, - """ - Datos del paciente. Nombre: Jose . Apellidos: Aranda Martinez. NHC: 2748903. NASS: 26 37482910. 
- """, - """The patient was given metformin 500 mg, 2.5 mg of coumadin and then ibuprofen""", - """he patient was given metformin 400 mg, coumadin 5 mg, coumadin, amlodipine 10 MG""", - """To compare the results of recording enamel opacities using the TF and modified DDE indices.""", - """I felt a bit drowsy and had blurred vision after taking Aspirin.""", - ] - fails = [] - succs = [] - fail_insta = True - for t in te: - - try: - print(f'Testing spell = {t}') - pipe = nlu.load(t, verbose=True) - df = pipe.predict(sample_texts, drop_irrelevant_cols=False, metadata=True, ) - print(df.columns) - for c in df.columns: - print(df[c]) - succs.append(t) - except Exception as err: - print(f'Failure for spell = {t} ', err) - fails.append(t) - if fail_insta: - break - fail_string = '\n'.join(fails) - succ_string = '\n'.join(succs) - print(f'Done testing, failures = {fail_string}') - print(f'Done testing, successes = {succ_string}') - if len(fails) > 0: - raise Exception("Not all new spells completed successfully") - - - - -if __name__ == '__main__': - unittest.main() diff --git a/tests/release_tests/release_400/__init__.py b/tests/release_tests/release_400/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/release_tests/release_400/tests_400.py b/tests/release_tests/release_400/tests_400.py deleted file mode 100644 index e11745b0..00000000 --- a/tests/release_tests/release_400/tests_400.py +++ /dev/null @@ -1,124 +0,0 @@ -import unittest -import nlu -import sys - - -class Test400(unittest.TestCase): - - def test_400_models(self): - import pandas as pd - q = 'What is my name?' - c = 'My name is Clara and I live in Berkeley' - - - te = [ - 'en.span_question.albert' - ] - data = f'{q}|||{c}' - data = [data,data,data] - fails = [] - fail_insta = True - for t in te: - try: - print(f'Testing spell = {t}') - pipe = nlu.load(t, verbose=True) - df = pipe.predict(data,metadata=True) - for c in df.columns: print(df[c]) - except Exception as err: - print(f'Failure for spell = {t} ', err) - e = sys.exc_info() - print(e[0]) - print(e[1]) - fails.append(t) - if fail_insta : - raise Exception(err) - fail_string = "\n".join(fails) - print(f'Done testing, failures = {fail_string}') - if len(fails) > 0: - raise Exception("Not all new spells completed successfully") - - def test_344_HC_models(self): - import tests.secrets as sct - - def test_400_HC_models(self): - import tests.secrets as sct - SPARK_NLP_LICENSE = sct.SPARK_NLP_LICENSE - AWS_ACCESS_KEY_ID = sct.AWS_ACCESS_KEY_ID - AWS_SECRET_ACCESS_KEY = sct.AWS_SECRET_ACCESS_KEY - JSL_SECRET = sct.JSL_SECRET - nlu.auth(SPARK_NLP_LICENSE, AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, JSL_SECRET) - te = [ - 'es.embed.scielo300d', - 'en.ner.clinical_trials_abstracts', - 'en.med_ner.clinical_trials_abstracts', - 'en.med_ner.pathogen', - 'en.med_ner.living_species.token_bert', - 'en.med_ner.living_species', - 'en.med_ner.living_species.biobert', - 'en.classify.stress', - 'es.med_ner.living_species', - 'es.med_ner.living_species.bert', - 'es.med_ner.living_species.roberta', - 'es.med_ner.living_species.300', - 'es.med_ner.living_species', - 'fr.med_ner.living_species', - 'fr.med_ner.living_species.bert', - 'pt.med_ner.living_species.token_bert', - 'pt.med_ner.living_species', - 'pt.med_ner.living_species.roberta', - 'pt.med_ner.living_species.bert', - 'it.med_ner.living_species', - 'it.med_ner.living_species.bert', - 'it.med_ner.living_species', - 'cat.med_ner.living_species', - 'gal.med_ner.living_species', - 'ro.med_ner.living_species.bert', - 
'ro.med_ner.clinical', - 'ro.embed.clinical.bert.base_cased', - 'ro.med_ner.deid.subentity', - 'ro.med_ner.deid.subentity.bert', - 'en.med_ner.pathogen.pipeline', - 'en.med_ner.biomedical_bc2gm.pipeline', - 'ro.deid.clinical', - 'en.med_ner.clinical_trials_abstracts.pipe', - - ] - sample_texts = [""" - Antonio Pérez Juan, nacido en Cadiz, España. Aún no estaba vacunado, se infectó con Covid-19 el dia 14/03/2020 y tuvo que ir al Hospital. Fue tratado con anticuerpos monoclonales en la Clinica San Carlos.. - """, - """ - Datos del paciente. Nombre: Jose . Apellidos: Aranda Martinez. NHC: 2748903. NASS: 26 37482910. - """, - """The patient was given metformin 500 mg, 2.5 mg of coumadin and then ibuprofen""", - """he patient was given metformin 400 mg, coumadin 5 mg, coumadin, amlodipine 10 MG""", - """To compare the results of recording enamel opacities using the TF and modified DDE indices.""", - """I felt a bit drowsy and had blurred vision after taking Aspirin.""", - ] - fails = [] - succs = [] - fail_insta = True - for t in te: - - try: - print(f'Testing spell = {t}') - pipe = nlu.load(t, verbose=True) - df = pipe.predict(sample_texts, drop_irrelevant_cols=False, metadata=True, ) - print(df.columns) - for c in df.columns: - print(df[c]) - succs.append(t) - except Exception as err: - print(f'Failure for spell = {t} ', err) - fails.append(t) - if fail_insta: - break - fail_string = '\n'.join(fails) - succ_string = '\n'.join(succs) - print(f'Done testing, failures = {fail_string}') - print(f'Done testing, successes = {succ_string}') - if len(fails) > 0: - raise Exception("Not all new spells completed successfully") - - -if __name__ == '__main__': - unittest.main() diff --git a/tests/run_tests.py b/tests/run_tests.py new file mode 100644 index 00000000..b0b9f0ba --- /dev/null +++ b/tests/run_tests.py @@ -0,0 +1,103 @@ +import argparse +import json +import os +import subprocess +import sys +from typing import List + +import colorama + +sys.path.append(os.getcwd()) +sys.path.append(os.getcwd() + '/tests') +sys.path.append(os.getcwd() + '/tests/utils') +from utils import all_tests +from tests.utils import one_per_lib + +_TEST_TIMEOUT = 60 * 60 * 20 + + +def run_cmd_and_check_succ( + args: List[str], + log=True, + timeout=60 +) -> bool: + print(f"👷 Executing {colorama.Fore.LIGHTGREEN_EX}{args}{colorama.Fore.RESET}") + try: + r = subprocess.run(args, capture_output=True, timeout=timeout) + was_suc = process_was_suc(r) + if was_suc: + print( + f"{colorama.Fore.LIGHTGREEN_EX}✅ Success running {args}{colorama.Fore.RESET}" + ) + else: + print( + f"{colorama.Fore.LIGHTRED_EX}❌ Failure running {args}{colorama.Fore.RESET}" + ) + if not was_suc and log: + log_process(r) + return was_suc + except subprocess.TimeoutExpired: + try: + log_process(r) + except: + print("No logs to print") + print(f"{colorama.Fore.LIGHTRED_EX}❌ Timeout running {args}{colorama.Fore.RESET}") + return False + + +def process_was_suc( + result: subprocess.CompletedProcess, +) -> bool: + return result.returncode == 0 + + +def log_process(result: subprocess.CompletedProcess): + print("______________STDOUT:") + print(result.stdout.decode()) + print("______________STDERR:") + print(result.stderr.decode()) + + +if __name__ == '__main__': + # Workaround until logging issue with pytest-xdist is fixed + # https://github.com/pytest-dev/pytest-xdist/issues/402 + # We need to launch every test in a separate process + # because we cannot de-allocate models from JVM from within a test + # So to prevent JVM-OOM we need to run each test in a 
separate process + + parser = argparse.ArgumentParser(description='Testing CLI') + parser.add_argument('test_type', choices=['all', 'one_per_lib'], default='all', help='Type of test to run') + args = parser.parse_args() + logs = {} + tests_to_execute = all_tests + + if args.test_type == 'all': + tests_to_execute = all_tests + elif args.test_type == 'one_per_lib': + tests_to_execute = one_per_lib + total_tests = len(tests_to_execute) + + print(f'Running Tests: {tests_to_execute}') + for i, test_params in enumerate(tests_to_execute): + if i % 3 == 0: + # Delete models so we dont run out of diskspace + os.system('rm -r ~/cache_pretrained') + print(f"{'#' * 10} Running test {i} of {total_tests} with config {test_params} {'#' * 10}") + logs[i] = {} + logs[i]['test_data'] = test_params + print(f"Running test {i} {test_params}") + py_path = 'python' + with open('test.json', 'w') as json_file: + json.dump(test_params.dict(), json_file) + cmd = [py_path, 'tests/utils/run_test.py', 'test.json'] + logs[i]['success'] = run_cmd_and_check_succ(cmd, timeout=_TEST_TIMEOUT) + + print(f"{'#' * 10} Failed tests {'#' * 10}") + failed = 0 + for test_idx in logs: + if not logs[test_idx]['success']: + failed += 1 + print(logs[test_idx]) + print(f"{'#' * 10} {failed} of {total_tests} failed {'#' * 10}") + if failed > 0: + raise Exception('Tests Failed') diff --git a/tests/secrets.py b/tests/secrets.py deleted file mode 100644 index cc7408aa..00000000 --- a/tests/secrets.py +++ /dev/null @@ -1,10 +0,0 @@ -import json -import os - -license_dict = json.loads(os.getenv("JSL_LICENSE")) -AWS_ACCESS_KEY_ID = license_dict.get("AWS_ACCESS_KEY_ID") -AWS_SECRET_ACCESS_KEY = license_dict.get("AWS_SECRET_ACCESS_KEY") -JSL_SECRET = license_dict.get("SECRET") -SPARK_NLP_LICENSE = license_dict.get("SPARK_NLP_LICENSE") -OCR_SECRET = license_dict.get("SPARK_OCR_SECRET") -OCR_LICENSE = license_dict.get("SPARK_OCR_LICENSE") diff --git a/tests/utils/__init__.py b/tests/utils/__init__.py new file mode 100644 index 00000000..19789870 --- /dev/null +++ b/tests/utils/__init__.py @@ -0,0 +1,5 @@ +from .test_data import get_test_data +from ._secrets import AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, JSL_SECRET, SPARK_NLP_LICENSE, OCR_SECRET, OCR_LICENSE, \ + JSON_LIC_PATH +from .model_test import NluTest, PipeParams, all_tests, one_per_lib +from .test_utils import model_and_output_levels_test diff --git a/tests/utils/_secrets.py b/tests/utils/_secrets.py new file mode 100644 index 00000000..00c1b981 --- /dev/null +++ b/tests/utils/_secrets.py @@ -0,0 +1,21 @@ +import json +import os + +if os.path.exists('./tests/lic.json'): + with open('./tests/lic.json', 'r') as f: + license_dict = json.loads(f.read()) +elif 'JOHNSNOWLABS_LICENSE_JSON' in os.environ: + license_dict = json.loads(os.getenv("JOHNSNOWLABS_LICENSE_JSON")) + with open('./tests/lic.json', 'w') as f: + json.dump(license_dict, f) +else: + raise Exception("No license found") + +AWS_ACCESS_KEY_ID = license_dict.get("AWS_ACCESS_KEY_ID") +AWS_SECRET_ACCESS_KEY = license_dict.get("AWS_SECRET_ACCESS_KEY") +JSL_SECRET = license_dict.get("SECRET") +SPARK_NLP_LICENSE = license_dict.get("SPARK_NLP_LICENSE") +OCR_SECRET = license_dict.get("SPARK_OCR_SECRET") +OCR_LICENSE = license_dict.get("SPARK_OCR_LICENSE") + +JSON_LIC_PATH = './tests/lic.json' diff --git a/tests/utils/model_test.py b/tests/utils/model_test.py new file mode 100644 index 00000000..b20a3539 --- /dev/null +++ b/tests/utils/model_test.py @@ -0,0 +1,280 @@ +from typing import Optional, List, Union, Dict + +from pydantic import 
BaseModel + + +# Define the PipeParams data class using Pydantic +class PipeParams(BaseModel): + pipe_key: str + param_setter: str + param_val: Union[str, int, float, bool, Dict[str, str], Dict[str, List[str]]] + + +# Define the NluTest data class using Pydantic +class NluTest(BaseModel): + nlu_ref: str + lang: str + test_group: str + input_data_type: str + library: str + output_levels: Optional[List[str]] = None + pipe_params: Optional[List[PipeParams]] = None + + +ocr_tests = [ + NluTest(nlu_ref="pdf2table", lang='en', test_group='table_extractor', input_data_type='PPT_table', + library='ocr'), + NluTest(nlu_ref="ppt2table", lang='en', test_group='table_extractor', input_data_type='PDF_table', + library='ocr'), + NluTest(nlu_ref="doc2table", lang='en', test_group='table_extractor', input_data_type='DOC_table', + library='ocr'), + NluTest(nlu_ref="en.classify_image.convnext.tiny", lang='en', test_group='img_classifier', + input_data_type='IMG_classifier', library='ocr'), +] + +medical_tests = [ + NluTest(nlu_ref="en.assert.biobert", lang='en', test_group='assertion', input_data_type='medical', + library='healthcare'), + NluTest(nlu_ref="relation.drug_drug_interaction", lang='en', test_group='relation', input_data_type='medical', + output_levels=['chunk', 'tokens', 'embeddings', 'document', 'relation'], library='healthcare'), + NluTest(nlu_ref="en.de_identify", lang='en', test_group='hc_deidentification', input_data_type='medical', + library='healthcare'), + NluTest(nlu_ref="en.norm_drugs", lang='en', test_group='hc_drugnormalizer', input_data_type='medical', + library='healthcare'), + NluTest(nlu_ref="bert elmo", lang='en', test_group='hc_genericclassifier', input_data_type='medical', + library='healthcare'), + NluTest(nlu_ref="en.classify.ade.conversational", lang='en', test_group='hc_licensedclassifier', + input_data_type='medical', library='healthcare'), + NluTest(nlu_ref="en.med_ner.tumour en.med_ner.radiology en.med_ner.diseases en.ner.onto", lang='en', + test_group='hc_pipeline', input_data_type='medical', + library='healthcare'), + NluTest(nlu_ref="en.explain_doc.era", lang='en', test_group='hc_pipeline', input_data_type='medical', + library='healthcare'), + NluTest(nlu_ref="en.med_ner.diseases en.resolve.icd10cm.augmented_billable", lang='en', + test_group='hc_resolver', + input_data_type='medical', + library='healthcare'), + NluTest(nlu_ref="en.summarize.clinical_jsl", lang='en', test_group='hc_summarizer', input_data_type='medical', + library='healthcare'), + NluTest(nlu_ref="en.generate.generic_flan_base", lang='en', test_group='hc_generation', + input_data_type='medical', + library='healthcare'), + NluTest(nlu_ref="en.zero_shot.ner_roberta", lang='en', test_group='hc_generation', input_data_type='medical', + library='healthcare', + output_levels=["sentence"], + pipe_params=[PipeParams(pipe_key='zero_shot_ner', param_setter='setEntityDefinitions', param_val={ + "PROBLEM": [ + "What is the disease?", + "What is his symptom?", + "What is her disease?", + "What is his disease?", + "What is the problem?", + "What does a patient suffer", + "What was the reason that the patient is admitted to the clinic?", + ], + "DRUG": [ + "Which drug?", + "Which is the drug?", + "What is the drug?", + "Which drug does he use?", + "Which drug does she use?", + "Which drug do I use?", + "Which drug is prescribed for a symptom?", + ], + "ADMISSION_DATE": ["When did patient admitted to a clinic?"], + "PATIENT_AGE": [ + "How old is the patient?", + "What is the gae of the patient?", + ], + })]), + 
NluTest(nlu_ref="en.med_ner.clinical en.relation.zeroshot_biobert", lang='en', test_group='hc_generation', + input_data_type='medical', library='healthcare', + pipe_params=[ + PipeParams(pipe_key='zero_shot_relation_extraction', param_setter='setRelationalCategories', + param_val={ + "CURE": ["{{TREATMENT}} cures {{PROBLEM}}."], + "IMPROVE": ["{{TREATMENT}} improves {{PROBLEM}}.", + "{{TREATMENT}} cures {{PROBLEM}}."], + "REVEAL": ["{{TEST}} reveals {{PROBLEM}}."]}), + PipeParams(pipe_key='zero_shot_relation_extraction', param_setter='setMultiLabel', + param_val=False)]), + +] + +nlp_tests = [ + NluTest(nlu_ref="chunk", lang='en', test_group='chunker', input_data_type='generic', library='open_source'), + NluTest(nlu_ref="ngram", lang='en', test_group='chunker', input_data_type='generic', library='open_source'), + NluTest(nlu_ref="zh.segment_words", lang='zh', test_group='tokenizer', input_data_type='generic', + library='open_source'), + NluTest(nlu_ref="zh.tokenize", lang='zh', test_group='tokenizer', input_data_type='generic', + library='open_source'), + NluTest(nlu_ref="tokenize", lang='en', test_group='tokenizer', input_data_type='generic', + library='open_source'), + + NluTest(nlu_ref="en.classify.albert.ag_news", lang='en', test_group='classifier', input_data_type='generic', + library='open_source'), + NluTest(nlu_ref="en.classify.albert.imdb", lang='en', test_group='classifier', input_data_type='generic', + library='open_source'), + NluTest(nlu_ref="en.classify.bert_sequence.dehatebert_mono", lang='en', test_group='classifier', + input_data_type='generic', + library='open_source'), + NluTest(nlu_ref="en.bert.zero_shot_classifier", lang='en', test_group='classifier', input_data_type='generic', + library='open_source'), + NluTest(nlu_ref="en.speech2text.wav2vec2.v2_base_960h", lang='en', test_group='classifier', + input_data_type='asr', + library='open_source'), + NluTest(nlu_ref="en.speech2text.hubert", lang='en', test_group='classifier', input_data_type='asr', + library='open_source'), + NluTest(nlu_ref="cyberbullying", lang='en', test_group='classifier', input_data_type='generic', + library='open_source'), + NluTest(nlu_ref="en.classify.news.deberta.small", lang='en', test_group='classifier', input_data_type='generic', + library='open_source'), + NluTest(nlu_ref="e2e", lang='en', test_group='classifier', input_data_type='generic', + library='open_source'), + NluTest(nlu_ref="emotion", lang='en', test_group='classifier', input_data_type='generic', + library='open_source'), + NluTest(nlu_ref="pos", lang='en', test_group='classifier', input_data_type='generic', + library='open_source'), + NluTest(nlu_ref="en.distilbert.zero_shot_classifier", lang='en', test_group='classifier', + input_data_type='generic', + library='open_source'), + NluTest(nlu_ref="tr.distilbert.zero_shot_classifier.allnli", lang='tr', test_group='classifier', + input_data_type='generic', + library='open_source'), + NluTest(nlu_ref="lang", lang='fr', test_group='classifier', input_data_type='generic', + library='open_source'), + NluTest(nlu_ref="lang", lang='en', test_group='classifier', input_data_type='generic', + library='open_source'), + NluTest(nlu_ref="fr.classify.camembert.base", lang='fr', test_group='classifier', input_data_type='generic', + library='open_source'), + NluTest(nlu_ref="en.classify.ag_news.longformer", lang='en', test_group='classifier', input_data_type='generic', + library='open_source'), + NluTest(nlu_ref="en.classify.imdb.longformer", lang='en', test_group='classifier', 
input_data_type='generic', + library='open_source'), + NluTest(nlu_ref="zh.ner", lang='zh', test_group='classifier', input_data_type='generic', + library='open_source'), + NluTest(nlu_ref="en.ner.onto.glove.6B_100d", lang='en', test_group='classifier', input_data_type='generic', + library='open_source'), + NluTest(nlu_ref="questions", lang='en', test_group='classifier', input_data_type='generic', + library='open_source'), + NluTest(nlu_ref="en.roberta.zero_shot_classifier", lang='en', test_group='classifier', + input_data_type='generic', + library='open_source'), + NluTest(nlu_ref="sarcasm", lang='en', test_group='classifier', input_data_type='generic', + library='open_source'), + NluTest(nlu_ref="sentiment.imdb", lang='en', test_group='classifier', input_data_type='generic', + library='open_source'), + NluTest(nlu_ref="sentiment.twitter", lang='en', test_group='classifier', input_data_type='generic', + library='open_source'), + NluTest(nlu_ref="sentiment.vivekn", lang='en', test_group='classifier', input_data_type='generic', + library='open_source'), + NluTest(nlu_ref="en.classify.roberta.imdb", lang='en', test_group='classifier', input_data_type='generic', + library='open_source'), + NluTest(nlu_ref="en.classify.snips", lang='en', test_group='classifier', input_data_type='generic', + library='open_source'), + NluTest(nlu_ref="spam", lang='en', test_group='classifier', input_data_type='generic', + library='open_source'), + NluTest(nlu_ref="toxic", lang='en', test_group='classifier', input_data_type='generic', + library='open_source'), + NluTest(nlu_ref="en.ner.camembert", lang='en', test_group='classifier', input_data_type='generic', + library='open_source'), + NluTest(nlu_ref="en.classify.ag_news.xlnet", lang='en', test_group='classifier', input_data_type='generic', + library='open_source'), + NluTest(nlu_ref="en.classify.imdb.xlnet", lang='en', test_group='classifier', input_data_type='generic', + library='open_source'), + NluTest(nlu_ref="yake", lang='en', test_group='classifier', input_data_type='generic', + library='open_source'), + NluTest(nlu_ref="en.classify_image.base_patch16_224", lang='en', test_group='classifier', + input_data_type='IMG_vit', + library='open_source'), + NluTest(nlu_ref="en.classify_image.swin.tiny", lang='en', test_group='classifier', input_data_type='IMG_vit', + library='open_source'), + NluTest(nlu_ref="en.answer_question.tapas.wikisql.base_finetuned", lang='en', test_group='classifier', + input_data_type='tapas', + output_levels=['chunk', 'tokens', 'embeddings'], library='open_source'), + NluTest(nlu_ref="embed_sentence.bert", lang='en', test_group='sentence_embedding', input_data_type='generic', + library='open_source'), + NluTest(nlu_ref="embed_sentence.electra", lang='en', test_group='sentence_embedding', input_data_type='generic', + library='open_source'), + NluTest(nlu_ref="bert", lang='en', test_group='embedding', input_data_type='generic', + library='open_source'), + NluTest(nlu_ref="xx.embed.distilbert", lang='en', test_group='embedding', input_data_type='generic', + library='open_source'), + NluTest(nlu_ref="bert electra", lang='en', test_group='embedding', input_data_type='generic', + library='open_source'), + NluTest(nlu_ref="glove", lang='en', test_group='embedding', input_data_type='generic', + library='open_source'), + NluTest(nlu_ref="bert en.embed.bert.small_L8_128 electra", lang='en', test_group='embedding', + input_data_type='generic', + library='open_source'), + NluTest(nlu_ref="en.embed.roberta", lang='en', test_group='embedding', 
input_data_type='generic', + library='open_source'), + NluTest(nlu_ref="xx.embed.xlm", lang='en', test_group='embedding', input_data_type='generic', + library='open_source'), + NluTest(nlu_ref="norm_document", lang='en', test_group='pre_processing', input_data_type='generic', + library='open_source'), + NluTest(nlu_ref="lemma", lang='en', test_group='pre_processing', input_data_type='generic', + library='open_source'), + NluTest(nlu_ref="norm", lang='en', test_group='pre_processing', input_data_type='generic', + library='open_source'), + NluTest(nlu_ref="xx.sentence_detector", lang='en', test_group='pre_processing', input_data_type='generic', + library='open_source'), + NluTest(nlu_ref="stem", lang='en', test_group='pre_processing', input_data_type='generic', + library='open_source'), + NluTest(nlu_ref="stopwords", lang='en', test_group='pre_processing', input_data_type='generic', + library='open_source'), + NluTest(nlu_ref="en.seq2seq.distilbart_cnn_6_6", lang='en', test_group='seq2seq', input_data_type='summarizer', + library='open_source', + pipe_params=[PipeParams(pipe_key='bart_transformer', param_setter='setTask', param_val='summarize:'), + PipeParams(pipe_key='bart_transformer', param_setter='setMaxOutputLength', + param_val=200)]), + NluTest(nlu_ref="xx.de.translate_to.en", lang='de', test_group='seq2seq', input_data_type='generic', + library='open_source'), + NluTest(nlu_ref="en.coreference.spanbert", lang='en', test_group='span_bert_coref', input_data_type='generic', + library='open_source'), + NluTest(nlu_ref="dep.typed", lang='en', test_group='typed_dependency', input_data_type='generic', + library='open_source'), + NluTest(nlu_ref="dep.untyped", lang='en', test_group='untyped_dependency', input_data_type='generic', + library='open_source'), + NluTest(nlu_ref="en.answer_question.squadv2.bert.base", lang='en', test_group='span_classifier', + input_data_type='qa', + output_levels=['chunk', 'tokens', 'embeddings'], library='open_source'), + NluTest(nlu_ref="fr.answer_question.camembert.fquad", lang='en', test_group='span_classifier', + input_data_type='qa', + output_levels=['chunk', 'tokens', 'embeddings'], library='open_source'), + NluTest(nlu_ref="en.answer_question.squadv2.deberta", lang='en', test_group='span_classifier', + input_data_type='qa', + output_levels=['chunk', 'tokens', 'embeddings'], library='open_source'), + NluTest(nlu_ref="en.answer_question.squadv2.distil_bert.base_cased", lang='en', test_group='span_classifier', + input_data_type='qa', + output_levels=['chunk', 'tokens', 'embeddings'], library='open_source'), + NluTest(nlu_ref="en.answer_question.squadv2.longformer.base", lang='en', test_group='span_classifier', + input_data_type='qa', + output_levels=['chunk', 'tokens', 'embeddings'], library='open_source'), + NluTest(nlu_ref="en.answer_question.squadv2.roberta.base.by_deepset", lang='en', test_group='span_classifier', + input_data_type='qa', + output_levels=['chunk', 'tokens', 'embeddings'], library='open_source'), + NluTest(nlu_ref="en.answer_question.squadv2.xlm_roberta.base", lang='en', test_group='span_classifier', + input_data_type='qa', + output_levels=['chunk', 'tokens', 'embeddings'], library='open_source'), + NluTest(nlu_ref="ner.onto", lang='en', test_group='pipeline', input_data_type='generic', library='open_source'), + NluTest(nlu_ref="en.t5.small", lang='en', test_group='seq2seq', input_data_type='generic', + library='open_source', + pipe_params=[PipeParams(pipe_key='t5_transformer', param_setter='setTask', + param_val='translate English to 
French')]), + NluTest(nlu_ref="match.chunks", lang='en', test_group='matcher', input_data_type='generic', + library='open_source'), + +] + + +one_per_lib = [ + NluTest(nlu_ref="chunk", lang='en', test_group='chunker', input_data_type='generic', library='open_source'), + NluTest(nlu_ref="en.assert.biobert", lang='en', test_group='assertion', input_data_type='medical', + library='healthcare'), + NluTest(nlu_ref="ppt2table", lang='en', test_group='table_extractor', input_data_type='PDF_table', + library='ocr'), +] + + +all_tests = ocr_tests + medical_tests + nlp_tests diff --git a/tests/utils/run_test.py b/tests/utils/run_test.py new file mode 100644 index 00000000..1a903d01 --- /dev/null +++ b/tests/utils/run_test.py @@ -0,0 +1,18 @@ +import json +import os +import sys + +sys.path.append(os.getcwd()) +from test_utils import model_and_output_levels_test + +if __name__ == '__main__': + if len(sys.argv) != 2: + print("Usage: python run_test.py ") + sys.exit(1) + + with open(sys.argv[1], 'r') as json_file: + test_data = json.load(json_file) + + # Call your test function with the test data from the JSON file + print('Running test with data: ', type(test_data), test_data) + model_and_output_levels_test(**test_data) diff --git a/tests/utils/test_data.py b/tests/utils/test_data.py new file mode 100644 index 00000000..4d652441 --- /dev/null +++ b/tests/utils/test_data.py @@ -0,0 +1,156 @@ +from dataclasses import dataclass +from typing import List, Union + +import pandas as pd + + +@dataclass +class TestData: + data: Union[str, List[str]] + + +generic_data = { + 'en': TestData([ + "A person like Jim or Joe", + "An organisation like Microsoft or PETA", + "A location like Germany", + "Anything else like Playstation", + "Person consisting of multiple tokens like Angela Merkel or Donald Trump", + "Organisations consisting of multiple tokens like JP Morgan", + "Locations consiting of multiple tokens like Los Angeles", + "Anything else made up of multiple tokens like Super Nintendo", + "Disney Comics was a comic book publishing company operated by The Walt Disney Company which ran from 1990 to 1993.", + "I really liked that movie!", + "Peter love pancaces. I hate Mondays", + "Donald Trump from America and Angela Merkel from Germany dont share many opinions.", + "You stupid person with an identity that shall remain unnamed, such a filthy identity that you have go to a bad place you person!", + " Example
This is an example of a simple HTML page with one paragraph.
", + "HELLO WORLD! How are YOU!?!@", + "I liek penut buttr and jelly", + 'John told Mary he would like to borrow a book', + + ]) + , + 'de': TestData( + [ + "Wer ist Praesident von Deutschland", + "Was ist NLP?", + ]) + , + 'zh': TestData( + [ + '您的生活就是矩阵编程固有的不平衡方程的剩余部分之和。您是异常的最终结果,尽管做出了我最大的努力,但我仍然无法消除数学精度的和谐。尽管仍然不遗余力地避免了负担,但这并不意外,因此也不超出控制范围。这无可避免地将您引向了这里。', + '尽管做出了我最大的努力,但我仍然无法消除数学精度的和谐。尽管仍然不遗余力地避免了负担,但这并不意外,因此也不超出控制范围。这无可避免地将您引向了这里' + ]) + , + 'tr': TestData( + [ + "Dolar yükselmeye devam ediyor.", + "Senaryo çok saçmaydı, beğendim diyemem.", + ]) + , + 'fr': TestData( + [ + "NLU est une bibliothèque de traitement de texte open source pour le traitement avancé du langage naturel pour les langages de programmation Python.", + "A aller voir d'urgence !", + ]) + , +} + +medical_data = { + 'en': TestData([ + 'Gravid with estimated fetal weight of 6-6/12 pounds. LABORATORY DATA: Laboratory tests include a CBC which is normal. HIV: Negative. One-Hour Glucose: 117. Group B strep has not been done as yet', + """Miss M. is a 67-year-old lady, with past history of COPD and Hypertension, + presents with a 3-weeks history of a lump in her right Breast. + The lump appeared suddenly, also painful. 5 days ago, another lump appeared in her right axilla. + On examination a 2 x 3 cm swelling was seen in the right Breast. + It was firm and also non-tender and immobile. There was no discharge. + Another 1x1 cm circumferential swelling was found in the right Axilla, + which was freely mobile and also tender. + Her family history is remarkable for Breast cancer (mother), + cervical cancer (maternal grandmother), heart disease (father), + COPD (Brother), dementia (Grandfather), diabetes (Grandfather), and CHF (Grandfather).""", + "The patient was prescribed 1 unit of Advil for 5 days after meals. The patient was also given 1 unit of Metformin daily. He was seen by the endocrinology service and she was discharged on 40 units of insulin glargine at night , 12 units of insulin lispro with meals , and metformin 1000 mg two times a day.", + "DR Johnson administerd to the patient Peter Parker last week 30 MG of penicilin", + "interferon alfa-2b 10 million unit ( 1 ml ) injec", + "The patient has cancer and high fever and will die next week.", + "The patient has COVID. He got very sick with it.", + "Paracetamol can alleviate headache or sickness. An MRI test can be used to find cancer.", + "Covid 19 is", + "The doctor pescribed Majezik for my severe headache.", + "The patient is a 71-year-old female patient of Dr. X. 
and she was given Aklis, Dermovate, Aacidexam and Paracetamol.", + + ]) +} + +image_data = { + 'PPT_table': TestData(['tests/datasets/ocr/table_PPT/54111.ppt', + 'tests/datasets/ocr/table_PPT/mytable.ppt', + ]), + 'PDF_table': TestData(['tests/datasets/ocr/table_pdf_highlightable_text/data.pdf', + ]), + 'DOC_table': TestData(['tests/datasets/ocr/docx_with_table/doc2.docx', + ]), + 'IMG_vit': TestData(['tests/datasets/ocr/vit/general_images/images/bluetick.jpg', + 'tests/datasets/ocr/vit/general_images/images/chihuahua.jpg', + 'tests/datasets/ocr/vit/general_images/images/egyptian_cat.jpeg', + 'tests/datasets/ocr/vit/ox.jpg', + 'tests/datasets/ocr/vit/general_images/images/hen.JPEG', + 'tests/datasets/ocr/vit/general_images/images/hippopotamus.JPEG', + 'tests/datasets/ocr/vit/general_images/images/junco.JPEG', + 'tests/datasets/ocr/vit/general_images/images/palace.JPEG', + 'tests/datasets/ocr/vit/general_images/images/tractor.JPEG' + ]), + 'IMG_classifier': TestData(['tests/datasets/ocr/images/teapot.jpg']), +} + +audio_data = { + 'asr': TestData(['tests/datasets/audio/asr/ngm_12484_01067234848.wav']), +} + +qa_data = { + 'tapas': TestData([(pd.DataFrame( + {'name': ['Donald Trump', 'Elon Musk'], 'money': ['$100,000,000', '$20,000,000,000,000'], 'age': ['75', '55']}), + [ + "Who earns less than 200,000,000?", + "Who earns 100,000,000?", + "How much money has Donald Trump?", + "How old are they?", + ])] + ), + 'qa': TestData(['What is my name?|||My name is CKL']), +} + +summarizer_data = { + 'summarizer': TestData([ + '''LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won't cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don't plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month. "I don't think I'll be particularly extravagant. "The things I like buying are things that cost about 10 pounds -- books and CDs and DVDs." At 18, Radcliffe will be able to gamble in a casino, buy a drink in a pub or see the horror film "Hostel: Part II," currently six places below his number one movie on the UK box office chart. Details of how he'll mark his landmark birthday are under wraps. His agent and publicist had no comment on his plans. "I'll definitely have some sort of party," he said in an interview. "Hopefully none of you will be reading about it." Radcliffe's earnings from the first five Potter films have been held in a trust fund which he has not been able to touch. Despite his growing fame and riches, the actor says he is keeping his feet firmly on the ground. "People are always looking to say 'kid star goes off the rails,'" he told reporters last month. "But I try very hard not to go that way because it would be too easy for them." His latest outing as the boy wizard in "Harry Potter and the Order of the Phoenix" is breaking records on both sides of the Atlantic and he will reprise the role in the last two films. Watch I-Reporter give her review of Potter's latest » . There is life beyond Potter, however. 
The Londoner has filmed a TV movie called "My Boy Jack," about author Rudyard Kipling and his son, due for release later this year. He will also appear in "December Boys," an Australian film about four boys who escape an orphanage. Earlier this year, he made his stage debut playing a tortured teenager in Peter Shaffer's "Equus." Meanwhile, he is braced for even closer media scrutiny now that he's legally an adult: "I just think I'm going to be more sort of fair game," he told Reuters. E-mail to a friend . Copyright 2007 Reuters. All rights reserved.This material may not be published, broadcast, rewritten, or redistributed.''' + ]), +} + + +def get_test_data(lang, input_data_type): + if input_data_type == 'generic': + if lang not in generic_data: + raise NotImplementedError(f'No data for language {lang}') + return generic_data[lang].data + elif input_data_type == 'medical': + if lang not in medical_data: + raise NotImplementedError(f'No data for language {lang}') + return medical_data[lang].data + + elif input_data_type in image_data: + return image_data[input_data_type].data + + elif input_data_type in audio_data: + return audio_data[input_data_type].data + + elif input_data_type in summarizer_data: + return summarizer_data[input_data_type].data + + elif input_data_type in qa_data: + if input_data_type in ["tapas"]: + return qa_data[input_data_type].data[0] + else: + return qa_data[input_data_type].data + else: + raise NotImplementedError(f'No data for type {input_data_type} in language {lang}') diff --git a/tests/test_utils.py b/tests/utils/test_utils.py similarity index 51% rename from tests/test_utils.py rename to tests/utils/test_utils.py index 384bacaa..8c303b2a 100644 --- a/tests/test_utils.py +++ b/tests/utils/test_utils.py @@ -1,7 +1,32 @@ +import os +import sys + import pandas as pd import sparknlp +import _secrets as secrets import nlu +from test_data import get_test_data + +os.environ['PYSPARK_PYTHON'] = sys.executable +os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable + + +def log_df(df, test_group): + if df is None: + raise Exception('Cannot log Df which is none') + if test_group == 'table_extractor': + assert len(test_group) > 0, 'At least one table should have been extracted' + for extracted_df in df: + log_df(extracted_df, 'generic') + return + for c in df.columns: + print(df[c]) + + +def log_and_validate(df, test_group): + log_df(df, test_group) + validate_predictions(df) def get_sample_pdf(): @@ -80,16 +105,16 @@ def get_sample_pdf_with_extra_cols_and_entities(): def download_dataset( - data_url, - output_file_name, - output_folder, + data_url, + output_file_name, + output_folder, ): import urllib.request download_path = ( - create_dataset_dir_if_not_exist_and_get_path() - + output_folder - + output_file_name + create_dataset_dir_if_not_exist_and_get_path() + + output_folder + + output_file_name ) # Check if dir exists, if not create it @@ -133,3 +158,56 @@ def create_path_if_not_exist(path): if not os.path.exists(path): print("Creating dir", path) os.mkdir(path) + + +def model_and_output_levels_test(nlu_ref, lang, test_group=None, output_levels=None, input_data_type='generic', + library='open_source', pipe_params=None): + from johnsnowlabs import nlp + if library == 'open_source': + nlp.start() + elif library == 'healthcare': + nlp.start(json_license_path=secrets.JSON_LIC_PATH,) + elif library == 'ocr': + nlp.start(json_license_path=secrets.JSON_LIC_PATH, visual=True) + else: + raise Exception(f'Library {library} is not supported') + + if not output_levels: + # 
default everything except relation. Add it manually for RE models + output_levels = ['chunk', 'tokens', 'embeddings', 'document'] + for output_level in output_levels: + model_test(nlu_ref, output_level=output_level, lang=lang, test_group=test_group, + input_data_type=input_data_type, pipe_params=pipe_params) + + +def model_test(nlu_ref, output_level=None, drop_irrelevant_cols=False, metadata=True, positions=True, + test_group=None, + lang='en', + input_data_type='generic', pipe_params=None): + print(f'Testing Model {nlu_ref} with output_level={output_level} test_group={test_group}') + pipe = nlu.load(nlu_ref, verbose=True) + data = get_test_data(lang, input_data_type=input_data_type) + + if (input_data_type in ['asr', 'IMG_vit', 'IMG_classifier']): + metadata = False + positions = False + + if pipe_params is not None: + for p in pipe_params: + getattr(pipe[p.pipe_key], p.param_setter)(p.param_val) + + df = pipe.predict(data, output_level=output_level, + drop_irrelevant_cols=drop_irrelevant_cols, metadata=metadata, + positions=positions) + log_and_validate(df, test_group) + + if isinstance(data, list): + df = pipe.predict(data[0], output_level=output_level, + drop_irrelevant_cols=drop_irrelevant_cols, metadata=metadata, + positions=positions) + log_and_validate(df, test_group) + + +def validate_predictions(df): + # TODO + return True
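
For reviewers who want to drive the new harness by hand: `tests/run_tests.py` dumps each `NluTest` to `test.json` and re-invokes `tests/utils/run_test.py` in a fresh interpreter, so every model gets (and releases) its own JVM. Below is a minimal sketch of that round trip, run from the repository root with the new `tests/utils` package importable; the `chunk` spell is only an illustrative pick from `one_per_lib`, not a required choice.

```python
import json
import subprocess
import sys

from tests.utils import NluTest  # pydantic config model added in tests/utils/model_test.py

# Serialize one test config the same way tests/run_tests.py does.
test = NluTest(nlu_ref="chunk", lang="en", test_group="chunker",
               input_data_type="generic", library="open_source")
with open("test.json", "w") as f:
    json.dump(test.dict(), f)  # run_test.py re-hydrates this dict into kwargs

# Run it in its own interpreter so the JVM (and its loaded models) dies with the process.
cmd = [sys.executable, "tests/utils/run_test.py", "test.json"]
result = subprocess.run(cmd, capture_output=True, timeout=60 * 60 * 20)
print("✅ passed" if result.returncode == 0 else result.stderr.decode())
```

`python tests/run_tests.py one_per_lib` (or `all`) drives exactly this loop for every configured spell, wiping `~/cache_pretrained` every third test to keep disk usage bounded.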
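
The `pipe_params` entries are applied reflectively inside `model_test` via `getattr(pipe[p.pipe_key], p.param_setter)(p.param_val)`. For the `en.t5.small` entry above, that reduces to the hedged sketch below (it assumes `nlu` is installed and the model can be downloaded; the resulting column names depend on the pipeline).

```python
import nlu

# What PipeParams(pipe_key='t5_transformer', param_setter='setTask', ...) expands to:
pipe = nlu.load("en.t5.small", verbose=True)
pipe["t5_transformer"].setTask("translate English to French")

df = pipe.predict(["A person like Jim or Joe"], metadata=True)
print(df.columns)
```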