diff --git a/nlu/__init__.py b/nlu/__init__.py
index 16559b4e..1e924ff1 100644
--- a/nlu/__init__.py
+++ b/nlu/__init__.py
@@ -1,5 +1,4 @@
-__version__ = '5.1.3'
-
+__version__ = '5.1.4'
 
 import nlu.utils.environment.env_utils as env_utils
 
diff --git a/nlu/components/classifiers/span_medical/__init__.py b/nlu/components/classifiers/span_medical/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/nlu/components/classifiers/span_medical/span_medical.py b/nlu/components/classifiers/span_medical/span_medical.py
new file mode 100644
index 00000000..cb9be4c8
--- /dev/null
+++ b/nlu/components/classifiers/span_medical/span_medical.py
@@ -0,0 +1,18 @@
+class SpanMedical:
+    @staticmethod
+    def get_default_model():
+        from sparknlp_jsl.annotator import MedicalQuestionAnswering
+
+        return MedicalQuestionAnswering.pretrained() \
+            .setInputCols(["document_question", "context"]) \
+            .setOutputCol("answer")
+
+
+
+    @staticmethod
+    def get_pretrained_model(name, language, bucket=None):
+        from sparknlp_jsl.annotator import MedicalQuestionAnswering
+
+        return MedicalQuestionAnswering.pretrained(name, language, bucket) \
+            .setInputCols(["document_question", "context"]) \
+            .setOutputCol("answer")
diff --git a/nlu/pipe/col_substitution/col_substitution_HC.py b/nlu/pipe/col_substitution/col_substitution_HC.py
index b483b0d8..8aa13231 100644
--- a/nlu/pipe/col_substitution/col_substitution_HC.py
+++ b/nlu/pipe/col_substitution/col_substitution_HC.py
@@ -395,3 +395,28 @@ def substitute_generic_classifier_parser_cols(c, cols, is_unique=True, nlu_ident
             logger.info(f'Dropping unmatched metadata_col={col} for c={c}')
             # new_cols[col]= f"{new_base_name}_confidence"
     return new_cols
+def substitute_hc_span_classifier_cols(c, cols, nlu_identifier=True):
+    """
+    Substitute col names for medical QA span classifiers
+    """
+    new_cols = {}
+    # new_base_name = 'answer' if nlu_identifier == 'UNIQUE' else f'{nlu_identifier}_answer'
+    new_base_name = 'answer'
+    for col in cols:
+        if 'answer_results_score' in col:
+            new_cols[col] = f'{new_base_name}_confidence'
+        elif 'answer_results' in col:
+            new_cols[col] = f'{new_base_name}'
+
+        elif 'span_start_score' in col:
+            new_cols[col] = f'{new_base_name}_start_confidence'
+        elif 'span_end_score' in col:
+            new_cols[col] = f'{new_base_name}_end_confidence'
+        elif 'start' in col and 'score' not in col:
+            new_cols[col] = f'{new_base_name}_start'
+        elif 'end' in col and 'score' not in col:
+            new_cols[col] = f'{new_base_name}_end'
+        elif 'sentence' in col:
+            new_cols[col] = f'{new_base_name}_sentence'
+
+    return new_cols
\ No newline at end of file
diff --git a/nlu/pipe/nlu_component.py b/nlu/pipe/nlu_component.py
index bde1a292..ca7d94b0 100644
--- a/nlu/pipe/nlu_component.py
+++ b/nlu/pipe/nlu_component.py
@@ -76,6 +76,7 @@ def __init__(self,
                  requires_image_format: bool = False,  # Set to true for OCR annotators that require image format
                  is_visual_annotator: bool = False,  # Set to true for OCR annotators that require image format
                  is_light_pipe_incompatible: bool = False,  # Set to true for OCR annotators that require image format
+                 prefer_light_pipe: bool = False,  # Set to True for annotators that should run in a light pipe
                  ):
         self.name = name
         self.type = type
@@ -118,6 +119,7 @@ def __init__(self,
         self.requires_image_format = requires_image_format
         self.is_visual_annotator = is_visual_annotator
         self.is_light_pipe_incompatible = is_light_pipe_incompatible
+        self.prefer_light_pipe = prefer_light_pipe
 
     def set_metadata(self, jsl_anno_object: Union[AnnotatorApproach, AnnotatorModel],
                      nlu_ref: str,
diff --git a/nlu/pipe/utils/data_conversion_utils.py b/nlu/pipe/utils/data_conversion_utils.py
index 3741a21e..7743d238 100644
--- a/nlu/pipe/utils/data_conversion_utils.py
+++ b/nlu/pipe/utils/data_conversion_utils.py
@@ -13,18 +13,25 @@
 from pyspark.sql.types import StringType, StructType, StructField
 
 
+class NluDataParseException(Exception):
+    """Raised when input data cannot be parsed into a format NLU understands"""
+
+    def __init__(self, message="An error occurred parsing data with NLU"):
+        self.message = message
+        super().__init__(self.message)
+
 class DataConversionUtils:
     # Modin aswell but optional, so we dont import the type yet
     supported_types = [pyspark.sql.DataFrame, pd.DataFrame, pd.Series, np.ndarray]
 
     @staticmethod
     def except_text_col_not_found(cols):
-        raise ValueError(
+        raise NluDataParseException(
            f'Could not find column named "text" in input Pandas Dataframe. Please ensure one column named such exists. Columns in DF are : {cols} ')
 
     @staticmethod
     def except_invalid_question_data_format(cols):
-        raise ValueError(
+        raise NluDataParseException(
            f'You input data format is invalid for question answering with span classification.'
            f'Make sure you have at least 2 columns in you dataset, named context/question for pandas Dataframes'
            f'For Strings/Iterables/Tuples make sure to use the format `question|||context` or (question,context) ')
@@ -301,7 +308,6 @@ def to_spark_df(data, spark_sess, raw_text_column='text', is_span_data=False, is
             # TODO invalid Table Data Format Exception
             pass
         if isinstance(data[0], str):
-
             return DataConversionUtils.table_question_str_to_sdf(data, spark_sess)
         if isinstance(data[0], pd.DataFrame):
             return DataConversionUtils.table_question_pdf_to_sdf(data, spark_sess)
@@ -321,6 +327,8 @@ def to_spark_df(data, spark_sess, raw_text_column='text', is_span_data=False, is
                 return DataConversionUtils.question_tuple_iterable_to_sdf(data, spark_sess)
             elif isinstance(data[0], str):
                 return DataConversionUtils.question_str_iterable_to_sdf(data, spark_sess)
+        except NluDataParseException as err:
+            raise err
         except:
             ValueError("Data could not be converted to Spark Dataframe for internal conversion.")
     else:
diff --git a/nlu/pipe/utils/pipe_utils.py b/nlu/pipe/utils/pipe_utils.py
index 62d12b8a..9c80913a 100644
--- a/nlu/pipe/utils/pipe_utils.py
+++ b/nlu/pipe/utils/pipe_utils.py
@@ -668,6 +668,9 @@ def add_metadata_to_pipe(pipe: NLUPipeline):
 
     for c in pipe.components:
         # Check for OCR componments
+        if c.prefer_light_pipe:
+            pipe.prefer_light = True
+
         if c.jsl_anno_py_class in py_class_to_anno_id.keys() or c.is_visual_annotator:
             pipe.contains_ocr_components = True
         if c.requires_image_format:
diff --git a/nlu/pipe/utils/predict_helper.py b/nlu/pipe/utils/predict_helper.py
index 1af89f5d..9a2cb4de 100644
--- a/nlu/pipe/utils/predict_helper.py
+++ b/nlu/pipe/utils/predict_helper.py
@@ -8,7 +8,7 @@
 from sparknlp.common import AnnotatorType
 
 from nlu.pipe.utils.audio_data_conversion_utils import AudioDataConversionUtils
-from nlu.pipe.utils.data_conversion_utils import DataConversionUtils
+from nlu.pipe.utils.data_conversion_utils import DataConversionUtils, NluDataParseException
 from nlu.pipe.utils.ocr_data_conversion_utils import OcrDataConversionUtils
 
 logger = logging.getLogger('nlu')
@@ -211,8 +211,6 @@ def __db_endpoint_predict__(pipe, data):
     1) parse pred params from first row maybe
     2) serialize/deserialize img
     """
-    print("CUSOTM NLU MODE!")
-    print(data.columns)
     params = PredictParams.maybe_from_pandas_df(data)
     if params:
         params = params.dict()
@@ -366,12 +364,14 @@ def __predict__(pipe, data, output_level, positions, keep_stranger_features, met
     try:
         return __predict_standard_spark(pipe, data, output_level, positions, keep_stranger_features, metadata,
                                         drop_irrelevant_cols, return_spark_df, get_embeddings)
+    except NluDataParseException as err:
+        logger.warning(f"Predictions Failed={err}")
+        raise err
     except Exception as err:
         logger.warning(f"Predictions Failed={err}")
         pipe.print_exception_err(err)
         raise Exception("Failure to process data with NLU")
-
 
 def debug_print_pipe_cols(pipe):
     for c in pipe.components:
         print(f'{c.spark_input_column_names}->{c.name}->{c.spark_output_column_names}')
diff --git a/nlu/spellbook.py b/nlu/spellbook.py
index 6fb87f36..58d386e8 100644
--- a/nlu/spellbook.py
+++ b/nlu/spellbook.py
@@ -10598,7 +10598,7 @@ class Spellbook:
               'de.deid.pipeline': 'german_deid_pipeline_spark24',
               'de.med_ner.deid_generic.pipeline': 'ner_deid_generic_pipeline'},
         'en': {
-
+            'en.answer_question.clinical_notes_onnx.pipeline': 'clinical_notes_qa_base_onnx_pipeline',
             'en.classify.bert_sequence.binary_rct_biobert.pipeline': 'bert_sequence_classifier_binary_rct_biobert_pipeline',
             'en.classify.bert_sequence.vop_hcp_consult.pipeline': 'bert_sequence_classifier_vop_hcp_consult_pipeline',
             'en.classify.bert_sequence.vop_drug_side_effect.pipeline': 'bert_sequence_classifier_vop_drug_side_effect_pipeline',
@@ -10634,6 +10634,7 @@
             'en.explain_doc.clinical_ade': 'explain_clinical_doc_ade',
             'en.explain_doc.clinical_radiology.pipeline': 'explain_clinical_doc_radiology',
             'en.explain_doc.era': 'explain_clinical_doc_era',
+            'en.explain_doc.clinical_granular': 'explain_clinical_doc_granular',
             'en.icd10_icd9.mapping': 'icd10_icd9_mapping',
             'en.icd10cm.umls.mapping': 'icd10cm_umls_mapping',
             'en.icd10cm_resolver.pipeline': 'icd10cm_resolver_pipeline',
@@ -10761,10 +10762,12 @@
             'en.resolve.medication': 'medication_resolver_pipeline',
             'en.resolve.medication_transform.pipeline': 'medication_resolver_transform_pipeline',
             'en.rxnorm.umls.mapping': 'rxnorm_umls_mapping',
+            'en.rxnorm.mes.mapping': 'rxnorm_mesh_mapping',
             'en.snomed.umls.mapping': 'snomed_umls_mapping',
             'en.spell.clinical.pipeline': 'spellcheck_clinical_pipeline',
             'en.summarize.biomedical_pubmed.pipeline':'summarizer_biomedical_pubmed_pipeline',
             'en.summarize.clinical_guidelines_large.pipeline': 'summarizer_clinical_guidelines_large_pipeline',
+            'en.summarize.clinical_laymen_onnx.pipeline': 'summarizer_clinical_laymen_onnx_pipeline',
             'en.summarize.clinical_jsl_augmented.pipeline': 'summarizer_clinical_jsl_augmented_pipeline',
             'en.summarize.clinical_questions.pipeline': 'summarizer_clinical_questions_pipeline',
             'en.summarize.generic_jsl.pipeline': 'summarizer_generic_jsl_pipeline',
@@ -11105,6 +11108,7 @@
             'en.med_ner.tumour': 'nerdl_tumour_demo',
             'en.med_ner.vop': 'ner_vop',
             'en.med_ner.vop_emb_clinical_large': 'ner_vop_emb_clinical_large',
+            'en.med_ner.vop_langtest': 'ner_vop_langtest',
             'en.mesh_to_umls': 'mesh_umls_mapper',
             'en.ner.clinical_trials_abstracts': 'ner_clinical_trials_abstracts',
             'en.ner.drug_development_trials': 'bert_token_classifier_drug_development_trials',
@@ -16830,6 +16834,7 @@
         'nerdl_tumour_demo': 'MedicalNerModel',
         'ner_vop': 'MedicalNerModel',
         'ner_vop_emb_clinical_large': 'MedicalNerModel',
+        'ner_vop_langtest': 'MedicalNerModel',
         'ngram': 'NGramGenerator',
         'nl': 'RoBertaEmbeddings',
         'nli_mpnet_base_v2': 'MPNetEmbeddings',
diff --git a/nlu/universe/annotator_class_universe.py b/nlu/universe/annotator_class_universe.py
index a6254eed..088a14a6 100644
--- a/nlu/universe/annotator_class_universe.py
+++ b/nlu/universe/annotator_class_universe.py
@@ -15,7 +15,7 @@ class AnnoClassRef:
     JSL_anno2_py_class: Dict[JslAnnoId, JslAnnoPyClass] = {
         A_N.E5_SENTENCE_EMBEDDINGS: 'E5Embeddings',
-        A_N.INSTRUCTOR_SENTENCE_EMBEDDINGS:'InstructorEmbeddings',
+        A_N.INSTRUCTOR_SENTENCE_EMBEDDINGS: 'InstructorEmbeddings',
 
         A_N.WHISPER_FOR_CTC: 'WhisperForCTC',
         A_N.HUBERT_FOR_CTC: 'HubertForCTC',
@@ -240,6 +240,7 @@ class AnnoClassRef:
 
     }
     JSL_anno_HC_ref_2_py_class: Dict[JslAnnoId, JslAnnoPyClass] = {
+        HC_A_N.MEDICAL_QUESTION_ANSWERING: 'MedicalQuestionAnswering',
         HC_A_N.MEDICAL_TEXT_GENERATOR: 'MedicalTextGenerator',
         HC_A_N.MEDICAL_SUMMARIZER:'MedicalSummarizer',
         HC_A_N.ZERO_SHOT_NER: 'ZeroShotNerModel',
diff --git a/nlu/universe/component_universes.py b/nlu/universe/component_universes.py
index 6d8062c6..e94e4f4e 100644
--- a/nlu/universe/component_universes.py
+++ b/nlu/universe/component_universes.py
@@ -45,6 +45,7 @@
 from nlu.components.classifiers.span_longformer.span_longformer import SpanLongFormerClassifier
 from nlu.components.classifiers.span_roberta.span_roberta import SpanRobertaClassifier
 from nlu.components.classifiers.span_xlm_roberta.span_xlm_roberta import SpanXlmRobertaClassifier
+from nlu.components.classifiers.span_medical.span_medical import SpanMedical
 from nlu.components.classifiers.token_albert.token_albert import TokenAlbert
 from nlu.components.classifiers.token_bert.token_bert import TokenBert
 from nlu.components.classifiers.token_bert_healthcare.token_bert_healthcare import TokenBertHealthcare
@@ -3278,6 +3279,27 @@
                                             computation_context=ComputeContexts.spark,
                                             output_context=ComputeContexts.spark,
                                             ),
+        H_A.MEDICAL_QUESTION_ANSWERING: partial(NluComponent,
+                                                name=H_A.MEDICAL_QUESTION_ANSWERING,
+                                                jsl_anno_class_id=H_A.MEDICAL_QUESTION_ANSWERING,
+                                                jsl_anno_py_class=ACR.JSL_anno_HC_ref_2_py_class[
+                                                    H_A.MEDICAL_QUESTION_ANSWERING],
+                                                node=NLP_HC_FEATURE_NODES.nodes[
+                                                    H_A.MEDICAL_QUESTION_ANSWERING],
+                                                get_default_model=SpanMedical.get_default_model,
+                                                get_pretrained_model=SpanMedical.get_pretrained_model,
+                                                type=T.QUESTION_SPAN_CLASSIFIER,
+                                                pdf_extractor_methods={
+                                                    'default': default_span_classifier_config,
+                                                    'default_full': default_full_span_classifier_config, },
+                                                pdf_col_name_substitutor=substitute_hc_span_classifier_cols,
+                                                output_level=L.INPUT_DEPENDENT_DOCUMENT_CLASSIFIER,
+                                                description='Medical question answering model that extracts an answer from a clinical context for a given question',
+                                                provider=ComponentBackends.hc,
+                                                license=Licenses.hc,
+                                                computation_context=ComputeContexts.spark,
+                                                output_context=ComputeContexts.spark,
+                                                ),
 
         A.MULTI_DOCUMENT_ASSEMBLER: partial(NluComponent,
                                             name=A.MULTI_DOCUMENT_ASSEMBLER,
@@ -3880,6 +3902,7 @@
                                         ),
 
         H_A.CHUNK_MAPPER_MODEL: partial(NluComponent,
+                                        prefer_light_pipe=True,
                                         name=H_A.CHUNK_MAPPER_MODEL,
                                         type=T.CHUNK_MAPPER,
                                         get_default_model=ChunkMapper.get_default_model,
diff --git a/nlu/universe/feature_node_ids.py b/nlu/universe/feature_node_ids.py
index 39edaf9c..c7fbf027 100644
--- a/nlu/universe/feature_node_ids.py
+++ b/nlu/universe/feature_node_ids.py
@@ -303,6 +303,7 @@ class NLP_HC_NODE_IDS: # or Mode Node?
     ENTITY_CHUNK_EMBEDDING = JslAnnoId('entity_chunk_embedding')
     MEDICAL_SUMMARIZER = JslAnnoId('med_summarizer')
     MEDICAL_TEXT_GENERATOR = JslAnnoId('med_text_generator')
+    MEDICAL_QUESTION_ANSWERING = JslAnnoId('med_question_answering')
 
 class OCR_NODE_IDS:
     """All available Feature nodes in OCR
diff --git a/nlu/universe/feature_node_universes.py b/nlu/universe/feature_node_universes.py
index 6035d5ba..88c54a28 100644
--- a/nlu/universe/feature_node_universes.py
+++ b/nlu/universe/feature_node_universes.py
@@ -392,6 +392,8 @@ class NLP_HC_FEATURE_NODES():
     H_F = NLP_HC_FEATURES  # HC Feature Nodes
 
     nodes = {
+        A.MEDICAL_QUESTION_ANSWERING: NlpFeatureNode(A.MEDICAL_QUESTION_ANSWERING, [F.DOCUMENT_QUESTION, F.DOCUMENT_QUESTION_CONTEXT], [F.CLASSIFIED_SPAN]),
+
         A.MEDICAL_TEXT_GENERATOR: NlpFeatureNode(A.MEDICAL_TEXT_GENERATOR, [F.DOCUMENT], [F.DOCUMENT_GENERATED]),
 
         A.MEDICAL_SUMMARIZER: NlpFeatureNode(A.MEDICAL_SUMMARIZER, [F.DOCUMENT], [F.DOCUMENT_GENERATED]),
diff --git a/setup.py b/setup.py
index 8c2ae3f2..743cff6a 100644
--- a/setup.py
+++ b/setup.py
@@ -27,8 +27,7 @@
     name='nlu',
 
-    version='5.1.3',
-
+    version='5.1.4',
 
     description='John Snow Labs NLU provides state of the art algorithms for NLP&NLU with 20000+ of pretrained models in 200+ languages. It enables swift and simple development and research with its powerful Pythonic and Keras inspired API. It is powerd by John Snow Labs powerful Spark NLP library.',
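
Usage sketch (illustrative only; assumes a licensed Spark NLP for Healthcare / sparknlp_jsl installation, and the question/context strings are made-up example data): the QA pipeline registered in the spellbook above loads through the regular nlu.load API, and question/context pairs use the `question|||context` string format that DataConversionUtils expects for span-classification QA:

    import nlu

    # Load the medical QA pipeline added to the spellbook in this release
    pipe = nlu.load('en.answer_question.clinical_notes_onnx.pipeline')
    # Question and context are separated by `|||` (see except_invalid_question_data_format)
    df = pipe.predict('What was prescribed?|||The patient was given 500 mg of amoxicillin.')
    # For the MedicalQuestionAnswering component, substitute_hc_span_classifier_cols
    # maps raw output columns to: answer, answer_confidence, answer_start, answer_end, ...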