Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Release/514 #247

Merged
merged 9 commits into from
Feb 8, 2024
3 changes: 1 addition & 2 deletions nlu/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
__version__ = '5.1.3'

__version__ = '5.1.4'


import nlu.utils.environment.env_utils as env_utils
Expand Down
Empty file.
18 changes: 18 additions & 0 deletions nlu/components/classifiers/span_medical/span_medical.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
class SpanMedical:
    """Factory helpers that build `MedicalQuestionAnswering` annotators.

    Both helpers configure the annotator to read the `document_question` and
    `context` columns and to write its result to the `answer` column.
    """

    @staticmethod
    def get_default_model():
        """Load `MedicalQuestionAnswering.pretrained()` with no arguments and wire its columns."""
        # Imported lazily so the licensed healthcare library is only needed
        # when this component is actually requested.
        from sparknlp_jsl.annotator import MedicalQuestionAnswering

        model = MedicalQuestionAnswering.pretrained()
        model = model.setInputCols(["document_question", "context"])
        return model.setOutputCol("answer")

    @staticmethod
    def get_pretrained_model(name, language, bucket=None):
        """Load the pretrained model identified by `name`, `language` and `bucket` and wire its columns.

        All three arguments are forwarded positionally to
        `MedicalQuestionAnswering.pretrained`.
        """
        from sparknlp_jsl.annotator import MedicalQuestionAnswering

        model = MedicalQuestionAnswering.pretrained(name, language, bucket)
        model = model.setInputCols(["document_question", "context"])
        return model.setOutputCol("answer")
25 changes: 25 additions & 0 deletions nlu/pipe/col_substitution/col_substitution_HC.py
Original file line number Diff line number Diff line change
Expand Up @@ -395,3 +395,28 @@ def substitute_generic_classifier_parser_cols(c, cols, is_unique=True, nlu_ident
logger.info(f'Dropping unmatched metadata_col={col} for c={c}')
# new_cols[col]= f"{new_base_name}_confidence"
return new_cols
def substitute_hc_span_classifier_cols(c, cols, nlu_identifier=True):
    """Build the column renaming map for the healthcare QA span classifier.

    Every column name in `cols` is checked for a set of known substrings and,
    when one matches, mapped to a name derived from the fixed base `answer`.
    Columns that match none of the substrings are left out of the result.

    :param c: the component the columns belong to; not read by this function.
    :param cols: iterable of raw column names (strings).
    :param nlu_identifier: accepted for signature compatibility; currently
        ignored, the base name is always `answer`.
    :return: dict mapping each recognised raw column name to its new name.
    """
    new_base_name = 'answer'
    new_cols = {}
    for col in cols:
        is_score_col = 'score' in col
        # Branch order encodes precedence: the most specific substrings come
        # first and the bare `answer_results` match is the fallback, so a
        # column containing several markers resolves to a single name.
        if 'answer_results_score' in col:
            new_cols[col] = f'{new_base_name}_confidence'
        elif 'span_start_score' in col:
            new_cols[col] = f'{new_base_name}_start_confidence'
        elif 'span_end_score' in col:
            new_cols[col] = f'{new_base_name}_end_confidence'
        elif 'start' in col and not is_score_col:
            new_cols[col] = f'{new_base_name}_start'
        elif 'end' in col and not is_score_col:
            new_cols[col] = f'{new_base_name}_end'
        elif 'sentence' in col:
            new_cols[col] = f'{new_base_name}_sentence'
        elif 'answer_results' in col:
            new_cols[col] = new_base_name

    return new_cols
2 changes: 2 additions & 0 deletions nlu/pipe/nlu_component.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ def __init__(self,
requires_image_format: bool = False, # Set to true for OCR annotators that require image format
is_visual_annotator: bool = False, # Set to true for OCR annotators that require image format
is_light_pipe_incompatible: bool = False, # Set to true for OCR annotators that require image format
prefer_light_pipe: bool = False, # Set True for annos that should run in light pipe
):
self.name = name
self.type = type
Expand Down Expand Up @@ -118,6 +119,7 @@ def __init__(self,
self.requires_image_format = requires_image_format
self.is_visual_annotator = is_visual_annotator
self.is_light_pipe_incompatible = is_light_pipe_incompatible
self.prefer_light_pipe = prefer_light_pipe

def set_metadata(self, jsl_anno_object: Union[AnnotatorApproach, AnnotatorModel],
nlu_ref: str,
Expand Down
14 changes: 11 additions & 3 deletions nlu/pipe/utils/data_conversion_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,18 +13,25 @@
from pyspark.sql.types import StringType, StructType, StructField


class NluDataParseException(Exception):
    """Error type for input data that NLU could not parse.

    The text passed in is kept on the `message` attribute and is also handed
    to `Exception` so that `str(err)` yields the same text.
    """

    def __init__(self, message="An error occurred parsing data with NLU"):
        super().__init__(message)
        self.message = message

class DataConversionUtils:
# Modin as well, but it is optional, so we don't import the type yet
supported_types = [pyspark.sql.DataFrame, pd.DataFrame, pd.Series, np.ndarray]

@staticmethod
def except_text_col_not_found(cols):
raise ValueError(
raise NluDataParseException(
f'Could not find column named "text" in input Pandas Dataframe. Please ensure one column named such exists. Columns in DF are : {cols} ')

@staticmethod
def except_invalid_question_data_format(cols):
raise ValueError(
raise NluDataParseException(
f'You input data format is invalid for question answering with span classification.'
f'Make sure you have at least 2 columns in you dataset, named context/question for pandas Dataframes'
f'For Strings/Iterables/Tuples make sure to use the format `question|||context` or (question,context) ')
Expand Down Expand Up @@ -301,7 +308,6 @@ def to_spark_df(data, spark_sess, raw_text_column='text', is_span_data=False, is
# TODO invalid Table Data Format Exception
pass
if isinstance(data[0], str):

return DataConversionUtils.table_question_str_to_sdf(data, spark_sess)
if isinstance(data[0], pd.DataFrame):
return DataConversionUtils.table_question_pdf_to_sdf(data, spark_sess)
Expand All @@ -321,6 +327,8 @@ def to_spark_df(data, spark_sess, raw_text_column='text', is_span_data=False, is
return DataConversionUtils.question_tuple_iterable_to_sdf(data, spark_sess)
elif isinstance(data[0], str):
return DataConversionUtils.question_str_iterable_to_sdf(data, spark_sess)
except NluDataParseException as err :
raise err
except:
ValueError("Data could not be converted to Spark Dataframe for internal conversion.")
else:
Expand Down
3 changes: 3 additions & 0 deletions nlu/pipe/utils/pipe_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -668,6 +668,9 @@ def add_metadata_to_pipe(pipe: NLUPipeline):

for c in pipe.components:
# Check for OCR components
if c.prefer_light_pipe:
pipe.prefer_light = True

if c.jsl_anno_py_class in py_class_to_anno_id.keys() or c.is_visual_annotator:
pipe.contains_ocr_components = True
if c.requires_image_format:
Expand Down
8 changes: 4 additions & 4 deletions nlu/pipe/utils/predict_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from sparknlp.common import AnnotatorType

from nlu.pipe.utils.audio_data_conversion_utils import AudioDataConversionUtils
from nlu.pipe.utils.data_conversion_utils import DataConversionUtils
from nlu.pipe.utils.data_conversion_utils import DataConversionUtils, NluDataParseException
from nlu.pipe.utils.ocr_data_conversion_utils import OcrDataConversionUtils

logger = logging.getLogger('nlu')
Expand Down Expand Up @@ -211,8 +211,6 @@ def __db_endpoint_predict__(pipe, data):
1) parse pred params from first row maybe
2) serialize/deserialize img
"""
print("CUSOTM NLU MODE!")
print(data.columns)
params = PredictParams.maybe_from_pandas_df(data)
if params:
params = params.dict()
Expand Down Expand Up @@ -366,12 +364,14 @@ def __predict__(pipe, data, output_level, positions, keep_stranger_features, met
try:
return __predict_standard_spark(pipe, data, output_level, positions, keep_stranger_features, metadata,
drop_irrelevant_cols, return_spark_df, get_embeddings)
except NluDataParseException as err:
logger.warning(f"Predictions Failed={err}")
raise err
except Exception as err:
logger.warning(f"Predictions Failed={err}")
pipe.print_exception_err(err)
raise Exception("Failure to process data with NLU")


def debug_print_pipe_cols(pipe):
    """Print one `input columns->name->output columns` line per component in `pipe.components`."""
    for c in pipe.components:
        print(f'{c.spark_input_column_names}->{c.name}->{c.spark_output_column_names}')
7 changes: 6 additions & 1 deletion nlu/spellbook.py
Original file line number Diff line number Diff line change
Expand Up @@ -10598,7 +10598,7 @@ class Spellbook:
'de.deid.pipeline': 'german_deid_pipeline_spark24',
'de.med_ner.deid_generic.pipeline': 'ner_deid_generic_pipeline'},
'en': {

'en.answer_question.clinical_notes_onnx.pipeline': 'clinical_notes_qa_base_onnx_pipeline',
'en.classify.bert_sequence.binary_rct_biobert.pipeline': 'bert_sequence_classifier_binary_rct_biobert_pipeline',
'en.classify.bert_sequence.vop_hcp_consult.pipeline': 'bert_sequence_classifier_vop_hcp_consult_pipeline',
'en.classify.bert_sequence.vop_drug_side_effect.pipeline': 'bert_sequence_classifier_vop_drug_side_effect_pipeline',
Expand Down Expand Up @@ -10634,6 +10634,7 @@ class Spellbook:
'en.explain_doc.clinical_ade': 'explain_clinical_doc_ade',
'en.explain_doc.clinical_radiology.pipeline': 'explain_clinical_doc_radiology',
'en.explain_doc.era': 'explain_clinical_doc_era',
'en.explain_doc.clinical_granular': 'explain_clinical_doc_granular',
'en.icd10_icd9.mapping': 'icd10_icd9_mapping',
'en.icd10cm.umls.mapping': 'icd10cm_umls_mapping',
'en.icd10cm_resolver.pipeline': 'icd10cm_resolver_pipeline',
Expand Down Expand Up @@ -10761,10 +10762,12 @@ class Spellbook:
'en.resolve.medication': 'medication_resolver_pipeline',
'en.resolve.medication_transform.pipeline': 'medication_resolver_transform_pipeline',
'en.rxnorm.umls.mapping': 'rxnorm_umls_mapping',
'en.rxnorm.mes.mapping': 'rxnorm_mesh_mapping',
'en.snomed.umls.mapping': 'snomed_umls_mapping',
'en.spell.clinical.pipeline': 'spellcheck_clinical_pipeline',
'en.summarize.biomedical_pubmed.pipeline':'summarizer_biomedical_pubmed_pipeline',
'en.summarize.clinical_guidelines_large.pipeline': 'summarizer_clinical_guidelines_large_pipeline',
'en.summarize.clinical_laymen_onnx.pipeline': 'summarizer_clinical_laymen_onnx_pipeline',
'en.summarize.clinical_jsl_augmented.pipeline': 'summarizer_clinical_jsl_augmented_pipeline',
'en.summarize.clinical_questions.pipeline': 'summarizer_clinical_questions_pipeline',
'en.summarize.generic_jsl.pipeline': 'summarizer_generic_jsl_pipeline',
Expand Down Expand Up @@ -11105,6 +11108,7 @@ class Spellbook:
'en.med_ner.tumour': 'nerdl_tumour_demo',
'en.med_ner.vop': 'ner_vop',
'en.med_ner.vop_emb_clinical_large': 'ner_vop_emb_clinical_large',
'en.med_ner.vop_langtest': 'ner_vop_langtest',
'en.mesh_to_umls': 'mesh_umls_mapper',
'en.ner.clinical_trials_abstracts': 'ner_clinical_trials_abstracts',
'en.ner.drug_development_trials': 'bert_token_classifier_drug_development_trials',
Expand Down Expand Up @@ -16830,6 +16834,7 @@ class Spellbook:
'nerdl_tumour_demo': 'MedicalNerModel',
'ner_vop': 'MedicalNerModel',
'ner_vop_emb_clinical_large': 'MedicalNerModel',
'ner_vop_langtest': 'MedicalNerModel',
'ngram': 'NGramGenerator',
'nl': 'RoBertaEmbeddings',
'nli_mpnet_base_v2': 'MPNetEmbeddings',
Expand Down
3 changes: 2 additions & 1 deletion nlu/universe/annotator_class_universe.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ class AnnoClassRef:
JSL_anno2_py_class: Dict[JslAnnoId, JslAnnoPyClass] = {

A_N.E5_SENTENCE_EMBEDDINGS: 'E5Embeddings',
A_N.INSTRUCTOR_SENTENCE_EMBEDDINGS:'InstructorEmbeddings',
A_N.INSTRUCTOR_SENTENCE_EMBEDDINGS: 'InstructorEmbeddings',

A_N.WHISPER_FOR_CTC: 'WhisperForCTC',
A_N.HUBERT_FOR_CTC: 'HubertForCTC',
Expand Down Expand Up @@ -240,6 +240,7 @@ class AnnoClassRef:

}
JSL_anno_HC_ref_2_py_class: Dict[JslAnnoId, JslAnnoPyClass] = {
HC_A_N.MEDICAL_QUESTION_ANSWERING: 'MedicalQuestionAnswering',
HC_A_N.MEDICAL_TEXT_GENERATOR: 'MedicalTextGenerator',
HC_A_N.MEDICAL_SUMMARIZER:'MedicalSummarizer',
HC_A_N.ZERO_SHOT_NER: 'ZeroShotNerModel',
Expand Down
23 changes: 23 additions & 0 deletions nlu/universe/component_universes.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@
from nlu.components.classifiers.span_longformer.span_longformer import SpanLongFormerClassifier
from nlu.components.classifiers.span_roberta.span_roberta import SpanRobertaClassifier
from nlu.components.classifiers.span_xlm_roberta.span_xlm_roberta import SpanXlmRobertaClassifier
from nlu.components.classifiers.span_medical.span_medical import SpanMedical
from nlu.components.classifiers.token_albert.token_albert import TokenAlbert
from nlu.components.classifiers.token_bert.token_bert import TokenBert
from nlu.components.classifiers.token_bert_healthcare.token_bert_healthcare import TokenBertHealthcare
Expand Down Expand Up @@ -3278,6 +3279,27 @@ class ComponentUniverse:
computation_context=ComputeContexts.spark,
output_context=ComputeContexts.spark,
),
H_A.MEDICAL_QUESTION_ANSWERING: partial(NluComponent,
name=H_A.MEDICAL_QUESTION_ANSWERING,
jsl_anno_class_id= H_A.MEDICAL_QUESTION_ANSWERING,
jsl_anno_py_class= ACR.JSL_anno_HC_ref_2_py_class[
H_A.MEDICAL_QUESTION_ANSWERING],
node= NLP_HC_FEATURE_NODES.nodes[
H_A.MEDICAL_QUESTION_ANSWERING],
get_default_model= SpanMedical.get_default_model,
get_pretrained_model= SpanMedical.get_pretrained_model,
type= T.QUESTION_SPAN_CLASSIFIER,
pdf_extractor_methods={
'default': default_span_classifier_config,
'default_full': default_full_span_classifier_config, },
pdf_col_name_substitutor=substitute_hc_span_classifier_cols,
output_level=L.INPUT_DEPENDENT_DOCUMENT_CLASSIFIER,
description='TODO',
provider=ComponentBackends.hc,
license=Licenses.hc,
computation_context=ComputeContexts.spark,
output_context=ComputeContexts.spark,
),

A.MULTI_DOCUMENT_ASSEMBLER: partial(NluComponent,
name=A.MULTI_DOCUMENT_ASSEMBLER,
Expand Down Expand Up @@ -3880,6 +3902,7 @@ class ComponentUniverse:
),

H_A.CHUNK_MAPPER_MODEL: partial(NluComponent,
prefer_light_pipe=True,
name=H_A.CHUNK_MAPPER_MODEL,
type=T.CHUNK_MAPPER,
get_default_model=ChunkMapper.get_default_model,
Expand Down
1 change: 1 addition & 0 deletions nlu/universe/feature_node_ids.py
Original file line number Diff line number Diff line change
Expand Up @@ -303,6 +303,7 @@ class NLP_HC_NODE_IDS: # or Mode Node?
ENTITY_CHUNK_EMBEDDING = JslAnnoId('entity_chunk_embedding')
MEDICAL_SUMMARIZER = JslAnnoId('med_summarizer')
MEDICAL_TEXT_GENERATOR = JslAnnoId('med_text_generator')
MEDICAL_QUESTION_ANSWERING = JslAnnoId('med_question_answering')

class OCR_NODE_IDS:
"""All available Feature nodes in OCR
Expand Down
2 changes: 2 additions & 0 deletions nlu/universe/feature_node_universes.py
Original file line number Diff line number Diff line change
Expand Up @@ -392,6 +392,8 @@ class NLP_HC_FEATURE_NODES():
H_F = NLP_HC_FEATURES
# HC Feature Nodes
nodes = {
A.MEDICAL_QUESTION_ANSWERING: NlpFeatureNode(A.MEDICAL_QUESTION_ANSWERING, [F.DOCUMENT_QUESTION, F.DOCUMENT_QUESTION_CONTEXT], [F.CLASSIFIED_SPAN]),

A.MEDICAL_TEXT_GENERATOR: NlpFeatureNode(A.MEDICAL_TEXT_GENERATOR, [F.DOCUMENT], [F.DOCUMENT_GENERATED]),

A.MEDICAL_SUMMARIZER: NlpFeatureNode(A.MEDICAL_SUMMARIZER, [F.DOCUMENT], [F.DOCUMENT_GENERATED]),
Expand Down
3 changes: 1 addition & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,7 @@

name='nlu',

version='5.1.3',

version='5.1.4',

description='John Snow Labs NLU provides state of the art algorithms for NLP&NLU with 20000+ of pretrained models in 200+ languages. It enables swift and simple development and research with its powerful Pythonic and Keras inspired API. It is powerd by John Snow Labs powerful Spark NLP library.',

Expand Down
Loading