From d7813b3e60d24a8d550e801f65d716579da41bf0 Mon Sep 17 00:00:00 2001
From: C-K-Loan
Date: Thu, 6 Jun 2024 17:28:55 +0200
Subject: [PATCH 1/5] store all relevant pipe attributes in UID when storing to disk

---
 nlu/pipe/pipeline.py | 26 +++++++++++++++++++++++---
 1 file changed, 23 insertions(+), 3 deletions(-)

diff --git a/nlu/pipe/pipeline.py b/nlu/pipe/pipeline.py
index e6a1f3af..d33e29df 100644
--- a/nlu/pipe/pipeline.py
+++ b/nlu/pipe/pipeline.py
@@ -62,6 +62,7 @@ def __init__(self):
         self.requires_image_format = False
         self.requires_binary_format = False
         self.is_light_pipe_incompatible = False
+
     def add(self, component: NluComponent, nlu_reference=None, pretrained_pipe_component=False,
             name_to_add='', idx=None):
         '''
@@ -281,7 +282,7 @@ def unpack_and_apply_extractors(self, pdf: Union[pyspark.sql.DataFrame, pd.DataF
         Can process Spark DF output from Vanilla pipes and Pandas Converts outputs of Lightpipeline
         """
-        if isinstance(pdf,pyspark.sql.dataframe.DataFrame):
+        if isinstance(pdf, pyspark.sql.dataframe.DataFrame):
             if 'modificationTime' in pdf.columns:
                 # drop because of
                 # 'TypeError: Casting to unit-less dtype 'datetime64' is not supported.
@@ -435,7 +436,24 @@ def drop_irrelevant_cols(self, cols, keep_origin_index=False):
     def save(self, path, component='entire_pipeline', overwrite=True):
         # serialize data
         data = {}
-        data[0] = {'nlu_ref': self.nlu_ref}
+        data[0] = {
+            'nlu_ref': self.nlu_ref,
+            'contains_ocr_components': self.contains_ocr_components,
+            'contains_audio_components': self.contains_audio_components,
+            'has_nlp_components': self.has_nlp_components,
+            'has_span_classifiers': self.has_span_classifiers,
+            'prefer_light': self.prefer_light,
+            'has_table_qa_models': self.has_table_qa_models,
+            'requires_image_format': self.requires_image_format,
+            'requires_binary_format': self.requires_binary_format,
+            'is_light_pipe_incompatible': self.is_light_pipe_incompatible,
+
+
+            # 'output_positions' :self.output_positions,
+            # 'prediction_output_level' :self.prediction_output_level,
+            # 'component_output_level' :self.component_output_level,
+
+        }
         data['is_nlu_pipe'] = True
         for i, c in enumerate(self.components):
             data[i + 1] = {'nlu_ref': c.nlu_ref, 'nlp_ref': c.nlp_ref,
@@ -496,6 +514,7 @@ def predict(self,
         from nlu.pipe.utils.predict_helper import __predict__
         return __predict__(self, data, output_level, positions, keep_stranger_features, metadata, multithread,
                            drop_irrelevant_cols, return_spark_df, get_embeddings)
+
     def predict_embeds(self,
                        data,
                        multithread=True,
@@ -517,6 +536,7 @@ def predict_embeds(self,
                          multithread=multithread, drop_irrelevant_cols=True, return_spark_df=return_spark_df,
                          get_embeddings=True, embed_only=True)
+
     def print_info(self, minimal=True):
         '''
         Print out information about every component_to_resolve currently loaded in the component_list and their configurable parameters.
@@ -982,7 +1002,7 @@ def __get_finisher_conf(finisher: NluComponent) -> Dict[str, FinisherExtractorCo
     """
     confs = {}
-    m = finisher.model 
+    m = finisher.model
     as_arr = m.getOutputAsArray()
     for c in m.getOutputCols():
         confs[c] = FinisherExtractorConfig(output_as_array=as_arr,
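PATCH 1/5 widens the JSON payload that save() stores in the Spark pipeline's UID via _resetUid: besides nlu_ref, all pipeline-level flags now survive a round-trip to disk. A minimal sketch of that serialization, assuming a pipe object with the attributes shown in the diff (not NLU's actual save path):

    import json

    # Pipeline-level flags persisted in the UID payload (per the diff above).
    PIPE_FLAGS = [
        'contains_ocr_components', 'contains_audio_components', 'has_nlp_components',
        'has_span_classifiers', 'prefer_light', 'has_table_qa_models',
        'requires_image_format', 'requires_binary_format', 'is_light_pipe_incompatible',
    ]

    def build_uid_payload(pipe) -> str:
        # Entry 0 holds pipeline-level state, entries 1..n one dict per component.
        data = {0: {'nlu_ref': pipe.nlu_ref,
                    **{f: getattr(pipe, f) for f in PIPE_FLAGS}},
                'is_nlu_pipe': True}
        for i, c in enumerate(pipe.components):
            data[i + 1] = {'nlu_ref': c.nlu_ref, 'nlp_ref': c.nlp_ref,
                           'loaded_from_pretrained_pipe': c.loaded_from_pretrained_pipe}
        return json.dumps(data)

Note that json.dumps turns the integer keys into strings, which is why the loader in PATCH 5/5 reads data['0'] and data[str(i + 1)] back out.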
From ca10e43c50b79d08fe4b0ed2f5d502667ee3733c Mon Sep 17 00:00:00 2001
From: C-K-Loan
Date: Fri, 7 Jun 2024 05:35:21 +0200
Subject: [PATCH 2/5] store all relevant pipe attributes in UID when storing to disk

---
 .../base_extractor_methods.py | 20 +++++++++++-
 nlu/pipe/pipeline.py          | 32 ++++++++++---------
 2 files changed, 36 insertions(+), 16 deletions(-)

diff --git a/nlu/pipe/extractors/extractor_methods/base_extractor_methods.py b/nlu/pipe/extractors/extractor_methods/base_extractor_methods.py
index 7450697f..e83066de 100644
--- a/nlu/pipe/extractors/extractor_methods/base_extractor_methods.py
+++ b/nlu/pipe/extractors/extractor_methods/base_extractor_methods.py
@@ -137,6 +137,24 @@ def extract_base_sparkocr_features(row: pd.Series, configs: SparkOCRExtractorCon
     return {}


+def extract_finisher_rows(row: pd.Series, configs: FinisherExtractorConfig):
+    keys = [d['_1'] for d in row]
+    keys
+    keys = [d['_1'] for d in row]
+
+    values = [d['_2'] for d in row]
+    keys, values
+    d = {}
+    for r in row:
+        key = r['_1']
+        key = f'{configs.source_col_name}_{key}'
+        value = r['_2']
+        if key not in d:
+            d[key] = []
+        d[key].append(value)
+    return d
+
+
 def extract_base_sparknlp_features(row: pd.Series, configs: SparkNLPExtractorConfig) -> dict:
     """
     Extract base features common in all saprk NLP annotators
@@ -280,7 +298,7 @@ def extract_master(row: pd.Series, configs: SparkNLPExtractorConfig) -> pd.Serie
     if configs.is_meta_field:
         return pd.Series(
             {
-                configs.source_col_name: row
+                **extract_finisher_rows(row, configs)
             })
     else:
         return pd.Series(
diff --git a/nlu/pipe/pipeline.py b/nlu/pipe/pipeline.py
index d33e29df..2664ac64 100644
--- a/nlu/pipe/pipeline.py
+++ b/nlu/pipe/pipeline.py
@@ -433,8 +433,7 @@ def drop_irrelevant_cols(self, cols, keep_origin_index=False):
         if keep_origin_index == False and 'origin_index' in cols: cols.remove('origin_index')
         return cols

-    def save(self, path, component='entire_pipeline', overwrite=True):
-        # serialize data
+    def _get_uid_payload(self):
         data = {}
         data[0] = {
             'nlu_ref': self.nlu_ref,
             'contains_ocr_components': self.contains_ocr_components,
             'contains_audio_components': self.contains_audio_components,
             'has_nlp_components': self.has_nlp_components,
             'has_span_classifiers': self.has_span_classifiers,
             'prefer_light': self.prefer_light,
             'has_table_qa_models': self.has_table_qa_models,
             'requires_image_format': self.requires_image_format,
             'requires_binary_format': self.requires_binary_format,
             'is_light_pipe_incompatible': self.is_light_pipe_incompatible,
-
-
-            # 'output_positions' :self.output_positions,
-            # 'prediction_output_level' :self.prediction_output_level,
-            # 'component_output_level' :self.component_output_level,
-
         }
         data['is_nlu_pipe'] = True
         for i, c in enumerate(self.components):
             data[i + 1] = {'nlu_ref': c.nlu_ref, 'nlp_ref': c.nlp_ref,
                            'loaded_from_pretrained_pipe': c.loaded_from_pretrained_pipe}
-        data = json.dumps(data)
+        return json.dumps(data)

+    def save(self, path, component='entire_pipeline', overwrite=True):
+        # serialize data
+        data = self._get_uid_payload()
         if not self.is_fitted or not hasattr(self, 'vanilla_transformer_pipe'):
             self.fit()
             self.is_fitted = True
         # self.vanilla_transformer_pipe.extractParamMap()
         if hasattr(self, 'nlu_ref'):
-            """ ATTRS TO SAVE FOR EACH COMPONENT / PIPELINE:
-            - nlp ref/nlu ref
-            - is loaded_form_pipe
-            """
             self.vanilla_transformer_pipe._resetUid(data)
             if component == 'entire_pipeline':
                 if overwrite:
@@ -1003,13 +995,23 @@ def __get_finisher_conf(finisher: NluComponent) -> Dict[str, FinisherExtractorCo
     confs = {}
     m = finisher.model
+    m.setIncludeMetadata(True)
     as_arr = m.getOutputAsArray()
+    # hotfix because finisher has no getIncludeMetadata()
+    # For now we always extract all meta
+    has_meta = True
     for c in m.getOutputCols():
         confs[c] = FinisherExtractorConfig(output_as_array=as_arr,
-                                           is_meta_field=True if c.endswith('_metadata') else False,
+                                           is_meta_field=False,
                                            annotation_split_symbol=m.getAnnotationSplitSymbol(),
                                            value_split_symbol=m.getValueSplitSymbol(),
                                            source_col_name=c,
                                            )
-
+        if has_meta:  # since metadata fields dont show up in getOutputCols we need to add them manually
+            confs[f'{c}_metadata'] = FinisherExtractorConfig(output_as_array=as_arr,
+                                                             is_meta_field=True,
+                                                             annotation_split_symbol=m.getAnnotationSplitSymbol(),
+                                                             value_split_symbol=m.getValueSplitSymbol(),
+                                                             source_col_name=f'{c}_metadata',
+                                                             )
     return confs
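PATCH 2/5 also introduces extract_finisher_rows: a Finisher metadata cell arrives as an array of Spark structs whose fields _1 and _2 hold key and value, and the extractor groups all values under a key prefixed with the source column name. A standalone toy run of that grouping logic (made-up values, plain dicts standing in for the structs):

    # A fake Finisher metadata cell: a list of {_1: key, _2: value} structs.
    row = [{'_1': 'confidence', '_2': '0.98'},
           {'_1': 'confidence', '_2': '0.33'},
           {'_1': 'sentence', '_2': '0'}]

    def group_finisher_row(row, source_col_name):
        # Same grouping as extract_finisher_rows in the diff above.
        grouped = {}
        for r in row:
            key = f"{source_col_name}_{r['_1']}"
            grouped.setdefault(key, []).append(r['_2'])
        return grouped

    print(group_finisher_row(row, 'class_metadata'))
    # {'class_metadata_confidence': ['0.98', '0.33'], 'class_metadata_sentence': ['0']}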
From febdfeee8585c1f92ea048401c117ff61ce26d27 Mon Sep 17 00:00:00 2001
From: C-K-Loan
Date: Fri, 7 Jun 2024 06:39:47 +0200
Subject: [PATCH 3/5] properly track columns after finisher extraction when substituting cols

---
 nlu/pipe/col_substitution/col_name_substitution_utils.py   | 3 +--
 .../extractors/extractor_methods/base_extractor_methods.py | 6 ------
 nlu/pipe/pipeline.py                                       | 6 ++++--
 3 files changed, 5 insertions(+), 10 deletions(-)

diff --git a/nlu/pipe/col_substitution/col_name_substitution_utils.py b/nlu/pipe/col_substitution/col_name_substitution_utils.py
index 7cb7a15b..00cea6ef 100644
--- a/nlu/pipe/col_substitution/col_name_substitution_utils.py
+++ b/nlu/pipe/col_substitution/col_name_substitution_utils.py
@@ -130,8 +130,7 @@ def get_final_output_cols_of_component(c, df, anno_2_ex) -> List[str]:
     if c.name == NLP_NODE_IDS.FINISHER:
         result_cols = c.model.getOutputCols()
-        if c.model.getIncludeMetadata():
-            result_cols = result_cols + [f'{col}_metadata' for col in result_cols]
+        result_cols = [c for c in df.columns if any(c.startswith(s) for s in result_cols)]
         return result_cols
     result_cols = []
     if isinstance(configs, SparkOCRExtractorConfig):
diff --git a/nlu/pipe/extractors/extractor_methods/base_extractor_methods.py b/nlu/pipe/extractors/extractor_methods/base_extractor_methods.py
index e83066de..1d06d182 100644
--- a/nlu/pipe/extractors/extractor_methods/base_extractor_methods.py
+++ b/nlu/pipe/extractors/extractor_methods/base_extractor_methods.py
@@ -138,12 +138,6 @@ def extract_base_sparkocr_features(row: pd.Series, configs: SparkOCRExtractorCon


 def extract_finisher_rows(row: pd.Series, configs: FinisherExtractorConfig):
-    keys = [d['_1'] for d in row]
-    keys
-    keys = [d['_1'] for d in row]
-
-    values = [d['_2'] for d in row]
-    keys, values
     d = {}
     for r in row:
         key = r['_1']
diff --git a/nlu/pipe/pipeline.py b/nlu/pipe/pipeline.py
index 2664ac64..ffee5a1f 100644
--- a/nlu/pipe/pipeline.py
+++ b/nlu/pipe/pipeline.py
@@ -1008,10 +1008,12 @@ def __get_finisher_conf(finisher: NluComponent) -> Dict[str, FinisherExtractorCo
                                            source_col_name=c,
                                            )
         if has_meta:  # since metadata fields dont show up in getOutputCols we need to add them manually
-            confs[f'{c}_metadata'] = FinisherExtractorConfig(output_as_array=as_arr,
+            meta_col = f'{c}_metadata'
+            finisher.spark_output_column_names.append(meta_col)
+            confs[meta_col] = FinisherExtractorConfig(output_as_array=as_arr,
                                                              is_meta_field=True,
                                                              annotation_split_symbol=m.getAnnotationSplitSymbol(),
                                                              value_split_symbol=m.getValueSplitSymbol(),
-                                                             source_col_name=f'{c}_metadata',
+                                                             source_col_name=meta_col,
                                                              )
     return confs
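The central move in PATCH 3/5 is in get_final_output_cols_of_component: rather than predicting the Finisher's metadata columns from its params, it derives the final columns from the DataFrame itself via prefix matching, which also captures the per-key columns that extract_finisher_rows from PATCH 2/5 explodes metadata into. A small standalone illustration (column names invented):

    # Columns the Finisher reports vs. columns actually present after extraction.
    finisher_output_cols = ['class']  # e.g. from m.getOutputCols()
    df_columns = ['origin_index', 'class',
                  'class_metadata_confidence', 'class_metadata_sentence']

    # Prefix matching, as in the diff above, also recovers the exploded metadata cols.
    result_cols = [c for c in df_columns
                   if any(c.startswith(s) for s in finisher_output_cols)]
    print(result_cols)
    # ['class', 'class_metadata_confidence', 'class_metadata_sentence']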
From d2b30ada852bef6dca2afa31c57da9791a3a8e26 Mon Sep 17 00:00:00 2001
From: C-K-Loan
Date: Tue, 18 Jun 2024 00:40:56 +0200
Subject: [PATCH 4/5] fix Bug with save-reload ConvNextClassifier

---
 nlu/universe/annotator_class_universe.py |  1 +
 nlu/universe/component_universes.py      | 24 ++++++++++++++++++++++++
 nlu/universe/feature_node_ids.py         |  1 +
 3 files changed, 26 insertions(+)

diff --git a/nlu/universe/annotator_class_universe.py b/nlu/universe/annotator_class_universe.py
index 92157e8a..31c8f0a8 100644
--- a/nlu/universe/annotator_class_universe.py
+++ b/nlu/universe/annotator_class_universe.py
@@ -246,6 +246,7 @@ class AnnoClassRef:
         A_N.PARTIAL_Normalizer: 'Normalizer',
         A_N.VIT_IMAGE_CLASSIFICATION: 'ViTForImageClassification',
         A_N.CONVNEXT_IMAGE_CLASSIFICATION: 'ConvNextImageClassifier',
+        A_N.CONVNEXT_IMAGE_CLASSIFICATIONFITTED: 'ConvNextForImageClassification',
     }

     JSL_anno_HC_ref_2_py_class: Dict[JslAnnoId, JslAnnoPyClass] = {
diff --git a/nlu/universe/component_universes.py b/nlu/universe/component_universes.py
index f46f257b..2a325f13 100644
--- a/nlu/universe/component_universes.py
+++ b/nlu/universe/component_universes.py
@@ -3509,6 +3509,30 @@ class ComponentUniverse:
                                                  is_visual_annotator=True,
                                                  ),

+        A.CONVNEXT_IMAGE_CLASSIFICATIONFITTED: partial(NluComponent,
+                                                       name=A.CONVNEXT_IMAGE_CLASSIFICATIONFITTED,
+                                                       type=T.IMAGE_CLASSIFICATION,
+                                                       get_default_model=ConvNextImageClassifier.get_default_model,
+                                                       get_pretrained_model=ConvNextImageClassifier.get_pretrained_model,
+                                                       pdf_extractor_methods={'default': default_document_config,
+                                                                              'default_full': default_full_config},
+                                                       pdf_col_name_substitutor=substitute_recognized_text_cols,
+                                                       output_level=L.DOCUMENT,
+                                                       node=NLP_FEATURE_NODES.nodes[
+                                                           A.CONVNEXT_IMAGE_CLASSIFICATION],
+                                                       description='TODO',
+                                                       provider=ComponentBackends.open_source,
+
+                                                       license=Licenses.open_source,
+                                                       computation_context=ComputeContexts.spark,
+                                                       output_context=ComputeContexts.spark,
+                                                       jsl_anno_class_id=A.CONVNEXT_IMAGE_CLASSIFICATION,
+                                                       jsl_anno_py_class=ACR.JSL_anno2_py_class[
+                                                           A.CONVNEXT_IMAGE_CLASSIFICATIONFITTED],
+                                                       requires_image_format=True,
+                                                       is_visual_annotator=True,
+
+                                                       ),
         A.CONVNEXT_IMAGE_CLASSIFICATION: partial(NluComponent,
                                                  name=A.CONVNEXT_IMAGE_CLASSIFICATION,
                                                  type=T.IMAGE_CLASSIFICATION,
diff --git a/nlu/universe/feature_node_ids.py b/nlu/universe/feature_node_ids.py
index 9b8310f4..15ea6533 100644
--- a/nlu/universe/feature_node_ids.py
+++ b/nlu/universe/feature_node_ids.py
@@ -107,6 +107,7 @@ class NLP_NODE_IDS:
     T5_TRANSFORMER = JslAnnoId('t5_transformer')
     VIT_IMAGE_CLASSIFICATION = JslAnnoId("vit_image_classification")
     CONVNEXT_IMAGE_CLASSIFICATION = JslAnnoId("convnext_image_classification")
+    CONVNEXT_IMAGE_CLASSIFICATIONFITTED = JslAnnoId("convnext_image_classification_fitted")
     SWIN_IMAGE_CLASSIFICATION = JslAnnoId("swin_image_classification")
    BART_TRANSFORMER = JslAnnoId("bart_transformer")
     INSTRUCTOR_SENTENCE_EMBEDDINGS = JslAnnoId('instructor_sentence_embeddings')
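PATCH 4/5 registers a second ConvNext anno id whose Python class is Spark NLP's ConvNextForImageClassification; the pre-existing id maps to ConvNextImageClassifier, so a fitted stage written to disk under the Spark NLP class name apparently had no matching entry on reload. A reduced sketch of the reverse lookup this mapping presumably serves (hypothetical helper, not NLU's actual resolution code):

    # Anno ids -> annotator class names, as in AnnoClassRef above.
    anno_id_to_py_class = {
        'convnext_image_classification': 'ConvNextImageClassifier',
        'convnext_image_classification_fitted': 'ConvNextForImageClassification',
    }

    def resolve_anno_id(py_class_name: str) -> str:
        # Map a stored stage's class name back to a registered NLU component id.
        for anno_id, py_class in anno_id_to_py_class.items():
            if py_class == py_class_name:
                return anno_id
        raise ValueError(f'No NLU component registered for class {py_class_name}')

    print(resolve_anno_id('ConvNextForImageClassification'))
    # -> 'convnext_image_classification_fitted'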
From f15e452cf0e9c10766b0e312565bbc0b709b9e0f Mon Sep 17 00:00:00 2001
From: C-K-Loan
Date: Tue, 18 Jun 2024 00:42:09 +0200
Subject: [PATCH 5/5] fix Bug missing attributes saving/reloading pipe

---
 nlu/__init__.py | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/nlu/__init__.py b/nlu/__init__.py
index a905b751..bf4ee145 100644
--- a/nlu/__init__.py
+++ b/nlu/__init__.py
@@ -1,4 +1,4 @@
-__version__ = '5.3.2'
+__version__ = '5.3.3rc5'

 import nlu.utils.environment.env_utils as env_utils

@@ -351,8 +351,17 @@ def load_nlu_pipe_from_hdd(pipe_path, request) -> NLUPipeline:
             pipe.add(c, nlu_ref, pretrained_pipe_component=True)
     if is_nlu_uid(uid):
         data = json.loads(uid)
-        print(data)
         pipe.nlu_ref = data['0']['nlu_ref']
+        pipe.contains_ocr_components = data['0']['contains_ocr_components']
+        pipe.contains_audio_components = data['0']['contains_audio_components']
+        pipe.has_nlp_components = data['0']['has_nlp_components']
+        pipe.has_span_classifiers = data['0']['has_span_classifiers']
+        pipe.prefer_light = data['0']['prefer_light']
+        pipe.has_table_qa_models = data['0']['has_table_qa_models']
+        pipe.requires_image_format = data['0']['requires_image_format']
+        pipe.requires_binary_format = data['0']['requires_binary_format']
+        pipe.is_light_pipe_incompatible = data['0']['is_light_pipe_incompatible']
+
         for i, c in enumerate(pipe.components):
             c.nlu_ref = data[str(i + 1)]['nlu_ref']
             c.nlp_ref = data[str(i + 1)]['nlp_ref']
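PATCH 5/5 closes the loop: load_nlu_pipe_from_hdd now reads back every pipeline-level flag that _get_uid_payload from PATCH 2/5 serializes. An equivalent, loop-based sketch of the restore step (hypothetical helper; the hunk is truncated in this excerpt, so the per-component loaded_from_pretrained_pipe restore is assumed from the save side):

    import json

    PIPE_FLAGS = ['contains_ocr_components', 'contains_audio_components',
                  'has_nlp_components', 'has_span_classifiers', 'prefer_light',
                  'has_table_qa_models', 'requires_image_format',
                  'requires_binary_format', 'is_light_pipe_incompatible']

    def restore_pipe_attrs(pipe, uid: str):
        # Inverse of the UID serialization: JSON keys come back as strings.
        data = json.loads(uid)
        pipe.nlu_ref = data['0']['nlu_ref']
        for flag in PIPE_FLAGS:
            setattr(pipe, flag, data['0'][flag])
        for i, c in enumerate(pipe.components):
            c.nlu_ref = data[str(i + 1)]['nlu_ref']
            c.nlp_ref = data[str(i + 1)]['nlp_ref']
            c.loaded_from_pretrained_pipe = data[str(i + 1)]['loaded_from_pretrained_pipe']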