Fix/missing metadata fields on saved pipes #269

Merged: 5 commits, Jul 11, 2024
13 changes: 11 additions & 2 deletions nlu/__init__.py
@@ -1,4 +1,4 @@
-__version__ = '5.3.2'
+__version__ = '5.3.3rc5'
 
 
 import nlu.utils.environment.env_utils as env_utils
@@ -351,8 +351,17 @@ def load_nlu_pipe_from_hdd(pipe_path, request) -> NLUPipeline:
             pipe.add(c, nlu_ref, pretrained_pipe_component=True)
     if is_nlu_uid(uid):
         data = json.loads(uid)
+        print(data)
         pipe.nlu_ref = data['0']['nlu_ref']
+        pipe.contains_ocr_components = data['0']['contains_ocr_components']
+        pipe.contains_audio_components = data['0']['contains_audio_components']
+        pipe.has_nlp_components = data['0']['has_nlp_components']
+        pipe.has_span_classifiers = data['0']['has_span_classifiers']
+        pipe.prefer_light = data['0']['prefer_light']
+        pipe.has_table_qa_models = data['0']['has_table_qa_models']
+        pipe.requires_image_format = data['0']['requires_image_format']
+        pipe.requires_binary_format = data['0']['requires_binary_format']
+        pipe.is_light_pipe_incompatible = data['0']['is_light_pipe_incompatible']
 
         for i, c in enumerate(pipe.components):
             c.nlu_ref = data[str(i + 1)]['nlu_ref']
             c.nlp_ref = data[str(i + 1)]['nlp_ref']
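For reference (not part of the diff): the uid parsed above is the JSON payload that the new _get_uid_payload in nlu/pipe/pipeline.py (below) embeds into the saved Spark pipeline's UID. A sketch of its shape with illustrative values; json.dumps turns the integer keys 0, 1, ... into strings, which is why the loader indexes with data['0'] and data[str(i + 1)]:

    import json

    # Hypothetical payload for a one-component pipe; all values illustrative.
    uid = json.dumps({
        0: {'nlu_ref': 'sentiment',
            'contains_ocr_components': False,
            'contains_audio_components': False,
            'has_nlp_components': True,
            'has_span_classifiers': False,
            'prefer_light': False,
            'has_table_qa_models': False,
            'requires_image_format': False,
            'requires_binary_format': False,
            'is_light_pipe_incompatible': False},
        'is_nlu_pipe': True,
        1: {'nlu_ref': 'sentiment', 'nlp_ref': 'sentimentdl_glove_imdb',
            'loaded_from_pretrained_pipe': False},
    })
    data = json.loads(uid)
    assert data['0']['has_nlp_components']   # int keys came back as str keys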
3 changes: 1 addition & 2 deletions nlu/pipe/col_substitution/col_name_substitution_utils.py
@@ -130,8 +130,7 @@ def get_final_output_cols_of_component(c, df, anno_2_ex) -> List[str]:
 
     if c.name == NLP_NODE_IDS.FINISHER:
         result_cols = c.model.getOutputCols()
-        if c.model.getIncludeMetadata():
-            result_cols = result_cols + [f'{col}_metadata' for col in result_cols]
+        result_cols = [c for c in df.columns if any(c.startswith(s) for s in result_cols)]
         return result_cols
     result_cols = []
     if isinstance(configs, SparkOCRExtractorConfig):
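The prefix match above replaces the removed getIncludeMetadata() check (per the hotfix comment in pipeline.py below, the Finisher exposes no such getter) and now also picks up the <col>_metadata columns the extractor registers. A minimal illustration with assumed column names:

    # Assumed names, for illustration only.
    finisher_cols = ['sentiment']                 # c.model.getOutputCols()
    df_columns = ['origin_index', 'sentiment', 'sentiment_metadata']
    result_cols = [c for c in df_columns if any(c.startswith(s) for s in finisher_cols)]
    assert result_cols == ['sentiment', 'sentiment_metadata']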
14 changes: 13 additions & 1 deletion nlu/pipe/extractors/extractor_methods/base_extractor_methods.py
@@ -137,6 +137,18 @@ def extract_base_sparkocr_features(row: pd.Series, configs: SparkOCRExtractorConfig):
     return {}
 
 
+def extract_finisher_rows(row: pd.Series, configs: FinisherExtractorConfig):
+    d = {}
+    for r in row:
+        key = r['_1']
+        key = f'{configs.source_col_name}_{key}'
+        value = r['_2']
+        if key not in d:
+            d[key] = []
+        d[key].append(value)
+    return d
+
+
 def extract_base_sparknlp_features(row: pd.Series, configs: SparkNLPExtractorConfig) -> dict:
     """
     Extract base features common in all saprk NLP annotators
@@ -280,7 +292,7 @@ def extract_master(row: pd.Series, configs: SparkNLPExtractorConfig) -> pd.Series:
     if configs.is_meta_field:
         return pd.Series(
             {
-                configs.source_col_name: row
+                **extract_finisher_rows(row, configs)
             })
     else:
         return pd.Series(
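A sketch of what the new extract_finisher_rows produces, assuming Finisher metadata arrives as (_1, _2) key/value structs (values made up): each value is grouped under its metadata key, prefixed with the source column name:

    from types import SimpleNamespace
    from nlu.pipe.extractors.extractor_methods.base_extractor_methods import extract_finisher_rows

    # Stand-in for FinisherExtractorConfig; only source_col_name is read here.
    configs = SimpleNamespace(source_col_name='sentiment_metadata')
    row = [{'_1': 'sentence', '_2': '0'},
           {'_1': 'confidence', '_2': '0.9998'},
           {'_1': 'sentence', '_2': '1'}]
    print(extract_finisher_rows(row, configs))
    # {'sentiment_metadata_sentence': ['0', '1'],
    #  'sentiment_metadata_confidence': ['0.9998']}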
48 changes: 36 additions & 12 deletions nlu/pipe/pipeline.py
@@ -62,6 +62,7 @@ def __init__(self):
         self.requires_image_format = False
         self.requires_binary_format = False
         self.is_light_pipe_incompatible = False
+
     def add(self, component: NluComponent, nlu_reference=None, pretrained_pipe_component=False,
             name_to_add='', idx=None):
         '''
@@ -281,7 +282,7 @@ def unpack_and_apply_extractors(self, pdf: Union[pyspark.sql.DataFrame, pd.DataFrame]
 
         Can process Spark DF output from Vanilla pipes and Pandas Converts outputs of Lightpipeline
         """
-        if isinstance(pdf,pyspark.sql.dataframe.DataFrame):
+        if isinstance(pdf, pyspark.sql.dataframe.DataFrame):
             if 'modificationTime' in pdf.columns:
                 # drop because of
                 # 'TypeError: Casting to unit-less dtype 'datetime64' is not supported.
@@ -432,25 +433,34 @@ def drop_irrelevant_cols(self, cols, keep_origin_index=False):
         if keep_origin_index == False and 'origin_index' in cols: cols.remove('origin_index')
         return cols
 
-    def save(self, path, component='entire_pipeline', overwrite=True):
-        # serialize data
+    def _get_uid_payload(self):
         data = {}
-        data[0] = {'nlu_ref': self.nlu_ref}
+        data[0] = {
+            'nlu_ref': self.nlu_ref,
+            'contains_ocr_components': self.contains_ocr_components,
+            'contains_audio_components': self.contains_audio_components,
+            'has_nlp_components': self.has_nlp_components,
+            'has_span_classifiers': self.has_span_classifiers,
+            'prefer_light': self.prefer_light,
+            'has_table_qa_models': self.has_table_qa_models,
+            'requires_image_format': self.requires_image_format,
+            'requires_binary_format': self.requires_binary_format,
+            'is_light_pipe_incompatible': self.is_light_pipe_incompatible,
+        }
         data['is_nlu_pipe'] = True
         for i, c in enumerate(self.components):
             data[i + 1] = {'nlu_ref': c.nlu_ref, 'nlp_ref': c.nlp_ref,
                            'loaded_from_pretrained_pipe': c.loaded_from_pretrained_pipe}
 
-        data = json.dumps(data)
+        return json.dumps(data)
+
+    def save(self, path, component='entire_pipeline', overwrite=True):
+        # serialize data
+        data = self._get_uid_payload()
         if not self.is_fitted or not hasattr(self, 'vanilla_transformer_pipe'):
             self.fit()
             self.is_fitted = True
         # self.vanilla_transformer_pipe.extractParamMap()
         if hasattr(self, 'nlu_ref'):
             """ ATTRS TO SAVE FOR EACH COMPONENT / PIPELINE:
            - nlp ref/nlu ref
            - is loaded_form_pipe
            """
         self.vanilla_transformer_pipe._resetUid(data)
         if component == 'entire_pipeline':
             if overwrite:
@@ -496,6 +506,7 @@ def predict(self,
         from nlu.pipe.utils.predict_helper import __predict__
         return __predict__(self, data, output_level, positions, keep_stranger_features, metadata, multithread,
                            drop_irrelevant_cols, return_spark_df, get_embeddings)
+
     def predict_embeds(self,
                        data,
                        multithread=True,
@@ -517,6 +528,7 @@ def predict_embeds(self,
                           multithread=multithread,
                           drop_irrelevant_cols=True, return_spark_df=return_spark_df, get_embeddings=True,
                           embed_only=True)
+
     def print_info(self, minimal=True):
         '''
         Print out information about every component_to_resolve currently loaded in the component_list and their configurable parameters.
@@ -982,14 +994,26 @@ def __get_finisher_conf(finisher: NluComponent) -> Dict[str, FinisherExtractorConfig]:
     """
 
     confs = {}
     m = finisher.model
+    m.setIncludeMetadata(True)
     as_arr = m.getOutputAsArray()
+    # hotfix because finisher has no getIncludeMetadata()
+    # For now we always extract all meta
+    has_meta = True
     for c in m.getOutputCols():
         confs[c] = FinisherExtractorConfig(output_as_array=as_arr,
-                                           is_meta_field=True if c.endswith('_metadata') else False,
+                                           is_meta_field=False,
                                            annotation_split_symbol=m.getAnnotationSplitSymbol(),
                                            value_split_symbol=m.getValueSplitSymbol(),
                                            source_col_name=c,
                                            )
+
+        if has_meta:  # since metadata fields dont show up in getOutputCols we need to add them manually
+            meta_col = f'{c}_metadata'
+            finisher.spark_output_column_names.append(meta_col)
+            confs[meta_col] = FinisherExtractorConfig(output_as_array=as_arr,
+                                                      is_meta_field=True,
+                                                      annotation_split_symbol=m.getAnnotationSplitSymbol(),
+                                                      value_split_symbol=m.getValueSplitSymbol(),
+                                                      source_col_name=meta_col,
+                                                      )
     return confs
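Taken together, _get_uid_payload on save and the extended load_nlu_pipe_from_hdd on load mean the pipeline-level flags now survive a round trip to disk. A usage sketch, assuming a local path and the public nlu.load(path=...) entry point:

    import nlu

    pipe = nlu.load('sentiment')        # illustrative nlu_ref
    pipe.save('/tmp/sentiment_pipe')    # save() fits if needed, then stores the JSON payload via _resetUid

    restored = nlu.load(path='/tmp/sentiment_pipe')
    # Before this fix only nlu_ref was restored; the remaining flags
    # silently fell back to their __init__ defaults.
    assert restored.has_nlp_components == pipe.has_nlp_components
    assert restored.prefer_light == pipe.prefer_light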
1 change: 1 addition & 0 deletions nlu/universe/annotator_class_universe.py
@@ -246,6 +246,7 @@ class AnnoClassRef:
         A_N.PARTIAL_Normalizer: 'Normalizer',
         A_N.VIT_IMAGE_CLASSIFICATION: 'ViTForImageClassification',
         A_N.CONVNEXT_IMAGE_CLASSIFICATION: 'ConvNextImageClassifier',
+        A_N.CONVNEXT_IMAGE_CLASSIFICATIONFITTED: 'ConvNextForImageClassification',
 
     }
     JSL_anno_HC_ref_2_py_class: Dict[JslAnnoId, JslAnnoPyClass] = {
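For context, an assumption not stated in the PR: the separate FITTED id seems intended for pipelines read back from disk, where the stored stage carries the fitted class name ConvNextForImageClassification rather than ConvNextImageClassifier, so a class-name lookup can still resolve it to a component. A hypothetical sketch of such a lookup, inverting the mapping above (ACR and A_N as aliased in the repo):

    # Hypothetical: resolve a Python class name found in a stored stage back
    # to an annotator id by inverting the id -> class mapping shown above.
    py_class_2_anno_id = {v: k for k, v in ACR.JSL_anno2_py_class.items()}
    assert py_class_2_anno_id['ConvNextForImageClassification'] == A_N.CONVNEXT_IMAGE_CLASSIFICATIONFITTED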
24 changes: 24 additions & 0 deletions nlu/universe/component_universes.py
@@ -3509,6 +3509,30 @@ class ComponentUniverse:
                                              is_visual_annotator=True,
                                              ),
 
+        A.CONVNEXT_IMAGE_CLASSIFICATIONFITTED: partial(NluComponent,
+                                                       name=A.CONVNEXT_IMAGE_CLASSIFICATIONFITTED,
+                                                       type=T.IMAGE_CLASSIFICATION,
+                                                       get_default_model=ConvNextImageClassifier.get_default_model,
+                                                       get_pretrained_model=ConvNextImageClassifier.get_pretrained_model,
+                                                       pdf_extractor_methods={'default': default_document_config,
+                                                                              'default_full': default_full_config},
+                                                       pdf_col_name_substitutor=substitute_recognized_text_cols,
+                                                       output_level=L.DOCUMENT,
+                                                       node=NLP_FEATURE_NODES.nodes[
+                                                           A.CONVNEXT_IMAGE_CLASSIFICATION],
+                                                       description='TODO',
+                                                       provider=ComponentBackends.open_source,
+
+                                                       license=Licenses.open_source,
+                                                       computation_context=ComputeContexts.spark,
+                                                       output_context=ComputeContexts.spark,
+                                                       jsl_anno_class_id=A.CONVNEXT_IMAGE_CLASSIFICATION,
+                                                       jsl_anno_py_class=ACR.JSL_anno2_py_class[
+                                                           A.CONVNEXT_IMAGE_CLASSIFICATIONFITTED],
+                                                       requires_image_format=True,
+                                                       is_visual_annotator=True,
+
+                                                       ),
         A.CONVNEXT_IMAGE_CLASSIFICATION: partial(NluComponent,
                                                  name=A.CONVNEXT_IMAGE_CLASSIFICATION,
                                                  type=T.IMAGE_CLASSIFICATION,
1 change: 1 addition & 0 deletions nlu/universe/feature_node_ids.py
@@ -107,6 +107,7 @@ class NLP_NODE_IDS:
     T5_TRANSFORMER = JslAnnoId('t5_transformer')
     VIT_IMAGE_CLASSIFICATION = JslAnnoId("vit_image_classification")
     CONVNEXT_IMAGE_CLASSIFICATION = JslAnnoId("convnext_image_classification")
+    CONVNEXT_IMAGE_CLASSIFICATIONFITTED = JslAnnoId("convnext_image_classification_fitted")
     SWIN_IMAGE_CLASSIFICATION = JslAnnoId("swin_image_classification")
     BART_TRANSFORMER = JslAnnoId("bart_transformer")
     INSTRUCTOR_SENTENCE_EMBEDDINGS = JslAnnoId('instructor_sentence_embeddings')