Fix/missing metadata fields on saved pipes #269

Merged: 5 commits, Jul 11, 2024
13 changes: 11 additions & 2 deletions nlu/__init__.py
@@ -1,4 +1,4 @@
-__version__ = '5.3.2'
+__version__ = '5.3.3rc5'
 
 
 import nlu.utils.environment.env_utils as env_utils
@@ -351,8 +351,17 @@ def load_nlu_pipe_from_hdd(pipe_path, request) -> NLUPipeline:
             pipe.add(c, nlu_ref, pretrained_pipe_component=True)
     if is_nlu_uid(uid):
         data = json.loads(uid)
+        print(data)
         pipe.nlu_ref = data['0']['nlu_ref']
+        pipe.contains_ocr_components = data['0']['contains_ocr_components']
+        pipe.contains_audio_components = data['0']['contains_audio_components']
+        pipe.has_nlp_components = data['0']['has_nlp_components']
+        pipe.has_span_classifiers = data['0']['has_span_classifiers']
+        pipe.prefer_light = data['0']['prefer_light']
+        pipe.has_table_qa_models = data['0']['has_table_qa_models']
+        pipe.requires_image_format = data['0']['requires_image_format']
+        pipe.requires_binary_format = data['0']['requires_binary_format']
+        pipe.is_light_pipe_incompatible = data['0']['is_light_pipe_incompatible']
 
         for i, c in enumerate(pipe.components):
             c.nlu_ref = data[str(i + 1)]['nlu_ref']
             c.nlp_ref = data[str(i + 1)]['nlp_ref']
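For reference (not part of the diff): the uid parsed above is the JSON payload that the new _get_uid_payload in nlu/pipe/pipeline.py (below) embeds into the saved Spark pipeline's UID. A sketch of its shape with illustrative values; json.dumps turns the integer keys 0, 1, ... into strings, which is why the loader indexes with data['0'] and data[str(i + 1)]:

    import json

    # Hypothetical payload for a one-component pipe; all values illustrative.
    uid = json.dumps({
        0: {'nlu_ref': 'sentiment',
            'contains_ocr_components': False,
            'contains_audio_components': False,
            'has_nlp_components': True,
            'has_span_classifiers': False,
            'prefer_light': False,
            'has_table_qa_models': False,
            'requires_image_format': False,
            'requires_binary_format': False,
            'is_light_pipe_incompatible': False},
        'is_nlu_pipe': True,
        1: {'nlu_ref': 'sentiment', 'nlp_ref': 'sentimentdl_glove_imdb',
            'loaded_from_pretrained_pipe': False},
    })
    data = json.loads(uid)
    assert data['0']['has_nlp_components']   # int keys came back as str keys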
3 changes: 1 addition & 2 deletions nlu/pipe/col_substitution/col_name_substitution_utils.py
@@ -130,8 +130,7 @@ def get_final_output_cols_of_component(c, df, anno_2_ex) -> List[str]:
 
     if c.name == NLP_NODE_IDS.FINISHER:
         result_cols = c.model.getOutputCols()
-        if c.model.getIncludeMetadata():
-            result_cols = result_cols + [f'{col}_metadata' for col in result_cols]
+        result_cols = [c for c in df.columns if any(c.startswith(s) for s in result_cols)]
         return result_cols
     result_cols = []
     if isinstance(configs, SparkOCRExtractorConfig):
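The prefix match above replaces the removed getIncludeMetadata() check (per the hotfix comment in pipeline.py below, the Finisher exposes no such getter) and now also picks up the <col>_metadata columns the extractor registers. A minimal illustration with assumed column names:

    # Assumed names, for illustration only.
    finisher_cols = ['sentiment']                 # c.model.getOutputCols()
    df_columns = ['origin_index', 'sentiment', 'sentiment_metadata']
    result_cols = [c for c in df_columns if any(c.startswith(s) for s in finisher_cols)]
    assert result_cols == ['sentiment', 'sentiment_metadata']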
14 changes: 13 additions & 1 deletion nlu/pipe/extractors/extractor_methods/base_extractor_methods.py
@@ -137,6 +137,18 @@ def extract_base_sparkocr_features(row: pd.Series, configs: SparkOCRExtractorConfig):
     return {}
 
 
+def extract_finisher_rows(row: pd.Series, configs: FinisherExtractorConfig):
+    d = {}
+    for r in row:
+        key = r['_1']
+        key = f'{configs.source_col_name}_{key}'
+        value = r['_2']
+        if key not in d:
+            d[key] = []
+        d[key].append(value)
+    return d
+
+
 def extract_base_sparknlp_features(row: pd.Series, configs: SparkNLPExtractorConfig) -> dict:
     """
     Extract base features common in all saprk NLP annotators
@@ -280,7 +292,7 @@ def extract_master(row: pd.Series, configs: SparkNLPExtractorConfig) -> pd.Series:
     if configs.is_meta_field:
         return pd.Series(
             {
-                configs.source_col_name: row
+                **extract_finisher_rows(row, configs)
             })
     else:
         return pd.Series(
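A sketch of what the new extract_finisher_rows produces, assuming Finisher metadata arrives as (_1, _2) key/value structs (values made up): each value is grouped under its metadata key, prefixed with the source column name:

    from types import SimpleNamespace
    from nlu.pipe.extractors.extractor_methods.base_extractor_methods import extract_finisher_rows

    # Stand-in for FinisherExtractorConfig; only source_col_name is read here.
    configs = SimpleNamespace(source_col_name='sentiment_metadata')
    row = [{'_1': 'sentence', '_2': '0'},
           {'_1': 'confidence', '_2': '0.9998'},
           {'_1': 'sentence', '_2': '1'}]
    print(extract_finisher_rows(row, configs))
    # {'sentiment_metadata_sentence': ['0', '1'],
    #  'sentiment_metadata_confidence': ['0.9998']}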
48 changes: 36 additions & 12 deletions nlu/pipe/pipeline.py
@@ -62,6 +62,7 @@ def __init__(self):
         self.requires_image_format = False
         self.requires_binary_format = False
         self.is_light_pipe_incompatible = False
+
     def add(self, component: NluComponent, nlu_reference=None, pretrained_pipe_component=False,
             name_to_add='', idx=None):
         '''
@@ -281,7 +282,7 @@ def unpack_and_apply_extractors(self, pdf: Union[pyspark.sql.DataFrame, pd.DataFrame]
 
         Can process Spark DF output from Vanilla pipes and Pandas Converts outputs of Lightpipeline
         """
-        if isinstance(pdf,pyspark.sql.dataframe.DataFrame):
+        if isinstance(pdf, pyspark.sql.dataframe.DataFrame):
             if 'modificationTime' in pdf.columns:
                 # drop because of
                 # 'TypeError: Casting to unit-less dtype 'datetime64' is not supported.
@@ -432,25 +433,34 @@ def drop_irrelevant_cols(self, cols, keep_origin_index=False):
         if keep_origin_index == False and 'origin_index' in cols: cols.remove('origin_index')
         return cols
 
-    def save(self, path, component='entire_pipeline', overwrite=True):
-        # serialize data
+    def _get_uid_payload(self):
         data = {}
-        data[0] = {'nlu_ref': self.nlu_ref}
+        data[0] = {
+            'nlu_ref': self.nlu_ref,
+            'contains_ocr_components': self.contains_ocr_components,
+            'contains_audio_components': self.contains_audio_components,
+            'has_nlp_components': self.has_nlp_components,
+            'has_span_classifiers': self.has_span_classifiers,
+            'prefer_light': self.prefer_light,
+            'has_table_qa_models': self.has_table_qa_models,
+            'requires_image_format': self.requires_image_format,
+            'requires_binary_format': self.requires_binary_format,
+            'is_light_pipe_incompatible': self.is_light_pipe_incompatible,
+        }
         data['is_nlu_pipe'] = True
         for i, c in enumerate(self.components):
             data[i + 1] = {'nlu_ref': c.nlu_ref, 'nlp_ref': c.nlp_ref,
                            'loaded_from_pretrained_pipe': c.loaded_from_pretrained_pipe}
 
-        data = json.dumps(data)
+        return json.dumps(data)
+
+    def save(self, path, component='entire_pipeline', overwrite=True):
+        # serialize data
+        data = self._get_uid_payload()
         if not self.is_fitted or not hasattr(self, 'vanilla_transformer_pipe'):
             self.fit()
             self.is_fitted = True
         # self.vanilla_transformer_pipe.extractParamMap()
         if hasattr(self, 'nlu_ref'):
             """ ATTRS TO SAVE FOR EACH COMPONENT / PIPELINE:
            - nlp ref/nlu ref
            - is loaded_form_pipe
            """
         self.vanilla_transformer_pipe._resetUid(data)
         if component == 'entire_pipeline':
             if overwrite:
@@ -496,6 +506,7 @@ def predict(self,
         from nlu.pipe.utils.predict_helper import __predict__
         return __predict__(self, data, output_level, positions, keep_stranger_features, metadata, multithread,
                            drop_irrelevant_cols, return_spark_df, get_embeddings)
+
     def predict_embeds(self,
                        data,
                        multithread=True,
@@ -517,6 +528,7 @@ def predict_embeds(self,
                           multithread=multithread,
                           drop_irrelevant_cols=True, return_spark_df=return_spark_df, get_embeddings=True,
                           embed_only=True)
+
     def print_info(self, minimal=True):
         '''
         Print out information about every component_to_resolve currently loaded in the component_list and their configurable parameters.
@@ -982,14 +994,26 @@ def __get_finisher_conf(finisher: NluComponent) -> Dict[str, FinisherExtractorConfig]:
     """
 
     confs = {}
     m = finisher.model
+    m.setIncludeMetadata(True)
     as_arr = m.getOutputAsArray()
+    # hotfix because finisher has no getIncludeMetadata()
+    # For now we always extract all meta
+    has_meta = True
     for c in m.getOutputCols():
         confs[c] = FinisherExtractorConfig(output_as_array=as_arr,
-                                           is_meta_field=True if c.endswith('_metadata') else False,
+                                           is_meta_field=False,
                                            annotation_split_symbol=m.getAnnotationSplitSymbol(),
                                            value_split_symbol=m.getValueSplitSymbol(),
                                            source_col_name=c,
                                            )
+
+        if has_meta:  # since metadata fields dont show up in getOutputCols we need to add them manually
+            meta_col = f'{c}_metadata'
+            finisher.spark_output_column_names.append(meta_col)
+            confs[meta_col] = FinisherExtractorConfig(output_as_array=as_arr,
+                                                      is_meta_field=True,
+                                                      annotation_split_symbol=m.getAnnotationSplitSymbol(),
+                                                      value_split_symbol=m.getValueSplitSymbol(),
+                                                      source_col_name=meta_col,
+                                                      )
     return confs
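Taken together, _get_uid_payload on save and the extended load_nlu_pipe_from_hdd on load mean the pipeline-level flags now survive a round trip to disk. A usage sketch, assuming a local path and the public nlu.load(path=...) entry point:

    import nlu

    pipe = nlu.load('sentiment')        # illustrative nlu_ref
    pipe.save('/tmp/sentiment_pipe')    # save() fits if needed, then stores the JSON payload via _resetUid

    restored = nlu.load(path='/tmp/sentiment_pipe')
    # Before this fix only nlu_ref was restored; the remaining flags
    # silently fell back to their __init__ defaults.
    assert restored.has_nlp_components == pipe.has_nlp_components
    assert restored.prefer_light == pipe.prefer_light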
1 change: 1 addition & 0 deletions nlu/universe/annotator_class_universe.py
@@ -246,6 +246,7 @@ class AnnoClassRef:
         A_N.PARTIAL_Normalizer: 'Normalizer',
         A_N.VIT_IMAGE_CLASSIFICATION: 'ViTForImageClassification',
         A_N.CONVNEXT_IMAGE_CLASSIFICATION: 'ConvNextImageClassifier',
+        A_N.CONVNEXT_IMAGE_CLASSIFICATIONFITTED: 'ConvNextForImageClassification',
 
     }
     JSL_anno_HC_ref_2_py_class: Dict[JslAnnoId, JslAnnoPyClass] = {
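For context, an assumption not stated in the PR: the separate FITTED id seems intended for pipelines read back from disk, where the stored stage carries the fitted class name ConvNextForImageClassification rather than ConvNextImageClassifier, so a class-name lookup can still resolve it to a component. A hypothetical sketch of such a lookup, inverting the mapping above (ACR and A_N as aliased in the repo):

    # Hypothetical: resolve a Python class name found in a stored stage back
    # to an annotator id by inverting the id -> class mapping shown above.
    py_class_2_anno_id = {v: k for k, v in ACR.JSL_anno2_py_class.items()}
    assert py_class_2_anno_id['ConvNextForImageClassification'] == A_N.CONVNEXT_IMAGE_CLASSIFICATIONFITTED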
24 changes: 24 additions & 0 deletions nlu/universe/component_universes.py
@@ -3509,6 +3509,30 @@ class ComponentUniverse:
                                              is_visual_annotator=True,
                                              ),
 
+        A.CONVNEXT_IMAGE_CLASSIFICATIONFITTED: partial(NluComponent,
+                                                       name=A.CONVNEXT_IMAGE_CLASSIFICATIONFITTED,
+                                                       type=T.IMAGE_CLASSIFICATION,
+                                                       get_default_model=ConvNextImageClassifier.get_default_model,
+                                                       get_pretrained_model=ConvNextImageClassifier.get_pretrained_model,
+                                                       pdf_extractor_methods={'default': default_document_config,
+                                                                              'default_full': default_full_config},
+                                                       pdf_col_name_substitutor=substitute_recognized_text_cols,
+                                                       output_level=L.DOCUMENT,
+                                                       node=NLP_FEATURE_NODES.nodes[
+                                                           A.CONVNEXT_IMAGE_CLASSIFICATION],
+                                                       description='TODO',
+                                                       provider=ComponentBackends.open_source,
+
+                                                       license=Licenses.open_source,
+                                                       computation_context=ComputeContexts.spark,
+                                                       output_context=ComputeContexts.spark,
+                                                       jsl_anno_class_id=A.CONVNEXT_IMAGE_CLASSIFICATION,
+                                                       jsl_anno_py_class=ACR.JSL_anno2_py_class[
+                                                           A.CONVNEXT_IMAGE_CLASSIFICATIONFITTED],
+                                                       requires_image_format=True,
+                                                       is_visual_annotator=True,
+
+                                                       ),
         A.CONVNEXT_IMAGE_CLASSIFICATION: partial(NluComponent,
                                                  name=A.CONVNEXT_IMAGE_CLASSIFICATION,
                                                  type=T.IMAGE_CLASSIFICATION,
1 change: 1 addition & 0 deletions nlu/universe/feature_node_ids.py
@@ -107,6 +107,7 @@ class NLP_NODE_IDS:
     T5_TRANSFORMER = JslAnnoId('t5_transformer')
     VIT_IMAGE_CLASSIFICATION = JslAnnoId("vit_image_classification")
     CONVNEXT_IMAGE_CLASSIFICATION = JslAnnoId("convnext_image_classification")
+    CONVNEXT_IMAGE_CLASSIFICATIONFITTED = JslAnnoId("convnext_image_classification_fitted")
     SWIN_IMAGE_CLASSIFICATION = JslAnnoId("swin_image_classification")
     BART_TRANSFORMER = JslAnnoId("bart_transformer")
     INSTRUCTOR_SENTENCE_EMBEDDINGS = JslAnnoId('instructor_sentence_embeddings')