From 4005998b950b72febe56bbb810f164ff9ca50f9f Mon Sep 17 00:00:00 2001
From: zcy <congyao119@gmail.com>
Date: Mon, 15 Jan 2024 16:00:31 +0100
Subject: [PATCH 1/4] add tabular pipeline

---
 giskard_cicd/loaders/huggingface_loader.py |  6 +++++
 giskard_cicd/loaders/tabular_loader.py     | 27 ++++++++++++++++++++++
 2 files changed, 33 insertions(+)
 create mode 100644 giskard_cicd/loaders/tabular_loader.py

diff --git a/giskard_cicd/loaders/huggingface_loader.py b/giskard_cicd/loaders/huggingface_loader.py
index 1b2fcc2..4cecfb1 100644
--- a/giskard_cicd/loaders/huggingface_loader.py
+++ b/giskard_cicd/loaders/huggingface_loader.py
@@ -170,8 +170,12 @@ def load_dataset(
 
     def load_model(self, model_id):
         from transformers import pipeline
+        from .tabular_loader import TabularClassificationPipeline
 
         task = huggingface_hub.model_info(model_id).pipeline_tag
+        print(task)
+        if "tabular-classification" in task:
+            return TabularClassificationPipeline(task=task, model=model_id, device=self.device)
 
         return pipeline(task=task, model=model_id, device=self.device)
 
@@ -264,6 +268,8 @@ def _flatten_hf_dataset(self, hf_dataset, data_split=None):
     def _get_feature_mapping(self, hf_model, hf_dataset):
         if isinstance(hf_model, TextClassificationPipeline):
             task_features = {"text": "string", "label": "class_label"}
+        elif "tabular" in hf_model.pipeline_tag:
+            print(hf_model.model.config)
         else:
             msg = "Unsupported model type."
             raise NotImplementedError(msg)
diff --git a/giskard_cicd/loaders/tabular_loader.py b/giskard_cicd/loaders/tabular_loader.py
new file mode 100644
index 0000000..f49e9c2
--- /dev/null
+++ b/giskard_cicd/loaders/tabular_loader.py
@@ -0,0 +1,27 @@
+from transformers import Pipeline
+
+class TabularClassificationPipeline(Pipeline):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._model_type = "tabular-classification"
+        self._check_model_type(self._model_type)
+        self.pipeline_tag = "tabular-classification"
+
+    def _sanitize_parameters(self, **kwargs):
+        kwargs = super()._sanitize_parameters(**kwargs)
+        return kwargs
+    
+    def _check_model_type(self, model_type):  # raises ValueError unless model_type equals self._model_type
+        if model_type != self._model_type:  # NOTE(review): __init__ passes self._model_type itself, so this cannot fire there
+            raise ValueError(
+                f"Pipeline is not of type {self._model_type} but {model_type}"
+            )
+        
+    def _forward(self, *args, **kwargs):
+        raise NotImplementedError("Not implemented yet")
+    
+    def preporocess(self, *args, **kwargs):
+        return super().preprocess(*args, **kwargs)
+    
+    def postprocess(self, *args, **kwargs):
+        return super().postprocess(*args, **kwargs)
\ No newline at end of file

From 09ae625a6a90b91129041c6f806506f32fe58482 Mon Sep 17 00:00:00 2001
From: zcy <congyao119@gmail.com>
Date: Mon, 15 Jan 2024 17:17:55 +0100
Subject: [PATCH 2/4] handle errors for tabular

---
 giskard_cicd/loaders/huggingface_loader.py | 30 ++++++++++++----------
 giskard_cicd/loaders/tabular_loader.py     | 21 +++++++++++----
 2 files changed, 33 insertions(+), 18 deletions(-)

diff --git a/giskard_cicd/loaders/huggingface_loader.py b/giskard_cicd/loaders/huggingface_loader.py
index 4cecfb1..60c87de 100644
--- a/giskard_cicd/loaders/huggingface_loader.py
+++ b/giskard_cicd/loaders/huggingface_loader.py
@@ -74,7 +74,7 @@ def load_giskard_model_dataset(
 
         # Check that the dataset has the good feature names for the task.
         logger.debug("Retrieving feature mapping")
-        if manual_feature_mapping is None:
+        if manual_feature_mapping is None and isinstance(hf_model, TextClassificationPipeline):
             feature_mapping = self._get_feature_mapping(hf_model, hf_dataset)
             logger.warn(
                 f'Feature mapping is not provided, using extracted "{feature_mapping}"'
@@ -82,9 +82,12 @@ def load_giskard_model_dataset(
         else:
             feature_mapping = manual_feature_mapping
 
-        df = hf_dataset.to_pandas().rename(
-            columns={v: k for k, v in feature_mapping.items()}
-        )
+        if feature_mapping is not None:
+            df = hf_dataset.to_pandas().rename(
+                columns={v: k for k, v in feature_mapping.items()}
+            )
+        else:
+            df = hf_dataset.to_pandas()
 
         # remove the rows have multiple labels
         # this is a hacky way to do it
@@ -96,7 +99,7 @@ def load_giskard_model_dataset(
 
         # @TODO: currently for classification models only.
         logger.debug("Retrieving classification label mapping")
-        if classification_label_mapping is None:
+        if classification_label_mapping is None and isinstance(hf_model, TextClassificationPipeline):
             id2label = hf_model.model.config.id2label
             logger.warn(f'Label mapping is not provided, using "{id2label}" from model')
         else:
@@ -106,9 +109,10 @@ def load_giskard_model_dataset(
             # need to include all labels
             # rewrite this lambda function to include all labels
             df.label = df.label.apply(lambda x: id2label[x[0]])
-        else:
+        elif getattr(df, "label", None) is not None:
             # TODO: when the label for test is not provided, what do we do?
             df["label"] = df.label.apply(lambda x: id2label[x] if x >= 0 else "-1")
+
         # map the list of label ids to the list of labels
         # df["label"] = df.label.apply(lambda x: [id2label[i] for i in x])
         logger.debug("Wrapping dataset")
@@ -173,9 +177,8 @@ def load_model(self, model_id):
         from .tabular_loader import TabularClassificationPipeline
 
         task = huggingface_hub.model_info(model_id).pipeline_tag
-        print(task)
         if "tabular-classification" in task:
-            return TabularClassificationPipeline(task=task, model=model_id, device=self.device)
+            return TabularClassificationPipeline(task=task, model=model_id, model_id=model_id)
 
         return pipeline(task=task, model=model_id, device=self.device)
 
@@ -240,6 +243,8 @@ def _flatten_hf_dataset(self, hf_dataset, data_split=None):
         Flatten the dataset to a pandas dataframe
         """
         flat_dataset = pd.DataFrame()
+        if isinstance(hf_dataset, datasets.Dataset):
+            return hf_dataset
         if isinstance(hf_dataset, datasets.DatasetDict):
             keys = list(hf_dataset.keys())
             for k in keys:
@@ -249,16 +254,15 @@ def _flatten_hf_dataset(self, hf_dataset, data_split=None):
                     break
 
                 # Otherwise infer one data split
-                if k.startswith("train"):
-                    continue
-                elif k.startswith(data_split):
+                if k.startswith(data_split):
                     # TODO: only support one split for now
                     # Maybe we can merge all the datasets into one
                     flat_dataset = hf_dataset[k]
                     break
+                elif k.startswith("train"):
+                    continue
                 else:
                     flat_dataset = hf_dataset[k]
-
             # If there are only train datasets
             if isinstance(flat_dataset, pd.DataFrame) and flat_dataset.empty:
                 flat_dataset = hf_dataset[keys[0]]
@@ -269,7 +273,7 @@ def _get_feature_mapping(self, hf_model, hf_dataset):
         if isinstance(hf_model, TextClassificationPipeline):
             task_features = {"text": "string", "label": "class_label"}
         elif "tabular" in hf_model.pipeline_tag:
-            print(hf_model.model.config)
+            raise NotImplementedError("Tabular model features cannot be auto-mapped.")
         else:
             msg = "Unsupported model type."
             raise NotImplementedError(msg)
diff --git a/giskard_cicd/loaders/tabular_loader.py b/giskard_cicd/loaders/tabular_loader.py
index f49e9c2..7b83340 100644
--- a/giskard_cicd/loaders/tabular_loader.py
+++ b/giskard_cicd/loaders/tabular_loader.py
@@ -1,11 +1,21 @@
-from transformers import Pipeline
+from transformers import Pipeline, AutoModel
+import huggingface_hub
+import joblib
+import os
 
 class TabularClassificationPipeline(Pipeline):
     def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
         self._model_type = "tabular-classification"
         self._check_model_type(self._model_type)
         self.pipeline_tag = "tabular-classification"
+        # get model parameter from args
+        model_id = kwargs.pop("model", None)
+        self.model_dir = huggingface_hub.snapshot_download(model_id)
+        for f in os.listdir(self.model_dir):
+            if ".joblib" in f:
+                self.model = joblib.load(self.model_dir + "/" + f)
+            if ".pb" in f:
+                self.model = AutoModel.from_pretrained(self.model_dir)
 
     def _sanitize_parameters(self, **kwargs):
         kwargs = super()._sanitize_parameters(**kwargs)
@@ -18,10 +28,11 @@ def _check_model_type(self, model_type):
             )
         
     def _forward(self, *args, **kwargs):
-        raise NotImplementedError("Not implemented yet")
+        return super()._forward(*args, **kwargs)
     
-    def preporocess(self, *args, **kwargs):
+    def preprocess(self, *args, **kwargs):
         return super().preprocess(*args, **kwargs)
     
     def postprocess(self, *args, **kwargs):
-        return super().postprocess(*args, **kwargs)
\ No newline at end of file
+        return super().postprocess(*args, **kwargs)
+        
\ No newline at end of file

From 94e8153309036ac6d3f1da42434ebfb6fc3d6aa3 Mon Sep 17 00:00:00 2001
From: zcy <congyao119@gmail.com>
Date: Tue, 16 Jan 2024 19:54:05 +0100
Subject: [PATCH 3/4] add tabular regression and tabular base pipeline

---
 giskard_cicd/loaders/huggingface_loader.py    | 43 +++++++++++----
 ....py => tabular_classification_pipeline.py} | 17 ++----
 giskard_cicd/loaders/tabular_pipeline.py      | 55 +++++++++++++++++++
 .../loaders/tabular_regression_pipeline.py    | 29 ++++++++++
 4 files changed, 120 insertions(+), 24 deletions(-)
 rename giskard_cicd/loaders/{tabular_loader.py => tabular_classification_pipeline.py} (60%)
 create mode 100644 giskard_cicd/loaders/tabular_pipeline.py
 create mode 100644 giskard_cicd/loaders/tabular_regression_pipeline.py

diff --git a/giskard_cicd/loaders/huggingface_loader.py b/giskard_cicd/loaders/huggingface_loader.py
index 60c87de..3b867ba 100644
--- a/giskard_cicd/loaders/huggingface_loader.py
+++ b/giskard_cicd/loaders/huggingface_loader.py
@@ -12,6 +12,9 @@
 from giskard.models.base import BaseModel
 from giskard.models.huggingface import HuggingFaceModel
 from transformers.pipelines import TextClassificationPipeline
+from .tabular_classification_pipeline import TabularClassificationPipeline
+from .tabular_regression_pipeline import TabularRegressionPipeline
+from .tabular_pipeline import TabularPipeline
 import requests
 
 from .huggingface_inf_model import classification_model_from_inference_api
@@ -116,6 +119,7 @@ def load_giskard_model_dataset(
         # map the list of label ids to the list of labels
         # df["label"] = df.label.apply(lambda x: [id2label[i] for i in x])
         logger.debug("Wrapping dataset")
+
         gsk_dataset = gsk.Dataset(
             df,
             name=f"HF {dataset}[{dataset_config}]({dataset_split}) for {model} model",
@@ -123,17 +127,26 @@ def load_giskard_model_dataset(
             column_types={"text": "text"},
             validation=False,
         )
+        
 
         logger.debug("Wrapping model")
 
-        gsk_model = self._get_gsk_model(
-            hf_model,
-            [id2label[i] for i in range(len(id2label))],
-            features=feature_mapping,
-            inference_type=inference_type,
-            device=self.device,
-            hf_token=inference_api_token,
-        )
+        if id2label is None and isinstance(hf_model, TabularPipeline):
+            gsk_model = gsk.Model(
+                lambda data: hf_model.predict(data),
+                model_type=hf_model._model_type,
+                name=f"{hf_model.model_id} HF pipeline",
+                feature_names=hf_model.config["features"],
+            )
+        else:
+            gsk_model = self._get_gsk_model(
+                hf_model,
+                [id2label[i] for i in range(len(id2label))],
+                features=feature_mapping,
+                inference_type=inference_type,
+                device=self.device,
+                hf_token=inference_api_token,
+            )
 
         # Optimize batch size
         if self.device.startswith("cuda"):
@@ -174,11 +187,19 @@ def load_dataset(
 
     def load_model(self, model_id):
         from transformers import pipeline
-        from .tabular_loader import TabularClassificationPipeline
-
         task = huggingface_hub.model_info(model_id).pipeline_tag
+        tags = huggingface_hub.model_info(model_id).tags  # NOTE(review): second model_info() request for the same model_id
+        serialization = None
+        if not task and ("tabular" in tags and "classification" in tags):  # fallback when the repo has no pipeline_tag
+            task = ["tabular-classification"]  # NOTE(review): if task is still None after this, the `in task` tests below raise TypeError
+
+        if "skops" in tags:
+            serialization = "skops"  # forwarded to the tabular pipelines, which load ".pkl" files via skops.io.load
+
         if "tabular-classification" in task:
-            return TabularClassificationPipeline(task=task, model=model_id, model_id=model_id)
+            return TabularClassificationPipeline(task=task, model=model_id, model_id=model_id, serialization=serialization)
+        if "tabular-regression" in task:  # regression has no tag-based fallback; it needs pipeline_tag itself to match
+            return TabularRegressionPipeline(task=task, model=model_id, model_id=model_id, serialization=serialization)
 
         return pipeline(task=task, model=model_id, device=self.device)
 
diff --git a/giskard_cicd/loaders/tabular_loader.py b/giskard_cicd/loaders/tabular_classification_pipeline.py
similarity index 60%
rename from giskard_cicd/loaders/tabular_loader.py
rename to giskard_cicd/loaders/tabular_classification_pipeline.py
index 7b83340..e4f21aa 100644
--- a/giskard_cicd/loaders/tabular_loader.py
+++ b/giskard_cicd/loaders/tabular_classification_pipeline.py
@@ -1,21 +1,12 @@
-from transformers import Pipeline, AutoModel
-import huggingface_hub
-import joblib
-import os
+from .tabular_pipeline import TabularPipeline
 
-class TabularClassificationPipeline(Pipeline):
+class TabularClassificationPipeline(TabularPipeline):
     def __init__(self, *args, **kwargs):
-        self._model_type = "tabular-classification"
+        self._model_type = "classification"
         self._check_model_type(self._model_type)
         self.pipeline_tag = "tabular-classification"
         # get model parameter from args
-        model_id = kwargs.pop("model", None)
-        self.model_dir = huggingface_hub.snapshot_download(model_id)
-        for f in os.listdir(self.model_dir):
-            if ".joblib" in f:
-                self.model = joblib.load(self.model_dir + "/" + f)
-            if ".pb" in f:
-                self.model = AutoModel.from_pretrained(self.model_dir)
+        super().__init__(*args, **kwargs)
 
     def _sanitize_parameters(self, **kwargs):
         kwargs = super()._sanitize_parameters(**kwargs)
diff --git a/giskard_cicd/loaders/tabular_pipeline.py b/giskard_cicd/loaders/tabular_pipeline.py
new file mode 100644
index 0000000..5271e21
--- /dev/null
+++ b/giskard_cicd/loaders/tabular_pipeline.py
@@ -0,0 +1,55 @@
+from typing import Any, Dict
+from transformers import Pipeline, AutoModel, AutoConfig
+import huggingface_hub
+import joblib
+import os
+import json
+from skops.io import load
+from transformers.pipelines.base import GenericTensor
+from transformers.utils import ModelOutput
+
+class TabularPipeline(Pipeline):  # shared base of the tabular pipelines: downloads a Hub repo and loads its model files
+    def __init__(self, *args, **kwargs):  # NOTE(review): Pipeline.__init__ is never invoked from here
+        self._num_workers = 0
+        # "model" (the Hub repo id) and "serialization" are popped from kwargs; other kwargs are not used
+        self.model_id = kwargs.pop("model", None)
+        self.model_dir = huggingface_hub.snapshot_download(self.model_id)
+        serialization = kwargs.pop("serialization", None)
+        for f in os.listdir(self.model_dir):
+            if serialization == "skops" and ".pkl" in f:
+                print(f"Loading {self.model_dir}/{f}")
+                self.model = load(self.model_dir + "/" + f)
+            if ".joblib" in f: # joblib
+                self.model = joblib.load(self.model_dir + "/" + f)
+            if "config.json" in f:
+                with open(self.model_dir + "/" + f, encoding="utf-8") as config_fp:  # handle is closed on exit
+                    config_file = json.load(config_fp)
+                if "sklearn" in config_file.keys():  # use the nested "sklearn" section as the config when present
+                    self.config = config_file["sklearn"]
+                    if "columns" in self.config.keys():  # expose "columns" under the "features" key as well
+                        self.config["features"] = self.config["columns"]
+                else:
+                    self.config = config_file
+            if ".pb" in f: # keras
+                self.model = AutoModel.from_pretrained(self.model_dir)
+                if "config.json" in f:
+                    self.model.config = AutoConfig.from_pretrained(self.model_dir)
+            if "modelRun.json" in f:
+                raise ValueError(
+                    "MLConsole models are not suppoerted."
+                )
+
+    def predict(self, *args, **kwargs):  # forwards directly to the loaded model's own predict()
+        return self.model.predict(*args, **kwargs)
+
+    def _sanitize_parameters(self, **kwargs):  # pass-through to Pipeline
+        return super()._sanitize_parameters(**kwargs)
+
+    def _forward(self, input_tensors: Dict[str, GenericTensor], **forward_parameters: Dict) -> ModelOutput:
+        return super()._forward(input_tensors, **forward_parameters)
+
+    def preprocess(self, *args, **kwargs):  # pass-through to Pipeline
+        return super().preprocess(*args, **kwargs)
+
+    def postprocess(self, model_outputs: ModelOutput, **postprocess_parameters: Dict) -> Any:
+        return super().postprocess(model_outputs, **postprocess_parameters)
\ No newline at end of file
diff --git a/giskard_cicd/loaders/tabular_regression_pipeline.py b/giskard_cicd/loaders/tabular_regression_pipeline.py
new file mode 100644
index 0000000..5ab394d
--- /dev/null
+++ b/giskard_cicd/loaders/tabular_regression_pipeline.py
@@ -0,0 +1,29 @@
+from .tabular_pipeline import TabularPipeline
+
+class TabularRegressionPipeline(TabularPipeline):  # TabularPipeline variant tagged for tabular regression
+    def __init__(self, *args, **kwargs):
+        self._model_type = "regression"  # the loader passes this as model_type when it builds gsk.Model
+        self._check_model_type(self._model_type)  # NOTE(review): compares the value with itself, so it cannot raise
+        self.pipeline_tag = "tabular-regression"
+        # TabularPipeline.__init__ pops "model" and "serialization" from kwargs and loads the model files
+        super().__init__(*args, **kwargs)
+
+    def _sanitize_parameters(self, **kwargs):  # pass-through to the parent implementation
+        kwargs = super()._sanitize_parameters(**kwargs)
+        return kwargs
+    
+    def _check_model_type(self, model_type):  # raises ValueError unless model_type equals self._model_type
+        if model_type != self._model_type:
+            raise ValueError(
+                f"Pipeline is not of type {self._model_type} but {model_type}"
+            )
+        
+    def _forward(self, *args, **kwargs):  # pass-through to the parent implementation
+        return super()._forward(*args, **kwargs)
+    
+    def preprocess(self, *args, **kwargs):  # pass-through to the parent implementation
+        return super().preprocess(*args, **kwargs)
+    
+    def postprocess(self, *args, **kwargs):  # pass-through to the parent implementation
+        return super().postprocess(*args, **kwargs)
+        
\ No newline at end of file

From a60e9bbfe0c1ecb919089345b174f5ed4e67b01a Mon Sep 17 00:00:00 2001
From: zcy <congyao119@gmail.com>
Date: Wed, 17 Jan 2024 21:47:06 +0100
Subject: [PATCH 4/4] add classification labels and keras

---
 giskard_cicd/loaders/huggingface_loader.py | 1 +
 giskard_cicd/loaders/tabular_pipeline.py   | 6 ++++--
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/giskard_cicd/loaders/huggingface_loader.py b/giskard_cicd/loaders/huggingface_loader.py
index 3b867ba..76f4449 100644
--- a/giskard_cicd/loaders/huggingface_loader.py
+++ b/giskard_cicd/loaders/huggingface_loader.py
@@ -137,6 +137,7 @@ def load_giskard_model_dataset(
                 model_type=hf_model._model_type,
                 name=f"{hf_model.model_id} HF pipeline",
                 feature_names=hf_model.config["features"],
+                classification_labels=hf_model.config["target_mapping"].values(),
             )
         else:
             gsk_model = self._get_gsk_model(
diff --git a/giskard_cicd/loaders/tabular_pipeline.py b/giskard_cicd/loaders/tabular_pipeline.py
index 5271e21..180a040 100644
--- a/giskard_cicd/loaders/tabular_pipeline.py
+++ b/giskard_cicd/loaders/tabular_pipeline.py
@@ -1,5 +1,6 @@
 from typing import Any, Dict
 from transformers import Pipeline, AutoModel, AutoConfig
+import keras
 import huggingface_hub
 import joblib
 import os
@@ -17,7 +18,6 @@ def __init__(self, *args, **kwargs):
         serialization = kwargs.pop("serialization", None)
         for f in os.listdir(self.model_dir):
             if serialization == "skops" and ".pkl" in f:
-                print(f"Loading {self.model_dir}/{f}")
                 self.model = load(self.model_dir + "/" + f)
             if ".joblib" in f: # joblib
                 self.model = joblib.load(self.model_dir + "/" + f)
@@ -29,10 +29,12 @@ def __init__(self, *args, **kwargs):
                         self.config["features"] = self.config["columns"]
                 else:
                     self.config = config_file
-            if ".pb" in f: # keras
+            if ".pt" in f: # pytorch
                 self.model = AutoModel.from_pretrained(self.model_dir)
                 if "config.json" in f:
                     self.model.config = AutoConfig.from_pretrained(self.model_dir)
+            if "model.pb" in f: # keras
+                self.model = keras.models.load_model(self.model_dir)
             if "modelRun.json" in f:
                 raise ValueError(
                     "MLConsole models are not suppoerted."