From 4005998b950b72febe56bbb810f164ff9ca50f9f Mon Sep 17 00:00:00 2001 From: zcy Date: Mon, 15 Jan 2024 16:00:31 +0100 Subject: [PATCH 1/4] add tabular pipeline --- giskard_cicd/loaders/huggingface_loader.py | 6 +++++ giskard_cicd/loaders/tabular_loader.py | 27 ++++++++++++++++++++++ 2 files changed, 33 insertions(+) create mode 100644 giskard_cicd/loaders/tabular_loader.py diff --git a/giskard_cicd/loaders/huggingface_loader.py b/giskard_cicd/loaders/huggingface_loader.py index 1b2fcc2..4cecfb1 100644 --- a/giskard_cicd/loaders/huggingface_loader.py +++ b/giskard_cicd/loaders/huggingface_loader.py @@ -170,8 +170,12 @@ def load_dataset( def load_model(self, model_id): from transformers import pipeline + from .tabular_loader import TabularClassificationPipeline task = huggingface_hub.model_info(model_id).pipeline_tag + print(task) + if "tabular-classification" in task: + return TabularClassificationPipeline(task=task, model=model_id, device=self.device) return pipeline(task=task, model=model_id, device=self.device) @@ -264,6 +268,8 @@ def _flatten_hf_dataset(self, hf_dataset, data_split=None): def _get_feature_mapping(self, hf_model, hf_dataset): if isinstance(hf_model, TextClassificationPipeline): task_features = {"text": "string", "label": "class_label"} + elif "tabular" in hf_model.pipeline_tag: + print(hf_model.model.config) else: msg = "Unsupported model type." raise NotImplementedError(msg) diff --git a/giskard_cicd/loaders/tabular_loader.py b/giskard_cicd/loaders/tabular_loader.py new file mode 100644 index 0000000..f49e9c2 --- /dev/null +++ b/giskard_cicd/loaders/tabular_loader.py @@ -0,0 +1,27 @@ +from transformers import Pipeline + +class TabularClassificationPipeline(Pipeline): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self._model_type = "tabular-classification" + self._check_model_type(self._model_type) + self.pipeline_tag = "tabular-classification" + + def _sanitize_parameters(self, **kwargs): + kwargs = super()._sanitize_parameters(**kwargs) + return kwargs + + def _check_model_type(self, model_type): + if model_type != self._model_type: + raise ValueError( + f"Pipeline is not of type {self._model_type} but {model_type}" + ) + + def _forward(self, *args, **kwargs): + raise NotImplementedError("Not implemented yet") + + def preporocess(self, *args, **kwargs): + return super().preprocess(*args, **kwargs) + + def postprocess(self, *args, **kwargs): + return super().postprocess(*args, **kwargs) \ No newline at end of file From 09ae625a6a90b91129041c6f806506f32fe58482 Mon Sep 17 00:00:00 2001 From: zcy Date: Mon, 15 Jan 2024 17:17:55 +0100 Subject: [PATCH 2/4] handle errors for tabular --- giskard_cicd/loaders/huggingface_loader.py | 30 ++++++++++++---------- giskard_cicd/loaders/tabular_loader.py | 21 +++++++++++---- 2 files changed, 33 insertions(+), 18 deletions(-) diff --git a/giskard_cicd/loaders/huggingface_loader.py b/giskard_cicd/loaders/huggingface_loader.py index 4cecfb1..60c87de 100644 --- a/giskard_cicd/loaders/huggingface_loader.py +++ b/giskard_cicd/loaders/huggingface_loader.py @@ -74,7 +74,7 @@ def load_giskard_model_dataset( # Check that the dataset has the good feature names for the task. logger.debug("Retrieving feature mapping") - if manual_feature_mapping is None: + if manual_feature_mapping is None and isinstance(hf_model, TextClassificationPipeline): feature_mapping = self._get_feature_mapping(hf_model, hf_dataset) logger.warn( f'Feature mapping is not provided, using extracted "{feature_mapping}"' @@ -82,9 +82,12 @@ def load_giskard_model_dataset( else: feature_mapping = manual_feature_mapping - df = hf_dataset.to_pandas().rename( - columns={v: k for k, v in feature_mapping.items()} - ) + if feature_mapping is not None: + df = hf_dataset.to_pandas().rename( + columns={v: k for k, v in feature_mapping.items()} + ) + else: + df = hf_dataset.to_pandas() # remove the rows have multiple labels # this is a hacky way to do it @@ -96,7 +99,7 @@ def load_giskard_model_dataset( # @TODO: currently for classification models only. logger.debug("Retrieving classification label mapping") - if classification_label_mapping is None: + if classification_label_mapping is None and isinstance(hf_model, TextClassificationPipeline): id2label = hf_model.model.config.id2label logger.warn(f'Label mapping is not provided, using "{id2label}" from model') else: @@ -106,9 +109,10 @@ def load_giskard_model_dataset( # need to include all labels # rewrite this lambda function to include all labels df.label = df.label.apply(lambda x: id2label[x[0]]) - else: + elif getattr(df, "label", None) is not None: # TODO: when the label for test is not provided, what do we do? df["label"] = df.label.apply(lambda x: id2label[x] if x >= 0 else "-1") + # map the list of label ids to the list of labels # df["label"] = df.label.apply(lambda x: [id2label[i] for i in x]) logger.debug("Wrapping dataset") @@ -173,9 +177,8 @@ def load_model(self, model_id): from .tabular_loader import TabularClassificationPipeline task = huggingface_hub.model_info(model_id).pipeline_tag - print(task) if "tabular-classification" in task: - return TabularClassificationPipeline(task=task, model=model_id, device=self.device) + return TabularClassificationPipeline(task=task, model=model_id, model_id=model_id) return pipeline(task=task, model=model_id, device=self.device) @@ -240,6 +243,8 @@ def _flatten_hf_dataset(self, hf_dataset, data_split=None): Flatten the dataset to a pandas dataframe """ flat_dataset = pd.DataFrame() + if isinstance(hf_dataset, datasets.Dataset): + return hf_dataset if isinstance(hf_dataset, datasets.DatasetDict): keys = list(hf_dataset.keys()) for k in keys: @@ -249,16 +254,15 @@ def _flatten_hf_dataset(self, hf_dataset, data_split=None): break # Otherwise infer one data split - if k.startswith("train"): - continue - elif k.startswith(data_split): + if k.startswith(data_split): # TODO: only support one split for now # Maybe we can merge all the datasets into one flat_dataset = hf_dataset[k] break + elif k.startswith("train"): + continue else: flat_dataset = hf_dataset[k] - # If there are only train datasets if isinstance(flat_dataset, pd.DataFrame) and flat_dataset.empty: flat_dataset = hf_dataset[keys[0]] @@ -269,7 +273,7 @@ def _get_feature_mapping(self, hf_model, hf_dataset): if isinstance(hf_model, TextClassificationPipeline): task_features = {"text": "string", "label": "class_label"} elif "tabular" in hf_model.pipeline_tag: - print(hf_model.model.config) + raise NotImplementedError("Tabular model features cannot be auto-mapped.") else: msg = "Unsupported model type." raise NotImplementedError(msg) diff --git a/giskard_cicd/loaders/tabular_loader.py b/giskard_cicd/loaders/tabular_loader.py index f49e9c2..7b83340 100644 --- a/giskard_cicd/loaders/tabular_loader.py +++ b/giskard_cicd/loaders/tabular_loader.py @@ -1,11 +1,21 @@ -from transformers import Pipeline +from transformers import Pipeline, AutoModel +import huggingface_hub +import joblib +import os class TabularClassificationPipeline(Pipeline): def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) self._model_type = "tabular-classification" self._check_model_type(self._model_type) self.pipeline_tag = "tabular-classification" + # get model parameter from args + model_id = kwargs.pop("model", None) + self.model_dir = huggingface_hub.snapshot_download(model_id) + for f in os.listdir(self.model_dir): + if ".joblib" in f: + self.model = joblib.load(self.model_dir + "/" + f) + if ".pb" in f: + self.model = AutoModel.from_pretrained(self.model_dir) def _sanitize_parameters(self, **kwargs): kwargs = super()._sanitize_parameters(**kwargs) @@ -18,10 +28,11 @@ def _check_model_type(self, model_type): ) def _forward(self, *args, **kwargs): - raise NotImplementedError("Not implemented yet") + return super()._forward(*args, **kwargs) - def preporocess(self, *args, **kwargs): + def preprocess(self, *args, **kwargs): return super().preprocess(*args, **kwargs) def postprocess(self, *args, **kwargs): - return super().postprocess(*args, **kwargs) \ No newline at end of file + return super().postprocess(*args, **kwargs) + \ No newline at end of file From 94e8153309036ac6d3f1da42434ebfb6fc3d6aa3 Mon Sep 17 00:00:00 2001 From: zcy Date: Tue, 16 Jan 2024 19:54:05 +0100 Subject: [PATCH 3/4] add tabular regression and tabular base pipeline --- giskard_cicd/loaders/huggingface_loader.py | 43 +++++++++++---- ....py => tabular_classification_pipeline.py} | 17 ++---- giskard_cicd/loaders/tabular_pipeline.py | 55 +++++++++++++++++++ .../loaders/tabular_regression_pipeline.py | 29 ++++++++++ 4 files changed, 120 insertions(+), 24 deletions(-) rename giskard_cicd/loaders/{tabular_loader.py => tabular_classification_pipeline.py} (60%) create mode 100644 giskard_cicd/loaders/tabular_pipeline.py create mode 100644 giskard_cicd/loaders/tabular_regression_pipeline.py diff --git a/giskard_cicd/loaders/huggingface_loader.py b/giskard_cicd/loaders/huggingface_loader.py index 60c87de..3b867ba 100644 --- a/giskard_cicd/loaders/huggingface_loader.py +++ b/giskard_cicd/loaders/huggingface_loader.py @@ -12,6 +12,9 @@ from giskard.models.base import BaseModel from giskard.models.huggingface import HuggingFaceModel from transformers.pipelines import TextClassificationPipeline +from .tabular_classification_pipeline import TabularClassificationPipeline +from .tabular_regression_pipeline import TabularRegressionPipeline +from .tabular_pipeline import TabularPipeline import requests from .huggingface_inf_model import classification_model_from_inference_api @@ -116,6 +119,7 @@ def load_giskard_model_dataset( # map the list of label ids to the list of labels # df["label"] = df.label.apply(lambda x: [id2label[i] for i in x]) logger.debug("Wrapping dataset") + gsk_dataset = gsk.Dataset( df, name=f"HF {dataset}[{dataset_config}]({dataset_split}) for {model} model", @@ -123,17 +127,26 @@ def load_giskard_model_dataset( column_types={"text": "text"}, validation=False, ) + logger.debug("Wrapping model") - gsk_model = self._get_gsk_model( - hf_model, - [id2label[i] for i in range(len(id2label))], - features=feature_mapping, - inference_type=inference_type, - device=self.device, - hf_token=inference_api_token, - ) + if id2label is None and isinstance(hf_model, TabularPipeline): + gsk_model = gsk.Model( + lambda data: hf_model.predict(data), + model_type=hf_model._model_type, + name=f"{hf_model.model_id} HF pipeline", + feature_names=hf_model.config["features"], + ) + else: + gsk_model = self._get_gsk_model( + hf_model, + [id2label[i] for i in range(len(id2label))], + features=feature_mapping, + inference_type=inference_type, + device=self.device, + hf_token=inference_api_token, + ) # Optimize batch size if self.device.startswith("cuda"): @@ -174,11 +187,19 @@ def load_dataset( def load_model(self, model_id): from transformers import pipeline - from .tabular_loader import TabularClassificationPipeline - task = huggingface_hub.model_info(model_id).pipeline_tag + tags = huggingface_hub.model_info(model_id).tags + serialization = None + if not task and ("tabular" in tags and "classification" in tags): + task = ["tabular-classification"] + + if "skops" in tags: + serialization = "skops" + if "tabular-classification" in task: - return TabularClassificationPipeline(task=task, model=model_id, model_id=model_id) + return TabularClassificationPipeline(task=task, model=model_id, model_id=model_id, serialization=serialization) + if "tabular-regression" in task: + return TabularRegressionPipeline(task=task, model=model_id, model_id=model_id, serialization=serialization) return pipeline(task=task, model=model_id, device=self.device) diff --git a/giskard_cicd/loaders/tabular_loader.py b/giskard_cicd/loaders/tabular_classification_pipeline.py similarity index 60% rename from giskard_cicd/loaders/tabular_loader.py rename to giskard_cicd/loaders/tabular_classification_pipeline.py index 7b83340..e4f21aa 100644 --- a/giskard_cicd/loaders/tabular_loader.py +++ b/giskard_cicd/loaders/tabular_classification_pipeline.py @@ -1,21 +1,12 @@ -from transformers import Pipeline, AutoModel -import huggingface_hub -import joblib -import os +from .tabular_pipeline import TabularPipeline -class TabularClassificationPipeline(Pipeline): +class TabularClassificationPipeline(TabularPipeline): def __init__(self, *args, **kwargs): - self._model_type = "tabular-classification" + self._model_type = "classification" self._check_model_type(self._model_type) self.pipeline_tag = "tabular-classification" # get model parameter from args - model_id = kwargs.pop("model", None) - self.model_dir = huggingface_hub.snapshot_download(model_id) - for f in os.listdir(self.model_dir): - if ".joblib" in f: - self.model = joblib.load(self.model_dir + "/" + f) - if ".pb" in f: - self.model = AutoModel.from_pretrained(self.model_dir) + super().__init__(*args, **kwargs) def _sanitize_parameters(self, **kwargs): kwargs = super()._sanitize_parameters(**kwargs) diff --git a/giskard_cicd/loaders/tabular_pipeline.py b/giskard_cicd/loaders/tabular_pipeline.py new file mode 100644 index 0000000..5271e21 --- /dev/null +++ b/giskard_cicd/loaders/tabular_pipeline.py @@ -0,0 +1,55 @@ +from typing import Any, Dict +from transformers import Pipeline, AutoModel, AutoConfig +import huggingface_hub +import joblib +import os +import json +from skops.io import load +from transformers.pipelines.base import GenericTensor +from transformers.utils import ModelOutput + +class TabularPipeline(Pipeline): + def __init__(self, *args, **kwargs): + self._num_workers = 0 + # get model parameter from args + self.model_id = kwargs.pop("model", None) + self.model_dir = huggingface_hub.snapshot_download(self.model_id) + serialization = kwargs.pop("serialization", None) + for f in os.listdir(self.model_dir): + if serialization == "skops" and ".pkl" in f: + print(f"Loading {self.model_dir}/{f}") + self.model = load(self.model_dir + "/" + f) + if ".joblib" in f: # joblib + self.model = joblib.load(self.model_dir + "/" + f) + if "config.json" in f: + config_file = json.load(open(self.model_dir + "/" + f)) + if "sklearn" in config_file.keys(): + self.config = config_file["sklearn"] + if "columns" in self.config.keys(): + self.config["features"] = self.config["columns"] + else: + self.config = config_file + if ".pb" in f: # keras + self.model = AutoModel.from_pretrained(self.model_dir) + if "config.json" in f: + self.model.config = AutoConfig.from_pretrained(self.model_dir) + if "modelRun.json" in f: + raise ValueError( + "MLConsole models are not suppoerted." + ) + + def predict(self, *args, **kwargs): + return self.model.predict(*args, **kwargs) + + def _sanitize_parameters(self, **kwargs): + kwargs = super()._sanitize_parameters(**kwargs) + return kwargs + def _forward(self, input_tensors: Dict[str, GenericTensor], **forward_parameters: Dict) -> ModelOutput: + return super()._forward(input_tensors, **forward_parameters) + + def preprocess(self, *args, **kwargs): + return super().preprocess(*args, **kwargs) + + def postprocess(self, model_outputs: ModelOutput, **postprocess_parameters: Dict) -> Any: + return super().postprocess(model_outputs, **postprocess_parameters) + \ No newline at end of file diff --git a/giskard_cicd/loaders/tabular_regression_pipeline.py b/giskard_cicd/loaders/tabular_regression_pipeline.py new file mode 100644 index 0000000..5ab394d --- /dev/null +++ b/giskard_cicd/loaders/tabular_regression_pipeline.py @@ -0,0 +1,29 @@ +from .tabular_pipeline import TabularPipeline + +class TabularRegressionPipeline(TabularPipeline): + def __init__(self, *args, **kwargs): + self._model_type = "regression" + self._check_model_type(self._model_type) + self.pipeline_tag = "tabular-regression" + # get model parameter from args + super().__init__(*args, **kwargs) + + def _sanitize_parameters(self, **kwargs): + kwargs = super()._sanitize_parameters(**kwargs) + return kwargs + + def _check_model_type(self, model_type): + if model_type != self._model_type: + raise ValueError( + f"Pipeline is not of type {self._model_type} but {model_type}" + ) + + def _forward(self, *args, **kwargs): + return super()._forward(*args, **kwargs) + + def preprocess(self, *args, **kwargs): + return super().preprocess(*args, **kwargs) + + def postprocess(self, *args, **kwargs): + return super().postprocess(*args, **kwargs) + \ No newline at end of file From a60e9bbfe0c1ecb919089345b174f5ed4e67b01a Mon Sep 17 00:00:00 2001 From: zcy Date: Wed, 17 Jan 2024 21:47:06 +0100 Subject: [PATCH 4/4] add classification labels and keras --- giskard_cicd/loaders/huggingface_loader.py | 1 + giskard_cicd/loaders/tabular_pipeline.py | 6 ++++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/giskard_cicd/loaders/huggingface_loader.py b/giskard_cicd/loaders/huggingface_loader.py index 3b867ba..76f4449 100644 --- a/giskard_cicd/loaders/huggingface_loader.py +++ b/giskard_cicd/loaders/huggingface_loader.py @@ -137,6 +137,7 @@ def load_giskard_model_dataset( model_type=hf_model._model_type, name=f"{hf_model.model_id} HF pipeline", feature_names=hf_model.config["features"], + classification_labels=hf_model.config["target_mapping"].values(), ) else: gsk_model = self._get_gsk_model( diff --git a/giskard_cicd/loaders/tabular_pipeline.py b/giskard_cicd/loaders/tabular_pipeline.py index 5271e21..180a040 100644 --- a/giskard_cicd/loaders/tabular_pipeline.py +++ b/giskard_cicd/loaders/tabular_pipeline.py @@ -1,5 +1,6 @@ from typing import Any, Dict from transformers import Pipeline, AutoModel, AutoConfig +import keras import huggingface_hub import joblib import os @@ -17,7 +18,6 @@ def __init__(self, *args, **kwargs): serialization = kwargs.pop("serialization", None) for f in os.listdir(self.model_dir): if serialization == "skops" and ".pkl" in f: - print(f"Loading {self.model_dir}/{f}") self.model = load(self.model_dir + "/" + f) if ".joblib" in f: # joblib self.model = joblib.load(self.model_dir + "/" + f) @@ -29,10 +29,12 @@ def __init__(self, *args, **kwargs): self.config["features"] = self.config["columns"] else: self.config = config_file - if ".pb" in f: # keras + if ".pt" in f: # pytorch self.model = AutoModel.from_pretrained(self.model_dir) if "config.json" in f: self.model.config = AutoConfig.from_pretrained(self.model_dir) + if "model.pb" in f: # keras + self.model = keras.models.load_model(self.model_dir) if "modelRun.json" in f: raise ValueError( "MLConsole models are not suppoerted."