Add peft support to default deepspeed and huggingface handlers
siddvenk committed Jun 30, 2023
1 parent cfb23d8 commit 24b855e
Showing 6 changed files with 172 additions and 48 deletions.
18 changes: 18 additions & 0 deletions .github/workflows/llm_integration.yml
@@ -170,6 +170,15 @@ jobs:
serve
python3 llm/client.py huggingface gpt-j-6b
docker rm -f $(docker ps -aq)
- name: Test gpt4all-lora
working-directory: tests/integration
run: |
rm -rf models
python3 llm/prepare.py huggingface gpt4all-lora
./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models deepspeed \
serve
python3 llm/client.py huggingface gpt4all-lora
docker rm -f $(docker ps -aq)
- name: Test streaming bigscience/bloom-3b
working-directory: tests/integration
run: |
@@ -292,6 +301,15 @@ jobs:
serve
python3 llm/client.py deepspeed opt-13b
docker rm -f $(docker ps -aq)
- name: Test gpt4all-lora
working-directory: tests/integration
run: |
rm -rf models
python3 llm/prepare.py deepspeed gpt4all-lora
./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models deepspeed \
serve
python3 llm/client.py deepspeed gpt4all-lora
docker rm -f $(docker ps -aq)
- name: Test streaming gpt-neo-1.3b
working-directory: tests/integration
run: |
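
The two new CI steps above exercise the change end to end: prepare a model directory that points at a LoRA adapter, launch the serving container, and query it with llm/client.py. As a rough illustration only, the Python sketch below sends one generation request to a locally running container; the endpoint path and payload shape are assumptions based on the usual djl-serving text-generation contract rather than anything taken from llm/client.py.

import requests

# Hypothetical smoke test against a locally running djl-serving container.
# Assumes the default port 8080 and the standard {"inputs": ..., "parameters": ...}
# payload accepted by the huggingface/deepspeed handlers.
url = "http://127.0.0.1:8080/invocations"
payload = {
    "inputs": "The new data science platform lets teams",
    "parameters": {"max_new_tokens": 32},
}
response = requests.post(url, json=payload)
response.raise_for_status()
print(response.json())
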
57 changes: 43 additions & 14 deletions engines/python/setup/djl_python/deepspeed.py
@@ -25,6 +25,7 @@
from djl_python.outputs import Output
from djl_python.streaming_utils import StreamingUtils
from typing import Optional
from peft import PeftConfig, PeftModel

OPTIMIZED_MODEL_TYPES = {
"roberta",
@@ -111,11 +112,13 @@ def __init__(self):
self.enable_streaming = None
self.trust_remote_code = os.environ.get("HF_TRUST_REMOTE_CODE",
"FALSE").lower() == 'true'
self.peft_config = None
self.model = None
self.tokenizer = None

def initialize(self, properties: dict):
self._parse_properties(properties)
self._read_model_config()
self._validate_model_type_and_task()
self.create_model_pipeline()
self.logger.info(
@@ -181,19 +184,6 @@ def _get_ds_config(self, properties: dict):
return ds_config

def _validate_model_type_and_task(self):
if os.path.exists(self.model_id_or_path):
config_file = os.path.join(self.model_id_or_path, "config.json")
if not os.path.exists(config_file):
raise ValueError(
f"{self.model_id_or_path} does not contain a config.json. "
f"This is required for loading models from local storage")
self.model_config = AutoConfig.from_pretrained(
config_file, trust_remote_code=self.trust_remote_code)
else:
self.model_config = AutoConfig.from_pretrained(
self.model_id_or_path,
trust_remote_code=self.trust_remote_code)

if self.model_config.model_type not in OPTIMIZED_MODEL_TYPES:
self.logger.warning(
f"DeepSpeed does not currently support optimized CUDA kernels for the model type "
@@ -210,6 +200,25 @@ def _validate_model_type_and_task(self):
raise ValueError(
f"task: {self.task} is not currently supported by DeepSpeed")

def _read_model_config(self):
try:
self.model_config = AutoConfig.from_pretrained(
self.model_id_or_path,
trust_remote_code=self.trust_remote_code)
except OSError:
self.logger.warning(
f"config.json not found for {self.model_id_or_path}. Attempting to load with peft"
)
self.peft_config = PeftConfig.from_pretrained(
self.model_id_or_path)
self.model_config = AutoConfig.from_pretrained(
self.peft_config.base_model_name_or_path)
except Exception as e:
self.logger.error(
f"{self.model_id_or_path} does not contain a config.json or adapter_config.json for lora models. "
f"This is required for loading huggingface models")
raise e

def infer_task_from_model_architecture(self, config: PretrainedConfig):
architecture = config.architectures[0]
for arch_option in ARCHITECTURES_TO_TASK:
@@ -230,6 +239,21 @@ def create_model_pipeline(self):
with deepspeed.OnDevice(dtype=dtype, device="meta"):
model = TASK_TO_MODEL[self.task].from_config(
self.model_config, **kwargs)
elif self.peft_config is not None:
self.logger.info(
f"Peft Model detected. Instantiating base model {self.peft_config.base_model_name_or_path}"
)
base_model = TASK_TO_MODEL[self.task].from_pretrained(
self.peft_config.base_model_name_or_path,
low_cpu_mem_usage=self.low_cpu_mem_usage,
trust_remote_code=self.trust_remote_code,
**kwargs)
lora_model = PeftModel.from_pretrained(base_model,
self.model_id_or_path)
model = lora_model.merge_and_unload()
self.logger.info(
f"Peft Model merged into base model for deepspeed compatibility"
)
else:
model = TASK_TO_MODEL[self.task].from_pretrained(
self.model_id_or_path,
@@ -243,7 +267,12 @@
if self.model_config.model_type in OPTIMIZED_MODEL_TYPES:
self.ds_config["replace_with_kernel_inject"] = True
self.model = deepspeed.init_inference(model, config=self.ds_config)
self.tokenizer = AutoTokenizer.from_pretrained(self.model_id_or_path)
if self.peft_config:
self.tokenizer = AutoTokenizer.from_pretrained(
self.peft_config.base_model_name_or_path)
else:
self.tokenizer = AutoTokenizer.from_pretrained(
self.model_id_or_path)
if self.enable_streaming:
return
# Optimization for text-generation batch processing
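
The heart of the DeepSpeed handler change is the new _read_model_config fallback plus the merge step in create_model_pipeline: when the model path holds only a PEFT adapter (adapter_config.json but no config.json), the handler resolves the base model from the adapter config, attaches the adapter, and merges the LoRA weights back into the base model so that deepspeed.init_inference sees a plain Hugging Face model. A condensed, standalone sketch of that pattern follows; the adapter id is a placeholder, and the dtype, task, and device handling of the real handler are omitted.

from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
from peft import PeftConfig, PeftModel

adapter_path = "my-org/my-lora-adapter"  # placeholder adapter id or local directory

try:
    # Full model: config.json is present, load everything directly.
    config = AutoConfig.from_pretrained(adapter_path)
    model = AutoModelForCausalLM.from_pretrained(adapter_path)
    tokenizer = AutoTokenizer.from_pretrained(adapter_path)
except OSError:
    # PEFT adapter: resolve the base model from adapter_config.json,
    # attach the adapter, then fold the LoRA weights into the base weights.
    peft_config = PeftConfig.from_pretrained(adapter_path)
    base = AutoModelForCausalLM.from_pretrained(peft_config.base_model_name_or_path)
    model = PeftModel.from_pretrained(base, adapter_path).merge_and_unload()
    tokenizer = AutoTokenizer.from_pretrained(peft_config.base_model_name_or_path)

# After merge_and_unload() no PEFT wrappers remain, so `model` can be handed to
# deepspeed.init_inference(model, config=...) like any other transformers model.
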
118 changes: 85 additions & 33 deletions engines/python/setup/djl_python/huggingface.py
@@ -15,8 +15,13 @@
import os

import torch
from transformers import pipeline, Conversation, AutoModelForCausalLM, AutoModelForSeq2SeqLM, AutoTokenizer, AutoConfig
from transformers import (pipeline, Conversation, AutoModelForCausalLM,
AutoModelForSeq2SeqLM, AutoTokenizer, AutoConfig,
AutoModelForSequenceClassification,
AutoModelForTokenClassification,
AutoModelForQuestionAnswering)
from transformers.tokenization_utils_base import PreTrainedTokenizerBase
from peft import PeftConfig, PeftModel

from djl_python.encode_decode import encode, decode
from djl_python.inputs import Input
@@ -42,12 +47,18 @@
}

LMI_DIST_ADV_MODEL = {
"RWForCausalLM",
"GPTNeoXForCausalLM",
"T5ForConditionalGeneration",
"RWForCausalLM", "GPTNeoXForCausalLM", "T5ForConditionalGeneration",
"LlamaForCausalLM"
}

PEFT_MODEL_TASK_TO_CLS = {
"SEQ_CLS": AutoModelForSequenceClassification,
"SEQ_2_SEQ_LM": AutoModelForSeq2SeqLM,
"CAUSAL_LM": AutoModelForCausalLM,
"TOKEN_CLS": AutoModelForTokenClassification,
"QUESTION_ANS": AutoModelForQuestionAnswering,
}


def get_torch_dtype_from_str(dtype: str):
if dtype == "auto":
@@ -65,7 +76,8 @@ def get_torch_dtype_from_str(dtype: str):
raise ValueError(f"Invalid data type: {dtype}")


def get_rolling_batch_class_from_str(rolling_batch_type: str, is_mpi: bool, model_config):
def get_rolling_batch_class_from_str(rolling_batch_type: str, is_mpi: bool,
model_config):
if rolling_batch_type == "auto":
architecture = model_config.architectures[0]
if architecture in LMI_DIST_ADV_MODEL and is_mpi:
@@ -95,6 +107,7 @@ def __init__(self):
self.rolling_batch_type = None
self.rolling_batch = None
self.model_config = None
self.peft_config = None

def initialize(self, properties: dict):
# model_id can point to huggingface model_id or local directory.
@@ -146,17 +159,18 @@ def initialize(self, properties: dict):
properties.get("dtype"))
self.rolling_batch_type = properties.get("rolling_batch", None)

self._read_model_config(model_id_or_path)

if self.rolling_batch_type:
self.rolling_batch_type = self.rolling_batch_type.lower()
is_mpi = properties.get("engine") != "Python"
if is_mpi:
self.device = int(os.getenv("LOCAL_RANK", 0))
model_config = AutoConfig.from_pretrained(model_id_or_path, **kwargs)
_rolling_batch_cls = get_rolling_batch_class_from_str(self.rolling_batch_type, is_mpi, model_config)
_rolling_batch_cls = get_rolling_batch_class_from_str(
self.rolling_batch_type, is_mpi, self.model_config)
self.rolling_batch = _rolling_batch_cls(model_id_or_path,
self.device, properties,
**kwargs)

self.initialized = True
return
elif self.enable_streaming:
@@ -165,7 +179,7 @@ def initialize(self, properties: dict):
return

if not task:
task = self.infer_task_from_model_architecture(model_id_or_path)
task = self.infer_task_from_model_architecture()

self.hf_pipeline = self.get_pipeline(task=task,
model_id_or_path=model_id_or_path,
@@ -249,12 +263,12 @@ def inference(self, inputs):
def get_pipeline(self, task: str, model_id_or_path: str, kwargs):
# define tokenizer or feature extractor as kwargs to load it the pipeline correctly
if task in {
"automatic-speech-recognition",
"image-segmentation",
"image-classification",
"audio-classification",
"object-detection",
"zero-shot-image-classification",
"automatic-speech-recognition",
"image-segmentation",
"image-classification",
"audio-classification",
"object-detection",
"zero-shot-image-classification",
}:
kwargs["feature_extractor"] = model_id_or_path
else:
@@ -266,10 +280,25 @@ def get_pipeline(self, task: str, model_id_or_path: str, kwargs):
use_pipeline = False
# build pipeline
if use_pipeline:
hf_pipeline = pipeline(task=task,
model=model_id_or_path,
device=self.device,
**kwargs)
if self.peft_config is not None:
kwargs.pop("tokenizer", None)
self.tokenizer = AutoTokenizer.from_pretrained(
self.peft_config.base_model_name_or_path)
base_model = PEFT_MODEL_TASK_TO_CLS[
self.peft_config.task_type].from_pretrained(
self.peft_config.base_model_name_or_path, **kwargs)
lora_model = PeftModel.from_pretrained(base_model,
model_id_or_path)
self.model = lora_model.merge_and_unload()
hf_pipeline = pipeline(task=task,
tokenizer=self.tokenizer,
model=self.model,
device=self.device)
else:
hf_pipeline = pipeline(task=task,
model=model_id_or_path,
device=self.device,
**kwargs)
else:
kwargs.pop("tokenizer", None)
self._init_model_and_tokenizer(model_id_or_path, **kwargs)
@@ -293,19 +322,27 @@ def get_pipeline(self, task: str, model_id_or_path: str, kwargs):
return hf_pipeline

def _init_model_and_tokenizer(self, model_id_or_path: str, **kwargs):
self.tokenizer = AutoTokenizer.from_pretrained(model_id_or_path,
padding_side="left")
model_config = AutoConfig.from_pretrained(model_id_or_path,
**kwargs)
self.model_config = model_config
architectures = model_config.architectures
if self.peft_config is not None:
self.tokenizer = AutoTokenizer.from_pretrained(
self.peft_config.base_model_name_or_path, padding_side="left")
else:
self.tokenizer = AutoTokenizer.from_pretrained(model_id_or_path,
padding_side="left")
architectures = self.model_config.architectures
if architectures and architectures[0].endswith(
"ForConditionalGeneration"):
self.model = AutoModelForSeq2SeqLM.from_pretrained(
model_id_or_path, **kwargs)
model_cls = AutoModelForSeq2SeqLM
else:
self.model = AutoModelForCausalLM.from_pretrained(
model_id_or_path, **kwargs)
model_cls = AutoModelForCausalLM

if self.peft_config is not None:
base_model = model_cls.from_pretrained(
self.peft_config.base_model_name_or_path, **kwargs)
lora_model = PeftModel.from_pretrained(base_model,
model_id_or_path)
self.model = lora_model.merge_and_unload()
else:
self.model = model_cls.from_pretrained(model_id_or_path, **kwargs)

if self.device:
self.model.to(self.device)
@@ -352,10 +389,8 @@ def wrapped_pipeline(inputs, *args, **kwargs):

return wrapped_pipeline

def infer_task_from_model_architecture(self, model_config_path: str):
model_config = AutoConfig.from_pretrained(
model_config_path, trust_remote_code=self.trust_remote_code)
architecture = model_config.architectures[0]
def infer_task_from_model_architecture(self):
architecture = self.model_config.architectures[0]

task = None
for arch_options in ARCHITECTURES_2_TASK:
@@ -368,6 +403,23 @@ def infer_task_from_model_architecture(self, model_config_path: str):
)
return task

def _read_model_config(self, model_config_path: str):
try:
self.model_config = AutoConfig.from_pretrained(
model_config_path, trust_remote_code=self.trust_remote_code)
except OSError:
logging.warning(
f"config.json not found for {model_config_path}. Attempting to load with peft"
)
self.peft_config = PeftConfig.from_pretrained(model_config_path)
self.model_config = AutoConfig.from_pretrained(
self.peft_config.base_model_name_or_path)
except Exception as e:
logging.error(
f"{model_config_path} does not contain a config.json or adapter_config.json for lora models. "
f"This is required for loading huggingface models")
raise e


_service = HuggingFaceService()

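
In the Hugging Face handler the detection lives in the same kind of _read_model_config fallback, but the base-model class is chosen from the adapter's declared task_type through PEFT_MODEL_TASK_TO_CLS before the merged model and the base tokenizer are handed to a transformers pipeline. The sketch below illustrates that dispatch under the same assumptions as the previous one (placeholder adapter id, abbreviated task map, no dtype or device handling).

from transformers import (pipeline, AutoModelForCausalLM, AutoModelForSeq2SeqLM,
                          AutoTokenizer)
from peft import PeftConfig, PeftModel

# Abbreviated version of the handler's PEFT_MODEL_TASK_TO_CLS map.
TASK_TO_CLS = {
    "CAUSAL_LM": AutoModelForCausalLM,
    "SEQ_2_SEQ_LM": AutoModelForSeq2SeqLM,
}

adapter_path = "my-org/my-lora-adapter"  # placeholder
peft_config = PeftConfig.from_pretrained(adapter_path)

# Pick the auto-class that matches the adapter's task, load the base model,
# apply the adapter, and merge the LoRA weights into the base weights.
base_cls = TASK_TO_CLS[peft_config.task_type]
base = base_cls.from_pretrained(peft_config.base_model_name_or_path)
model = PeftModel.from_pretrained(base, adapter_path).merge_and_unload()

# Adapters ship no tokenizer, so it always comes from the base model.
tokenizer = AutoTokenizer.from_pretrained(peft_config.base_model_name_or_path)
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
print(generator("Hello, my name is", max_new_tokens=16))
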
3 changes: 2 additions & 1 deletion serving/docker/deepspeed.Dockerfile
@@ -22,6 +22,7 @@ ARG transformers_version=4.29.2
ARG accelerate_version=0.19.0
ARG diffusers_version=0.15.0
ARG bitsandbytes_version=0.39.1
ARG peft_version=0.3.0

EXPOSE 8080

@@ -60,7 +61,7 @@ RUN apt-get update && \
pip3 install torch==${torch_version} torchvision==${torch_vision_version} --extra-index-url https://download.pytorch.org/whl/cu118 \
${deepspeed_wheel} ${lmi_dist_wheel} protobuf==${protobuf_version} transformers==${transformers_version} \
mpi4py sentencepiece einops accelerate==${accelerate_version} bitsandbytes==${bitsandbytes_version}\
diffusers[torch]==${diffusers_version} opencv-contrib-python-headless safetensors scipy && \
diffusers[torch]==${diffusers_version} peft==${peft_version} opencv-contrib-python-headless safetensors scipy && \
scripts/install_flash_attn.sh && \
scripts/install_aitemplate.sh && \
scripts/patch_oss_dlc.sh python && \
12 changes: 12 additions & 0 deletions tests/integration/llm/client.py
@@ -133,6 +133,12 @@ def get_model_name():
"batch_size": [1, 4],
"seq_length": [16, 32],
"worker": 2,
},
"gpt4all-lora": {
"max_memory_per_gpu": [6.0, 8.0],
"batch_size": [1, 4],
"seq_length": [16, 32],
"worker": 1,
}
}

@@ -165,6 +171,12 @@ def get_model_name():
"seq_length": [16],
"worker": 1,
"stream_output": True,
},
"gpt4all-lora": {
"max_memory_per_gpu": [6.0, 8.0],
"batch_size": [1, 4],
"seq_length": [16, 32],
"worker": 1,
}
}

