diff --git a/.github/workflows/llm_integration.yml b/.github/workflows/llm_integration.yml
index a24b1827f..6e4a438a7 100644
--- a/.github/workflows/llm_integration.yml
+++ b/.github/workflows/llm_integration.yml
@@ -170,6 +170,15 @@ jobs:
           serve
           python3 llm/client.py huggingface gpt-j-6b
           docker rm -f $(docker ps -aq)
+      - name: Test gpt4all-lora
+        working-directory: tests/integration
+        run: |
+          rm -rf models
+          python3 llm/prepare.py huggingface gpt4all-lora
+          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models deepspeed \
+          serve
+          python3 llm/client.py huggingface gpt4all-lora
+          docker rm -f $(docker ps -aq)
       - name: Test streaming bigscience/bloom-3b
         working-directory: tests/integration
         run: |
@@ -292,6 +301,15 @@ jobs:
           serve
           python3 llm/client.py deepspeed opt-13b
           docker rm -f $(docker ps -aq)
+      - name: Test gpt4all-lora
+        working-directory: tests/integration
+        run: |
+          rm -rf models
+          python3 llm/prepare.py deepspeed gpt4all-lora
+          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models deepspeed \
+          serve
+          python3 llm/client.py deepspeed gpt4all-lora
+          docker rm -f $(docker ps -aq)
       - name: Test streaming gpt-neo-1.3b
         working-directory: tests/integration
         run: |
diff --git a/engines/python/setup/djl_python/deepspeed.py b/engines/python/setup/djl_python/deepspeed.py
index 9dad7e7e3..32fe44273 100644
--- a/engines/python/setup/djl_python/deepspeed.py
+++ b/engines/python/setup/djl_python/deepspeed.py
@@ -25,6 +25,7 @@
 from djl_python.outputs import Output
 from djl_python.streaming_utils import StreamingUtils
 from typing import Optional
+from peft import PeftConfig, PeftModel
 
 OPTIMIZED_MODEL_TYPES = {
     "roberta",
@@ -111,11 +112,13 @@ def __init__(self):
         self.enable_streaming = None
         self.trust_remote_code = os.environ.get("HF_TRUST_REMOTE_CODE",
                                                 "FALSE").lower() == 'true'
+        self.peft_config = None
         self.model = None
         self.tokenizer = None
 
     def initialize(self, properties: dict):
         self._parse_properties(properties)
+        self._read_model_config()
         self._validate_model_type_and_task()
         self.create_model_pipeline()
         self.logger.info(
@@ -181,19 +184,6 @@ def _get_ds_config(self, properties: dict):
         return ds_config
 
     def _validate_model_type_and_task(self):
-        if os.path.exists(self.model_id_or_path):
-            config_file = os.path.join(self.model_id_or_path, "config.json")
-            if not os.path.exists(config_file):
-                raise ValueError(
-                    f"{self.model_id_or_path} does not contain a config.json. "
-                    f"This is required for loading models from local storage")
-            self.model_config = AutoConfig.from_pretrained(
-                config_file, trust_remote_code=self.trust_remote_code)
-        else:
-            self.model_config = AutoConfig.from_pretrained(
-                self.model_id_or_path,
-                trust_remote_code=self.trust_remote_code)
-
         if self.model_config.model_type not in OPTIMIZED_MODEL_TYPES:
             self.logger.warning(
                 f"DeepSpeed does not currently support optimized CUDA kernels for the model type "
@@ -210,6 +200,25 @@
             raise ValueError(
                 f"task: {self.task} is not currently supported by DeepSpeed")
 
+    def _read_model_config(self):
+        try:
+            self.model_config = AutoConfig.from_pretrained(
+                self.model_id_or_path,
+                trust_remote_code=self.trust_remote_code)
+        except OSError:
+            self.logger.warning(
+                f"config.json not found for {self.model_id_or_path}. Attempting to load with peft"
+            )
+            self.peft_config = PeftConfig.from_pretrained(
+                self.model_id_or_path)
+            self.model_config = AutoConfig.from_pretrained(
+                self.peft_config.base_model_name_or_path)
+        except Exception as e:
+            self.logger.error(
+                f"{self.model_id_or_path} does not contain a config.json or adapter_config.json for lora models. "
+                f"This is required for loading huggingface models")
+            raise e
+
     def infer_task_from_model_architecture(self, config: PretrainedConfig):
         architecture = config.architectures[0]
         for arch_option in ARCHITECTURES_TO_TASK:
@@ -230,6 +239,21 @@ def create_model_pipeline(self):
             with deepspeed.OnDevice(dtype=dtype, device="meta"):
                 model = TASK_TO_MODEL[self.task].from_config(
                     self.model_config, **kwargs)
+        elif self.peft_config is not None:
+            self.logger.info(
+                f"Peft Model detected. Instantiating base model {self.peft_config.base_model_name_or_path}"
+            )
+            base_model = TASK_TO_MODEL[self.task].from_pretrained(
+                self.peft_config.base_model_name_or_path,
+                low_cpu_mem_usage=self.low_cpu_mem_usage,
+                trust_remote_code=self.trust_remote_code,
+                **kwargs)
+            lora_model = PeftModel.from_pretrained(base_model,
+                                                   self.model_id_or_path)
+            model = lora_model.merge_and_unload()
+            self.logger.info(
+                f"Peft Model merged into base model for deepspeed compatibility"
+            )
         else:
             model = TASK_TO_MODEL[self.task].from_pretrained(
                 self.model_id_or_path,
@@ -243,7 +267,12 @@
         if self.model_config.model_type in OPTIMIZED_MODEL_TYPES:
             self.ds_config["replace_with_kernel_inject"] = True
         self.model = deepspeed.init_inference(model, config=self.ds_config)
-        self.tokenizer = AutoTokenizer.from_pretrained(self.model_id_or_path)
+        if self.peft_config:
+            self.tokenizer = AutoTokenizer.from_pretrained(
+                self.peft_config.base_model_name_or_path)
+        else:
+            self.tokenizer = AutoTokenizer.from_pretrained(
+                self.model_id_or_path)
         if self.enable_streaming:
             return
         # Optimization for text-generation batch processing
diff --git a/engines/python/setup/djl_python/huggingface.py b/engines/python/setup/djl_python/huggingface.py
index 72029f604..632b8aa30 100644
--- a/engines/python/setup/djl_python/huggingface.py
+++ b/engines/python/setup/djl_python/huggingface.py
@@ -15,8 +15,13 @@
 import os
 
 import torch
-from transformers import pipeline, Conversation, AutoModelForCausalLM, AutoModelForSeq2SeqLM, AutoTokenizer, AutoConfig
+from transformers import (pipeline, Conversation, AutoModelForCausalLM,
+                          AutoModelForSeq2SeqLM, AutoTokenizer, AutoConfig,
+                          AutoModelForSequenceClassification,
+                          AutoModelForTokenClassification,
+                          AutoModelForQuestionAnswering)
 from transformers.tokenization_utils_base import PreTrainedTokenizerBase
+from peft import PeftConfig, PeftModel
 
 from djl_python.encode_decode import encode, decode
 from djl_python.inputs import Input
@@ -42,12 +47,18 @@
 }
 
 LMI_DIST_ADV_MODEL = {
-    "RWForCausalLM",
-    "GPTNeoXForCausalLM",
-    "T5ForConditionalGeneration",
+    "RWForCausalLM", "GPTNeoXForCausalLM", "T5ForConditionalGeneration",
     "LlamaForCausalLM"
 }
 
+PEFT_MODEL_TASK_TO_CLS = {
+    "SEQ_CLS": AutoModelForSequenceClassification,
+    "SEQ_2_SEQ_LM": AutoModelForSeq2SeqLM,
+    "CAUSAL_LM": AutoModelForCausalLM,
+    "TOKEN_CLS": AutoModelForTokenClassification,
+    "QUESTION_ANS": AutoModelForQuestionAnswering,
+}
+
 
 def get_torch_dtype_from_str(dtype: str):
     if dtype == "auto":
@@ -65,7 +76,8 @@ def get_torch_dtype_from_str(dtype: str):
     raise ValueError(f"Invalid data type: {dtype}")
 
 
-def get_rolling_batch_class_from_str(rolling_batch_type: str, is_mpi: bool, model_config):
+def get_rolling_batch_class_from_str(rolling_batch_type: str, is_mpi: bool,
+                                     model_config):
     if rolling_batch_type == "auto":
         architecture = model_config.architectures[0]
         if architecture in LMI_DIST_ADV_MODEL and is_mpi:
@@ -95,6 +107,7 @@ def __init__(self):
         self.rolling_batch_type = None
         self.rolling_batch = None
         self.model_config = None
+        self.peft_config = None
 
     def initialize(self, properties: dict):
         # model_id can point to huggingface model_id or local directory.
@@ -146,17 +159,18 @@
                 properties.get("dtype"))
         self.rolling_batch_type = properties.get("rolling_batch", None)
 
+        self._read_model_config(model_id_or_path)
+
         if self.rolling_batch_type:
             self.rolling_batch_type = self.rolling_batch_type.lower()
             is_mpi = properties.get("engine") != "Python"
             if is_mpi:
                 self.device = int(os.getenv("LOCAL_RANK", 0))
-            model_config = AutoConfig.from_pretrained(model_id_or_path, **kwargs)
-            _rolling_batch_cls = get_rolling_batch_class_from_str(self.rolling_batch_type, is_mpi, model_config)
+            _rolling_batch_cls = get_rolling_batch_class_from_str(
+                self.rolling_batch_type, is_mpi, self.model_config)
             self.rolling_batch = _rolling_batch_cls(model_id_or_path,
                                                     self.device, properties,
                                                     **kwargs)
-
             self.initialized = True
             return
         elif self.enable_streaming:
@@ -165,7 +179,7 @@
             return
 
         if not task:
-            task = self.infer_task_from_model_architecture(model_id_or_path)
+            task = self.infer_task_from_model_architecture()
 
         self.hf_pipeline = self.get_pipeline(task=task,
                                              model_id_or_path=model_id_or_path,
@@ -249,12 +263,12 @@ def inference(self, inputs):
     def get_pipeline(self, task: str, model_id_or_path: str, kwargs):
         # define tokenizer or feature extractor as kwargs to load it the pipeline correctly
         if task in {
-            "automatic-speech-recognition",
-            "image-segmentation",
-            "image-classification",
-            "audio-classification",
-            "object-detection",
-            "zero-shot-image-classification",
+                "automatic-speech-recognition",
+                "image-segmentation",
+                "image-classification",
+                "audio-classification",
+                "object-detection",
+                "zero-shot-image-classification",
         }:
             kwargs["feature_extractor"] = model_id_or_path
         else:
@@ -266,10 +280,25 @@ def get_pipeline(self, task: str, model_id_or_path: str, kwargs):
             use_pipeline = False
         # build pipeline
         if use_pipeline:
-            hf_pipeline = pipeline(task=task,
-                                   model=model_id_or_path,
-                                   device=self.device,
-                                   **kwargs)
+            if self.peft_config is not None:
+                kwargs.pop("tokenizer", None)
+                self.tokenizer = AutoTokenizer.from_pretrained(
+                    self.peft_config.base_model_name_or_path)
+                base_model = PEFT_MODEL_TASK_TO_CLS[
+                    self.peft_config.task_type].from_pretrained(
+                        self.peft_config.base_model_name_or_path, **kwargs)
+                lora_model = PeftModel.from_pretrained(base_model,
+                                                       model_id_or_path)
+                self.model = lora_model.merge_and_unload()
+                hf_pipeline = pipeline(task=task,
+                                       tokenizer=self.tokenizer,
+                                       model=self.model,
+                                       device=self.device)
+            else:
+                hf_pipeline = pipeline(task=task,
+                                       model=model_id_or_path,
+                                       device=self.device,
+                                       **kwargs)
         else:
             kwargs.pop("tokenizer", None)
             self._init_model_and_tokenizer(model_id_or_path, **kwargs)
@@ -293,19 +322,27 @@ def get_pipeline(self, task: str, model_id_or_path: str, kwargs):
         return hf_pipeline
 
     def _init_model_and_tokenizer(self, model_id_or_path: str, **kwargs):
-        self.tokenizer = AutoTokenizer.from_pretrained(model_id_or_path,
-                                                       padding_side="left")
-        model_config = AutoConfig.from_pretrained(model_id_or_path,
-                                                  **kwargs)
-        self.model_config = model_config
-        architectures = model_config.architectures
+        if self.peft_config is not None:
+            self.tokenizer = AutoTokenizer.from_pretrained(
+                self.peft_config.base_model_name_or_path, padding_side="left")
+        else:
+            self.tokenizer = AutoTokenizer.from_pretrained(model_id_or_path,
+                                                           padding_side="left")
+        architectures = self.model_config.architectures
         if architectures and architectures[0].endswith(
                 "ForConditionalGeneration"):
-            self.model = AutoModelForSeq2SeqLM.from_pretrained(
-                model_id_or_path, **kwargs)
+            model_cls = AutoModelForSeq2SeqLM
         else:
-            self.model = AutoModelForCausalLM.from_pretrained(
-                model_id_or_path, **kwargs)
+            model_cls = AutoModelForCausalLM
+
+        if self.peft_config is not None:
+            base_model = model_cls.from_pretrained(
+                self.peft_config.base_model_name_or_path, **kwargs)
+            lora_model = PeftModel.from_pretrained(base_model,
+                                                   model_id_or_path)
+            self.model = lora_model.merge_and_unload()
+        else:
+            self.model = model_cls.from_pretrained(model_id_or_path, **kwargs)
 
         if self.device:
             self.model.to(self.device)
@@ -352,10 +389,8 @@ def wrapped_pipeline(inputs, *args, **kwargs):
 
         return wrapped_pipeline
 
-    def infer_task_from_model_architecture(self, model_config_path: str):
-        model_config = AutoConfig.from_pretrained(
-            model_config_path, trust_remote_code=self.trust_remote_code)
-        architecture = model_config.architectures[0]
+    def infer_task_from_model_architecture(self):
+        architecture = self.model_config.architectures[0]
 
         task = None
         for arch_options in ARCHITECTURES_2_TASK:
@@ -368,6 +403,23 @@
             )
         return task
 
+    def _read_model_config(self, model_config_path: str):
+        try:
+            self.model_config = AutoConfig.from_pretrained(
+                model_config_path, trust_remote_code=self.trust_remote_code)
+        except OSError:
+            logging.warning(
+                f"config.json not found for {model_config_path}. Attempting to load with peft"
+            )
+            self.peft_config = PeftConfig.from_pretrained(model_config_path)
+            self.model_config = AutoConfig.from_pretrained(
+                self.peft_config.base_model_name_or_path)
+        except Exception as e:
+            logging.error(
+                f"{model_config_path} does not contain a config.json or adapter_config.json for lora models. "
+                f"This is required for loading huggingface models")
+            raise e
+
 
 _service = HuggingFaceService()
 
diff --git a/serving/docker/deepspeed.Dockerfile b/serving/docker/deepspeed.Dockerfile
index 07088e404..762b85a75 100644
--- a/serving/docker/deepspeed.Dockerfile
+++ b/serving/docker/deepspeed.Dockerfile
@@ -22,6 +22,7 @@ ARG transformers_version=4.29.2
 ARG accelerate_version=0.19.0
 ARG diffusers_version=0.15.0
 ARG bitsandbytes_version=0.39.1
+ARG peft_version=0.3.0
 
 EXPOSE 8080
 
@@ -60,7 +61,7 @@ RUN apt-get update && \
     pip3 install torch==${torch_version} torchvision==${torch_vision_version} --extra-index-url https://download.pytorch.org/whl/cu118 \
    ${deepspeed_wheel} ${lmi_dist_wheel} protobuf==${protobuf_version} transformers==${transformers_version} \
    mpi4py sentencepiece einops accelerate==${accelerate_version} bitsandbytes==${bitsandbytes_version}\
-    diffusers[torch]==${diffusers_version} opencv-contrib-python-headless safetensors scipy && \
+    diffusers[torch]==${diffusers_version} peft==${peft_version} opencv-contrib-python-headless safetensors scipy && \
     scripts/install_flash_attn.sh && \
     scripts/install_aitemplate.sh && \
     scripts/patch_oss_dlc.sh python && \
diff --git a/tests/integration/llm/client.py b/tests/integration/llm/client.py
index aac3fa7e6..b0686da95 100644
--- a/tests/integration/llm/client.py
+++ b/tests/integration/llm/client.py
@@ -133,6 +133,12 @@ def get_model_name():
         "batch_size": [1, 4],
         "seq_length": [16, 32],
         "worker": 2,
+    },
+    "gpt4all-lora": {
+        "max_memory_per_gpu": [6.0, 8.0],
+        "batch_size": [1, 4],
+        "seq_length": [16, 32],
+        "worker": 1,
     }
 }
 
@@ -165,6 +171,12 @@
         "seq_length": [16],
         "worker": 1,
         "stream_output": True,
+    },
+    "gpt4all-lora": {
+        "max_memory_per_gpu": [6.0, 8.0],
+        "batch_size": [1, 4],
+        "seq_length": [16, 32],
+        "worker": 1,
     }
 }
 
diff --git a/tests/integration/llm/prepare.py b/tests/integration/llm/prepare.py
index 502c7754e..b257fd0bd 100644
--- a/tests/integration/llm/prepare.py
+++ b/tests/integration/llm/prepare.py
@@ -120,6 +120,12 @@
         "option.device_map": "auto",
         "option.enable_streaming": True,
     },
+    "gpt4all-lora": {
+        "option.model_id": "nomic-ai/gpt4all-lora",
+        "option.tensor_parallel_degree": 4,
+        "option.task": "text-generation",
+        "option.dtype": "fp16"
+    }
 }
 
 ds_handler_list = {
@@ -153,6 +159,12 @@
         "option.dtype": "fp16",
         "option.enable_streaming": True
     },
+    "gpt4all-lora": {
+        "option.model_id": "nomic-ai/gpt4all-lora",
+        "option.tensor_parallel_degree": 4,
+        "option.task": "text-generation",
+        "option.dtype": "fp16"
+    }
 }
 
 sd_handler_list = {