Add peft support to default deepspeed and huggingface handlers
siddvenk committed Jun 30, 2023
1 parent cfb23d8 commit 24b855e
Showing 6 changed files with 172 additions and 48 deletions.
18 changes: 18 additions & 0 deletions .github/workflows/llm_integration.yml
@@ -170,6 +170,15 @@ jobs:
serve
python3 llm/client.py huggingface gpt-j-6b
docker rm -f $(docker ps -aq)
- name: Test gpt4all-lora
working-directory: tests/integration
run: |
rm -rf models
python3 llm/prepare.py huggingface gpt4all-lora
./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models deepspeed \
serve
python3 llm/client.py huggingface gpt4all-lora
docker rm -f $(docker ps -aq)
- name: Test streaming bigscience/bloom-3b
working-directory: tests/integration
run: |
@@ -292,6 +301,15 @@ jobs:
serve
python3 llm/client.py deepspeed opt-13b
docker rm -f $(docker ps -aq)
- name: Test gpt4all-lora
working-directory: tests/integration
run: |
rm -rf models
python3 llm/prepare.py deepspeed gpt4all-lora
./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models deepspeed \
serve
python3 llm/client.py deepspeed gpt4all-lora
docker rm -f $(docker ps -aq)
- name: Test streaming gpt-neo-1.3b
working-directory: tests/integration
run: |
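
The two new CI steps above exercise the change end to end: prepare a model directory that points at a LoRA adapter, launch the serving container, and query it with llm/client.py. As a rough illustration only, the Python sketch below sends one generation request to a locally running container; the endpoint path and payload shape are assumptions based on the usual djl-serving text-generation contract rather than anything taken from llm/client.py.

import requests

# Hypothetical smoke test against a locally running djl-serving container.
# Assumes the default port 8080 and the standard {"inputs": ..., "parameters": ...}
# payload accepted by the huggingface/deepspeed handlers.
url = "http://127.0.0.1:8080/invocations"
payload = {
    "inputs": "The new data science platform lets teams",
    "parameters": {"max_new_tokens": 32},
}
response = requests.post(url, json=payload)
response.raise_for_status()
print(response.json())
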
57 changes: 43 additions & 14 deletions engines/python/setup/djl_python/deepspeed.py
@@ -25,6 +25,7 @@
from djl_python.outputs import Output
from djl_python.streaming_utils import StreamingUtils
from typing import Optional
from peft import PeftConfig, PeftModel

OPTIMIZED_MODEL_TYPES = {
"roberta",
@@ -111,11 +112,13 @@ def __init__(self):
self.enable_streaming = None
self.trust_remote_code = os.environ.get("HF_TRUST_REMOTE_CODE",
"FALSE").lower() == 'true'
self.peft_config = None
self.model = None
self.tokenizer = None

def initialize(self, properties: dict):
self._parse_properties(properties)
self._read_model_config()
self._validate_model_type_and_task()
self.create_model_pipeline()
self.logger.info(
@@ -181,19 +184,6 @@ def _get_ds_config(self, properties: dict):
return ds_config

def _validate_model_type_and_task(self):
if os.path.exists(self.model_id_or_path):
config_file = os.path.join(self.model_id_or_path, "config.json")
if not os.path.exists(config_file):
raise ValueError(
f"{self.model_id_or_path} does not contain a config.json. "
f"This is required for loading models from local storage")
self.model_config = AutoConfig.from_pretrained(
config_file, trust_remote_code=self.trust_remote_code)
else:
self.model_config = AutoConfig.from_pretrained(
self.model_id_or_path,
trust_remote_code=self.trust_remote_code)

if self.model_config.model_type not in OPTIMIZED_MODEL_TYPES:
self.logger.warning(
f"DeepSpeed does not currently support optimized CUDA kernels for the model type "
@@ -210,6 +200,25 @@ def _validate_model_type_and_task(self):
raise ValueError(
f"task: {self.task} is not currently supported by DeepSpeed")

def _read_model_config(self):
try:
self.model_config = AutoConfig.from_pretrained(
self.model_id_or_path,
trust_remote_code=self.trust_remote_code)
except OSError:
self.logger.warning(
f"config.json not found for {self.model_id_or_path}. Attempting to load with peft"
)
self.peft_config = PeftConfig.from_pretrained(
self.model_id_or_path)
self.model_config = AutoConfig.from_pretrained(
self.peft_config.base_model_name_or_path)
except Exception as e:
self.logger.error(
f"{self.model_id_or_path} does not contain a config.json or adapter_config.json for lora models. "
f"This is required for loading huggingface models")
raise e

def infer_task_from_model_architecture(self, config: PretrainedConfig):
architecture = config.architectures[0]
for arch_option in ARCHITECTURES_TO_TASK:
@@ -230,6 +239,21 @@ def create_model_pipeline(self):
with deepspeed.OnDevice(dtype=dtype, device="meta"):
model = TASK_TO_MODEL[self.task].from_config(
self.model_config, **kwargs)
elif self.peft_config is not None:
self.logger.info(
f"Peft Model detected. Instantiating base model {self.peft_config.base_model_name_or_path}"
)
base_model = TASK_TO_MODEL[self.task].from_pretrained(
self.peft_config.base_model_name_or_path,
low_cpu_mem_usage=self.low_cpu_mem_usage,
trust_remote_code=self.trust_remote_code,
**kwargs)
lora_model = PeftModel.from_pretrained(base_model,
self.model_id_or_path)
model = lora_model.merge_and_unload()
self.logger.info(
f"Peft Model merged into base model for deepspeed compatibility"
)
else:
model = TASK_TO_MODEL[self.task].from_pretrained(
self.model_id_or_path,
@@ -243,7 +267,12 @@
if self.model_config.model_type in OPTIMIZED_MODEL_TYPES:
self.ds_config["replace_with_kernel_inject"] = True
self.model = deepspeed.init_inference(model, config=self.ds_config)
self.tokenizer = AutoTokenizer.from_pretrained(self.model_id_or_path)
if self.peft_config:
self.tokenizer = AutoTokenizer.from_pretrained(
self.peft_config.base_model_name_or_path)
else:
self.tokenizer = AutoTokenizer.from_pretrained(
self.model_id_or_path)
if self.enable_streaming:
return
# Optimization for text-generation batch processing
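
The heart of the DeepSpeed handler change is the new _read_model_config fallback plus the merge step in create_model_pipeline: when the model path holds only a PEFT adapter (adapter_config.json but no config.json), the handler resolves the base model from the adapter config, attaches the adapter, and merges the LoRA weights back into the base model so that deepspeed.init_inference sees a plain Hugging Face model. A condensed, standalone sketch of that pattern follows; the adapter id is a placeholder, and the dtype, task, and device handling of the real handler are omitted.

from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
from peft import PeftConfig, PeftModel

adapter_path = "my-org/my-lora-adapter"  # placeholder adapter id or local directory

try:
    # Full model: config.json is present, load everything directly.
    config = AutoConfig.from_pretrained(adapter_path)
    model = AutoModelForCausalLM.from_pretrained(adapter_path)
    tokenizer = AutoTokenizer.from_pretrained(adapter_path)
except OSError:
    # PEFT adapter: resolve the base model from adapter_config.json,
    # attach the adapter, then fold the LoRA weights into the base weights.
    peft_config = PeftConfig.from_pretrained(adapter_path)
    base = AutoModelForCausalLM.from_pretrained(peft_config.base_model_name_or_path)
    model = PeftModel.from_pretrained(base, adapter_path).merge_and_unload()
    tokenizer = AutoTokenizer.from_pretrained(peft_config.base_model_name_or_path)

# After merge_and_unload() no PEFT wrappers remain, so `model` can be handed to
# deepspeed.init_inference(model, config=...) like any other transformers model.
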
118 changes: 85 additions & 33 deletions engines/python/setup/djl_python/huggingface.py
@@ -15,8 +15,13 @@
import os

import torch
from transformers import pipeline, Conversation, AutoModelForCausalLM, AutoModelForSeq2SeqLM, AutoTokenizer, AutoConfig
from transformers import (pipeline, Conversation, AutoModelForCausalLM,
AutoModelForSeq2SeqLM, AutoTokenizer, AutoConfig,
AutoModelForSequenceClassification,
AutoModelForTokenClassification,
AutoModelForQuestionAnswering)
from transformers.tokenization_utils_base import PreTrainedTokenizerBase
from peft import PeftConfig, PeftModel

from djl_python.encode_decode import encode, decode
from djl_python.inputs import Input
@@ -42,12 +47,18 @@
}

LMI_DIST_ADV_MODEL = {
"RWForCausalLM",
"GPTNeoXForCausalLM",
"T5ForConditionalGeneration",
"RWForCausalLM", "GPTNeoXForCausalLM", "T5ForConditionalGeneration",
"LlamaForCausalLM"
}

PEFT_MODEL_TASK_TO_CLS = {
"SEQ_CLS": AutoModelForSequenceClassification,
"SEQ_2_SEQ_LM": AutoModelForSeq2SeqLM,
"CAUSAL_LM": AutoModelForCausalLM,
"TOKEN_CLS": AutoModelForTokenClassification,
"QUESTION_ANS": AutoModelForQuestionAnswering,
}


def get_torch_dtype_from_str(dtype: str):
if dtype == "auto":
@@ -65,7 +76,8 @@ def get_torch_dtype_from_str(dtype: str):
raise ValueError(f"Invalid data type: {dtype}")


def get_rolling_batch_class_from_str(rolling_batch_type: str, is_mpi: bool, model_config):
def get_rolling_batch_class_from_str(rolling_batch_type: str, is_mpi: bool,
model_config):
if rolling_batch_type == "auto":
architecture = model_config.architectures[0]
if architecture in LMI_DIST_ADV_MODEL and is_mpi:
@@ -95,6 +107,7 @@ def __init__(self):
self.rolling_batch_type = None
self.rolling_batch = None
self.model_config = None
self.peft_config = None

def initialize(self, properties: dict):
# model_id can point to huggingface model_id or local directory.
@@ -146,17 +159,18 @@ def initialize(self, properties: dict):
properties.get("dtype"))
self.rolling_batch_type = properties.get("rolling_batch", None)

self._read_model_config(model_id_or_path)

if self.rolling_batch_type:
self.rolling_batch_type = self.rolling_batch_type.lower()
is_mpi = properties.get("engine") != "Python"
if is_mpi:
self.device = int(os.getenv("LOCAL_RANK", 0))
model_config = AutoConfig.from_pretrained(model_id_or_path, **kwargs)
_rolling_batch_cls = get_rolling_batch_class_from_str(self.rolling_batch_type, is_mpi, model_config)
_rolling_batch_cls = get_rolling_batch_class_from_str(
self.rolling_batch_type, is_mpi, self.model_config)
self.rolling_batch = _rolling_batch_cls(model_id_or_path,
self.device, properties,
**kwargs)

self.initialized = True
return
elif self.enable_streaming:
@@ -165,7 +179,7 @@ def initialize(self, properties: dict):
return

if not task:
task = self.infer_task_from_model_architecture(model_id_or_path)
task = self.infer_task_from_model_architecture()

self.hf_pipeline = self.get_pipeline(task=task,
model_id_or_path=model_id_or_path,
@@ -249,12 +263,12 @@ def inference(self, inputs):
def get_pipeline(self, task: str, model_id_or_path: str, kwargs):
# define tokenizer or feature extractor as kwargs to load it the pipeline correctly
if task in {
"automatic-speech-recognition",
"image-segmentation",
"image-classification",
"audio-classification",
"object-detection",
"zero-shot-image-classification",
"automatic-speech-recognition",
"image-segmentation",
"image-classification",
"audio-classification",
"object-detection",
"zero-shot-image-classification",
}:
kwargs["feature_extractor"] = model_id_or_path
else:
@@ -266,10 +280,25 @@ def get_pipeline(self, task: str, model_id_or_path: str, kwargs):
use_pipeline = False
# build pipeline
if use_pipeline:
hf_pipeline = pipeline(task=task,
model=model_id_or_path,
device=self.device,
**kwargs)
if self.peft_config is not None:
kwargs.pop("tokenizer", None)
self.tokenizer = AutoTokenizer.from_pretrained(
self.peft_config.base_model_name_or_path)
base_model = PEFT_MODEL_TASK_TO_CLS[
self.peft_config.task_type].from_pretrained(
self.peft_config.base_model_name_or_path, **kwargs)
lora_model = PeftModel.from_pretrained(base_model,
model_id_or_path)
self.model = lora_model.merge_and_unload()
hf_pipeline = pipeline(task=task,
tokenizer=self.tokenizer,
model=self.model,
device=self.device)
else:
hf_pipeline = pipeline(task=task,
model=model_id_or_path,
device=self.device,
**kwargs)
else:
kwargs.pop("tokenizer", None)
self._init_model_and_tokenizer(model_id_or_path, **kwargs)
@@ -293,19 +322,27 @@ def get_pipeline(self, task: str, model_id_or_path: str, kwargs):
return hf_pipeline

def _init_model_and_tokenizer(self, model_id_or_path: str, **kwargs):
self.tokenizer = AutoTokenizer.from_pretrained(model_id_or_path,
padding_side="left")
model_config = AutoConfig.from_pretrained(model_id_or_path,
**kwargs)
self.model_config = model_config
architectures = model_config.architectures
if self.peft_config is not None:
self.tokenizer = AutoTokenizer.from_pretrained(
self.peft_config.base_model_name_or_path, padding_side="left")
else:
self.tokenizer = AutoTokenizer.from_pretrained(model_id_or_path,
padding_side="left")
architectures = self.model_config.architectures
if architectures and architectures[0].endswith(
"ForConditionalGeneration"):
self.model = AutoModelForSeq2SeqLM.from_pretrained(
model_id_or_path, **kwargs)
model_cls = AutoModelForSeq2SeqLM
else:
self.model = AutoModelForCausalLM.from_pretrained(
model_id_or_path, **kwargs)
model_cls = AutoModelForCausalLM

if self.peft_config is not None:
base_model = model_cls.from_pretrained(
self.peft_config.base_model_name_or_path, **kwargs)
lora_model = PeftModel.from_pretrained(base_model,
model_id_or_path)
self.model = lora_model.merge_and_unload()
else:
self.model = model_cls.from_pretrained(model_id_or_path, **kwargs)

if self.device:
self.model.to(self.device)
@@ -352,10 +389,8 @@ def wrapped_pipeline(inputs, *args, **kwargs):

return wrapped_pipeline

def infer_task_from_model_architecture(self, model_config_path: str):
model_config = AutoConfig.from_pretrained(
model_config_path, trust_remote_code=self.trust_remote_code)
architecture = model_config.architectures[0]
def infer_task_from_model_architecture(self):
architecture = self.model_config.architectures[0]

task = None
for arch_options in ARCHITECTURES_2_TASK:
@@ -368,6 +403,23 @@ def infer_task_from_model_architecture(self, model_config_path: str):
)
return task

def _read_model_config(self, model_config_path: str):
try:
self.model_config = AutoConfig.from_pretrained(
model_config_path, trust_remote_code=self.trust_remote_code)
except OSError:
logging.warning(
f"config.json not found for {model_config_path}. Attempting to load with peft"
)
self.peft_config = PeftConfig.from_pretrained(model_config_path)
self.model_config = AutoConfig.from_pretrained(
self.peft_config.base_model_name_or_path)
except Exception as e:
logging.error(
f"{model_config_path} does not contain a config.json or adapter_config.json for lora models. "
f"This is required for loading huggingface models")
raise e


_service = HuggingFaceService()

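
In the Hugging Face handler the detection lives in the same kind of _read_model_config fallback, but the base-model class is chosen from the adapter's declared task_type through PEFT_MODEL_TASK_TO_CLS before the merged model and the base tokenizer are handed to a transformers pipeline. The sketch below illustrates that dispatch under the same assumptions as the previous one (placeholder adapter id, abbreviated task map, no dtype or device handling).

from transformers import (pipeline, AutoModelForCausalLM, AutoModelForSeq2SeqLM,
                          AutoTokenizer)
from peft import PeftConfig, PeftModel

# Abbreviated version of the handler's PEFT_MODEL_TASK_TO_CLS map.
TASK_TO_CLS = {
    "CAUSAL_LM": AutoModelForCausalLM,
    "SEQ_2_SEQ_LM": AutoModelForSeq2SeqLM,
}

adapter_path = "my-org/my-lora-adapter"  # placeholder
peft_config = PeftConfig.from_pretrained(adapter_path)

# Pick the auto-class that matches the adapter's task, load the base model,
# apply the adapter, and merge the LoRA weights into the base weights.
base_cls = TASK_TO_CLS[peft_config.task_type]
base = base_cls.from_pretrained(peft_config.base_model_name_or_path)
model = PeftModel.from_pretrained(base, adapter_path).merge_and_unload()

# Adapters ship no tokenizer, so it always comes from the base model.
tokenizer = AutoTokenizer.from_pretrained(peft_config.base_model_name_or_path)
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
print(generator("Hello, my name is", max_new_tokens=16))
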
3 changes: 2 additions & 1 deletion serving/docker/deepspeed.Dockerfile
@@ -22,6 +22,7 @@ ARG transformers_version=4.29.2
ARG accelerate_version=0.19.0
ARG diffusers_version=0.15.0
ARG bitsandbytes_version=0.39.1
ARG peft_version=0.3.0

EXPOSE 8080

@@ -60,7 +61,7 @@ RUN apt-get update && \
pip3 install torch==${torch_version} torchvision==${torch_vision_version} --extra-index-url https://download.pytorch.org/whl/cu118 \
${deepspeed_wheel} ${lmi_dist_wheel} protobuf==${protobuf_version} transformers==${transformers_version} \
mpi4py sentencepiece einops accelerate==${accelerate_version} bitsandbytes==${bitsandbytes_version}\
diffusers[torch]==${diffusers_version} opencv-contrib-python-headless safetensors scipy && \
diffusers[torch]==${diffusers_version} peft==${peft_version} opencv-contrib-python-headless safetensors scipy && \
scripts/install_flash_attn.sh && \
scripts/install_aitemplate.sh && \
scripts/patch_oss_dlc.sh python && \
12 changes: 12 additions & 0 deletions tests/integration/llm/client.py
@@ -133,6 +133,12 @@ def get_model_name():
"batch_size": [1, 4],
"seq_length": [16, 32],
"worker": 2,
},
"gpt4all-lora": {
"max_memory_per_gpu": [6.0, 8.0],
"batch_size": [1, 4],
"seq_length": [16, 32],
"worker": 1,
}
}

@@ -165,6 +171,12 @@ def get_model_name():
"seq_length": [16],
"worker": 1,
"stream_output": True,
},
"gpt4all-lora": {
"max_memory_per_gpu": [6.0, 8.0],
"batch_size": [1, 4],
"seq_length": [16, 32],
"worker": 1,
}
}

