From d4995e54680e79ead5557b025e7f6c6a01875260 Mon Sep 17 00:00:00 2001
From: Jack Zhou
Date: Thu, 10 Nov 2022 14:59:07 +0800
Subject: [PATCH] [Model] Add stable diffusion model based on fastdeploy (#297)

* Add stable diffusion model based on fastdeploy
* Add sd infer
* pipelines->multimodal
* add create_ort_runtime
* use fp16 input
* fix pil
* Add optimize unet model
* add hf license
* Add workspace args
* Add profile func
* Add schedulers
* replace torch.Tensor by np.ndarray
* Add readme
* Add trt shape setting
* add dynamic shape
* Add dynamic shape for stable diffusion
* fix max shape setting
* rename tensorrt file suffix
* update dynamic shape setting
* Add scheduler output
* Add inference_steps and benchmark steps
* add diffuser benchmark
* Add paddle infer script
* Rename 1
* Rename infer.py to torch_onnx_infer.py
* Add export torch to onnx model
* remove export model
* Add paddle export model for diffusion
* Fix export model
* mv torch onnx infer to infer
* Fix export model
* Fix infer
* modify create_trt_runtime and create_ort_runtime
* update export torch
* update requirements
* add paddle inference backend
* Fix unet pp run
* remove print
* Add paddle model export and infer
* Add device id
* move profile to utils
* Add -1 device id
* Add safety checker args
* remove safety checker temporarily
* Add export model description
* Add predict description
* Fix readme
* Fix device_id description
* add timestep shape
* add use fp16 precision
* move use gpu
* Add EulerAncestralDiscreteScheduler
* Use EulerAncestralDiscreteScheduler with v1-5 model
* Add export model readme
* Add link of exported model
* Update scheduler on README
* Add stable-diffusion-v1-5
---
 .../multimodal/stable_diffusion/README.md     |   59 +
 .../stable_diffusion/config_utils.py          |  156 +++
 .../multimodal/stable_diffusion/export.md     |  105 ++
 .../stable_diffusion/export_model.py          |  100 ++
 .../export_torch_to_onnx_model.py             |  159 +++
 examples/multimodal/stable_diffusion/infer.py |  320 +++++
 .../pipeline_stable_diffusion.py              |  236 ++++
 .../stable_diffusion/requirements_paddle.txt  |    3 +
 .../stable_diffusion/requirements_torch.txt   |    5 +
 .../stable_diffusion/scheduling_utils.py      | 1128 +++++++++++++++++
 python/fastdeploy/__init__.py                 |    1 +
 python/fastdeploy/utils/__init__.py           |    1 +
 python/fastdeploy/utils/profile.py            |   28 +
 13 files changed, 2301 insertions(+)
 create mode 100644 examples/multimodal/stable_diffusion/README.md
 create mode 100644 examples/multimodal/stable_diffusion/config_utils.py
 create mode 100644 examples/multimodal/stable_diffusion/export.md
 create mode 100644 examples/multimodal/stable_diffusion/export_model.py
 create mode 100644 examples/multimodal/stable_diffusion/export_torch_to_onnx_model.py
 create mode 100644 examples/multimodal/stable_diffusion/infer.py
 create mode 100644 examples/multimodal/stable_diffusion/pipeline_stable_diffusion.py
 create mode 100644 examples/multimodal/stable_diffusion/requirements_paddle.txt
 create mode 100644 examples/multimodal/stable_diffusion/requirements_torch.txt
 create mode 100644 examples/multimodal/stable_diffusion/scheduling_utils.py
 create mode 100644 python/fastdeploy/utils/profile.py

diff --git a/examples/multimodal/stable_diffusion/README.md b/examples/multimodal/stable_diffusion/README.md
new file mode 100644
index 0000000000..0143e009ce
--- /dev/null
+++ b/examples/multimodal/stable_diffusion/README.md
@@ -0,0 +1,59 @@
+# High-Performance Deployment of Diffusion Models with FastDeploy
+
+This deployment example uses ⚡️`FastDeploy` to deploy Diffusion models with high performance, building on the `DiffusionPipeline` design of the Hugging Face [Diffusers](https://github.com/huggingface/diffusers) project.
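+
+In addition to the `infer.py` command line below, the exported sub-models can be driven directly from your own Python code. The following is a minimal sketch of how the pieces of this example fit together; it assumes the pre-exported `stable-diffusion-v1-4/` directory from the table below, GPU device 0, and the ONNX Runtime text encoder that `infer.py` currently uses. `infer.py` remains the reference for TensorRT, FP16 and dynamic-shape options:
+
+```python
+import fastdeploy as fd
+from paddlenlp.transformers import CLIPTokenizer
+
+from pipeline_stable_diffusion import StableDiffusionFastDeployPipeline
+from scheduling_utils import PNDMScheduler
+
+
+def create_runtime(model_dir, prefix, use_paddle=True, device_id=0):
+    # Build a FastDeploy Runtime for one exported sub-model (Paddle or ONNX Runtime backend).
+    option = fd.RuntimeOption()
+    if use_paddle:
+        option.use_paddle_backend()
+    else:
+        option.use_ort_backend()
+    option.use_gpu(device_id)
+    option.set_model_path(f"{model_dir}/{prefix}/inference.pdmodel",
+                          f"{model_dir}/{prefix}/inference.pdiparams")
+    return fd.Runtime(option)
+
+
+model_dir = "stable-diffusion-v1-4"
+pipe = StableDiffusionFastDeployPipeline(
+    vae_decoder_runtime=create_runtime(model_dir, "vae_decoder"),
+    text_encoder_runtime=create_runtime(model_dir, "text_encoder", use_paddle=False),
+    tokenizer=CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14"),
+    unet_runtime=create_runtime(model_dir, "unet"),
+    scheduler=PNDMScheduler(beta_start=0.00085, beta_end=0.012,
+                            beta_schedule="scaled_linear",
+                            num_train_timesteps=1000, skip_prk_steps=True))
+
+image = pipe("a photo of an astronaut riding a horse on mars",
+             num_inference_steps=50)[0]
+image.save("fd_astronaut_rides_horse.png")
+```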
+
+## Preparing the deployment model
+
+This example runs on a deployment model exported from the trained weights. There are two ways to obtain one:
+
+- Export it yourself by following the [model export guide](./export.md).
+- Download a pre-exported model. To make it easy to try this example quickly, some `Diffusion` models have already been exported; simply download one to get started:
+
+| Model | Scheduler |
+|----------|--------------|
+| [CompVis/stable-diffusion-v1-4](https://bj.bcebos.com/fastdeploy/models/stable-diffusion/CompVis/stable-diffusion-v1-4.tgz) | PNDM |
+| [runwayml/stable-diffusion-v1-5](https://bj.bcebos.com/fastdeploy/models/stable-diffusion/runwayml/stable-diffusion-v1-5.tgz) | EulerAncestral |
+
+## Environment requirements
+
+The example uses the tokenizer of the CLIP model from PaddleNLP, so install the following dependencies first:
+
+```shell
+pip install paddlenlp paddlepaddle-gpu
+```
+
+## Quick start
+
+With the deployment model prepared, inference can be run by pointing `infer.py` at the model directory and an inference backend:
+
+```
+python infer.py --model_dir stable-diffusion-v1-4/ --scheduler "pndm" --backend paddle
+```
+
+The resulting image file is fd_astronaut_rides_horse.png. A sample of the generated image is shown below (every run produces a different image; the sample is for reference only):
+
+![fd_astronaut_rides_horse.png](https://user-images.githubusercontent.com/10826371/200261112-68e53389-e0a0-42d1-8c3a-f35faa6627d7.png)
+
+To run inference with the stable-diffusion-v1-5 model instead, execute:
+
+```
+python infer.py --model_dir stable-diffusion-v1-5/ --scheduler "euler_ancestral" --backend paddle
+```
+
+### Command-line arguments
+
+Besides the options shown above, `infer.py` supports more command-line arguments, described below.
+
+| Argument | Description |
+|----------|--------------|
+| --model_dir | Directory of the exported model. |
+| --model_format | Model format. Defaults to `'paddle'`; choices: `['paddle', 'onnx']`. |
+| --backend | Inference backend. Defaults to `paddle`; choices: `['onnx_runtime', 'paddle']`. When the model format is `onnx`, only `['onnx_runtime']` is available. |
+| --scheduler | Scheduler of the StableDiffusion model. Defaults to `'pndm'`; choices: `['pndm', 'euler_ancestral']`. For the scheduler matching each StableDiffusion model, see the [ppdiffusers model list](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/examples/textual_inversion). |
+| --unet_model_prefix | File prefix of the UNet model. Defaults to `unet`. |
+| --vae_model_prefix | File prefix of the VAE model. Defaults to `vae_decoder`. |
+| --text_encoder_model_prefix | File prefix of the TextEncoder model. Defaults to `text_encoder`. |
+| --inference_steps | Number of UNet denoising steps. Defaults to 100. |
+| --image_path | Path of the generated image. Defaults to `fd_astronaut_rides_horse.png`. |
+| --device_id | GPU device id. If `device_id` is -1, inference runs on the CPU. |
+| --use_fp16 | Whether to use FP16 precision. Defaults to `False`. Can be set to `True` when using the tensorrt or paddle-tensorrt backend. |
diff --git a/examples/multimodal/stable_diffusion/config_utils.py b/examples/multimodal/stable_diffusion/config_utils.py
new file mode 100644
index 0000000000..b045e573aa
--- /dev/null
+++ b/examples/multimodal/stable_diffusion/config_utils.py
@@ -0,0 +1,156 @@
+# Copyright 2022 The HuggingFace Inc. team.
+# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import functools
+import inspect
+import logging
+from collections import OrderedDict
+from typing import Any, Dict, Tuple, Union
+
+logger = logging.getLogger(__name__)
+
+
+class ConfigMixin:
+    r"""
+    Base class for all configuration classes.
Stores all configuration parameters under `self.config` Also handles all + methods for loading/downloading/saving classes inheriting from [`ConfigMixin`] with + - [`~ConfigMixin.from_config`] + - [`~ConfigMixin.save_config`] + + Class attributes: + - **config_name** (`str`) -- A filename under which the config should stored when calling + [`~ConfigMixin.save_config`] (should be overridden by parent class). + - **ignore_for_config** (`List[str]`) -- A list of attributes that should not be saved in the config (should be + overridden by parent class). + """ + config_name = None + ignore_for_config = [] + + def register_to_config(self, **kwargs): + if self.config_name is None: + raise NotImplementedError( + f"Make sure that {self.__class__} has defined a class name `config_name`" + ) + kwargs["_class_name"] = self.__class__.__name__ + + # Special case for `kwargs` used in deprecation warning added to schedulers + # TODO: remove this when we remove the deprecation warning, and the `kwargs` argument, + # or solve in a more general way. + kwargs.pop("kwargs", None) + for key, value in kwargs.items(): + try: + setattr(self, key, value) + except AttributeError as err: + logger.error(f"Can't set {key} with value {value} for {self}") + raise err + + if not hasattr(self, "_internal_dict"): + internal_dict = kwargs + else: + previous_dict = dict(self._internal_dict) + internal_dict = { ** self._internal_dict, ** kwargs} + logger.debug( + f"Updating config from {previous_dict} to {internal_dict}") + + self._internal_dict = FrozenDict(internal_dict) + + @property + def config(self) -> Dict[str, Any]: + return self._internal_dict + + +class FrozenDict(OrderedDict): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + for key, value in self.items(): + setattr(self, key, value) + + self.__frozen = True + + def __delitem__(self, *args, **kwargs): + raise Exception( + f"You cannot use ``__delitem__`` on a {self.__class__.__name__} instance." + ) + + def setdefault(self, *args, **kwargs): + raise Exception( + f"You cannot use ``setdefault`` on a {self.__class__.__name__} instance." + ) + + def pop(self, *args, **kwargs): + raise Exception( + f"You cannot use ``pop`` on a {self.__class__.__name__} instance.") + + def update(self, *args, **kwargs): + raise Exception( + f"You cannot use ``update`` on a {self.__class__.__name__} instance." + ) + + def __setattr__(self, name, value): + if hasattr(self, "__frozen") and self.__frozen: + raise Exception( + f"You cannot use ``__setattr__`` on a {self.__class__.__name__} instance." + ) + super().__setattr__(name, value) + + def __setitem__(self, name, value): + if hasattr(self, "__frozen") and self.__frozen: + raise Exception( + f"You cannot use ``__setattr__`` on a {self.__class__.__name__} instance." + ) + super().__setitem__(name, value) + + +def register_to_config(init): + r""" + Decorator to apply on the init of classes inheriting from [`ConfigMixin`] so that all the arguments are + automatically sent to `self.register_for_config`. To ignore a specific argument accepted by the init but that + shouldn't be registered in the config, use the `ignore_for_config` class variable + + Warning: Once decorated, all private arguments (beginning with an underscore) are trashed and not sent to the init! + """ + + @functools.wraps(init) + def inner_init(self, *args, **kwargs): + # Ignore private kwargs in the init. 
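+        # Underscore-prefixed kwargs are dropped here; every other argument
+        # (positional, keyword, or default) is re-collected further down and
+        # forwarded to register_to_config so it ends up in `self.config`.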
+        init_kwargs = {
+            k: v
+            for k, v in kwargs.items() if not k.startswith("_")
+        }
+        init(self, *args, **init_kwargs)
+        if not isinstance(self, ConfigMixin):
+            raise RuntimeError(
+                f"`@register_for_config` was applied to {self.__class__.__name__} init method, but this class does "
+                "not inherit from `ConfigMixin`.")
+        ignore = getattr(self, "ignore_for_config", [])
+        # Get positional arguments aligned with kwargs
+        new_kwargs = {}
+        signature = inspect.signature(init)
+        parameters = {
+            name: p.default
+            for i, (name, p) in enumerate(signature.parameters.items())
+            if i > 0 and name not in ignore
+        }
+        for arg, name in zip(args, parameters.keys()):
+            new_kwargs[name] = arg
+
+        # Then add all kwargs
+        new_kwargs.update({
+            k: init_kwargs.get(k, default)
+            for k, default in parameters.items()
+            if k not in ignore and k not in new_kwargs
+        })
+        getattr(self, "register_to_config")(**new_kwargs)
+
+    return inner_init
diff --git a/examples/multimodal/stable_diffusion/export.md b/examples/multimodal/stable_diffusion/export.md
new file mode 100644
index 0000000000..ba2b4faf11
--- /dev/null
+++ b/examples/multimodal/stable_diffusion/export.md
@@ -0,0 +1,105 @@
+# Diffusion Model Export Guide
+
+This project supports two ways of exporting models: from [PPDiffusers](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers) and from [Diffusers](https://github.com/huggingface/diffusers). Both are described below.
+
+## Exporting PPDiffusers models
+
+[PPDiffusers](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers) is a diffusion model toolbox that supports cross-modal (e.g. image and audio) training and inference. It borrows the excellent design of the [Diffusers](https://github.com/huggingface/diffusers) project from the 🤗 Hugging Face team and builds on the [PaddlePaddle](https://github.com/PaddlePaddle/Paddle) framework and the [PaddleNLP](https://github.com/PaddlePaddle/PaddleNLP) natural language processing library. This section describes how to use FastDeploy to deploy the Diffusion models provided by PPDiffusers with high performance.
+
+### Installing dependencies
+
+Model export depends on `paddlepaddle`, `paddlenlp` and `ppdiffusers`, which can be installed quickly with `pip`:
+
+```shell
+pip install -r requirements_paddle.txt
+```
+
+### Exporting the model
+
+___Note: the StableDiffusion model is downloaded during export. To use the model and its weights, you must accept the License it requires: visit the Hugging Face [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5), read the License carefully, and sign the agreement.___
+
+___Tips: Stable Diffusion is released under the following license: The CreativeML OpenRAIL M license is an Open RAIL M license, adapted from the work that BigScience and the RAIL Initiative are jointly carrying in the area of responsible AI licensing. See also the article about the BLOOM Open RAIL license on which this license is based.___
+
+Run the following command to export the model:
+
+```shell
+python export_model.py --pretrained_model_name_or_path CompVis/stable-diffusion-v1-4 --output_path stable-diffusion-v1-4
+```
+
+The exported model directory is structured as follows:
+```shell
+stable-diffusion-v1-4/
+├── text_encoder
+│   ├── inference.pdiparams
+│   ├── inference.pdiparams.info
+│   └── inference.pdmodel
+├── unet
+│   ├── inference.pdiparams
+│   ├── inference.pdiparams.info
+│   └── inference.pdmodel
+└── vae_decoder
+    ├── inference.pdiparams
+    ├── inference.pdiparams.info
+    └── inference.pdmodel
+```
+
+#### Command-line arguments
+
+Description of the command-line arguments of `export_model.py`:
+
+| Argument | Description |
+|----------|--------------|
+| --pretrained_model_name_or_path | The pretrained diffusion model provided by ppdiffusers. Defaults to "CompVis/stable-diffusion-v1-4". For more pretrained diffusion models, see the [ppdiffusers model list](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/examples/textual_inversion). |
+| --output_path | Directory of the exported model. |
+
+
+## Exporting Diffusers models
+
+[Diffusers](https://github.com/huggingface/diffusers) is a diffusion model toolbox built by Hugging Face that supports cross-modal (e.g. image and audio) training and inference. Its underlying model code is available in both a PyTorch implementation and a Flax implementation. This section describes how to use FastDeploy to deploy the PyTorch implementation of the Diffusion model with high performance.
+
+### Installing dependencies
+
+Model export depends on `onnx`, `torch`, `diffusers` and `transformers`, which can be installed quickly with `pip`:
+
+```shell
+pip install -r requirements_torch.txt
+```
+
+### Exporting the model
+
+___Note: the StableDiffusion model is downloaded during export. To use the model and its weights, you must accept the License it requires and obtain a token granted by the HF Hub: visit the Hugging Face [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5), read the License carefully, and sign the agreement.___
+
+___Tips: Stable Diffusion is released under the following license: The CreativeML OpenRAIL M license is an Open RAIL M license, adapted from the work that BigScience and the RAIL Initiative are jointly carrying in the area of responsible AI licensing. See also the article about the BLOOM Open RAIL license on which this license is based.___
+
+If this is your first time exporting the model, log in to the Hugging Face client first:
+
+```shell
+huggingface-cli login
+```
+
+After logging in, run the following command to export the model:
+
+```shell
+python export_torch_to_onnx_model.py --pretrained_model_name_or_path CompVis/stable-diffusion-v1-4 --output_path torch_diffusion_model
+```
+
+The exported model directory is structured as follows:
+
+```shell
+torch_diffusion_model/
+├── text_encoder
+│   └── inference.onnx
+├── unet
+│   └── inference.onnx
+└── vae_decoder
+    └── inference.onnx
+```
+
+#### Command-line arguments
+
+Description of the command-line arguments of `export_torch_to_onnx_model.py`:
+
+| Argument | Description |
+|----------|--------------|
+| --pretrained_model_name_or_path | The pretrained diffusion model provided by diffusers. Defaults to "CompVis/stable-diffusion-v1-4". For more pretrained diffusion models, see the [Hugging Face model list](https://huggingface.co/CompVis/stable-diffusion-v1-4). |
+| --output_path | Directory of the exported model. |
diff --git a/examples/multimodal/stable_diffusion/export_model.py b/examples/multimodal/stable_diffusion/export_model.py
new file mode 100644
index 0000000000..00c6361e7e
--- /dev/null
+++ b/examples/multimodal/stable_diffusion/export_model.py
@@ -0,0 +1,100 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import paddle
+import paddlenlp
+
+from ppdiffusers import UNet2DConditionModel, AutoencoderKL
+from ppdiffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker
+from paddlenlp.transformers import CLIPTextModel
+
+
+def parse_arguments():
+    import argparse
+    import ast
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--pretrained_model_name_or_path",
+        default='CompVis/stable-diffusion-v1-4',
+        help="The pretrained diffusion model.")
+    parser.add_argument(
+        "--output_path",
+        type=str,
+        required=True,
+        help="The directory of the exported model.")
+    return parser.parse_args()
+
+
+class VAEDecoder(AutoencoderKL):
+    def forward(self, z):
+        return self.decode(z, True).sample
+
+
+if __name__ == "__main__":
+    paddle.set_device('cpu')
+    args = parse_arguments()
+    # Load models and create wrapper for stable diffusion
+    text_encoder = CLIPTextModel.from_pretrained(
+        os.path.join(args.pretrained_model_name_or_path, "text_encoder"))
+    vae_decoder = VAEDecoder.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="vae")
+    unet = UNet2DConditionModel.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="unet")
+
+    # Convert to static graph with specific input description
+    text_encoder = paddle.jit.to_static(
+        text_encoder,
+        input_spec=[
+            paddle.static.InputSpec(
+                shape=[None, None], dtype="int64",
+                name="input_ids")  # input_ids
+        ])
+
+    # Save text_encoder in static graph model.
+    save_path = os.path.join(args.output_path, "text_encoder", "inference")
+    paddle.jit.save(text_encoder, save_path)
+    print(f"Save text_encoder model in {save_path} successfully.")
+
+    # Convert to static graph with specific input description
+    vae_decoder = paddle.jit.to_static(
+        vae_decoder,
+        input_spec=[
+            paddle.static.InputSpec(
+                shape=[None, 4, 64, 64], dtype="float32",
+                name="latent"),  # latent
+        ])
+    # Save vae_decoder in static graph model.
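+    # paddle.jit.save writes inference.pdmodel / inference.pdiparams under this
+    # path prefix, which is the per-submodel layout that infer.py expects.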
+ save_path = os.path.join(args.output_path, "vae_decoder", "inference") + paddle.jit.save(vae_decoder, save_path) + print(f"Save vae_decoder model in {save_path} successfully.") + + # Convert to static graph with specific input description + unet = paddle.jit.to_static( + unet, + input_spec=[ + paddle.static.InputSpec( + shape=[None, 4, None, None], + dtype="float32", + name="latent_input"), # latent + paddle.static.InputSpec( + shape=[1], dtype="int64", name="timestep"), # timesteps + paddle.static.InputSpec( + shape=[None, None, 768], + dtype="float32", + name="encoder_embedding") # encoder_embedding + ]) + save_path = os.path.join(args.output_path, "unet", "inference") + paddle.jit.save(unet, save_path) + print(f"Save unet model in {save_path} successfully.") diff --git a/examples/multimodal/stable_diffusion/export_torch_to_onnx_model.py b/examples/multimodal/stable_diffusion/export_torch_to_onnx_model.py new file mode 100644 index 0000000000..a831ba803c --- /dev/null +++ b/examples/multimodal/stable_diffusion/export_torch_to_onnx_model.py @@ -0,0 +1,159 @@ +# Copyright 2022 The HuggingFace Inc. team. +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import onnx +import torch +import onnxsim +from typing import Optional, Tuple, Union +from diffusers import UNet2DConditionModel, AutoencoderKL +from transformers import CLIPTextModel +import os + + +def parse_arguments(): + import argparse + import ast + parser = argparse.ArgumentParser() + parser.add_argument( + "--pretrained_model_name_or_path", + default='CompVis/stable-diffusion-v1-4', + help="The pretrained diffusion model.") + parser.add_argument( + "--output_path", + type=str, + required=True, + help="The pretrained diffusion model.") + return parser.parse_args() + + +class VAEDecoder(AutoencoderKL): + def forward(self, z): + return self.decode(z, True).sample + + +if __name__ == "__main__": + args = parse_arguments() + + # 1. Load VAE model + vae_decoder = VAEDecoder.from_pretrained( + args.pretrained_model_name_or_path, + torch_dtype=torch.float16, + revision="fp16", + subfolder="vae", + use_auth_token=True) + + # 2. Load UNet model + unet = UNet2DConditionModel.from_pretrained( + args.pretrained_model_name_or_path, + torch_dtype=torch.float16, + revision="fp16", + subfolder="unet", + use_auth_token=True) + + # 3. 
Load CLIP model + text_encoder = CLIPTextModel.from_pretrained( + "openai/clip-vit-large-patch14") + + vae_decoder.cuda() + unet.cuda() + text_encoder.cuda() + + os.makedirs(args.output_path, exist_ok=True) + vae_decoder_path = os.path.join(args.output_path, "vae_decoder") + text_encoder_path = os.path.join(args.output_path, "text_encoder") + unet_path = os.path.join(args.output_path, "unet") + for p in [vae_decoder_path, text_encoder_path, unet_path]: + os.makedirs(p, exist_ok=True) + + with torch.inference_mode(): + # Export vae decoder model + vae_inputs = (torch.randn( + 1, 4, 64, 64, dtype=torch.half, device='cuda'), ) + torch.onnx.export( + vae_decoder, # model being run + vae_inputs, # model input (or a tuple for multiple inputs) + os.path.join( + vae_decoder_path, "inference.onnx" + ), # where to save the model (can be a file or file-like object) + export_params=True, # store the trained parameter weights inside the model file + opset_version=12, # the ONNX version to export the model to + do_constant_folding=True, # whether to execute constant folding for optimization + input_names=['latent'], + dynamic_axes={ + 'latent': { + 0: 'batch_size', + }, + 'image': { + 0: 'batch_size', + }, + }, + output_names=['image']) + print("Finish exporting vae decoder.") + + # Export the unet model + unet_inputs = (torch.randn( + 2, 4, 64, 64, dtype=torch.half, device='cuda'), torch.randn( + 1, dtype=torch.half, device='cuda'), torch.randn( + 2, 77, 768, dtype=torch.half, device='cuda')) + torch.onnx.export( + unet, # model being run + unet_inputs, # model input (or a tuple for multiple inputs) + os.path.join( + unet_path, "inference.onnx" + ), # where to save the model (can be a file or file-like object) + export_params=True, # store the trained parameter weights inside the model file + opset_version=12, # the ONNX version to export the model to + do_constant_folding=True, # whether to execute constant folding for optimization + input_names=['latent_input', 'timestep', 'encoder_embedding'], + dynamic_axes={ + 'latent_input': { + 0: 'batch_size', + }, + 'encoder_embedding': { + 0: 'batch_size', + 1: 'sequence' + }, + 'latent_output': { + 0: 'batch_size', + }, + }, + output_names=['latent_output']) + print("Finish exporting unet.") + + # Export the text_encoder + text_encoder_inputs = (torch.randint(0, 1, (2, 77), device='cuda'), ) + torch.onnx.export( + text_encoder, # model being run + text_encoder_inputs, # model input (or a tuple for multiple inputs) + os.path.join( + text_encoder_path, "inference.onnx" + ), # where to save the model (can be a file or file-like object) + export_params=True, # store the trained parameter weights inside the model file + opset_version=14, # the ONNX version to export the model to + do_constant_folding=True, # whether to execute constant folding for optimization + input_names=['input_ids'], + dynamic_axes={ + 'input_ids': { + 0: 'batch_size', + 1: 'sequence' + }, + 'logits': { + 0: 'batch_size', + 1: 'sequence' + } + }, + output_names=['logits']) + print("Finish exporting text encoder.") diff --git a/examples/multimodal/stable_diffusion/infer.py b/examples/multimodal/stable_diffusion/infer.py new file mode 100644 index 0000000000..148990996a --- /dev/null +++ b/examples/multimodal/stable_diffusion/infer.py @@ -0,0 +1,320 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import time +import os + +from pipeline_stable_diffusion import StableDiffusionFastDeployPipeline +from scheduling_utils import PNDMScheduler, EulerAncestralDiscreteScheduler + +try: + from paddlenlp.transformers import CLIPTokenizer +except ImportError: + from transformers import CLIPTokenizer + +import fastdeploy as fd +from fastdeploy import ModelFormat +import numpy as np +import distutils.util + + +def parse_arguments(): + import argparse + import ast + parser = argparse.ArgumentParser() + parser.add_argument( + "--model_dir", + default="paddle_diffusion_model", + help="The model directory of diffusion_model.") + parser.add_argument( + "--model_format", + default="paddle", + choices=['paddle', 'onnx'], + help="The model format.") + parser.add_argument( + "--unet_model_prefix", + default='unet', + help="The file prefix of unet model.") + parser.add_argument( + "--vae_model_prefix", + default='vae_decoder', + help="The file prefix of vae model.") + parser.add_argument( + "--text_encoder_model_prefix", + default='text_encoder', + help="The file prefix of text_encoder model.") + parser.add_argument( + "--inference_steps", + type=int, + default=100, + help="The number of unet inference steps.") + parser.add_argument( + "--benchmark_steps", + type=int, + default=1, + help="The number of performance benchmark steps.") + parser.add_argument( + "--backend", + type=str, + default='paddle', + # Note(zhoushunjie): Will support 'tensorrt', 'paddle-tensorrt' soon. + choices=[ + 'onnx_runtime', + 'paddle', + ], + help="The inference runtime backend of unet model and text encoder model." + ) + parser.add_argument( + "--image_path", + default="fd_astronaut_rides_horse.png", + help="The model directory of diffusion_model.") + parser.add_argument( + "--use_fp16", + type=distutils.util.strtobool, + default=False, + help="Wheter to use FP16 mode") + parser.add_argument( + "--device_id", + type=int, + default=0, + help="The selected gpu id. 
-1 means use cpu") + parser.add_argument( + "--scheduler", + type=str, + default='pndm', + choices=['pndm', 'euler_ancestral'], + help="The scheduler type of stable diffusion.") + return parser.parse_args() + + +def create_ort_runtime(model_dir, model_prefix, model_format, device_id=0): + option = fd.RuntimeOption() + option.use_ort_backend() + option.use_gpu(device_id) + if model_format == "paddle": + model_file = os.path.join(model_dir, model_prefix, "inference.pdmodel") + params_file = os.path.join(model_dir, model_prefix, + "inference.pdiparams") + option.set_model_path(model_file, params_file) + else: + onnx_file = os.path.join(model_dir, model_prefix, "inference.onnx") + option.set_model_path(onnx_file, model_format=ModelFormat.ONNX) + return fd.Runtime(option) + + +def create_paddle_inference_runtime(model_dir, + model_prefix, + use_trt=False, + dynamic_shape=None, + use_fp16=False, + device_id=0): + option = fd.RuntimeOption() + option.use_paddle_backend() + if device_id == -1: + option.use_cpu() + else: + option.use_gpu(device_id) + if use_trt: + option.use_trt_backend() + option.enable_paddle_to_trt() + if use_fp16: + option.enable_trt_fp16() + cache_file = os.path.join(model_dir, model_prefix, "inference.trt") + option.set_trt_cache_file(cache_file) + # Need to enable collect shape for ernie + if dynamic_shape is not None: + option.enable_paddle_trt_collect_shape() + for key, shape_dict in dynamic_shape.items(): + option.set_trt_input_shape( + key, + min_shape=shape_dict["min_shape"], + opt_shape=shape_dict.get("opt_shape", None), + max_shape=shape_dict.get("max_shape", None)) + model_file = os.path.join(model_dir, model_prefix, "inference.pdmodel") + params_file = os.path.join(model_dir, model_prefix, "inference.pdiparams") + option.set_model_path(model_file, params_file) + return fd.Runtime(option) + + +def create_trt_runtime(model_dir, + model_prefix, + model_format, + workspace=(1 << 31), + dynamic_shape=None, + device_id=0): + option = fd.RuntimeOption() + option.use_trt_backend() + option.use_gpu(device_id) + option.enable_trt_fp16() + option.set_trt_max_workspace_size(workspace) + if dynamic_shape is not None: + for key, shape_dict in dynamic_shape.items(): + option.set_trt_input_shape( + key, + min_shape=shape_dict["min_shape"], + opt_shape=shape_dict.get("opt_shape", None), + max_shape=shape_dict.get("max_shape", None)) + if model_format == "paddle": + model_file = os.path.join(model_dir, model_prefix, "inference.pdmodel") + params_file = os.path.join(model_dir, model_prefix, + "inference.pdiparams") + option.set_model_path(model_file, params_file) + else: + onnx_file = os.path.join(model_dir, model_prefix, "inference.onnx") + option.set_model_path(onnx_file, model_format=ModelFormat.ONNX) + cache_file = os.path.join(model_dir, model_prefix, "inference.trt") + option.set_trt_cache_file(cache_file) + return fd.Runtime(option) + + +def get_scheduler(args): + if args.scheduler == "pndm": + scheduler = PNDMScheduler( + beta_end=0.012, + beta_schedule="scaled_linear", + beta_start=0.00085, + num_train_timesteps=1000, + skip_prk_steps=True) + elif args.scheduler == "euler_ancestral": + scheduler = EulerAncestralDiscreteScheduler( + beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear") + else: + raise ValueError( + f"Scheduler '{args.scheduler}' is not supportted right now.") + return scheduler + + +if __name__ == "__main__": + args = parse_arguments() + # 1. Init scheduler + scheduler = get_scheduler(args) + + # 2. 
Init tokenizer + tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14") + + # 3. Set dynamic shape for trt backend + vae_dynamic_shape = { + "latent": { + "min_shape": [1, 4, 64, 64], + "max_shape": [2, 4, 64, 64], + "opt_shape": [2, 4, 64, 64], + } + } + + unet_dynamic_shape = { + "latent_input": { + "min_shape": [1, 4, 64, 64], + "max_shape": [2, 4, 64, 64], + "opt_shape": [2, 4, 64, 64], + }, + "timestep": { + "min_shape": [1], + "max_shape": [1], + "opt_shape": [1], + }, + "encoder_embedding": { + "min_shape": [1, 77, 768], + "max_shape": [2, 77, 768], + "opt_shape": [2, 77, 768], + }, + } + + # 4. Init runtime + if args.backend == "onnx_runtime": + text_encoder_runtime = create_ort_runtime( + args.model_dir, + args.text_encoder_model_prefix, + args.model_format, + device_id=args.device_id) + vae_decoder_runtime = create_ort_runtime( + args.model_dir, + args.vae_model_prefix, + args.model_format, + device_id=args.device_id) + start = time.time() + unet_runtime = create_ort_runtime( + args.model_dir, + args.unet_model_prefix, + args.model_format, + device_id=args.device_id) + print(f"Spend {time.time() - start : .2f} s to load unet model.") + elif args.backend == "paddle" or args.backend == "paddle-tensorrt": + use_trt = True if args.backend == "paddle-tensorrt" else False + # Note(zhoushunjie): Will change to paddle runtime later + text_encoder_runtime = create_ort_runtime( + args.model_dir, + args.text_encoder_model_prefix, + args.model_format, + device_id=args.device_id) + vae_decoder_runtime = create_paddle_inference_runtime( + args.model_dir, + args.vae_model_prefix, + use_trt, + vae_dynamic_shape, + use_fp16=args.use_fp16, + device_id=args.device_id) + start = time.time() + unet_runtime = create_paddle_inference_runtime( + args.model_dir, + args.unet_model_prefix, + use_trt, + unet_dynamic_shape, + use_fp16=args.use_fp16, + device_id=args.device_id) + print(f"Spend {time.time() - start : .2f} s to load unet model.") + elif args.backend == "tensorrt": + text_encoder_runtime = create_ort_runtime( + args.model_dir, args.text_encoder_model_prefix, args.model_format) + vae_decoder_runtime = create_trt_runtime( + args.model_dir, + args.vae_model_prefix, + args.model_format, + workspace=(1 << 30), + dynamic_shape=vae_dynamic_shape, + device_id=args.device_id) + start = time.time() + unet_runtime = create_trt_runtime( + args.model_dir, + args.unet_model_prefix, + args.model_format, + dynamic_shape=unet_dynamic_shape, + device_id=args.device_id) + print(f"Spend {time.time() - start : .2f} s to load unet model.") + pipe = StableDiffusionFastDeployPipeline( + vae_decoder_runtime=vae_decoder_runtime, + text_encoder_runtime=text_encoder_runtime, + tokenizer=tokenizer, + unet_runtime=unet_runtime, + scheduler=scheduler) + + prompt = "a photo of an astronaut riding a horse on mars" + # Warm up + pipe(prompt, num_inference_steps=10) + + time_costs = [] + print( + f"Run the stable diffusion pipeline {args.benchmark_steps} times to test the performance." + ) + for step in range(args.benchmark_steps): + start = time.time() + image = pipe(prompt, num_inference_steps=args.inference_steps)[0] + latency = time.time() - start + time_costs += [latency] + print(f"No {step:3d} time cost: {latency:2f} s") + print( + f"Mean latency: {np.mean(time_costs):2f} s, p50 latency: {np.percentile(time_costs, 50):2f} s, " + f"p90 latency: {np.percentile(time_costs, 90):2f} s, p95 latency: {np.percentile(time_costs, 95):2f} s." 
+ ) + image.save(args.image_path) + print(f"Image saved in {args.image_path}!") diff --git a/examples/multimodal/stable_diffusion/pipeline_stable_diffusion.py b/examples/multimodal/stable_diffusion/pipeline_stable_diffusion.py new file mode 100644 index 0000000000..0c7418bd7b --- /dev/null +++ b/examples/multimodal/stable_diffusion/pipeline_stable_diffusion.py @@ -0,0 +1,236 @@ +# Copyright 2022 The HuggingFace Inc. team. +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +from typing import Callable, List, Optional, Union +import numpy as np + +from paddlenlp.transformers import CLIPTokenizer +import fastdeploy as fd +from scheduling_utils import PNDMScheduler, LMSDiscreteScheduler, DDIMScheduler, EulerAncestralDiscreteScheduler +import PIL +from PIL import Image +import logging + + +class StableDiffusionFastDeployPipeline(object): + vae_decoder_runtime: fd.Runtime + text_encoder_runtime: fd.Runtime + tokenizer: CLIPTokenizer + unet_runtime: fd.Runtime + scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler, + EulerAncestralDiscreteScheduler] + + def __init__(self, + vae_decoder_runtime: fd.Runtime, + text_encoder_runtime: fd.Runtime, + tokenizer: CLIPTokenizer, + unet_runtime: fd.Runtime, + scheduler: Union[DDIMScheduler, PNDMScheduler, + LMSDiscreteScheduler]): + self.vae_decoder_runtime = vae_decoder_runtime + self.text_encoder_runtime = text_encoder_runtime + self.unet_runtime = unet_runtime + self.scheduler = scheduler + self.tokenizer = tokenizer + + def __call__( + self, + prompt: Union[str, List[str]], + height: Optional[int]=512, + width: Optional[int]=512, + num_inference_steps: Optional[int]=50, + guidance_scale: Optional[float]=7.5, + negative_prompt: Optional[Union[str, List[str]]]=None, + num_images_per_prompt: Optional[int]=1, + eta: Optional[float]=0.0, + generator: Optional[np.random.RandomState]=None, + latents: Optional[np.ndarray]=None, + output_type: Optional[str]="pil", + return_dict: bool=True, + callback: Optional[Callable[[int, int, np.ndarray], None]]=None, + callback_steps: Optional[int]=1, + **kwargs, ): + if isinstance(prompt, str): + batch_size = 1 + elif isinstance(prompt, list): + batch_size = len(prompt) + else: + raise ValueError( + f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" + ) + + if height % 8 != 0 or width % 8 != 0: + raise ValueError( + f"`height` and `width` have to be divisible by 8 but are {height} and {width}." 
+ ) + + if (callback_steps is None) or (callback_steps is not None and ( + not isinstance(callback_steps, int) or callback_steps <= 0)): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}.") + + if generator is None: + generator = np.random + + # get prompt text embeddings + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + return_tensors="np", ) + text_input_ids = text_inputs.input_ids + + if text_input_ids.shape[-1] > self.tokenizer.model_max_length: + removed_text = self.tokenizer.batch_decode( + text_input_ids[:, self.tokenizer.model_max_length:]) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}") + text_input_ids = text_input_ids[:, : + self.tokenizer.model_max_length] + + input_name = self.text_encoder_runtime.get_input_info(0).name + text_embeddings = self.text_encoder_runtime.infer({ + input_name: text_input_ids.astype(np.int64) + })[0] + text_embeddings = np.repeat( + text_embeddings, num_images_per_prompt, axis=0) + + do_classifier_free_guidance = guidance_scale > 1.0 + if do_classifier_free_guidance: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}.") + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] * batch_size + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`.") + else: + uncond_tokens = negative_prompt + + max_length = text_input_ids.shape[-1] + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="np") + uncond_embeddings = self.text_encoder_runtime.infer({ + input_name: uncond_input.input_ids.astype(np.int64) + })[0] + uncond_embeddings = np.repeat( + uncond_embeddings, num_images_per_prompt, axis=0) + # For classifier free guidance, we need to do two forward passes. 
+ # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + text_embeddings = np.concatenate( + [uncond_embeddings, text_embeddings]) + + # get the initial random noise unless the user supplied it + latents_dtype = text_embeddings.dtype + latents_shape = (batch_size * num_images_per_prompt, 4, height // 8, + width // 8) + if latents is None: + latents = generator.randn(*latents_shape).astype(latents_dtype) + elif latents.shape != latents_shape: + raise ValueError( + f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}" + ) + + # set timesteps + self.scheduler.set_timesteps(num_inference_steps) + + latents = latents * self.scheduler.init_noise_sigma + + accepts_eta = "eta" in set( + inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + for i, t in enumerate(self.scheduler.timesteps): + # expand the latents if we are doing classifier free guidance + latent_model_input = np.concatenate( + [latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input( + latent_model_input, t) + + # predict the noise residual + sample_name = self.unet_runtime.get_input_info(0).name + timestep_name = self.unet_runtime.get_input_info(1).name + encoder_hidden_states_name = self.unet_runtime.get_input_info( + 2).name + # Required fp16 input. + input_type = [np.float16, np.float16, np.float16] + if self.unet_runtime.get_input_info(0).dtype == fd.FDDataType.FP32: + input_type = [np.float32, np.int64, np.float32] + noise_pred = self.unet_runtime.infer({ + sample_name: latent_model_input.astype(input_type[0]), + timestep_name: np.array( + [t], dtype=input_type[1]), + encoder_hidden_states_name: + text_embeddings.astype(input_type[2]), + })[0] + # perform guidance + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = np.split(noise_pred, 2) + noise_pred = noise_pred_uncond + guidance_scale * ( + noise_pred_text - noise_pred_uncond) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents, + **extra_step_kwargs).prev_sample + latents = np.array(latents) + # call the callback, if provided + if callback is not None and i % callback_steps == 0: + callback(i, t, latents) + + # scale and decode the image latents with vae + latents = 1 / 0.18215 * latents + sample_name = self.vae_decoder_runtime.get_input_info(0).name + input_dtype = np.float16 + if self.vae_decoder_runtime.get_input_info( + 0).dtype == fd.FDDataType.FP32: + input_dtype = np.float32 + image = self.vae_decoder_runtime.infer({ + sample_name: latents.astype(input_dtype) + })[0] + + image = np.clip(image / 2 + 0.5, 0, 1) + image = image.transpose((0, 2, 3, 1)) + if output_type == "pil": + image = self.numpy_to_pil(image) + return image + + @staticmethod + def numpy_to_pil(images): + """ + Convert a numpy image or a batch of images to a PIL image. + """ + if images.ndim == 3: + images = images[None, ...] 
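+        # At this point the images are NHWC floats in [0, 1]; scale them to
+        # 8-bit before handing them to PIL.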
+ images = (images * 255).round().astype("uint8") + pil_images = [Image.fromarray(image) for image in images] + + return pil_images diff --git a/examples/multimodal/stable_diffusion/requirements_paddle.txt b/examples/multimodal/stable_diffusion/requirements_paddle.txt new file mode 100644 index 0000000000..73a56d2bde --- /dev/null +++ b/examples/multimodal/stable_diffusion/requirements_paddle.txt @@ -0,0 +1,3 @@ +ppdiffusers +paddlenlp +paddlepaddle-gpu diff --git a/examples/multimodal/stable_diffusion/requirements_torch.txt b/examples/multimodal/stable_diffusion/requirements_torch.txt new file mode 100644 index 0000000000..111a2fdeb3 --- /dev/null +++ b/examples/multimodal/stable_diffusion/requirements_torch.txt @@ -0,0 +1,5 @@ +onnx +torch +diffusers +transformers +scipy diff --git a/examples/multimodal/stable_diffusion/scheduling_utils.py b/examples/multimodal/stable_diffusion/scheduling_utils.py new file mode 100644 index 0000000000..0e6209cbd7 --- /dev/null +++ b/examples/multimodal/stable_diffusion/scheduling_utils.py @@ -0,0 +1,1128 @@ +# Copyright 2022 The HuggingFace Inc. team. +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +from typing import Optional, Tuple, Union, Any +from scipy import integrate + +import numpy as np +from config_utils import register_to_config, ConfigMixin +from dataclasses import dataclass +from collections import OrderedDict + +SCHEDULER_CONFIG_NAME = "scheduler_config.json" + + +class BaseOutput(OrderedDict): + """ + Base class for all model outputs as dataclass. Has a `__getitem__` that allows indexing by integer or slice (like a + tuple) or strings (like a dictionary) that will ignore the `None` attributes. Otherwise behaves like a regular + python dictionary. + + You can't unpack a `BaseOutput` directly. Use the [`~utils.BaseOutput.to_tuple`] method to convert it to a tuple + before. + + """ + + def __post_init__(self): + class_fields = fields(self) + + # Safety and consistency checks + if not len(class_fields): + raise ValueError(f"{self.__class__.__name__} has no fields.") + + first_field = getattr(self, class_fields[0].name) + other_fields_are_none = all( + getattr(self, field.name) is None for field in class_fields[1:]) + + if other_fields_are_none and isinstance(first_field, dict): + for key, value in first_field.items(): + self[key] = value + else: + for field in class_fields: + v = getattr(self, field.name) + if v is not None: + self[field.name] = v + + def __delitem__(self, *args, **kwargs): + raise Exception( + f"You cannot use ``__delitem__`` on a {self.__class__.__name__} instance." + ) + + def setdefault(self, *args, **kwargs): + raise Exception( + f"You cannot use ``setdefault`` on a {self.__class__.__name__} instance." 
+ ) + + def pop(self, *args, **kwargs): + raise Exception( + f"You cannot use ``pop`` on a {self.__class__.__name__} instance.") + + def update(self, *args, **kwargs): + raise Exception( + f"You cannot use ``update`` on a {self.__class__.__name__} instance." + ) + + def __getitem__(self, k): + if isinstance(k, str): + inner_dict = {k: v for (k, v) in self.items()} + if self.__class__.__name__ in [ + "StableDiffusionPipelineOutput", "ImagePipelineOutput" + ] and k == "sample": + deprecate("samples", "0.6.0", + "Please use `.images` or `'images'` instead.") + return inner_dict["images"] + return inner_dict[k] + else: + return self.to_tuple()[k] + + def __setattr__(self, name, value): + if name in self.keys() and value is not None: + # Don't call self.__setitem__ to avoid recursion errors + super().__setitem__(name, value) + super().__setattr__(name, value) + + def __setitem__(self, key, value): + # Will raise a KeyException if needed + super().__setitem__(key, value) + # Don't call self.__setattr__ to avoid recursion errors + super().__setattr__(key, value) + + def to_tuple(self) -> Tuple[Any]: + """ + Convert self to a tuple containing all the attributes/keys that are not `None`. + """ + return tuple(self[k] for k in self.keys()) + + +class SchedulerMixin: + """ + Mixin containing common functions for the schedulers. + """ + + config_name = SCHEDULER_CONFIG_NAME + + def set_format(self, tensor_format="pt"): + return self + + +class SchedulerOutput(BaseOutput): + """ + Base class for the scheduler's step function output. + Args: + prev_sample (`np.ndarray` of shape `(batch_size, num_channels, height, width)` for images): + Computed sample (x_{t-1}) of previous timestep. `prev_sample` should be used as next model input in the + denoising loop. + """ + + prev_sample: np.ndarray + + +class DDIMSchedulerOutput(BaseOutput): + """ + Output class for the scheduler's step function output. + Args: + prev_sample (` np.ndarray` of shape `(batch_size, num_channels, height, width)` for images): + Computed sample (x_{t-1}) of previous timestep. `prev_sample` should be used as next model input in the + denoising loop. + pred_original_sample (` np.ndarray` of shape `(batch_size, num_channels, height, width)` for images): + The predicted denoised sample (x_{0}) based on the model output from the current timestep. + `pred_original_sample` can be used to preview progress or for guidance. + """ + + prev_sample: np.ndarray + pred_original_sample: Optional[np.ndarray] = None + + +class LMSDiscreteSchedulerOutput(BaseOutput): + """ + Output class for the scheduler's step function output. + Args: + prev_sample (`np.ndarray` of shape `(batch_size, num_channels, height, width)` for images): + Computed sample (x_{t-1}) of previous timestep. `prev_sample` should be used as next model input in the + denoising loop. + pred_original_sample (`np.ndarray` of shape `(batch_size, num_channels, height, width)` for images): + The predicted denoised sample (x_{0}) based on the model output from the current timestep. + `pred_original_sample` can be used to preview progress or for guidance. + """ + + prev_sample: np.ndarray + pred_original_sample: Optional[np.ndarray] = None + + +class EulerAncestralDiscreteSchedulerOutput(BaseOutput): + """ + Output class for the scheduler's step function output. + Args: + prev_sample (`np.ndarray` of shape `(batch_size, num_channels, height, width)` for images): + Computed sample (x_{t-1}) of previous timestep. `prev_sample` should be used as next model input in the + denoising loop. 
+ pred_original_sample (`np.ndarray` of shape `(batch_size, num_channels, height, width)` for images): + The predicted denoised sample (x_{0}) based on the model output from the current timestep. + `pred_original_sample` can be used to preview progress or for guidance. + """ + + prev_sample: np.ndarray + pred_original_sample: Optional[np.ndarray] = None + + +def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999): + """ + Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of + (1-beta) over time from t = [0,1]. + + Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up + to that part of the diffusion process. + + + Args: + num_diffusion_timesteps (`int`): the number of betas to produce. + max_beta (`float`): the maximum beta to use; use values lower than 1 to + prevent singularities. + + Returns: + betas (`np.ndarray`): the betas used by the scheduler to step the model outputs + """ + + def alpha_bar(time_step): + return math.cos((time_step + 0.008) / 1.008 * math.pi / 2)**2 + + betas = [] + for i in range(num_diffusion_timesteps): + t1 = i / num_diffusion_timesteps + t2 = (i + 1) / num_diffusion_timesteps + betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta)) + return np.concatenate(betas).astype(np.float32) + + +class PNDMScheduler(SchedulerMixin, ConfigMixin): + @register_to_config + def __init__( + self, + num_train_timesteps: int=1000, + beta_start: float=0.0001, + beta_end: float=0.02, + beta_schedule: str="linear", + trained_betas: Optional[np.ndarray]=None, + skip_prk_steps: bool=False, + set_alpha_to_one: bool=False, + steps_offset: int=0, + **kwargs, ): + if trained_betas is not None: + self.betas = trained_betas + elif beta_schedule == "linear": + self.betas = np.linspace( + beta_start, beta_end, num_train_timesteps, dtype=np.float32) + elif beta_schedule == "scaled_linear": + # this schedule is very specific to the latent diffusion model. + self.betas = (np.linspace( + beta_start**0.5, + beta_end**0.5, + num_train_timesteps, + dtype=np.float32)**2) + elif beta_schedule == "squaredcos_cap_v2": + # Glide cosine schedule + self.betas = betas_for_alpha_bar(num_train_timesteps) + else: + raise NotImplementedError( + f"{beta_schedule} does is not implemented for {self.__class__}") + + self.alphas = 1.0 - self.betas + self.alphas_cumprod = np.cumprod(self.alphas, axis=0) + + self.final_alpha_cumprod = 1.0 if set_alpha_to_one else self.alphas_cumprod[ + 0] + + # standard deviation of the initial noise distribution + self.init_noise_sigma = 1.0 + + # For now we only support F-PNDM, i.e. the runge-kutta method + # For more information on the algorithm please take a look at the paper: https://arxiv.org/pdf/2202.09778.pdf + # mainly at formula (9), (12), (13) and the Algorithm 2. + self.pndm_order = 4 + + # running values + self.cur_model_output = 0 + self.counter = 0 + self.cur_sample = None + self.ets = [] + + # setable values + self.num_inference_steps = None + self._timesteps = np.arange( + 0, num_train_timesteps)[::-1].copy().astype("int64") + self.prk_timesteps = None + self.plms_timesteps = None + self.timesteps = None + + def set_timesteps(self, num_inference_steps: int, **kwargs) -> np.ndarray: + """ + Sets the discrete timesteps used for the diffusion chain. Supporting function to be run before inference. + + Args: + num_inference_steps (`int`): + the number of diffusion steps used when generating samples with a pre-trained model. 
+ """ + offset = self.config.steps_offset + + self.num_inference_steps = num_inference_steps + step_ratio = self.config.num_train_timesteps // self.num_inference_steps + # creates integer timesteps by multiplying by ratio + # casting to int to avoid issues when num_inference_step is power of 3 + self._timesteps = (np.arange(0, num_inference_steps) * + step_ratio).round() + self._timesteps += offset + + if self.config.skip_prk_steps: + # for some models like stable diffusion the prk steps can/should be skipped to + # produce better results. When using PNDM with `self.config.skip_prk_steps` the implementation + # is based on crowsonkb's PLMS sampler implementation: https://github.com/CompVis/latent-diffusion/pull/51 + self.prk_timesteps = np.array([]) + self.plms_timesteps = np.concatenate([ + self._timesteps[:-1], self._timesteps[-2:-1], + self._timesteps[-1:] + ])[::-1].copy() + else: + prk_timesteps = np.array(self._timesteps[ + -self.pndm_order:]).repeat(2) + np.tile( + np.array([ + 0, self.config.num_train_timesteps // + num_inference_steps // 2 + ]), self.pndm_order) + self.prk_timesteps = ( + prk_timesteps[:-1].repeat(2)[1:-1])[::-1].copy() + self.plms_timesteps = self._timesteps[:-3][::-1].copy() + + self.timesteps = np.concatenate( + [self.prk_timesteps, self.plms_timesteps]).astype(np.int64) + + self.ets = [] + self.counter = 0 + + def step( + self, + model_output: np.ndarray, + timestep: int, + sample: np.ndarray, + return_dict: bool=True, ): + """ + Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion + process from the learned model outputs (most often the predicted noise). + + This function calls `step_prk()` or `step_plms()` depending on the internal variable `counter`. + + Args: + model_output (`np.ndarray`): direct output from learned diffusion model. + timestep (`int`): current discrete timestep in the diffusion chain. + sample (`np.ndarray`): + current instance of sample being created by diffusion process. + return_dict (`bool`): option for returning tuple rather than SchedulerOutput class + + Returns: + [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`: + [`~schedulers.scheduling_utils.SchedulerOutput`] if `return_dict` is True, otherwise a `tuple`. When + returning a tuple, the first element is the sample tensor. + + """ + if self.counter < len( + self.prk_timesteps) and not self.config.skip_prk_steps: + return self.step_prk( + model_output=model_output, + timestep=timestep, + sample=sample, + return_dict=return_dict) + else: + return self.step_plms( + model_output=model_output, + timestep=timestep, + sample=sample, + return_dict=return_dict) + + def step_prk(self, + model_output: np.ndarray, + timestep: int, + sample: np.ndarray, + return_dict: bool=True): + """ + Step function propagating the sample with the Runge-Kutta method. RK takes 4 forward passes to approximate the + solution to the differential equation. + + Args: + model_output (`np.ndarray`): direct output from learned diffusion model. + timestep (`int`): current discrete timestep in the diffusion chain. + sample (`np.ndarray`): + current instance of sample being created by diffusion process. + + Returns: + [`~scheduling_utils.SchedulerOutput`] or `tuple`: [`~scheduling_utils.SchedulerOutput`] if `return_dict` is + True, otherwise a `tuple`. When returning a tuple, the first element is the sample tensor. 
+ + """ + if self.num_inference_steps is None: + raise ValueError( + "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler" + ) + + diff_to_prev = 0 if self.counter % 2 else self.config.num_train_timesteps // self.num_inference_steps // 2 + prev_timestep = timestep - diff_to_prev + timestep = self.prk_timesteps[self.counter // 4 * 4] + + if self.counter % 4 == 0: + self.cur_model_output += 1 / 6 * model_output + self.ets.append(model_output) + self.cur_sample = sample + elif (self.counter - 1) % 4 == 0: + self.cur_model_output += 1 / 3 * model_output + elif (self.counter - 2) % 4 == 0: + self.cur_model_output += 1 / 3 * model_output + elif (self.counter - 3) % 4 == 0: + model_output = self.cur_model_output + 1 / 6 * model_output + self.cur_model_output = 0 + + # cur_sample should not be `None` + cur_sample = self.cur_sample if self.cur_sample is not None else sample + + prev_sample = self._get_prev_sample(cur_sample, timestep, + prev_timestep, model_output) + self.counter += 1 + if not return_dict: + return (prev_sample, ) + + return SchedulerOutput(prev_sample=prev_sample) + + def step_plms(self, + model_output: np.ndarray, + timestep: int, + sample: np.ndarray, + return_dict: bool=True): + """ + Step function propagating the sample with the linear multi-step method. This has one forward pass with multiple + times to approximate the solution. + + Args: + model_output (`np.ndarray`): direct output from learned diffusion model. + timestep (`int`): current discrete timestep in the diffusion chain. + sample (`np.ndarray`): + current instance of sample being created by diffusion process. + return_dict (`bool`): option for returning tuple rather than SchedulerOutput class + + Returns: + [`~scheduling_utils.SchedulerOutput`] or `tuple`: [`~scheduling_utils.SchedulerOutput`] if `return_dict` is + True, otherwise a `tuple`. When returning a tuple, the first element is the sample tensor. 
+
+        """
+        if self.num_inference_steps is None:
+            raise ValueError(
+                "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
+            )
+
+        if not self.config.skip_prk_steps and len(self.ets) < 3:
+            raise ValueError(
+                f"{self.__class__} can only be run AFTER scheduler has been run "
+                "in 'prk' mode for at least 12 iterations. "
+                "See: https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/pipeline_pndm.py "
+                "for more information.")
+
+        prev_timestep = timestep - self.config.num_train_timesteps // self.num_inference_steps
+
+        if self.counter != 1:
+            self.ets.append(model_output)
+        else:
+            prev_timestep = timestep
+            timestep = timestep + self.config.num_train_timesteps // self.num_inference_steps
+
+        if len(self.ets) == 1 and self.counter == 0:
+            model_output = model_output
+            self.cur_sample = sample
+        elif len(self.ets) == 1 and self.counter == 1:
+            model_output = (model_output + self.ets[-1]) / 2
+            sample = self.cur_sample
+            self.cur_sample = None
+        elif len(self.ets) == 2:
+            model_output = (3 * self.ets[-1] - self.ets[-2]) / 2
+        elif len(self.ets) == 3:
+            model_output = (
+                23 * self.ets[-1] - 16 * self.ets[-2] + 5 * self.ets[-3]) / 12
+        else:
+            model_output = (1 / 24) * (55 * self.ets[-1] - 59 * self.ets[-2] +
+                                       37 * self.ets[-3] - 9 * self.ets[-4])
+
+        prev_sample = self._get_prev_sample(sample, timestep, prev_timestep,
+                                            model_output)
+        self.counter += 1
+        if not return_dict:
+            return (prev_sample, )
+        return SchedulerOutput(prev_sample=prev_sample)
+
+    def scale_model_input(self, sample: np.ndarray, *args,
+                          **kwargs) -> np.ndarray:
+        """
+        Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
+        current timestep.
+        Args:
+            sample (`np.ndarray`): input sample
+        Returns:
+            `np.ndarray`: scaled input sample
+        """
+        return sample
+
+    def _get_prev_sample(self, sample, timestep, prev_timestep, model_output):
+        alpha_prod_t = self.alphas_cumprod[timestep]
+        alpha_prod_t_prev = self.alphas_cumprod[
+            prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod
+        beta_prod_t = 1 - alpha_prod_t
+        beta_prod_t_prev = 1 - alpha_prod_t_prev
+
+        sample_coeff = (alpha_prod_t_prev / alpha_prod_t)**(0.5)
+
+        model_output_denom_coeff = alpha_prod_t * beta_prod_t_prev**(0.5) + (
+            alpha_prod_t * beta_prod_t * alpha_prod_t_prev)**(0.5)
+
+        prev_sample = (sample_coeff * sample -
+                       (alpha_prod_t_prev - alpha_prod_t
+                        ) * model_output / model_output_denom_coeff)
+
+        return prev_sample
+
+    def add_noise(
+            self,
+            original_samples: np.ndarray,
+            noise: np.ndarray,
+            timesteps: np.ndarray, ) -> np.ndarray:
+
+        sqrt_alpha_prod = self.alphas_cumprod[timesteps]**0.5
+        sqrt_alpha_prod = sqrt_alpha_prod.flatten()
+        while len(sqrt_alpha_prod.shape) < len(original_samples.shape):
+            # np.ndarray has no `unsqueeze`; use np.expand_dims to add a trailing axis
+            sqrt_alpha_prod = np.expand_dims(sqrt_alpha_prod, axis=-1)
+
+        sqrt_one_minus_alpha_prod = (1 - self.alphas_cumprod[timesteps])**0.5
+        sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten()
+        while len(sqrt_one_minus_alpha_prod.shape) < len(
+                original_samples.shape):
+            sqrt_one_minus_alpha_prod = np.expand_dims(
+                sqrt_one_minus_alpha_prod, axis=-1)
+
+        noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise
+        return noisy_samples
+
+    def __len__(self):
+        return self.config.num_train_timesteps
+
+
+class DDIMScheduler(SchedulerMixin, ConfigMixin):
+    """
+    Denoising diffusion implicit models is a scheduler that extends the denoising procedure introduced in denoising
+    diffusion probabilistic models (DDPMs)
with non-Markovian guidance. + + [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__` + function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`. + [`~ConfigMixin`] also provides general loading and saving functionality via the [`~ConfigMixin.save_config`] and + [`~ConfigMixin.from_config`] functions. + + For more details, see the original paper: https://arxiv.org/abs/2010.02502 + + Args: + num_train_timesteps (`int`): number of diffusion steps used to train the model. + beta_start (`float`): the starting `beta` value of inference. + beta_end (`float`): the final `beta` value. + beta_schedule (`str`): + the beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from + `linear`, `scaled_linear`, or `squaredcos_cap_v2`. + trained_betas (`np.ndarray`, optional): + option to pass an array of betas directly to the constructor to bypass `beta_start`, `beta_end` etc. + clip_sample (`bool`, default `True`): + option to clip predicted sample between -1 and 1 for numerical stability. + set_alpha_to_one (`bool`, default `True`): + each diffusion step uses the value of alphas product at that step and at the previous one. For the final + step there is no previous alpha. When this option is `True` the previous alpha product is fixed to `1`, + otherwise it uses the value of alpha at step 0. + steps_offset (`int`, default `0`): + an offset added to the inference steps. You can use a combination of `offset=1` and + `set_alpha_to_one=False`, to make the last step use step 0 for the previous alpha product, as done in + stable diffusion. + + """ + + @register_to_config + def __init__( + self, + num_train_timesteps: int=1000, + beta_start: float=0.0001, + beta_end: float=0.02, + beta_schedule: str="linear", + trained_betas: Optional[np.ndarray]=None, + clip_sample: bool=True, + set_alpha_to_one: bool=True, + steps_offset: int=0, + **kwargs, ): + if trained_betas is not None: + self.betas = trained_betas + elif beta_schedule == "linear": + self.betas = np.linspace( + beta_start, beta_end, num_train_timesteps, dtype=np.float32) + elif beta_schedule == "scaled_linear": + # this schedule is very specific to the latent diffusion model. + self.betas = (np.linspace( + beta_start**0.5, + beta_end**0.5, + num_train_timesteps, + dtype=np.float32)**2) + elif beta_schedule == "squaredcos_cap_v2": + # Glide cosine schedule + self.betas = betas_for_alpha_bar(num_train_timesteps) + else: + raise NotImplementedError( + f"{beta_schedule} does is not implemented for {self.__class__}") + + self.alphas = 1.0 - self.betas + self.alphas_cumprod = np.cumprod(self.alphas, axis=0) + + # At every step in ddim, we are looking into the previous alphas_cumprod + # For the final step, there is no previous alphas_cumprod because we are already at 0 + # `set_alpha_to_one` decides whether we set this parameter simply to one or + # whether we use the final alpha of the "non-previous" one. + self.final_alpha_cumprod = 1.0 if set_alpha_to_one else self.alphas_cumprod[ + 0] + + # standard deviation of the initial noise distribution + self.init_noise_sigma = 1.0 + + # setable values + self.num_inference_steps = None + self.timesteps = np.arange(0, num_train_timesteps)[::-1] + + def scale_model_input(self, + sample: np.ndarray, + timestep: Optional[int]=None) -> np.ndarray: + """ + Ensures interchangeability with schedulers that need to scale the denoising model input depending on the + current timestep. 
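+        The DDIM scheduler applies no scaling, so the sample is returned unchanged.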
+ Args: + sample (`np.ndarray`): input sample + timestep (`int`, optional): current timestep + Returns: + `np.ndarray`: scaled input sample + """ + return sample + + def _get_variance(self, timestep, prev_timestep): + alpha_prod_t = self.alphas_cumprod[timestep] + alpha_prod_t_prev = self.alphas_cumprod[ + prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod + beta_prod_t = 1 - alpha_prod_t + beta_prod_t_prev = 1 - alpha_prod_t_prev + + variance = (beta_prod_t_prev / beta_prod_t) * ( + 1 - alpha_prod_t / alpha_prod_t_prev) + + return variance + + def set_timesteps(self, num_inference_steps: int, **kwargs): + """ + Sets the discrete timesteps used for the diffusion chain. Supporting function to be run before inference. + + Args: + num_inference_steps (`int`): + the number of diffusion steps used when generating samples with a pre-trained model. + """ + offset = self.config.steps_offset + + self.num_inference_steps = num_inference_steps + step_ratio = self.config.num_train_timesteps // self.num_inference_steps + # creates integer timesteps by multiplying by ratio + # casting to int to avoid issues when num_inference_step is power of 3 + self.timesteps = (np.arange(0, num_inference_steps) * + step_ratio).round()[::-1] + self.timesteps += offset + + def step( + self, + model_output: np.ndarray, + timestep: int, + sample: np.ndarray, + eta: float=0.0, + use_clipped_model_output: bool=False, + generator=None, + return_dict: bool=True, ): + """ + Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion + process from the learned model outputs (most often the predicted noise). + + Args: + model_output (`np.ndarray`): direct output from learned diffusion model. + timestep (`int`): current discrete timestep in the diffusion chain. + sample (`np.ndarray`): + current instance of sample being created by diffusion process. + eta (`float`): weight of noise for added noise in diffusion step. + use_clipped_model_output (`bool`): TODO + generator: random number generator. + return_dict (`bool`): option for returning tuple rather than DDIMSchedulerOutput class + + Returns: + [`~scheduling_utils.DDIMSchedulerOutput`] or `tuple`: + [`~scheduling_utils.DDIMSchedulerOutput`] if `return_dict` is True, otherwise a `tuple`. When + returning a tuple, the first element is the sample tensor. + + """ + if self.num_inference_steps is None: + raise ValueError( + "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler" + ) + + # See formulas (12) and (16) of DDIM paper https://arxiv.org/pdf/2010.02502.pdf + # Ideally, read DDIM paper in-detail understanding + + # Notation ( -> + # - pred_noise_t -> e_theta(x_t, t) + # - pred_original_sample -> f_theta(x_t, t) or x_0 + # - std_dev_t -> sigma_t + # - eta -> η + # - pred_sample_direction -> "direction pointing to x_t" + # - pred_prev_sample -> "x_t-1" + + # 1. get previous step value (=t-1) + prev_timestep = timestep - self.config.num_train_timesteps // self.num_inference_steps + + # 2. compute alphas, betas + alpha_prod_t = self.alphas_cumprod[timestep] + alpha_prod_t_prev = self.alphas_cumprod[ + prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod + + beta_prod_t = 1 - alpha_prod_t + + # 3. compute predicted original sample from predicted noise also called + # "predicted x_0" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf + pred_original_sample = (sample - beta_prod_t** + (0.5) * model_output) / alpha_prod_t**(0.5) + + # 4. 
Clip "predicted x_0" + if self.config.clip_sample: + pred_original_sample = np.clip(pred_original_sample, -1, 1) + + # 5. compute variance: "sigma_t(η)" -> see formula (16) + # σ_t = sqrt((1 − α_t−1)/(1 − α_t)) * sqrt(1 − α_t/α_t−1) + variance = self._get_variance(timestep, prev_timestep) + std_dev_t = eta * variance**(0.5) + + if use_clipped_model_output: + # the model_output is always re-derived from the clipped x_0 in Glide + model_output = (sample - alpha_prod_t** + (0.5) * pred_original_sample) / beta_prod_t**(0.5) + + # 6. compute "direction pointing to x_t" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf + pred_sample_direction = (1 - alpha_prod_t_prev - std_dev_t**2)**( + 0.5) * model_output + + # 7. compute x_t without "random noise" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf + prev_sample = alpha_prod_t_prev**( + 0.5) * pred_original_sample + pred_sample_direction + + if eta > 0: + noise = np.random.randn(*model_output.shape) + variance = self._get_variance(timestep, prev_timestep)**( + 0.5) * eta * noise + + prev_sample = prev_sample + variance + if not return_dict: + return (prev_sample, ) + return DDIMSchedulerOutput( + prev_sample=prev_sample, pred_original_sample=pred_original_sample) + + def __len__(self): + return self.config.num_train_timesteps + + +class LMSDiscreteScheduler(SchedulerMixin, ConfigMixin): + """ + Linear Multistep Scheduler for discrete beta schedules. Based on the original k-diffusion implementation by + Katherine Crowson: + https://github.com/crowsonkb/k-diffusion/blob/481677d114f6ea445aa009cf5bd7a9cdee909e47/k_diffusion/sampling.py#L181 + + [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__` + function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`. + [`~ConfigMixin`] also provides general loading and saving functionality via the [`~ConfigMixin.save_config`] and + [`~ConfigMixin.from_config`] functions. + + Args: + num_train_timesteps (`int`): number of diffusion steps used to train the model. + beta_start (`float`): the starting `beta` value of inference. + beta_end (`float`): the final `beta` value. + beta_schedule (`str`): + the beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from + `linear` or `scaled_linear`. + trained_betas (`np.ndarray`, optional): + option to pass an array of betas directly to the constructor to bypass `beta_start`, `beta_end` etc. + + """ + + @register_to_config + def __init__( + self, + num_train_timesteps: int=1000, + beta_start: float=0.0001, + beta_end: float=0.02, + beta_schedule: str="linear", + trained_betas: Optional[np.ndarray]=None, + **kwargs, ): + if trained_betas is not None: + self.betas = trained_betas + elif beta_schedule == "linear": + self.betas = np.linspace( + beta_start, beta_end, num_train_timesteps, dtype=np.float32) + elif beta_schedule == "scaled_linear": + # this schedule is very specific to the latent diffusion model. 
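+            # betas are spaced linearly in sqrt-space, i.e.
+            # betas = linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps) ** 2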
+ self.betas = (np.linspace( + beta_start**0.5, + beta_end**0.5, + num_train_timesteps, + dtype=np.float32)**2) + else: + raise NotImplementedError( + f"{beta_schedule} does is not implemented for {self.__class__}") + + self.alphas = 1.0 - self.betas + self.alphas_cumprod = np.cumprod(self.alphas, axis=0) + + sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod)** + 0.5) + self.sigmas = np.concatenate([sigmas[::-1], [0.0]]).astype(np.float32) + + # standard deviation of the initial noise distribution + self.init_noise_sigma = self.sigmas.max() + + # setable values + self.num_inference_steps = None + self.timesteps = np.linspace( + 0, num_train_timesteps - 1, num_train_timesteps, + dtype=float)[::-1].copy() + self.derivatives = [] + + def scale_model_input(self, + sample: np.ndarray, + timestep: Union[float, np.ndarray]) -> np.ndarray: + """ + Scales the denoising model input by `(sigma**2 + 1) ** 0.5` to match the K-LMS algorithm. + Args: + sample (`np.ndarray`): input sample + timestep (`float` or `np.ndarray`): the current timestep in the diffusion chain + Returns: + `np.ndarray`: scaled input sample + """ + step_index = (self.timesteps == timestep).nonzero()[0] + sigma = self.sigmas[step_index] + sample = sample / ((sigma**2 + 1)**0.5) + self.is_scale_input_called = True + return sample + + def get_lms_coefficient(self, order, t, current_order): + """ + Compute a linear multistep coefficient. + + Args: + order (TODO): + t (TODO): + current_order (TODO): + """ + + def lms_derivative(tau): + prod = 1.0 + for k in range(order): + if current_order == k: + continue + prod *= (tau - self.sigmas[t - k]) / ( + self.sigmas[t - current_order] - self.sigmas[t - k]) + return prod + + integrated_coeff = integrate.quad( + lms_derivative, self.sigmas[t], self.sigmas[t + 1], epsrel=1e-4)[0] + + return integrated_coeff + + def set_timesteps(self, num_inference_steps: int): + """ + Sets the timesteps used for the diffusion chain. Supporting function to be run before inference. + + Args: + num_inference_steps (`int`): + the number of diffusion steps used when generating samples with a pre-trained model. + """ + self.num_inference_steps = num_inference_steps + + timesteps = np.linspace( + 0, + self.config.num_train_timesteps - 1, + num_inference_steps, + dtype=float)[::-1].copy() + sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod)** + 0.5) + sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas) + sigmas = np.concatenate([sigmas, [0.0]]).astype(np.float32) + self.sigmas = sigmas + self.timesteps = timesteps + + self.derivatives = [] + + def step( + self, + model_output: np.ndarray, + timestep: int, + sample: np.ndarray, + order: int=4, + return_dict: bool=True, ): + """ + Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion + process from the learned model outputs (most often the predicted noise). + + Args: + model_output (`np.ndarray`): direct output from learned diffusion model. + timestep (`int`): current discrete timestep in the diffusion chain. + sample (`np.ndarray`): + current instance of sample being created by diffusion process. + order: coefficient for multi-step inference. + return_dict (`bool`): option for returning tuple rather than LMSDiscreteSchedulerOutput class + + Returns: + [`~scheduling_utils.LMSDiscreteSchedulerOutput`] or `tuple`: + [`~scheduling_utils.LMSDiscreteSchedulerOutput`] if `return_dict` is True, otherwise a `tuple`. 
+ When returning a tuple, the first element is the sample tensor. + + """ + sigma = self.sigmas[int(timestep)] + + # 1. compute predicted original sample (x_0) from sigma-scaled predicted noise + pred_original_sample = sample - sigma * model_output + + # 2. Convert to an ODE derivative + derivative = (sample - pred_original_sample) / sigma + self.derivatives.append(derivative) + if len(self.derivatives) > order: + self.derivatives.pop(0) + + # 3. Compute linear multistep coefficients + order = min(timestep + 1, order) + lms_coeffs = [ + self.get_lms_coefficient(order, timestep, curr_order) + for curr_order in range(order) + ] + + # 4. Compute previous sample based on the derivatives path + prev_sample = sample + sum(coeff * derivative + for coeff, derivative in zip( + lms_coeffs, reversed(self.derivatives))) + + if not return_dict: + return (prev_sample, ) + + return LMSDiscreteSchedulerOutput( + prev_sample=prev_sample, pred_original_sample=pred_original_sample) + + def add_noise( + self, + original_samples: np.ndarray, + noise: np.ndarray, + timesteps: np.ndarray, ) -> np.ndarray: + sigmas = self.sigmas + + sigma = sigmas[timesteps].flatten() + while len(sigma.shape) < len(original_samples.shape): + sigma = sigma.unsqueeze(-1) + + noisy_samples = original_samples + noise * sigma + return noisy_samples + + def __len__(self): + return self.config.num_train_timesteps + + +class EulerAncestralDiscreteScheduler(SchedulerMixin, ConfigMixin): + """ + Ancestral sampling with Euler method steps. Based on the original k-diffusion implementation by Katherine Crowson: + https://github.com/crowsonkb/k-diffusion/blob/481677d114f6ea445aa009cf5bd7a9cdee909e47/k_diffusion/sampling.py#L72 + [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__` + function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`. + [`~ConfigMixin`] also provides general loading and saving functionality via the [`~ConfigMixin.save_config`] and + [`~ConfigMixin.from_config`] functions. + Args: + num_train_timesteps (`int`): number of diffusion steps used to train the model. + beta_start (`float`): the starting `beta` value of inference. + beta_end (`float`): the final `beta` value. + beta_schedule (`str`): + the beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from + `linear` or `scaled_linear`. + trained_betas (`np.ndarray`, optional): + option to pass an array of betas directly to the constructor to bypass `beta_start`, `beta_end` etc. + """ + + _compatible_classes = [ + "DDIMScheduler", + "DDPMScheduler", + "LMSDiscreteScheduler", + "PNDMScheduler", + "EulerDiscreteScheduler", + "DPMSolverMultistepScheduler", + ] + + @register_to_config + def __init__( + self, + num_train_timesteps: int=1000, + beta_start: float=0.0001, + beta_end: float=0.02, + beta_schedule: str="linear", + trained_betas: Optional[np.ndarray]=None, ): + if trained_betas is not None: + self.betas = np.array(trained_betas) + elif beta_schedule == "linear": + self.betas = np.linspace( + beta_start, beta_end, num_train_timesteps, dtype=np.float32) + elif beta_schedule == "scaled_linear": + # this schedule is very specific to the latent diffusion model. 
+            self.betas = (np.linspace(
+                beta_start**0.5,
+                beta_end**0.5,
+                num_train_timesteps,
+                dtype="float32")**2)
+        else:
+            raise NotImplementedError(
+                f"{beta_schedule} is not implemented for {self.__class__}")
+
+        self.alphas = 1.0 - self.betas
+        self.alphas_cumprod = np.cumprod(self.alphas, axis=0)
+
+        sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod)**
+                          0.5)
+        self.sigmas = np.concatenate([sigmas[::-1], [0.0]]).astype(np.float32)
+
+        # standard deviation of the initial noise distribution
+        self.init_noise_sigma = self.sigmas.max()
+
+        # setable values
+        self.num_inference_steps = None
+        timesteps = np.linspace(
+            0, num_train_timesteps - 1, num_train_timesteps,
+            dtype=float)[::-1].copy()
+        self.timesteps = timesteps
+        self.is_scale_input_called = False
+
+    def scale_model_input(self,
+                          sample: np.ndarray,
+                          timestep: Union[float, np.ndarray]) -> np.ndarray:
+        """
+        Scales the denoising model input by `(sigma**2 + 1) ** 0.5` to match the Euler algorithm.
+        Args:
+            sample (`np.ndarray`): input sample
+            timestep (`float` or `np.ndarray`): the current timestep in the diffusion chain
+        Returns:
+            `np.ndarray`: scaled input sample
+        """
+        step_index = (self.timesteps == timestep).nonzero()[0]
+        sigma = self.sigmas[step_index]
+        sample = sample / ((sigma**2 + 1)**0.5)
+        self.is_scale_input_called = True
+        return sample
+
+    def set_timesteps(self, num_inference_steps: int):
+        """
+        Sets the timesteps used for the diffusion chain. Supporting function to be run before inference.
+        Args:
+            num_inference_steps (`int`):
+                the number of diffusion steps used when generating samples with a pre-trained model.
+        """
+        self.num_inference_steps = num_inference_steps
+
+        timesteps = np.linspace(
+            0,
+            self.config.num_train_timesteps - 1,
+            num_inference_steps,
+            dtype=float)[::-1].copy()
+        sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod)**
+                          0.5)
+        sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas)
+        sigmas = np.concatenate([sigmas, [0.0]]).astype(np.float32)
+        self.sigmas = sigmas
+        self.timesteps = timesteps
+
+    def step(
+            self,
+            model_output: np.ndarray,
+            timestep: Union[float, np.ndarray],
+            sample: np.ndarray,
+            return_dict: bool=True, ) -> Union[
+                EulerAncestralDiscreteSchedulerOutput, Tuple]:
+        """
+        Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion
+        process from the learned model outputs (most often the predicted noise).
+        Args:
+            model_output (`np.ndarray`): direct output from learned diffusion model.
+            timestep (`float`): current timestep in the diffusion chain.
+            sample (`np.ndarray`):
+                current instance of sample being created by diffusion process.
+            return_dict (`bool`): option for returning tuple rather than EulerAncestralDiscreteSchedulerOutput class
+        Returns:
+            [`~scheduling_utils.EulerAncestralDiscreteSchedulerOutput`] or `tuple`:
+            [`~scheduling_utils.EulerAncestralDiscreteSchedulerOutput`] if `return_dict` is True, otherwise
+            a `tuple`. When returning a tuple, the first element is the sample tensor.
+        """
+        if not self.is_scale_input_called:
+            logger.warning(
+                "The `scale_model_input` function should be called before `step` to ensure correct denoising. "
+                "See `StableDiffusionPipeline` for a usage example.")
+        step_index = (self.timesteps == timestep).nonzero()[0]
+        sigma = self.sigmas[step_index]
+
+        # 1.
compute predicted original sample (x_0) from sigma-scaled predicted noise + pred_original_sample = sample - sigma * model_output + sigma_from = self.sigmas[step_index] + sigma_to = self.sigmas[step_index + 1] + sigma_up = (sigma_to**2 * (sigma_from**2 - sigma_to**2) / sigma_from + **2)**0.5 + sigma_down = (sigma_to**2 - sigma_up**2)**0.5 + + # 2. Convert to an ODE derivative + derivative = (sample - pred_original_sample) / sigma + + dt = sigma_down - sigma + + prev_sample = sample + derivative * dt + noise = np.random.randn(*model_output.shape).astype(model_output.dtype) + + prev_sample = prev_sample + noise * sigma_up + + if not return_dict: + return (prev_sample, ) + + return EulerAncestralDiscreteSchedulerOutput( + prev_sample=prev_sample, pred_original_sample=pred_original_sample) + + def add_noise( + self, + original_samples: np.ndarray, + noise: np.ndarray, + timesteps: np.ndarray, ) -> np.ndarray: + # Make sure sigmas and timesteps have the same device and dtype as original_samples + self.sigmas = self.sigmas.astype(original_samples.dtype) + + schedule_timesteps = self.timesteps + step_indices = [(schedule_timesteps == t).nonzero() for t in timesteps] + + sigma = self.sigmas[step_indices].flatten() + while len(sigma.shape) < len(original_samples.shape): + sigma = sigma.unsqueeze(-1) + + noisy_samples = original_samples + noise * sigma + return noisy_samples + + def __len__(self): + return self.config.num_train_timesteps diff --git a/python/fastdeploy/__init__.py b/python/fastdeploy/__init__.py index b767393f10..bacad5bdf7 100644 --- a/python/fastdeploy/__init__.py +++ b/python/fastdeploy/__init__.py @@ -37,3 +37,4 @@ from . import pipeline from . import text from .download import download, download_and_decompress, download_model +from .utils import profile diff --git a/python/fastdeploy/utils/__init__.py b/python/fastdeploy/utils/__init__.py index 97043fd7ba..7e1abd0b16 100644 --- a/python/fastdeploy/utils/__init__.py +++ b/python/fastdeploy/utils/__init__.py @@ -11,3 +11,4 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from .profile import profile diff --git a/python/fastdeploy/utils/profile.py b/python/fastdeploy/utils/profile.py new file mode 100644 index 0000000000..4c49cfe43e --- /dev/null +++ b/python/fastdeploy/utils/profile.py @@ -0,0 +1,28 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import cProfile, pstats, io +from pstats import SortKey + + +def profile(func, *args, **kwargs): + pr = cProfile.Profile() + pr.enable() + func(*args, **kwargs) + pr.disable() + s = io.StringIO() + sortby = SortKey.CUMULATIVE + ps = pstats.Stats(pr, stream=s).sort_stats(sortby) + ps.print_stats() + print(s.getvalue())