diff --git a/requirements.txt b/requirements.txt index 5dd75e0..e871871 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,4 +3,9 @@ skypilot fastapi supabase pytest -pytest-benchmark \ No newline at end of file +pytest-benchmark +tensorrt +torch +einops +tiktoken +uvicorn \ No newline at end of file diff --git a/servers/blip2.py b/servers/blip2.py new file mode 100644 index 0000000..b5115b6 --- /dev/null +++ b/servers/blip2.py @@ -0,0 +1,141 @@ +import argparse +import os + +import torch +import tensorrt as trt + +# isort: on +import tensorrt_llm + + +def get_engine_name(rank): + return "rank{}.engine".format(rank) + + +def trt_dtype_to_torch(dtype): + if dtype == trt.float16: + return torch.float16 + elif dtype == trt.float32: + return torch.float32 + elif dtype == trt.int32: + return torch.int32 + else: + raise TypeError("%s is not supported" % dtype) + + +def TRTOPT(args, config): + dtype = config["pretrained_config"]["dtype"] + world_size = config["pretrained_config"]["mapping"]["world_size"] + assert ( + world_size == tensorrt_llm.mpi_world_size() + ), f"Engine world size ({world_size}) != Runtime world size ({tensorrt_llm.mpi_world_size()})" + + use_gpt_attention_plugin = bool( + config["build_config"]["plugin_config"]["gpt_attention_plugin"] + ) + + num_heads = config["pretrained_config"]["num_attention_heads"] // world_size + hidden_size = config["pretrained_config"]["hidden_size"] // world_size + vocab_size = config["pretrained_config"]["vocab_size"] + max_batch_size = config["build_config"]["max_batch_size"] + num_layers = config["pretrained_config"]["num_hidden_layers"] + remove_input_padding = config["build_config"]["plugin_config"][ + "remove_input_padding" + ] + max_prompt_embedding_table_size = config["build_config"].get( + "max_prompt_embedding_table_size", 0 + ) + + model_config = tensorrt_llm.runtime.ModelConfig( + max_batch_size=max_batch_size, + vocab_size=vocab_size, + num_layers=num_layers, + num_heads=num_heads, + num_kv_heads=num_heads, + hidden_size=hidden_size, + gpt_attention_plugin=use_gpt_attention_plugin, + remove_input_padding=remove_input_padding, + max_prompt_embedding_table_size=max_prompt_embedding_table_size, + dtype=dtype, + ) + + runtime_rank = tensorrt_llm.mpi_rank() + runtime_mapping = tensorrt_llm.Mapping(world_size, runtime_rank) + torch.cuda.set_device(runtime_rank % runtime_mapping.gpus_per_node) + + engine_name = get_engine_name(runtime_rank) + serialize_path = os.path.join(args.opt_engine_dir, engine_name) + + tensorrt_llm.logger.set_level(args.log_level) + + with open(serialize_path, "rb") as f: + engine_buffer = f.read() + decoder = tensorrt_llm.runtime.GenerationSession( + model_config, engine_buffer, runtime_mapping + ) + + max_input_len = config["build_config"]["max_input_len"] + return decoder, model_config, world_size, dtype, max_input_len + + +def ptuning_setup( + prompt_table, + dtype, + hidden_size, + tasks, + input_ids, + input_lengths, + remove_input_padding, +): + if prompt_table is not None: + task_vocab_size = torch.tensor( + [prompt_table.shape[1]], dtype=torch.int32, device="cuda" + ) + prompt_table = prompt_table.view( + (prompt_table.shape[0] * prompt_table.shape[1], prompt_table.shape[2]) + ) + prompt_table = prompt_table.cuda().to( + dtype=tensorrt_llm._utils.str_dtype_to_torch(dtype) + ) + else: + prompt_table = torch.empty([1, hidden_size]).cuda() + task_vocab_size = torch.zeros([1]).cuda() + + num_sequences = input_lengths.size(0) if remove_input_padding else input_ids.size(0) + + if tasks is not None: + tasks = 
torch.tensor( + [int(t) for t in tasks.split(",")], dtype=torch.int32, device="cuda" + ) + assert ( + tasks.shape[0] == num_sequences + ), "Number of supplied tasks must match input batch size" + else: + tasks = torch.zeros([num_sequences], dtype=torch.int32).cuda() + + return [prompt_table, tasks, task_vocab_size] + + +def parse_arguments(): + parser = argparse.ArgumentParser() + parser.add_argument("--max_output_len", type=int, default=30) + parser.add_argument("--log_level", type=str, default="info") + parser.add_argument("--engine_dir", type=str, default="./plan") + parser.add_argument("--input_dir", type=str, default="image.pt") + parser.add_argument("--query_tokens", type=str, default="query_tokens.pt") + parser.add_argument( + "--opt_engine_dir", type=str, default="trt_engine/blip-2-opt-2.7b/fp16/1-gpu/" + ) + parser.add_argument("--hf_model_location", type=str, default="facebook/opt-2.7b") + parser.add_argument( + "--input_text", type=str, default="Question: which city is this? Answer:" + ) + parser.add_argument( + "--num_beams", type=int, help="Use beam search if num_beams >1", default=1 + ) + parser.add_argument( + "--max_txt_len", type=int, help="Max text prompt length", default=32 + ) + parser.add_argument("--top_k", type=int, default=1) + + return parser.parse_args() diff --git a/servers/fuyu_api.py b/servers/fuyu_api.py new file mode 100644 index 0000000..af846bb --- /dev/null +++ b/servers/fuyu_api.py @@ -0,0 +1,107 @@ +import argparse +import asyncio +import json +from typing import AsyncGenerator + +import uvicorn +from executor import GenerationExecutor +from fastapi import FastAPI, Request +from fastapi.responses import JSONResponse, Response, StreamingResponse +from swarms import Fuyu, Conversation + +TIMEOUT_KEEP_ALIVE = 5 # seconds. +TIMEOUT_TO_PREVENT_DEADLOCK = 1 # seconds. +app = FastAPI() +executor: GenerationExecutor | None = None + + +@app.get("/stats") +async def stats() -> Response: + assert executor is not None + return JSONResponse(json.loads(await executor.aget_stats())) + + +@app.get("/health") +async def health() -> Response: + """Health check.""" + return Response(status_code=200) + + +@app.post("/generate") +async def generate(request: Request) -> Response: + assert executor is not None + """Generate completion for the request. + + The request should be a JSON object with the following fields: + - prompt: the prompt to use for the generation. + - stream: whether to stream the results or not. + - other fields: the sampling parameters (See `SamplingParams` for details). 
+ """ + request_dict = await request.json() + + streaming = request_dict.pop("streaming", False) + + model_name = request.query_params.get("model_name") + max_new_tokens = request.query_params.get("max_new_tokens") + + model = Fuyu( + model_name=model_name, + max_new_tokens=max_new_tokens, + args=args # Injecting args into the Fuyu model + ) + response = model.run( + request_dict.pop("prompt"), + request_dict.pop("max_num_tokens", 8), + streaming + ) + + async def stream_results() -> AsyncGenerator[bytes, None]: + async for output in response: + yield (json.dumps({"text": output.text}) + "\n").encode("utf-8") + + if streaming: + return StreamingResponse(stream_results(), media_type="text/plain") + + # Non-streaming case + await response.await_completion() + + # Return model configurations as JSON + model_config = { + "model_name": model.model_name, + "max_new_tokens": model.max_new_tokens, + "args": { + "model_dir": args.model_dir, + "tokenizer_type": args.tokenizer_type, + "max_beam_width": args.max_beam_width + } + } + + return JSONResponse({"model_config": model_config, "choices": [{"text": response.text}]}) + + +async def main(args): + global executor + + executor = GenerationExecutor( + args.model_dir, args.tokenizer_type, args.max_beam_width + ) + config = uvicorn.Config( + app, + host=args.host, + port=args.port, + log_level="info", + timeout_keep_alive=TIMEOUT_KEEP_ALIVE, + ) + await uvicorn.Server(config).serve() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("model_dir") + parser.add_argument("tokenizer_type") + parser.add_argument("--host", type=str, default=None) + parser.add_argument("--port", type=int, default=8000) + parser.add_argument("--max_beam_width", type=int, default=1) + args = parser.parse_args() + + asyncio.run(main(args)) diff --git a/servers/qwen_tensort.py b/servers/qwen_tensort.py new file mode 100644 index 0000000..3325b79 --- /dev/null +++ b/servers/qwen_tensort.py @@ -0,0 +1,524 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import argparse +import json +import os +from typing import List, Tuple + +import tensorrt as trt +import torch +from transformers import AutoConfig, AutoTokenizer + +import tensorrt_llm +import tensorrt_llm.profiler as profiler +from tensorrt_llm import logger +from tensorrt_llm.quantization import QuantMode +from tensorrt_llm.runtime import ModelConfig, SamplingConfig, Session, TensorInfo + + +def get_engine_name(model, dtype, tp_size, pp_size, rank): + if pp_size == 1: + return "{}_{}_tp{}_rank{}.engine".format(model, dtype, tp_size, rank) + return "{}_{}_tp{}_pp{}_rank{}.engine".format(model, dtype, tp_size, pp_size, rank) + + +def trt_dtype_to_torch(dtype): + if dtype == trt.float16: + return torch.float16 + elif dtype == trt.float32: + return torch.float32 + elif dtype == trt.int32: + return torch.int32 + else: + raise TypeError("%s is not supported" % dtype) + + +class QWenInfer(object): + def __init__( + self, + tokenizer_dir, + qwen_engine_dir, + log_level, + output_csv, + output_npy, + num_beams, + ): + self.tokenizer_dir = tokenizer_dir + self.qwen_engine_dir = qwen_engine_dir + self.log_level = log_level + self.global_max_input_len = 2048 + self.decoder = None + self.tokenizer = None + self.config = None + self.sampling_config = None + self.output_csv = output_csv + self.output_npy = output_npy + self.num_beams = num_beams + self.model_config = None + + def get_model(self): + # --load the tokenizer and engine # + tokenizer = AutoTokenizer.from_pretrained( + self.tokenizer_dir, + legacy=False, + trust_remote_code=True, + ) + config_path = os.path.join(self.qwen_engine_dir, "config.json") + with open(config_path, "r") as f: + config = json.load(f) + gen_config_path = os.path.join(self.tokenizer_dir, "generation_config.json") + with open(gen_config_path, "r") as f: + gen_config = json.load(f) + top_k = gen_config["top_k"] + top_p = gen_config["top_p"] + chat_format = gen_config["chat_format"] + if chat_format == "raw": + eos_token_id = gen_config["eos_token_id"] + pad_token_id = gen_config["pad_token_id"] + elif chat_format == "chatml": + pad_token_id = eos_token_id = tokenizer.im_end_id + else: + raise Exception("unknown chat format ", chat_format) + + use_gpt_attention_plugin = config["plugin_config"]["gpt_attention_plugin"] + remove_input_padding = config["plugin_config"]["remove_input_padding"] + dtype = config["builder_config"]["precision"] + tp_size = config["builder_config"]["tensor_parallel"] + pp_size = config["builder_config"]["pipeline_parallel"] + world_size = tp_size * pp_size + assert ( + world_size == tensorrt_llm.mpi_world_size() + ), f"Engine world size ({world_size}) != Runtime world size ({tensorrt_llm.mpi_world_size()})" + num_heads = config["builder_config"]["num_heads"] // world_size + max_batch_size = config["builder_config"]["max_batch_size"] + hidden_size = config["builder_config"]["hidden_size"] // world_size + vocab_size = config["builder_config"]["vocab_size"] + num_layers = config["builder_config"]["num_layers"] + num_kv_heads = config["builder_config"].get("num_kv_heads", num_heads) + paged_kv_cache = config["plugin_config"]["paged_kv_cache"] + tokens_per_block = config["plugin_config"]["tokens_per_block"] + max_prompt_embedding_table_size = config["builder_config"].get( + "max_prompt_embedding_table_size", 0 + ) + quant_mode = QuantMode(config["builder_config"]["quant_mode"]) + if config["builder_config"].get("multi_query_mode", False): + tensorrt_llm.logger.warning( + "`multi_query_mode` config is deprecated. Please rebuild the engine." 
+ ) + num_kv_heads = 1 + # num_kv_heads = (num_kv_heads + tp_size - 1) // tp_size + use_custom_all_reduce = config["plugin_config"].get( + "use_custom_all_reduce", False + ) + + runtime_rank = tensorrt_llm.mpi_rank() + runtime_mapping = tensorrt_llm.Mapping( + world_size=world_size, rank=runtime_rank, tp_size=tp_size, pp_size=pp_size + ) + torch.cuda.set_device(runtime_rank % runtime_mapping.gpus_per_node) + + model_config = ModelConfig( + max_batch_size=max_batch_size, + num_heads=num_heads, + num_kv_heads=num_kv_heads, + hidden_size=hidden_size, + vocab_size=vocab_size, + num_layers=num_layers, + gpt_attention_plugin=use_gpt_attention_plugin, + paged_kv_cache=paged_kv_cache, + tokens_per_block=tokens_per_block, + remove_input_padding=remove_input_padding, + dtype=dtype, + quant_mode=quant_mode, + use_custom_all_reduce=use_custom_all_reduce, + max_prompt_embedding_table_size=max_prompt_embedding_table_size, + ) + sampling_config = SamplingConfig( + end_id=eos_token_id, + pad_id=pad_token_id, + num_beams=self.num_beams, + top_k=top_k, + top_p=top_p, + temperature=1.0, + ) + + engine_name = get_engine_name("qwen", dtype, tp_size, pp_size, runtime_rank) + serialize_path = os.path.join(self.qwen_engine_dir, engine_name) + print(f"Loading engine from {serialize_path}") + return ( + model_config, + sampling_config, + runtime_mapping, + runtime_rank, + serialize_path, + tokenizer, + eos_token_id, + pad_token_id, + ) + + def qwen_model_init(self): + ( + model_config, + sampling_config, + runtime_mapping, + runtime_rank, + serialize_path, + tokenizer, + eos_token_id, + pad_token_id, + ) = self.get_model() + with open(serialize_path, "rb") as f: + engine_buffer = f.read() + self.decoder = tensorrt_llm.runtime.GenerationSession( + model_config, + engine_buffer, + runtime_mapping, + ) + self.tokenizer = tokenizer + self.sampling_config = sampling_config + self.model_config = model_config + self.config, _ = AutoConfig.from_pretrained( + self.tokenizer_dir, + return_unused_kwargs=True, + trust_remote_code=True, + ) + + def ptuning_setup(self, prompt_table, dtype, hidden_size, tasks, input_ids): + if prompt_table is not None: + task_vocab_size = torch.tensor( + [prompt_table.shape[1]], dtype=torch.int32, device="cuda" + ) + prompt_table = prompt_table.view( + (prompt_table.shape[0] * prompt_table.shape[1], prompt_table.shape[2]) + ) + prompt_table = prompt_table.cuda().to( + dtype=tensorrt_llm._utils.str_dtype_to_torch(dtype) + ) + else: + prompt_table = torch.empty([1, hidden_size]).cuda() + task_vocab_size = torch.zeros([1]).cuda() + + if tasks is not None: + tasks = torch.tensor( + [int(t) for t in tasks.split(",")], dtype=torch.int32, device="cuda" + ) + assert ( + tasks.shape[0] == input_ids.shape[0] + ), "Number of supplied tasks must match input batch size" + else: + tasks = torch.zeros([input_ids.size(0)], dtype=torch.int32).cuda() + + return [prompt_table, tasks, task_vocab_size] + + def make_context( + self, + query: str, + history: List[Tuple[str, str]] = None, + system: str = "You are a helpful assistant.", + max_window_size: int = 6144, + ): + if history is None: + history = [] + + im_start, im_end = "<|im_start|>", "<|im_end|>" + im_start_tokens = [self.tokenizer.im_start_id] # 151644 + im_end_tokens = [self.tokenizer.im_end_id] # [151645] + nl_tokens = self.tokenizer.encode("\n") + + def _tokenize_str(role, content): + return f"{role}\n{content}", self.tokenizer.encode( + role, allowed_special=set(self.tokenizer.IMAGE_ST) + ) + nl_tokens + self.tokenizer.encode( + content, 
allowed_special=set(self.tokenizer.IMAGE_ST) + ) + + system_text, system_tokens_part = _tokenize_str("system", system) + system_tokens = im_start_tokens + system_tokens_part + im_end_tokens + + raw_text = "" + context_tokens = [] + + for turn_query, turn_response in reversed(history): + query_text, query_tokens_part = _tokenize_str("user", turn_query) + query_tokens = im_start_tokens + query_tokens_part + im_end_tokens + if turn_response is not None: + response_text, response_tokens_part = _tokenize_str( + "assistant", turn_response + ) + response_tokens = im_start_tokens + response_tokens_part + im_end_tokens + + next_context_tokens = ( + nl_tokens + query_tokens + nl_tokens + response_tokens + ) + prev_chat = f"\n{im_start}{query_text}{im_end}\n{im_start}{response_text}{im_end}" + else: + next_context_tokens = nl_tokens + query_tokens + nl_tokens + prev_chat = f"\n{im_start}{query_text}{im_end}\n" + + current_context_size = ( + len(system_tokens) + len(next_context_tokens) + len(context_tokens) + ) + if current_context_size < max_window_size: + context_tokens = next_context_tokens + context_tokens + raw_text = prev_chat + raw_text + else: + break + + context_tokens = system_tokens + context_tokens + raw_text = f"{im_start}{system_text}{im_end}" + raw_text + context_tokens += ( + nl_tokens + + im_start_tokens + + _tokenize_str("user", query)[1] + + im_end_tokens + + nl_tokens + + im_start_tokens + + self.tokenizer.encode("assistant") + + nl_tokens + ) + raw_text += f"\n{im_start}user\n{query}{im_end}\n{im_start}assistant\n" + + return raw_text, context_tokens + + def generate_for_qwenvl( + self, + input_tokens, + max_new_tokens: int, + prompt_table=None, + tasks=None, + task_vocab_size=None, + ): + input_ids = None + input_lengths = None + input_ids = torch.as_tensor(input_tokens, device="cuda", dtype=torch.int32) + input_lengths = torch.tensor( + [input_ids.size(1)], device="cuda", dtype=torch.int32 + ) + max_input_length = torch.max(input_lengths).item() + max_new_tokens = min( + max_new_tokens, self.global_max_input_len - max_input_length + ) + self.decoder.setup( + batch_size=input_lengths.size(0), + max_context_length=max_input_length, + max_new_tokens=max_new_tokens, + ) + profiler.start("QWen") + run_time = 1 + for _ in range(run_time): + output_ids = self.decoder.decode( + input_ids, + input_lengths, + self.sampling_config, + prompt_table, + tasks, + task_vocab_size, + ) + torch.cuda.synchronize() + profiler.stop("QWen") + Qwen_time = profiler.elapsed_time_in_sec("QWen") / run_time + + return output_ids, Qwen_time + + def qwen_infer( + self, input_vit, images_path, input_text, max_new_tokens, history=None + ): + if images_path is None: + content_list = [] + else: + content_list = images_path + if history is None: + history = [] + content_list.append({"text": input_text}) + query = self.tokenizer.from_list_format(content_list) + raw_text, context_tokens = self.make_context(query, history=history) + # context_tokens = self.tokenizer.encode(query) + input_ids = torch.tensor([context_tokens]).to("cuda") + bos_pos = torch.where(input_ids == self.config.visual["image_start_id"]) + eos_pos = torch.where(input_ids == self.config.visual["image_start_id"] + 1) + assert (bos_pos[0] == eos_pos[0]).all() + img_pos = torch.stack((bos_pos[0], bos_pos[1], eos_pos[1]), dim=1) + vocab_size = self.config.vocab_size + fake_prompt_id = torch.arange( + vocab_size, + vocab_size + input_vit.shape[0] * input_vit.shape[1], + device="cuda", + ) + fake_prompt_id = fake_prompt_id.reshape(input_vit.shape[0], 
input_vit.shape[1]) + for idx, (i, a, b) in enumerate(img_pos): + input_ids[i][a + 1 : b] = fake_prompt_id[idx] + input_ids = input_ids.contiguous().to(torch.int32).cuda() + input_lengths = torch.tensor(input_ids.size(1), dtype=torch.int32).cuda() + dtype = self.model_config.dtype + prompt_table, tasks, task_vocab_size = self.ptuning_setup( + input_vit, dtype, self.config.hidden_size, None, input_ids + ) + + output_ids, Qwen_time = self.generate_for_qwenvl( + input_ids, max_new_tokens, prompt_table, tasks, task_vocab_size + ) + + runtime_rank = tensorrt_llm.mpi_rank() + input_lengths = torch.tensor( + [input_ids.size(1)], device="cuda", dtype=torch.int32 + ) + effective_output_token = 0 + if runtime_rank == 0: + if self.output_csv is None and self.output_npy is None: + for b in range(input_lengths.size(0)): + inputs = input_ids[b] + if content_list is not None: + print(f'Input: "{content_list}"') + print("\n") + if self.num_beams <= 1: + outputs = output_ids[b][0, len(inputs) :].tolist() + try: + effective_output_token = ( + effective_output_token + outputs.index(151643) + ) + except: + effective_output_token = 1 + output_text = self.tokenizer.decode( + outputs, skip_special_tokens=True + ) + print(f'Output: "{output_text}"') + print("\n") + else: + for beam in range(self.num_beams): + outputs = output_ids[b][beam, len(inputs) :].tolist() + output_text = self.tokenizer.decode( + outputs, skip_special_tokens=True + ) + print(f'Output(beam: {beam}): "{output_text}"') + + logger.info(f"TensorRT-LLM QWen time: {Qwen_time} sec ") + history.append((query, output_text)) + return output_text + + +def parse_arguments(): + parser = argparse.ArgumentParser() + parser.add_argument("--max_new_tokens", type=int, default=200) + parser.add_argument("--log_level", type=str, default="info") + parser.add_argument( + "--vit_engine_dir", + type=str, + default="qwen_outputs", + ) + parser.add_argument( + "--qwen_engine_dir", + type=str, + default="qwen_outputs", + ) + parser.add_argument( + "--tokenizer_dir", + type=str, + default=".", + help="Directory containing the tokenizer.model.", + ) + parser.add_argument("--input_text", type=str, default="Describe the picture") + parser.add_argument( + "--images_path", type=list, default=[{"image": "./pics/demo.jpeg"}] + ) + parser.add_argument("--input_dir", type=list, default=[{"image": "image.pt"}]) + + parser.add_argument( + "--input_tokens", + dest="input_file", + type=str, + help="CSV or Numpy file containing tokenized input. 
Alternative to text input.", + default=None, + ) + parser.add_argument( + "--output_csv", + type=str, + help="CSV file where the tokenized output is stored.", + default=None, + ) + parser.add_argument( + "--output_npy", + type=str, + help="Numpy file where the tokenized output is stored.", + default=None, + ) + parser.add_argument( + "--num_beams", type=int, help="Use beam search if num_beams >1", default=1 + ) + parser.add_argument("--display", default=False, action="store_true") + parser.add_argument("--port", type=str, default="8006") + parser.add_argument("--local_machine", default=False, action="store_true") + return parser.parse_args() + + +def vit_process(image_path, engine_dir, stream): + vit_path = os.path.join(engine_dir, "visual_encoder/visual_encoder_fp16.plan") + logger.info(f"Loading engine from {vit_path}") + with open(vit_path, "rb") as f: + engine_buffer = f.read() + logger.info(f"Creating session from engine {vit_path}") + session_vit = Session.from_serialized_engine(engine_buffer) + device = torch.device("cuda") if torch.cuda.is_available() else "cpu" + images_list = [] + for img in image_path: + for v in img.values(): + image = torch.load(v) + if image.device.type == "cpu": + image = image.to(device) + images_list.append(image) + images = torch.cat(images_list) + batch_size = images.size(0) + images = images.expand(batch_size, -1, -1, -1).contiguous() + visual_inputs = {"input": images.float()} + visual_output_info = session_vit.infer_shapes( + [TensorInfo("input", trt.DataType.FLOAT, images.shape)] + ) + visual_outputs = { + t.name: torch.empty( + tuple(t.shape), dtype=trt_dtype_to_torch(t.dtype), device="cuda" + ) + for t in visual_output_info + } + profiler.start("ViT") + + run_time = 1 + for _ in range(run_time): + ok = session_vit.run(visual_inputs, visual_outputs, stream) + profiler.stop("ViT") + Vit_time = profiler.elapsed_time_in_sec("ViT") / run_time + logger.info(f"TensorRT-LLM ViT latency: {Vit_time} sec ") + + assert ok, "Runtime execution failed for vit session" + + image_embeds = visual_outputs["output"] + return image_embeds + + +if __name__ == "__main__": + args = parse_arguments() + stream = torch.cuda.current_stream().cuda_stream + tensorrt_llm.logger.set_level(args.log_level) + image_embeds = vit_process(args.input_dir, args.vit_engine_dir, stream) + qinfer = QWenInfer( + args.tokenizer_dir, + args.qwen_engine_dir, + args.log_level, + args.output_csv, + args.output_npy, + args.num_beams, + ) + qinfer.qwen_model_init() + qinfer.qwen_infer( + image_embeds, args.images_path, args.input_text, args.max_new_tokens, history=[] + ) diff --git a/swarms_cloud/__init__.py b/swarms_cloud/__init__.py index 2945cd6..438b434 100644 --- a/swarms_cloud/__init__.py +++ b/swarms_cloud/__init__.py @@ -38,6 +38,12 @@ check_request, ) +from swarms_cloud.openai_spec import ( + InputOpenAISpec, + OutputOpenAISpec, + OpenAIAPIWrapper, +) + __all__ = [ "generate_api_key", @@ -75,4 +81,7 @@ "get_model_list", "create_error_response", "check_request", + "InputOpenAISpec", + "OutputOpenAISpec", + "OpenAIAPIWrapper", ] diff --git a/swarms_cloud/openai_spec.py b/swarms_cloud/openai_spec.py new file mode 100644 index 0000000..c2eec6b --- /dev/null +++ b/swarms_cloud/openai_spec.py @@ -0,0 +1,134 @@ +from typing import List, Dict, Any +import uuid +from pydantic import BaseModel +from typing import Optional +import time + +# ID +id = str(uuid.uuid4()) + + +class InputOpenAISpec(BaseModel): + """OpenAI Spec for the model""" + + model: Optional[str] = "gpt-3.5-turbo" + max_new_tokens: 
Optional[int] = 100
+    prompt: Optional[str] = ""
+    stream: Optional[bool] = False
+    sampling_params: Optional[Dict[str, Any]] = None
+    best_of: Optional[int] = 1
+    echo: Optional[bool] = False
+    frequency_penalty: Optional[float] = 0.0
+    logit_bias: Optional[Dict[str, Any]] = None
+    logprobs: Optional[int] = None
+    max_tokens: Optional[int] = None
+    n: Optional[int] = 1
+    presence_penalty: Optional[float] = 0.0
+    seed: Optional[int] = None
+    stop: Optional[str] = None
+    suffix: Optional[str] = None
+    temperature: Optional[float] = 0.0
+    top_k: Optional[int] = 0
+    top_p: Optional[float] = 1.0
+    user: Optional[str] = None
+
+
+class OutputOpenAISpec(BaseModel):
+    """OpenAI Spec for the model"""
+
+    id: Optional[str] = id
+    created: Optional[int] = int(time.time())
+    object: Optional[str] = None
+    system_fingerprint: Optional[str] = None
+    model: Optional[str] = "gpt-3.5-turbo"
+    max_new_tokens: Optional[int] = 100
+    prompt: Optional[str] = ""
+    stream: Optional[bool] = False
+    sampling_params: Optional[Dict[str, Any]] = None
+    best_of: Optional[int] = 1
+    echo: Optional[bool] = False
+    frequency_penalty: Optional[float] = 0.0
+    logit_bias: Optional[Dict[str, Any]] = None
+    max_tokens: Optional[int] = None
+    n: Optional[int] = 1
+    presence_penalty: Optional[float] = 0.0
+    seed: Optional[int] = None
+    stop: Optional[str] = None
+    suffix: Optional[str] = None
+    temperature: Optional[float] = 0.0
+    top_k: Optional[int] = 0
+    top_p: Optional[float] = 1.0
+    user: Optional[str] = None
+    usage: Optional[Dict[str, Any]] = None
+    completion_tokens: Optional[int] = None
+    prompt_tokens: Optional[int] = None
+    total_tokens: Optional[int] = None
+    choices: Optional[List[Dict[str, Any]]] = None
+    finish_reason: Optional[str] = None
+    index: Optional[int] = None
+    logprobs: Optional[Dict[str, Any]] = None
+    text: Optional[str] = None
+    conversation_id: Optional[str] = None
+    response_id: Optional[str] = None
+    timestamp: Optional[str] = None
+    status: Optional[str] = None
+    error: Optional[str] = None
+    error_message: Optional[str] = None
+    error_code: Optional[int] = None
+    error_details: Optional[str] = None
+    error_traceback: Optional[str] = None
+    error_cause: Optional[str] = None
+    error_type: Optional[str] = None
+    error_context: Optional[str] = None
+
+
+class OpenAIAPIWrapper:
+    """
+    A wrapper class for the OpenAI API.
+
+    This class provides methods to set and get the input and output specifications for the OpenAI API.
+    """
+
+    def __init__(self):
+        self.input_spec = InputOpenAISpec()
+        self.output_spec = OutputOpenAISpec()
+
+    def set_input_spec(self, **kwargs):
+        """
+        Set the input specification for the OpenAI API.
+
+        Args:
+            **kwargs: Keyword arguments representing the input specification attributes and their values.
+        """
+        for key, value in kwargs.items():
+            if hasattr(self.input_spec, key):
+                setattr(self.input_spec, key, value)
+
+    def set_output_spec(self, **kwargs):
+        """
+        Set the output specification for the OpenAI API.
+
+        Args:
+            **kwargs: Keyword arguments representing the output specification attributes and their values.
+        """
+        for key, value in kwargs.items():
+            if hasattr(self.output_spec, key):
+                setattr(self.output_spec, key, value)
+
+    def get_input_spec(self):
+        """
+        Get the input specification for the OpenAI API.
+
+        Returns:
+            str: JSON representation of the input specification.
+        """
+        return self.input_spec.json()
+
+    def get_output_spec(self):
+        """
+        Get the output specification for the OpenAI API.
+ + Returns: + str: JSON representation of the output specification. + """ + return self.output_spec.json() \ No newline at end of file
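
A minimal sketch of how the `OpenAIAPIWrapper` newly exported from `swarms_cloud` might be exercised; the field values are purely illustrative and only use attributes defined in `openai_spec.py`:

from swarms_cloud import OpenAIAPIWrapper

wrapper = OpenAIAPIWrapper()
wrapper.set_input_spec(
    model="gpt-3.5-turbo",
    prompt="Question: which city is this? Answer:",
    max_tokens=64,
)
print(wrapper.get_input_spec())   # JSON string of InputOpenAISpec

wrapper.set_output_spec(text="Singapore", status="success")
print(wrapper.get_output_spec())  # JSON string of OutputOpenAISpec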
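A possible client call against the `/generate` route in `servers/fuyu_api.py`, assuming the server is running locally on its default port 8000 and that the `model_name` shown is accepted by the swarms `Fuyu` wrapper (both are assumptions, not confirmed by this diff); the query parameters and JSON keys mirror what the handler reads:

import requests

resp = requests.post(
    "http://localhost:8000/generate",
    params={"model_name": "adept/fuyu-8b", "max_new_tokens": 100},  # model id is illustrative
    json={
        "prompt": "Question: which city is this? Answer:",
        "max_num_tokens": 8,
        "streaming": False,
    },
)
print(resp.json())  # {"model_config": {...}, "choices": [{"text": ...}]}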
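Both `servers/blip2.py` and `servers/qwen_tensort.py` lean on TensorRT-LLM prompt tuning: visual embeddings are flattened into a prompt table and referenced by "fake" token ids at or above `vocab_size` (the `fake_prompt_id` range in `qwen_infer`). A torch-only toy illustration of that lookup, with made-up shapes, not the actual runtime code:

import torch

vocab_size, hidden_size = 8, 4
word_emb = torch.randn(vocab_size, hidden_size)   # ordinary token embeddings
prompt_table = torch.randn(3, hidden_size)        # 3 flattened visual "tokens"

# ids below vocab_size select word embeddings; ids >= vocab_size select
# rows of the prompt table instead (toy stand-in for the engine's behavior)
input_ids = torch.tensor([1, 5, 8, 9, 10, 2])
full_table = torch.cat([word_emb, prompt_table], dim=0)
embeddings = full_table[input_ids]
print(embeddings.shape)  # torch.Size([6, 4])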