From 1b0e0f81e36f9c5acf46adc68c61ddb4ac4a6ea5 Mon Sep 17 00:00:00 2001 From: nguyenhoangthuan99 Date: Sun, 3 Nov 2024 21:22:10 +0700 Subject: [PATCH 1/7] chore: add document for function calling --- docs/docs/capabilities/models/index.mdx | 8 +- docs/docs/capabilities/models/model-yaml.mdx | 88 +++- docs/docs/guides/function-calling.md | 439 ++++++++++++++++++- 3 files changed, 514 insertions(+), 21 deletions(-) diff --git a/docs/docs/capabilities/models/index.mdx b/docs/docs/capabilities/models/index.mdx index b6f4b9036..861dd1211 100644 --- a/docs/docs/capabilities/models/index.mdx +++ b/docs/docs/capabilities/models/index.mdx @@ -14,10 +14,10 @@ When Cortex.cpp is started, it automatically starts an API server, this is inspi The model in the API server is automatically loaded/unloaded by using the [`/chat/completions`](/api-reference#tag/inference/post/v1/chat/completions) endpoint. ::: ## Model Formats -Cortex.cpp supports three model formats: -- GGUF -- ONNX -- TensorRT-LLM +Cortex.cpp supports three model formats and each model format require specific engine to run: +- GGUF - run with `llama-cpp` engine +- ONNX - run with `onnxruntime` engine +- TensorRT-LLM - run with `tensorrt-llm` engine :::info For details on each format, see the [Model Formats](/docs/capabilities/models/model-yaml#model-formats) page. diff --git a/docs/docs/capabilities/models/model-yaml.mdx b/docs/docs/capabilities/models/model-yaml.mdx index 983f0f528..06a4455a4 100644 --- a/docs/docs/capabilities/models/model-yaml.mdx +++ b/docs/docs/capabilities/models/model-yaml.mdx @@ -39,6 +39,23 @@ temperature: 0.6 # Ranges: 0 to 1 frequency_penalty: 0 # Ranges: 0 to 1 presence_penalty: 0 # Ranges: 0 to 1 max_tokens: 8192 # Should be default to context length +seed: -1 +dynatemp_range: 0 +dynatemp_exponent: 1 +top_k: 40 +min_p: 0.05 +tfs_z: 1 +typ_p: 1 +repeat_last_n: 64 +repeat_penalty: 1 +mirostat: false +mirostat_tau: 5 +mirostat_eta: 0.1 +penalize_nl: false +ignore_eos: false +n_probs: 0 +n_parallels: 1 +min_keep: 0 ## END OPTIONAL # END INFERENCE PARAMETERS @@ -54,6 +71,7 @@ prompt_template: |+ # tokenizer.chat_template ## BEGIN OPTIONAL ctx_len: 0 # llama.context_length | 0 or undefined = loaded from model ngl: 33 # Undefined = loaded from model +engine: llama-cpp ## END OPTIONAL # END MODEL LOAD PARAMETERS @@ -84,23 +102,59 @@ stop:   - <|end_of_text|>   - <|eot_id|>   - <|eom_id|> -stream: true -top_p: 0.9 -temperature: 0.6 -frequency_penalty: 0 -presence_penalty: 0 -max_tokens: 8192 +stream: true # Default true? +top_p: 0.9 # Ranges: 0 to 1 +temperature: 0.6 # Ranges: 0 to 1 +frequency_penalty: 0 # Ranges: 0 to 1 +presence_penalty: 0 # Ranges: 0 to 1 +max_tokens: 8192 # Should be default to context length +seed: -1 +dynatemp_range: 0 +dynatemp_exponent: 1 +top_k: 40 +min_p: 0.05 +tfs_z: 1 +typ_p: 1 +repeat_last_n: 64 +repeat_penalty: 1 +mirostat: false +mirostat_tau: 5 +mirostat_eta: 0.1 +penalize_nl: false +ignore_eos: false +n_probs: 0 +n_parallels: 1 +min_keep: 0 + ``` Inference parameters define how the results will be produced. The required parameters include: -| **Parameter** | **Description** | **Required** | -|------------------------|--------------------------------------------------------------------------------------|--------------| -| `top_p` | The cumulative probability threshold for token sampling. | No | -| `temperature` | Controls the randomness of predictions by scaling logits before applying softmax. 
| No | -| `frequency_penalty` | Penalizes new tokens based on their existing frequency in the sequence so far. | No | -| `presence_penalty` | Penalizes new tokens based on whether they appear in the sequence so far. | No | -| `max_tokens` | Maximum number of tokens in the output. | No | -| `stream` | Enables or disables streaming mode for the output (true or false). | No | -| `stop` | Specifies the stopping condition for the model, which can be a word, a letter, or a specific text. | Yes | + +| **Parameter** | **Description** | **Required** | +|---------------|-----------------|--------------| +| `stream` | Enables or disables streaming mode for the output (true or false). | No | +| `top_p` | The cumulative probability threshold for token sampling. Ranges from 0 to 1. | No | +| `temperature` | Controls the randomness of predictions by scaling logits before applying softmax. Ranges from 0 to 1. | No | +| `frequency_penalty` | Penalizes new tokens based on their existing frequency in the sequence so far. Ranges from 0 to 1. | No | +| `presence_penalty` | Penalizes new tokens based on whether they appear in the sequence so far. Ranges from 0 to 1. | No | +| `max_tokens` | Maximum number of tokens in the output for 1 turn. | No | +| `seed` | Seed for the random number generator. `-1` means no seed. | No | +| `dynatemp_range` | Dynamic temperature range. | No | +| `dynatemp_exponent` | Dynamic temperature exponent. | No | +| `top_k` | The number of most likely tokens to consider at each step. | No | +| `min_p` | Minimum probability threshold for token sampling. | No | +| `tfs_z` | The z-score used for Typical token sampling. | No | +| `typ_p` | The cumulative probability threshold used for Typical token sampling. | No | +| `repeat_last_n` | Number of previous tokens to penalize for repeating. | No | +| `repeat_penalty` | Penalty for repeating tokens. | No | +| `mirostat` | Enables or disables Mirostat sampling (true or false). | No | +| `mirostat_tau` | Target entropy value for Mirostat sampling. | No | +| `mirostat_eta` | Learning rate for Mirostat sampling. | No | +| `penalize_nl` | Penalizes newline tokens (true or false). | No | +| `ignore_eos` | Ignores the end-of-sequence token (true or false). | No | +| `n_probs` | Number of probabilities to return. | No | +| `min_keep` | Minimum number of tokens to keep. | No | +| `n_parallels` | Number of parallel streams to use. This params allow you to use multiple chat terminal at the same time. Notice that you need to update `ctx_len` coressponding to `n_parallels` (e.g n_parallels=1, ctx_len=2048 -> n_parallels=2, ctx_len=4096. ) | No | +| `stop` | Specifies the stopping condition for the model, which can be a word, a letter, or a specific text. | Yes | ### Model Load Parameters @@ -114,6 +168,8 @@ prompt_template: |+ ctx_len: 0 ngl: 33 +engine: llama-cpp + ``` Model load parameters include the options that control how Cortex.cpp runs the model. The required parameters include: | **Parameter** | **Description** | **Required** | @@ -121,7 +177,7 @@ Model load parameters include the options that control how Cortex.cpp runs the m | `ngl` | Number of attention heads. | No | | `ctx_len` | Context length (maximum number of tokens). | No | | `prompt_template` | Template for formatting the prompt, including system messages and instructions. | Yes | - +| `engine` | The engine that run model, default to `llama-cpp` for local model with gguf format. 
| Yes | :::info You can download all the supported model formats from the following: diff --git a/docs/docs/guides/function-calling.md b/docs/docs/guides/function-calling.md index 40a708675..41f9c5084 100644 --- a/docs/docs/guides/function-calling.md +++ b/docs/docs/guides/function-calling.md @@ -1,3 +1,440 @@ --- title: Function Calling ---- \ No newline at end of file +--- + +# Function calling with OpenAI compatible + +This tutorial, I use the `mistral-nemo:12b-gguf-q4-km` for testing with cortex.cpp. All steps are reproduced from original openai instruction https://platform.openai.com/docs/guides/function-calling + +## Step by step with function calling + +### 1. Start server and run model. + +``` +cortex run mistral-nemo:12b-gguf-q4-km +``` + +### 2. Create a python script `function_calling.py` with this content: + +``` +from datetime import datetime +from openai import OpenAI +from pydantic import BaseModel +ENDPOINT = "http://localhost:39281/v1" +MODEL = "mistral-nemo:12b-gguf-q4-km" +client = OpenAI( + base_url=ENDPOINT, + api_key="not-needed" +) +``` + +This step creates OpenAI client in python + +### 3. Start create a chat completion with tool calling + +``` +tools = [ + { + "type": "function", + "function": { + "name": "get_delivery_date", + + "strict": True, + "description": "Get the delivery date for a customer's order. Call this whenever you need to know the delivery date, for example when a customer asks 'Where is my package'", + "parameters": { + "type": "object", + "properties": { + "order_id": { + "type": "string", + "description": "The customer's order ID.", + }, + }, + "required": ["order_id"], + "additionalProperties": False, + }, + } + } +] +completion_payload = { + "messages": [ + {"role": "system", "content": "You are a helpful customer support assistant. Use the supplied tools to assist the user."}, + {"role": "user", "content": "Hi, can you tell me the delivery date for my order?"}, + ] +} +response = client.chat.completions.create( + top_p=0.9, + temperature=0.6, + model=MODEL, + messages=completion_payload["messages"], + tools=tools, +) +print(response) +``` + +Because you didn't provide the `order_id`, the model will ask again + +``` +ChatCompletion( + id='1lblzWtLw9h5HG0GjYYi', + choices=[ + Choice( + finish_reason=None, + index=0, + logprobs=None, + message=ChatCompletionMessage( + content='Of course! Please provide your order ID so I can look it up.', + refusal=None, + role='assistant', + audio=None, + function_call=None, + tool_calls=None + ) + ) + ], + created=1730204306, + model='_', + object='chat.completion', + service_tier=None, + system_fingerprint='_', + usage=CompletionUsage( + completion_tokens=15, + prompt_tokens=449, + total_tokens=464, + completion_tokens_details=None, + prompt_tokens_details=None + ) +) +``` + +### 4. Add new message user provide order id + +``` +completion_payload = { + "messages": [ + {"role": "system", "content": "You are a helpful customer support assistant. Use the supplied tools to assist the user."}, + {"role": "user", "content": "Hi, can you tell me the delivery date for my order?"}, + {"role": "assistant", "content": "Sure! 
Could you please provide your order ID so I can look up the delivery date for you?"}, + {"role": "user", "content": "i think it is order_12345"}, + ] +} + +response = client.chat.completions.create( + top_p=0.9, + temperature=0.6, + model=MODEL, + messages=completion_payload["messages"], + tools=tools +) +``` + +The response of the model will be + +``` +ChatCompletion( + id='zUnHwEPCambJtrvWOAQy', + choices=[ + Choice( + finish_reason='tool_calls', + index=0, + logprobs=None, + message=ChatCompletionMessage( + content='', + refusal=None, + role='assistant', + audio=None, + function_call=None, + tool_calls=[ + ChatCompletionMessageToolCall( + id=None, + function=Function( + arguments='{"order_id": "order_12345"}', + name='get_delivery_date' + ), + type='function' + ) + ] + ) + ) + ], + created=1730204559, + model='_', + object='chat.completion', + service_tier=None, + system_fingerprint='_', + usage=CompletionUsage( + completion_tokens=23, + prompt_tokens=483, + total_tokens=506, + completion_tokens_details=None, + prompt_tokens_details=None + ) +) +``` + +It can return correct function with arguments + +### 5. Push the response to the conversation and ask model to answer user + +``` +order_id = "order_12345" +delivery_date = datetime.now() + +# Simulate the tool call response +response = { + "choices": [ + { + "message": { + "role": "assistant", + "tool_calls": [ + { + "id": "call_62136354", + "type": "function", + "function": { + "arguments": "{'order_id': 'order_12345'}", + "name": "get_delivery_date" + } + } + ] + } + } + ] +} + +# Create a message containing the result of the function call +function_call_result_message = { + "role": "tool", + "content": json.dumps({ + "order_id": order_id, + "delivery_date": delivery_date.strftime('%Y-%m-%d %H:%M:%S') + }), + "tool_call_id": response['choices'][0]['message']['tool_calls'][0]['id'] +} + +# Prepare the chat completion call payload +completion_payload = { + "messages": [ + {"role": "system", "content": "You are a helpful customer support assistant. Use the supplied tools to assist the user."}, + {"role": "user", "content": "Hi, can you tell me the delivery date for my order?"}, + {"role": "assistant", "content": "Sure! Could you please provide your order ID so I can look up the delivery date for you?"}, + {"role": "user", "content": "i think it is order_12345"}, + response["choices"][0]["message"], + function_call_result_message + ] +} + +client = OpenAI( + # This is the default and can be omitted + base_url=ENDPOINT, + api_key="not-needed" +) + +response = client.chat.completions.create( + top_p=0.9, + temperature=0.6, + model=MODEL, + messages=completion_payload["messages"], + tools=tools, +) +print(response) +``` + +The response will include all the content that processed by the function, where the delivery date is produced by query db, .... + +``` +ChatCompletion( + id='l1xdCuKVMYBSC5tEDlAn', + choices=[ + Choice( + finish_reason=None, + index=0, + logprobs=None, + message=ChatCompletionMessage( + content="Your order with ID 'order_12345' is scheduled to be delivered on October 29, 2024. 
Is there anything else I can help you with?", + refusal=None, + role='assistant', + audio=None, + function_call=None, + tool_calls=None + ) + ) + ], + created=1730205470, + model='_', + object='chat.completion', + service_tier=None, + system_fingerprint='_', + usage=CompletionUsage( + completion_tokens=40, + prompt_tokens=568, + total_tokens=608, + completion_tokens_details=None, + prompt_tokens_details=None + ) +) +``` + +## Handling parallel function calling + +Cortex cpp support parallel function calling by default + +``` +tools = [ + { + "type": "function", + "function": { + "name": "get_delivery_date", + + "strict": True, + "description": "Get the delivery date for a customer's order. Call this whenever you need to know the delivery date, for example when a customer asks 'Where is my package'", + "parameters": { + "type": "object", + "properties": { + "order_id": { + "type": "string", + "description": "The customer's order ID.", + }, + }, + "required": ["order_id"], + "additionalProperties": False, + }, + } + }, + { + "type": "function", + "function": { + "name": "get_current_conditions", + "description": "Get the current weather conditions for a specific location", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "The city and state, e.g., San Francisco, CA" + }, + "unit": { + "type": "string", + "enum": ["Celsius", "Fahrenheit"], + "description": "The temperature unit to use. Infer this from the user's location." + } + }, + "required": ["location", "unit"] + } + } + } +] + +messages = [ + {"role": "user", "content": "Hi, can you tell me the delivery date for my order order_12345 and check the weather condition in LA?"} +] +response = client.chat.completions.create( + top_p=0.9, + temperature=0.6, + model=MODEL, + messages= messages, + tools=tools +) +print(response) +``` + +It will call 2 functions in parallel + +``` +ChatCompletion( + id='5ot3qux399DojubnBFrG', + choices=[ + Choice( + finish_reason='tool_calls', + index=0, + logprobs=None, + message=ChatCompletionMessage( + content='', + refusal=None, + role='assistant', + audio=None, + function_call=None, + tool_calls=[ + ChatCompletionMessageToolCall( + id=None, + function=Function( + arguments='{"order_id": "order_12345"}', + name='get_delivery_date' + ), + type='function' + ), + ChatCompletionMessageToolCall( + id=None, + function=Function( + arguments='{"location": "LA", "unit": "Fahrenheit"}', + name='get_current_conditions' + ), + type='function' + ) + ] + ) + ) + ], + created=1730205975, + model='_', + object='chat.completion', + service_tier=None, + system_fingerprint='_', + usage=CompletionUsage( + completion_tokens=47, + prompt_tokens=568, + total_tokens=615, + completion_tokens_details=None, + prompt_tokens_details=None + ) +) +``` + +## Configuring function calling behavior using the tool_choice parameter + +User can set `tool_choice=none` to disable function calling even if the tools are provided + +``` +response = client.chat.completions.create( + top_p=0.9, + temperature=0.6, + model=MODEL, + messages= messages, #completion_payload["messages"], + tools=tools, + tool_choice="none" +) +``` + +User can also force model to call a tool by specify the tool name, in this example it's the `get_current_conditions` + +``` +response = client.chat.completions.create( + top_p=0.9, + temperature=0.6, + model=MODEL, + messages= [{"role": "user", "content": "Hi, can you tell me the delivery date for my order order_12345 and check the weather condition in LA?"}], + 
tools=tools, + tool_choice= {"type": "function", "function": {"name": "get_current_conditions"}}) + +``` + +User can also specify the function with enum field to the tool definition to make model generate more accurate. + +``` +{ + "name": "pick_tshirt_size", + "description": "Call this if the user specifies which size t-shirt they want", + "parameters": { + "type": "object", + "properties": { + "size": { + "type": "string", + "enum": ["s", "m", "l"], + "description": "The size of the t-shirt that the user would like to order" + } + }, + "required": ["size"], + "additionalProperties": false + } +} +``` + +(*) Note that the accuracy of function calling heavily depends on the quality of the model. For small models like 8B or 12B, we should only use function calling with simple cases. From fd2a4750114da38dd3cbb65c745245ce9af2b1ec Mon Sep 17 00:00:00 2001 From: nguyenhoangthuan99 Date: Sun, 3 Nov 2024 22:07:35 +0700 Subject: [PATCH 2/7] chore: add example on structure output --- docs/docs/guides/structured-outputs.md | 227 ++++++++++++++++++++++++- 1 file changed, 226 insertions(+), 1 deletion(-) diff --git a/docs/docs/guides/structured-outputs.md b/docs/docs/guides/structured-outputs.md index b14739ab2..8a523fe93 100644 --- a/docs/docs/guides/structured-outputs.md +++ b/docs/docs/guides/structured-outputs.md @@ -1,3 +1,228 @@ --- title: Structured Outputs ---- \ No newline at end of file +--- +# Structured Outputs + +The Structured Outputs/Response Format feature in [OpenAI](https://platform.openai.com/docs/guides/structured-outputs) is fundamentally a prompt engineering challenge. While its goal is to use system prompts to generate JSON output matching a specific schema, popular open-source models like Llama 3.1 and Mistral Nemo struggle to consistently generate exact JSON output that matches the requirements. An easy way to directly guild the model to reponse in json format in system message: + +``` +from openai import OpenAI +from pydantic import BaseModel +ENDPOINT = "http://localhost:39281/v1" +MODEL = "llama3.1:8b-gguf-q4-km" +client = OpenAI( + base_url=ENDPOINT, + api_key="not-needed" +) + +format = { + "steps": [{ + "explanation": "string", + "output": "string" + } + ], + "final_output": "string" +} + +completion_payload = { + "messages": [ + {"role": "system", "content": f"You are a helpful math tutor. Guide the user through the solution step by step. You have to response in this json format {format}\n"}, + {"role": "user", "content": "how can I solve 8x + 7 = -23"} + ] +} + +response = client.chat.completions.create( + top_p=0.9, + temperature=0.6, + model=MODEL, + messages=completion_payload["messages"] +) + +print(response) +``` + +The output of the model like this + +``` + +ChatCompletion( + id='OZI0q8hghjYQY7NXlLId', + choices=[ + Choice( + finish_reason=None, + index=0, + logprobs=None, + message=ChatCompletionMessage( + content='''Here's how you can solve it: + +{ + "steps": [ + { + "explanation": "First, we need to isolate the variable x. 
To do this, subtract 7 from both sides of the equation.", + "output": "8x + 7 - 7 = -23 - 7" + }, + { + "explanation": "This simplifies to 8x = -30", + "output": "8x = -30" + }, + { + "explanation": "Next, divide both sides of the equation by 8 to solve for x.", + "output": "(8x) / 8 = -30 / 8" + }, + { + "explanation": "This simplifies to x = -3.75", + "output": "x = -3.75" + } + ], + "final_output": "-3.75" +}''', + refusal=None, + role='assistant', + audio=None, + function_call=None, + tool_calls=None + ) + ) + ], + created=1730645716, + model='_', + object='chat.completion', + service_tier=None, + system_fingerprint='_', + usage=CompletionUsage( + completion_tokens=190, + prompt_tokens=78, + total_tokens=268, + completion_tokens_details=None, + prompt_tokens_details=None + ) +) +``` + +From the output, you can easily parse the response to get correct json format as you guild the model in the system prompt. + +Howerver, open source model like llama3.1 or mistral nemo still truggling on mimic newest OpenAI API on response format. For example, consider this request created using the OpenAI library with very simple request like [OpenAI](https://platform.openai.com/docs/guides/structured-outputs#chain-of-thought): + +``` +from openai import OpenAI +ENDPOINT = "http://localhost:39281/v1" +MODEL = "llama3.1:8b-gguf-q4-km" +client = OpenAI( + base_url=ENDPOINT, + api_key="not-needed" +) + +class Step(BaseModel): + explanation: str + output: str + + +class MathReasoning(BaseModel): + steps: List[Step] + final_answer: str + + +completion_payload = { + "messages": [ + {"role": "system", "content": f"You are a helpful math tutor. Guide the user through the solution step by step.\n"}, + {"role": "user", "content": "how can I solve 8x + 7 = -23"} + ] +} + +response = client.beta.chat.completions.parse( + top_p=0.9, + temperature=0.6, + model=MODEL, + messages= completion_payload["messages"], + response_format=MathReasoning +) +``` + +The response format parsed by OpenAI before sending to the server is quite complex for the `MathReasoning` schema. Unlike GPT models, Llama 3.1 and Mistral Nemo cannot reliably generate responses that can be parsed as shown in the [OpenAI tutorial](https://platform.openai.com/docs/guides/structured-outputs/example-response). This may be due to these models not being trained on similar structured output tasks. + +``` +"response_format" : + { + "json_schema" : + { + "name" : "MathReasoning", + "schema" : + { + "$defs" : + { + "Step" : + { + "additionalProperties" : false, + "properties" : + { + "explanation" : + { + "title" : "Explanation", + "type" : "string" + }, + "output" : + { + "title" : "Output", + "type" : "string" + } + }, + "required" : + [ + "explanation", + "output" + ], + "title" : "Step", + "type" : "object" + } + }, + "additionalProperties" : false, + "properties" : + { + "final_answer" : + { + "title" : "Final Answer", + "type" : "string" + }, + "steps" : + { + "items" : + { + "$ref" : "#/$defs/Step" + }, + "title" : "Steps", + "type" : "array" + } + }, + "required" : + [ + "steps", + "final_answer" + ], + "title" : "MathReasoning", + "type" : "object" + }, + "strict" : true + }, + "type" : "json_schema" + } +``` + +The response for this request by `mistral-nemo` and `llama3.1` can not be used to parse result like in the [original tutorial by openAI](https://platform.openai.com/docs/guides/structured-outputs/example-response). Maybe `llama3.1` and `mistral-nemo` didn't train with this kind of data, so it fails to handle this case. 
+ +``` +Response: { + "choices" : + [ + { + "finish_reason" : null, + "index" : 0, + "message" : + { + "content" : "Here's a step-by-step guide to solving the equation 8x + 7 = -23:\n\n```json\n{\n \"name\": \"MathReasoning\",\n \"schema\": {\n \"$defs\": {\n \"Step\": {\n \"additionalProperties\": false,\n \"properties\": {\n \"explanation\": {\"title\": \"Explanation\", \"type\": \"string\"},\n \"output\": {\"title\": \"Output\", \"type\": \"string\"}\n },\n \"required\": [\"explanation\", \"output\"],\n \"title\": \"Step\",\n \"type\": \"object\"\n }\n },\n \"additionalProperties\": false,\n \"properties\": {\n \"final_answer\": {\"title\": \"Final Answer\", \"type\": \"string\"},\n \"steps\": {\n \"items\": {\"$ref\": \"#/$defs/Step\"},\n \"title\": \"Steps\",\n \"type\": \"array\"\n }\n },\n \"required\": [\"steps\", \"final_answer\"],\n \"title\": \"MathReasoning\",\n \"type\": \"object\"\n },\n \"strict\": true\n}\n```\n\n1. **Subtract 7 from both sides** to isolate the term with x:\n\n - Explanation: To get rid of the +7 on the left side, we add -7 to both sides of the equation.\n - Output: `8x + 7 - 7 = -23 - 7`\n\n This simplifies to:\n ```\n 8x = -30\n ```\n\n2. **Divide both sides by 8** to solve for x:\n\n - Explanation: To get rid of the 8 on the left side, we multiply both sides of the equation by the reciprocal of 8, which is 1/8.\n - Output: `8x / 8 = -30 / 8`\n\n This simplifies to:\n ```\n x = -3.75\n ```\n\nSo, the final answer is:\n\n- Final Answer: `x = -3.75`", + "role" : "assistant" + } + } + ], +``` + +This feature currently works reliably only with GPT models, not with open-source models. Given these limitations, we suggest that you should only use Response Format feature as the first example (guild the json format for the reponse for model). Besides, the response format maybe just in beta because we have to use `client.beta.chat.completions.parse` to create chat completion instead of `client.chat.completion.create` From 1d53a8e3cc4327273d6b89d306068f887c9073c4 Mon Sep 17 00:00:00 2001 From: nguyenhoangthuan99 Date: Mon, 4 Nov 2024 12:27:14 +0700 Subject: [PATCH 3/7] chore: add document for model vs engine --- docs/docs/capabilities/models/index.mdx | 4 ++++ docs/docs/guides/function-calling.md | 5 ++--- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/docs/docs/capabilities/models/index.mdx b/docs/docs/capabilities/models/index.mdx index 861dd1211..293fea7ca 100644 --- a/docs/docs/capabilities/models/index.mdx +++ b/docs/docs/capabilities/models/index.mdx @@ -7,6 +7,10 @@ description: The Model section overview 🚧 Cortex.cpp is currently under development. Our documentation outlines the intended behavior of Cortex, which may not yet be fully implemented in the codebase. ::: +Models in cortex.cpp are used for inference purposes (e.g., chat completion, embedding, etc.). We support two types of models: local and remote. +Local models use a local inference engine to run completely offline on your hardware. Currently, we support llama.cpp with the GGUF model format, and we have plans to support TensorRT-LLM and ONNX engines in the future. +Remote models (like OpenAI GPT-4 and Claude 3.5 Sonnet) use remote engines. Support for OpenAI and Anthropic engines is under development and will be available in cortex.cpp soon. + When Cortex.cpp is started, it automatically starts an API server, this is inspired by Docker CLI. This server manages various model endpoints. These endpoints facilitate the following: - **Model Operations**: Run and stop models. 
- **Model Management**: Manage your local models. diff --git a/docs/docs/guides/function-calling.md b/docs/docs/guides/function-calling.md index 41f9c5084..002ecdcfe 100644 --- a/docs/docs/guides/function-calling.md +++ b/docs/docs/guides/function-calling.md @@ -1,10 +1,9 @@ --- title: Function Calling --- - # Function calling with OpenAI compatible -This tutorial, I use the `mistral-nemo:12b-gguf-q4-km` for testing with cortex.cpp. All steps are reproduced from original openai instruction https://platform.openai.com/docs/guides/function-calling +This tutorial, I use the `mistral-nemo:12b-gguf-q4-km` for testing function calling with cortex.cpp. All steps are reproduced from original openai instruction https://platform.openai.com/docs/guides/function-calling ## Step by step with function calling @@ -113,7 +112,7 @@ completion_payload = { "messages": [ {"role": "system", "content": "You are a helpful customer support assistant. Use the supplied tools to assist the user."}, {"role": "user", "content": "Hi, can you tell me the delivery date for my order?"}, - {"role": "assistant", "content": "Sure! Could you please provide your order ID so I can look up the delivery date for you?"}, + {"role": "assistant", "content": "Of course! Please provide your order ID so I can look it up."}, {"role": "user", "content": "i think it is order_12345"}, ] } From e72eec26be53a05616132cd6bf58835b2c452962 Mon Sep 17 00:00:00 2001 From: nguyenhoangthuan99 Date: Mon, 4 Nov 2024 14:52:13 +0700 Subject: [PATCH 4/7] Update model.yml documentation --- docs/docs/capabilities/models/index.mdx | 2 + docs/docs/capabilities/models/model-yaml.mdx | 41 +++++++++++++++++++- 2 files changed, 41 insertions(+), 2 deletions(-) diff --git a/docs/docs/capabilities/models/index.mdx b/docs/docs/capabilities/models/index.mdx index 293fea7ca..f4a10b1ae 100644 --- a/docs/docs/capabilities/models/index.mdx +++ b/docs/docs/capabilities/models/index.mdx @@ -8,7 +8,9 @@ description: The Model section overview ::: Models in cortex.cpp are used for inference purposes (e.g., chat completion, embedding, etc.). We support two types of models: local and remote. + Local models use a local inference engine to run completely offline on your hardware. Currently, we support llama.cpp with the GGUF model format, and we have plans to support TensorRT-LLM and ONNX engines in the future. + Remote models (like OpenAI GPT-4 and Claude 3.5 Sonnet) use remote engines. Support for OpenAI and Anthropic engines is under development and will be available in cortex.cpp soon. When Cortex.cpp is started, it automatically starts an API server, this is inspired by Docker CLI. This server manages various model endpoints. These endpoints facilitate the following: diff --git a/docs/docs/capabilities/models/model-yaml.mdx b/docs/docs/capabilities/models/model-yaml.mdx index 06a4455a4..ed19f0e71 100644 --- a/docs/docs/capabilities/models/model-yaml.mdx +++ b/docs/docs/capabilities/models/model-yaml.mdx @@ -10,7 +10,7 @@ import TabItem from "@theme/TabItem"; 🚧 Cortex.cpp is currently under development. Our documentation outlines the intended behavior of Cortex, which may not yet be fully implemented in the codebase. ::: -Cortex.cpp uses a `model.yaml` file to specify the configuration for running a model. Models can be downloaded from the Cortex Model Hub or Hugging Face repositories. Once downloaded, the model data is parsed and stored in the `models` folder. +Cortex.cpp utilizes a `model.yaml` file to specify the configuration for running a model. 
Models can be downloaded from the Cortex Model Hub or Hugging Face repositories. Once downloaded, the model data is parsed and stored in the `models` folder. ## Structure of `model.yaml` @@ -174,11 +174,48 @@ engine: llama-cpp Model load parameters include the options that control how Cortex.cpp runs the model. The required parameters include: | **Parameter** | **Description** | **Required** | |------------------------|--------------------------------------------------------------------------------------|--------------| -| `ngl` | Number of attention heads. | No | +| `ngl` | Number of model layers will be offload to GPU. | No | | `ctx_len` | Context length (maximum number of tokens). | No | | `prompt_template` | Template for formatting the prompt, including system messages and instructions. | Yes | | `engine` | The engine that run model, default to `llama-cpp` for local model with gguf format. | Yes | +All parameters from the `model.yml` file are used for running the model via the [CLI chat command](/docs/cli/chat) or [CLI run command](/docs/cli/run). These parameters also act as defaults when using the [model start API](/api-reference#tag/models/post/v1/models/start) through cortex.cpp. + +## Runtime parameters + +In addition to predefined parameters in `model.yml`, Cortex.cpp supports runtime parameters to override these settings when using the [model start API](/api-reference#tag/models/post/v1/models/start). + +### Model start params + +Cortex.cpp supports the following parameters when starting a model via the [model start API](/api-reference#tag/models/post/v1/models/start) for the `llama-cpp engine`: + +``` +cache_enabled: bool +ngl: int +n_parallel: int +cache_type: string +ctx_len: int + +## Support for vision model +mmproj: string +llama_model_path: string +model_path: string +``` + +| **Parameter** | **Description** | **Required** | +|------------------------|--------------------------------------------------------------------------------------|--------------| +| `cache_type` | Data type of the KV cache in llama.cpp models. Supported types are `f16`, `q8_0`, and `q4_0`, default is `f16`. | No | +| `cache_enabled` |Enables caching of conversation history for reuse in subsequent requests. Default is `false` | No | + + +These parameters will override the `model.yml` parameters when starting model through the API. + +### Chat completion API parameters + +The API is accessible at the `/v1/chat/completions` URL and accepts all parameters from the chat completion API as described [API reference](/api-reference#tag/chat/post/v1/chat/completions) + +With the `llama-cpp` engine, cortex.cpp accept all parameters from [`model.yml` inference section](#Inference Parameters) and accept all parameters from the chat completion API. 
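+For example, here is a minimal sketch of a request that combines the standard OpenAI parameters with a few of the `llama-cpp` specific sampling parameters from the inference section. The model name and port are the ones used in the other guides and may differ on your setup; the extra parameters are forwarded through the OpenAI Python client's generic `extra_body` field because they are not named arguments of the SDK:
+
+```
+from openai import OpenAI
+
+ENDPOINT = "http://localhost:39281/v1"
+MODEL = "llama3.1:8b-gguf-q4-km"
+
+client = OpenAI(base_url=ENDPOINT, api_key="not-needed")
+
+response = client.chat.completions.create(
+    model=MODEL,
+    messages=[{"role": "user", "content": "Hello, what can you do?"}],
+    temperature=0.6,
+    top_p=0.9,
+    max_tokens=256,
+    # llama-cpp specific sampling parameters are not named arguments of the
+    # OpenAI SDK, so they are passed through the request body via extra_body.
+    extra_body={
+        "top_k": 40,
+        "min_p": 0.05,
+        "repeat_penalty": 1.0,
+    },
+)
+print(response.choices[0].message.content)
+```
+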
+ :::info You can download all the supported model formats from the following: - [Cortex Model Repos](/docs/hub/cortex-hub) From a4df116d5ddb33611be344b6405460015f4110e0 Mon Sep 17 00:00:00 2001 From: nguyenhoangthuan99 Date: Mon, 4 Nov 2024 15:25:44 +0700 Subject: [PATCH 5/7] Update function calling note --- docs/docs/guides/function-calling.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/docs/docs/guides/function-calling.md b/docs/docs/guides/function-calling.md index 002ecdcfe..be6d9ad8a 100644 --- a/docs/docs/guides/function-calling.md +++ b/docs/docs/guides/function-calling.md @@ -437,3 +437,15 @@ User can also specify the function with enum field to the tool definition to mak ``` (*) Note that the accuracy of function calling heavily depends on the quality of the model. For small models like 8B or 12B, we should only use function calling with simple cases. + + + + From b558009d7d127d139f07dd90f4a152f4e7e7f705 Mon Sep 17 00:00:00 2001 From: nguyenhoangthuan99 Date: Mon, 4 Nov 2024 15:44:43 +0700 Subject: [PATCH 6/7] chore: update structure output documentation --- docs/docs/guides/structured-outputs.md | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/docs/docs/guides/structured-outputs.md b/docs/docs/guides/structured-outputs.md index 8a523fe93..f683d7c4b 100644 --- a/docs/docs/guides/structured-outputs.md +++ b/docs/docs/guides/structured-outputs.md @@ -3,6 +3,14 @@ title: Structured Outputs --- # Structured Outputs +Structured outputs, or response formats, are a feature designed to generate responses in a defined JSON schema, enabling more predictable and machine-readable outputs. This is essential for applications where data consistency and format adherence are crucial, such as automated data processing, structured data generation, and integrations with other systems. + +In recent developments, systems like OpenAI's models have excelled at producing these structured outputs. However, while open-source models like Llama 3.1 and Mistral Nemo offer powerful capabilities, they currently struggle to produce reliably structured JSON outputs required for advanced use cases. This often stems from the models not being specifically trained on tasks demanding strict schema adherence. + +This guide explores the concept of structured outputs using these models, highlights the challenges faced in achieving consistent output formatting, and provides strategies for improving output accuracy, particularly when using models that don't inherently support this feature as robustly as GPT models. + +By understanding these nuances, users can make informed decisions when choosing models for tasks requiring structured outputs, ensuring that the tools they select align with their project's formatting requirements and expected accuracy. + The Structured Outputs/Response Format feature in [OpenAI](https://platform.openai.com/docs/guides/structured-outputs) is fundamentally a prompt engineering challenge. While its goal is to use system prompts to generate JSON output matching a specific schema, popular open-source models like Llama 3.1 and Mistral Nemo struggle to consistently generate exact JSON output that matches the requirements. An easy way to directly guild the model to reponse in json format in system message: ``` @@ -225,4 +233,16 @@ Response: { ], ``` -This feature currently works reliably only with GPT models, not with open-source models. 
Given these limitations, we suggest that you should only use Response Format feature as the first example (guild the json format for the reponse for model). Besides, the response format maybe just in beta because we have to use `client.beta.chat.completions.parse` to create chat completion instead of `client.chat.completion.create`
+
+
+
+## Limitations of Open-Source Models for Structured Outputs
+
+While the concept of structured outputs is compelling, particularly for applications requiring machine-readable data, it's important to understand that not all models support this capability equally. Open-source models such as Llama 3.1 and Mistral Nemo face notable challenges in generating outputs that adhere strictly to defined JSON schemas. Here are the key limitations:
+
+- Lack of Training Data: These models have not been specifically trained on tasks demanding precise JSON formatting, unlike some proprietary models which have been fine-tuned for such tasks.
+- Inconsistency in Output: Due to their training scope, `Llama 3.1` and `Mistral Nemo` often produce outputs that may deviate from the intended schema. This can include additional natural language explanations or incorrectly nested JSON structures.
+- Complexity in Parsing: Without consistent JSON formatting, downstream processes that rely on predictable data schemas may encounter errors, leading to challenges in automation and data integration tasks.
+- Beta Features: Some features related to structured outputs may still be in beta, requiring usage of specific methods like `client.beta.chat.completions.parse`, which suggests they are not yet fully reliable in all scenarios.
+
+Given these constraints, users should consider these limitations when choosing a model for tasks involving structured outputs. Where strict compliance with a JSON schema is critical, alternative models designed for such precision might be a more suitable choice.

From c9a26564f4c50f27c4a4e84933dda43012b4d535 Mon Sep 17 00:00:00 2001
From: nguyenhoangthuan99
Date: Mon, 4 Nov 2024 16:40:18 +0700
Subject: [PATCH 7/7] Update Chat completion api docs for llama-cpp supported params

---
 docs/docs/capabilities/models/model-yaml.mdx |  3 +-
 docs/docs/guides/function-calling.md         |  8 +-
 docs/static/openapi/cortex.json              | 79 +++++++++++++++++++-
 3 files changed, 79 insertions(+), 11 deletions(-)

diff --git a/docs/docs/capabilities/models/model-yaml.mdx b/docs/docs/capabilities/models/model-yaml.mdx
index ed19f0e71..b47b86a64 100644
--- a/docs/docs/capabilities/models/model-yaml.mdx
+++ b/docs/docs/capabilities/models/model-yaml.mdx
@@ -206,7 +206,8 @@ model_path: string
|------------------------|--------------------------------------------------------------------------------------|--------------|
| `cache_type` | Data type of the KV cache in llama.cpp models. Supported types are `f16`, `q8_0`, and `q4_0`, default is `f16`. | No |
| `cache_enabled` |Enables caching of conversation history for reuse in subsequent requests. Default is `false` | No |
-
+| `mmproj` | Path to the multimodal projector (mmproj) GGUF file, used to support vision models such as LLaVA. | No |
+| `llama_model_path` | Path to the LLM GGUF model file. | No |

These parameters will override the `model.yml` parameters when starting model through the API.
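+For illustration, here is a rough sketch of overriding a few of these values at load time. It assumes the start endpoint accepts the model id in a `model` field next to the overrides and that the server listens on the port used in the other guides; adjust both for your setup (note how `ctx_len` is scaled together with `n_parallel`):
+
+```
+import requests
+
+ENDPOINT = "http://localhost:39281/v1"
+MODEL = "llama3.1:8b-gguf-q4-km"
+
+# Runtime overrides: values sent here take precedence over the corresponding
+# fields in model.yml for this load.
+payload = {
+    "model": MODEL,
+    "ngl": 33,
+    "n_parallel": 2,
+    "ctx_len": 4096,
+    "cache_type": "q8_0",
+    "cache_enabled": True,
+}
+
+response = requests.post(f"{ENDPOINT}/models/start", json=payload)
+print(response.status_code, response.json())
+```
+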
diff --git a/docs/docs/guides/function-calling.md b/docs/docs/guides/function-calling.md index be6d9ad8a..d37911935 100644 --- a/docs/docs/guides/function-calling.md +++ b/docs/docs/guides/function-calling.md @@ -438,14 +438,10 @@ User can also specify the function with enum field to the tool definition to mak (*) Note that the accuracy of function calling heavily depends on the quality of the model. For small models like 8B or 12B, we should only use function calling with simple cases. + The function calling feature from cortex.cpp is primarily an application of prompt engineering. When tools are specified, we inject a system prompt into the conversation to facilitate this functionality. - - + The full steps to mimic the function calling feature in Python using openai lib can be found [here](https://github.com/janhq/models/issues/16#issuecomment-2381129322). diff --git a/docs/static/openapi/cortex.json b/docs/static/openapi/cortex.json index f6120a4ad..52dd3960b 100644 --- a/docs/static/openapi/cortex.json +++ b/docs/static/openapi/cortex.json @@ -227,7 +227,9 @@ } } }, - "tags": ["Chat"] + "tags": [ + "Chat" + ] } }, "/v1/models/pull": { @@ -664,7 +666,9 @@ } } }, - "tags": ["Models"] + "tags": [ + "Models" + ] } }, "/v1/threads": { @@ -2235,6 +2239,66 @@ "user": { "type": "string", "description": "A unique identifier representing your end-user, which can help OpenAI to monitor and detect abuse. We are actively working on this feature to bring cortex as fully OpenAI compatible platform. Planning and roadmap for this feature can be found [**here**](https://github.com/janhq/cortex.cpp/issues/1582)." + }, + "dynatemp_range": { + "type": "number", + "description": "Dynamic temperature range. This parameter only supported by `llama-cpp` engine." + }, + "dynatemp_exponent": { + "type": "number", + "description": "Dynamic temperature exponent. This parameter only supported by `llama-cpp` engine." + }, + "top_k": { + "type": "integer", + "description": "The number of most likely tokens to consider at each step. This parameter only supported by `llama-cpp` engine." + }, + "min_p": { + "type": "number", + "description": "Minimum probability threshold for token sampling. This parameter only supported by `llama-cpp` engine." + }, + "tfs_z": { + "type": "number", + "description": "The z-score used for Typical token sampling. This parameter only supported by `llama-cpp` engine." + }, + "typ_p": { + "type": "number", + "description": "The cumulative probability threshold used for Typical token sampling. This parameter only supported by `llama-cpp` engine." + }, + "repeat_last_n": { + "type": "integer", + "description": "Number of previous tokens to penalize for repeating. This parameter only supported by `llama-cpp` engine." + }, + "repeat_penalty": { + "type": "number", + "description": "Penalty for repeating tokens. This parameter only supported by `llama-cpp` engine." + }, + "mirostat": { + "type": "boolean", + "description": "Enables or disables Mirostat sampling (true or false). This parameter only supported by `llama-cpp` engine." + }, + "mirostat_tau": { + "type": "number", + "description": "Target entropy value for Mirostat sampling. This parameter only supported by `llama-cpp` engine." + }, + "mirostat_eta": { + "type": "number", + "description": "Learning rate for Mirostat sampling. This parameter only supported by `llama-cpp` engine." + }, + "penalize_nl": { + "type": "boolean", + "description": "Penalizes newline tokens (true or false). This parameter only supported by `llama-cpp` engine." 
+ }, + "ignore_eos": { + "type": "boolean", + "description": "Ignores the end-of-sequence token (true or false). This parameter only supported by `llama-cpp` engine." + }, + "n_probs": { + "type": "integer", + "description": "Number of probabilities to return. This parameter only supported by `llama-cpp` engine." + }, + "min_keep": { + "type": "integer", + "description": "Minimum number of tokens to keep. This parameter only supported by `llama-cpp` engine." } }, "required": [ @@ -3189,7 +3253,10 @@ "description": "The display name of the model." } }, - "required": ["model", "modelPath"] + "required": [ + "model", + "modelPath" + ] }, "ImportModelResponse": { "type": "object", @@ -3208,7 +3275,11 @@ "example": "OK" } }, - "required": ["message", "modelHandle", "result"] + "required": [ + "message", + "modelHandle", + "result" + ] }, "CommonResponseDto": { "type": "object",