From 8091ade22bb59f427c0c21b05f4224a230b8322d Mon Sep 17 00:00:00 2001 From: nguyenhoangthuan99 Date: Wed, 30 Oct 2024 20:43:23 +0700 Subject: [PATCH 01/13] update chat completion object --- docs/static/openapi/cortex.json | 953 +++++++++++++++++++++++++++++--- 1 file changed, 873 insertions(+), 80 deletions(-) diff --git a/docs/static/openapi/cortex.json b/docs/static/openapi/cortex.json index 0f715456d..0a9a68e07 100644 --- a/docs/static/openapi/cortex.json +++ b/docs/static/openapi/cortex.json @@ -22,7 +22,9 @@ "description": "The assistant has been successfully created." } }, - "tags": ["Assistants"] + "tags": [ + "Assistants" + ] }, "get": { "operationId": "AssistantsController_findAll", @@ -81,7 +83,9 @@ } } }, - "tags": ["Assistants"] + "tags": [ + "Assistants" + ] } }, "/v1/assistants/{id}": { @@ -112,7 +116,9 @@ } } }, - "tags": ["Assistants"] + "tags": [ + "Assistants" + ] }, "delete": { "operationId": "AssistantsController_remove", @@ -141,7 +147,9 @@ } } }, - "tags": ["Assistants"] + "tags": [ + "Assistants" + ] } }, "/v1/chat/completions": { @@ -166,13 +174,24 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/ChatCompletionResponseDto" + "oneOf": [ + { + "title":"None stream response", + "$ref": "#/components/schemas/ChatCompletionResponseDto" + }, + { + "title":"Stream chunk response", + "$ref": "#/components/schemas/ChatCompletionResponseDto" + } + ] } } } } }, - "tags": ["Inference"] + "tags": [ + "Inference" + ] } }, "/v1/models/pull": { @@ -271,10 +290,14 @@ } } }, - "tags": ["Models"] + "tags": [ + "Models" + ] }, "delete": { - "tags": ["Models"], + "tags": [ + "Models" + ], "summary": "Stop model download", "description": "Stops the download of a model with the corresponding taskId provided in the request body", "operationId": "ModelsController_stopModelDownload", @@ -290,7 +313,9 @@ "description": "The unique identifier of the download task to be stopped" } }, - "required": ["taskId"] + "required": [ + "taskId" + ] } } } @@ -385,7 +410,9 @@ } } }, - "tags": ["Models"] + "tags": [ + "Models" + ] } }, "/v1/models/start": { @@ -418,7 +445,9 @@ } } }, - "tags": ["Models"] + "tags": [ + "Models" + ] } }, "/v1/models/stop": { @@ -451,7 +480,9 @@ } } }, - "tags": ["Models"] + "tags": [ + "Models" + ] } }, "/v1/models/{id}": { @@ -482,7 +513,9 @@ } } }, - "tags": ["Models"] + "tags": [ + "Models" + ] }, "delete": { "operationId": "ModelsController_remove", @@ -511,7 +544,9 @@ } } }, - "tags": ["Models"] + "tags": [ + "Models" + ] } }, "/v1/models/{model}": { @@ -551,14 +586,18 @@ } } }, - "tags": ["Models"] + "tags": [ + "Models" + ] } }, "/v1/threads": { "post": { "operationId": "ThreadsController_create", "summary": "Create thread", - "tags": ["Threads"], + "tags": [ + "Threads" + ], "description": "Creates a new thread.", "parameters": [], "requestBody": { @@ -587,7 +626,9 @@ "get": { "operationId": "ThreadsController_findAll", "summary": "List threads", - "tags": ["Threads"], + "tags": [ + "Threads" + ], "description": "Lists all the available threads along with its configurations.", "parameters": [], "responses": { @@ -611,7 +652,9 @@ "get": { "operationId": "ThreadsController_retrieveMessage", "summary": "Retrieve message", - "tags": ["Messages"], + "tags": [ + "Messages" + ], "description": "Retrieves a message.", "parameters": [ { @@ -647,7 +690,9 @@ "post": { "operationId": "ThreadsController_updateMessage", "summary": "Modify message", - "tags": ["Messages"], + "tags": [ + "Messages" + ], "description": "Modifies a message.", 
"responses": { "201": { @@ -694,7 +739,9 @@ "operationId": "ThreadsController_deleteMessage", "summary": "Delete message", "description": "Deletes a message.", - "tags": ["Messages"], + "tags": [ + "Messages" + ], "parameters": [ { "name": "thread_id", @@ -731,7 +778,9 @@ "get": { "operationId": "ThreadsController_getMessagesOfThread", "summary": "List messages", - "tags": ["Messages"], + "tags": [ + "Messages" + ], "description": "Returns a list of messages for a given thread.", "parameters": [ { @@ -799,7 +848,9 @@ "post": { "operationId": "ThreadsController_createMessageInThread", "summary": "Create message", - "tags": ["Messages"], + "tags": [ + "Messages" + ], "description": "Create a message.", "responses": { "201": { @@ -840,7 +891,9 @@ "operationId": "ThreadsController_cleanThread", "summary": "Clean thread", "description": "Deletes all messages in a thread.", - "tags": ["Threads"], + "tags": [ + "Threads" + ], "parameters": [ { "name": "thread_id", @@ -862,7 +915,9 @@ "get": { "operationId": "ThreadsController_retrieveThread", "summary": "Retrieve thread", - "tags": ["Threads"], + "tags": [ + "Threads" + ], "description": "Retrieves a thread.", "parameters": [ { @@ -890,7 +945,9 @@ "post": { "operationId": "ThreadsController_modifyThread", "summary": "Modify thread", - "tags": ["Threads"], + "tags": [ + "Threads" + ], "description": "Modifies a thread.", "parameters": [ { @@ -931,7 +988,9 @@ "delete": { "operationId": "ThreadsController_remove", "summary": "Delete thread", - "tags": ["Threads"], + "tags": [ + "Threads" + ], "description": "Deletes a specific thread defined by a thread `id` .", "parameters": [ { @@ -968,7 +1027,9 @@ "description": "" } }, - "tags": ["System"] + "tags": [ + "System" + ] }, "get": { "operationId": "SystemController_get", @@ -980,7 +1041,9 @@ "description": "Ok" } }, - "tags": ["System"] + "tags": [ + "System" + ] } }, "/v1/system/events/download": { @@ -1001,7 +1064,9 @@ } } }, - "tags": ["System"] + "tags": [ + "System" + ] } }, "/v1/system/events/model": { @@ -1022,7 +1087,9 @@ } } }, - "tags": ["System"] + "tags": [ + "System" + ] } }, "/v1/system/events/resources": { @@ -1043,7 +1110,9 @@ } } }, - "tags": ["System"] + "tags": [ + "System" + ] } }, "/v1/engines": { @@ -1064,7 +1133,9 @@ } } }, - "tags": ["Engines"] + "tags": [ + "Engines" + ] } }, "/v1/engines/{name}": { @@ -1080,7 +1151,11 @@ "description": "The unique identifier of the engine.", "schema": { "type": "string", - "enum": ["onnxruntime", "llama-cpp", "tensorrt-llm"] + "enum": [ + "onnxruntime", + "llama-cpp", + "tensorrt-llm" + ] } } ], @@ -1096,7 +1171,9 @@ } } }, - "tags": ["Engines"] + "tags": [ + "Engines" + ] } }, "/v1/engines/install/{name}": { @@ -1112,7 +1189,11 @@ "description": "The unique identifier of the engine.", "schema": { "type": "string", - "enum": ["onnxruntime", "llama-cpp", "tensorrt-llm"] + "enum": [ + "onnxruntime", + "llama-cpp", + "tensorrt-llm" + ] } } ], @@ -1138,7 +1219,9 @@ } } }, - "tags": ["Engines"] + "tags": [ + "Engines" + ] }, "delete": { "operationId": "EnginesController_deleteEngine", @@ -1152,7 +1235,11 @@ "description": "The unique identifier of the engine.", "schema": { "type": "string", - "enum": ["onnxruntime", "llama-cpp", "tensorrt-llm"] + "enum": [ + "onnxruntime", + "llama-cpp", + "tensorrt-llm" + ] } } ], @@ -1178,7 +1265,9 @@ } } }, - "tags": ["Engines"] + "tags": [ + "Engines" + ] } } }, @@ -1402,31 +1491,435 @@ "description": "Indicates whether the assistant was successfully deleted." 
} }, - "required": ["id", "object", "deleted"] + "required": [ + "id", + "object", + "deleted" + ] }, - "ChatCompletionMessage": { + "Message": { "type": "object", + "discriminator": { + "propertyName": "role", + "mapping": { + "system": "SystemMessage", + "user": "UserMessage", + "assistant": "AssistantMessage", + "tool": "ToolMessage", + "function": "FunctionMessage" + } + }, "properties": { - "content": { + "role": { "type": "string", - "description": "The Content of the chat message." + "enum": [ + "system", + "user", + "assistant", + "tool" + ] }, - "role": { + "name": { + "type": "string", + "description": "An optional name for the participant. Provides the model information to differentiate between participants of the same role." + } + }, + "required": [ + "role" + ] + }, + "SystemMessage": { + "allOf": [ + { + "type": "object", + "properties": { + "role": { + "type": "string", + "description": "The role of the messages author, in this case `system`." + }, + "content": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "$ref": "#/components/schemas/TextContentPart" + } + } + ] + }, + "name": { + "type": "string", + "description": "An optional name for the participant. Provides the model information to differentiate between participants of the same role." + } + }, + "required": [ + "content", + "role" + ] + } + ] + }, + "UserMessage": { + "allOf": [ + { + "type": "object", + "properties": { + "role": { + "type": "string", + "description": "The role of the messages author, in this case `user`." + }, + "content": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "object", + "anyOf": [ + { + "type": "object", + "title": "Text Content Part", + "description": "Text Content", + "$ref": "#/components/schemas/TextContentPart" + }, + { + "type": "object", + "title": "Image Content Part", + "description": "Image Content", + "$ref": "#/components/schemas/ImageContentPart" + }, + { + "type": "object", + "title": "Audio Content Part", + "description": "Audio Content", + "$ref": "#/components/schemas/AudioContentPart" + } + ] + } + } + ] + }, + "name": { + "type": "string", + "description": "An optional name for the participant. Provides the model information to differentiate between participants of the same role." + } + }, + "required": [ + "content", + "role" + ] + } + ] + }, + "AssistantMessage": { + "allOf": [ + { + "type": "object", + "properties": { + "role": { + "type": "string", + "description": "The role of the messages author, in this case `assistant`." + }, + "content": { + "description": "The contents of the assistant message. Required unless `tool_calls` or `function_call` is specified.", + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "object", + "anyOf": [ + { + "$ref": "#/components/schemas/TextContentPart" + }, + { + "$ref": "#/components/schemas/RefusalContentPart" + } + ] + } + } + ] + }, + "name": { + "type": "string", + "description": "An optional name for the participant. Provides the model information to differentiate between participants of the same role." 
+ }, + "refusal": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ] + }, + "audio": { + "type": "object", + "anyOf": [ + { + "$ref": "#/components/schemas/Audio" + }, + { + "type": "null" + } + ] + }, + "tool_calls": { + "type": "array", + "items": { + "$ref": "#/components/schemas/ToolCall" + } + }, + "function_call": { + "deprecated": true, + "anyOf": [ + { + "$ref": "#/components/schemas/FunctionCall" + }, + { + "type": "null" + } + ] + } + } + } + ] + }, + "ToolMessage": { + "allOf": [ + { "type": "object", - "description": "The role of the entity in the chat completion.", - "example": "user" + "properties": { + "role": { + "type": "string", + "description": "The role of the messages author, in this case `tool`." + }, + "content": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "object", + "anyOf": [ + { + "$ref": "#/components/schemas/TextContentPart" + } + ] + } + } + ] + }, + "tool_call_id": { + "type": "string" + } + }, + "required": [ + "content", + "tool_call_id" + ] + } + ] + }, + "FunctionMessage": { + "allOf": [ + { + "$ref": "#/components/schemas/Message" + } + ], + "deprecated": true + }, + "TextContentPart": { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "text" + ] + }, + "text": { + "type": "string" + } + }, + "required": [ + "type", + "text" + ] + }, + "ImageContentPart": { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "image_url" + ] + }, + "image_url": { + "$ref": "#/components/schemas/ImageUrl" + } + }, + "required": [ + "type", + "image_url" + ] + }, + "AudioContentPart": { + "type": "object", + "properties": { + "type": { + "type": "string", + "description": "The type of the content part. Always `input_audio`." + }, + "input_audio": { + "$ref": "#/components/schemas/InputAudio" + } + }, + "required": [ + "type", + "input_audio" + ] + }, + "RefusalContentPart": { + "type": "object", + "properties": { + "type": { + "type": "string" + }, + "refusal": { + "type": "string" + } + }, + "required": [ + "type", + "refusal" + ] + }, + "ImageUrl": { + "type": "object", + "properties": { + "url": { + "type": "string", + "description": "Either a URL of the image or the base64 encoded image data." + }, + "detail": { + "type": "string", + "default": "auto", + "description": "Specifies the detail level of the image. Defaults to `auto`." + } + }, + "required": [ + "url" + ] + }, + "InputAudio": { + "type": "object", + "properties": { + "data": { + "type": "string", + "description": "Base64 encoded audio data." + }, + "format": { + "type": "string", + "enum": [ + "wav", + "mp3" + ], + "description": "The format of the encoded audio data. Currently supports `wav` and `mp3`." + } + }, + "required": [ + "data", + "format" + ] + }, + "Audio": { + "type": "object", + "description": "Data about a previous audio response from the model.", + "properties": { + "id": { + "type": "string", + "description": "Unique identifier for a previous audio response from the model." 
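The role-discriminated message schemas above accept `content` either as a plain string or as an array of typed content parts. Below is a minimal sketch of both forms, assuming a local server on the default port and a placeholder model id; image and audio parts are specced too, but the patch notes elsewhere that only text is currently supported:

```bash
# A system message with string content plus a user message built from a
# TextContentPart array, per the schemas defined above.
curl http://localhost:39281/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "<model-id>",
    "messages": [
      { "role": "system", "content": "You are a helpful assistant." },
      { "role": "user", "content": [{ "type": "text", "text": "Hello!" }] }
    ]
  }'
```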
+ } + }, + "required": [ + "id" + ] + }, + "ToolCall": { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "type": { + "type": "string" + }, + "function": { + "$ref": "#/components/schemas/FunctionCall" } }, - "required": ["content", "role"] + "required": [ + "id", + "type", + "function" + ] + }, + "FunctionCall": { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "arguments": { + "type": "string" + } + }, + "required": [ + "name", + "arguments" + ] }, "CreateChatCompletionDto": { "type": "object", "properties": { "messages": { - "description": "Array of chat messages to be used for generating the chat completion.", + "description": "Array of chat messages to be used for generating the chat completion. Depending on the model you use, different message types (modalities) are supported, like text, images, and audio. Currently, cortex only support text modalities.", "type": "array", "items": { - "$ref": "#/components/schemas/ChatCompletionMessage" + "anyOf": [ + { + "title": "System Message", + "description": "System Message", + "$ref": "#/components/schemas/SystemMessage" + }, + { + "title": "User Message", + "description": "User Message", + "$ref": "#/components/schemas/UserMessage" + }, + { + "title": "Assistant Message", + "description": "Assistant Message", + "$ref": "#/components/schemas/AssistantMessage" + }, + { + "title": "Tool Message", + "description": "Tool Message", + "$ref": "#/components/schemas/ToolMessage" + } + ] } }, "model": { @@ -1441,12 +1934,19 @@ }, "max_tokens": { "type": "number", - "description": "Sets the upper limit on the number of tokens the model can generate in a single output.", - "example": 4096 + "description": "Sets the upper limit on the number of tokens the model can generate in a single output. This value is now deprecated in favor of `max_completion_tokens`.", + "example": 4096, + "deprecated": true + }, + "max_completion_tokens": { + "type": "number", + "description": "Sets the upper limit on the number of tokens the model can generate in a single output." }, "stop": { "description": "Defines specific tokens or phrases that signal the model to stop producing further output.", - "example": ["End"], + "example": [ + "End" + ], "type": "array", "items": { "type": "string" @@ -1471,9 +1971,218 @@ "type": "number", "description": "Sets probability threshold for more relevant outputs.", "example": 0.95 + }, + "modalities": { + "type": "array", + "items": { + "type": "string", + "enum": [ + "text", + "audio" + ] + }, + "description": "Specifies the modalities (types of input) supported by the model. Currently, cortex only support text modalities.", + "example": [ + "text" + ] + }, + "audio": { + "description": "Parameters for audio output. Required when audio output is requested with `modalities: ['audio']`.", + "type": "object", + "properties": { + "voice": { + "type": "string", + "description": "The voice of the generated audio." + }, + "format": { + "type": "string", + "description": "Specifies the output audio format. 
Must be one of `wav`, `mp3`, `flac`, `opus`, or `pcm16`.", + "enum": [ + "mp3", + "wav", + "flac", + "opus", + "pcm16" + ] + } + }, + "required": [ + "voice", + "format" + ] + }, + "store": { + "type": "boolean", + "description": "Whether or not to store the output of this chat completion request for use in our model distillation or evals products.", + "default": false, + "example": false + }, + "metadata": { + "type": "object", + "description": "Developer-defined tags and values used for filtering completions in the dashboard.", + "example": { + "type": "conversation" + } + }, + "logit_bias": { + "type": "object", + "description": "Modify the likelihood of specified tokens appearing in the completion. \n\nAccepts a JSON object that maps tokens (specified by their token ID in the tokenizer) to an associated bias value from -100 to 100. Mathematically, the bias is added to the logits generated by the model prior to sampling. The exact effect will vary per model, but values between -1 and 1 should decrease or increase likelihood of selection; values like -100 or 100 should result in a ban or exclusive selection of the relevant token.", + "example": { + "15496": -100, + "51561": -100 + }, + "default": null + }, + "logprobs": { + "type": "boolean", + "description": "Whether to return log probabilities of the output tokens or not. If true, returns the log probabilities of each output token returned in the content of message.", + "example": false, + "default": false + }, + "top_logprobs": { + "type": "number", + "description": "An integer between 0 and 20 specifying the number of most likely tokens to return at each token position, each with an associated log probability. `logprobs` must be set to `true` if this parameter is used." + }, + "n": { + "type": "number", + "description": "How many chat completion choices to generate for each input message. Note that you will be charged based on the number of generated tokens across all of the choices. Keep n as 1 to minimize costs.", + "example": 1, + "default": 1 + }, + "response_format": { + "type": "object", + "description": "An object specifying the format that the model must output. Setting to { \"type\": \"json_object\" } enables JSON mode, which guarantees the message the model generates is valid JSON.", + "properties": { + "type": { + "type": "string", + "description": "The format of the generated output. Must be one of `text`, `json_schema` or `json_object`.", + "enum": [ + "text", + "json_object", + "json_schema" + ] + } + }, + "required": [ + "type" + ] + }, + "seed": { + "type": "number", + "description": "This feature is in Beta. If specified, our system will make a best effort to sample deterministically, such that repeated requests with the same seed and parameters should return the same result. Determinism is not guaranteed, and you should refer to the system_fingerprint response parameter to monitor changes in the backend.", + "example": 123, + "default": null + }, + "service_tier": { + "type": "string", + "description": "Specifies the latency tier to use for processing the request. 
This parameter is relevant for customers subscribed to the scale tier service:\n\n - If set to 'auto', and the Project is Scale tier enabled, the system will utilize scale tier credits until they are exhausted.\n- If set to 'auto', and the Project is not Scale tier enabled, the request will be processed using the default service tier with a lower uptime SLA and no latency guarentee.\n- If set to 'default', the request will be processed using the default service tier with a lower uptime SLA and no latency guarentee.\nWhen not set, the default behavior is 'auto'.\nWhen this parameter is set, the response body will include the service_tier utilized." + }, + "stream_options": { + "type": "object", + "default": null, + "description": "Options for streaming response. Only set this when you set `stream: true`.", + "properties": { + "include_usage": { + "type": "boolean", + "description": "If set, an additional chunk will be streamed before the data: `[DONE]` message. The `usage` field on this chunk shows the token usage statistics for the entire request, and the `choices` field will always be an empty array. All other chunks will also include a `usage` field, but with a null value.", + "example": false, + "default": false + } + } + }, + "tools": { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "function" + ] + }, + "function": { + "$ref": "#/components/schemas/Function" + } + }, + "required": [ + "type", + "function" + ] + }, + "tool_choice": { + "anyOf": [ + { + "type": "string", + "enum": [ + "none", + "auto", + "required" + ] + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "function" + ] + }, + "function": { + "type": "object", + "properties": { + "name": { + "type": "string" + } + }, + "required": [ + "name" + ] + } + }, + "required": [ + "type", + "function" + ] + } + ] + }, + "parallel_tool_calls": { + "type": "boolean", + "description": "Whether to enable parallel function calling during tool use. Cortex support parallel tool calls by default", + "example": true, + "default": true + }, + "user": { + "type": "string", + "description": "A unique identifier representing your end-user, which can help OpenAI to monitor and detect abuse." + } + }, + "required": [ + "messages", + "model" + ] + }, + "Function": { + "type": "object", + "properties": { + "description": { + "type": "string" + }, + "name": { + "type": "string", + "pattern": "^[a-zA-Z0-9_-]{1,64}$" + }, + "parameters": { + "type": "object" + }, + "strict": { + "type": "boolean", + "default": false } }, - "required": ["messages", "model"] + "required": [ + "name" + ] }, "MessageDto": { "type": "object", @@ -1487,7 +2196,10 @@ "description": "The role of the participant in the chat, such as 'user' or 'system', indicating who is the sender of the message." } }, - "required": ["content", "role"] + "required": [ + "content", + "role" + ] }, "ChoiceDto": { "type": "object", @@ -1509,7 +2221,11 @@ ] } }, - "required": ["finish_reason", "index", "message"] + "required": [ + "finish_reason", + "index", + "message" + ] }, "UsageDto": { "type": "object", @@ -1527,7 +2243,11 @@ "description": "The total number of tokens used in both the prompt and the completion, summarizing the entire token count of the chat operation." 
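Putting `CreateChatCompletionDto` together, a fuller non-streaming request might look like the sketch below. The model id is a placeholder, and the parameter values simply reuse the examples carried by the schema (4096 tokens, `top_p` 0.95, a `stop` phrase of "End"):

```bash
# Non-streaming completion exercising the sampling and stop parameters above.
curl http://localhost:39281/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "<model-id>",
    "messages": [{ "role": "user", "content": "Write a haiku about the sea." }],
    "max_completion_tokens": 4096,
    "top_p": 0.95,
    "stop": ["End"],
    "stream": false
  }'
```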
} }, - "required": ["completion_tokens", "prompt_tokens", "total_tokens"] + "required": [ + "completion_tokens", + "prompt_tokens", + "total_tokens" + ] }, "ChatCompletionResponseDto": { "type": "object", @@ -1587,7 +2307,9 @@ "description": "The name of the embedding model to be used." }, "input": { - "example": ["Hello World"], + "example": [ + "Hello World" + ], "description": "The text or token array(s) to be embedded. This can be a single string, an array of strings, or an array of token arrays to embed multiple inputs in one request.", "type": "array", "items": { @@ -1605,7 +2327,10 @@ "description": "Defines the number of dimensions for the output embeddings. This feature is supported by certain models only. This field is optional." } }, - "required": ["model", "input"] + "required": [ + "model", + "input" + ] }, "EmbeddingsResponseDto": { "type": "object", @@ -1634,11 +2359,18 @@ ] } }, - "required": ["object", "model", "embedding", "usage"] + "required": [ + "object", + "model", + "embedding", + "usage" + ] }, "PullModelRequest": { "type": "object", - "required": ["model"], + "required": [ + "model" + ], "properties": { "model": { "type": "string", @@ -1687,7 +2419,9 @@ }, "files": { "description": "The URL sources from which the model downloaded or accessed.", - "example": ["https://huggingface.co/cortexso/mistral/tree/gguf"], + "example": [ + "https://huggingface.co/cortexso/mistral/tree/gguf" + ], "oneOf": [ { "type": "array", @@ -1707,7 +2441,9 @@ }, "stop": { "description": "Defines specific tokens or phrases that signal the model to stop producing further output.", - "example": ["End"], + "example": [ + "End" + ], "type": "array", "items": { "type": "string" @@ -1777,7 +2513,10 @@ "default": "" } }, - "required": ["model", "files"] + "required": [ + "model", + "files" + ] }, "StartModelSuccessDto": { "type": "object", @@ -1791,7 +2530,10 @@ "description": "The unique identifier of the model." } }, - "required": ["message", "modelId"] + "required": [ + "message", + "modelId" + ] }, "ModelStartDto": { "type": "object", @@ -1811,7 +2553,9 @@ "description": "The response success or error message." } }, - "required": ["message"] + "required": [ + "message" + ] }, "EngineUninstallationResponseDto": { "type": "object", @@ -1867,7 +2611,11 @@ "example": "OK" } }, - "required": ["data", "object", "result"] + "required": [ + "data", + "object", + "result" + ] }, "Engine": { "type": "object", @@ -1897,7 +2645,12 @@ "example": "0.1.34" } }, - "required": ["description", "name", "productName", "status"] + "required": [ + "description", + "name", + "productName", + "status" + ] }, "ModelDto": { "type": "object", @@ -1913,7 +2666,9 @@ "description": "A predefined text or framework that guides the AI model's response generation." }, "stop": { - "example": ["End"], + "example": [ + "End" + ], "description": "Defines specific tokens or phrases that signal the model to stop producing further output.", "type": "array", "items": { @@ -2020,7 +2775,9 @@ "example": "llamacpp" } }, - "required": ["id"] + "required": [ + "id" + ] }, "ListModelsResponseDto": { "type": "object", @@ -2028,7 +2785,9 @@ "object": { "type": "string", "example": "list", - "enum": ["list"] + "enum": [ + "list" + ] }, "data": { "description": "List of models", @@ -2038,7 +2797,10 @@ } } }, - "required": ["object", "data"] + "required": [ + "object", + "data" + ] }, "UpdateModelDto": { "type": "object", @@ -2064,7 +2826,11 @@ "description": "Indicates whether the model was successfully deleted." 
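The model schemas above back the `/v1/models` endpoints defined earlier in this patch. Below is a lifecycle sketch, again with the default port and a placeholder id; the assumption that the start and stop bodies take a `model` field follows `ModelStartDto` and may differ in practice:

```bash
# Pull a model, start it, then stop it; <model-id> is a placeholder.
curl -X POST http://localhost:39281/v1/models/pull \
  -H "Content-Type: application/json" -d '{"model": "<model-id>"}'
curl -X POST http://localhost:39281/v1/models/start \
  -H "Content-Type: application/json" -d '{"model": "<model-id>"}'
curl -X POST http://localhost:39281/v1/models/stop \
  -H "Content-Type: application/json" -d '{"model": "<model-id>"}'
```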
} }, - "required": ["id", "object", "deleted"] + "required": [ + "id", + "object", + "deleted" + ] }, "CreateThreadAssistantDto": { "type": "object", @@ -2154,7 +2920,10 @@ "tool_resources": { "type": "object", "example": { - "resources": ["database1", "database2"] + "resources": [ + "database1", + "database2" + ] }, "description": "Tool resources for the assistant." } @@ -2182,7 +2951,9 @@ } } }, - "required": ["assistants"] + "required": [ + "assistants" + ] }, "ContentDto": { "type": "object", @@ -2201,7 +2972,10 @@ "description": "Text content of the message along with any annotations." } }, - "required": ["type", "text"] + "required": [ + "type", + "text" + ] }, "GetMessageResponseDto": { "type": "object", @@ -2375,7 +3149,13 @@ "description": "Indicates whether there are more messages to retrieve." } }, - "required": ["object", "data", "first_id", "last_id", "has_more"] + "required": [ + "object", + "data", + "first_id", + "last_id", + "has_more" + ] }, "CreateMessageDto": { "type": "object", @@ -2391,7 +3171,10 @@ "description": "The text contents of the message." } }, - "required": ["role", "content"] + "required": [ + "role", + "content" + ] }, "UpdateMessageDto": { "type": "object", @@ -2417,7 +3200,11 @@ "description": "Indicates whether the message was successfully deleted." } }, - "required": ["id", "object", "deleted"] + "required": [ + "id", + "object", + "deleted" + ] }, "GetThreadResponseDto": { "type": "object", @@ -2438,7 +3225,9 @@ "description": "Unix timestamp representing the creation time of the thread." }, "assistants": { - "example": ["assistant-001"], + "example": [ + "assistant-001" + ], "description": "List of assistants involved in the thread.", "type": "array", "items": { @@ -2492,8 +3281,12 @@ "description": "Indicates whether the thread was successfully deleted." } }, - "required": ["id", "object", "deleted"] + "required": [ + "id", + "object", + "deleted" + ] } } } -} +} \ No newline at end of file From 53d5c80dbee70f20d7ac50c38daabe5f2a8a0757 Mon Sep 17 00:00:00 2001 From: nguyenhoangthuan99 Date: Wed, 30 Oct 2024 22:02:52 +0700 Subject: [PATCH 02/13] Update chat completion response api --- docs/static/openapi/cortex.json | 579 ++++++++++++++++++++++++++++++-- 1 file changed, 559 insertions(+), 20 deletions(-) diff --git a/docs/static/openapi/cortex.json b/docs/static/openapi/cortex.json index 0a9a68e07..6790a54b8 100644 --- a/docs/static/openapi/cortex.json +++ b/docs/static/openapi/cortex.json @@ -176,12 +176,12 @@ "schema": { "oneOf": [ { - "title":"None stream response", + "title": "Chat Completion Response", "$ref": "#/components/schemas/ChatCompletionResponseDto" }, { - "title":"Stream chunk response", - "$ref": "#/components/schemas/ChatCompletionResponseDto" + "title": "Chat Completion Chunk Response", + "$ref": "#/components/schemas/ChatCompletionChunkResponseDto" } ] } @@ -2252,50 +2252,589 @@ "ChatCompletionResponseDto": { "type": "object", "properties": { + "id": { + "type": "string", + "description": "A unique identifier for the chat completion." + }, "choices": { - "description": "A list of choices generated by the chat model.", "type": "array", + "description": "A list of chat completion choices. Can be more than one if n is greater than 1.", "items": { - "$ref": "#/components/schemas/ChoiceDto" + "type": "object", + "properties": { + "finish_reason": { + "type": "string", + "description": "The reason the model stopped generating tokens. 
This will be stop if the model hit a natural stop point or a provided stop sequence, length if the maximum number of tokens specified in the request was reached, content_filter if content was omitted due to a flag from our content filters, tool_calls if the model called a tool, or function_call (deprecated) if the model called a function." + }, + "index": { + "type": "integer", + "description": "The index of the choice in the list of choices." + }, + "message": { + "type": "object", + "properties": { + "content": { + "type": [ + "string", + "null" + ], + "description": "The contents of the message." + }, + "refusal": { + "type": [ + "string", + "null" + ], + "description": "The refusal message generated by the model." + }, + "tool_calls": { + "type": "array", + "description": "The tool calls generated by the model, such as function calls.", + "items": { + "type": "object", + "properties": { + "id": { + "type": "string", + "description": "The ID of the tool call." + }, + "type": { + "type": "string", + "description": "The type of the tool. Currently, only function is supported." + }, + "function": { + "type": "object", + "properties": { + "name": { + "type": "string", + "description": "The name of the function to call." + }, + "arguments": { + "type": "string", + "description": "The arguments to call the function with, as generated by the model in JSON format. Note that the model does not always generate valid JSON, and may hallucinate parameters not defined by your function schema. Validate the arguments in your code before calling your function." + } + }, + "required": [ + "name", + "arguments" + ] + } + }, + "required": [ + "id", + "type", + "function" + ] + } + }, + "role": { + "type": "string", + "description": "The role of the author of this message." + }, + "function_call": { + "type": "object", + "deprecated": true, + "description": "Deprecated and replaced by tool_calls. The name and arguments of a function that should be called, as generated by the model.", + "properties": { + "arguments": { + "type": "string", + "description": "The arguments to call the function with, as generated by the model in JSON format. Note that the model does not always generate valid JSON, and may hallucinate parameters not defined by your function schema. Validate the arguments in your code before calling your function." + }, + "name": { + "type": "string", + "description": "The name of the function to call." + } + }, + "required": [ + "arguments", + "name" + ] + }, + "audio": { + "type": "object", + "description": "If the audio output modality is requested, this object contains data about the audio response from the model.", + "properties": { + "id": { + "type": "string", + "description": "Unique identifier for this audio response." + }, + "expires_at": { + "type": "integer", + "description": "The Unix timestamp (in seconds) for when this audio response will no longer be accessible on the server for use in multi-turn conversations." + }, + "data": { + "type": "string", + "description": "Base64 encoded audio bytes generated by the model, in the format specified in the request." + }, + "transcript": { + "type": "string", + "description": "Transcript of the audio generated by the model." 
+ } + }, + "required": [ + "id", + "expires_at", + "data", + "transcript" + ] + } + }, + "required": [ + "role" + ] + }, + "logprobs": { + "type": "object", + "description": "Log probability information for the choice.", + "properties": { + "content": { + "type": [ + "array", + "null" + ], + "description": "A list of message content tokens with log probability information.", + "items": { + "type": "object", + "properties": { + "token": { + "type": "string", + "description": "The token." + }, + "logprob": { + "type": "number", + "description": "The log probability of this token, if it is within the top 20 most likely tokens. Otherwise, the value -9999.0 is used to signify that the token is very unlikely." + }, + "bytes": { + "type": [ + "array", + "null" + ], + "description": "A list of integers representing the UTF-8 bytes representation of the token. Useful in instances where characters are represented by multiple tokens and their byte representations must be combined to generate the correct text representation. Can be null if there is no bytes representation for the token." + } + }, + "required": [ + "token", + "logprob" + ] + } + }, + "top_logprobs": { + "type": "array", + "description": "List of the most likely tokens and their log probability, at this token position. In rare cases, there may be fewer than the number of requested top_logprobs returned.", + "items": { + "type": "object", + "properties": { + "token": { + "type": "string", + "description": "The token." + }, + "logprob": { + "type": "number", + "description": "The log probability of this token, if it is within the top 20 most likely tokens. Otherwise, the value -9999.0 is used to signify that the token is very unlikely." + }, + "bytes": { + "type": [ + "array", + "null" + ], + "description": "A list of integers representing the UTF-8 bytes representation of the token. Useful in instances where characters are represented by multiple tokens and their byte representations must be combined to generate the correct text representation. Can be null if there is no bytes representation for the token." + } + }, + "required": [ + "token", + "logprob" + ] + } + }, + "refusal": { + "type": [ + "array", + "null" + ], + "description": "A list of message refusal tokens with log probability information.", + "items": { + "type": "object", + "properties": { + "token": { + "type": "string", + "description": "The token." + }, + "logprob": { + "type": "number", + "description": "The log probability of this token, if it is within the top 20 most likely tokens. Otherwise, the value -9999.0 is used to signify that the token is very unlikely." + }, + "bytes": { + "type": [ + "array", + "null" + ], + "description": "A list of integers representing the UTF-8 bytes representation of the token. Useful in instances where characters are represented by multiple tokens and their byte representations must be combined to generate the correct text representation. Can be null if there is no bytes representation for the token." + } + }, + "required": [ + "token", + "logprob" + ] + } + } + } + } + }, + "required": [ + "finish_reason", + "index", + "message" + ] } }, "created": { - "type": "number", - "description": "The timestamp of when the chat completion was created, expressed as a Unix timestamp." + "type": "integer", + "description": "The Unix timestamp (in seconds) of when the chat completion was created." }, - "id": { + "model": { "type": "string", - "description": "The unique identifier for the chat completion." 
+ "description": "The model used for the chat completion." }, - "model": { + "service_tier": { + "type": [ + "string", + "null" + ], + "description": "The service tier used for processing the request. This field is only included if the service_tier parameter is specified in the request." + }, + "system_fingerprint": { "type": "string", - "description": "The identifier of the model used to generate the chat completion." + "description": "This fingerprint represents the backend configuration that the model runs with. Can be used in conjunction with the seed request parameter to understand when backend changes have been made that might impact determinism." }, "object": { "type": "string", - "description": "The type of object, typically set to 'chat_completion' to denote the nature of the API response." + "description": "The object type, which is always chat.completion." + }, + "usage": { + "type": "object", + "description": "Usage statistics for the completion request.", + "properties": { + "completion_tokens": { + "type": "integer", + "description": "Number of tokens in the generated completion." + }, + "prompt_tokens": { + "type": "integer", + "description": "Number of tokens in the prompt." + }, + "total_tokens": { + "type": "integer", + "description": "Total number of tokens used in the request (prompt + completion)." + }, + "completion_tokens_details": { + "type": "object", + "description": "Breakdown of tokens used in a completion.", + "properties": { + "audio_tokens": { + "type": "integer", + "description": "Audio input tokens generated by the model." + }, + "reasoning_tokens": { + "type": "integer", + "description": "Tokens generated by the model for reasoning." + } + }, + "required": [ + "audio_tokens", + "reasoning_tokens" + ] + }, + "prompt_tokens_details": { + "type": "object", + "description": "Breakdown of tokens used in the prompt.", + "properties": { + "audio_tokens": { + "type": "integer", + "description": "Audio input tokens present in the prompt." + }, + "cached_tokens": { + "type": "integer", + "description": "Cached tokens present in the prompt." + } + }, + "required": [ + "audio_tokens", + "cached_tokens" + ] + } + }, + "required": [ + "completion_tokens", + "prompt_tokens", + "total_tokens", + "completion_tokens_details", + "prompt_tokens_details" + ] + } + }, + "required": [ + "id", + "choices", + "created", + "model", + "system_fingerprint", + "object", + "usage" + ] + }, + "ChatCompletionChunkResponseDto": { + "type": "object", + "properties": { + "id": { + "type": "string", + "description": "A unique identifier for the chat completion. Each chunk has the same ID." + }, + "choices": { + "type": "array", + "description": "A list of chat completion choices. Can contain more than one element if n is greater than 1. Can also be empty for the last chunk if you set stream_options: {\"include_usage\": true}.", + "items": { + "type": "object", + "properties": { + "delta": { + "type": "object", + "description": "A chat completion delta generated by streamed model responses.", + "properties": { + "content": { + "type": [ + "string", + "null" + ], + "description": "The contents of the chunk message." + }, + "function_call": { + "type": "object", + "description": "Deprecated and replaced by tool_calls. 
The name and arguments of a function that should be called, as generated by the model.", + "deprecated": true + }, + "tool_calls": { + "type": "array", + "description": "The tool calls generated by the model.", + "items": { + "type": "object", + "properties": { + "index": { + "type": "integer", + "description": "The index of the tool call in the list of tool calls." + }, + "id": { + "type": "string", + "description": "The ID of the tool call." + }, + "type": { + "type": "string", + "description": "The type of the tool. Currently, only function is supported." + }, + "function": { + "type": "object", + "properties": { + "name": { + "type": "string", + "description": "The name of the function to call." + }, + "arguments": { + "type": "string", + "description": "The arguments to call the function with, as generated by the model in JSON format. Note that the model does not always generate valid JSON, and may hallucinate parameters not defined by your function schema. Validate the arguments in your code before calling your function." + } + }, + "required": [ + "name", + "arguments" + ] + } + }, + "required": [ + "index", + "id", + "type", + "function" + ] + } + }, + "role": { + "type": "string", + "description": "The role of the author of this message." + }, + "refusal": { + "type": [ + "string", + "null" + ], + "description": "The refusal message generated by the model." + } + } + }, + "logprobs": { + "type": "object", + "description": "Log probability information for the choice.", + "properties": { + "content": { + "type": [ + "array", + "null" + ], + "description": "A list of message content tokens with log probability information.", + "items": { + "type": "object", + "properties": { + "token": { + "type": "string", + "description": "The token." + }, + "logprob": { + "type": "number", + "description": "The log probability of this token, if it is within the top 20 most likely tokens. Otherwise, the value -9999.0 is used to signify that the token is very unlikely." + }, + "bytes": { + "type": [ + "array", + "null" + ], + "description": "A list of integers representing the UTF-8 bytes representation of the token. Useful in instances where characters are represented by multiple tokens and their byte representations must be combined to generate the correct text representation. Can be null if there is no bytes representation for the token." + } + }, + "required": [ + "token", + "logprob" + ] + } + }, + "top_logprobs": { + "type": "array", + "description": "List of the most likely tokens and their log probability, at this token position. In rare cases, there may be fewer than the number of requested top_logprobs returned.", + "items": { + "type": "object", + "properties": { + "token": { + "type": "string", + "description": "The token." + }, + "logprob": { + "type": "number", + "description": "The log probability of this token, if it is within the top 20 most likely tokens. Otherwise, the value -9999.0 is used to signify that the token is very unlikely." + }, + "bytes": { + "type": [ + "array", + "null" + ], + "description": "A list of integers representing the UTF-8 bytes representation of the token. Useful in instances where characters are represented by multiple tokens and their byte representations must be combined to generate the correct text representation. Can be null if there is no bytes representation for the token." 
+ } + }, + "required": [ + "token", + "logprob" + ] + } + }, + "refusal": { + "type": [ + "array", + "null" + ], + "description": "A list of message refusal tokens with log probability information.", + "items": { + "type": "object", + "properties": { + "token": { + "type": "string", + "description": "The token." + }, + "logprob": { + "type": "number", + "description": "The log probability of this token, if it is within the top 20 most likely tokens. Otherwise, the value -9999.0 is used to signify that the token is very unlikely." + }, + "bytes": { + "type": [ + "array", + "null" + ], + "description": "A list of integers representing the UTF-8 bytes representation of the token. Useful in instances where characters are represented by multiple tokens and their byte representations must be combined to generate the correct text representation. Can be null if there is no bytes representation for the token." + } + }, + "required": [ + "token", + "logprob" + ] + } + } + } + }, + "finish_reason": { + "type": [ + "string", + "null" + ], + "description": "The reason the model stopped generating tokens. This will be stop if the model hit a natural stop point or a provided stop sequence, length if the maximum number of tokens specified in the request was reached, content_filter if content was omitted due to a flag from our content filters, tool_calls if the model called a tool, or function_call (deprecated) if the model called a function." + }, + "index": { + "type": "integer", + "description": "The index of the choice in the list of choices." + } + }, + "required": [ + "delta", + "index" + ] + } + }, + "created": { + "type": "integer", + "description": "The Unix timestamp (in seconds) of when the chat completion was created. Each chunk has the same timestamp." + }, + "model": { + "type": "string", + "description": "The model used to generate the completion." + }, + "service_tier": { + "type": [ + "string", + "null" + ], + "description": "The service tier used for processing the request. This field is only included if the service_tier parameter is specified in the request." }, "system_fingerprint": { "type": "string", - "description": "A unique fingerprint that identifies the system configuration used during the chat completion." + "description": "This fingerprint represents the backend configuration that the model runs with. Can be used in conjunction with the seed request parameter to understand when backend changes have been made that might impact determinism." + }, + "object": { + "type": "string", + "description": "The object type, which is always chat.completion.chunk." }, "usage": { - "description": "An object representing the usage statistics of the model for the current completion.", - "allOf": [ - { - "$ref": "#/components/schemas/UsageDto" + "type": "object", + "description": "An optional field that will only be present when you set stream_options: {\"include_usage\": true} in your request. When present, it contains a null value except for the last chunk which contains the token usage statistics for the entire request.", + "properties": { + "completion_tokens": { + "type": "integer", + "description": "Number of tokens in the generated completion." + }, + "prompt_tokens": { + "type": "integer", + "description": "Number of tokens in the prompt." + }, + "total_tokens": { + "type": "integer", + "description": "Total number of tokens used in the request (prompt + completion)." 
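With the chunk schema in place, streaming can be sketched as below; `-N` only disables curl's output buffering, and the trailing `data:` lines illustrate the expected shape rather than captured output:

```bash
# Streamed completion; include_usage asks for a final usage-only chunk.
curl -N http://localhost:39281/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "<model-id>",
    "messages": [{ "role": "user", "content": "Hello!" }],
    "stream": true,
    "stream_options": { "include_usage": true }
  }'
# data: {"object":"chat.completion.chunk","choices":[{"index":0,"delta":{"content":"Hi"}}]}
# data: [DONE]
```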
            }
+          },
+          "required": [
+            "completion_tokens",
+            "prompt_tokens",
+            "total_tokens"
           ]
         }
       },
       "required": [
+        "id",
         "choices",
         "created",
-        "id",
         "model",
-        "object",
         "system_fingerprint",
-        "usage"
+        "object"
       ]
     },
     "CreateEmbeddingsDto": {

From 6293903b0479ecdfcfaa11e9e2b18056ee285336 Mon Sep 17 00:00:00 2001
From: nguyenhoangthuan99
Date: Thu, 31 Oct 2024 14:00:38 +0700
Subject: [PATCH 03/13] fix: add notification for future plan

---
 docs/static/openapi/cortex.json | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/docs/static/openapi/cortex.json b/docs/static/openapi/cortex.json
index 6790a54b8..9b9ac733a 100644
--- a/docs/static/openapi/cortex.json
+++ b/docs/static/openapi/cortex.json
@@ -1935,8 +1935,7 @@
       "max_tokens": {
         "type": "number",
         "description": "Sets the upper limit on the number of tokens the model can generate in a single output. This value is now deprecated in favor of `max_completion_tokens`.",
-        "example": 4096,
-        "deprecated": true
+        "example": 4096
       },
       "max_completion_tokens": {
         "type": "number",
@@ -1981,13 +1980,13 @@
           "audio"
         ]
       },
-      "description": "Specifies the modalities (types of input) supported by the model. Currently, cortex only support text modalities.",
+      "description": "Specifies the modalities (types of input) supported by the model. Currently, Cortex only supports text modalities. We are actively working on this feature to make Cortex a fully OpenAI-compatible platform. Planning and roadmap for this feature can be found [**here**](https://github.com/janhq/cortex.cpp/issues/1582).",
       "example": [
         "text"
       ]
     },
     "audio": {
-      "description": "Parameters for audio output. Required when audio output is requested with `modalities: ['audio']`.",
+      "description": "Parameters for audio output. Required when audio output is requested with `modalities: ['audio']`. We are actively working on this feature to make Cortex a fully OpenAI-compatible platform. Planning and roadmap for this feature can be found [**here**](https://github.com/janhq/cortex.cpp/issues/1582).",
       "type": "object",
       "properties": {
         "voice": {
@@ -2013,13 +2012,13 @@
     },
     "store": {
       "type": "boolean",
-      "description": "Whether or not to store the output of this chat completion request for use in our model distillation or evals products.",
+      "description": "Whether or not to store the output of this chat completion request for use in our model distillation or evals products. We are actively working on this feature to make Cortex a fully OpenAI-compatible platform. Planning and roadmap for this feature can be found [**here**](https://github.com/janhq/cortex.cpp/issues/1582).",
       "default": false,
       "example": false
     },
     "metadata": {
       "type": "object",
-      "description": "Developer-defined tags and values used for filtering completions in the dashboard.",
+      "description": "Developer-defined tags and values used for filtering completions in the dashboard. We are actively working on this feature to make Cortex a fully OpenAI-compatible platform. Planning and roadmap for this feature can be found [**here**](https://github.com/janhq/cortex.cpp/issues/1582).",
       "example": {
         "type": "conversation"
       }
@@ -2051,7 +2050,7 @@
     },
     "response_format": {
       "type": "object",
-      "description": "An object specifying the format that the model must output. Setting to { \"type\": \"json_object\" } enables JSON mode, which guarantees the message the model generates is valid JSON.",
+      "description": "An object specifying the format that the model must output. Setting to { \"type\": \"json_object\" } enables JSON mode, which guarantees the message the model generates is valid JSON. We are actively working on this feature to make Cortex a fully OpenAI-compatible platform. Planning and roadmap for this feature can be found [**here**](https://github.com/janhq/cortex.cpp/issues/1582).",
       "properties": {
         "type": {
           "type": "string",
@@ -2075,7 +2074,7 @@
       },
       "service_tier": {
         "type": "string",
-        "description": "Specifies the latency tier to use for processing the request. This parameter is relevant for customers subscribed to the scale tier service:\n\n - If set to 'auto', and the Project is Scale tier enabled, the system will utilize scale tier credits until they are exhausted.\n- If set to 'auto', and the Project is not Scale tier enabled, the request will be processed using the default service tier with a lower uptime SLA and no latency guarentee.\n- If set to 'default', the request will be processed using the default service tier with a lower uptime SLA and no latency guarentee.\nWhen not set, the default behavior is 'auto'.\nWhen this parameter is set, the response body will include the service_tier utilized."
+        "description": "Specifies the latency tier to use for processing the request. This parameter is relevant for customers subscribed to the scale tier service:\n\n - If set to 'auto', and the Project is Scale tier enabled, the system will utilize scale tier credits until they are exhausted.\n- If set to 'auto', and the Project is not Scale tier enabled, the request will be processed using the default service tier with a lower uptime SLA and no latency guarantee.\n- If set to 'default', the request will be processed using the default service tier with a lower uptime SLA and no latency guarantee.\nWhen not set, the default behavior is 'auto'.\nWhen this parameter is set, the response body will include the service_tier utilized.\n\n We are actively working on this feature to make Cortex a fully OpenAI-compatible platform. Planning and roadmap for this feature can be found [**here**](https://github.com/janhq/cortex.cpp/issues/1582)."
       },
       "stream_options": {
         "type": "object",
@@ -2154,7 +2153,7 @@
       },
       "user": {
         "type": "string",
-        "description": "A unique identifier representing your end-user, which can help OpenAI to monitor and detect abuse."
+        "description": "A unique identifier representing your end-user, which can help OpenAI to monitor and detect abuse. We are actively working on this feature to make Cortex a fully OpenAI-compatible platform. Planning and roadmap for this feature can be found [**here**](https://github.com/janhq/cortex.cpp/issues/1582)."
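The `response_format` object above can be exercised with a request like this sketch; how reliably a given local model honors JSON mode will vary by model and engine:

```bash
# Request JSON-mode output via response_format.
curl http://localhost:39281/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "<model-id>",
    "messages": [{ "role": "user", "content": "List three colors as a JSON array." }],
    "response_format": { "type": "json_object" }
  }'
```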
      }
     },
     "required": [

From a529cd27a7eb1f712235a04ee37a2eacbdc1806f Mon Sep 17 00:00:00 2001
From: nguyenhoangthuan99
Date: Thu, 31 Oct 2024 15:37:51 +0700
Subject: [PATCH 04/13] add api swagger for server check health and terminate process

---
 docs/static/openapi/cortex.json | 45 ++++++++++++++++++++++++++++++++-
 1 file changed, 44 insertions(+), 1 deletion(-)

diff --git a/docs/static/openapi/cortex.json b/docs/static/openapi/cortex.json
index 9b9ac733a..f62f31da9 100644
--- a/docs/static/openapi/cortex.json
+++ b/docs/static/openapi/cortex.json
@@ -152,6 +152,44 @@
       ]
     }
   },
+  "/healthz": {
+    "get": {
+      "operationId": "HealthController_check",
+      "summary": "Check health",
+      "description": "Checks the health of the application.",
+      "parameters": [],
+      "responses": {
+        "200": {
+          "description": "Ok",
+          "content": {
+            "application/json": {}
+          }
+        }
+      },
+      "tags": [
+        "Server"
+      ]
+    }
+  },
+  "/processManager/destroy": {
+    "delete": {
+      "operationId": "Terminate server process",
+      "summary": "Terminate server",
+      "description": "Terminates the server process.",
+      "parameters": [],
+      "responses": {
+        "200": {
+          "description": "Ok",
+          "content": {
+            "application/json": {}
+          }
+        }
+      },
+      "tags": [
+        "Server"
+      ]
+    }
+  },
   "/v1/chat/completions": {
     "post": {
       "operationId": "ChatController_create",
@@ -1290,6 +1328,10 @@
       "name": "Models",
       "description": "These endpoints provide a list and descriptions of all available models within the Cortex framework."
     },
+    {
+      "name": "Server",
+      "description": "These endpoints manage the lifecycle of the server, including health check and shutdown."
+    },
     {
       "name": "Messages",
       "description": "These endpoints manage the retrieval and storage of conversation content, including responses from LLMs and other metadata related to chat interactions."
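The two server-lifecycle routes added in this patch take no parameters, so each is a one-liner to exercise (default port assumed):

```bash
# Health check, then terminate the server process.
curl http://localhost:39281/healthz
curl -X DELETE http://localhost:39281/processManager/destroy
```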
@@ -1316,7 +1358,8 @@ "Events", "Models", "Processes", - "Status" + "Status", + "Server" ] } ], From 43fbc529abc9c3926702fd2766804b0bc7a5c04b Mon Sep 17 00:00:00 2001 From: hiento09 <136591877+hiento09@users.noreply.github.com> Date: Fri, 1 Nov 2024 16:13:06 +0700 Subject: [PATCH 05/13] chore: auto set pre-release for beta version (#1608) Co-authored-by: Hien To --- .github/workflows/beta-build.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/beta-build.yml b/.github/workflows/beta-build.yml index b5bee44b6..0a4d3c735 100644 --- a/.github/workflows/beta-build.yml +++ b/.github/workflows/beta-build.yml @@ -90,6 +90,8 @@ jobs: pull-requests: write runs-on: ubuntu-latest steps: + - name: Getting the repo + uses: actions/checkout@v4 - name: set release to prerelease run: | gh release edit v${{ needs.get-update-version.outputs.new_version }} --draft=false --prerelease From 152b76fe8828fc8644119ba15aafa25eac044f0f Mon Sep 17 00:00:00 2001 From: vansangpfiev Date: Fri, 1 Nov 2024 17:58:12 +0700 Subject: [PATCH 06/13] fix: progress bar on CMD (#1609) --- engine/cli/utils/download_progress.cc | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/engine/cli/utils/download_progress.cc b/engine/cli/utils/download_progress.cc index 9c38d4bdf..b47b4fc9a 100644 --- a/engine/cli/utils/download_progress.cc +++ b/engine/cli/utils/download_progress.cc @@ -36,6 +36,20 @@ bool DownloadProgress::Connect(const std::string& host, int port) { bool DownloadProgress::Handle(const DownloadType& event_type) { assert(!!ws_); +#if defined(_WIN32) + HANDLE h_out = GetStdHandle(STD_OUTPUT_HANDLE); + DWORD dw_original_out_mode = 0; + if (h_out != INVALID_HANDLE_VALUE) { + GetConsoleMode(h_out, &dw_original_out_mode); + + // Enable ANSI escape code processing + DWORD dw_requested_out_mode = + dw_original_out_mode | ENABLE_VIRTUAL_TERMINAL_PROCESSING; + if (!SetConsoleMode(h_out, dw_requested_out_mode)) { + SetConsoleMode(h_out, dw_original_out_mode); + } + } +#endif std::unordered_map totals; status_ = DownloadStatus::DownloadStarted; std::unique_ptr> bars; @@ -124,6 +138,11 @@ bool DownloadProgress::Handle(const DownloadType& event_type) { ws_->dispatch(handle_message); } indicators::show_console_cursor(true); +#if defined(_WIN32) + if (dw_original_out_mode != 0 && h_out != INVALID_HANDLE_VALUE) { + SetConsoleMode(h_out, dw_original_out_mode); + } +#endif if (status_ == DownloadStatus::DownloadError) return false; return true; From 02ae2c1d6bed1667421c4ffe8fe0413743bccef9 Mon Sep 17 00:00:00 2001 From: Daniel <101145494+dan-jan@users.noreply.github.com> Date: Sat, 2 Nov 2024 15:15:40 +0800 Subject: [PATCH 07/13] Refactor Cortex Docs structure --- docs/docs/architecture/cortex-db.md | 3 + .../cortexrc.mdx | 0 docs/docs/{ => architecture}/data-folder.mdx | 0 docs/docs/assistants/index.md | 3 + docs/docs/assistants/tools/index.md | 3 + .../{server.mdx => api-server.mdx} | 7 +- docs/docs/basic-usage/command-line.md | 48 --- .../js-library.md => cortex-js.md} | 15 +- .../py-library.md => cortex-py.md} | 10 +- .../basic-usage/{overview.mdx => index.mdx} | 2 +- docs/docs/built-in-models.mdx | 54 --- docs/docs/capabilities/audio-generation.md | 3 + docs/docs/capabilities/embeddings.md | 7 + docs/docs/capabilities/hardware/index.md | 37 +++ docs/docs/capabilities/image-generation.md | 3 + .../models/index.mdx} | 0 .../{ => capabilities/models}/model-yaml.mdx | 3 +- .../models/presets.mdx} | 3 +- docs/docs/capabilities/moderation.md | 3 + docs/docs/capabilities/reasoning.md | 3 + 
docs/docs/capabilities/speech-to-text.md | 3 + docs/docs/capabilities/text-generation.md | 7 + docs/docs/capabilities/text-to-speech.md | 3 + docs/docs/capabilities/vision.md | 3 + docs/docs/chat-completions.mdx | 3 +- .../engine-extension.mdx} | 11 +- docs/docs/engines/onnx.mdx | 3 +- docs/docs/engines/tensorrt-llm.mdx | 3 +- docs/docs/guides/function-calling.md | 3 + docs/docs/guides/structured-outputs.md | 3 + docs/docs/installation/gpu-acceleration.mdx | 8 - docs/docusaurus.config.ts | 1 - docs/sidebars.ts | 311 +++++------------- 33 files changed, 206 insertions(+), 363 deletions(-) create mode 100644 docs/docs/architecture/cortex-db.md rename docs/docs/{basic-usage => architecture}/cortexrc.mdx (100%) rename docs/docs/{ => architecture}/data-folder.mdx (100%) create mode 100644 docs/docs/assistants/index.md create mode 100644 docs/docs/assistants/tools/index.md rename docs/docs/basic-usage/{server.mdx => api-server.mdx} (90%) delete mode 100644 docs/docs/basic-usage/command-line.md rename docs/docs/basic-usage/{integration/js-library.md => cortex-js.md} (79%) rename docs/docs/basic-usage/{integration/py-library.md => cortex-py.md} (91%) rename docs/docs/basic-usage/{overview.mdx => index.mdx} (98%) delete mode 100644 docs/docs/built-in-models.mdx create mode 100644 docs/docs/capabilities/audio-generation.md create mode 100644 docs/docs/capabilities/embeddings.md create mode 100644 docs/docs/capabilities/hardware/index.md create mode 100644 docs/docs/capabilities/image-generation.md rename docs/docs/{model-overview.mdx => capabilities/models/index.mdx} (100%) rename docs/docs/{ => capabilities/models}/model-yaml.mdx (99%) rename docs/docs/{model-presets.mdx => capabilities/models/presets.mdx} (98%) create mode 100644 docs/docs/capabilities/moderation.md create mode 100644 docs/docs/capabilities/reasoning.md create mode 100644 docs/docs/capabilities/speech-to-text.md create mode 100644 docs/docs/capabilities/text-generation.md create mode 100644 docs/docs/capabilities/text-to-speech.md create mode 100644 docs/docs/capabilities/vision.md rename docs/docs/{integrate-remote-engine.mdx => engines/engine-extension.mdx} (94%) create mode 100644 docs/docs/guides/function-calling.md create mode 100644 docs/docs/guides/structured-outputs.md delete mode 100644 docs/docs/installation/gpu-acceleration.mdx diff --git a/docs/docs/architecture/cortex-db.md b/docs/docs/architecture/cortex-db.md new file mode 100644 index 000000000..09de74ab4 --- /dev/null +++ b/docs/docs/architecture/cortex-db.md @@ -0,0 +1,3 @@ +--- +title: cortex.db +--- \ No newline at end of file diff --git a/docs/docs/basic-usage/cortexrc.mdx b/docs/docs/architecture/cortexrc.mdx similarity index 100% rename from docs/docs/basic-usage/cortexrc.mdx rename to docs/docs/architecture/cortexrc.mdx diff --git a/docs/docs/data-folder.mdx b/docs/docs/architecture/data-folder.mdx similarity index 100% rename from docs/docs/data-folder.mdx rename to docs/docs/architecture/data-folder.mdx diff --git a/docs/docs/assistants/index.md b/docs/docs/assistants/index.md new file mode 100644 index 000000000..d38b33e52 --- /dev/null +++ b/docs/docs/assistants/index.md @@ -0,0 +1,3 @@ +--- +title: Assistants +--- \ No newline at end of file diff --git a/docs/docs/assistants/tools/index.md b/docs/docs/assistants/tools/index.md new file mode 100644 index 000000000..9f8badb32 --- /dev/null +++ b/docs/docs/assistants/tools/index.md @@ -0,0 +1,3 @@ +--- +title: Tools +--- \ No newline at end of file diff --git a/docs/docs/basic-usage/server.mdx 
b/docs/docs/basic-usage/api-server.mdx similarity index 90% rename from docs/docs/basic-usage/server.mdx rename to docs/docs/basic-usage/api-server.mdx index 69203b2e6..1003fff1f 100644 --- a/docs/docs/basic-usage/server.mdx +++ b/docs/docs/basic-usage/api-server.mdx @@ -1,16 +1,11 @@ --- -title: API +title: API Server description: Cortex Server Overview. -slug: "server" --- import Tabs from "@theme/Tabs"; import TabItem from "@theme/TabItem"; -:::warning -🚧 Cortex.cpp is currently under development. Our documentation outlines the intended behavior of Cortex, which may not yet be fully implemented in the codebase. -::: - Cortex has an [API server](https://cortex.so/api-reference) that runs at `localhost:39281`. diff --git a/docs/docs/basic-usage/command-line.md b/docs/docs/basic-usage/command-line.md deleted file mode 100644 index f48a0b94c..000000000 --- a/docs/docs/basic-usage/command-line.md +++ /dev/null @@ -1,48 +0,0 @@ ---- -title: Command Line Interface -description: Cortex CLI Overview. -slug: "command-line" ---- - -:::warning -🚧 Cortex.cpp is currently under development. Our documentation outlines the intended behavior of Cortex, which may not yet be fully implemented in the codebase. -::: - -Cortex has a [Docker](https://docs.docker.com/engine/reference/commandline/cli/) and [Ollama](https://ollama.com/)-inspired [CLI syntax](/docs/cli) for running model operations. - -## How It Works -Cortex’s CLI invokes the Cortex Engine’s API, which runs in the background on port `39281`. - - -## Basic Usage -### [Start Cortex Server](/docs/cli) -```bash -# By default the server will be started on port `39281` -cortex -``` -### [Run Model](/docs/cli/run) -Cortex supports these [Built-in Models](/models) -```bash -# Pull and start a model -cortex run -``` -### [Chat with Model](/docs/cli/chat) -```bash -# chat with a model -cortex chat -``` -### [Show the Model State](/docs/cli/ps) -```bash -# Show a model and cortex system status -cortex ps -``` -### [Stop Model](/docs/cli/stop) -```bash -# Stop a model -cortex stop -``` -### [Pull Model](/docs/cli/pull) -```bash -# Pull a model -cortex pull -``` diff --git a/docs/docs/basic-usage/integration/js-library.md b/docs/docs/basic-usage/cortex-js.md similarity index 79% rename from docs/docs/basic-usage/integration/js-library.md rename to docs/docs/basic-usage/cortex-js.md index e2d83fcdd..4e5a4a774 100644 --- a/docs/docs/basic-usage/integration/js-library.md +++ b/docs/docs/basic-usage/cortex-js.md @@ -1,9 +1,18 @@ --- title: cortex.js -description: How to integrate cortex.js with a Typescript application. -slug: "ts-library" +description: How to use the Cortex.js Library --- +[Cortex.js](https://github.com/janhq/cortex.js) is a Typescript client library that can be used to interact with the Cortex API. + +This is still a work in progress, and we will let the community know once a stable version is available. + +:::warning +🚧 Cortex.js is currently under development, and this page is a stub for future development. +::: + + + diff --git a/docs/docs/basic-usage/integration/py-library.md b/docs/docs/basic-usage/cortex-py.md similarity index 91% rename from docs/docs/basic-usage/integration/py-library.md rename to docs/docs/basic-usage/cortex-py.md index 3e126d068..4ff1504d8 100644 --- a/docs/docs/basic-usage/integration/py-library.md +++ b/docs/docs/basic-usage/cortex-py.md @@ -1,9 +1,15 @@ --- title: cortex.py description: How to integrate cortex.py with a Python application. 
-slug: "py-library" --- + +:::warning +🚧 Cortex.py is currently under development, and this page is a stub for future development. +::: + + + diff --git a/docs/docs/basic-usage/overview.mdx b/docs/docs/basic-usage/index.mdx similarity index 98% rename from docs/docs/basic-usage/overview.mdx rename to docs/docs/basic-usage/index.mdx index 107746845..93baed257 100644 --- a/docs/docs/basic-usage/overview.mdx +++ b/docs/docs/basic-usage/index.mdx @@ -1,6 +1,6 @@ --- title: Overview -description: Overview. +description: Cortex Overview slug: "basic-usage" --- diff --git a/docs/docs/built-in-models.mdx b/docs/docs/built-in-models.mdx deleted file mode 100644 index 836c2d874..000000000 --- a/docs/docs/built-in-models.mdx +++ /dev/null @@ -1,54 +0,0 @@ ---- -title: Built-in Models -description: Cortex Curated Models ---- - -import Tabs from "@theme/Tabs"; -import TabItem from "@theme/TabItem"; - - -:::warning -🚧 Cortex.cpp is currently under development. Our documentation outlines the intended behavior of Cortex, which may not yet be fully implemented in the codebase. -::: - -Cortex.cpp maintains a collection of built-in models that cover the most popular open-source models. - -## Cortex Model Repos -Built-in models are [Cortex Model Repositories](/docs/hub/cortex-hub) hosted on HuggingFace and pre-compiled for different engines, allowing one model to have multiple branches in various formats. - -## Built-in Model Variants -Built-in models are made available across the following variants: - -- **By format**: `gguf`, `onnx`, and `tensorrt-llm` -- **By Size**: `7b`, `13b`, and more. -- **By quantizations**: `q4`, `q8`, and more. -:::info -You can see our full list of Built-in Models [here](/models). -::: -### Run Model - -Built-in models can be run via Docker-like syntax: - -```bash -# Run a model -cortex run model-id -# Run a model variant -cortex run model-id:branch -``` -For example: - -```bash -# Run Mistral Built-in Model -cortex pull mistral -# Run Mistral in GGUF format -cortex pull mistral:gguf -# Run Mistral in TensorRT-LLM format -cortex engines tensorrt-llm init -cortex pull mistral:7b-tensorrt-llm -# Run Mistral in ONNX format -cortex engines onnx init -cortex pull mistral:onnx -# Run Mistral with a different size -cortex pull mistral:7b-gguf - -``` \ No newline at end of file diff --git a/docs/docs/capabilities/audio-generation.md b/docs/docs/capabilities/audio-generation.md new file mode 100644 index 000000000..355f25d6d --- /dev/null +++ b/docs/docs/capabilities/audio-generation.md @@ -0,0 +1,3 @@ +--- +unlisted: true +--- \ No newline at end of file diff --git a/docs/docs/capabilities/embeddings.md b/docs/docs/capabilities/embeddings.md new file mode 100644 index 000000000..2c2fb4d54 --- /dev/null +++ b/docs/docs/capabilities/embeddings.md @@ -0,0 +1,7 @@ +--- +title: Embeddings +--- + +:::info +🚧 Cortex is currently under development, and this page is a stub for future development. +::: \ No newline at end of file diff --git a/docs/docs/capabilities/hardware/index.md b/docs/docs/capabilities/hardware/index.md new file mode 100644 index 000000000..6b0a66657 --- /dev/null +++ b/docs/docs/capabilities/hardware/index.md @@ -0,0 +1,37 @@ +--- +title: Hardware Awareness +draft: True +--- + +# Hardware Awareness + +Cortex is designed to be Hardware Aware, meaning it can detect your hardware configuration and automatically set parameters to optimize performance and avoid hardware-related errors. 
+ +## Hardware Awareness + +Cortex's Hardware awareness allows it to do the following: + +- Context Length Optimization +- Context Length Optimization: Cortex maximizes the context length allowed by your hardware, ensuring that you can work with larger datasets and more complex models without performance degradation. +- Preventing hardware-related errors +- Error Handling for Insufficient VRAM: When loading a second model, Cortex provides useful error messages if there is insufficient VRAM memory. This proactive approach helps prevent out-of-memory errors and guides users on how to resolve the issue. + +### Model Compatibility + +- Model Compatibility Detection: Cortex automatically detects your hardware configuration to determine the compatibility of different models. This ensures that the models you use are optimized for your specific hardware setup. +- This is for the Hub, and for existing Models + +## Hardware Management + +### Activating Specific GPUs + +Cortex gives you the ability to activate specific GPUs for inference, giving you fine-grained control over hardware resources. This is especially useful for multi-GPU systems. +- Activate GPUs: Cortex can activate and utilize GPUs to accelerate processing, ensuring that computationally intensive tasks are handled efficiently. +You also have the option to deactivate all GPUs, to run inference on only CPU and RAM. + +### Hardware Monitoring + +- Monitoring System Usage +- Monitor VRAM Usage: Cortex keeps track of VRAM usage to prevent out-of-memory (OOM) errors. It ensures that VRAM is used efficiently and provides warnings when resources are running low. +- Monitor System Resource Usage: Cortex continuously monitors the usage of system resources, including CPU, RAM, and GPUs. This helps in maintaining optimal performance and identifying potential bottlenecks. + diff --git a/docs/docs/capabilities/image-generation.md b/docs/docs/capabilities/image-generation.md new file mode 100644 index 000000000..355f25d6d --- /dev/null +++ b/docs/docs/capabilities/image-generation.md @@ -0,0 +1,3 @@ +--- +unlisted: true +--- \ No newline at end of file diff --git a/docs/docs/model-overview.mdx b/docs/docs/capabilities/models/index.mdx similarity index 100% rename from docs/docs/model-overview.mdx rename to docs/docs/capabilities/models/index.mdx diff --git a/docs/docs/model-yaml.mdx b/docs/docs/capabilities/models/model-yaml.mdx similarity index 99% rename from docs/docs/model-yaml.mdx rename to docs/docs/capabilities/models/model-yaml.mdx index 53a25a770..602ba40cd 100644 --- a/docs/docs/model-yaml.mdx +++ b/docs/docs/capabilities/models/model-yaml.mdx @@ -6,7 +6,6 @@ description: The model.yaml import Tabs from "@theme/Tabs"; import TabItem from "@theme/TabItem"; - :::warning 🚧 Cortex.cpp is currently under development. Our documentation outlines the intended behavior of Cortex, which may not yet be fully implemented in the codebase. ::: @@ -71,7 +70,7 @@ ngl: 33 # Undefined = loaded from model The `model.yaml` is composed of three high-level sections: -### Cortex Meta +### Model Metadata ```yaml model: gemma-2-9b-it-Q8_0 name: Llama 3.1 diff --git a/docs/docs/model-presets.mdx b/docs/docs/capabilities/models/presets.mdx similarity index 98% rename from docs/docs/model-presets.mdx rename to docs/docs/capabilities/models/presets.mdx index d4196e146..799cf6cbc 100644 --- a/docs/docs/model-presets.mdx +++ b/docs/docs/capabilities/models/presets.mdx @@ -7,6 +7,7 @@ 🚧 Cortex.cpp is currently under development.
Our documentation outlines the intended behavior of Cortex, which may not yet be fully implemented in the codebase. ::: + \ No newline at end of file diff --git a/docs/docs/capabilities/moderation.md b/docs/docs/capabilities/moderation.md new file mode 100644 index 000000000..355f25d6d --- /dev/null +++ b/docs/docs/capabilities/moderation.md @@ -0,0 +1,3 @@ +--- +unlisted: true +--- \ No newline at end of file diff --git a/docs/docs/capabilities/reasoning.md b/docs/docs/capabilities/reasoning.md new file mode 100644 index 000000000..355f25d6d --- /dev/null +++ b/docs/docs/capabilities/reasoning.md @@ -0,0 +1,3 @@ +--- +unlisted: true +--- \ No newline at end of file diff --git a/docs/docs/capabilities/speech-to-text.md b/docs/docs/capabilities/speech-to-text.md new file mode 100644 index 000000000..355f25d6d --- /dev/null +++ b/docs/docs/capabilities/speech-to-text.md @@ -0,0 +1,3 @@ +--- +unlisted: true +--- \ No newline at end of file diff --git a/docs/docs/capabilities/text-generation.md b/docs/docs/capabilities/text-generation.md new file mode 100644 index 000000000..680625667 --- /dev/null +++ b/docs/docs/capabilities/text-generation.md @@ -0,0 +1,7 @@ +--- +title: Text Generation +--- + +:::info +🚧 Cortex is currently under development, and this page is a stub for future development. +::: \ No newline at end of file diff --git a/docs/docs/capabilities/text-to-speech.md b/docs/docs/capabilities/text-to-speech.md new file mode 100644 index 000000000..355f25d6d --- /dev/null +++ b/docs/docs/capabilities/text-to-speech.md @@ -0,0 +1,3 @@ +--- +unlisted: true +--- \ No newline at end of file diff --git a/docs/docs/capabilities/vision.md b/docs/docs/capabilities/vision.md new file mode 100644 index 000000000..355f25d6d --- /dev/null +++ b/docs/docs/capabilities/vision.md @@ -0,0 +1,3 @@ +--- +unlisted: true +--- \ No newline at end of file diff --git a/docs/docs/chat-completions.mdx b/docs/docs/chat-completions.mdx index c39f25877..9b1dce01d 100644 --- a/docs/docs/chat-completions.mdx +++ b/docs/docs/chat-completions.mdx @@ -1,7 +1,6 @@ --- title: Chat Completions -description: Chat Completions Feature. -slug: "text-generation" +description: Chat Completions Feature --- import Tabs from "@theme/Tabs"; diff --git a/docs/docs/integrate-remote-engine.mdx b/docs/docs/engines/engine-extension.mdx similarity index 94% rename from docs/docs/integrate-remote-engine.mdx rename to docs/docs/engines/engine-extension.mdx index b32fcc635..8a62cd813 100644 --- a/docs/docs/integrate-remote-engine.mdx +++ b/docs/docs/engines/engine-extension.mdx @@ -1,8 +1,13 @@ --- -title: Integrate Remote Engine -description: How to integrate remote engine into Cortex. +title: Building Engine Extensions +description: Cortex supports Engine Extensions to integrate both local inference engines and remote APIs. --- +:::info +🚧 Cortex is currently under development, and this page is a stub for future development. +::: + + diff --git a/docs/docs/engines/onnx.mdx b/docs/docs/engines/onnx.mdx index d4e999406..9a5e0092a 100644 --- a/docs/docs/engines/onnx.mdx +++ b/docs/docs/engines/onnx.mdx @@ -1,6 +1,7 @@ --- title: ONNX -description: ONNX Model Format. +description: ONNX Model Format +unlisted: true --- :::warning diff --git a/docs/docs/engines/tensorrt-llm.mdx b/docs/docs/engines/tensorrt-llm.mdx index 0cfe7d483..b03765b75 100644 --- a/docs/docs/engines/tensorrt-llm.mdx +++ b/docs/docs/engines/tensorrt-llm.mdx @@ -1,6 +1,7 @@ --- title: TensorRT-LLM -description: TensorRT-LLM Model Format.
+description: TensorRT-LLM Model Format +unlisted: true --- :::warning diff --git a/docs/docs/guides/function-calling.md b/docs/docs/guides/function-calling.md new file mode 100644 index 000000000..40a708675 --- /dev/null +++ b/docs/docs/guides/function-calling.md @@ -0,0 +1,3 @@ +--- +title: Function Calling +--- \ No newline at end of file diff --git a/docs/docs/guides/structured-outputs.md b/docs/docs/guides/structured-outputs.md new file mode 100644 index 000000000..b14739ab2 --- /dev/null +++ b/docs/docs/guides/structured-outputs.md @@ -0,0 +1,3 @@ +--- +title: Structured Outputs +--- \ No newline at end of file diff --git a/docs/docs/installation/gpu-acceleration.mdx b/docs/docs/installation/gpu-acceleration.mdx deleted file mode 100644 index ff57a714f..000000000 --- a/docs/docs/installation/gpu-acceleration.mdx +++ /dev/null @@ -1,8 +0,0 @@ ---- -title: GPU Acceleration -description: GPU Acceleration. ---- - -:::warning -🚧 Cortex.cpp is currently under development. Our documentation outlines the intended behavior of Cortex, which may not yet be fully implemented in the codebase. -::: \ No newline at end of file diff --git a/docs/docusaurus.config.ts b/docs/docusaurus.config.ts index 32b32ab99..659e155d7 100644 --- a/docs/docusaurus.config.ts +++ b/docs/docusaurus.config.ts @@ -410,7 +410,6 @@ const config: Config = { items: [ { to: "/models", label: "Models", position: "left" }, { to: "/changelog", label: "Changelog", position: "left" }, - { to: "/contact", label: "Enterprise", position: "left" }, { type: "doc", position: "right", diff --git a/docs/sidebars.ts b/docs/sidebars.ts index 09f2fb298..95dbda3fa 100644 --- a/docs/sidebars.ts +++ b/docs/sidebars.ts @@ -43,269 +43,120 @@ const sidebars: SidebarsConfig = { { type: "doc", id: "installation/mac", label: "Mac" }, { type: "doc", id: "installation/linux", label: "Linux" }, { type: "doc", id: "installation/docker", label: "Docker" }, - { - type: "doc", - id: "installation/gpu-acceleration", - label: "GPU Acceleration", - }, ], }, - { - type: "html", - value: "BASIC USAGE", - - className: "sidebar-divider", - }, - { type: "doc", id: "basic-usage/overview", label: "Overview" }, - { type: "doc", id: "basic-usage/cortexrc", label: ".cortexrc" }, - { type: "doc", id: "model-yaml", label: "model.yaml" }, - { type: "doc", id: "data-folder", label: "Data Folder" }, { type: "category", - label: "Libraries", - link: { - type: "generated-index", - }, + label: "Basic Usage", + link: { type: "doc", id: "basic-usage/index" }, collapsed: true, items: [ + { type: "doc", id: "basic-usage/api-server", label: "API Server" }, { type: "doc", - id: "basic-usage/integration/js-library", + id: "basic-usage/cortex-js", label: "cortex.js", }, { type: "doc", - id: "basic-usage/integration/py-library", + id: "basic-usage/cortex-py", label: "cortex.py", }, ], }, { type: "category", - label: "Model Sources", + label: "Architecture", + link: { + type: "generated-index", + }, + collapsed: true, + items: [ + { type: "doc", id: "architecture/data-folder", label: "Cortex Data Folder" }, + { type: "doc", id: "architecture/cortex-db", label: "cortex.db" }, + { type: "doc", id: "architecture/cortexrc", label: ".cortexrc" }, + ] + }, + { + type: "html", + value: "CAPABILITIES", + className: "sidebar-divider", + }, + { + type: "category", + label: "Pulling Models", link: { type: "doc", id: "hub/index" }, collapsed: true, items: [ - { type: "doc", id: "hub/cortex-hub", label: "Cortex Model Repos" }, - { type: "doc", id: "hub/hugging-face", label: "HuggingFace Repos" }, - 
{ - type: "doc", - id: "hub/nvidia-ngc", - label: "Nvidia Catalog (Coming Soon)", - }, + { type: "doc", id: "hub/hugging-face", label: "Adding a HF Organization" }, + // { type: "doc", id: "hub/cortex-hub", label: "Cortex Model Repos" }, + // { + // type: "doc", + // id: "hub/nvidia-ngc", + // label: "Nvidia Catalog (Coming Soon)", + // }, ], }, { type: "category", - label: "Engines", + label: "Running Models", + link: { type: "doc", id: "capabilities/models/index"}, + collapsed: true, + items: [ + { type: "doc", id: "capabilities/models/model-yaml", label: "model.yaml" }, + { type: "doc", id: "capabilities/models/presets", label: "Model Presets" }, + ], + }, + { + type: "category", + label: "Engine Management", link: { type: "doc", id: "engines/index" }, collapsed: true, items: [ - { type: "doc", id: "engines/llamacpp", label: "Llama.cpp" }, + { type: "doc", id: "engines/llamacpp", label: "llama.cpp" }, // { type: "doc", id: "engines/tensorrt-llm", label: "TensorRT-LLM" }, // { type: "doc", id: "engines/onnx", label: "ONNX" }, + { type: "doc", id: "engines/engine-extension", label: "Building Engine Extensions" }, + ], }, - // { - // type: "category", - // label: "Basic Usage", - // link: { - // type: "generated-index", - // }, - // collapsed: true, - // items: [ - // { type: "doc", id: "basic-usage/command-line", label: "CLI" }, - // { type: "doc", id: "basic-usage/server", label: "API" }, - // { - // type: "category", - // label: "Integration", - // link: { - // type: "generated-index", - // }, - // collapsed: true, - // items: [ - // { - // type: "doc", - // id: "basic-usage/integration/js-library", - // label: "cortex.js", - // }, - // { - // type: "doc", - // id: "basic-usage/integration/py-library", - // label: "cortex.py", - // }, - // ], - // }, - // ], - // }, - // { type: "doc", id: "telemetry", label: "Telemetry" }, - // MODELs - // { - // type: "html", - // value: "MODELS", - // className: "sidebar-divider", - // }, - // { type: "doc", id: "model-overview", label: "Overview" }, - // { type: "doc", id: "model-yaml", label: "model.yaml" }, - // { type: "doc", id: "built-in-models", label: "Built-in Models" }, - // { - // type: "category", - // label: "Using Models", - // link: { type: "doc", id: "using-models" }, - // collapsed: true, - // items: [ - // { type: "doc", id: "model-yaml", label: "model.yaml" }, - // // { type: "doc", id: "model-presets", label: "Model Presets" }, - // { type: "doc", id: "built-in-models", label: "Built-in Models" }, - // ], - // }, - // BASIC USAGE - // { - // type: "html", - // value: "BASIC USAGE", - // className: "sidebar-divider", - // }, - // { type: "doc", id: "command-line", label: "CLI" }, - // { type: "doc", id: "ts-library", label: "Typescript Library" }, - // { type: "doc", id: "py-library", label: "Python Library" }, - // { type: "doc", id: "server", label: "Server Endpoint" }, - // CAPABILITIES - // { - // type: "html", - // value: "ENDPOINTS", - // className: "sidebar-divider", - // }, - // { type: "doc", id: "chat-completions", label: "Chat Completions" }, - // { type: "doc", id: "embeddings", label: "Embeddings" }, - // CLI { - type: "html", - value: "CLI", - className: "sidebar-divider", - }, - { type: "doc", id: "cli/cortex", label: "cortex" }, - { type: "doc", id: "cli/start", label: "cortex start" }, - { type: "doc", id: "cli/chat", label: "cortex chat" }, - { type: "doc", id: "cli/embeddings", label: "cortex embeddings" }, - // { type: "doc", id: "cli/presets", label: "cortex presets" }, - { type: "doc", id: "cli/pull", label: 
"cortex pull" }, - { type: "doc", id: "cli/run", label: "cortex run" }, - { type: "doc", id: "cli/models/index", label: "cortex models" }, - { type: "doc", id: "cli/engines/index", label: "cortex engines" }, - { type: "doc", id: "cli/stop", label: "cortex stop" }, - { type: "doc", id: "cli/ps", label: "cortex ps" }, - { type: "doc", id: "cli/update", label: "cortex update" }, - // { type: "doc", id: "cli/telemetry", label: "cortex telemetry" }, - // { type: "doc", id: "cli/benchmark", label: "cortex benchmark" }, - // ARCHITECTURE - // { - // type: "html", - // value: "ARCHITECTURE", - // className: "sidebar-divider", - // }, - // { type: "doc", id: "architecture", label: "Cortex" }, - // { - // type: "category", - // label: "Engines", - // link: { - // type: "generated-index", - // }, - // collapsed: true, - // items: [ - // { type: "doc", id: "cortex-llamacpp", label: "llama.cpp" }, - // { type: "doc", id: "cortex-tensorrt-llm", label: "TensorRT-LLM" }, - // { type: "doc", id: "cortex-onnx", label: "ONNX" }, - // { - // type: "doc", - // id: "integrate-remote-engine", - // label: "Integrate Remote Engine", - // }, - // ], - // }, - // { - // type: "category", - // label: "Infrastructure", - // link: { - // type: "generated-index", - // }, - // collapsed: true, - // items: [ - // { type: "doc", id: "telemetry-architecture", label: "Telemetry Infra" }, - // { - // type: "doc", - // id: "benchmarking-architecture", - // label: "Benchmarking Infra", - // }, - // ], - // }, - // { - // type: "html", - // value: "TROUBLESHOOTING", - // className: "sidebar-divider", - // }, - // { type: "doc", id: "troubleshooting", label: "Troubleshooting" }, - ], - platform: [ - { - type: "html", - value: - '', - }, - { - type: "html", - value: - '', + type: "category", + label: "Hardware Management", + link: { type: "doc", id: "capabilities/hardware/index" }, + collapsed: true, + items: [ + ], }, + { type: "doc", id: "capabilities/text-generation", label: "Text Generation" }, + // { type: "doc", id: "capabilities/image-generation", label: "Image Generation" }, + // { type: "doc", id: "capabilities/vision", label: "Vision" }, + // { type: "doc", id: "capabilities/audio-generation", label: "Audio Generation" }, + // { type: "doc", id: "capabilities/text-to-speech", label: "Text to Speech" }, + // { type: "doc", id: "capabilities/speech-to-text", label: "Speech to text" }, + { type: "doc", id: "capabilities/embeddings", label: "Embeddings" }, + // { type: "doc", id: "capabilities/moderation", label: "Moderation" }, + // { type: "doc", id: "capabilities/reasoning", label: "Reasoning" }, { type: "html", - value: "GET STARTED", + value: "GUIDES", className: "sidebar-divider", }, - "cortex-platform/about", + { type: "doc", id: "guides/function-calling", label: "Function Calling"}, + { type: "doc", id: "guides/structured-outputs", label: "Structured Outputs"}, { type: "html", - value: "ENDPOINTS", + value: "ASSISTANTS", className: "sidebar-divider", }, - { type: "doc", id: "cortex-platform/benchmarking", label: "Benchmarking" }, - { - type: "html", - value: "ARCHITECTURE", - className: "sidebar-divider", - }, - { type: "doc", id: "architecture", label: "Cortex" }, - { - type: "category", - label: "Engines", - link: { - type: "generated-index", - }, - collapsed: true, - items: [ - { type: "doc", id: "cortex-llamacpp", label: "llama.cpp" }, - { type: "doc", id: "cortex-tensorrt-llm", label: "TensorRT-LLM" }, - { type: "doc", id: "cortex-onnx", label: "ONNX" }, - { - type: "doc", - id: "integrate-remote-engine", - 
label: "Integrate Remote Engine", - }, - ], - }, + { type: "doc", id: "assistants/index", label: "Assistants"}, { type: "category", - label: "Infrastructure", - link: { - type: "generated-index", - }, + label: "Tools", + link: { type: "doc", id: "assistants/tools/index" }, collapsed: true, items: [ - { type: "doc", id: "telemetry-architecture", label: "Telemetry Infra" }, - { - type: "doc", - id: "benchmarking-architecture", - label: "Benchmarking Infra", - }, + // { type: "doc", id: "assistants/tools/file-search", label: "File Search" }, ], }, { @@ -313,19 +164,19 @@ const sidebars: SidebarsConfig = { value: "CLI", className: "sidebar-divider", }, - // { type: "doc", id: "cli/cortex", label: "cortex" }, - // { type: "doc", id: "cli/chat", label: "cortex chat" }, - // { type: "doc", id: "cli/embeddings", label: "cortex embeddings" }, - { type: "doc", id: "cli/presets", label: "cortex presets" }, - // { type: "doc", id: "cli/pull", label: "cortex pull" }, - // { type: "doc", id: "cli/run", label: "cortex run" }, - // { type: "doc", id: "cli/models/index", label: "cortex models" }, - // { type: "doc", id: "cli/engines/index", label: "cortex engines" }, - // { type: "doc", id: "cli/stop", label: "cortex stop" }, - // { type: "doc", id: "cli/ps", label: "cortex ps" }, - // { type: "doc", id: "cli/telemetry", label: "cortex telemetry" }, - { type: "doc", id: "cli/benchmark", label: "cortex benchmark" }, - ], + { type: "doc", id: "cli/cortex", label: "cortex" }, + { type: "doc", id: "cli/start", label: "cortex start" }, + { type: "doc", id: "cli/chat", label: "cortex chat" }, + { type: "doc", id: "cli/embeddings", label: "cortex embeddings" }, + // { type: "doc", id: "cli/presets", label: "cortex presets" }, + { type: "doc", id: "cli/pull", label: "cortex pull" }, + { type: "doc", id: "cli/run", label: "cortex run" }, + { type: "doc", id: "cli/models/index", label: "cortex models" }, + { type: "doc", id: "cli/engines/index", label: "cortex engines" }, + { type: "doc", id: "cli/stop", label: "cortex stop" }, + { type: "doc", id: "cli/ps", label: "cortex ps" }, + { type: "doc", id: "cli/update", label: "cortex update" }, + ] }; export default sidebars; From 4d59e9292778c4200740a99879ab019aa8fc86d8 Mon Sep 17 00:00:00 2001 From: Daniel <101145494+dan-jan@users.noreply.github.com> Date: Sat, 2 Nov 2024 15:35:06 +0800 Subject: [PATCH 08/13] Update sidebars and OpenAPI spec --- docs/sidebars.ts | 2 +- docs/static/openapi/cortex.json | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/sidebars.ts b/docs/sidebars.ts index 95dbda3fa..7f6010171 100644 --- a/docs/sidebars.ts +++ b/docs/sidebars.ts @@ -122,7 +122,7 @@ const sidebars: SidebarsConfig = { }, { type: "category", - label: "Hardware Management", + label: "Hardware Awareness", link: { type: "doc", id: "capabilities/hardware/index" }, collapsed: true, items: [ diff --git a/docs/static/openapi/cortex.json b/docs/static/openapi/cortex.json index 0f715456d..32d1fc02e 100644 --- a/docs/static/openapi/cortex.json +++ b/docs/static/openapi/cortex.json @@ -172,7 +172,7 @@ } } }, - "tags": ["Inference"] + "tags": ["Chat"] } }, "/v1/models/pull": { @@ -1190,7 +1190,7 @@ }, "tags": [ { - "name": "Inference", + "name": "Chat", "description": "This endpoint initiates interaction with a Large Language Models (LLM)." 
}, { @@ -1222,7 +1222,7 @@ { "name": "CORTEX", "tags": [ - "Inference", + "Chat", "Engines", "Events", "Models", From d303c729ce03fe3f967f9aa094c4792785840f2e Mon Sep 17 00:00:00 2001 From: Daniel <101145494+dan-jan@users.noreply.github.com> Date: Sat, 2 Nov 2024 16:45:33 +0800 Subject: [PATCH 09/13] Scaffold key points for Hardware Awareness --- docs/docs/capabilities/hardware/index.md | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/docs/docs/capabilities/hardware/index.md b/docs/docs/capabilities/hardware/index.md index 6b0a66657..da7bf2d4c 100644 --- a/docs/docs/capabilities/hardware/index.md +++ b/docs/docs/capabilities/hardware/index.md @@ -7,13 +7,16 @@ draft: True Cortex is designed to be Hardware Aware, meaning it can detect your hardware configuration and automatically set parameters to optimize performance and avoid hardware-related errors. -## Hardware Awareness +## Hardware Optimization Cortex's Hardware awareness allows it to do the following: -- Context Length Optimization - Context Length Optimization: Cortex maximizes the context length allowed by your hardware, ensuring that you can work with larger datasets and more complex models without performance degradation. -- Preventing hardware-related errors +- Engine Optimization: we detect your CPU and GPU, and maintain a list of optimized engines for each hardware configuration, e.g. taking advantage of AVX2 and AVX-512 instructions on CPUs. + +## Hardware Awareness + +- Preventing hardware-related errors - Error Handling for Insufficient VRAM: When loading a second model, Cortex provides useful error messages if there is insufficient VRAM memory. This proactive approach helps prevent out-of-memory errors and guides users on how to resolve the issue. ### Model Compatibility @@ -34,4 +37,3 @@ You also have the option to deactivate all GPUs, to run inference on only CPU an - Monitoring System Usage - Monitor VRAM Usage: Cortex keeps track of VRAM usage to prevent out-of-memory (OOM) errors. It ensures that VRAM is used efficiently and provides warnings when resources are running low. - Monitor System Resource Usage: Cortex continuously monitors the usage of system resources, including CPU, RAM, and GPUs. This helps in maintaining optimal performance and identifying potential bottlenecks. - From 5d8157b067bfdf47a10781ebf161de7b82c053c8 Mon Sep 17 00:00:00 2001 From: Daniel <101145494+dan-jan@users.noreply.github.com> Date: Sat, 2 Nov 2024 17:31:54 +0800 Subject: [PATCH 10/13] Remove model.list --- docs/docs/capabilities/hardware/index.md | 2 +- docs/docs/capabilities/models/model-yaml.mdx | 11 +---------- 2 files changed, 2 insertions(+), 11 deletions(-) diff --git a/docs/docs/capabilities/hardware/index.md b/docs/docs/capabilities/hardware/index.md index da7bf2d4c..acf190ecc 100644 --- a/docs/docs/capabilities/hardware/index.md +++ b/docs/docs/capabilities/hardware/index.md @@ -5,7 +5,7 @@ draft: True # Hardware Awareness -Cortex is designed to be Hardware Aware, meaning it can detect your hardware configuration and automatically set parameters to optimize performance and avoid hardware-related errors. +Cortex is designed to be hardware aware, meaning it can detect your hardware configuration and automatically set parameters to optimize compatibility and performance, and avoid hardware-related errors.
## Hardware Optimization diff --git a/docs/docs/capabilities/models/model-yaml.mdx b/docs/docs/capabilities/models/model-yaml.mdx index 602ba40cd..983f0f528 100644 --- a/docs/docs/capabilities/models/model-yaml.mdx +++ b/docs/docs/capabilities/models/model-yaml.mdx @@ -12,17 +12,8 @@ import TabItem from "@theme/TabItem"; Cortex.cpp uses a `model.yaml` file to specify the configuration for running a model. Models can be downloaded from the Cortex Model Hub or Hugging Face repositories. Once downloaded, the model data is parsed and stored in the `models` folder. -## `model.list` -The `model.list` file acts as a registry for all model files used by Cortex.cpp. It keeps track of every downloaded and imported model by listing their details in a structured format. Each time a model is downloaded or imported, Cortex.cpp will automatically append an entry to `model.list` with the following format: -``` -# Downloaded model - - -# Imported model - local imported +## Structure of `model.yaml` -``` -## `model.yaml` High Level Structure Here is an example of `model.yaml` format: ```yaml # BEGIN GENERAL METADATA From ca6ef40dfed92f468ce2b592797578fb4c489130 Mon Sep 17 00:00:00 2001 From: Daniel <101145494+dan-jan@users.noreply.github.com> Date: Sat, 2 Nov 2024 17:57:15 +0800 Subject: [PATCH 11/13] Update links --- docs/docs/architecture/data-folder.mdx | 2 +- docs/docs/capabilities/models/index.mdx | 4 ++-- docs/docs/engines/llamacpp.mdx | 2 +- docs/docs/engines/onnx.mdx | 2 +- docs/docs/engines/tensorrt-llm.mdx | 2 +- docs/docs/quickstart.mdx | 2 +- 6 files changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/docs/architecture/data-folder.mdx b/docs/docs/architecture/data-folder.mdx index 7acfbd361..cda2a4402 100644 --- a/docs/docs/architecture/data-folder.mdx +++ b/docs/docs/architecture/data-folder.mdx @@ -132,7 +132,7 @@ The main directory that stores all Cortex-related files, located in the user's h #### `models/` Contains the AI models used by Cortex for processing and generating responses. :::info -For more information regarding the `model.list` and `model.yaml`, please see [here](/docs/model-yaml). +For more information regarding the `model.list` and `model.yaml`, please see [here](/docs/capabilities/models/model-yaml). ::: #### `logs/` Stores log files that are essential for troubleshooting and monitoring the performance of the Cortex.cpp API server and CLI. diff --git a/docs/docs/capabilities/models/index.mdx b/docs/docs/capabilities/models/index.mdx index 0eecc9ee4..b6f4b9036 100644 --- a/docs/docs/capabilities/models/index.mdx +++ b/docs/docs/capabilities/models/index.mdx @@ -20,7 +20,7 @@ Cortex.cpp supports three model formats: - TensorRT-LLM :::info -For details on each format, see the [Model Formats](/docs/model-yaml#model-formats) page. +For details on each format, see the [Model Formats](/docs/capabilities/models/model-yaml#model-formats) page. ::: ## Built-in Models @@ -38,5 +38,5 @@ You can see our full list of Built-in Models [here](/models). ::: ## Next steps -- Cortex requires a `model.yaml` file to run a model. Find out more [here](/docs/model-yaml). +- Cortex requires a `model.yaml` file to run a model. Find out more [here](/docs/capabilities/models/model-yaml). - Cortex supports multiple model hubs hosting built-in models. See details [here](/docs/model-sources). 
\ No newline at end of file diff --git a/docs/docs/engines/llamacpp.mdx b/docs/docs/engines/llamacpp.mdx index f65c15473..c550e2e92 100644 --- a/docs/docs/engines/llamacpp.mdx +++ b/docs/docs/engines/llamacpp.mdx @@ -13,7 +13,7 @@ Cortex uses `llama.cpp` as the default engine by default the `GGUF` format is su Cortex automatically generates any `GGUF` model from the HuggingFace repo that does not have the `model.yaml` file. ::: -## [`model.yaml`](/docs/model-yaml) Sample +## [`model.yaml`](/docs/capabilities/models/model-yaml) Sample ```yaml ## BEGIN GENERAL GGUF METADATA id: Mistral-Nemo-Instruct-2407 # Model ID unique between models (author / quantization) diff --git a/docs/docs/engines/onnx.mdx b/docs/docs/engines/onnx.mdx index 9a5e0092a..7110007d7 100644 --- a/docs/docs/engines/onnx.mdx +++ b/docs/docs/engines/onnx.mdx @@ -18,7 +18,7 @@ cortex engines onnx init ## Run an ONNX model cortex run openhermes-2.5:7b-onnx ``` -## [`model.yaml`](/docs/model-yaml) Sample +## [`model.yaml`](/docs/capabilities/models/model-yaml) Sample ```yaml name: openhermes-2.5 model: openhermes diff --git a/docs/docs/engines/tensorrt-llm.mdx b/docs/docs/engines/tensorrt-llm.mdx index b03765b75..1a06b0a86 100644 --- a/docs/docs/engines/tensorrt-llm.mdx +++ b/docs/docs/engines/tensorrt-llm.mdx @@ -18,7 +18,7 @@ cortex engines tensorrt-llm init ## Run a TensorRT-LLM model cortex run openhermes-2.5:7b-tensorrt-llm ``` -## [`model.yaml`](/docs/model-yaml) Sample +## [`model.yaml`](/docs/capabilities/models/model-yaml) Sample ```yaml name: Openhermes-2.5 7b Linux Ada model: openhermes-2.5:7B-tensorrt-llm diff --git a/docs/docs/quickstart.mdx b/docs/docs/quickstart.mdx index 687707c66..293eff271 100644 --- a/docs/docs/quickstart.mdx +++ b/docs/docs/quickstart.mdx @@ -173,4 +173,4 @@ Now that Cortex.cpp is set up, here are the next steps to explore: 1. Adjust the folder path and configuration using the [`.cortexrc`](/docs/basic-usage/cortexrc) file. 2. Explore the Cortex.cpp [data folder](/docs/data-folder) to understand how it stores data. -3. Learn about the structure of the [`model.yaml`](/docs/model-yaml) file in Cortex.cpp. +3. Learn about the structure of the [`model.yaml`](/docs/capabilities/models/model-yaml) file in Cortex.cpp. From 58be664b265261a7e74f4c93f649feb453cf1a72 Mon Sep 17 00:00:00 2001 From: Daniel <101145494+dan-jan@users.noreply.github.com> Date: Sat, 2 Nov 2024 18:01:39 +0800 Subject: [PATCH 12/13] Update broken links --- docs/docs/quickstart.mdx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/docs/quickstart.mdx b/docs/docs/quickstart.mdx index 293eff271..2ebf53c7b 100644 --- a/docs/docs/quickstart.mdx +++ b/docs/docs/quickstart.mdx @@ -171,6 +171,6 @@ This command starts the Cortex.cpp API server at `localhost:39281`. ## What's Next? Now that Cortex.cpp is set up, here are the next steps to explore: -1. Adjust the folder path and configuration using the [`.cortexrc`](/docs/basic-usage/cortexrc) file. -2. Explore the Cortex.cpp [data folder](/docs/data-folder) to understand how it stores data. +1. Adjust the folder path and configuration using the [`.cortexrc`](/docs/architecture/cortexrc) file. +2. Explore the Cortex.cpp [data folder](/docs/architecture/data-folder) to understand how it stores data. 3. Learn about the structure of the [`model.yaml`](/docs/capabilities/models/model-yaml) file in Cortex.cpp. 
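Patches 11 and 12 above update the moved doc links by hand; a quick grep over the docs tree can confirm no stale pre-refactor paths slipped through. A sketch, assuming the `docs/docs` layout shown in these diffs:

```bash
# Hypothetical spot-check: list any files still pointing at the old, pre-refactor doc paths.
grep -rn -e "/docs/model-yaml" -e "/docs/data-folder" -e "/docs/basic-usage/cortexrc" docs/docs docs/static
```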
From 76d653f4c3a0f79526c5d06e44beec56a82ce5ba Mon Sep 17 00:00:00 2001 From: vansangpfiev Date: Mon, 4 Nov 2024 08:02:26 +0700 Subject: [PATCH 13/13] fix: server uploads to temporary directory (#1610) * fix: server uploads to temporary directory * fix: format * fix: start/stop server * fix: postinstall * fix: stop server linux windows * fix: build windows --- engine/main.cc | 3 +++ engine/templates/linux/postinst | 1 + engine/templates/macos/postinstall | 6 ++++++ engine/templates/windows/installer-beta.iss | 6 +++++- engine/templates/windows/installer-nightly.iss | 6 +++++- engine/templates/windows/installer.iss | 6 +++++- 6 files changed, 25 insertions(+), 3 deletions(-) diff --git a/engine/main.cc b/engine/main.cc index 1e97384c8..afe843817 100644 --- a/engine/main.cc +++ b/engine/main.cc @@ -109,6 +109,9 @@ void RunServer(std::optional<int> port) { drogon::app().registerController(pm_ctl); drogon::app().registerController(server_ctl); + auto upload_path = std::filesystem::temp_directory_path() / "cortex-uploads"; + drogon::app().setUploadPath(upload_path.string()); + LOG_INFO << "Server started, listening at: " << config.apiServerHost << ":" << config.apiServerPort; LOG_INFO << "Please load your model"; diff --git a/engine/templates/linux/postinst b/engine/templates/linux/postinst index 401b9f4b8..583639b5e 100644 --- a/engine/templates/linux/postinst +++ b/engine/templates/linux/postinst @@ -9,3 +9,4 @@ fi USER_TO_RUN_AS=${SUDO_USER:-$(whoami)} echo "Download cortex.llamacpp engines by default for user $USER_TO_RUN_AS" sudo -u $USER_TO_RUN_AS env PATH=$PATH:/usr/lib/wsl/lib /usr/bin/$DESTINATION_BINARY_NAME engines install llama-cpp +sudo -u $USER_TO_RUN_AS env PATH=$PATH:/usr/lib/wsl/lib /usr/bin/$DESTINATION_BINARY_NAME stop diff --git a/engine/templates/macos/postinstall b/engine/templates/macos/postinstall index 6d06c673d..38310a452 100644 --- a/engine/templates/macos/postinstall +++ b/engine/templates/macos/postinstall @@ -11,9 +11,15 @@ fi USER_TO_RUN_AS=$(stat -f "%Su" /dev/console) +echo "Start server before downloading engines for user $USER_TO_RUN_AS" +sudo -u $USER_TO_RUN_AS /usr/local/bin/$DESTINATION_BINARY_NAME start + echo "Download cortex.llamacpp engines by default for user $USER_TO_RUN_AS" sudo -u $USER_TO_RUN_AS /usr/local/bin/$DESTINATION_BINARY_NAME engines install llama-cpp +echo "Stop server" +sudo -u $USER_TO_RUN_AS /usr/local/bin/$DESTINATION_BINARY_NAME stop + sudo chown -R $USER_TO_RUN_AS:staff "/Users/$USER_TO_RUN_AS/$DATA_FOLDER_NAME" sudo chown $USER_TO_RUN_AS:staff "/Users/$USER_TO_RUN_AS/$CONFIGURATION_FILE_NAME" diff --git a/engine/templates/windows/installer-beta.iss b/engine/templates/windows/installer-beta.iss index 7bb937d11..b8524afd0 100644 --- a/engine/templates/windows/installer-beta.iss +++ b/engine/templates/windows/installer-beta.iss @@ -36,7 +36,7 @@ Filename: "{app}\cortex-beta.exe"; Parameters: "stop"; StatusMsg: "Stopping cort procedure AddToUserPathAndInstallEngines; var ExpandedAppDir: String; - CmdLine, CortexInstallCmd: String; + CmdLine, CortexInstallCmd, CortexStopServerCmd: String; ResultCode: Integer; i: Integer; SkipPostInstall: Boolean; @@ -84,6 +84,10 @@ begin CortexInstallCmd := Format('"%s\cortex-beta.exe" engines install llama-cpp', [ExpandedAppDir]); Exec('cmd.exe', '/C ' + CortexInstallCmd, '', SW_HIDE, ewWaitUntilTerminated, ResultCode); + // Stop server + CortexStopServerCmd := Format('"%s\cortex-beta.exe" stop', [ExpandedAppDir]); + Exec('cmd.exe', '/C ' + CortexStopServerCmd, '', SW_HIDE, ewWaitUntilTerminated, ResultCode); + // Set the progress bar to 90% after downloading the engine WizardForm.ProgressGauge.Position := 90; WizardForm.ProgressGauge.Update; diff --git a/engine/templates/windows/installer-nightly.iss b/engine/templates/windows/installer-nightly.iss index 21a681234..0cf5616fc 100644 --- a/engine/templates/windows/installer-nightly.iss +++ b/engine/templates/windows/installer-nightly.iss @@ -36,7 +36,7 @@ Filename: "{app}\cortex-nightly.exe"; Parameters: "stop"; StatusMsg: "Stopping c procedure AddToUserPathAndInstallEngines; var ExpandedAppDir: String; - CmdLine, CortexInstallCmd: String; + CmdLine, CortexInstallCmd, CortexStopServerCmd: String; ResultCode: Integer; i: Integer; SkipPostInstall: Boolean; @@ -84,6 +84,10 @@ begin CortexInstallCmd := Format('"%s\cortex-nightly.exe" engines install llama-cpp', [ExpandedAppDir]); Exec('cmd.exe', '/C ' + CortexInstallCmd, '', SW_HIDE, ewWaitUntilTerminated, ResultCode); + // Stop server + CortexStopServerCmd := Format('"%s\cortex-nightly.exe" stop', [ExpandedAppDir]); + Exec('cmd.exe', '/C ' + CortexStopServerCmd, '', SW_HIDE, ewWaitUntilTerminated, ResultCode); + // Set the progress bar to 90% after downloading the engine WizardForm.ProgressGauge.Position := 90; WizardForm.ProgressGauge.Update; diff --git a/engine/templates/windows/installer.iss b/engine/templates/windows/installer.iss index 5bbd83ec7..440d6bf93 100644 --- a/engine/templates/windows/installer.iss +++ b/engine/templates/windows/installer.iss @@ -36,7 +36,7 @@ Filename: "{app}\cortex.exe"; Parameters: "stop"; StatusMsg: "Stopping cortexcpp procedure AddToUserPathAndInstallEngines; var ExpandedAppDir: String; - CmdLine, CortexInstallCmd: String; + CmdLine, CortexInstallCmd, CortexStopServerCmd: String; ResultCode: Integer; i: Integer; SkipPostInstall: Boolean; @@ -84,6 +84,10 @@ begin CortexInstallCmd := Format('"%s\cortex.exe" engines install llama-cpp', [ExpandedAppDir]); Exec('cmd.exe', '/C ' + CortexInstallCmd, '', SW_HIDE, ewWaitUntilTerminated, ResultCode); + // Stop server + CortexStopServerCmd := Format('"%s\cortex.exe" stop', [ExpandedAppDir]); + Exec('cmd.exe', '/C ' + CortexStopServerCmd, '', SW_HIDE, ewWaitUntilTerminated, ResultCode); + // Set the progress bar to 90% after downloading the engine WizardForm.ProgressGauge.Position := 90; WizardForm.ProgressGauge.Update;
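Across platforms, the installer changes in this last patch converge on the same three-step sequence. A minimal manual equivalent, assuming the installed binary is on `PATH` (the name varies by channel: `cortex`, `cortex-beta`, or `cortex-nightly`):

```bash
# Start the API server so the engine download has something to talk to
cortex start

# Download the default llama.cpp engine (what each installer does post-install)
cortex engines install llama-cpp

# Stop the server again so the installer can exit cleanly
cortex stop
```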