Skip to content

Commit e2e1ddb

Browse files
matteoserva, matteo, and ngxson
authored
server : Prefilling assistant message in openai compatible API (#13174)
* Prefilling assistant message in openai compatible API
* fixed indentation
* fixed code convention
* simplify method usage
* no more than one assistant message at end of messages
* merge checks into prefill code
* Update examples/server/utils.hpp

---------

Co-authored-by: matteo <[email protected]>
Co-authored-by: Xuan-Son Nguyen <[email protected]>
1 parent d9d398f commit e2e1ddb

File tree

1 file changed

+22
-0
lines changed

1 file changed

+22
-0
lines changed

examples/server/utils.hpp

+22
Original file line number | Diff line number | Diff line change
@@ -642,9 +642,31 @@ static json oaicompat_completion_params_parse(
642642
throw std::runtime_error("Cannot use custom grammar constraints with tools.");
643643
}
644644

645+
// if the assistant message appears at the end of list, we do not add end-of-turn token
646+
// for ex. this can be useful to modify the reasoning process in reasoning models
647+
bool prefill_assistant_message = !inputs.messages.empty() && inputs.messages.back().role == "assistant";
648+
common_chat_msg last_message;
649+
if (prefill_assistant_message) {
650+
last_message = inputs.messages.back();
651+
inputs.messages.pop_back();
652+
653+
/* sanity check, max one assistant message at the end of the list */
654+
if (!inputs.messages.empty() && inputs.messages.back().role == "assistant"){
655+
throw std::runtime_error("Cannot have 2 or more assistant messages at the end of the list.");
656+
}
657+
658+
inputs.extract_reasoning = false;
659+
inputs.add_generation_prompt = true;
660+
}
661+
645662
// Apply chat template to the list of messages
646663
auto chat_params = common_chat_templates_apply(tmpls, inputs);
647664

665+
/* Append assistant prefilled message */
666+
if (prefill_assistant_message) {
667+
chat_params.prompt += last_message.content;
668+
}
669+
648670
llama_params["chat_format"] = static_cast<int>(chat_params.format);
649671
llama_params["prompt"] = chat_params.prompt;
650672
if (!chat_params.grammar.empty()) {

0 commit comments

Comments
 (0)