Skip to content

Commit

Permalink
Merge branch 'master' into use-continuos-batching-by-default
Browse files Browse the repository at this point in the history
  • Loading branch information
ilya-lavrenov authored Dec 26, 2024
2 parents e517917 + e8db2ef commit 17d4333
Show file tree
Hide file tree
Showing 20 changed files with 1,462 additions and 1,452 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/linux.yml
Original file line number Diff line number Diff line change
Expand Up @@ -270,7 +270,7 @@ jobs:
- name: 'Whisper'
cmd: 'tests/python_tests/test_whisper_generate_api.py'
- name: 'LLM & VLM'
cmd: 'tests/python_tests --ignore tests/python_tests/test_whisper_generate_api.py -k "not Qwen2-0.5B-Instruct"' # Skip failed tests Qwen2-0.5B-Instruct
cmd: 'tests/python_tests --ignore tests/python_tests/test_whisper_generate_api.py'
defaults:
run:
shell: bash
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/mac.yml
Original file line number Diff line number Diff line change
Expand Up @@ -225,7 +225,7 @@ jobs:
run: |
source ${OV_INSTALL_DIR}/setupvars.sh
python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${OV_INSTALL_DIR}/wheels
python -m pytest -v ./tests/python_tests/test_chat_generate_api.py::test_set_chat_template
python -m pytest -v ./tests/python_tests/test_tokenizer.py::test_set_chat_template
env:
PYTHONPATH: "./build/:$PYTHONPATH"

Expand Down
2 changes: 1 addition & 1 deletion samples/export-requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ optimum-intel @ git+https://github.com/huggingface/optimum-intel.git
numpy<2.0.0; sys_platform == 'darwin'
einops==0.8.0 # For Qwen
transformers_stream_generator==0.0.5 # For Qwen
diffusers==0.31.0 # For image generation pipelines
diffusers==0.32.1 # For image generation pipelines
timm==1.0.12 # For exporting InternVL2
torchvision # For visual language models
transformers>=4.43 # For Whisper
14 changes: 8 additions & 6 deletions src/cpp/include/openvino/genai/generation_config.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,10 @@ enum class StopCriteria { EARLY, HEURISTIC, NEVER };
* @param logprobs number of top logprobs computed for each position, if set to 0, logprobs are not computed and value 0.0 is returned.
* Currently only single top logprob can be returned, so any logprobs > 1 is treated as logprobs == 1. (default: 0).
*
* @param repetition_penalty the parameter for repetition penalty. 1.0 means no penalty.
* @param presence_penalty reduces absolute log prob if the token was generated at least once.
* @param frequency_penalty reduces absolute log prob as many times as the token was generated.
*
* Beam search specific parameters:
* @param num_beams number of beams for beam search. 1 disables beam search.
* @param num_beam_groups number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams.
Expand All @@ -61,15 +65,13 @@ enum class StopCriteria { EARLY, HEURISTIC, NEVER };
* "HEURISTIC" is applied and the generation stops when is it very unlikely to find better candidates;
* "NEVER", where the beam search procedure only stops when there cannot be better candidates (canonical beam search algorithm).
*
* Random sampling parameters:
* Random (or multinomial) sampling parameters:
* @param do_sample whether or not to use multinomial random sampling.
* @param temperature the value used to modulate token probabilities for random sampling.
* @param top_p - if set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation.
* @param top_k the number of highest probability vocabulary tokens to keep for top-k-filtering.
* @param do_sample whether or not to use multinomial random sampling.
* @param repetition_penalty the parameter for repetition penalty. 1.0 means no penalty.
* @param presence_penalty reduces absolute log prob if the token was generated at least once.
* @param frequency_penalty reduces absolute log prob as many times as the token was generated.
* @param rng_seed initializes random generator.
* @param num_return_sequences the number of sequences to generate from a single prompt.
*
* Assisting generation parameters:
* @param assistant_confidence_threshold the lower token probability of candidate to be validated by main model in case of dynamic strategy candidates number update.
Expand All @@ -90,7 +92,7 @@ class OPENVINO_GENAI_EXPORTS GenerationConfig {
size_t min_new_tokens = 0;
bool echo = false;
size_t logprobs = 0;

std::set<std::string> stop_strings;
// Default setting in vLLM (and OpenAI API) is not to include stop string in the output
bool include_stop_str_in_output = false;
Expand Down
30 changes: 15 additions & 15 deletions src/cpp/include/openvino/genai/tokenizer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,9 @@ class OPENVINO_GENAI_EXPORTS Tokenizer {

/**
* @brief ov::genai::Tokenizer constructor to initialize directly from model and weights
*
* This constructor is used when tokenizer and detokenizer are separate models already loaded into memory.
* When this constructor is used bos, eos, pad token ids are expected to be in IR.
*
* This constructor is used when tokenizer and detokenizer are separate models already loaded into memory.
* When this constructor is used bos, eos, pad token ids are expected to be in IR.
* If an IR is older (< 2024.3) then these tokens are default initialized to be ignored.
* @param tokenizer_model_str tokenizer model string
* @param tokenizer_weights_tensor ov::Tensor with tokenizer weights
Expand All @@ -55,9 +55,9 @@ class OPENVINO_GENAI_EXPORTS Tokenizer {
);

/**
* @brief ov::genai::Tokenizer constructor to initialize directly from model and weights.
*
* This constructor is used when tokenizer (or detokenizer) already loaded into memory. Whether it's
* @brief ov::genai::Tokenizer constructor to initialize directly from model and weights.
*
* This constructor is used when tokenizer (or detokenizer) is already loaded into memory. Whether it's
* a tokenizer or a detokenizer is determined from the model input signature. When this constructor is used bos, eos, pad token ids
* are expected to be in IR. If an IR is older (< 2024.3) then these tokens are default initialized to be ignored.
* @param model_str model string
Expand All @@ -82,7 +82,7 @@ class OPENVINO_GENAI_EXPORTS Tokenizer {
ov::Tensor& detokenizer_weights_tensor,
Properties&&... properties
) : Tokenizer(tokenizer_model_str, tokenizer_weights_tensor, detokenizer_model_str, detokenizer_weights_tensor, ov::AnyMap{std::forward<Properties>(properties)...}) { }

/**
* @brief ov::genai::Tokenizer constructor with variable number of properties
* @param model_str model string
Expand All @@ -93,7 +93,7 @@ class OPENVINO_GENAI_EXPORTS Tokenizer {
Tokenizer(const std::string& model_str, ov::Tensor& weights_tensor,
Properties&&... properties)
: Tokenizer(model_str, weights_tensor, ov::AnyMap{std::forward<Properties>(properties)...}) { }

/**
* @brief ov::genai::Tokenizer constructor with variable number of properties
* @param tokenizer_path openvino_tokenizer.xml and openvino_detokenizer.xml should be located in the tokenizer_path
Expand All @@ -111,7 +111,7 @@ class OPENVINO_GENAI_EXPORTS Tokenizer {
* @return pair of [input_ids, attention_mask]
*/
TokenizedInputs encode(const std::string prompt, const ov::AnyMap& tokenization_params = {});

/**
* @brief encode batch of prompts. Left padding will be applied by default
* @param prompts vector storing batch of prompts
Expand All @@ -127,7 +127,7 @@ class OPENVINO_GENAI_EXPORTS Tokenizer {
* @param prompt std::string with input prompt
* @param properties tokenization properties, e.g. ov::genai::add_special_tokens(false)
* @return pair of [input_ids, attention_mask]
*/
*/
template <typename... Properties>
util::EnableIfAllStringAny<TokenizedInputs, Properties...> encode(std::string& prompt, Properties&&... properties) {
return encode(prompt, AnyMap{std::forward<Properties>(properties)...});
Expand Down Expand Up @@ -164,7 +164,7 @@ class OPENVINO_GENAI_EXPORTS Tokenizer {
}

/**
* @brief decode tokens.
* @brief decode tokens.
* @param tokens ov::Tensor with tokens with shape [batch_size, seq_len]
* @param detokenization_params AnyMap with detokenization parameters, e.g. {"skip_special_tokens", false}
* @return vector of std::string, with size = batch_size
Expand All @@ -183,7 +183,7 @@ class OPENVINO_GENAI_EXPORTS Tokenizer {
}

/**
* @brief batched decoding of tokens.
* @brief batched decoding of tokens.
* @param tokens vector of vectors with tokens, tokens.size() is equal to batch_size
* @param detokenization_params AnyMap with detokenization parameters, e.g. {"skip_special_tokens", false}
* @return vector of std::string, with size equal to batch_size
Expand All @@ -203,8 +203,8 @@ class OPENVINO_GENAI_EXPORTS Tokenizer {

/**
* @brief Embeds input prompts with special tags for a chat scenario.
*
* For example, for Qwen family models, the prompt "1+1=" would be transformed into
*
* For example, for Qwen family models, the prompt "1+1=" would be transformed into
* <|im_start|>user\n1+1=<|im_end|>\n<|im_start|>assistant\n.
*
* @param history A vector of maps, with chat history, e.g. [{"role": "user", "content": "prompt"}, ...].
Expand All @@ -214,7 +214,7 @@ class OPENVINO_GENAI_EXPORTS Tokenizer {
* @throws Exception if the chat template was unable to parse the input history.
*/
std::string apply_chat_template(ChatHistory history,
bool add_generation_prompt,
bool add_generation_prompt,
const std::string& chat_template = {}) const;

/// @brief Override a chat_template read from tokenizer_config.json.
Expand Down
3 changes: 3 additions & 0 deletions src/cpp/src/generation_config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,9 @@ void GenerationConfig::validate() const {
"Either 'eos_token_id', or 'max_new_tokens', or 'max_length' should be defined.");
if (is_beam_search()) {
OPENVINO_ASSERT(no_repeat_ngram_size > 0, "no_repeat_ngram_size must be positive");
if (num_beam_groups > 1) {
OPENVINO_ASSERT(diversity_penalty != 0.0f, "For grouped beam search 'diversity_penalty' should not be zero, it it fallbacks to non-grouped beam search");
}
} else {
OPENVINO_ASSERT(frequency_penalty >= -2.0f && frequency_penalty <= 2.0f, "frequence_penalty penalty must be a [-2; +2]");
OPENVINO_ASSERT(presence_penalty >= -2.0f && presence_penalty <= 2.0f, "presence_penalty penalty must be a [-2; +2]");
Expand Down
48 changes: 25 additions & 23 deletions src/cpp/src/tokenizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -89,15 +89,16 @@ class Tokenizer::TokenizerImpl {
public:
ov::CompiledModel m_tokenizer;
ov::CompiledModel m_detokenizer;

std::unique_ptr<CircularBufferQueue<ov::InferRequest>> m_ireq_queue_tokenizer;
std::unique_ptr<CircularBufferQueue<ov::InferRequest>> m_ireq_queue_detokenizer;
// To change the adding special tokens mode we use a statefull subgraph,

// To change the adding special tokens mode we use a stateful subgraph,
// this flag holds the current state value of the CompiledModel.
bool m_add_special_tokens = true;
bool m_skip_special_tokens = true;
bool m_older_than_24_5 = false;

int64_t m_pad_token_id = -1;
int64_t m_bos_token_id = -1;
int64_t m_eos_token_id = -1;
Expand All @@ -111,6 +112,7 @@ class Tokenizer::TokenizerImpl {
void set_state_if_necessary(CircularBufferQueueElementGuard<ov::InferRequest>& infer_request_guard, const ov::AnyMap& params) {
bool add_special_tokens_flag = m_add_special_tokens;
bool skip_special_tokens_flag = m_skip_special_tokens;

ov::genai::utils::read_anymap_param(params, add_special_tokens.name(), add_special_tokens_flag);
ov::genai::utils::read_anymap_param(params, skip_special_tokens.name(), skip_special_tokens_flag);

Expand All @@ -126,11 +128,11 @@ class Tokenizer::TokenizerImpl {
// state but the effect is incorrect.
return;
}

// add_special_tokens is managed by Select op with a bool input.
ov::Tensor add_special_tensor = ov::Tensor(ov::element::boolean, {});
*add_special_tensor.data<bool>() = add_special_tokens_flag;

// skip_special_tokens is managed by multiplication with a number, therefore i32.
ov::Tensor skip_special_tensor = ov::Tensor(ov::element::i32, {1});
*skip_special_tensor.data<int>() = skip_special_tokens_flag;
Expand All @@ -148,32 +150,32 @@ class Tokenizer::TokenizerImpl {

TokenizerImpl() = default;

TokenizerImpl(const std::filesystem::path& models_papth, const ov::AnyMap& properties) {
setupTokenizer(models_papth, properties);
TokenizerImpl(const std::filesystem::path& models_path, const ov::AnyMap& properties) {
setup_tokenizer(models_path, properties);
}

TokenizerImpl(const std::pair<std::shared_ptr<ov::Model>, std::shared_ptr<ov::Model>>& models, const ov::AnyMap& properties) {
setupTokenizer(models, properties);
setup_tokenizer(models, properties);
}

void setupTokenizer(const std::filesystem::path& models_path, const ov::AnyMap& properties) {
void setup_tokenizer(const std::filesystem::path& models_path, const ov::AnyMap& properties) {
ScopedVar env_manager(tokenizers_relative_to_genai().string());
auto core = get_core_singleton();

OPENVINO_ASSERT(models_path.extension() != ".xml", "'models_papth' parameter should be a path to a dir not a xml file");
OPENVINO_ASSERT(models_path.extension() != ".xml", "'models_path' parameter should be a path to a dir not a xml file");

std::shared_ptr<ov::Model> ov_tokenizer = nullptr;
std::shared_ptr<ov::Model> ov_detokenizer = nullptr;

if (std::filesystem::exists(models_path / "openvino_tokenizer.xml")) {
ov_tokenizer = core.read_model(models_path / "openvino_tokenizer.xml");
}

if (std::filesystem::exists(models_path / "openvino_detokenizer.xml")) {
ov_detokenizer = core.read_model(models_path / "openvino_detokenizer.xml");
}

setupTokenizer(std::make_pair(ov_tokenizer, ov_detokenizer), properties);
setup_tokenizer(std::make_pair(ov_tokenizer, ov_detokenizer), properties);

// If special tokens were not found from IR, try to read them from config.
// This will be triggered only for IRs older than 2024.3.
Expand All @@ -184,21 +186,20 @@ class Tokenizer::TokenizerImpl {
// Try to read tokenizer_config if some token ids or token str are not defined.
read_tokenizer_config_if_necessary(models_path);
}

// If chat_template was not found in IR, try to read it from config.
if (m_chat_template.empty()) {
m_chat_template = chat_template_from_tokenizer_json_if_exists(models_path);
}
}


void setupTokenizer(const std::pair<std::shared_ptr<ov::Model>, std::shared_ptr<ov::Model>>& models, const ov::AnyMap& properties) {
void setup_tokenizer(const std::pair<std::shared_ptr<ov::Model>, std::shared_ptr<ov::Model>>& models, const ov::AnyMap& properties) {
auto [ov_tokenizer, ov_detokenizer] = models;
OPENVINO_ASSERT(ov_tokenizer || ov_detokenizer, "Neither tokenizer nor detokenzier models were provided");

auto core = get_core_singleton();
std::string device = "CPU"; // only CPU is supported for now

std::string version_str;
utils::read_rt_info(ov_tokenizer != nullptr ? ov_tokenizer: ov_detokenizer , "openvino_tokenizers_version", version_str);
// Saving IR version was added only in 24.5, so if it's empty, then it's older than 24.5
Expand Down Expand Up @@ -231,7 +232,7 @@ class Tokenizer::TokenizerImpl {
return std::move(this->m_detokenizer.create_infer_request());
});
}

// Initialize tokenizer's cache to save time later.
if (m_tokenizer) {
// TODO CVS-150630: Empty strings sporadically can fail, therefore use nonempty string for warmup.
Expand Down Expand Up @@ -286,10 +287,11 @@ class Tokenizer::TokenizerImpl {

nlohmann::json data = nlohmann::json::parse(f);

using ov::genai::utils::read_json_param;
// they are in the format {"bos_token": { "content": "<s>",... }}
auto read_token_content_str = [&data](std::string key_name, std::string& val) {
if (val == "" && data.contains(key_name)) { read_json_param(data[key_name], "content", val); }
auto read_token_content_str = [&data](const std::string& key_name, std::string& val) {
if (val.empty() && data.contains(key_name)) {
utils::read_json_param(data[key_name], "content", val);
}
};
read_token_content_str(pad_token_key_name, m_pad_token);
read_token_content_str(bos_token_key_name, m_bos_token);
Expand Down Expand Up @@ -494,7 +496,7 @@ class Tokenizer::TokenizerImpl {
{"is none", "is undefined"},
{"= none", "= undefined"},
// Jinja2Cpp does not support Python-style slicing, e.g. [1:].
// If chat template contains such slicing, we replace it with
// If chat template contains such slicing, we replace it with
// a placeholder at the moment.
{"messages[1:]", "slice(messages, 1)"},
};
Expand Down Expand Up @@ -537,7 +539,7 @@ class Tokenizer::TokenizerImpl {
env.GetSettings().trimBlocks = true;
jinja2::Template tpl(&env);
tpl.Load(chat_tpl);

jinja2::UserCallable slice_callable = jinja2::MakeCallable(
[](const jinja2::GenericList& messages, const size_t& start) {
jinja2::ValuesList result;
Expand Down Expand Up @@ -607,7 +609,7 @@ Tokenizer::Tokenizer(const std::string& model_str, ov::Tensor& weights_tensor, c
ScopedVar env_manager(tokenizers_relative_to_genai().string());
auto core = get_core_singleton();
auto model = core.read_model(model_str, weights_tensor);

auto parameters = model->get_parameters();
OPENVINO_ASSERT(!parameters.empty());
if (parameters.front()->get_element_type() == ov::element::string) {
Expand Down
Loading

0 comments on commit 17d4333

Please sign in to comment.