diff --git a/src/cpp/include/openvino/genai/llm_pipeline.hpp b/src/cpp/include/openvino/genai/llm_pipeline.hpp
index 47c7fcec85..c16d0ffde4 100644
--- a/src/cpp/include/openvino/genai/llm_pipeline.hpp
+++ b/src/cpp/include/openvino/genai/llm_pipeline.hpp
@@ -82,9 +82,8 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline {
     * @param streamer optional streamer
     * @return std::string decoded resulting text
     */
-    std::string generate(std::string text, OptionalGenerationConfig generation_config, OptionalStreamerVariant streamer);
+    std::string generate(std::string text, OptionalGenerationConfig generation_config=nullopt, OptionalStreamerVariant streamer=nullopt);
-
     template <typename... Properties>
     util::EnableIfAllStringAny<std::string, Properties...> generate(
         std::string text,
@@ -124,8 +123,8 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline {
     */
     EncodedResults generate(ov::Tensor input_ids,
                             std::optional<ov::Tensor> attention_mask,
-                            OptionalGenerationConfig generation_config,
-                            OptionalStreamerVariant streamer);
+                            OptionalGenerationConfig generation_config=nullopt,
+                            OptionalStreamerVariant streamer=nullopt);

     template <typename... Properties>
     util::EnableIfAllStringAny<std::string, Properties...> operator()(
@@ -133,13 +132,12 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline {
         Properties&&... properties) {
         return generate(text, AnyMap{std::forward<Properties>(properties)...});
     }
-    std::string operator()(std::string text, OptionalGenerationConfig generation_config={});
-    DecodedResults operator()(std::vector<std::string> text, OptionalGenerationConfig generation_config);
-    DecodedResults operator()(std::initializer_list<std::string> text, OptionalGenerationConfig generation_config);
+    DecodedResults operator()(std::vector<std::string> text, OptionalGenerationConfig generation_config=nullopt);
+    DecodedResults operator()(std::initializer_list<std::string> text, OptionalGenerationConfig generation_config=nullopt);

     // generate with streamers
-    std::string operator()(std::string text, OptionalGenerationConfig generation_config, OptionalStreamerVariant streamer);
+    std::string operator()(std::string text, OptionalGenerationConfig generation_config=nullopt, OptionalStreamerVariant streamer=nullopt);
     std::string operator()(std::string text, OptionalStreamerVariant streamer);

     ov::Tokenizer get_tokenizer();
@@ -162,10 +160,33 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline {
 * All names match to names in config except streamer.
 */
 static constexpr ov::Property<size_t> max_new_tokens{"max_new_tokens"};
+static constexpr ov::Property<size_t> max_length{"max_length"};
+static constexpr ov::Property<bool> ignore_eos{"ignore_eos"};
+
+static constexpr ov::Property<size_t> num_beam_groups{"num_beam_groups"};
+static constexpr ov::Property<size_t> num_beams{"num_beams"};
+static constexpr ov::Property<float> diversity_penalty{"diversity_penalty"};
+static constexpr ov::Property<float> length_penalty{"length_penalty"};
+static constexpr ov::Property<size_t> num_return_sequences{"num_return_sequences"};
+static constexpr ov::Property<size_t> no_repeat_ngram_size{"no_repeat_ngram_size"};
+static constexpr ov::Property<StopCriteria> stop_criteria{"stop_criteria"};
+
 static constexpr ov::Property<float> temperature{"temperature"};
+static constexpr ov::Property<float> top_p{"top_p"};
+static constexpr ov::Property<size_t> top_k{"top_k"};
+static constexpr ov::Property<bool> do_sample{"do_sample"};
+static constexpr ov::Property<float> repetition_penalty{"repetition_penalty"};
+
+
+static constexpr ov::Property<int64_t> pad_token_id{"pad_token_id"};
+static constexpr ov::Property<int64_t> bos_token_id{"bos_token_id"};
+static constexpr ov::Property<int64_t> eos_token_id{"eos_token_id"};
+
+static constexpr ov::Property<std::string> bos_token{"bos_token"};
+static constexpr ov::Property<std::string> eos_token{"eos_token"};

-// It's problematic to store and automaticall convert std::variant in AnyMap
-static constexpr ov::Property<std::function<void(std::string)>> streamer_lambda{"streamer_lambda"};
-static constexpr ov::Property<std::shared_ptr<StreamerBase>> streamer{"streamer"};
+// only a lambda streamer can be set via the ov::streamer_lambda(...) syntactic sugar,
+// because std::variant<std::function<void(std::string)>, std::shared_ptr<StreamerBase>> cannot be stored in an AnyMap
+static constexpr ov::Property<std::function<void(std::string)>> streamer_lambda{"streamer"};

 } // namespace ov
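Usage sketch (illustrative only, not part of this patch): the defaulted arguments and the properties declared above allow a one-call generation; the model directory and prompt below are placeholders, and ov::streamer_lambda stores the callback under the "streamer" AnyMap key that llm_pipeline.cpp reads back out.

    #include "openvino/genai/llm_pipeline.hpp"
    #include <iostream>

    int main() {
        // "TinyLlama-1.1B-Chat-v1.0" is a placeholder model directory, not something this patch ships
        ov::LLMPipeline pipe("TinyLlama-1.1B-Chat-v1.0", "CPU");
        std::string result = pipe.generate(
            "The Sun is yellow because",
            ov::max_new_tokens(100),
            ov::do_sample(false),
            // the lambda is forwarded into the AnyMap overload and picked up as the streamer
            ov::streamer_lambda([](std::string subword) { std::cout << subword << std::flush; }));
        std::cout << result << std::endl;
        return 0;
    }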
diff --git a/src/cpp/include/openvino/genai/streamer_base.hpp b/src/cpp/include/openvino/genai/streamer_base.hpp
index 0d32f9fcda..3f0879d702 100644
--- a/src/cpp/include/openvino/genai/streamer_base.hpp
+++ b/src/cpp/include/openvino/genai/streamer_base.hpp
@@ -7,10 +7,21 @@
 namespace ov {

+/**
+ * @brief base class for streamers. In order to use it, inherit from this class and implement the put() and end() methods
+ *
+ * @param m_tokenizer tokenizer
+*/
 class StreamerBase {
 public:
+    Tokenizer m_tokenizer;
+    StreamerBase(Tokenizer tokenizer): m_tokenizer(tokenizer) {};
+    StreamerBase() = default;
+
+    /// @brief put is called every time a new token is decoded
     virtual void put(int64_t token) = 0;
-
+
+    /// @brief end is called at the end of generation. It can be used to flush the cache if your own streamer has one
     virtual void end() = 0;
 };

diff --git a/src/cpp/include/openvino/genai/tokenizer.hpp b/src/cpp/include/openvino/genai/tokenizer.hpp
index 54e11eaf9f..0d55d9b0fe 100644
--- a/src/cpp/include/openvino/genai/tokenizer.hpp
+++ b/src/cpp/include/openvino/genai/tokenizer.hpp
@@ -12,7 +12,7 @@
 namespace ov {

 /**
-* @brief class used to encode prompts and decode resulting tokens
+* @brief class is used to encode prompts and decode resulting tokens
 */
 class OPENVINO_GENAI_EXPORTS Tokenizer {
 public:
@@ -27,7 +27,7 @@ class OPENVINO_GENAI_EXPORTS Tokenizer {
     * @brief encode a single prompt
     * @return pair of [input_ids, attention_mask]
     */
-    std::pair<ov::Tensor, ov::Tensor> encode(const std::string prompt);  // todo: passing by reference fails
+    std::pair<ov::Tensor, ov::Tensor> encode(const std::string prompt);

     /**
     * @brief encode batch of prompts. Left padding will be applied by default
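Sketch of a custom streamer (illustrative only, not part of this patch): with the Tokenizer now stored on StreamerBase, a derived streamer only has to implement put() and end(); the class and member names below are made up for the example.

    #include "openvino/genai/streamer_base.hpp"
    #include <iostream>
    #include <vector>

    class AccumulatingStreamer : public ov::StreamerBase {
    public:
        explicit AccumulatingStreamer(ov::Tokenizer tokenizer) : ov::StreamerBase{tokenizer} {}

        // put() is called for every newly decoded token id
        void put(int64_t token) override {
            m_tokens_cache.push_back(token);
        }

        // end() is called once generation finishes; flush whatever is cached
        void end() override {
            std::cout << m_tokenizer.decode(m_tokens_cache) << std::endl;
            m_tokens_cache.clear();
        }

    private:
        std::vector<int64_t> m_tokens_cache;
    };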
diff --git a/src/cpp/src/greedy_decoding.cpp b/src/cpp/src/greedy_decoding.cpp
index 7ea134d736..7e437ad281 100644
--- a/src/cpp/src/greedy_decoding.cpp
+++ b/src/cpp/src/greedy_decoding.cpp
@@ -170,6 +170,8 @@ ov::EncodedResults greedy_decoding(ov::InferRequest& m_model_runner,
         if (!generation_config.ignore_eos && all_are_eos)
             break;
     }
+    if (streamer)
+        streamer->end();
     return results;
 }

diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp
index 5ac804ade9..2e4c49337a 100644
--- a/src/cpp/src/llm_pipeline.cpp
+++ b/src/cpp/src/llm_pipeline.cpp
@@ -186,11 +186,6 @@ ov::DecodedResults ov::LLMPipeline::LLMPipelineImpl::generate(std::vector<std::string> texts, OptionalGenerationConfig generation_config) {
     return m_pimpl->generate(texts, generation_config);
 }
@@ -245,16 +240,11 @@ std::string ov::LLMPipeline::generate(std::string text, OptionalGenerationConfig
     return m_pimpl->generate(text, generation_config, streamer);
 }

-
 std::string ov::LLMPipeline::generate(std::string text, const ov::AnyMap& config_map) {
     OptionalStreamerVariant streamer;
     auto config = GenerationConfigHelper(get_generation_config()).anymap_to_generation_config(config_map);
-
-    // todo: get attentions from properties?
-    if (config_map.count("streamer_lambda")) {
-        streamer = config_map.at("streamer_lambda").as<std::function<void(std::string)>>();
-    } else if (config_map.count("streamer")) {
-        streamer = config_map.at("streamer").as<std::shared_ptr<StreamerBase>>();
+    if (config_map.count("streamer")) {
+        streamer = config_map.at("streamer").as<std::function<void(std::string)>>();
     }

     return m_pimpl->generate(text, config, streamer);
@@ -263,13 +253,10 @@ std::string ov::LLMPipeline::generate(std::string text, const ov::AnyMap& config
 ov::EncodedResults ov::LLMPipeline::generate(ov::Tensor input_ids, const ov::AnyMap& config_map) {
     OptionalStreamerVariant streamer;
     auto config = GenerationConfigHelper(get_generation_config()).anymap_to_generation_config(config_map);
-
-    // todo: get attentions from properties?
-    if (config_map.count("streamer_lambda")) {
-        streamer = config_map.at("streamer_lambda").as<std::function<void(std::string)>>();
-    } else if (config_map.count("streamer")) {
-        streamer = config_map.at("streamer").as<std::shared_ptr<StreamerBase>>();
+    if (config_map.count("streamer")) {
+        streamer = config_map.at("streamer").as<std::function<void(std::string)>>();
     }
+    std::optional<ov::Tensor> attention_mask;
     return m_pimpl->generate(input_ids, attention_mask, config, streamer);
 }
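Sketch of the AnyMap path after the change above (a fragment under assumptions, not part of the patch): only a std::function lambda travels through the AnyMap under the "streamer" key, so a call through that overload could look roughly like this, assuming pipe is an already constructed ov::LLMPipeline and the usual <functional>/<iostream> includes.

    ov::AnyMap config_map = {
        {"max_new_tokens", size_t(100)},
        // stored as std::function so that .as<std::function<void(std::string)>>() above succeeds
        {"streamer", std::function<void(std::string)>(
            [](std::string subword) { std::cout << subword << std::flush; })}
    };
    std::string answer = pipe.generate("What is OpenVINO?", config_map);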
diff --git a/src/cpp/src/text_callback_streamer.cpp b/src/cpp/src/text_callback_streamer.cpp
index 6e5bd4ee8d..a1d2f3b01d 100644
--- a/src/cpp/src/text_callback_streamer.cpp
+++ b/src/cpp/src/text_callback_streamer.cpp
@@ -1,7 +1,6 @@
 #include "text_callback_streamer.hpp"

 namespace ov {
-
 TextCallbackStreamer::TextCallbackStreamer(const Tokenizer& tokenizer, std::function<void (std::string)> callback, bool print_eos_token) {
     m_tokenizer = tokenizer;
@@ -17,11 +16,9 @@ TextCallbackStreamer::TextCallbackStreamer(const Tokenizer& tokenizer, bool prin
 void TextCallbackStreamer::put(int64_t token) {
     std::stringstream res;
-    // do not print anything and flush cache if EOS token is met
-    if (token == m_tokenizer.get_eos_token_id()) {
-        end();
+    // do nothing if the eos token is met and print_eos_token=false
+    if (!m_print_eos_token && token == m_tokenizer.get_eos_token_id())
         return;
-    }

     m_tokens_cache.push_back(token);
     std::string text = m_tokenizer.decode(m_tokens_cache);
diff --git a/src/cpp/src/tokenizer.cpp b/src/cpp/src/tokenizer.cpp
index 6f6179baa4..a11cfb471a 100644
--- a/src/cpp/src/tokenizer.cpp
+++ b/src/cpp/src/tokenizer.cpp
@@ -53,58 +53,99 @@ class Tokenizer::TokenizerImpl {
     int64_t m_eos_token_id = 2;

     TokenizerImpl() = default;
-    TokenizerImpl(std::string tokenizers_path, const std::string device);
+    TokenizerImpl(std::string tokenizers_path, const std::string device) {
+        ov::Core core;
+
+        if (ov::generate_utils::is_xml(tokenizers_path))
+            OPENVINO_THROW("tokenizers_path should be a path to a dir not a xml file");
+
+        // todo: OPENVINO_TOKENIZERS_PATH is defined in CMakeLists.txt
+        core.add_extension(OPENVINO_TOKENIZERS_PATH);
+
+        std::shared_ptr<ov::Model> tokenizer_model, detokenizer_model;
+        try {
+            tokenizer_model = core.read_model(tokenizers_path + "/openvino_tokenizer.xml");
+            detokenizer_model = core.read_model(tokenizers_path + "/openvino_detokenizer.xml");
+        } catch (...) {
+            OPENVINO_THROW("Cannot compile tokenizer and/or detokenizer. Please check that "
+                           "openvino_tokenizer.xml and openvino_detokenizer.xml exist in \"" + tokenizers_path + "\"");
+        }
+        m_tokenize_request = core.compile_model(tokenizer_model, device).create_infer_request();
+        m_detokenizer_request = core.compile_model(detokenizer_model, device).create_infer_request();
+
+        auto rt_info = tokenizer_model->get_rt_info();
+        if (rt_info.count("eos_token_id") > 0)
+            m_eos_token_id = rt_info["eos_token_id"].as<int64_t>();
+        if (rt_info.count("bos_token_id") > 0)
+            m_bos_token_id = rt_info["bos_token_id"].as<int64_t>();
+        if (rt_info.count("pad_token_id") > 0)
+            m_pad_token_id = rt_info["pad_token_id"].as<int64_t>();
+    }

-    std::pair<ov::Tensor, ov::Tensor> encode(std::string prompt);
-    std::pair<ov::Tensor, ov::Tensor> encode(std::vector<std::string>& prompts);
-    std::string decode(std::vector<int64_t> tokens);
-    std::vector<std::string> decode(ov::Tensor tokens);
-    std::vector<std::string> decode(std::vector<std::vector<int64_t>> lines);
-};
+    std::pair<ov::Tensor, ov::Tensor> encode(std::string prompt) {
+        size_t batch_size = 1;
+        m_tokenize_request.set_input_tensor(ov::Tensor{ov::element::string, {batch_size}, &prompt});
+        m_tokenize_request.infer();
+        return {m_tokenize_request.get_tensor("input_ids"), m_tokenize_request.get_tensor("attention_mask")};
+    }

-Tokenizer::TokenizerImpl::TokenizerImpl(std::string tokenizers_path, std::string device): m_device(device) {
-    ov::Core core;
-
-    if (ov::generate_utils::is_xml(tokenizers_path))
-        OPENVINO_THROW("tokenizers_path should be a path to a dir not a xml file");
-
-    // todo:: OPENVINO_TOKENIZERS_PATH is defined in CMakeLists.txt
-    core.add_extension(OPENVINO_TOKENIZERS_PATH);
-
-    std::shared_ptr<ov::Model> tokenizer_model, detokenizer_model;
-    try {
-        tokenizer_model = core.read_model(tokenizers_path + "/openvino_tokenizer.xml");
-        detokenizer_model = core.read_model(tokenizers_path + "/openvino_detokenizer.xml");
-    } catch (...) {
-        OPENVINO_THROW("Cannot compile tokenizer and/or detokenizer. Please check that "
-                       "openvino_tokenizer.xml and openvino_detokenizer.xml exist in \"" + tokenizers_path + "\"");
+    std::pair<ov::Tensor, ov::Tensor> encode(std::vector<std::string>& prompts) {
+        m_tokenize_request.set_input_tensor(ov::Tensor{ov::element::string, {prompts.size()}, prompts.data()});
+        auto size_ = m_tokenize_request.get_input_tensor().get_shape();
+        m_tokenize_request.infer();
+
+        ::pad_left(m_tokenize_request.get_tensor("input_ids"), m_tokenize_request.get_tensor("attention_mask"), m_pad_token_id);
+        // todo: fix mask filled with '2' instead of '0'
+        ov::Tensor attention_mask = m_tokenize_request.get_tensor("attention_mask");
+        int64_t* attention_mask_data = attention_mask.data<int64_t>();
+        std::replace(attention_mask_data, attention_mask_data + attention_mask.get_size(), 2, 0);
+
+        return {m_tokenize_request.get_tensor("input_ids"), m_tokenize_request.get_tensor("attention_mask")};
     }
-    m_tokenize_request = core.compile_model(tokenizer_model, device).create_infer_request();
-    m_detokenizer_request = core.compile_model(detokenizer_model, device).create_infer_request();

-    auto rt_info = tokenizer_model->get_rt_info();
-    if (rt_info.count("eos_token_id") > 0)
-        m_eos_token_id = rt_info["eos_token_id"].as<int64_t>();
-    if (rt_info.count("bos_token_id") > 0)
-        m_bos_token_id = rt_info["bos_token_id"].as<int64_t>();
-    if (rt_info.count("pad_token_id") > 0)
-        m_pad_token_id = rt_info["pad_token_id"].as<int64_t>();
-}
+    std::string decode(std::vector<int64_t> tokens) {
+        size_t batch_size = 1;
+        m_detokenizer_request.set_input_tensor(ov::Tensor{ov::element::i64, {batch_size, tokens.size()}, tokens.data()});
+        m_detokenizer_request.infer();
+        return m_detokenizer_request.get_output_tensor().data<std::string>()[0];
+    }
+
+    std::vector<std::string> decode(ov::Tensor tokens) {
+        m_detokenizer_request.set_input_tensor(tokens);
+        auto shape = tokens.get_shape();
+        auto data = tokens.data<int64_t>();
+        m_detokenizer_request.infer();
+        auto res = m_detokenizer_request.get_output_tensor();
+
+        std::vector<std::string> strings;
+        for (int i = 0; i < res.get_shape()[0]; ++i) {
+            strings.emplace_back(res.data<std::string>()[i]);
+        }
+        return strings;
+    }
+
+    std::vector<std::string> decode(std::vector<std::vector<int64_t>> lines) {
+        // todo: implement calling detokenizer in a single batch
+        std::vector<std::string> results;
+        for (auto& line: lines){
+            ov::Tensor tokens = ov::Tensor{ov::element::i64, {1, line.size()}, line.data()};
+            m_detokenizer_request.set_input_tensor(tokens);
+            m_detokenizer_request.infer();
+            auto res = m_detokenizer_request.get_output_tensor();
+            auto res_str = res.data<std::string>()[0];
+            results.emplace_back(res_str);
+        }
+
+        return results;
+    }
+};

 Tokenizer::Tokenizer(const std::string& tokenizers_path, const std::string& device) {
     m_pimpl = std::make_shared<TokenizerImpl>(tokenizers_path, device);
 }

 std::pair<ov::Tensor, ov::Tensor> Tokenizer::encode(const std::string prompt) {
-    return m_pimpl->encode(prompt);
-}
-
-std::pair<ov::Tensor, ov::Tensor> Tokenizer::TokenizerImpl::encode(std::string prompt) {
-    size_t batch_size = 1;
-    m_tokenize_request.set_input_tensor(ov::Tensor{ov::element::string, {batch_size}, &prompt});
-    m_tokenize_request.infer();
-
-    return {m_tokenize_request.get_tensor("input_ids"), m_tokenize_request.get_tensor("attention_mask")};
+    return m_pimpl->encode(std::move(prompt));
 }

 std::pair<ov::Tensor, ov::Tensor> Tokenizer::encode(std::vector<std::string>& prompts) {
@@ -115,74 +156,22 @@ std::pair<ov::Tensor, ov::Tensor> Tokenizer::encode(std::vector<std::string>&& p
     return m_pimpl->encode(prompts);
 }

-std::pair<ov::Tensor, ov::Tensor> Tokenizer::TokenizerImpl::encode(std::vector<std::string>& prompts) {
-    m_tokenize_request.set_input_tensor(ov::Tensor{ov::element::string, {prompts.size()}, prompts.data()});
-    auto size_ = m_tokenize_request.get_input_tensor().get_shape();
-    m_tokenize_request.infer();
-
-    ::pad_left(m_tokenize_request.get_tensor("input_ids"), m_tokenize_request.get_tensor("attention_mask"), m_pad_token_id);
-    // todo: fix mask filled with '2' instead of '0'
-    ov::Tensor attention_mask = m_tokenize_request.get_tensor("attention_mask");
-    int64_t* attention_mask_data = attention_mask.data<int64_t>();
-    std::replace(attention_mask_data, attention_mask_data + attention_mask.get_size(), 2, 0);
-
-    return {m_tokenize_request.get_tensor("input_ids"), m_tokenize_request.get_tensor("attention_mask")};
-}
-
 std::pair<ov::Tensor, ov::Tensor> Tokenizer::encode(std::initializer_list<std::string>& text) {
     return encode(std::vector<std::string>(text.begin(), text.end()));
 }

-
 std::string Tokenizer::decode(std::vector<int64_t> tokens) {
     return m_pimpl->decode(tokens);
 }

-std::string Tokenizer::TokenizerImpl::decode(std::vector<int64_t> tokens) {
-    size_t batch_size = 1;
-    m_detokenizer_request.set_input_tensor(ov::Tensor{ov::element::i64, {batch_size, tokens.size()}, tokens.data()});
-    m_detokenizer_request.infer();
-    return m_detokenizer_request.get_output_tensor().data<std::string>()[0];
-}
-
 std::vector<std::string> Tokenizer::decode(ov::Tensor tokens) {
     return m_pimpl->decode(tokens);
 }

-std::vector<std::string> Tokenizer::TokenizerImpl::decode(ov::Tensor tokens) {
-    m_detokenizer_request.set_input_tensor(tokens);
-    auto shape = tokens.get_shape();
-    auto data = tokens.data<int64_t>();
-    m_detokenizer_request.infer();
-    auto res = m_detokenizer_request.get_output_tensor();
-
-    std::vector<std::string> strings;
-    for (int i = 0; i < res.get_shape()[0]; ++i) {
-        strings.emplace_back(res.data<std::string>()[i]);
-    }
-    return strings;
-}
-
 std::vector<std::string> Tokenizer::decode(std::vector<std::vector<int64_t>> lines) {
     return m_pimpl->decode(lines);
 }

-std::vector<std::string> Tokenizer::TokenizerImpl::decode(std::vector<std::vector<int64_t>> lines) {
-    // todo: implement calling detokenizer in a single batch
-
-    std::vector<std::string> results;
-    for (auto& line: lines){
-        ov::Tensor tokens = ov::Tensor{ov::element::i64, {1, line.size()}, line.data()};
-        m_detokenizer_request.set_input_tensor(tokens);
-        m_detokenizer_request.infer();
-        auto res = m_detokenizer_request.get_output_tensor();
-        auto res_str = res.data<std::string>()[0];
-        results.emplace_back(res_str);
-    }
-
-    return results;
-}
-
 int64_t Tokenizer::get_bos_token_id() const {
     return m_pimpl->m_bos_token_id;
 }
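Round-trip sketch (illustrative only, not part of the patch): a small encode/decode exercise against the refactored Tokenizer; the model directory name is a placeholder and must contain openvino_tokenizer.xml and openvino_detokenizer.xml, as the constructor above expects.

    #include "openvino/genai/tokenizer.hpp"
    #include <iostream>
    #include <string>
    #include <vector>

    int main() {
        ov::Tokenizer tokenizer("TinyLlama-1.1B-Chat-v1.0", "CPU");  // placeholder directory
        auto [input_ids, attention_mask] = tokenizer.encode("The Sun is yellow because");
        // decode(ov::Tensor) detokenizes the whole batch, returning one string per row
        std::vector<std::string> texts = tokenizer.decode(input_ids);
        std::cout << texts[0] << std::endl;
        return 0;
    }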
diff --git a/src/python/py_generate_pipeline.cpp b/src/python/py_generate_pipeline.cpp
index 69398e1aac..74cbe7e27d 100644
--- a/src/python/py_generate_pipeline.cpp
+++ b/src/python/py_generate_pipeline.cpp
@@ -26,9 +26,7 @@ std::string stop_criteria_to_str(const ov::GenerationConfig& config) {
     }
 }

-std::string call_with_config(ov::LLMPipeline& pipeline, const std::string& text, const py::kwargs& kwargs) {
-    // Create a new GenerationConfig instance and initialize from kwargs
-    ov::GenerationConfig config = pipeline.get_generation_config();
+void update_config_from_kwargs(ov::GenerationConfig& config, const py::kwargs& kwargs) {
     if (kwargs.contains("max_new_tokens")) config.max_new_tokens = kwargs["max_new_tokens"].cast<size_t>();
     if (kwargs.contains("max_length")) config.max_length = kwargs["max_length"].cast<size_t>();
     if (kwargs.contains("ignore_eos")) config.ignore_eos = kwargs["ignore_eos"].cast<bool>();
@@ -49,10 +47,21 @@ std::string call_with_config(ov::LLMPipeline& pipeline, const std::string& text,
     if (kwargs.contains("eos_token_id")) config.eos_token_id = kwargs["eos_token_id"].cast<int64_t>();
     if (kwargs.contains("eos_token")) config.eos_token = kwargs["eos_token"].cast<std::string>();
     if (kwargs.contains("bos_token")) config.bos_token = kwargs["bos_token"].cast<std::string>();
+}

+// operator() and generate methods are identical, operator() is just an alias for generate
+std::string call_with_kwargs(ov::LLMPipeline& pipeline, const std::string& text, const py::kwargs& kwargs) {
+    // Create a new GenerationConfig instance and initialize from kwargs
+    ov::GenerationConfig config = pipeline.get_generation_config();
+    update_config_from_kwargs(config, kwargs);
     return pipeline(text, config);
 }

+std::string call_with_config(ov::LLMPipeline& pipe, const std::string& text, const ov::GenerationConfig& config) {
+    std::shared_ptr<ov::StreamerBase> streamer;
+    return pipe(text, config);
+}
+
 PYBIND11_MODULE(py_generate_pipeline, m) {
     m.doc() = "Pybind11 binding for LLM Pipeline";

@@ -62,7 +71,20 @@ PYBIND11_MODULE(py_generate_pipeline, m) {
         py::arg("model_path"), py::arg("tokenizer"), py::arg("device") = "CPU", py::arg("plugin_config") = ov::AnyMap{})
         .def(py::init(), py::arg("path"), py::arg("device") = "CPU", py::arg("plugin_config") = ov::AnyMap{})
-        .def("__call__", &call_with_config)
+        .def("__call__", py::overload_cast<ov::LLMPipeline&, const std::string&, const py::kwargs&>(&call_with_kwargs))
+        .def("__call__", py::overload_cast<ov::LLMPipeline&, const std::string&, const ov::GenerationConfig&>(&call_with_config))
+        .def("generate", py::overload_cast<ov::LLMPipeline&, const std::string&, const py::kwargs&>(&call_with_kwargs))
+        .def("generate", py::overload_cast<ov::LLMPipeline&, const std::string&, const ov::GenerationConfig&>(&call_with_config))
+
+        // todo: if input_ids is a ov::Tensor/numpy tensor
+        // todo: implement calling generate/operator() with StreamerBase or lambda streamer
+        // signature to be implemented:
+        // EncodedResults generate(ov::Tensor input_ids,
+        //                         std::optional<ov::Tensor> attention_mask,
+        //                         OptionalGenerationConfig generation_config=nullopt,
+        //                         OptionalStreamerVariant streamer=nullopt);
+
+        .def("get_tokenizer", &LLMPipeline::get_tokenizer)
         .def("start_chat", &ov::LLMPipeline::start_chat)
         .def("finish_chat", &ov::LLMPipeline::finish_chat)
@@ -75,6 +97,8 @@ PYBIND11_MODULE(py_generate_pipeline, m) {
     py::class_<ov::Tokenizer>(m, "Tokenizer")
         .def(py::init<>())
         .def(py::init(), py::arg("tokenizers_path"), py::arg("device") = "CPU")
+
+        // todo: implement encode/decode for numpy inputs and outputs
         .def("encode", py::overload_cast<const std::string>(&ov::Tokenizer::encode), "Encode a single prompt")
         // TODO: common.h(1106...) template argument deduction/substitution failed:
         // .def("encode", py::overload_cast<std::vector<std::string>&>(&ov::Tokenizer::encode), "Encode multiple prompts")
diff --git a/text_generation/causal_lm/cpp/generate_pipeline/generate_sample.cpp b/text_generation/causal_lm/cpp/generate_pipeline/generate_sample.cpp
index f1c7745c87..84e07c394b 100644
--- a/text_generation/causal_lm/cpp/generate_pipeline/generate_sample.cpp
+++ b/text_generation/causal_lm/cpp/generate_pipeline/generate_sample.cpp
@@ -23,8 +23,9 @@ int main(int argc, char* argv[]) {

     // Model, tokenizer and generation_config.json will be loaded from the model_path.
     // If generation_config.json is not found, default values for greedy search will be used
+    // ov::streamer_lambda([](std::string subword){std::cout << subword << std::flush;})
     ov::LLMPipeline pipe(model_path, device);
-    // cout << prompt << pipe(prompt) << endl;
+    // cout << prompt << pipe(prompt, ov::max_new_tokens(1000)) << endl;
     // todo: syntactic sugar to specify generation configs in place
     // cout << prompt << pipe(prompt, ov::max_new_tokens(100)) << endl;
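Sketch of what the commented-out lines above point at (hedged, not code from this patch): combining the in-place config sugar with the lambda streamer inside generate_sample's main() could look roughly like this; the prompt text is a placeholder.

    ov::LLMPipeline pipe(model_path, device);
    std::string prompt = "table is made of";
    std::cout << prompt;
    // operator() forwards the properties to generate(text, AnyMap{...})
    pipe(prompt,
         ov::max_new_tokens(100),
         ov::streamer_lambda([](std::string subword) { std::cout << subword << std::flush; }));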