
Commit

Merge branch 'master' into cb-by-default
andrei-kochin authored Jan 15, 2025
2 parents 4136cfe + 2e5c2a1 commit abc02db
Showing 5 changed files with 43 additions and 15 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/bandit.yml
@@ -13,5 +13,5 @@ jobs:
      - uses: actions/setup-python@v4
        with:
          python-version: 3.11
      - run: python -m pip install bandit==1.8.0
      - run: python -m pip install bandit
      - run: python -m bandit --recursive --configfile bandit.yml .
2 changes: 1 addition & 1 deletion bandit.yml
@@ -79,7 +79,7 @@
# IPAS Required Checkers. Do not disable these
# Additional checkers may be added if desired
tests:
[ 'B301', 'B302', 'B303', 'B304', 'B305', 'B306', 'B308', 'B310', 'B311', 'B312', 'B313', 'B314', 'B315', 'B316', 'B317', 'B318', 'B319', 'B320', 'B321', 'B323', 'B324', 'B401', 'B402', 'B403', 'B404', 'B405', 'B406', 'B407', 'B408', 'B409', 'B410', 'B411', 'B412', 'B413']
[ 'B301', 'B302', 'B303', 'B304', 'B305', 'B306', 'B308', 'B310', 'B311', 'B312', 'B313', 'B314', 'B315', 'B316', 'B317', 'B318', 'B319', 'B321', 'B323', 'B324', 'B401', 'B402', 'B403', 'B404', 'B405', 'B406', 'B407', 'B408', 'B409', 'B411', 'B412', 'B413']

# (optional) list skipped test IDs here, eg '[B101, B406]':
# The following checkers are not required but may be added to the tests list if desired
46 changes: 35 additions & 11 deletions src/cpp/src/llm_pipeline_static.cpp
@@ -690,19 +690,38 @@ namespace static_llm {
StatefulLLMPipeline::StatefulLLMPipeline(
const std::filesystem::path& models_path,
const ov::genai::Tokenizer& tokenizer,
const std::string&,
const std::string& device,
const ov::AnyMap& config
) : LLMPipelineImplBase(tokenizer,
utils::from_config_json_if_exists(models_path)),
m_sampler(m_tokenizer) {

auto model = genai::utils::singleton_core().read_model(models_path / "openvino_model.xml", {}, config);
ModelConfigDesc model_desc = get_modeldesc_from_json(models_path / "config.json");
ov::AnyMap properties = config;

auto compiled = setupAndCompileModel(model, model_desc, properties);
m_request = compiled->create_infer_request();
m_sampler.set_seed(m_generation_config.rng_seed);
const auto use_blob = pop_or_default(properties, "USE_BLOB", false);
if (use_blob) {
auto blob_path = pop_or_default(properties, "BLOB_PATH", std::string{});
if (blob_path.empty()) {
blob_path = (models_path / "openvino_model.blob").string();
}
if (!std::filesystem::exists(blob_path)) {
OPENVINO_THROW("Blob file is not found at: " + blob_path);
}
std::ifstream fin(blob_path, std::ios::in | std::ios::binary);
if (!fin.is_open()) {
OPENVINO_THROW("Blob file can't be opened: " + blob_path);
}
auto compiled = genai::utils::singleton_core().import_model(fin, device, {});
m_max_prompt_len = compiled.get_property("NPUW_LLM_MAX_PROMPT_LEN").as<uint32_t>();
auto min_resp_len = compiled.get_property("NPUW_LLM_MIN_RESPONSE_LEN").as<uint32_t>();
m_kvcache_total = m_max_prompt_len + min_resp_len;
m_request = compiled.create_infer_request();
} else {
auto model = genai::utils::singleton_core().read_model(models_path / "openvino_model.xml", {}, config);
ModelConfigDesc model_desc = get_modeldesc_from_json(models_path / "config.json");
ov::AnyMap properties = config;
auto compiled = setupAndCompileModel(model, model_desc, properties);
m_request = compiled->create_infer_request();
m_sampler.set_seed(m_generation_config.rng_seed);
}
}
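
The new USE_BLOB branch above bypasses read_model()/setupAndCompileModel() and imports a precompiled blob instead, reading the prompt/response limits back from the compiled model's NPUW properties. Below is a minimal usage sketch (not part of this commit) of how these options might be passed from application code; it assumes the public ov::genai::LLMPipeline constructor forwards the property map to this static NPU pipeline, and the paths are placeholders.

// Illustrative sketch only: enabling blob import via the properties introduced above.
// Assumption: the public LLMPipeline constructor forwards this AnyMap to the static
// NPU pipeline; "/models/llm" and the blob path are placeholders.
#include <iostream>
#include "openvino/genai/llm_pipeline.hpp"

int main() {
    ov::AnyMap properties = {
        {"USE_BLOB", true},                               // take the import_model() branch
        {"BLOB_PATH", "/models/llm/openvino_model.blob"}  // optional; defaults to <models_path>/openvino_model.blob
    };
    ov::genai::LLMPipeline pipe("/models/llm", "NPU", properties);
    std::cout << pipe.generate("Hello", ov::genai::max_new_tokens(32)) << std::endl;
    return 0;
}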


@@ -721,11 +740,9 @@ StatefulLLMPipeline::StatefulLLMPipeline(
m_sampler.set_seed(m_generation_config.rng_seed);
}

std::shared_ptr<ov::CompiledModel> StatefulLLMPipeline::setupAndCompileModel(
const std::shared_ptr<ov::Model>& model,
void StatefulLLMPipeline::updateStatefulConfig(
const ModelConfigDesc& model_desc,
ov::AnyMap& pipeline_config) {

const uint32_t kMaxPromptLen = pop_int_and_cast(pipeline_config, "MAX_PROMPT_LEN").value_or(1024u);
const uint32_t kMinResponseLen = pop_int_and_cast(pipeline_config, "MIN_RESPONSE_LEN").value_or(128u);
m_max_prompt_len = kMaxPromptLen;
@@ -755,6 +772,13 @@ std::shared_ptr<ov::CompiledModel> StatefulLLMPipeline::setupAndCompileModel(

// Replace CACHE_DIR option if NPUW is enabled
set_npuw_cache_dir(pipeline_config);
}

std::shared_ptr<ov::CompiledModel> StatefulLLMPipeline::setupAndCompileModel(
const std::shared_ptr<ov::Model>& model,
const ModelConfigDesc& model_desc,
ov::AnyMap& pipeline_config) {
updateStatefulConfig(model_desc, pipeline_config);

return std::make_shared<ov::CompiledModel>(genai::utils::singleton_core().compile_model(model, "NPU", pipeline_config));
}
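
The refactor above separates the NPUW property preparation (updateStatefulConfig) from compilation (setupAndCompileModel): the new helper pops MAX_PROMPT_LEN and MIN_RESPONSE_LEN, falling back to 1024 and 128 when they are absent. A small illustrative sketch of a pipeline_config that overrides those defaults (the values are invented for the example):

// Illustrative only: overriding the shape-related defaults consumed by
// updateStatefulConfig(); keys it pops are removed from the map, and the
// remaining entries are passed on to compile_model() for the NPU plugin.
#include <openvino/openvino.hpp>

int main() {
    ov::AnyMap pipeline_config = {
        {"MAX_PROMPT_LEN", 2048u},    // default is 1024 when the key is absent
        {"MIN_RESPONSE_LEN", 256u}    // default is 128 when the key is absent
    };
    // pipeline_config would then be handed to the pipeline/compile step.
    return 0;
}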
4 changes: 4 additions & 0 deletions src/cpp/src/llm_pipeline_static.hpp
@@ -59,6 +59,10 @@ class StatefulLLMPipeline : public LLMPipelineImplBase {
const ModelConfigDesc& model_desc,
ov::AnyMap& pipeline_config);

void updateStatefulConfig(
const ModelConfigDesc& model_desc,
ov::AnyMap& pipeline_config);

DecodedResults generate(
StringInputs inputs,
OptionalGenerationConfig generation_config,
4 changes: 2 additions & 2 deletions src/cpp/src/visual_language/inputs_embedder.cpp
@@ -172,6 +172,8 @@ class InputsEmbedder::IInputsEmbedder {
auto start_tokenizer_time = std::chrono::steady_clock::now();
ov::Tensor new_chat_tokens = m_tokenizer.encode(new_templated_chat_history, ov::genai::add_special_tokens(false)).input_ids;
TokenizedInputs prev_chat_tokens = m_tokenizer.encode(m_templated_chat_history, ov::genai::add_special_tokens(false));
auto end_tokenizer_time = std::chrono::steady_clock::now();
metrics.raw_metrics.tokenization_durations.emplace_back(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time));

// some symbol combinations can be encoded by the tokenizer in different ways
// if we meet a sequence with such a combination of symbols, we cannot correctly subtract the new history from the old history
@@ -211,8 +213,6 @@ class InputsEmbedder::IInputsEmbedder {
if (m_last_disappeared_token.has_value())
encoded_input_ids = ov::genai::utils::push_front_inputs(encoded_input_ids, *m_last_disappeared_token);
}
auto end_tokenizer_time = std::chrono::steady_clock::now();
metrics.raw_metrics.tokenization_durations.emplace_back(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time));
m_templated_chat_history = std::move(new_templated_chat_history);
m_tokenized_history.clear();
std::copy_n(new_chat_tokens.data<int64_t>(), new_chat_tokens.get_size(), std::back_inserter(m_tokenized_history));
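
In inputs_embedder.cpp, the end-of-tokenization timestamp and the push into tokenization_durations move up so that only the two m_tokenizer.encode() calls are measured, not the chat-history reconciliation that follows them. A standalone sketch of that measurement pattern, with a local stand-in for PerfMetrics::get_microsec (the real helper lives in the GenAI metrics code):

// Standalone sketch of the timing pattern: only the work between the two
// timestamps (the encode() calls in the real code) is attributed to tokenization.
#include <chrono>
#include <cstdint>
#include <vector>

// Stand-in for PerfMetrics::get_microsec used in the diff above.
static int64_t get_microsec(std::chrono::steady_clock::duration d) {
    return std::chrono::duration_cast<std::chrono::microseconds>(d).count();
}

int main() {
    std::vector<int64_t> tokenization_durations;
    auto start_tokenizer_time = std::chrono::steady_clock::now();
    // ... the two tokenizer encode() calls would run here ...
    auto end_tokenizer_time = std::chrono::steady_clock::now();
    tokenization_durations.emplace_back(get_microsec(end_tokenizer_time - start_tokenizer_time));
    return 0;
}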
