diff --git a/.github/workflows/causal_lm_cpp.yml b/.github/workflows/causal_lm_cpp.yml
index 7b4c998b21..4b1a516257 100644
--- a/.github/workflows/causal_lm_cpp.yml
+++ b/.github/workflows/causal_lm_cpp.yml
@@ -702,7 +702,7 @@ jobs:
         run: |
           source ./ov/setupvars.sh
           python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
-          python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
+          python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt opencv-python --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
       - name: Download and convert MiniCPM-V-2_6 model and an image
         run: |
           python -m pip install git+https://github.com/eaidova/optimum-intel.git@ea/minicpmv
@@ -710,11 +710,36 @@
           source ./ov/setupvars.sh
           optimum-cli export openvino -m openbmb/MiniCPM-V-2_6 MiniCPM-V-2_6 --trust-remote-code
           wget https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11 --output-document cat.jpg
+      - name: Generate reference
+        shell: python
+        run: |
+          from optimum.intel.openvino import OVModelForVisualCausalLM
+          from transformers import AutoProcessor
+          from PIL import Image
+          import cv2
+          import numpy as np
+          res = 448, 448
+          lines = np.arange(res[0] * res[1] * 3, dtype=np.uint8) % 255
+          lines = lines.reshape([*res, 3])
+          cv2.imwrite("lines.png", lines)
+          lines = Image.open("lines.png").convert('RGB')
+          model_id = "openbmb/MiniCPM-V-2_6"
+          processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
+          prompt = processor.tokenizer.apply_chat_template([{"role": "user", "content": "(<image>./</image>)\nWhat is unusual on this image?"}], tokenize=False, add_generation_prompt=True)
+          inputs = processor([prompt], [lines], return_tensors="pt")
+          model = OVModelForVisualCausalLM.from_pretrained("MiniCPM-V-2_6", device="CPU", trust_remote_code=True)
+          result = model.generate(**inputs, max_new_tokens=200)
+          decoded = processor.tokenizer.batch_decode(result[:, inputs["input_ids"].shape[1]:], skip_special_tokens=True)[0]
+          print(decoded)
+          with open("ref.txt", "w") as f:
+              f.write(f"question:\n{decoded}\n----------\nquestion:\n")
       - name: Run visual_language_chat C++ sample - MiniCPM-V-2_6
         run: >
           source ./ov/setupvars.sh
-          && timeout 120s ./build/samples/cpp/visual_language_chat/visual_language_chat ./MiniCPM-V-2_6/ cat.jpg
-          <<< $'What is on the image?\nWhat is special on the image?'
+          && timeout 120s ./build/samples/cpp/visual_language_chat/visual_language_chat ./MiniCPM-V-2_6/ lines.png
+          <<< $'What is unusual on this image?'
+          | tee cpp.txt
+      - run: diff cpp.txt ref.txt
       - name: Download and convert LLaVa 1.5 model and an image
         run: |
           source ./ov/setupvars.sh
diff --git a/src/cpp/src/visual_language/clip.hpp b/src/cpp/src/visual_language/clip.hpp
index 4e51ff49f1..dea75959f5 100644
--- a/src/cpp/src/visual_language/clip.hpp
+++ b/src/cpp/src/visual_language/clip.hpp
@@ -6,25 +6,9 @@
 #include <vector>
 #include <cstdint>
 
-//#define CLIP_DEBUG_FUNCTIONS
-
-enum projector_type {
-    PROJECTOR_TYPE_RESAMPLER,
-    PROJECTOR_TYPE_UNKNOWN,
-};
-
 struct clip_ctx {
-    bool has_text_encoder = false;
-    bool has_vision_encoder = false;
-    bool has_minicpmv_projector = false;
-
-    float image_mean[3];
-    float image_std[3];
-    int32_t ftype = 1;
-
-    std::vector<uint8_t> buf_compute_meta;
-
-    projector_type proj_type = PROJECTOR_TYPE_RESAMPLER;
-    size_t patch_size = 0;
+    float image_mean[3] = {0.0f, 0.0f, 0.0f};
+    float image_std[3] = {1.0f, 1.0f, 1.0f};
     size_t image_size = 0;
 };
diff --git a/src/cpp/src/visual_language/pipeline.cpp b/src/cpp/src/visual_language/pipeline.cpp
index 3aabffed93..43e866d1fc 100644
--- a/src/cpp/src/visual_language/pipeline.cpp
+++ b/src/cpp/src/visual_language/pipeline.cpp
@@ -557,6 +557,13 @@ ov::Tensor pack_image_features_llava_next(
         return result;
     }
 }
+
+// It's not possible to pass a GPU tensor from one model to another GPU
+// model on a different ov::Core instance.
+ov::Core singleton_core() {
+    static ov::Core core;
+    return core;
+}
 }
 
 class ov::genai::VLMPipeline::VLMPipelineImpl {
@@ -604,21 +611,22 @@ class ov::genai::VLMPipeline::VLMPipelineImpl {
             )
         },
         m_tokenizer{Tokenizer(model_dir.string(), device_config)},
-        m_vision_encoder(model_dir, m_vlm_config.model_type, device, device_config, ov::Core{}),
+        m_vision_encoder(model_dir, m_vlm_config.model_type, device, device_config, singleton_core()),
         m_is_chat_conversation{false},
         m_image_id{0} {
+        ov::Core core = singleton_core();
         if (m_vlm_config.model_type == VLMModelType::MINICPM) {
-            m_resampler = ov::Core{}.compile_model(
+            m_resampler = core.compile_model(
                 model_dir / "openvino_resampler_model.xml", device, device_config
             ).create_infer_request();
 
             m_pos_embed_cache = get_2d_sincos_pos_embed(m_vlm_config.hidden_size, {70, 70});
         }
-        m_embedding = ov::Core{}.compile_model(
+        m_embedding = core.compile_model(
             model_dir / "openvino_text_embeddings_model.xml", device, device_config
         ).create_infer_request();
-        m_language = ov::Core{}.compile_model(
+        m_language = core.compile_model(
             model_dir / "openvino_language_model.xml", device, device_config
         ).create_infer_request();
diff --git a/src/cpp/src/visual_language/vision_encoder.cpp b/src/cpp/src/visual_language/vision_encoder.cpp
index 65fa614d87..2e5ad9272b 100644
--- a/src/cpp/src/visual_language/vision_encoder.cpp
+++ b/src/cpp/src/visual_language/vision_encoder.cpp
@@ -242,7 +242,6 @@ ov::Tensor prepare_vis_position_ids(
     });
     size_t position_ids_batch_elem = max_nb_patches_h * max_nb_patches_w;
     ov::Tensor position_ids{ov::element::i64, {batch_size, position_ids_batch_elem}};
-    // throw std::runtime_error("");
     int64_t* res_data = position_ids.data<int64_t>();
     std::fill_n(res_data, position_ids.get_size(), 0);
 
@@ -285,66 +284,84 @@ EncodedImage llava_image_embed_make_with_bytes_slice(clip_ctx& ctx_clip, const o
     std::vector<std::vector<ov::Tensor>> results;
     std::vector<std::vector<ImageSize>> sizes;
 
-    // std::vector<clip_image_f32> img_res_v; // format N x H x W x RGB (N x 336 x 336 x 3), so interleaved RGB - different to the python implementation which is N x 3 x 336 x 336
     std::vector<std::vector<clip_image_f32>> preprocessed{imgs.size()};
-    std::transform(imgs.begin(), imgs.end(), preprocessed.begin(), [&ctx_clip](const std::vector<clip_image_u8>& row) {
+    size_t max_h = 0, max_w = 0, n_images = 0;
+    std::transform(imgs.begin(), imgs.end(), preprocessed.begin(), [&ctx_clip, &max_h, &max_w, &n_images](const std::vector<clip_image_u8>& row) {
         std::vector<clip_image_f32> processed_row{row.size()};
-        std::transform(row.begin(), row.end(), processed_row.begin(), [&ctx_clip](const clip_image_u8& raw) {
-            return clip_image_preprocess(ctx_clip, raw);
+        std::transform(row.begin(), row.end(), processed_row.begin(), [&ctx_clip, &max_h, &max_w, &n_images](const clip_image_u8& raw) {
+            clip_image_f32 im = clip_image_preprocess(ctx_clip, raw);
+            max_h = std::max(size_t(im.ny), max_h);
+            max_w = std::max(size_t(im.nx), max_w);
+            ++n_images;
+            return im;
         });
         return processed_row;
     });
 
+    ov::Tensor batched_images{ov::element::f32, {n_images, 3, max_h, max_w}};
+    float* batched_data = batched_images.data<float>();
     const clip_image_f32& resized_preprocessed = preprocessed.at(0).at(0);
-    ImageSize resized_source_size{resized_preprocessed.ny / patch_size, resized_preprocessed.nx / patch_size};
-    ov::Tensor input_tensor{ov::element::f32, {1, 3, size_t(resized_preprocessed.ny), size_t(resized_preprocessed.nx)}, (void*)(resized_preprocessed.buf.data())};
-    ov::Tensor pixel_values = preprocess_for_encoder(input_tensor, patch_size);
+    std::copy(resized_preprocessed.buf.begin(), resized_preprocessed.buf.end(), batched_data);
+    if (1 < preprocessed.size()) {
+        for (size_t row = 1; row < preprocessed.size(); ++row) {
+            size_t n_slices = preprocessed.at(row).size();
+            for (size_t col = 0; col < n_slices; ++col) {
+                const clip_image_f32& elem = preprocessed.at(row).at(col);
+                std::copy(elem.buf.begin(), elem.buf.end(), batched_data + ((row - 1) * n_slices + col + 1) * 3 * max_h * max_w);
+            }
+        }
+    }
+    ov::Tensor pixel_values = preprocess_for_encoder(batched_images, patch_size);
     encoder.set_tensor("pixel_values", pixel_values);
-    ov::Tensor patch_attention_mask{ov::element::f32, {pixel_values.get_shape().at(0), 1, resized_source_size.height * resized_source_size.width}};
-    std::fill_n(patch_attention_mask.data<float>(), patch_attention_mask.get_size(), 1.0f);
+
+    ov::Tensor patch_attention_mask{ov::element::f32, {pixel_values.get_shape().at(0), 1, max_h / patch_size * max_w / patch_size}};
+    float* attention_data = patch_attention_mask.data<float>();
+    std::fill_n(attention_data, patch_attention_mask.get_size(), 0.0f);
+    std::fill_n(attention_data, resized_preprocessed.ny / patch_size * resized_preprocessed.nx / patch_size, 1.0f);
+    if (1 < preprocessed.size()) {
+        for (size_t row = 1; row < preprocessed.size(); ++row) {
+            size_t n_slices = preprocessed.at(row).size();
+            for (size_t col = 0; col < n_slices; ++col) {
+                const clip_image_f32& elem = preprocessed.at(row).at(col);
+                std::fill_n(attention_data + ((row - 1) * n_slices + col + 1) * max_h / patch_size * max_w / patch_size, elem.ny / patch_size * elem.nx / patch_size, 1.0f);
+            }
+        }
+    }
     encoder.set_tensor("patch_attention_mask", patch_attention_mask);
-    ov::Tensor position_ids = prepare_vis_position_ids(pixel_values, patch_attention_mask, {resized_source_size}, ctx_clip.patch_size, ctx_clip.image_size / ctx_clip.patch_size);
+
+    ImageSize resized_source_size{resized_preprocessed.ny / patch_size, resized_preprocessed.nx / patch_size};
+    std::vector<ImageSize> tgt_sizes{resized_source_size};
+    if (1 < preprocessed.size()) {
+        for (const std::vector<clip_image_f32>& row : preprocessed) {
+            for (const clip_image_f32& elem : row) {
+                tgt_sizes.push_back({elem.ny / patch_size, elem.nx / patch_size});
+            }
+        }
+    }
+    ov::Tensor position_ids = prepare_vis_position_ids(pixel_values, patch_attention_mask, tgt_sizes, patch_size, ctx_clip.image_size / patch_size);
     encoder.set_tensor("position_ids", position_ids);
     encoder.infer();
     const ov::Tensor& output_tensor = encoder.get_output_tensor();
-    ov::Tensor resized_source{ov::element::f32, output_tensor.get_shape()};
-    output_tensor.copy_to(resized_source);
 
     if (1 == preprocessed.size()) {
+        ov::Tensor resized_source{ov::element::f32, output_tensor.get_shape()};
+        output_tensor.copy_to(resized_source);
         return {std::move(resized_source), resized_source_size};
     }
 
-    ImageSize raw_size{
-        size_t(preprocessed.at(1).at(0).ny),
-        size_t(preprocessed.at(1).at(0).nx)
-    };
-    ImageSize slices_size{
-        raw_size.height / patch_size,
-        raw_size.width / patch_size
-    };
-    size_t n_patches = slices_size.height * slices_size.width,
-        old_hidden_size = resized_source.get_shape().at(2);
+    size_t old_hidden_size = output_tensor.get_shape().at(2);
+    const float* out = output_tensor.data<float>();
+    ov::Tensor resized_source{ov::element::f32, {1, resized_source_size.height * resized_source_size.width, old_hidden_size}};
+    std::copy_n(out, resized_source.get_size(), resized_source.data<float>());
+
+    size_t n_patches = tgt_sizes.at(1).height * tgt_sizes.at(1).width;
     ov::Tensor encoded_slices{ov::element::f32, {preprocessed.size() - 1, preprocessed.at(1).size(), n_patches, old_hidden_size}};
-    for (size_t row = 1; row < preprocessed.size(); ++row) {
-        for (size_t col = 0; col < preprocessed.at(row).size(); ++col) {
-            clip_image_f32& elem = preprocessed.at(row).at(col);
-            ov::Tensor pixel_values = preprocess_for_encoder(
-                {ov::element::f32, {1, 3, size_t(elem.ny), size_t(elem.nx)}, elem.buf.data()},
-                patch_size
-            );
-            encoder.set_tensor("pixel_values", pixel_values);
-            ov::Tensor patch_attention_mask{ov::element::f32, {1, 1, slices_size.height * slices_size.width}};
-            std::fill_n(patch_attention_mask.data<float>(), patch_attention_mask.get_size(), 1.0f);
-            encoder.set_tensor("patch_attention_mask", patch_attention_mask);
-            ov::Tensor position_ids = prepare_vis_position_ids(pixel_values, patch_attention_mask, {slices_size}, ctx_clip.patch_size, ctx_clip.image_size / ctx_clip.patch_size);
-            encoder.set_tensor("position_ids", position_ids);
-            const ov::Tensor& old = encoder.get_output_tensor();
-            encoder.set_output_tensor({ov::element::f32, {1, n_patches, old_hidden_size}, encoded_slices.data<float>() + ((row - 1) * preprocessed.at(row).size() + col) * n_patches * old_hidden_size});
-            encoder.infer();
-            encoder.set_output_tensor(old);
+    for (size_t col = 0; col < preprocessed.size() - 1; ++col) {
+        for (size_t row = 0; row < preprocessed.at(1).size(); ++row) {
+            std::copy_n(out + (col * preprocessed.at(1).size() + row + 1) * n_patches * old_hidden_size, n_patches * old_hidden_size, encoded_slices.data<float>() + (col * preprocessed.at(1).size() + row) * n_patches * old_hidden_size);
         }
     }
-    return {resized_source, resized_source_size, encoded_slices, slices_size};
+    return {resized_source, resized_source_size, encoded_slices, tgt_sizes.at(1)};
 }
 
 ProcessorConfig from_any_map(
@@ -504,7 +521,6 @@ EncodedImage VisionEncoder::encode(const ov::Tensor& image, const ov::AnyMap& co
 
 EncodedImage VisionEncoder::encode_minicpm(const ov::Tensor& image, const ProcessorConfig& config) {
     clip_ctx ctx_clip;
-    ctx_clip.patch_size = m_processor_config.patch_size;
     ctx_clip.image_size = m_processor_config.image_size;
     std::copy(config.norm_mean.begin(), config.norm_mean.end(), ctx_clip.image_mean);
     std::copy(config.norm_std.begin(), config.norm_std.end(), ctx_clip.image_std);
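For reviewers, here is a minimal standalone sketch of the batching scheme the `vision_encoder.cpp` change introduces (plain C++ without OpenVINO; the `Slice` struct, the patch size, and the image sizes are hypothetical stand-ins, not names from this repository): every preprocessed slice is copied to the start of its element in one zero-padded `{n_images, 3, max_h, max_w}` buffer, and a per-patch attention mask records which patches carry real data, so a single `encoder.infer()` call can replace the old per-slice inference loop.

```cpp
#include <algorithm>
#include <cstdio>
#include <vector>

// Hypothetical stand-in for clip_image_f32: one preprocessed slice, CHW layout.
struct Slice {
    size_t ny, nx;           // height and width in pixels
    std::vector<float> buf;  // 3 * ny * nx values
};

int main() {
    constexpr size_t patch = 14;  // assumed ViT patch size, for illustration only
    // A resized source image plus two slices of a larger grid (sizes are made up).
    std::vector<Slice> slices{
        {448, 448, std::vector<float>(3 * 448 * 448, 1.0f)},
        {448, 672, std::vector<float>(3 * 448 * 672, 2.0f)},
        {448, 672, std::vector<float>(3 * 448 * 672, 3.0f)},
    };

    // Pad every batch element to the largest slice, as the diff does.
    size_t max_h = 0, max_w = 0;
    for (const Slice& s : slices) {
        max_h = std::max(max_h, s.ny);
        max_w = std::max(max_w, s.nx);
    }

    // One zero-initialized batch {n_images, 3, max_h, max_w}, mirroring batched_images.
    std::vector<float> batched(slices.size() * 3 * max_h * max_w, 0.0f);
    // One mask element per patch, mirroring patch_attention_mask (1.0f = real patch).
    size_t patches_per_img = (max_h / patch) * (max_w / patch);
    std::vector<float> mask(slices.size() * patches_per_img, 0.0f);

    for (size_t i = 0; i < slices.size(); ++i) {
        const Slice& s = slices[i];
        // The diff copies each slice contiguously to the start of its batch
        // element, so the zero padding sits at the tail; the mask encodes
        // exactly how many leading patches are valid.
        std::copy(s.buf.begin(), s.buf.end(), batched.begin() + i * 3 * max_h * max_w);
        std::fill_n(mask.begin() + i * patches_per_img, (s.ny / patch) * (s.nx / patch), 1.0f);
    }

    printf("batch: %zu x 3 x %zu x %zu, %zu of %zu patches valid in image 1\n",
           slices.size(), max_h, max_w,
           (slices[1].ny / patch) * (slices[1].nx / patch), patches_per_img);
}
```

Because the valid patches always sit at the front of each batch element, `prepare_vis_position_ids` only needs the per-slice patch grids (`tgt_sizes`) to generate position ids for the unpadded region, and the encoder output can be sliced back apart with plain `std::copy_n` instead of re-running inference per slice.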