From 59984e969cfe4083a317314a1bdb4d44640a4eab Mon Sep 17 00:00:00 2001 From: Tomasz Krupa Date: Fri, 13 Dec 2024 06:10:50 +0000 Subject: [PATCH] [GPU] Enable weightless cache with precision conversion (#27742) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Details: This change makes constants which undergo precision conversion during the transformation pipeline or graph optimization eligible for weightless caching. Information about the precision conversion which happened before export to cache is recorded in the cache file. During the import from cache, functionally equivalent conversions are performed. Besides the unit tests in model_cache.cpp, I tested accuracy and performance of llama-2-7b-chat with FP16 inference mode. Performance impact (weightless caching is enabled via OPTIMIZE_SIZE):   | OPTIMIZE_SPEED | OPTIMIZE_SIZE -- | -- | -- FP16 model import, no cache | 25.4 s | 13.6 s FP16 model import, cache exists | 6.2 s | 6.4 s FP32 model import, no cache | 57.6 s | 45.8 s FP32 model import, cache exists | 8.5 s | 15.2 s   | OPTIMIZE_SPEED | OPTIMIZE_SIZE -- | -- | -- FP16 model cache size | 13 GB | 6.1 MB FP32 model cache size | 13 GB | 6.2 MB Model import time is the measurement of the from_pretrained() call when running the llama model with the openvino.genai/tools/llm_bench tool. Question to reviewers: I'm unsure whether the condition in ov::WeightlessCacheAttribute::is_copyable() is too lenient. Specifically, I'm thinking of a scenario where a single complex transformation changes a constant's data type AND something else at the same time. This would render the constant eligible for weightless caching even though the reconstruction of transformations during the cache load is not aware of anything besides the data type change (which would break the feature). Does such a complex transformation exist? 
### Tickets: - CVS-157081 --- .../src/transformations/convert_precision.cpp | 8 + .../tests/utils/convert_precision.cpp | 36 +++ .../rt_info/weightless_caching_attributes.hpp | 7 +- src/frontends/ir/src/ir_deserializer.cpp | 6 +- .../include/intel_gpu/primitives/data.hpp | 219 +++++++++++++++--- .../graph_optimizer/propagate_constants.cpp | 67 +++++- .../src/graph/include/pass_manager.h | 7 +- .../intel_gpu/src/plugin/program_builder.cpp | 11 +- .../tests/functional/behavior/model_cache.cpp | 116 ++++++---- .../unit/shape_infer/eltwise_si_test.cpp | 6 +- 10 files changed, 375 insertions(+), 108 deletions(-) diff --git a/src/common/transformations/src/transformations/convert_precision.cpp b/src/common/transformations/src/transformations/convert_precision.cpp index 8a2985a284769a..aa067da4f360fd 100644 --- a/src/common/transformations/src/transformations/convert_precision.cpp +++ b/src/common/transformations/src/transformations/convert_precision.cpp @@ -8,6 +8,7 @@ #include #include "itt.hpp" +#include "openvino/core/rt_info/weightless_caching_attributes.hpp" #include "openvino/op/ops.hpp" #include "openvino/pass/constant_folding.hpp" #include "openvino/pass/manager.hpp" @@ -1405,6 +1406,13 @@ bool fuse_type_to_constant(const std::shared_ptr& node, new_const->validate_and_infer_types(); new_const->set_friendly_name(constant->get_friendly_name()); ov::copy_runtime_info(constant, new_const); + + const auto& rt_info = node->get_rt_info(); + auto weightless_caching_attr = rt_info.find(ov::WeightlessCacheAttribute::get_type_info_static()); + if (weightless_caching_attr != rt_info.end()) { + new_const->get_rt_info()[ov::WeightlessCacheAttribute::get_type_info_static()] = + weightless_caching_attr->second; + } return true; } return false; diff --git a/src/common/transformations/tests/utils/convert_precision.cpp b/src/common/transformations/tests/utils/convert_precision.cpp index 318f15ab1a64dc..c2b7133506aebe 100644 --- 
a/src/common/transformations/tests/utils/convert_precision.cpp +++ b/src/common/transformations/tests/utils/convert_precision.cpp @@ -13,6 +13,7 @@ #include "common_test_utils/ov_test_utils.hpp" #include "openvino/core/model.hpp" +#include "openvino/core/rt_info/weightless_caching_attributes.hpp" #include "openvino/opsets/opset1.hpp" #include "openvino/opsets/opset10.hpp" #include "openvino/opsets/opset15.hpp" @@ -2702,3 +2703,38 @@ TEST(TransformationTests, ConvertPrecision_assign_read_value_preserve_orig_types FunctionsComparator::Result result = func_comparator(model_ref, model); ASSERT_TRUE(result.valid) << result.message; } + +TEST(TransformationTests, ConvertPrecision_assign_read_value_preserve_weightless_cache_info_as_rt_attribute) { + pass::Manager manager; + + auto some_value = opset10::Constant::create(element::f32, Shape{1}, {2}); + auto& node_rt_info = some_value->get_rt_info(); + ov::WeightlessCacheAttribute attr(element::f32.size(), 0, element::f32); + node_rt_info[ov::WeightlessCacheAttribute::get_type_info_static()] = attr; + + ov::ParameterVector inputParams; + ov::ResultVector results; + results.push_back(std::make_shared(some_value->output(0))); + auto model = std::make_shared(results, inputParams); + + type_to_fuse_map empty_type_to_fuse_map = {}; + bool keep_precision_sensitive_in_fp32 = false; + bool convert_input_output_precision = false; + bool store_original_precision_as_rt_attribute = true; + manager.register_pass(precisions_map{{element::f32, element::f16}}, + empty_type_to_fuse_map, + keep_precision_sensitive_in_fp32, + convert_input_output_precision, + store_original_precision_as_rt_attribute); + manager.run_passes(model); + + const auto& ops = model->get_ops(); + auto it = std::find_if(ops.begin(), ops.end(), [](const std::shared_ptr& node) { + return ov::op::util::is_constant(node); + }); + + ASSERT_TRUE(it != ops.end()); + const auto& new_rt_info = (*it)->get_rt_info(); + auto weightless_caching_attr_it = 
new_rt_info.find(ov::WeightlessCacheAttribute::get_type_info_static()); + ASSERT_TRUE(weightless_caching_attr_it != new_rt_info.end()); +} diff --git a/src/core/dev_api/openvino/core/rt_info/weightless_caching_attributes.hpp b/src/core/dev_api/openvino/core/rt_info/weightless_caching_attributes.hpp index fedcb030fb52cf..e3cf2609b26c8d 100644 --- a/src/core/dev_api/openvino/core/rt_info/weightless_caching_attributes.hpp +++ b/src/core/dev_api/openvino/core/rt_info/weightless_caching_attributes.hpp @@ -5,6 +5,7 @@ #pragma once #include "openvino/core/core_visibility.hpp" +#include "openvino/core/node.hpp" #include "openvino/core/runtime_attribute.hpp" namespace ov { @@ -25,14 +26,16 @@ class OPENVINO_API WeightlessCacheAttribute : public RuntimeAttribute { WeightlessCacheAttribute() = delete; - WeightlessCacheAttribute(size_t original_size, size_t bin_offset) + WeightlessCacheAttribute(size_t original_size, size_t bin_offset, ov::element::Type original_dtype) : original_size(original_size), - bin_offset(bin_offset) {} + bin_offset(bin_offset), + original_dtype(original_dtype) {} bool is_copyable() const override; size_t original_size; size_t bin_offset; + ov::element::Type original_dtype; }; } // namespace ov diff --git a/src/frontends/ir/src/ir_deserializer.cpp b/src/frontends/ir/src/ir_deserializer.cpp index 2d1dfba956ea72..d7e250f9916302 100644 --- a/src/frontends/ir/src/ir_deserializer.cpp +++ b/src/frontends/ir/src/ir_deserializer.cpp @@ -950,10 +950,12 @@ std::shared_ptr ov::XmlDeserializer::create_node(const std::vector(pugixml::get_uint64_attr(dn, "size")), - static_cast(pugixml::get_uint64_attr(dn, "offset"))); + static_cast(pugixml::get_uint64_attr(dn, "offset")), + ov::element::Type(pugixml::get_str_attr(dn, "element_type"))); } } diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/data.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/data.hpp index 461f063ec26bc5..8a9a35b1e92fe9 100644 --- 
a/src/plugins/intel_gpu/include/intel_gpu/primitives/data.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/data.hpp @@ -4,15 +4,170 @@ #pragma once #include +#include #include "intel_gpu/runtime/engine.hpp" #include "intel_gpu/runtime/memory.hpp" +#include "openvino/op/constant.hpp" +#include "openvino/op/util/op_types.hpp" +#include "openvino/pass/manager.hpp" #include "openvino/runtime/shared_buffer.hpp" #include "openvino/util/mmap_object.hpp" #include "primitive.hpp" +#include "transformations/convert_precision.hpp" namespace cldnn { +struct weights_mem { + std::shared_ptr>> shared_buf = nullptr; + std::shared_ptr transformed_constant = nullptr; + + const uint8_t* get_loaded_data() { + if (transformed_constant) { + return reinterpret_cast(transformed_constant->get_data_ptr()); + } + OPENVINO_ASSERT(shared_buf); + return shared_buf->get_ptr(); + } +}; + +struct weightless_cache_manager { + void set_constant_info(size_t bin_offset, + size_t original_size, + ov::element::Type original_dtype, + ov::element::Type curr_dtype, + ov::Shape shape) { + this->bin_offset = bin_offset; + this->original_size = original_size; + this->original_dtype = original_dtype; + this->curr_dtype = curr_dtype; + this->shape = shape; + do_weightless_caching = true; + + if (original_dtype != curr_dtype) { + do_precision_conversion = true; + } + } + + void invalidate() { + do_weightless_caching = false; + } + + void set_new_dtype(ov::element::Type curr_dtype) { + this->curr_dtype = curr_dtype; + do_precision_conversion = original_dtype != curr_dtype; + } + + bool save(BinaryOutputBuffer& ob, size_t data_size) const { + if (!do_weightless_caching) { + ob << false; + return false; + } + + ob << true; + ob << bin_offset; + ob << do_precision_conversion; + if (do_precision_conversion) { + ob << original_size; + ob << make_data(&original_dtype, sizeof(ov::element::Type)); + ob << make_data(&curr_dtype, sizeof(ov::element::Type)); + + size_t num_dims = shape.size(); + ob << 
make_data(&num_dims, sizeof(size_t)); + ob << make_data(shape.data(), num_dims * sizeof(ov::Shape::value_type)); + } + return true; + } + + std::shared_ptr load(BinaryInputBuffer& ib, + std::shared_ptr mapped_weights, + size_t data_size) { + ib >> do_weightless_caching; + if (!do_weightless_caching) { + return nullptr; + } + + OPENVINO_ASSERT(mapped_weights != nullptr, "mmap object is null"); + + ib >> bin_offset; + ib >> do_precision_conversion; + if (do_precision_conversion) { + ib >> original_size; + ib >> make_data(&original_dtype, sizeof(ov::element::Type)); + ib >> make_data(&curr_dtype, sizeof(ov::element::Type)); + + size_t num_dims = 0; + ib >> make_data(&num_dims, sizeof(size_t)); + shape.resize(num_dims); + ib >> make_data(shape.data(), num_dims * sizeof(ov::Shape::value_type)); + } else { + original_size = data_size; + } + + auto mem_obj = std::make_shared(); + mem_obj->shared_buf = std::make_shared>>( + mapped_weights->data() + bin_offset, + original_size, + mapped_weights); + + if (should_run_transformations()) { + run_transformations(mem_obj); + } + return mem_obj; + } + +private: + bool do_weightless_caching = false; + bool do_precision_conversion = false; + + size_t bin_offset = SIZE_MAX; + size_t original_size = SIZE_MAX; + ov::element::Type original_dtype = ov::element::Type_t::undefined; + ov::element::Type curr_dtype = ov::element::Type_t::undefined; + ov::Shape shape; + + bool should_run_transformations() { + return do_precision_conversion; + } + + void run_transformations(std::shared_ptr mem_obj) { + auto orig_constant = std::make_shared(original_dtype, + shape, + mem_obj->shared_buf->get_ptr(), + mem_obj->shared_buf); + + ov::ParameterVector inputParams; + ov::ResultVector results; + results.push_back(std::make_shared(orig_constant->output(0))); + auto model = std::make_shared(results, inputParams, "aux"); + + ov::pass::Manager manager("Plugin:GPU:weightless_cache_transformations"); + + if (do_precision_conversion) { + precisions_map 
fp_convert_precision_map = { + {original_dtype, curr_dtype}}; + type_to_fuse_map empty_fuse_map = {}; + const bool keep_precision_sensitive_in_fp32 = false; + const bool convert_input_output_precision = false; + const bool store_original_precision_as_rt_attribute = true; + manager.register_pass(fp_convert_precision_map, + empty_fuse_map, + keep_precision_sensitive_in_fp32, + convert_input_output_precision, + store_original_precision_as_rt_attribute); + } + + manager.run_passes(model); + const auto& ops = model->get_ops(); + auto it = std::find_if(ops.begin(), ops.end(), [](const std::shared_ptr& node) { + return ov::op::util::is_constant(node); + }); + OPENVINO_ASSERT(it != ops.end()); + mem_obj->transformed_constant = std::dynamic_pointer_cast(*it); + OPENVINO_ASSERT(mem_obj->transformed_constant->get_element_type() == curr_dtype); + } +}; + /// @brief Provides input data to topology. /// @details This primitive allows to pass data which is known at topology creation. /// For example, weights and biases for scoring networks. @@ -20,21 +175,32 @@ namespace cldnn { struct data : public primitive_base { CLDNN_DECLARE_PRIMITIVE(data) - data() : primitive_base("", {}) {} + data() : primitive_base("", {}) { + cache_info = std::make_shared(); + } /// @brief Constructs data primitive. /// @param id This primitive id. /// @param mem @ref memory object which contains data. /// @note If memory is attached by memory::attach(), the attached buffer should be valid till network build. 
- data(const primitive_id& id, memory::ptr mem) - : primitive_base(id, {}), mem(std::move(mem)) {} + data(const primitive_id& id, memory::ptr mem) : primitive_base(id, {}), mem(std::move(mem)) { + cache_info = std::make_shared(); + } + + data(const primitive_id& id, memory::ptr mem, std::shared_ptr cache_info) + : primitive_base(id, {}), + mem(std::move(mem)), + cache_info(cache_info) { + if (!cache_info) { + this->cache_info = std::make_shared(); + } + } /// @brief @ref memory object which contains data. /// @note If memory is attached by memory::attach(), the attached buffer should be valid till network build. memory::ptr mem; - size_t original_size = SIZE_MAX; - size_t bin_offset = SIZE_MAX; + std::shared_ptr cache_info; size_t hash() const override { size_t seed = primitive::hash(); @@ -53,13 +219,8 @@ struct data : public primitive_base { size_t data_size = mem->size(); ob << make_data(&data_size, sizeof(size_t)); - bool is_cache_without_weights = bin_offset != SIZE_MAX && data_size == original_size; - - if (is_cache_without_weights) { - ob << true; - ob << bin_offset; - } else { - ob << false; + bool do_weightless_caching = cache_info->save(ob, data_size); + if (!do_weightless_caching) { if (_allocation_type == allocation_type::usm_host || _allocation_type == allocation_type::usm_shared) { ob << make_data(mem->buffer_ptr(), data_size); } else { @@ -88,26 +249,12 @@ struct data : public primitive_base { mem = ib.get_engine().allocate_memory(output_layout, _allocation_type, false); - bool is_cache_without_weights; - ib >> is_cache_without_weights; - if (is_cache_without_weights && mapped_weights == nullptr) { - OPENVINO_THROW("mmap object is null"); - } - - std::shared_ptr>> shared_buf; - if (is_cache_without_weights) { - ib >> bin_offset; - original_size = data_size; - - shared_buf = std::make_shared>>( - mapped_weights->data() + bin_offset, - data_size, - mapped_weights); - } + auto mem_obj = cache_info->load(ib, mapped_weights, data_size); + bool 
is_weightless_caching_enabled = mem_obj != nullptr; if (_allocation_type == allocation_type::usm_host || _allocation_type == allocation_type::usm_shared) { - if (is_cache_without_weights) { - std::memcpy(reinterpret_cast(mem->buffer_ptr()), shared_buf->get_ptr(), data_size); + if (is_weightless_caching_enabled) { + std::memcpy(reinterpret_cast(mem->buffer_ptr()), mem_obj->get_loaded_data(), data_size); } else { ib >> make_data(mem->buffer_ptr(), data_size); } @@ -116,8 +263,8 @@ struct data : public primitive_base { auto& strm = ib.get_engine().get_service_stream(); if (data_size < DATA_BLOCK_SIZE || output_layout.format.is_image_2d()) { std::vector _buf(data_size); - if (is_cache_without_weights) { - std::memcpy(reinterpret_cast(_buf.data()), shared_buf->get_ptr(), data_size); + if (is_weightless_caching_enabled) { + std::memcpy(reinterpret_cast(_buf.data()), mem_obj->get_loaded_data(), data_size); } else { ib >> make_data(_buf.data(), data_size); } @@ -135,9 +282,9 @@ struct data : public primitive_base { size_t copy_size = (data_size > (dst_offset + DATA_BLOCK_SIZE)) ? 
DATA_BLOCK_SIZE : (data_size - dst_offset); if (buf_flag) { - if (is_cache_without_weights) { + if (is_weightless_caching_enabled) { std::memcpy(reinterpret_cast(_buf1.data()), - shared_buf->get_ptr() + dst_offset, + mem_obj->get_loaded_data() + dst_offset, copy_size); } else { ib >> make_data(_buf1.data(), copy_size); @@ -148,9 +295,9 @@ struct data : public primitive_base { } ev1 = mem->copy_from(strm, _buf1.data(), src_offset, dst_offset, copy_size, is_blocking); } else { - if (is_cache_without_weights) { + if (is_weightless_caching_enabled) { std::memcpy(reinterpret_cast(_buf2.data()), - shared_buf->get_ptr() + dst_offset, + mem_obj->get_loaded_data() + dst_offset, copy_size); } else { ib >> make_data(_buf2.data(), copy_size); diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/propagate_constants.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/propagate_constants.cpp index 85173e9eb33e7c..a4129800733875 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/propagate_constants.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/propagate_constants.cpp @@ -74,11 +74,14 @@ void propagate_constants::run(program& p) { // replace all constant nodes which are relevant for inference (either used by non-const user or marked as output) // with recomputed cldnn::data for (auto& cout : to_replace) { - auto& id_to_replace = cout.first; - auto mem_impl = cout.second; - - auto const_data = - std::make_shared("_cldnn_const_prop_" + id_to_replace, mem_impl /* <<< REMOVE ME WHEN POSSIBLE */); + auto& id_to_replace = std::get<0>(cout); + auto mem_impl = std::get<1>(cout); + auto cache_info = std::get<2>(cout); + auto in_layout = std::get<3>(cout); + + auto const_data = std::make_shared("_cldnn_const_prop_" + id_to_replace, + mem_impl, /* <<< REMOVE ME WHEN POSSIBLE */ + cache_info); auto& new_node = p.get_or_create(const_data); auto& curr_node = p.get_node(id_to_replace); @@ -92,6 +95,25 @@ void propagate_constants::run(program& p) { } } + auto 
is_reorder_with_only_dtype_change = [&](program_node& dst) { + if (!in_layout) { + return false; + } + auto& dst_layout = dst.get_output_layout(); + if (in_layout->data_type == dst_layout.data_type) { + return false; + } + + auto aux_layout = dst_layout; + aux_layout.data_type = in_layout->data_type; + return aux_layout == *in_layout.get(); + }; + if (is_reorder_with_only_dtype_change(new_node)) { + new_node.as().get_primitive()->cache_info->set_new_dtype(new_node.get_output_layout().data_type); + } else { + new_node.as().get_primitive()->cache_info->invalidate(); + } + curr_node.dependencies.clear(); // remove all constant users (as they will be either removed or replaced by cldnn::data which does not have any // dependencies) @@ -113,9 +135,10 @@ bool propagate_constants::has_non_const_user(program_node& node) const { return false; } -std::list> propagate_constants::calculate(engine& engine, - const ExecutionConfig& config, - std::shared_ptr task_executor) { +std::list, std::shared_ptr>> +propagate_constants::calculate(engine& engine, + const ExecutionConfig& config, + std::shared_ptr task_executor) { if (!has_non_trivial_constants) return {}; @@ -123,15 +146,37 @@ std::list> propagate_constants::calculate(e cf_config.set_property(ov::intel_gpu::optimize_data(false)); cf_config.set_property(ov::intel_gpu::custom_outputs(const_outputs)); network::ptr net = network::build_network(engine, nodes, cf_config, task_executor, true); - for (auto& cin : const_inputs) + std::map, std::shared_ptr>> + weightless_cache_map; + for (auto& cin : const_inputs) { net->set_input_data(cin->id(), cin->get_attached_memory_ptr()); + auto users = cin->get_users(); + if (users.size() == 1 && users.front()->is_type()) { + auto rprim = users.front()->as().get_primitive(); + auto id = rprim->id; + auto cache_ptr = cin->as().get_primitive()->cache_info; + auto layout_ptr = std::make_shared(cin->get_output_layout()); + weightless_cache_map.emplace(id, std::make_pair(cache_ptr, layout_ptr)); + 
} + } + net->execute({}); net->reset_execution(true); // wait for computations to complete auto outputs = net->get_outputs(); - std::list> ret; - for (auto& out : outputs) ret.push_back({out->id(), out->output_memory_ptr()}); + std::list, std::shared_ptr>> + ret; + for (auto& out : outputs) { + std::shared_ptr cache_ptr = nullptr; + std::shared_ptr layout_ptr = nullptr; + auto it = weightless_cache_map.find(out->id()); + if (it != weightless_cache_map.end()) { + cache_ptr = it->second.first; + layout_ptr = it->second.second; + } + ret.push_back({out->id(), out->output_memory_ptr(), cache_ptr, layout_ptr}); + } return ret; } diff --git a/src/plugins/intel_gpu/src/graph/include/pass_manager.h b/src/plugins/intel_gpu/src/graph/include/pass_manager.h index 490076a37f788e..0b7c3d85c37e27 100644 --- a/src/plugins/intel_gpu/src/graph/include/pass_manager.h +++ b/src/plugins/intel_gpu/src/graph/include/pass_manager.h @@ -211,9 +211,10 @@ class propagate_constants : public base_pass { private: void run(program& p) override; - std::list> calculate(engine& engine, - const ExecutionConfig& config, - std::shared_ptr task_executor); + std::list, std::shared_ptr>> + calculate(engine& engine, + const ExecutionConfig& config, + std::shared_ptr task_executor); bool has_non_const_user(program_node& node) const; void handle_constant(program& prog, program_node& node); void add_constant(program& prog, program_node& node); diff --git a/src/plugins/intel_gpu/src/plugin/program_builder.cpp b/src/plugins/intel_gpu/src/plugin/program_builder.cpp index 368e25abe2ddac..a9bb813d0ce587 100644 --- a/src/plugins/intel_gpu/src/plugin/program_builder.cpp +++ b/src/plugins/intel_gpu/src/plugin/program_builder.cpp @@ -14,6 +14,7 @@ #include "intel_gpu/plugin/common_utils.hpp" #include "intel_gpu/plugin/program_builder.hpp" +#include "intel_gpu/primitives/data.hpp" #include "intel_gpu/runtime/itt.hpp" #include "intel_gpu/runtime/debug_configuration.hpp" #include "intel_gpu/primitives/mutable_data.hpp" 
@@ -311,11 +312,15 @@ void ProgramBuilder::add_primitive(const ov::Node& op, std::shared_ptrm_config.get_property(ov::cache_mode) == ov::CacheMode::OPTIMIZE_SIZE) { if (auto data_prim = dynamic_cast(prim.get())) { auto rt_info = op.get_rt_info(); + auto weightless_cache_attr = rt_info.find(ov::WeightlessCacheAttribute::get_type_info_static()); if (weightless_cache_attr != rt_info.end()) { - data_prim->bin_offset = weightless_cache_attr->second.as().bin_offset; - data_prim->original_size = - weightless_cache_attr->second.as().original_size; + auto& attr = weightless_cache_attr->second.as(); + data_prim->cache_info->set_constant_info(attr.bin_offset, + attr.original_size, + attr.original_dtype, + op.get_output_element_type(0), + op.get_output_shape(0)); } } } diff --git a/src/plugins/intel_gpu/tests/functional/behavior/model_cache.cpp b/src/plugins/intel_gpu/tests/functional/behavior/model_cache.cpp index 839b2640ca180c..17e1ed6d0a9bbe 100644 --- a/src/plugins/intel_gpu/tests/functional/behavior/model_cache.cpp +++ b/src/plugins/intel_gpu/tests/functional/behavior/model_cache.cpp @@ -8,48 +8,40 @@ #include "common_test_utils/common_utils.hpp" #include "common_test_utils/file_utils.hpp" #include "common_test_utils/ov_tensor_utils.hpp" -#include "common_test_utils/subgraph_builders/2_input_subtract.hpp" -#include "common_test_utils/subgraph_builders/concat_with_params.hpp" -#include "common_test_utils/subgraph_builders/conv_bias.hpp" -#include "common_test_utils/subgraph_builders/conv_pool_relu.hpp" -#include "common_test_utils/subgraph_builders/conv_pool_relu_no_reshapes.hpp" -#include "common_test_utils/subgraph_builders/conv_pool_relu_non_zero.hpp" -#include "common_test_utils/subgraph_builders/convert_transpose.hpp" -#include "common_test_utils/subgraph_builders/detection_output.hpp" -#include "common_test_utils/subgraph_builders/kso_func.hpp" -#include "common_test_utils/subgraph_builders/matmul_bias.hpp" -#include 
"common_test_utils/subgraph_builders/multi_single_conv.hpp" -#include "common_test_utils/subgraph_builders/multiple_input_outpput_double_concat.hpp" -#include "common_test_utils/subgraph_builders/nested_branch_conv_concat.hpp" -#include "common_test_utils/subgraph_builders/nested_split_conv_concat.hpp" #include "common_test_utils/subgraph_builders/read_concat_split_assign.hpp" #include "common_test_utils/subgraph_builders/single_concat_with_constant.hpp" -#include "common_test_utils/subgraph_builders/single_conv.hpp" -#include "common_test_utils/subgraph_builders/single_split.hpp" -#include "common_test_utils/subgraph_builders/split_concat.hpp" -#include "common_test_utils/subgraph_builders/split_conv_concat.hpp" -#include "common_test_utils/subgraph_builders/split_multi_conv_concat.hpp" #include "common_test_utils/subgraph_builders/ti_with_lstm_cell.hpp" #include "common_test_utils/test_common.hpp" #include "openvino/pass/serialize.hpp" namespace { -class CheckWeightlessCacheAccuracy : public ::testing::Test, - public ::testing::WithParamInterface { +typedef std::tuple testParams; + +class CheckWeightlessCacheAccuracy : public ::testing::Test, public ::testing::WithParamInterface { public: - static std::string get_test_case_name(::testing::TestParamInfo obj) { - bool use_compile_model_api = obj.param; + static std::string get_test_case_name(::testing::TestParamInfo obj) { + bool use_compile_model_api_; + ov::element::Type inference_mode_; + ov::element::Type model_dtype_; + std::tie(use_compile_model_api_, inference_mode_, model_dtype_) = obj.param; std::ostringstream result; - result << "use_compile_model_api=" << use_compile_model_api; + const char separator = '_'; + result << "use_compile_model_api=" << use_compile_model_api_ << separator; + result << "inference_mode=" << inference_mode_ << separator; + result << "model_dtype=" << model_dtype_; return result.str(); } + protected: std::shared_ptr model; std::string xml_path; std::string bin_path; std::string 
cache_path; - bool use_compile_model_api; // for loading from cache + std::string cache_dir; + bool use_compile_model_api; // for loading from cache + ov::element::Type inference_mode; + ov::element::Type model_dtype; void SetUp() override; void TearDown() override; @@ -61,36 +53,46 @@ void CheckWeightlessCacheAccuracy::SetUp() { xml_path = filePrefix + ".xml"; bin_path = filePrefix + ".bin"; cache_path = filePrefix + ".blob"; - use_compile_model_api = GetParam(); + cache_dir = filePrefix + "_cache_dir"; + + std::tie(use_compile_model_api, inference_mode, model_dtype) = GetParam(); } void CheckWeightlessCacheAccuracy::TearDown() { std::remove(xml_path.c_str()); std::remove(bin_path.c_str()); std::remove(cache_path.c_str()); + + ov::test::utils::removeFilesWithExt(cache_dir, "blob"); + ov::test::utils::removeFilesWithExt(cache_dir, "cl_cache"); + ov::test::utils::removeDir(cache_dir); } void CheckWeightlessCacheAccuracy::run() { - ov::AnyMap config = { ov::cache_mode(ov::CacheMode::OPTIMIZE_SIZE) }; - ov::AnyMap config_with_weights_path = { ov::cache_mode(ov::CacheMode::OPTIMIZE_SIZE), ov::weights_path(bin_path) }; + ov::AnyMap config = {ov::cache_dir(cache_dir), + ov::cache_mode(ov::CacheMode::OPTIMIZE_SIZE), + ov::hint::inference_precision(inference_mode)}; + ov::AnyMap config_with_weights_path = {ov::cache_mode(ov::CacheMode::OPTIMIZE_SIZE), + ov::weights_path(bin_path), + ov::hint::inference_precision(inference_mode)}; auto core = ov::test::utils::PluginCache::get().core(); ov::pass::Serialize(xml_path, bin_path).run_on_model(model); ov::CompiledModel compiled_model; - OV_ASSERT_NO_THROW(compiled_model = core->compile_model(xml_path, ov::test::utils::DEVICE_GPU, config)); + compiled_model = core->compile_model(xml_path, ov::test::utils::DEVICE_GPU, config); - auto ofstr = std::ofstream(cache_path, std::ofstream::binary); - OV_ASSERT_NO_THROW(compiled_model.export_model(ofstr)); - ofstr.close(); + if (!use_compile_model_api) { + auto ofstr = 
std::ofstream(cache_path, std::ofstream::binary); + compiled_model.export_model(ofstr); + ofstr.close(); + } auto ifstr = std::ifstream(cache_path, std::ifstream::binary); ov::CompiledModel imported_model; if (use_compile_model_api) { - OV_ASSERT_NO_THROW(imported_model = - core->compile_model(xml_path, ov::test::utils::DEVICE_GPU, config)); + imported_model = core->compile_model(xml_path, ov::test::utils::DEVICE_GPU, config); } else { - OV_ASSERT_NO_THROW(imported_model = - core->import_model(ifstr, ov::test::utils::DEVICE_GPU, config_with_weights_path)); + imported_model = core->import_model(ifstr, ov::test::utils::DEVICE_GPU, config_with_weights_path); } ifstr.close(); @@ -99,39 +101,57 @@ void CheckWeightlessCacheAccuracy::run() { for (size_t param_idx = 0; param_idx < model->get_parameters().size(); ++param_idx) { auto input = model->get_parameters().at(param_idx); - auto tensor = ov::test::utils::create_and_fill_tensor(input->get_element_type(), input->get_shape()); + auto tensor = ov::test::utils::create_and_fill_tensor_real_distribution(input->get_element_type(), + input->get_shape(), + -100, + 100, + param_idx); orig_req.set_tensor(input, tensor); new_req.set_tensor(input, tensor); } - OV_ASSERT_NO_THROW(orig_req.infer()); - OV_ASSERT_NO_THROW(new_req.infer()); + orig_req.infer(); + new_req.infer(); auto result_vector = model->get_results(); for (auto& res : result_vector) { auto orig_out = orig_req.get_tensor(res); auto new_out = new_req.get_tensor(res); - ov::test::utils::compare(orig_out, new_out); + ov::test::utils::compare(orig_out, new_out, inference_mode); } } TEST_P(CheckWeightlessCacheAccuracy, ReadConcatSplitAssign) { - model = ov::test::utils::make_read_concat_split_assign({1, 1, 2, 4}, ov::element::f16); - run(); + OV_ASSERT_NO_THROW(model = ov::test::utils::make_read_concat_split_assign({1, 1, 2, 4}, model_dtype)); + OV_ASSERT_NO_THROW(run()); } TEST_P(CheckWeightlessCacheAccuracy, SingleConcatWithConstant) { - model = 
ov::test::utils::make_single_concat_with_constant({1, 1, 2, 4}, ov::element::f16); - run(); + OV_ASSERT_NO_THROW(model = ov::test::utils::make_single_concat_with_constant({1, 1, 2, 4}, model_dtype)); + OV_ASSERT_NO_THROW(run()); } TEST_P(CheckWeightlessCacheAccuracy, TiWithLstmCell) { - model = ov::test::utils::make_ti_with_lstm_cell(ov::element::f16); - run(); + OV_ASSERT_NO_THROW(model = ov::test::utils::make_ti_with_lstm_cell(model_dtype)); + OV_ASSERT_NO_THROW(run()); } -INSTANTIATE_TEST_SUITE_P(smoke_CheckWeightlessCacheAccuracy, CheckWeightlessCacheAccuracy, - ::testing::Bool(), +const std::vector inference_modes = { + ov::element::f32, + ov::element::f16, +}; + +const std::vector model_dtypes = { + ov::element::f32, + ov::element::f16, + ov::element::bf16, +}; + +INSTANTIATE_TEST_SUITE_P(smoke_CheckWeightlessCacheAccuracy, + CheckWeightlessCacheAccuracy, + ::testing::Combine(::testing::Bool(), + ::testing::ValuesIn(inference_modes), + ::testing::ValuesIn(model_dtypes)), CheckWeightlessCacheAccuracy::get_test_case_name); } // namespace diff --git a/src/plugins/intel_gpu/tests/unit/shape_infer/eltwise_si_test.cpp b/src/plugins/intel_gpu/tests/unit/shape_infer/eltwise_si_test.cpp index 7abdbcb8c2fc52..7b4f27b5af05b4 100644 --- a/src/plugins/intel_gpu/tests/unit/shape_infer/eltwise_si_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/shape_infer/eltwise_si_test.cpp @@ -23,11 +23,11 @@ using namespace ov; namespace shape_infer_tests { struct eltwise_test_params { - layout input1_layout; - layout input2_layout; + cldnn::layout input1_layout; + cldnn::layout input2_layout; eltwise_mode mode; AutoBroadcastSpec auto_broadcast_spec; - layout expected_layout; + cldnn::layout expected_layout; std::vector stride; };