[GPU] Enable weightless cache with precision conversion (#27742)

### Details:
This change makes constants that undergo precision conversion during the transformation pipeline or graph optimization eligible for weightless caching. Information about any precision conversion that happened before export to the cache is recorded in the cache file. During import from the cache, functionally equivalent conversions are performed.

Besides the unit tests in model_cache.cpp, I tested the accuracy and performance of llama-2-7b-chat in FP16 inference mode.

Performance impact (weightless caching is OPTIMIZE_SIZE):

| | OPTIMIZE_SPEED | OPTIMIZE_SIZE |
| -- | -- | -- |
| FP16 model import, no cache | 25.4 s | 13.6 s |
| FP16 model import, cache exists | 6.2 s | 6.4 s |
| FP32 model import, no cache | 57.6 s | 45.8 s |
| FP32 model import, cache exists | 8.5 s | 15.2 s |

| | OPTIMIZE_SPEED | OPTIMIZE_SIZE |
| -- | -- | -- |
| FP16 model cache size | 13 GB | 6.1 MB |
| FP32 model cache size | 13 GB | 6.2 MB |

Model import time is the measured duration of the from_pretrained() call when running the llama model with the openvino.genai/tools/llm_bench tool.

Question to reviewers: I'm unsure whether the condition in ov::WeightlessCacheAttribute::is_copyable() is too lenient. Specifically, I'm thinking of a scenario where a single complex transformation changes a constant's data type AND something else at the same time. This would render the constant eligible for weightless caching even though the reconstruction of transformations during the cache load is not aware of anything besides the data type change (which would break the feature). Does such a complex transformation exist?

### Tickets:
- CVS-157081
openvinotoolkit · Dec 13, 2024 · 59984e9 · 59984e9
1 parent 8e7ff7b
commit 59984e9
Show file tree

Hide file tree

Showing 10 changed files with 375 additions and 108 deletions.
diff --git a/src/common/transformations/src/transformations/convert_precision.cpp b/src/common/transformations/src/transformations/convert_precision.cpp
@@ -8,6 +8,7 @@
 #include <vector>
 
 #include "itt.hpp"
+#include "openvino/core/rt_info/weightless_caching_attributes.hpp"
 #include "openvino/op/ops.hpp"
 #include "openvino/pass/constant_folding.hpp"
 #include "openvino/pass/manager.hpp"
@@ -1405,6 +1406,13 @@ bool fuse_type_to_constant(const std::shared_ptr<ov::Node>& node,
         new_const->validate_and_infer_types();
         new_const->set_friendly_name(constant->get_friendly_name());
         ov::copy_runtime_info(constant, new_const);
+
+        const auto& rt_info = node->get_rt_info();
+        auto weightless_caching_attr = rt_info.find(ov::WeightlessCacheAttribute::get_type_info_static());
+        if (weightless_caching_attr != rt_info.end()) {
+            new_const->get_rt_info()[ov::WeightlessCacheAttribute::get_type_info_static()] =
+                weightless_caching_attr->second;
+        }
         return true;
     }
     return false;

diff --git a/src/common/transformations/tests/utils/convert_precision.cpp b/src/common/transformations/tests/utils/convert_precision.cpp
@@ -13,6 +13,7 @@
 
 #include "common_test_utils/ov_test_utils.hpp"
 #include "openvino/core/model.hpp"
+#include "openvino/core/rt_info/weightless_caching_attributes.hpp"
 #include "openvino/opsets/opset1.hpp"
 #include "openvino/opsets/opset10.hpp"
 #include "openvino/opsets/opset15.hpp"
@@ -2702,3 +2703,38 @@ TEST(TransformationTests, ConvertPrecision_assign_read_value_preserve_orig_types
     FunctionsComparator::Result result = func_comparator(model_ref, model);
     ASSERT_TRUE(result.valid) << result.message;
 }
+
+TEST(TransformationTests, ConvertPrecision_assign_read_value_preserve_weightless_cache_info_as_rt_attribute) {  // Checks that a Constant still carries WeightlessCacheAttribute in its rt_info after an f32 -> f16 ConvertPrecision pass. NOTE(review): the name says assign_read_value, but no Assign/ReadValue ops are built here - confirm the name is intended.
+    pass::Manager manager;
+
+    auto some_value = opset10::Constant::create(element::f32, Shape{1}, {2});  // single-element f32 constant that the pass is expected to convert
+    auto& node_rt_info = some_value->get_rt_info();
+    ov::WeightlessCacheAttribute attr(element::f32.size(), 0, element::f32);  // arguments are (original_size, bin_offset, original_dtype)
+    node_rt_info[ov::WeightlessCacheAttribute::get_type_info_static()] = attr;  // tag the constant before the pass runs
+
+    ov::ParameterVector inputParams;  // left empty: the model has no Parameter nodes
+    ov::ResultVector results;
+    results.push_back(std::make_shared<ov::op::v0::Result>(some_value->output(0)));  // the constant feeds the Result directly
+    auto model = std::make_shared<ov::Model>(results, inputParams);
+
+    type_to_fuse_map empty_type_to_fuse_map = {};  // no additional fuse callbacks are supplied
+    bool keep_precision_sensitive_in_fp32 = false;
+    bool convert_input_output_precision = false;
+    bool store_original_precision_as_rt_attribute = true;
+    manager.register_pass<pass::ConvertPrecision>(precisions_map{{element::f32, element::f16}},  // request f32 -> f16 conversion
+                                                  empty_type_to_fuse_map,
+                                                  keep_precision_sensitive_in_fp32,
+                                                  convert_input_output_precision,
+                                                  store_original_precision_as_rt_attribute);
+    manager.run_passes(model);
+
+    const auto& ops = model->get_ops();
+    auto it = std::find_if(ops.begin(), ops.end(), [](const std::shared_ptr<Node>& node) {  // locate the first constant op in the transformed model
+        return ov::op::util::is_constant(node);
+    });
+
+    ASSERT_TRUE(it != ops.end());  // a constant must still be present after the pass
+    const auto& new_rt_info = (*it)->get_rt_info();
+    auto weightless_caching_attr_it = new_rt_info.find(ov::WeightlessCacheAttribute::get_type_info_static());
+    ASSERT_TRUE(weightless_caching_attr_it != new_rt_info.end());  // only the attribute's presence is checked, not its field values
+}
diff --git a/src/core/dev_api/openvino/core/rt_info/weightless_caching_attributes.hpp b/src/core/dev_api/openvino/core/rt_info/weightless_caching_attributes.hpp
@@ -5,6 +5,7 @@
 #pragma once
 
 #include "openvino/core/core_visibility.hpp"
+#include "openvino/core/node.hpp"
 #include "openvino/core/runtime_attribute.hpp"
 
 namespace ov {
@@ -25,14 +26,16 @@ class OPENVINO_API WeightlessCacheAttribute : public RuntimeAttribute {
 
     WeightlessCacheAttribute() = delete;
 
-    WeightlessCacheAttribute(size_t original_size, size_t bin_offset)
+    WeightlessCacheAttribute(size_t original_size, size_t bin_offset, ov::element::Type original_dtype)
         : original_size(original_size),
-          bin_offset(bin_offset) {}
+          bin_offset(bin_offset),
+          original_dtype(original_dtype) {}
 
     bool is_copyable() const override;
 
     size_t original_size;
     size_t bin_offset;
+    ov::element::Type original_dtype;
 };
 
 }  // namespace ov
diff --git a/src/frontends/ir/src/ir_deserializer.cpp b/src/frontends/ir/src/ir_deserializer.cpp
@@ -950,10 +950,12 @@ std::shared_ptr<ov::Node> ov::XmlDeserializer::create_node(const std::vector<ov:
         }
         const auto size = dn.attribute("size");
         const auto offset = dn.attribute("offset");
-        if (size && offset) {
+        const auto element_type = dn.attribute("element_type");
+        if (size && offset && element_type) {
             rtInfo[ov::WeightlessCacheAttribute::get_type_info_static()] =
                 ov::WeightlessCacheAttribute(static_cast<size_t>(pugixml::get_uint64_attr(dn, "size")),
-                                             static_cast<size_t>(pugixml::get_uint64_attr(dn, "offset")));
+                                             static_cast<size_t>(pugixml::get_uint64_attr(dn, "offset")),
+                                             ov::element::Type(pugixml::get_str_attr(dn, "element_type")));
         }
     }