-
Notifications
You must be signed in to change notification settings - Fork 33
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Improve memory footprint of columnar deserialization using relevant filter #895
base: main
Are you sure you want to change the base?
Changes from 20 commits
b4b8eae
32b637b
fd8ff8c
d983c77
cc6c422
af44232
926993b
9db08f9
4d8e19e
d12b5f5
f41351c
d0887bc
0060c9a
f4e780c
159eae8
2cdd407
34614ca
81fa0a3
474f468
96d7c79
1dd9f1c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -152,6 +152,14 @@ struct DefaultNullVisitor : msgpack::null_visitor { | |||||
} | ||||||
}; | ||||||
|
||||||
struct NullVisitorCheckMap : DefaultNullVisitor { | ||||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. it's not really a null-visitor anymore, right?
Suggested change
|
||||||
bool has_map{false}; | ||||||
bool start_map(uint32_t /*num_kv_pairs*/) { | ||||||
has_map = true; | ||||||
return true; | ||||||
} | ||||||
}; | ||||||
|
||||||
template <class T> struct DefaultErrorVisitor : DefaultNullVisitor { | ||||||
static constexpr std::string_view static_err_msg = "Unexpected data type!\n"; | ||||||
|
||||||
|
@@ -348,6 +356,7 @@ template <> struct ValueVisitor<RealValue<asymmetric_t>> : DefaultErrorVisitor<V | |||||
|
||||||
class Deserializer { | ||||||
using DefaultNullVisitor = detail::DefaultNullVisitor; | ||||||
using NullVisitorCheckMap = detail::NullVisitorCheckMap; | ||||||
template <class map_array> using MapArrayVisitor = detail::MapArrayVisitor<map_array>; | ||||||
using StringVisitor = detail::StringVisitor; | ||||||
using BoolVisitor = detail::BoolVisitor; | ||||||
|
@@ -363,6 +372,7 @@ class Deserializer { | |||||
std::string_view component; | ||||||
Idx size; | ||||||
size_t offset; | ||||||
bool has_map; | ||||||
}; | ||||||
using DataByteMeta = std::vector<std::vector<ComponentByteMeta>>; | ||||||
using AttributeByteMeta = std::vector<std::pair<std::string_view, std::vector<std::string_view>>>; | ||||||
|
@@ -491,6 +501,12 @@ class Deserializer { | |||||
msgpack::parse(data_, size_, offset_, visitor); | ||||||
} | ||||||
|
||||||
// Parses the msgpack object at the current read position without storing its content,
// and reports whether a map was encountered while doing so.
// Runs msgpack::parse over (data_, size_, offset_) with a NullVisitorCheckMap, whose
// start_map() callback sets has_map to true.
// NOTE(review): presumably msgpack::parse advances offset_ past the parsed object, as the
// "skip" in the name suggests - confirm against the msgpack-c API.
// Returns the visitor's has_map flag after parsing.
bool parse_skip_check_map() { | ||||||
NullVisitorCheckMap visitor{}; | ||||||
msgpack::parse(data_, size_, offset_, visitor); | ||||||
return visitor.has_map; | ||||||
} | ||||||
|
||||||
WritableDataset pre_parse() { | ||||||
try { | ||||||
return pre_parse_impl(); | ||||||
|
@@ -563,7 +579,7 @@ class Deserializer { | |||||
|
||||||
WritableDataset handler{is_batch_, batch_size, dataset, *meta_data_}; | ||||||
count_data(handler, data_counts); | ||||||
parse_predefined_attributes(handler.dataset(), attributes); | ||||||
parse_predefined_attributes(handler, attributes); | ||||||
return handler; | ||||||
} | ||||||
|
||||||
|
@@ -584,8 +600,9 @@ class Deserializer { | |||||
return attributes; | ||||||
} | ||||||
|
||||||
void parse_predefined_attributes(MetaDataset const& dataset, AttributeByteMeta const& attributes) { | ||||||
void parse_predefined_attributes(WritableDataset& handler, AttributeByteMeta const& attributes) { | ||||||
root_key_ = "attributes"; | ||||||
MetaDataset const& dataset = handler.dataset(); | ||||||
for (auto const& single_component : attributes) { | ||||||
component_key_ = single_component.first; | ||||||
MetaComponent const* const component = &dataset.get_component(component_key_); | ||||||
|
@@ -595,6 +612,10 @@ class Deserializer { | |||||
attributes_per_component.push_back(&component->get_attribute(single_component.second[element_number_])); | ||||||
} | ||||||
attributes_[component] = std::move(attributes_per_component); | ||||||
// set attribute indications if enabled | ||||||
if (handler.get_component_info(component_key_).has_attribute_indications) { | ||||||
handler.set_attribute_indications(component_key_, attributes_[component]); | ||||||
} | ||||||
element_number_ = -1; | ||||||
} | ||||||
component_key_ = {}; | ||||||
|
@@ -628,9 +649,10 @@ class Deserializer { | |||||
while (n_components-- != 0) { | ||||||
component_key_ = parse_string(); | ||||||
Idx const component_size = parse_map_array<visit_array_t, stay_offset>().size; | ||||||
count_per_scenario.push_back({component_key_, component_size, offset_}); | ||||||
// skip all the real content | ||||||
parse_skip(); | ||||||
size_t const scenario_offset = offset_; | ||||||
// skip all the real content but check if it has map | ||||||
bool const has_map = parse_skip_check_map(); | ||||||
count_per_scenario.push_back({component_key_, component_size, scenario_offset, has_map}); | ||||||
} | ||||||
component_key_ = {}; | ||||||
return count_per_scenario; | ||||||
|
@@ -678,7 +700,14 @@ class Deserializer { | |||||
elements_per_scenario < 0 ? std::reduce(counter.cbegin(), counter.cend()) : // aggregation | ||||||
elements_per_scenario * batch_size; // multiply | ||||||
handler.add_component_info(component_key_, elements_per_scenario, total_elements); | ||||||
msg_data_offsets_.push_back(component_byte_meta); | ||||||
// check that none of the scenarios has a map | ||||||
bool const has_attribute_indications = std::none_of(component_byte_meta.cbegin(), component_byte_meta.cend(), | ||||||
[](auto const& x) { return x.has_map; }); | ||||||
msg_data_offsets_.push_back(std::move(component_byte_meta)); | ||||||
// enable attribute indications if possible | ||||||
if (has_attribute_indications) { | ||||||
handler.enable_atrribute_indications(component_key_); | ||||||
} | ||||||
component_key_ = {}; | ||||||
} | ||||||
|
||||||
|
@@ -747,6 +776,16 @@ class Deserializer { | |||||
auto const reordered_attribute_buffers = detail::is_columnar_v<row_or_column_t> | ||||||
? detail::reordered_attribute_buffers(buffer, attributes) | ||||||
: std::vector<AttributeBuffer<void>>{}; | ||||||
// for columnar buffer | ||||||
// if there is no intersection between the pre-defined attributes and the user-provided buffer | ||||||
// and the whole component does not have a map | ||||||
// skip the whole component for all scenarios and all elements | ||||||
if constexpr (std::same_as<row_or_column_t, columnar_t>) { | ||||||
if (info.has_attribute_indications && reordered_attribute_buffers.empty()) { | ||||||
component_key_ = ""; | ||||||
return; | ||||||
} | ||||||
} | ||||||
|
||||||
BufferView const buffer_view{ | ||||||
.buffer = &buffer, .idx = 0, .reordered_attribute_buffers = reordered_attribute_buffers}; | ||||||
|
@@ -780,6 +819,16 @@ class Deserializer { | |||||
return; | ||||||
} | ||||||
|
||||||
// for columnar buffer | ||||||
// if there is no intersection between the pre-defined attributes and the user-provided buffer | ||||||
// and this scenario does not have a map | ||||||
// skip the whole scenario for this component for all elements | ||||||
if constexpr (std::same_as<decltype(row_or_column_tag), columnar_t>) { | ||||||
if (buffer_view.reordered_attribute_buffers.empty() && !msg_data.has_map) { | ||||||
return; | ||||||
} | ||||||
} | ||||||
|
||||||
// set offset and skip array header | ||||||
offset_ = msg_data.offset; | ||||||
parse_map_array<visit_array_t, move_forward>(); | ||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -83,6 +83,46 @@ PGM_API PGM_Idx PGM_dataset_info_elements_per_scenario(PGM_Handle* handle, PGM_D | |
*/ | ||
PGM_API PGM_Idx PGM_dataset_info_total_elements(PGM_Handle* handle, PGM_DatasetInfo const* info, PGM_Idx component_idx); | ||
|
||
/** | ||
* @brief Return if a component has attribute indications. | ||
* | ||
* Attribute indications are used to indicate the presence of meaningful attributes for a certain component in the | ||
* dataset. | ||
Comment on lines
+87
to
+90
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. maybe it's good to also mention the behavior if it has both preamble attribute indications and in-data attributes |
||
* | ||
* @param handle | ||
* @param info A pointer to the info object. | ||
* @param component_idx The index number of the component. | ||
* @return 1 if the component has attribute indications, 0 if it does not. | ||
*/ | ||
PGM_API PGM_Idx PGM_dataset_info_has_attribute_indications(PGM_Handle* handle, PGM_DatasetInfo const* info, | ||
PGM_Idx component_idx); | ||
|
||
/** | ||
* @brief Return the number of attribute indications for a component. | ||
* | ||
* @param handle | ||
* @param info A pointer to the info object. | ||
* @param component_idx The index number of the component. | ||
* @return The number of attribute indications for the component. | ||
* It is UB if PGM_dataset_info_has_attribute_indications() returns zero. | ||
*/ | ||
PGM_API PGM_Idx PGM_dataset_info_n_attribute_indications(PGM_Handle* handle, PGM_DatasetInfo const* info, | ||
PGM_Idx component_idx); | ||
|
||
/** | ||
* @brief Return the name of the i-th attribute indication for a component. | ||
* | ||
* @param handle | ||
* @param info A pointer to the info object. | ||
* @param component_idx The index number of the component. | ||
* @param attribute_idx The index number of attribute indication. | ||
* @return A pointer to the null-terminated string of the attribute indication. | ||
* The pointer has the same lifetime as the input info pointer. | ||
* It is UB if PGM_dataset_info_has_attribute_indications() returns zero, or if attribute_idx is out of bounds. | ||
*/ | ||
PGM_API char const* PGM_dataset_info_attribute_name(PGM_Handle* handle, PGM_DatasetInfo const* info, | ||
PGM_Idx component_idx, PGM_Idx attribute_idx); | ||
|
||
/** | ||
* @brief Create an instance of PGM_ConstDataset. | ||
* @param handle | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
typo