Improve memory footprint of columnar deserialization using relevant filter #895

Draft: wants to merge 21 commits into base: main
@@ -40,6 +40,9 @@ struct ComponentInfo {
// for non-uniform component, this is -1, we use indptr to describe the elements per scenario
Idx elements_per_scenario{};
Idx total_elements{};
// whether the dataset contains only a subset of attributes with meaningful values
bool has_attribute_indications{false};
std::vector<MetaAttribute const*> attribute_indications{};
};

struct DatasetInfo {
@@ -336,6 +339,21 @@ template <dataset_type_tag dataset_type_> class Dataset {
add_component_info_impl(component, elements_per_scenario, total_elements);
}

void enable_atrribute_indications(std::string_view component)
Review comment (Member): typo

Suggested change
void enable_atrribute_indications(std::string_view component)
void enable_attribute_indications(std::string_view component)

requires is_indptr_mutable_v<dataset_type>
{
Idx const idx = find_component(component, true);
dataset_info_.component_info[idx].has_attribute_indications = true;
}

void set_attribute_indications(std::string_view component, std::span<MetaAttribute const*> attribute_indications)
requires is_indptr_mutable_v<dataset_type>
{
Idx const idx = find_component(component, true);
dataset_info_.component_info[idx].attribute_indications = {attribute_indications.begin(),
attribute_indications.end()};
}

void add_buffer(std::string_view component, std::integral auto elements_per_scenario_,
std::integral auto total_elements_, Indptr* indptr, Data* data)
requires(!is_indptr_mutable_v<dataset_type>)
@@ -152,6 +152,14 @@ struct DefaultNullVisitor : msgpack::null_visitor {
}
};

struct NullVisitorCheckMap : DefaultNullVisitor {
Review comment (Member): it's not really a null-visitor anymore, right?

Suggested change
struct NullVisitorCheckMap : DefaultNullVisitor {
struct CheckHasMap : DefaultNullVisitor {

bool has_map{false};
bool start_map(uint32_t /*num_kv_pairs*/) {
has_map = true;
return true;
}
};

template <class T> struct DefaultErrorVisitor : DefaultNullVisitor {
static constexpr std::string_view static_err_msg = "Unexpected data type!\n";

@@ -348,6 +356,7 @@ template <> struct ValueVisitor<RealValue<asymmetric_t>> : DefaultErrorVisitor<V

class Deserializer {
using DefaultNullVisitor = detail::DefaultNullVisitor;
using NullVisitorCheckMap = detail::NullVisitorCheckMap;
template <class map_array> using MapArrayVisitor = detail::MapArrayVisitor<map_array>;
using StringVisitor = detail::StringVisitor;
using BoolVisitor = detail::BoolVisitor;
@@ -363,6 +372,7 @@ class Deserializer {
std::string_view component;
Idx size;
size_t offset;
bool has_map;
};
using DataByteMeta = std::vector<std::vector<ComponentByteMeta>>;
using AttributeByteMeta = std::vector<std::pair<std::string_view, std::vector<std::string_view>>>;
@@ -491,6 +501,12 @@ class Deserializer {
msgpack::parse(data_, size_, offset_, visitor);
}

bool parse_skip_check_map() {
NullVisitorCheckMap visitor{};
msgpack::parse(data_, size_, offset_, visitor);
return visitor.has_map;
}

WritableDataset pre_parse() {
try {
return pre_parse_impl();
@@ -563,7 +579,7 @@ class Deserializer {

WritableDataset handler{is_batch_, batch_size, dataset, *meta_data_};
count_data(handler, data_counts);
parse_predefined_attributes(handler.dataset(), attributes);
parse_predefined_attributes(handler, attributes);
return handler;
}

@@ -584,8 +600,9 @@
return attributes;
}

void parse_predefined_attributes(MetaDataset const& dataset, AttributeByteMeta const& attributes) {
void parse_predefined_attributes(WritableDataset& handler, AttributeByteMeta const& attributes) {
root_key_ = "attributes";
MetaDataset const& dataset = handler.dataset();
for (auto const& single_component : attributes) {
component_key_ = single_component.first;
MetaComponent const* const component = &dataset.get_component(component_key_);
@@ -595,6 +612,10 @@
attributes_per_component.push_back(&component->get_attribute(single_component.second[element_number_]));
}
attributes_[component] = std::move(attributes_per_component);
// set attribute indications if enabled
if (handler.get_component_info(component_key_).has_attribute_indications) {
handler.set_attribute_indications(component_key_, attributes_[component]);
}
element_number_ = -1;
}
component_key_ = {};
@@ -628,9 +649,10 @@
while (n_components-- != 0) {
component_key_ = parse_string();
Idx const component_size = parse_map_array<visit_array_t, stay_offset>().size;
count_per_scenario.push_back({component_key_, component_size, offset_});
// skip all the real content
parse_skip();
size_t const scenario_offset = offset_;
// skip all the real content but check if it has map
bool const has_map = parse_skip_check_map();
count_per_scenario.push_back({component_key_, component_size, scenario_offset, has_map});
}
component_key_ = {};
return count_per_scenario;
@@ -678,7 +700,14 @@
elements_per_scenario < 0 ? std::reduce(counter.cbegin(), counter.cend()) : // aggregation
elements_per_scenario * batch_size; // multiply
handler.add_component_info(component_key_, elements_per_scenario, total_elements);
msg_data_offsets_.push_back(component_byte_meta);
// check that no scenario contains any map
bool const has_attribute_indications = std::none_of(component_byte_meta.cbegin(), component_byte_meta.cend(),
[](auto const& x) { return x.has_map; });
msg_data_offsets_.push_back(std::move(component_byte_meta));
// enable attribute indications if possible
if (has_attribute_indications) {
handler.enable_atrribute_indications(component_key_);
}
component_key_ = {};
}

@@ -747,6 +776,16 @@
auto const reordered_attribute_buffers = detail::is_columnar_v<row_or_column_t>
? detail::reordered_attribute_buffers(buffer, attributes)
: std::vector<AttributeBuffer<void>>{};
// for columnar buffers:
// if there is no intersection between the pre-defined attributes and the user-provided buffer,
// and the whole component does not contain any map,
// skip the whole component for all scenarios and all elements
if constexpr (std::same_as<row_or_column_t, columnar_t>) {
if (info.has_attribute_indications && reordered_attribute_buffers.empty()) {
component_key_ = "";
return;
}
}

BufferView const buffer_view{
.buffer = &buffer, .idx = 0, .reordered_attribute_buffers = reordered_attribute_buffers};
@@ -780,6 +819,16 @@
return;
}

// for columnar buffers:
// if there is no intersection between the pre-defined attributes and the user-provided buffer,
// and this scenario does not contain any map,
// skip the whole scenario for this component for all elements
if constexpr (std::same_as<decltype(row_or_column_tag), columnar_t>) {
if (buffer_view.reordered_attribute_buffers.empty() && !msg_data.has_map) {
return;
}
}

// set offset and skip array header
offset_ = msg_data.offset;
parse_map_array<visit_array_t, move_forward>();
@@ -83,6 +83,46 @@ PGM_API PGM_Idx PGM_dataset_info_elements_per_scenario(PGM_Handle* handle, PGM_D
*/
PGM_API PGM_Idx PGM_dataset_info_total_elements(PGM_Handle* handle, PGM_DatasetInfo const* info, PGM_Idx component_idx);

/**
* @brief Return whether a component has attribute indications.
*
* Attribute indications are used to indicate the presence of meaningful attributes for a certain component in the
* dataset.
Review comment (Member) on lines +87 to +90: maybe it's good to also mention the behavior if it has both preamble attribute indications and in-data attributes

*
* @param handle
* @param info A pointer to the info object.
* @param component_idx The index number of the component.
* @return 1 if the component has attribute indications, 0 if it does not.
*/
PGM_API PGM_Idx PGM_dataset_info_has_attribute_indications(PGM_Handle* handle, PGM_DatasetInfo const* info,
PGM_Idx component_idx);

/**
* @brief Return the number of attribute indications for a component.
*
* @param handle
* @param info A pointer to the info object.
* @param component_idx The index number of the component.
* @return The number of attribute indications for the component.
* The behavior is undefined if PGM_dataset_info_has_attribute_indications() returns zero.
*/
PGM_API PGM_Idx PGM_dataset_info_n_attribute_indications(PGM_Handle* handle, PGM_DatasetInfo const* info,
PGM_Idx component_idx);

/**
* @brief Return the name of the i-th attribute indication for a component.
*
* @param handle
* @param info A pointer to the info object.
* @param component_idx The index number of the component.
* @param attribute_idx The index number of the attribute indication.
* @return A pointer to the null-terminated string of the attribute indication.
* The pointer has the same lifetime as the input info pointer.
* The behavior is undefined if PGM_dataset_info_has_attribute_indications() returns zero, or if attribute_idx is out of bounds.
*/
PGM_API char const* PGM_dataset_info_attribute_name(PGM_Handle* handle, PGM_DatasetInfo const* info,
PGM_Idx component_idx, PGM_Idx attribute_idx);
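For illustration, a minimal usage sketch of these three functions through the Python bindings added later in this PR; the info handle and component_idx are assumed to come from an existing dataset info object, and the import path is an assumption based on the other _core modules:

from power_grid_model._core.power_grid_core import power_grid_core as pgc  # assumed import path

def attribute_indications_of(info, component_idx):
    # None when the component has no indications; otherwise the list of attribute names.
    # Checking has_attribute_indications first avoids the undefined behavior documented above.
    if pgc.dataset_info_has_attribute_indications(info, component_idx) == 0:
        return None
    n_indications = pgc.dataset_info_n_attribute_indications(info, component_idx)
    return [
        pgc.dataset_info_attribute_name(info, component_idx, attribute_idx)
        for attribute_idx in range(n_indications)
    ]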

/**
* @brief Create an instance of PGM_ConstDataset.
* @param handle
15 changes: 15 additions & 0 deletions power_grid_model_c/power_grid_model_c/src/dataset.cpp
@@ -43,6 +43,21 @@ PGM_Idx PGM_dataset_info_total_elements(PGM_Handle* /*unused*/, PGM_DatasetInfo
return info->component_info[component_idx].total_elements;
}

PGM_Idx PGM_dataset_info_has_attribute_indications(PGM_Handle* /* handle */, PGM_DatasetInfo const* info,
PGM_Idx component_idx) {
return static_cast<PGM_Idx>(info->component_info[component_idx].has_attribute_indications);
}

PGM_Idx PGM_dataset_info_n_attribute_indications(PGM_Handle* /* handle */, PGM_DatasetInfo const* info,
PGM_Idx component_idx) {
return static_cast<PGM_Idx>(info->component_info[component_idx].attribute_indications.size());
}

char const* PGM_dataset_info_attribute_name(PGM_Handle* /* handle */, PGM_DatasetInfo const* info,
PGM_Idx component_idx, PGM_Idx attribute_idx) {
return info->component_info[component_idx].attribute_indications[attribute_idx]->name;
}

// const dataset

PGM_ConstDataset* PGM_create_dataset_const(PGM_Handle* handle, char const* dataset, PGM_Idx is_batch,
18 changes: 18 additions & 0 deletions src/power_grid_model/_core/power_grid_core.py
@@ -430,6 +430,24 @@ def dataset_info_elements_per_scenario( # type: ignore[empty-body]
def dataset_info_total_elements(self, info: DatasetInfoPtr, component_idx: int) -> int: # type: ignore[empty-body]
pass # pragma: no cover

@make_c_binding
def dataset_info_has_attribute_indications( # type: ignore[empty-body]
self, info: DatasetInfoPtr, component_idx: int
) -> int:
pass # pragma: no cover

@make_c_binding
def dataset_info_n_attribute_indications( # type: ignore[empty-body]
self, info: DatasetInfoPtr, component_idx: int
) -> int:
pass # pragma: no cover

@make_c_binding
def dataset_info_attribute_name( # type: ignore[empty-body]
self, info: DatasetInfoPtr, component_idx: int, attribute_idx: int
) -> str:
pass # pragma: no cover

@make_c_binding
def create_dataset_mutable( # type: ignore[empty-body]
self, dataset: str, is_batch: int, batch_size: int
29 changes: 28 additions & 1 deletion src/power_grid_model/_core/power_grid_dataset.py
@@ -132,6 +132,28 @@ def total_elements(self) -> Mapping[ComponentType, int]:
for idx, component_name in enumerate(self.components())
}

def attribute_indications(self) -> Mapping[ComponentType, None | list[AttributeType]]:
"""
The attribute indications in the dataset.

Returns:
A map of component to its attribute indications.
None means the component has no attribute indications.
"""
result_dict: dict[ComponentType, None | list[AttributeType]] = {}
components = self.components()
for component_idx, component_name in enumerate(components):
has_indications = pgc.dataset_info_has_attribute_indications(self._info, component_idx)
if has_indications == 0:
result_dict[component_name] = None
else:
n_indications = pgc.dataset_info_n_attribute_indications(self._info, component_idx)
result_dict[component_name] = [
pgc.dataset_info_attribute_name(self._info, component_idx, attribute_idx)
for attribute_idx in range(n_indications)
]
return result_dict
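As an illustration, a hypothetical result (values invented for this example, not taken from this diff): for a deserialized batch update whose sym_load data lists only id and p_specified in the attributes preamble and never uses in-data maps, while source has at least one element serialized as a map:

# hypothetical usage; info is a CDatasetInfo of a deserialized dataset
indications = info.attribute_indications()
# indications == {
#     ComponentType.sym_load: ["id", "p_specified"],  # only these columns carry meaningful values
#     ComponentType.source: None,                      # in-data maps present, so no indications
# }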


class CMutableDataset:
"""
@@ -446,6 +468,7 @@ def _get_buffer_properties(self, info: CDatasetInfo) -> Mapping[ComponentType, B
components = info.components()
n_elements_per_scenario = info.elements_per_scenario()
n_total_elements = info.total_elements()
attribute_indications = info.attribute_indications()

return {
component: BufferProperties(
@@ -457,6 +480,7 @@
columns=_get_filtered_attributes(
schema=self._schema[component],
component_data_filter=self._data_filter[component],
attribute_indication=attribute_indications[component],
),
)
for component in components
@@ -491,11 +515,14 @@ def _post_filtering(self):
def _get_filtered_attributes(
schema: ComponentMetaData,
component_data_filter: set[str] | list[str] | None | ComponentAttributeFilterOptions,
) -> list[str] | None:
attribute_indication: None | list[AttributeType],
) -> list[AttributeType] | None:
if component_data_filter is None:
return None

if isinstance(component_data_filter, ComponentAttributeFilterOptions):
if component_data_filter == ComponentAttributeFilterOptions.relevant and attribute_indication is not None:
return attribute_indication
return [] if schema.dtype.names is None else list(schema.dtype.names)

return list(component_data_filter)
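For illustration, a small worked example of the new relevant-filter branch (hypothetical inputs; when indications are available, the schema argument is not consulted on this branch):

# hypothetical call: the user requested the relevant filter and the deserializer
# recorded attribute indications ["id", "p_specified"] for this component
columns = _get_filtered_attributes(
    schema=sym_load_update_schema,  # assumed ComponentMetaData; unused when indications are available
    component_data_filter=ComponentAttributeFilterOptions.relevant,
    attribute_indication=["id", "p_specified"],
)
# columns == ["id", "p_specified"]: only these attribute buffers get allocated instead of the full schema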