-
Notifications
You must be signed in to change notification settings - Fork 72
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Implement OD verifier algorithm #420
base: main
Are you sure you want to change the base?
Changes from 4 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -11,7 +11,7 @@ using AlgorithmTypes = | |
Apriori, metric::MetricVerifier, DataStats, fd_verifier::FDVerifier, HyUCC, | ||
PyroUCC, cfd::FDFirstAlgorithm, ACAlgorithm, UCCVerifier, Faida, Spider, Mind, | ||
Fastod, GfdValidation, EGfdValidation, NaiveGfdValidation, order::Order, | ||
dd::Split>; | ||
dd::Split, od_verifier::ODVerifier>; | ||
|
||
// clang-format off | ||
/* Enumeration of all supported non-pipeline algorithms. If you implement a new | ||
|
@@ -76,7 +76,9 @@ BETTER_ENUM(AlgorithmType, char, | |
order, | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think that OD mining algorithms should be mentioned next to each other, so you can move 'order'. |
||
|
||
/* Differential dependencies mining algorithm */ | ||
split | ||
split, | ||
/* Canonical OD verifier algorithm */ | ||
od_verifier | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think that OD verifier algorithms should be mentioned after OD mining algorithms. |
||
) | ||
// clang-format on | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,5 @@ | ||
#pragma once | ||
|
||
#include "algorithms/od/fastod/fastod.h" | ||
#include "algorithms/od/od_verifier/od_verifier.h" | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. od_verifier should be mentioned in verification_algorithms.h. |
||
#include "algorithms/od/order/order.h" |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,89 @@ | ||
#include "od_verifier.h" | ||
|
||
#include "ascending_od/option.h" | ||
#include "config/equal_nulls/option.h" | ||
#include "config/indices/od_context.h" | ||
#include "config/indices/option.h" | ||
#include "config/tabular_data/input_table/option.h" | ||
#include "partition.h" | ||
|
||
namespace algos::od_verifier { | ||
|
||
// Builds the verifier: registers every option, then opens up the two options
// (input table and null-equality flag) that can be set right after construction.
ODVerifier::ODVerifier() : Algorithm({}) {
    RegisterOptions();

    auto const table_option_name = config::kTableOpt.GetName();
    auto const equal_nulls_option_name = config::kEqualNullsOpt.GetName();
    MakeOptionsAvailable({table_option_name, equal_nulls_option_name});
}
|
||
void ODVerifier::RegisterOptions() { | ||
auto get_schema_cols = [this]() { return relation_->GetSchema()->GetNumColumns(); }; | ||
|
||
RegisterOption(config::kTableOpt(&input_table_)); | ||
RegisterOption(config::kEqualNullsOpt(&is_null_equal_null_)); | ||
RegisterOption(config::kLhsIndicesOpt(&lhs_indices_, get_schema_cols)); | ||
RegisterOption(config::kRhsIndicesOpt(&rhs_indices_, get_schema_cols)); | ||
RegisterOption(config::kODContextOpt(&context_indices_)); | ||
RegisterOption(config::kAscendingODOpt(&ascending_)); | ||
} | ||
|
||
// Opens up the options that describe the dependency itself: LHS, RHS, context
// and ordering direction.
void ODVerifier::MakeExecuteOptsAvailable() {
    auto const lhs_option_name = config::kLhsIndicesOpt.GetName();
    auto const rhs_option_name = config::kRhsIndicesOpt.GetName();
    auto const context_option_name = config::kODContextOpt.GetName();
    auto const ascending_option_name = config::kAscendingODOpt.GetName();

    MakeOptionsAvailable(
            {lhs_option_name, rhs_option_name, context_option_name, ascending_option_name});
}
|
||
// Reads the input table twice: once into a column-layout relation (used for schema
// information) and once into the fastod DataFrame the partitions work on.
// Throws std::runtime_error if either representation has no columns.
void ODVerifier::LoadDataInternal() {
    relation_ = ColumnLayoutRelationData::CreateFrom(*input_table_, is_null_equal_null_);
    bool const relation_is_empty = relation_->GetColumnData().empty();
    if (relation_is_empty) {
        throw std::runtime_error("Got an empty dataset: OD verifying is meaningless.");
    }

    // Reset the table before it is read a second time.
    input_table_->Reset();
    data_ = std::make_shared<DataFrame>(DataFrame::FromInputTable(input_table_));
    bool const data_frame_is_empty = data_->GetColumnCount() == 0;
    if (data_frame_is_empty) {
        throw std::runtime_error("Got an empty dataset: OD verifying is meaningless.");
    }
}
|
||
unsigned long long ODVerifier::ExecuteInternal() { | ||
auto start_time = std::chrono::system_clock::now(); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It is better to use existing methods. Use util::TimedInvoke (example). |
||
if (ascending_) | ||
VerifyOD<true>(); | ||
else | ||
VerifyOD<false>(); | ||
auto elapsed_milliseconds = std::chrono::duration_cast<std::chrono::milliseconds>( | ||
std::chrono::system_clock::now() - start_time); | ||
return elapsed_milliseconds.count(); | ||
} | ||
|
||
template <bool Ascending> | ||
void ODVerifier::VerifyOD() { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This method combines two logical parts, so these parts should be implemented in different methods that can be invoked from |
||
AttributeSet context; | ||
|
||
for (auto column : context_indices_) context.Set(column); | ||
|
||
fastod::ComplexStrippedPartition stripped_partition_swap( | ||
(partition_cache_.GetStrippedPartition(context, data_))); | ||
|
||
if (stripped_partition_swap.Swap<Ascending>(lhs_indices_[0], rhs_indices_[0])) { | ||
Partition part{stripped_partition_swap}; | ||
std::vector<std::pair<int, int>> violates( | ||
part.FindViolationsBySwap<Ascending>(lhs_indices_[0], rhs_indices_[0])); | ||
|
||
for (auto position_violate : violates) | ||
row_violate_ods_by_swap_.push_back(position_violate.second + 1); | ||
} | ||
|
||
context.Set(lhs_indices_[0]); | ||
fastod::ComplexStrippedPartition stripped_partition_split( | ||
partition_cache_.GetStrippedPartition(context, data_)); | ||
|
||
if (stripped_partition_split.Split(rhs_indices_[0])) { | ||
Partition part{stripped_partition_split}; | ||
std::vector<std::pair<int, int>> violates(part.FindViolationsBySplit(rhs_indices_[0])); | ||
|
||
for (auto position_violate : violates) | ||
row_violate_ods_by_split_.push_back(position_violate.second + 1); | ||
} | ||
std::sort(row_violate_ods_by_split_.begin(), row_violate_ods_by_split_.end()); | ||
std::sort(row_violate_ods_by_swap_.begin(), row_violate_ods_by_swap_.end()); | ||
} | ||
|
||
} // namespace algos::od_verifier |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,76 @@ | ||
#pragma once | ||
|
||
#include "algorithms/algorithm.h" | ||
#include "algorithms/od/fastod/model/canonical_od.h" | ||
#include "config/indices/type.h" | ||
#include "model/table/column_layout_relation_data.h" | ||
|
||
namespace algos::od_verifier { | ||
|
||
class ODVerifier : public Algorithm { | ||
private: | ||
using IndicesType = config::IndicesType; | ||
using IndexType = config::IndexType; | ||
using DataFrame = fastod::DataFrame; | ||
using PartitionCache = fastod::PartitionCache; | ||
using AscCanonicalOD = fastod::AscCanonicalOD; | ||
using DescCanonicalOD = fastod::DescCanonicalOD; | ||
using SimpleCanonicalOD = fastod::SimpleCanonicalOD; | ||
using AttributeSet = fastod::AttributeSet; | ||
|
||
// input data | ||
config::InputTable input_table_; | ||
config::EqNullsType is_null_equal_null_; | ||
IndicesType lhs_indices_; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Canonical ODs contain only one attribute (or don't contain attributes) in left side. Why |
||
IndicesType rhs_indices_; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Canonical ODs contain only one attribute in right side. Why |
||
IndicesType context_indices_; | ||
bool ascending_; | ||
|
||
// auxiliary data | ||
std::shared_ptr<ColumnLayoutRelationData> relation_; | ||
std::shared_ptr<DataFrame> data_; | ||
PartitionCache partition_cache_; | ||
|
||
// rows that vioalates ods | ||
std::vector<int> row_violate_ods_by_swap_; | ||
std::vector<int> row_violate_ods_by_split_; | ||
|
||
// load input data | ||
void RegisterOptions(); | ||
void MakeExecuteOptsAvailable() override; | ||
void LoadDataInternal() override; | ||
|
||
// runs the algorithm and measures its time | ||
unsigned long long ExecuteInternal() override; | ||
|
||
// checks whether OD is violated and finds the rows where it is violated | ||
template <bool Ascending> | ||
void VerifyOD(); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Template methods should be implemented in header file. |
||
|
||
// reset statistic of violations | ||
void ResetState() override { | ||
row_violate_ods_by_swap_.clear(); | ||
row_violate_ods_by_split_.clear(); | ||
} | ||
|
||
public: | ||
// checks whether the OD has broken | ||
bool ODHolds() const { | ||
return row_violate_ods_by_swap_.empty() && row_violate_ods_by_split_.empty(); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It is better to implement methods in .cpp. |
||
} | ||
|
||
// base constructor | ||
ODVerifier(); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Constructors should be mentioned before methods. |
||
|
||
// Returns the number of rows that violate the OD by split | ||
size_t GetNumRowsViolateBySplit() const { | ||
return row_violate_ods_by_split_.size(); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It is better to implement methods in .cpp. |
||
} | ||
|
||
// Returns the number of rows that violate the OD by swap | ||
size_t GetNumRowsViolateBySwap() const { | ||
return row_violate_ods_by_swap_.size(); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It is better to implement methods in .cpp. |
||
} | ||
}; | ||
|
||
} // namespace algos::od_verifier |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
#include "partition.h" | ||
|
||
#include <strings.h> | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is this include necessary? If not, remove it. |
||
#include <utility> | ||
#include <vector> | ||
|
||
namespace algos::od_verifier { | ||
|
||
// Split violations for the stripped (index-list) representation.
//
// For every equivalence class, the value of column `right` in the class's first row is
// taken as the reference; each later row with a different value is reported.
// Returns pairs (right column index, row index).
std::vector<std::pair<int, int>> Partition::CommonViolationBySplit(model::ColumnIndex right) const {
    std::vector<std::pair<int, int>> violates;

    // `group + 1 < size()` rather than `group < size() - 1`: the latter wraps around
    // for an empty boundary list because size() is unsigned.
    for (size_t group = 0; group + 1 < sp_begins_->size(); ++group) {
        size_t const group_begin = (*sp_begins_)[group];
        size_t const group_end = (*sp_begins_)[group + 1];

        int const group_value = data_->GetValue((*sp_indexes_)[group_begin], right);

        for (size_t i = group_begin + 1; i < group_end; ++i) {
            if (data_->GetValue((*sp_indexes_)[i], right) != group_value) {
                // emplace_back builds the pair in place; no temporary is needed.
                violates.emplace_back(static_cast<int>(right),
                                      static_cast<int>((*sp_indexes_)[i]));
            }
        }
    }

    return violates;
}
|
||
// Split violations for the range-based representation.
//
// Each equivalence class is a run of inclusive row ranges. The value of column `right`
// in the first row of the class's first range is the reference; every row of the class
// with a different value is reported. Returns pairs (right column index, row index).
std::vector<std::pair<int, int>> Partition::RangeBasedViolationBySplit(
        model::ColumnIndex right) const {
    std::vector<std::pair<int, int>> violates;

    // `group + 1 < size()` rather than `group < size() - 1`: the latter wraps around
    // for an empty boundary list because size() is unsigned.
    for (size_t group = 0; group + 1 < rb_begins_->size(); ++group) {
        size_t const group_begin = (*rb_begins_)[group];
        size_t const group_end = (*rb_begins_)[group + 1];

        int const group_value = data_->GetValue((*rb_indexes_)[group_begin].first, right);

        for (size_t i = group_begin; i < group_end; ++i) {
            algos::fastod::DataFrame::Range const range = (*rb_indexes_)[i];

            for (size_t row = range.first; row <= range.second; ++row) {
                if (data_->GetValue(row, right) != group_value) {
                    // emplace_back builds the pair in place; no temporary is needed.
                    violates.emplace_back(static_cast<int>(right), static_cast<int>(row));
                }
            }
        }
    }

    return violates;
}
|
||
// Dispatches the split-violation search to the implementation that matches the
// representation currently held by this partition.
std::vector<std::pair<int, int>> Partition::FindViolationsBySplit(model::ColumnIndex right) const {
    if (is_stripped_partition_) {
        return CommonViolationBySplit(right);
    }
    return RangeBasedViolationBySplit(right);
}
|
||
} // namespace algos::od_verifier |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,90 @@ | ||
#pragma once | ||
|
||
#include "algorithms/od/fastod/partitions/complex_stripped_partition.h" | ||
|
||
namespace algos::od_verifier { | ||
|
||
class Partition : protected algos::fastod::ComplexStrippedPartition { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Name of this class should be more specific. |
||
private: | ||
std::vector<std::pair<int, int>> CommonViolationBySplit(model::ColumnIndex right) const; | ||
|
||
std::vector<std::pair<int, int>> RangeBasedViolationBySplit(model::ColumnIndex right) const; | ||
|
||
public: | ||
Partition() : algos::fastod::ComplexStrippedPartition() {} | ||
|
||
Partition(algos::fastod::ComplexStrippedPartition const& daddy) | ||
: algos::fastod::ComplexStrippedPartition(daddy) {} | ||
|
||
std::vector<std::pair<int, int>> FindViolationsBySplit(model::ColumnIndex right) const; | ||
|
||
template <bool Ascending> | ||
std::vector<std::pair<int, int>> FindViolationsBySwap(model::ColumnIndex left, | ||
model::ColumnIndex right) const { | ||
size_t const group_count = is_stripped_partition_ ? sp_begins_->size() : rb_begins_->size(); | ||
std::vector<std::pair<int, int>> violates; | ||
|
||
for (size_t begin_pointer = 0; begin_pointer < group_count - 1; begin_pointer++) { | ||
size_t const group_begin = is_stripped_partition_ ? (*sp_begins_)[begin_pointer] | ||
: (*rb_begins_)[begin_pointer]; | ||
|
||
size_t const group_end = is_stripped_partition_ ? (*sp_begins_)[begin_pointer + 1] | ||
: (*rb_begins_)[begin_pointer + 1]; | ||
|
||
std::vector<std::pair<int, int>> values; | ||
std::vector<int> row_pos; | ||
|
||
if (is_stripped_partition_) { | ||
values.reserve(group_end - group_begin); | ||
|
||
for (size_t i = group_begin; i < group_end; ++i) { | ||
size_t const index = (*sp_indexes_)[i]; | ||
|
||
values.emplace_back(data_->GetValue(index, left), | ||
data_->GetValue(index, right)); | ||
row_pos.emplace_back(index); | ||
} | ||
} else { | ||
for (size_t i = group_begin; i < group_end; ++i) { | ||
algos::fastod::DataFrame::Range const range = (*rb_indexes_)[i]; | ||
|
||
for (size_t j = range.first; j <= range.second; ++j) { | ||
values.emplace_back(data_->GetValue(j, left), data_->GetValue(j, right)); | ||
} | ||
} | ||
} | ||
|
||
if constexpr (Ascending) { | ||
std::sort(values.begin(), values.end(), | ||
[](auto const& p1, auto const& p2) { return p1.first < p2.first; }); | ||
} else { | ||
std::sort(values.begin(), values.end(), | ||
[](auto const& p1, auto const& p2) { return p2.first < p1.first; }); | ||
} | ||
|
||
size_t prev_group_max_index = 0; | ||
size_t current_group_max_index = 0; | ||
bool is_first_group = true; | ||
|
||
for (size_t i = 0; i < values.size(); i++) { | ||
auto const& [first, second] = values[i]; | ||
|
||
if (i != 0 && values[i - 1].first != first) { | ||
is_first_group = false; | ||
prev_group_max_index = current_group_max_index; | ||
current_group_max_index = i; | ||
} else if (values[current_group_max_index].second <= second) { | ||
current_group_max_index = i; | ||
} | ||
|
||
if (!is_first_group && values[prev_group_max_index].second > second) { | ||
violates.push_back(std::pair<int, int>(right, row_pos[i])); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Use |
||
} | ||
} | ||
} | ||
|
||
return violates; | ||
} | ||
}; | ||
|
||
} // namespace algos::od_verifier |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
#include "ascending_od/option.h" | ||
|
||
#include "ascending_od/type.h" | ||
#include "config/names_and_descriptions.h" | ||
|
||
namespace config { | ||
extern CommonOption<AscendingODFlagType> const kAscendingODOpt{names::kAscendingOD, | ||
descriptions::kDAscendingOD, true}; | ||
} // namespace config |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
#pragma once | ||
|
||
#include "config/ascending_od/type.h" | ||
#include "config/common_option.h" | ||
|
||
namespace config { | ||
extern CommonOption<AscendingODFlagType> const kAscendingODOpt; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 'AscendingOD' option is useful only for OD verifier and won't be needed by other algorithms. So I think that it should be removed. Instead, you can use |
||
|
||
} // namespace config |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
#pragma once | ||
|
||
namespace config { | ||
using AscendingODFlagType = bool; | ||
} // namespace config |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
#include "config/indices/od_context.h" | ||
|
||
#include "config/names_and_descriptions.h" | ||
#include "indices/type.h" | ||
|
||
namespace config { | ||
extern CommonOption<IndicesType> const kODContextOpt{names::kODContext, descriptions::kDODContext, | ||
IndicesType({})}; | ||
} // namespace config |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think OD verifier algorithms should be mentioned after OD mining algorithms.