Desbordante · vano105 · May 27, 2024 · May 27, 2024 · May 27, 2024 · May 27, 2024
diff --git a/src/core/algorithms/algorithm_types.h b/src/core/algorithms/algorithm_types.h
@@ -11,7 +11,7 @@ using AlgorithmTypes =
                    Apriori, metric::MetricVerifier, DataStats, fd_verifier::FDVerifier, HyUCC,
                    PyroUCC, cfd::FDFirstAlgorithm, ACAlgorithm, UCCVerifier, Faida, Spider, Mind,
                    Fastod, GfdValidation, EGfdValidation, NaiveGfdValidation, order::Order,
-                   dd::Split>;
+                   dd::Split, od_verifier::ODVerifier>;
 
 // clang-format off
 /* Enumeration of all supported non-pipeline algorithms. If you implement a new
@@ -76,7 +76,9 @@ BETTER_ENUM(AlgorithmType, char,
     order,
 
 /* Differential dependencies mining algorithm */
-    split
+    split,
+/* Canonical OD verifier algorithm */
+    od_verifier
 )
 // clang-format on
 

diff --git a/src/core/algorithms/od/fastod/partitions/complex_stripped_partition.h b/src/core/algorithms/od/fastod/partitions/complex_stripped_partition.h
@@ -9,7 +9,7 @@
 namespace algos::fastod {
 
 class ComplexStrippedPartition {
-private:
+protected:
     std::shared_ptr<std::vector<size_t>> sp_indexes_;
     std::shared_ptr<std::vector<size_t>> sp_begins_;
     std::shared_ptr<std::vector<DataFrame::Range>> rb_indexes_;

diff --git a/src/core/algorithms/od/mining_algorithms.h b/src/core/algorithms/od/mining_algorithms.h
@@ -1,4 +1,5 @@
 #pragma once
 
 #include "algorithms/od/fastod/fastod.h"
+#include "algorithms/od/od_verifier/od_verifier.h"
 #include "algorithms/od/order/order.h"
diff --git a/src/core/algorithms/od/od_verifier/od_verifier.cpp b/src/core/algorithms/od/od_verifier/od_verifier.cpp
@@ -0,0 +1,124 @@
+#include "od_verifier.h"
+
+#include <algorithm>
+#include <chrono>
+#include <cstddef>
+#include <memory>
+#include <stdexcept>
+#include <string>
+
+#include "config/ascending_od/option.h"
+#include "config/equal_nulls/option.h"
+#include "config/indices/od_context.h"
+#include "config/indices/option.h"
+#include "config/tabular_data/input_table/option.h"
+#include "partition.h"
+
+namespace algos::od_verifier {
+
+// Registers all options and makes the data-loading ones available.
+ODVerifier::ODVerifier() : Algorithm({}) {
+    RegisterOptions();
+    MakeOptionsAvailable({config::kTableOpt.GetName(), config::kEqualNullsOpt.GetName()});
+}
+
+// Binds every configuration option to the member that stores its value.
+void ODVerifier::RegisterOptions() {
+    // Dereferences relation_, so it must not be called before LoadDataInternal() has set it.
+    auto get_schema_cols = [this]() { return relation_->GetSchema()->GetNumColumns(); };
+
+    RegisterOption(config::kTableOpt(&input_table_));
+    RegisterOption(config::kEqualNullsOpt(&is_null_equal_null_));
+    RegisterOption(config::kLhsIndicesOpt(&lhs_indices_, get_schema_cols));
+    RegisterOption(config::kRhsIndicesOpt(&rhs_indices_, get_schema_cols));
+    RegisterOption(config::kODContextOpt(&context_indices_));
+    RegisterOption(config::kAscendingODOpt(&ascending_));
+}
+
+// Makes the options that describe the OD to verify available.
+void ODVerifier::MakeExecuteOptsAvailable() {
+    MakeOptionsAvailable({config::kLhsIndicesOpt.GetName(), config::kRhsIndicesOpt.GetName(),
+                          config::kODContextOpt.GetName(), config::kAscendingODOpt.GetName()});
+}
+
+// Builds both table representations used by the verifier. Throws std::runtime_error when the
+// table has no columns.
+void ODVerifier::LoadDataInternal() {
+    relation_ = ColumnLayoutRelationData::CreateFrom(*input_table_, is_null_equal_null_);
+    if (relation_->GetColumnData().empty()) {
+        throw std::runtime_error("Got an empty dataset: OD verifying is meaningless.");
+    }
+
+    // The table is read a second time below, so it is rewound first.
+    input_table_->Reset();
+    data_ = std::make_shared<DataFrame>(DataFrame::FromInputTable(input_table_));
+    if (data_->GetColumnCount() == 0) {
+        throw std::runtime_error("Got an empty dataset: OD verifying is meaningless.");
+    }
+    // NOTE(review): partition_cache_ is not cleared here; confirm that partitions cached for
+    // a previously loaded table cannot be returned for the new one.
+}
+
+// Runs the check for the configured direction and returns the elapsed time in milliseconds.
+unsigned long long ODVerifier::ExecuteInternal() {
+    // steady_clock is monotonic: wall-clock adjustments cannot distort the measurement.
+    auto const start_time = std::chrono::steady_clock::now();
+    if (ascending_) {
+        VerifyOD<true>();
+    } else {
+        VerifyOD<false>();
+    }
+    auto const elapsed_milliseconds = std::chrono::duration_cast<std::chrono::milliseconds>(
+            std::chrono::steady_clock::now() - start_time);
+    return elapsed_milliseconds.count();
+}
+
+// Collects the rows that break the OD. Swaps are searched in the partition of the context,
+// splits in the partition of the context extended with the LHS column. Only the first LHS
+// and the first RHS index are used. Throws std::invalid_argument when one of them is missing
+// or when a context index is not a column of the table.
+template <bool Ascending>
+void ODVerifier::VerifyOD() {
+    if (lhs_indices_.empty() || rhs_indices_.empty()) {
+        throw std::invalid_argument("OD verifying needs one LHS and one RHS column index.");
+    }
+    auto const lhs = lhs_indices_[0];
+    auto const rhs = rhs_indices_[0];
+
+    // The context option is registered without a column count, so its indices are checked
+    // here before they are used.
+    std::size_t const column_count = data_->GetColumnCount();
+    AttributeSet context;
+    for (auto const column : context_indices_) {
+        if (column >= column_count) {
+            throw std::invalid_argument("OD context column index " + std::to_string(column) +
+                                        " is out of range.");
+        }
+        context.Set(column);
+    }
+
+    fastod::ComplexStrippedPartition stripped_partition_swap(
+            partition_cache_.GetStrippedPartition(context, data_));
+    if (stripped_partition_swap.Swap<Ascending>(lhs, rhs)) {
+        Partition part{stripped_partition_swap};
+        for (auto const& violation : part.FindViolationsBySwap<Ascending>(lhs, rhs)) {
+            // NOTE(review): the +1 presumably turns the position into a 1-based row number.
+            row_violate_ods_by_swap_.push_back(violation.second + 1);
+        }
+    }
+
+    context.Set(lhs);
+    fastod::ComplexStrippedPartition stripped_partition_split(
+            partition_cache_.GetStrippedPartition(context, data_));
+    if (stripped_partition_split.Split(rhs)) {
+        Partition part{stripped_partition_split};
+        for (auto const& violation : part.FindViolationsBySplit(rhs)) {
+            row_violate_ods_by_split_.push_back(violation.second + 1);
+        }
+    }
+
+    std::sort(row_violate_ods_by_split_.begin(), row_violate_ods_by_split_.end());
+    std::sort(row_violate_ods_by_swap_.begin(), row_violate_ods_by_swap_.end());
+}
+
+}  // namespace algos::od_verifier
diff --git a/src/core/algorithms/od/od_verifier/od_verifier.h b/src/core/algorithms/od/od_verifier/od_verifier.h
@@ -0,0 +1,79 @@
+#pragma once
+
+#include <cstddef>
+#include <memory>
+#include <vector>
+
+#include "algorithms/algorithm.h"
+#include "algorithms/od/fastod/model/canonical_od.h"
+#include "config/indices/type.h"
+#include "model/table/column_layout_relation_data.h"
+
+namespace algos::od_verifier {
+
+// Verifies one order dependency on a table and records the rows that violate it, kept
+// separately for violations found by swap and by split.
+class ODVerifier : public Algorithm {
+private:
+    using IndicesType = config::IndicesType;
+    using DataFrame = fastod::DataFrame;
+    using PartitionCache = fastod::PartitionCache;
+    using AttributeSet = fastod::AttributeSet;
+
+    // input data
+    config::InputTable input_table_;
+    config::EqNullsType is_null_equal_null_;
+    IndicesType lhs_indices_;
+    IndicesType rhs_indices_;
+    IndicesType context_indices_;
+    bool ascending_ = true;
+
+    // auxiliary data
+    std::shared_ptr<ColumnLayoutRelationData> relation_;
+    std::shared_ptr<DataFrame> data_;
+    PartitionCache partition_cache_;
+
+    // rows that violate the OD
+    std::vector<int> row_violate_ods_by_swap_;
+    std::vector<int> row_violate_ods_by_split_;
+
+    // registers the options and loads the input data
+    void RegisterOptions();
+    void MakeExecuteOptsAvailable() override;
+    void LoadDataInternal() override;
+
+    // runs the algorithm and measures its time
+    unsigned long long ExecuteInternal() override;
+
+    // checks whether the OD is violated and finds the rows where it is violated
+    template <bool Ascending>
+    void VerifyOD();
+
+    // drops the violations that were found earlier
+    void ResetState() override {
+        row_violate_ods_by_swap_.clear();
+        row_violate_ods_by_split_.clear();
+    }
+
+public:
+    // base constructor
+    ODVerifier();
+
+    // Returns true when no violating row is recorded, which is also the case before the
+    // verifier has been executed.
+    bool ODHolds() const {
+        return row_violate_ods_by_swap_.empty() && row_violate_ods_by_split_.empty();
+    }
+
+    // Returns the number of rows that violate the OD by split
+    size_t GetNumRowsViolateBySplit() const {
+        return row_violate_ods_by_split_.size();
+    }
+
+    // Returns the number of rows that violate the OD by swap
+    size_t GetNumRowsViolateBySwap() const {
+        return row_violate_ods_by_swap_.size();
+    }
+};
+
+}  // namespace algos::od_verifier
diff --git a/src/core/algorithms/od/od_verifier/partition.cpp b/src/core/algorithms/od/od_verifier/partition.cpp
@@ -0,0 +1,65 @@
+#include "partition.h"
+
+#include <strings.h>  // NOTE(review): POSIX-only header that nothing here appears to use
+#include <cstddef>
+#include <utility>
+#include <vector>
+
+namespace algos::od_verifier {
+
+// Stripped-partition case: within every equivalence class, each row whose `right` value
+// differs from that of the class's first row is reported as a (column, row position) pair.
+std::vector<std::pair<int, int>> Partition::CommonViolationBySplit(model::ColumnIndex right) const {
+    std::vector<std::pair<int, int>> violates;
+
+    // `group + 1 < size()` rather than `group < size() - 1`: the latter wraps around when
+    // the list of class borders is empty.
+    for (std::size_t group = 0; group + 1 < sp_begins_->size(); ++group) {
+        std::size_t const group_begin = (*sp_begins_)[group];
+        std::size_t const group_end = (*sp_begins_)[group + 1];
+        int const group_value = data_->GetValue((*sp_indexes_)[group_begin], right);
+
+        for (std::size_t i = group_begin + 1; i < group_end; ++i) {
+            std::size_t const row = (*sp_indexes_)[i];
+            if (data_->GetValue(row, right) != group_value) {
+                violates.emplace_back(static_cast<int>(right), static_cast<int>(row));
+            }
+        }
+    }
+
+    return violates;
+}
+
+// Range-based case: the rows of a class are given as inclusive [first, second] ranges; every
+// row whose `right` value differs from that of the first row of the class is reported.
+std::vector<std::pair<int, int>> Partition::RangeBasedViolationBySplit(
+        model::ColumnIndex right) const {
+    std::vector<std::pair<int, int>> violates;
+
+    // Same wrap-around-safe bound as in CommonViolationBySplit.
+    for (std::size_t group = 0; group + 1 < rb_begins_->size(); ++group) {
+        std::size_t const group_begin = (*rb_begins_)[group];
+        std::size_t const group_end = (*rb_begins_)[group + 1];
+        int const group_value = data_->GetValue((*rb_indexes_)[group_begin].first, right);
+
+        for (std::size_t i = group_begin; i < group_end; ++i) {
+            algos::fastod::DataFrame::Range const range = (*rb_indexes_)[i];
+
+            for (std::size_t row = range.first; row <= range.second; ++row) {
+                if (data_->GetValue(row, right) != group_value) {
+                    violates.emplace_back(static_cast<int>(right), static_cast<int>(row));
+                }
+            }
+        }
+    }
+
+    return violates;
+}
+
+// Picks the scan that matches the representation of this partition.
+std::vector<std::pair<int, int>> Partition::FindViolationsBySplit(model::ColumnIndex right) const {
+    return is_stripped_partition_ ? CommonViolationBySplit(right)
+                                  : RangeBasedViolationBySplit(right);
+}
+
+}  // namespace algos::od_verifier
diff --git a/src/core/algorithms/od/od_verifier/partition.h b/src/core/algorithms/od/od_verifier/partition.h
@@ -0,0 +1,119 @@
+#pragma once
+
+#include <algorithm>
+#include <cstddef>
+#include <utility>
+#include <vector>
+
+#include "algorithms/od/fastod/partitions/complex_stripped_partition.h"
+
+namespace algos::od_verifier {
+
+// Extends the FastOD partition with a search for the rows that violate an OD, either by
+// split or by swap. Violations are returned as (column index, row position) pairs.
+class Partition : protected algos::fastod::ComplexStrippedPartition {
+private:
+    // Split violations of a stripped partition.
+    std::vector<std::pair<int, int>> CommonViolationBySplit(model::ColumnIndex right) const;
+
+    // Split violations of a range-based partition.
+    std::vector<std::pair<int, int>> RangeBasedViolationBySplit(model::ColumnIndex right) const;
+
+public:
+    Partition() : algos::fastod::ComplexStrippedPartition() {}
+
+    // Copies the state of an existing FastOD partition.
+    Partition(algos::fastod::ComplexStrippedPartition const& daddy)
+        : algos::fastod::ComplexStrippedPartition(daddy) {}
+
+    // Returns the rows whose `right` value differs from that of the first row of their class.
+    std::vector<std::pair<int, int>> FindViolationsBySplit(model::ColumnIndex right) const;
+
+    // Returns the rows that take part in a swap between `left` and `right`. Inside every
+    // class the rows are ordered by `left` (ascending or descending, as selected by
+    // `Ascending`); a row is reported when its `right` value is smaller than the largest
+    // `right` value of the preceding run of equal `left` values.
+    // NOTE(review): only the directly preceding run is consulted, not all earlier runs;
+    // confirm that this is the intended definition of a violating row.
+    template <bool Ascending>
+    std::vector<std::pair<int, int>> FindViolationsBySwap(model::ColumnIndex left,
+                                                          model::ColumnIndex right) const {
+        // A row of the current class. Its position is stored next to its values so that the
+        // two stay together when the rows are sorted.
+        struct Entry {
+            int lhs;
+            int rhs;
+            std::size_t row;
+        };
+
+        std::size_t const group_count =
+                is_stripped_partition_ ? sp_begins_->size() : rb_begins_->size();
+        std::vector<std::pair<int, int>> violates;
+        std::vector<Entry> entries;
+
+        auto const add_row = [this, left, right, &entries](std::size_t row) {
+            Entry entry;
+            entry.lhs = data_->GetValue(row, left);
+            entry.rhs = data_->GetValue(row, right);
+            entry.row = row;
+            entries.push_back(entry);
+        };
+
+        // `group + 1 < group_count` cannot wrap around when there are no class borders.
+        for (std::size_t group = 0; group + 1 < group_count; ++group) {
+            std::size_t const group_begin =
+                    is_stripped_partition_ ? (*sp_begins_)[group] : (*rb_begins_)[group];
+            std::size_t const group_end =
+                    is_stripped_partition_ ? (*sp_begins_)[group + 1] : (*rb_begins_)[group + 1];
+
+            entries.clear();
+            if (is_stripped_partition_) {
+                entries.reserve(group_end - group_begin);
+                for (std::size_t i = group_begin; i < group_end; ++i) {
+                    add_row((*sp_indexes_)[i]);
+                }
+            } else {
+                for (std::size_t i = group_begin; i < group_end; ++i) {
+                    algos::fastod::DataFrame::Range const range = (*rb_indexes_)[i];
+                    for (std::size_t row = range.first; row <= range.second; ++row) {
+                        add_row(row);
+                    }
+                }
+            }
+
+            if constexpr (Ascending) {
+                std::sort(entries.begin(), entries.end(),
+                          [](Entry const& a, Entry const& b) { return a.lhs < b.lhs; });
+            } else {
+                std::sort(entries.begin(), entries.end(),
+                          [](Entry const& a, Entry const& b) { return b.lhs < a.lhs; });
+            }
+
+            // Indexes into `entries` of the largest `right` value seen in the previous and in
+            // the current run of equal `left` values.
+            std::size_t prev_run_max = 0;
+            std::size_t current_run_max = 0;
+            bool is_first_run = true;
+
+            for (std::size_t i = 0; i < entries.size(); ++i) {
+                Entry const& entry = entries[i];
+
+                if (i != 0 && entries[i - 1].lhs != entry.lhs) {
+                    is_first_run = false;
+                    prev_run_max = current_run_max;
+                    current_run_max = i;
+                } else if (entries[current_run_max].rhs <= entry.rhs) {
+                    current_run_max = i;
+                }
+
+                if (!is_first_run && entries[prev_run_max].rhs > entry.rhs) {
+                    violates.emplace_back(static_cast<int>(right), static_cast<int>(entry.row));
+                }
+            }
+        }
+
+        return violates;
+    }
+};
+
+}  // namespace algos::od_verifier
diff --git a/src/core/config/ascending_od/option.cpp b/src/core/config/ascending_od/option.cpp
@@ -0,0 +1,11 @@
+#include "config/ascending_od/option.h"
+
+#include "config/ascending_od/type.h"
+#include "config/names_and_descriptions.h"
+
+namespace config {
+// Flag option for the direction (ascending or descending) of the OD to verify.
+// NOTE(review): the trailing `true` is presumably the default value - confirm.
+extern CommonOption<AscendingODFlagType> const kAscendingODOpt{names::kAscendingOD,
+                                                               descriptions::kDAscendingOD, true};
+}  // namespace config
diff --git a/src/core/config/ascending_od/option.h b/src/core/config/ascending_od/option.h
@@ -0,0 +1,9 @@
+#pragma once
+
+#include "config/ascending_od/type.h"
+#include "config/common_option.h"
+
+namespace config {
+extern CommonOption<AscendingODFlagType> const kAscendingODOpt;  // defined in option.cpp
+
+}  // namespace config
diff --git a/src/core/config/ascending_od/type.h b/src/core/config/ascending_od/type.h
@@ -0,0 +1,5 @@
+#pragma once
+
+namespace config {
+using AscendingODFlagType = bool;  // value type of the ascending-OD flag option
+}  // namespace config
diff --git a/src/core/config/descriptions.h b/src/core/config/descriptions.h
@@ -44,6 +44,8 @@ constexpr auto kDItemColumnIndex = "index of the column where an item name is st
 constexpr auto kDFirstColumnTId = "indicates that the first column contains the transaction IDs";
 auto const kDMetric = details::kDMetricString.c_str();
 constexpr auto kDLhsIndices = "LHS column indices";
+constexpr auto kDODContext = "context columns indices";
+constexpr auto kDAscendingOD = "flag shows whether the dependence is ascending or descending";
 constexpr auto kDRhsIndices = "RHS column indices";
 constexpr auto kDRhsIndex = "RHS column index";
 constexpr auto kDUCCIndices = "column indices for UCC verification";

diff --git a/src/core/config/indices/od_context.cpp b/src/core/config/indices/od_context.cpp
@@ -0,0 +1,11 @@
+#include "config/indices/od_context.h"
+
+#include "config/indices/type.h"
+#include "config/names_and_descriptions.h"
+
+namespace config {
+// Option holding the context column indices of the OD to verify.
+// NOTE(review): the trailing empty IndicesType is presumably the default value - confirm.
+extern CommonOption<IndicesType> const kODContextOpt{names::kODContext, descriptions::kDODContext,
+                                                     IndicesType({})};
+}  // namespace config