Skip to content

Commit

Permalink
Refactor code in tests related to hash tests
Browse files Browse the repository at this point in the history
  • Loading branch information
vs9h committed Nov 28, 2023
1 parent 23cbf9c commit de7f408
Show file tree
Hide file tree
Showing 5 changed files with 80 additions and 123 deletions.
74 changes: 7 additions & 67 deletions src/tests/datasets.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,78 +6,18 @@

static const auto test_data_dir = std::filesystem::current_path() / "input_data";

/// Legacy test-dataset descriptor: an input CSV file, the parameters needed
/// to parse it, and the expected result hash for the mining algorithm.
struct Dataset {
std::string name;   // CSV file name, resolved against test_data_dir
size_t hash;        // expected hash of the algorithm's result (compared via Fletcher16 in the tests)
char separator;     // column separator used when parsing the file
bool has_header;    // true if the first row of the file is a header row
};
namespace tests {

/// dataset configuration info to create an input table
struct DatasetInfo {
std::string name;   // CSV file name, resolved against test_data_dir
char separator;     // column separator used when parsing the file
bool has_header;    // true if the first row of the file is a header row
};

class LightDatasets {
public:
static inline const std::array<Dataset, 11> datasets_ = {
{{"CIPublicHighway10k.csv", 33398, ',', true},
{"neighbors10k.csv", 43368, ',', true},
{"WDC_astronomical.csv", 22281, ',', true},
{"WDC_age.csv", 19620, ',', true},
{"WDC_appearances.csv", 25827, ',', true},
{"WDC_astrology.csv", 40815, ',', true},
{"WDC_game.csv", 6418, ',', true},
{"WDC_science.csv", 19620, ',', true},
{"WDC_symbols.csv", 28289, ',', true},
{"breast_cancer.csv", 15121, ',', true},
{"WDC_kepler.csv", 63730, ',', true}}};
/// a pair consisting of a dataset and the expected hash
using DatasetHashPair = std::pair<DatasetInfo, size_t>;
/// a pair consisting of a vector of datasets and an expected hash
using DatasetsHashPair = std::pair<std::vector<DatasetInfo>, size_t>;

// DEPRECATED -- just use
// for (auto dataset : LightDatasets::datasets) { ... }
static size_t DatasetQuantity() {
return datasets_.size();
}
static std::string DatasetName(size_t i) {
return datasets_[i].name;
}
static char Separator(size_t i) {
return datasets_[i].separator;
}
static bool HasHeader(size_t i) {
return datasets_[i].has_header;
}
static unsigned int Hash(size_t i) {
return datasets_[i].hash;
}
};

class HeavyDatasets {
public:
static inline const std::array<Dataset, 6> datasets_ = {
{{"adult.csv", 23075, ';', false},
{"CIPublicHighway.csv", 13035, ',', true},
{"EpicMeds.csv", 50218, '|', true},
{"EpicVitals.csv", 2083, '|', true},
{"iowa1kk.csv", 28573, ',', true},
{"LegacyPayors.csv", 43612, '|', true}}};

// DEPRECATED -- just use
// for (auto dataset : HeavyDatasets::datasets) { ... }
static size_t DatasetQuantity() {
return datasets_.size();
}
static std::string DatasetName(size_t i) {
return datasets_[i].name;
}
static char Separator(size_t i) {
return datasets_[i].separator;
}
static bool HasHeader(size_t i) {
return datasets_[i].has_header;
}
static unsigned int Hash(size_t i) {
return datasets_[i].hash;
}
};
} // namespace tests
32 changes: 12 additions & 20 deletions src/tests/test_algorithm.cpp → src/tests/test_fd_algorithm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
#include "algorithms/fd/tane/tane.h"
#include "datasets.h"
#include "model/table/relational_schema.h"
#include "testing_utils.h"
#include "test_fd_util.h"

using std::string, std::vector;
using ::testing::ContainerEq, ::testing::Eq;
Expand Down Expand Up @@ -103,14 +103,14 @@ TYPED_TEST_P(AlgorithmTest, WorksOnWideDataset) {
ASSERT_TRUE(CheckFdListEquality(true_fd_collection, algorithm->FdList()));
}

TYPED_TEST_P(AlgorithmTest, LightDatasetsConsistentHash) {
template <typename T>
void PerformConsistentHashTestOn(std::vector<tests::DatasetHashPair> const& datasets) {
try {
for (auto const& dataset : LightDatasets::datasets_) {
auto algorithm = TestFixture::CreateAlgorithmInstance(dataset.name, dataset.separator,
dataset.has_header);
for (auto const& [dataset, hash] : datasets) {
auto algorithm =
T::CreateAlgorithmInstance(dataset.name, dataset.separator, dataset.has_header);
algorithm->Execute();
std::cout << dataset.name << std::endl;
EXPECT_EQ(algorithm->Fletcher16(), dataset.hash)
EXPECT_EQ(algorithm->Fletcher16(), hash)
<< "FD collection hash changed for " << dataset.name;
}
} catch (std::runtime_error& e) {
Expand All @@ -120,20 +120,12 @@ TYPED_TEST_P(AlgorithmTest, LightDatasetsConsistentHash) {
SUCCEED();
}

// Regression check: mining each light dataset must reproduce the recorded
// reference hash, i.e. the FD collection has not changed since the hashes
// were captured.
TYPED_TEST_P(AlgorithmTest, LightDatasetsConsistentHash) {
PerformConsistentHashTestOn<TestFixture>(TestFixture::light_datasets_);
}

TYPED_TEST_P(AlgorithmTest, HeavyDatasetsConsistentHash) {
try {
for (auto const& dataset : HeavyDatasets::datasets_) {
auto algorithm = TestFixture::CreateAlgorithmInstance(dataset.name, dataset.separator,
dataset.has_header);
algorithm->Execute();
EXPECT_EQ(algorithm->Fletcher16(), dataset.hash)
<< "The new algorithm and Pyro yield different results at " << dataset.name;
}
} catch (std::runtime_error& e) {
std::cout << "Exception raised in test: " << e.what() << std::endl;
FAIL();
}
SUCCEED();
PerformConsistentHashTestOn<TestFixture>(TestFixture::heavy_datasets_);
}

TYPED_TEST_P(AlgorithmTest, ConsistentRepeatedExecution) {
Expand Down
9 changes: 5 additions & 4 deletions src/tests/test_fd_mine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
#include "config/names.h"
#include "datasets.h"
#include "model/table/relational_schema.h"
#include "test_fd_util.h"

using ::testing::ContainerEq, ::testing::Eq;

Expand Down Expand Up @@ -42,7 +43,7 @@ std::unique_ptr<FDAlgorithm> CreateFD_MineAlgorithmInstance(std::string const& p
return algos::CreateAndLoadAlgorithm<Fd_mine>(FD_MineGetParamMap(path, separator, has_header));
}

class AlgorithmTest : public LightDatasets, public HeavyDatasets, public ::testing::Test {};
using FDMineAlgorithmTest = AlgorithmTest<Fd_mine>;

std::vector<unsigned int> FD_MineBitsetToIndexVector(boost::dynamic_bitset<> const& bitset) {
std::vector<unsigned int> res;
Expand Down Expand Up @@ -148,13 +149,13 @@ void MinimizeFDs(std::list<FD>& fd_collection) {
}
}

TEST_F(AlgorithmTest, FD_Mine_ReturnsSameAsPyro) {
TEST_F(FDMineAlgorithmTest, FD_Mine_ReturnsSameAsPyro) {
namespace onam = config::names;

try {
for (Dataset const& dataset : LightDatasets::datasets_) {
for (auto const& [dataset, hash] : FDMineAlgorithmTest::light_datasets_) {
// TODO: change this hotfix
if (dataset.name == "breast_cancer.csv") {
if (dataset.name == tests::kbreast_cancer.name) {
continue;
}
auto path = test_data_dir / dataset.name;
Expand Down
23 changes: 23 additions & 0 deletions src/tests/testing_utils.h → src/tests/test_fd_util.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

#include "algorithms/algo_factory.h"
#include "algorithms/fd/fd_algorithm.h"
#include "all_datasets_info.h"
#include "config/error/type.h"
#include "config/names.h"
#include "datasets.h"
Expand Down Expand Up @@ -38,10 +39,32 @@ class AlgorithmTest : public ::testing::Test {
};
}

public:
/// Creates and loads an instance of algorithm T for the given dataset file
/// (resolved against test_data_dir). The defaults (',' separator, header
/// present) match the most common layout of the bundled test CSVs.
static std::unique_ptr<algos::FDAlgorithm> CreateAlgorithmInstance(const std::string& filename,
char separator = ',',
bool has_header = true) {
return algos::CreateAndLoadAlgorithm<T>(
GetParamMap(test_data_dir / filename, separator, has_header));
}

// Light (fast-to-mine) datasets paired with the expected result hash used by
// the *ConsistentHash tests. NOTE(review): the hashes are presumably captured
// from a trusted reference run — confirm the provenance before updating any.
static inline const std::vector<tests::DatasetHashPair> light_datasets_ = {
{{tests::kCIPublicHighway10k, 33398},
{tests::kneighbors10k, 43368},
{tests::kWDC_astronomical, 22281},
{tests::kWDC_age, 19620},
{tests::kWDC_appearances, 25827},
{tests::kWDC_astrology, 40815},
{tests::kWDC_game, 6418},
{tests::kWDC_science, 19620},
{tests::kWDC_symbols, 28289},
{tests::kbreast_cancer, 15121},
{tests::kWDC_kepler, 63730}}};

// Heavier (slower-to-mine) datasets paired with the expected result hash;
// exercised by the HeavyDatasetsConsistentHash test.
static inline const std::vector<tests::DatasetHashPair> heavy_datasets_ = {
{{tests::kadult, 23075},
{tests::kCIPublicHighway, 13035},
{tests::kEpicMeds, 50218},
{tests::kEpicVitals, 2083},
{tests::kiowa1kk, 28573},
{tests::kLegacyPayors, 43612}}};
};
65 changes: 33 additions & 32 deletions src/tests/test_ucc_algorithms.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#include "algorithms/ucc/hyucc/hyucc.h"
#include "algorithms/ucc/ucc.h"
#include "algorithms/ucc/ucc_algorithm.h"
#include "all_datasets_info.h"
#include "config/thread_number/type.h"
#include "datasets.h"

Expand Down Expand Up @@ -53,42 +54,42 @@ class UCCAlgorithmTest : public ::testing::Test {
GetParamMap(test_data_dir / filename, separator, has_header));
}

static inline const std::vector<Dataset> light_datasets_ = {
{"WDC_astronomical.csv", 2089541732445U, ',', true},
{"WDC_symbols.csv", 1, ',', true},
{"WDC_science.csv", 2658842082150U, ',', true},
{"WDC_satellites.csv", 5208443370856032U, ',', true},
{"WDC_appearances.csv", 82369238361U, ',', true},
{"WDC_astrology.csv", 79554241843163108U, ',', true},
{"WDC_game.csv", 2555214540772530U, ',', true},
{"WDC_kepler.csv", 82426217315737U, ',', true},
{"WDC_planetz.csv", 2555214540772530U, ',', true},
{"WDC_age.csv", 2658842082150U, ',', true},
{"TestWide.csv", 2555250373874U, ',', true},
{"abalone.csv", 16581571148699134255U, ',', true},
{"iris.csv", 1, ',', false},
{"adult.csv", 1, ';', false},
{"breast_cancer.csv", 16854900230774656828U, ',', true},
static inline const std::vector<DatasetHashPair> light_datasets_ = {
{kWDC_astronomical, 2089541732445U},
{kWDC_symbols, 1},
{kWDC_science, 2658842082150U},
{kWDC_satellites, 5208443370856032U},
{kWDC_appearances, 82369238361U},
{kWDC_astrology, 79554241843163108U},
{kWDC_game, 2555214540772530U},
{kWDC_kepler, 82426217315737U},
{kWDC_planetz, 2555214540772530U},
{kWDC_age, 2658842082150U},
{kTestWide, 2555250373874U},
{kabalone, 16581571148699134255U},
{kiris, 1},
{kadult, 1},
{kbreast_cancer, 16854900230774656828U},
// Possibly heavy datasets, if another less efficient algorithm than HyUCC is not
// able to process these move them to heavy_datasets_
{"neighbors10k.csv", 170971924188219U, ',', true},
{kneighbors10k, 170971924188219U},
#if 0
{"neighbors50k.csv", 1, ',', true},
{kneighbors50k, 1},
#endif
{"neighbors100k.csv", 170971924188219U, ',', true},
{"CIPublicHighway10k.csv", 82369238361U, ',', true},
{"CIPublicHighway700.csv", 82369238361U, ',', true},
{kneighbors100k, 170971924188219U},
{kCIPublicHighway10k, 82369238361U},
{kCIPublicHighway700, 82369238361U},
};

static inline const std::vector<Dataset> heavy_datasets_ = {
{"EpicVitals.csv", 1, '|', true},
{"EpicMeds.csv", 59037771758954037U, '|', true},
{"iowa1kk.csv", 2654435863U, ',', true},
static inline const std::vector<DatasetHashPair> heavy_datasets_ = {
{kEpicVitals, 1},
{kEpicMeds, 59037771758954037U},
{kiowa1kk, 2654435863U},
#if 0
{"fd-reduced-30.csv", 275990379954778425U, ',', true},
{"flight_1k.csv", 2512091017708538662U, ';', true},
{"plista_1k.csv", 1, ';', false},
{"letter.csv", 1, ',', false},
{kfd_reduced_30, 275990379954778425U},
{kflight_1k, 2512091017708538662U},
{kplista_1k, 1},
{kletter, 1},
#endif
};
};
Expand Down Expand Up @@ -120,8 +121,8 @@ std::size_t Hash(std::vector<std::vector<unsigned>> const& vec) {
}

template <typename T>
void PerformConsistentHashTestOn(std::vector<Dataset> const& datasets) {
for (Dataset const& dataset : datasets) {
void PerformConsistentHashTestOn(std::vector<DatasetHashPair> const& datasets) {
for (auto const& [dataset, hash] : datasets) {
try {
auto ucc_algo =
T::CreateAlgorithmInstance(dataset.name, dataset.separator, dataset.has_header);
Expand All @@ -133,7 +134,7 @@ void PerformConsistentHashTestOn(std::vector<Dataset> const& datasets) {
std::transform(actual_list.begin(), actual_list.end(), std::back_inserter(actual),
[](Vertical const& v) { return v.GetColumnIndicesAsVector(); });
std::sort(actual.begin(), actual.end());
EXPECT_EQ(Hash(actual), dataset.hash) << "Wrong hash on dataset " << dataset.name;
EXPECT_EQ(Hash(actual), hash) << "Wrong hash on dataset " << dataset.name;
} catch (std::exception const& e) {
std::cout << "An exception with message: " << e.what() << "\n\tis thrown on dataset "
<< dataset.name << '\n';
Expand Down

0 comments on commit de7f408

Please sign in to comment.