Skip to content

Commit

Permalink
Refactor code in tests related to hash tests
Browse files Browse the repository at this point in the history
  • Loading branch information
vs9h committed Nov 28, 2023
1 parent 23cbf9c commit de7f408
Show file tree
Hide file tree
Showing 5 changed files with 80 additions and 123 deletions.
74 changes: 7 additions & 67 deletions src/tests/datasets.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,78 +6,18 @@

static const auto test_data_dir = std::filesystem::current_path() / "input_data";

/// Legacy test-dataset descriptor: an input CSV file, the parameters needed
/// to parse it, and the expected result hash for the mining algorithm.
struct Dataset {
std::string name;   // CSV file name, resolved against test_data_dir
size_t hash;        // expected hash of the algorithm's result (compared via Fletcher16 in the tests)
char separator;     // column separator used when parsing the file
bool has_header;    // true if the first row of the file is a header row
};
namespace tests {

/// dataset configuration info to create an input table
struct DatasetInfo {
std::string name;   // CSV file name, resolved against test_data_dir
char separator;     // column separator used when parsing the file
bool has_header;    // true if the first row of the file is a header row
};

class LightDatasets {
public:
static inline const std::array<Dataset, 11> datasets_ = {
{{"CIPublicHighway10k.csv", 33398, ',', true},
{"neighbors10k.csv", 43368, ',', true},
{"WDC_astronomical.csv", 22281, ',', true},
{"WDC_age.csv", 19620, ',', true},
{"WDC_appearances.csv", 25827, ',', true},
{"WDC_astrology.csv", 40815, ',', true},
{"WDC_game.csv", 6418, ',', true},
{"WDC_science.csv", 19620, ',', true},
{"WDC_symbols.csv", 28289, ',', true},
{"breast_cancer.csv", 15121, ',', true},
{"WDC_kepler.csv", 63730, ',', true}}};
/// a pair consisting of a dataset and the expected hash
using DatasetHashPair = std::pair<DatasetInfo, size_t>;
/// a pair consisting of a vector of datasets and an expected hash
using DatasetsHashPair = std::pair<std::vector<DatasetInfo>, size_t>;

// DEPRECATED -- just use
// for (auto dataset : LightDatasets::datasets) { ... }
static size_t DatasetQuantity() {
return datasets_.size();
}
static std::string DatasetName(size_t i) {
return datasets_[i].name;
}
static char Separator(size_t i) {
return datasets_[i].separator;
}
static bool HasHeader(size_t i) {
return datasets_[i].has_header;
}
static unsigned int Hash(size_t i) {
return datasets_[i].hash;
}
};

class HeavyDatasets {
public:
static inline const std::array<Dataset, 6> datasets_ = {
{{"adult.csv", 23075, ';', false},
{"CIPublicHighway.csv", 13035, ',', true},
{"EpicMeds.csv", 50218, '|', true},
{"EpicVitals.csv", 2083, '|', true},
{"iowa1kk.csv", 28573, ',', true},
{"LegacyPayors.csv", 43612, '|', true}}};

// DEPRECATED -- just use
// for (auto dataset : HeavyDatasets::datasets) { ... }
static size_t DatasetQuantity() {
return datasets_.size();
}
static std::string DatasetName(size_t i) {
return datasets_[i].name;
}
static char Separator(size_t i) {
return datasets_[i].separator;
}
static bool HasHeader(size_t i) {
return datasets_[i].has_header;
}
static unsigned int Hash(size_t i) {
return datasets_[i].hash;
}
};
} // namespace tests
32 changes: 12 additions & 20 deletions src/tests/test_algorithm.cpp → src/tests/test_fd_algorithm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
#include "algorithms/fd/tane/tane.h"
#include "datasets.h"
#include "model/table/relational_schema.h"
#include "testing_utils.h"
#include "test_fd_util.h"

using std::string, std::vector;
using ::testing::ContainerEq, ::testing::Eq;
Expand Down Expand Up @@ -103,14 +103,14 @@ TYPED_TEST_P(AlgorithmTest, WorksOnWideDataset) {
ASSERT_TRUE(CheckFdListEquality(true_fd_collection, algorithm->FdList()));
}

TYPED_TEST_P(AlgorithmTest, LightDatasetsConsistentHash) {
template <typename T>
void PerformConsistentHashTestOn(std::vector<tests::DatasetHashPair> const& datasets) {
try {
for (auto const& dataset : LightDatasets::datasets_) {
auto algorithm = TestFixture::CreateAlgorithmInstance(dataset.name, dataset.separator,
dataset.has_header);
for (auto const& [dataset, hash] : datasets) {
auto algorithm =
T::CreateAlgorithmInstance(dataset.name, dataset.separator, dataset.has_header);
algorithm->Execute();
std::cout << dataset.name << std::endl;
EXPECT_EQ(algorithm->Fletcher16(), dataset.hash)
EXPECT_EQ(algorithm->Fletcher16(), hash)
<< "FD collection hash changed for " << dataset.name;
}
} catch (std::runtime_error& e) {
Expand All @@ -120,20 +120,12 @@ TYPED_TEST_P(AlgorithmTest, LightDatasetsConsistentHash) {
SUCCEED();
}

// Regression check: mining each light dataset must reproduce the recorded
// reference hash, i.e. the FD collection has not changed since the hashes
// were captured.
TYPED_TEST_P(AlgorithmTest, LightDatasetsConsistentHash) {
PerformConsistentHashTestOn<TestFixture>(TestFixture::light_datasets_);
}

TYPED_TEST_P(AlgorithmTest, HeavyDatasetsConsistentHash) {
try {
for (auto const& dataset : HeavyDatasets::datasets_) {
auto algorithm = TestFixture::CreateAlgorithmInstance(dataset.name, dataset.separator,
dataset.has_header);
algorithm->Execute();
EXPECT_EQ(algorithm->Fletcher16(), dataset.hash)
<< "The new algorithm and Pyro yield different results at " << dataset.name;
}
} catch (std::runtime_error& e) {
std::cout << "Exception raised in test: " << e.what() << std::endl;
FAIL();
}
SUCCEED();
PerformConsistentHashTestOn<TestFixture>(TestFixture::heavy_datasets_);
}

TYPED_TEST_P(AlgorithmTest, ConsistentRepeatedExecution) {
Expand Down
9 changes: 5 additions & 4 deletions src/tests/test_fd_mine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
#include "config/names.h"
#include "datasets.h"
#include "model/table/relational_schema.h"
#include "test_fd_util.h"

using ::testing::ContainerEq, ::testing::Eq;

Expand Down Expand Up @@ -42,7 +43,7 @@ std::unique_ptr<FDAlgorithm> CreateFD_MineAlgorithmInstance(std::string const& p
return algos::CreateAndLoadAlgorithm<Fd_mine>(FD_MineGetParamMap(path, separator, has_header));
}

class AlgorithmTest : public LightDatasets, public HeavyDatasets, public ::testing::Test {};
using FDMineAlgorithmTest = AlgorithmTest<Fd_mine>;

std::vector<unsigned int> FD_MineBitsetToIndexVector(boost::dynamic_bitset<> const& bitset) {
std::vector<unsigned int> res;
Expand Down Expand Up @@ -148,13 +149,13 @@ void MinimizeFDs(std::list<FD>& fd_collection) {
}
}

TEST_F(AlgorithmTest, FD_Mine_ReturnsSameAsPyro) {
TEST_F(FDMineAlgorithmTest, FD_Mine_ReturnsSameAsPyro) {
namespace onam = config::names;

try {
for (Dataset const& dataset : LightDatasets::datasets_) {
for (auto const& [dataset, hash] : FDMineAlgorithmTest::light_datasets_) {
// TODO: change this hotfix
if (dataset.name == "breast_cancer.csv") {
if (dataset.name == tests::kbreast_cancer.name) {
continue;
}
auto path = test_data_dir / dataset.name;
Expand Down
23 changes: 23 additions & 0 deletions src/tests/testing_utils.h → src/tests/test_fd_util.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

#include "algorithms/algo_factory.h"
#include "algorithms/fd/fd_algorithm.h"
#include "all_datasets_info.h"
#include "config/error/type.h"
#include "config/names.h"
#include "datasets.h"
Expand Down Expand Up @@ -38,10 +39,32 @@ class AlgorithmTest : public ::testing::Test {
};
}

public:
/// Creates and loads an instance of algorithm T for the given dataset file
/// (resolved against test_data_dir). The defaults (',' separator, header
/// present) match the most common layout of the bundled test CSVs.
static std::unique_ptr<algos::FDAlgorithm> CreateAlgorithmInstance(const std::string& filename,
char separator = ',',
bool has_header = true) {
return algos::CreateAndLoadAlgorithm<T>(
GetParamMap(test_data_dir / filename, separator, has_header));
}

// Light (fast-to-mine) datasets paired with the expected result hash used by
// the *ConsistentHash tests. NOTE(review): the hashes are presumably captured
// from a trusted reference run — confirm the provenance before updating any.
static inline const std::vector<tests::DatasetHashPair> light_datasets_ = {
{{tests::kCIPublicHighway10k, 33398},
{tests::kneighbors10k, 43368},
{tests::kWDC_astronomical, 22281},
{tests::kWDC_age, 19620},
{tests::kWDC_appearances, 25827},
{tests::kWDC_astrology, 40815},
{tests::kWDC_game, 6418},
{tests::kWDC_science, 19620},
{tests::kWDC_symbols, 28289},
{tests::kbreast_cancer, 15121},
{tests::kWDC_kepler, 63730}}};

// Heavier (slower-to-mine) datasets paired with the expected result hash;
// exercised by the HeavyDatasetsConsistentHash test.
static inline const std::vector<tests::DatasetHashPair> heavy_datasets_ = {
{{tests::kadult, 23075},
{tests::kCIPublicHighway, 13035},
{tests::kEpicMeds, 50218},
{tests::kEpicVitals, 2083},
{tests::kiowa1kk, 28573},
{tests::kLegacyPayors, 43612}}};
};
65 changes: 33 additions & 32 deletions src/tests/test_ucc_algorithms.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#include "algorithms/ucc/hyucc/hyucc.h"
#include "algorithms/ucc/ucc.h"
#include "algorithms/ucc/ucc_algorithm.h"
#include "all_datasets_info.h"
#include "config/thread_number/type.h"
#include "datasets.h"

Expand Down Expand Up @@ -53,42 +54,42 @@ class UCCAlgorithmTest : public ::testing::Test {
GetParamMap(test_data_dir / filename, separator, has_header));
}

static inline const std::vector<Dataset> light_datasets_ = {
{"WDC_astronomical.csv", 2089541732445U, ',', true},
{"WDC_symbols.csv", 1, ',', true},
{"WDC_science.csv", 2658842082150U, ',', true},
{"WDC_satellites.csv", 5208443370856032U, ',', true},
{"WDC_appearances.csv", 82369238361U, ',', true},
{"WDC_astrology.csv", 79554241843163108U, ',', true},
{"WDC_game.csv", 2555214540772530U, ',', true},
{"WDC_kepler.csv", 82426217315737U, ',', true},
{"WDC_planetz.csv", 2555214540772530U, ',', true},
{"WDC_age.csv", 2658842082150U, ',', true},
{"TestWide.csv", 2555250373874U, ',', true},
{"abalone.csv", 16581571148699134255U, ',', true},
{"iris.csv", 1, ',', false},
{"adult.csv", 1, ';', false},
{"breast_cancer.csv", 16854900230774656828U, ',', true},
static inline const std::vector<DatasetHashPair> light_datasets_ = {
{kWDC_astronomical, 2089541732445U},
{kWDC_symbols, 1},
{kWDC_science, 2658842082150U},
{kWDC_satellites, 5208443370856032U},
{kWDC_appearances, 82369238361U},
{kWDC_astrology, 79554241843163108U},
{kWDC_game, 2555214540772530U},
{kWDC_kepler, 82426217315737U},
{kWDC_planetz, 2555214540772530U},
{kWDC_age, 2658842082150U},
{kTestWide, 2555250373874U},
{kabalone, 16581571148699134255U},
{kiris, 1},
{kadult, 1},
{kbreast_cancer, 16854900230774656828U},
// Possibly heavy datasets, if another less efficient algorithm than HyUCC is not
// able to process these move them to heavy_datasets_
{"neighbors10k.csv", 170971924188219U, ',', true},
{kneighbors10k, 170971924188219U},
#if 0
{"neighbors50k.csv", 1, ',', true},
{kneighbors50k, 1},
#endif
{"neighbors100k.csv", 170971924188219U, ',', true},
{"CIPublicHighway10k.csv", 82369238361U, ',', true},
{"CIPublicHighway700.csv", 82369238361U, ',', true},
{kneighbors100k, 170971924188219U},
{kCIPublicHighway10k, 82369238361U},
{kCIPublicHighway700, 82369238361U},
};

static inline const std::vector<Dataset> heavy_datasets_ = {
{"EpicVitals.csv", 1, '|', true},
{"EpicMeds.csv", 59037771758954037U, '|', true},
{"iowa1kk.csv", 2654435863U, ',', true},
static inline const std::vector<DatasetHashPair> heavy_datasets_ = {
{kEpicVitals, 1},
{kEpicMeds, 59037771758954037U},
{kiowa1kk, 2654435863U},
#if 0
{"fd-reduced-30.csv", 275990379954778425U, ',', true},
{"flight_1k.csv", 2512091017708538662U, ';', true},
{"plista_1k.csv", 1, ';', false},
{"letter.csv", 1, ',', false},
{kfd_reduced_30, 275990379954778425U},
{kflight_1k, 2512091017708538662U},
{kplista_1k, 1},
{kletter, 1},
#endif
};
};
Expand Down Expand Up @@ -120,8 +121,8 @@ std::size_t Hash(std::vector<std::vector<unsigned>> const& vec) {
}

template <typename T>
void PerformConsistentHashTestOn(std::vector<Dataset> const& datasets) {
for (Dataset const& dataset : datasets) {
void PerformConsistentHashTestOn(std::vector<DatasetHashPair> const& datasets) {
for (auto const& [dataset, hash] : datasets) {
try {
auto ucc_algo =
T::CreateAlgorithmInstance(dataset.name, dataset.separator, dataset.has_header);
Expand All @@ -133,7 +134,7 @@ void PerformConsistentHashTestOn(std::vector<Dataset> const& datasets) {
std::transform(actual_list.begin(), actual_list.end(), std::back_inserter(actual),
[](Vertical const& v) { return v.GetColumnIndicesAsVector(); });
std::sort(actual.begin(), actual.end());
EXPECT_EQ(Hash(actual), dataset.hash) << "Wrong hash on dataset " << dataset.name;
EXPECT_EQ(Hash(actual), hash) << "Wrong hash on dataset " << dataset.name;
} catch (std::exception const& e) {
std::cout << "An exception with message: " << e.what() << "\n\tis thrown on dataset "
<< dataset.name << '\n';
Expand Down

0 comments on commit de7f408

Please sign in to comment.