Skip to content

Commit

Permalink
make 1: added example for train-test and train-validate-test cross va…
Browse files Browse the repository at this point in the history
…lidation data split
  • Loading branch information
ksiminski committed Jun 25, 2024
1 parent 9cf31f7 commit ffc778c
Show file tree
Hide file tree
Showing 2 changed files with 100 additions and 30 deletions.
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,8 @@ This example presents modifiers of data:
7. imputation of missing data with values from the 3 nearest neighbours, followed by standardisation of the imputed data set
8. a series of data modifiers: imputer with values from knn (k == 3) + standardiser, imputer with constant value, normaliser, standardiser, average imputer, imputer with constant value + normaliser, average imputer + standardiser
9. imputer with constant value + normaliser + average imputer + standardiser

10. split of a data set into _n_ chunks of train and test data sets for _n_-fold cross-validation
11. split of a data set into _n_ chunks of train, validate, and test data sets for _n_-fold cross-validation

### make 2
Run `make 2` for this example.
Expand Down
127 changes: 98 additions & 29 deletions source/experiments/exp-001.cpp
Original file line number Diff line number Diff line change
@@ -1,13 +1,12 @@
/** @file */

#include <iostream>
#include <vector>
#include <string>



#include "../readers/reader-complete.h"
#include "../readers/reader-incomplete.h"
#include "../readers/train_test_model.h"
#include "../readers/train_validation_test_model.h"
#include "../common/data-modifier-marginaliser.h"
#include "../common/data-modifier-standardiser.h"
#include "../common/data-modifier-normaliser.h"
Expand All @@ -21,20 +20,22 @@
#include "../common/data-modifier-imputer-values-from-knn.h"
#include "../common/data-modifier-outlier-remove-sigma.h"


#include "../experiments/exp-001.h"

#include <fstream>

void ksi::exp_001::execute()
{
try
{
std::string dataDir ("../data/exp-001");



// complete data
{
std::cout << std::endl;
std::cout << "=====================" << std::endl;
std::cout << "reading complete data" << std::endl;
std::cout << "=====================" << std::endl;

std::string CompleteDataset ("/complete.data");
ksi::reader_complete DataReader;
auto dane = DataReader.read(dataDir + CompleteDataset);
Expand All @@ -59,12 +60,12 @@ void ksi::exp_001::execute()
std::cout << data << std::endl;
std::cout << std::endl;
}

{
auto data = dane;
auto filtering_function = [] (double value) -> bool
{
return value > 5;
return value > 5;
};
std::size_t attribute {4};
ksi::data_modifier_filter filter (attribute, filtering_function);
Expand All @@ -74,16 +75,21 @@ void ksi::exp_001::execute()
std::cout << std::endl;
}
}

// outliers
{
std::cout << std::endl;
std::cout << "========" << std::endl;
std::cout << "outliers" << std::endl;
std::cout << "========" << std::endl;

std::string OutlierDataset ("/outliers.data");
ksi::reader_complete DataReader;
auto dane = DataReader.read(dataDir + OutlierDataset);
std::cout << "original data" << std::endl;
std::cout << dane << std::endl;
std::cout << std::endl;


{
const double n { 1 };
Expand Down Expand Up @@ -117,18 +123,23 @@ void ksi::exp_001::execute()
}

}


// incomplete data
{
std::cout << std::endl;
std::cout << "===============" << std::endl;
std::cout << "incomplete data" << std::endl;
std::cout << "===============" << std::endl;

std::string IncompleteDataset ("/incomplete.data");

ksi::reader_incomplete DataReader;
auto dane = DataReader.read(dataDir + IncompleteDataset);
std::cout << "original incomplete data" << std::endl;
std::cout << dane << std::endl;
std::cout << std::endl;

{
auto marg = dane;
ksi::data_modifier_marginaliser marginaliser;
Expand All @@ -147,7 +158,7 @@ void ksi::exp_001::execute()
std::cout << marg << std::endl;
std::cout << std::endl;
}

{
auto marg = dane;
ksi::data_modifier_imputer_average imputer;
Expand All @@ -156,7 +167,7 @@ void ksi::exp_001::execute()
std::cout << marg << std::endl;
std::cout << std::endl;
}

{
auto marg = dane;
ksi::data_modifier_imputer_median imputer;
Expand All @@ -165,7 +176,7 @@ void ksi::exp_001::execute()
std::cout << marg << std::endl;
std::cout << std::endl;
}

{
auto marg = dane;
int k = 3;
Expand All @@ -175,7 +186,7 @@ void ksi::exp_001::execute()
std::cout << marg << std::endl;
std::cout << std::endl;
}

{
auto marg = dane;
int k = 3;
Expand All @@ -185,7 +196,7 @@ void ksi::exp_001::execute()
std::cout << marg << std::endl;
std::cout << std::endl;
}

{
auto marg = dane;
int k = 3;
Expand All @@ -195,10 +206,15 @@ void ksi::exp_001::execute()
std::cout << marg << std::endl;
std::cout << std::endl;
}


// chains of modifiers:
{
std::cout << std::endl;
std::cout << "===================" << std::endl;
std::cout << "chains of modifiers" << std::endl;
std::cout << "===================" << std::endl;

auto marg = dane;
int k = 3;
ksi::data_modifier_imputer_values_from_knn imputer (k);
Expand All @@ -209,33 +225,86 @@ void ksi::exp_001::execute()
std::cout << marg << std::endl;
std::cout << std::endl;
std::cout << imputer.print() << std::endl;

}

{
std::cout << std::endl;
std::cout << "============================" << std::endl;
std::cout << "imputation of missing values" << std::endl;
std::cout << "============================" << std::endl;

auto marg = dane;

ksi::data_modifier_imputer dm1;
ksi::data_modifier_normaliser dm2;
ksi::data_modifier_standardiser dm3;
ksi::data_modifier_imputer_average dm4;

std::cout << dm1.print() << std::endl;
std::cout << dm2.print() << std::endl;
std::cout << dm3.print() << std::endl;
std::cout << dm4.print() << std::endl;

dm1.addModifier(dm2);
std::cout << dm1.print() << std::endl;
dm4.addModifier(dm3);
std::cout << dm4.print() << std::endl;
dm1.addModifier(dm4);

dm1.modify(marg);

std::cout << marg << std::endl;
std::cout << std::endl;
std::cout << dm1.print() << std::endl;
}
{
std::cout << std::endl;
std::cout << "==================================" << std::endl;
std::cout << "split into train and test datasets" << std::endl;
std::cout << "==================================" << std::endl;

{
    // Example: read the complete data set through a train/test model,
    // request a split into a fixed number of chunks, and print the size
    // of the train and test parts for every element the model yields.
    std::cout << std::endl;
    std::cout << "train and test cross validation" << std::endl;
    std::cout << std::endl;

    const int number_of_chunks { 5 };
    const std::string file_name ("/complete.data");

    ksi::reader_complete reader;
    ksi::train_test_model splitter (reader);
    splitter.read_and_split_file(dataDir + file_name, number_of_chunks);

    // Each iteration step unpacks into a (train, test) pair.
    // NOTE(review): presumably one pair per requested chunk -- confirm
    // against train_test_model.
    int chunk_number { 1 };
    for (const auto & [train_part, test_part] : splitter)
    {
        std::cout << "chunk: " << chunk_number
                  << ", train dataset size: " << train_part.size()
                  << ", test data set size: " << test_part.size()
                  << std::endl;
        ++chunk_number;
    }
}

{
    // Example: read the complete data set through a train/validation/test
    // model, request a split into a fixed number of chunks, and print the
    // size of the three parts for every element the model yields.
    std::cout << std::endl;
    std::cout << "train, validate, and test cross validation" << std::endl;
    std::cout << std::endl;

    const int number_of_chunks { 5 };
    const std::string file_name ("/complete.data");

    ksi::reader_complete reader;
    ksi::train_validation_test_model splitter (reader);
    splitter.read_and_split_file(dataDir + file_name, number_of_chunks);

    // Each iteration step unpacks into a (train, validate, test) triple.
    // NOTE(review): presumably one triple per requested chunk -- confirm
    // against train_validation_test_model.
    int chunk_number { 1 };
    for (const auto & [train_part, validation_part, test_part] : splitter)
    {
        std::cout << "chunk: " << chunk_number
                  << ", train dataset size: " << train_part.size()
                  << ", validation data set size: " << validation_part.size()
                  << ", test data set size: " << test_part.size()
                  << std::endl;
        ++chunk_number;
    }
}




}
}
}
Expand Down

0 comments on commit ffc778c

Please sign in to comment.