From a90db17b48fcbae74a857afe5b0f53faaf7a1f97 Mon Sep 17 00:00:00 2001 From: Serkan Korkmaz <77464572+serkor1@users.noreply.github.com> Date: Sat, 21 Dec 2024 13:38:10 +0100 Subject: [PATCH] A complete overhaul of unit-tests :hammer: (#22) * [UNIT-TEST] Consolidated Setup :hammer: * All reference functions are now called from the setup script. * All (unweighted) classification functions are now called from the setup script. * [UNIT-TEST] S3 methods for classification :hammer: * All S3 methods are now tested separately for weighted and unweighted classification. The tests cover balanced and imbalanced classification. * [UNIT-TEST] Completely Rewritten (See message) :hammer: * To account for edge cases, corner cases, and any other type of case, ALL unit tests have been rewritten to capture these errors. * The specificity function has been rewritten, as the {scikit-learn} implementation is misbehaving. --- tests/testthat/pytorch.py | 2 +- tests/testthat/ref-manual.R | 104 ++++++- tests/testthat/scikit-learn.py | 8 - tests/testthat/setup.R | 125 ++++++++- tests/testthat/test-Accuracy.R | 82 ++++++ tests/testthat/test-BalancedAccuracy.R | 91 +++++++ .../test-CoefficientOfDetermination.R | 82 ++++++ tests/testthat/test-CohensKappa.R | 97 +++++++ .../test-ConcordanceCorrelationCoefficient.R | 90 +++++++ tests/testthat/test-ConfusionMatrix.R | 64 +++++ tests/testthat/test-FBetaScore.R | 103 +++++++ tests/testthat/test-FalseDiscoveryRate.R | 95 +++++++ tests/testthat/test-FalseOmissionRate.R | 95 +++++++ tests/testthat/test-FalsePositiveRate.R | 95 +++++++ tests/testthat/test-FowlkesMallowsIndex.R | 74 +++++ tests/testthat/test-HuberLoss.R | 90 +++++++ tests/testthat/test-JaccardIndex.R | 95 +++++++ .../test-MattewsCorrerlationCoefficient.R | 84 ++++++ tests/testthat/test-MeanAbsoluteError.R | 74 +++++ .../test-MeanAbsolutePercentageError.R | 74 +++++ tests/testthat/test-MeanPercentageError.R | 74 +++++ tests/testthat/test-MeanSquaredError.R | 74 +++++ tests/testthat/test-NegativePredictiveValue.R | 95 +++++++ tests/testthat/test-PinballLoss.R | 124 +++++++++ tests/testthat/test-Precision.R | 95 +++++++ tests/testthat/test-ROC.R | 19 -- tests/testthat/test-Recall.R | 95 +++++++ tests/testthat/test-RelativeAbsoluteError.R | 74 +++++ .../test-RelativeRootMeanSquaredError.R | 74 +++++ tests/testthat/test-RootMeanSquaredError.R | 74 +++++ .../test-RootMeanSquaredLogarithmicError.R | 74 +++++ tests/testthat/test-S3-classification.R | 253 ++++++++++++++++++ tests/testthat/test-Specificity.R | 95 +++++++ .../test-SymmetricMeanAbsoluteError.R | 74 +++++ tests/testthat/test-ZeroOneLoss.R | 85 ++++++ .../testthat/test-aggregated-classification.R | 129 --------- tests/testthat/test-balanced-classification.R | 172 ------------ tests/testthat/test-confusion_matrix.R | 40 --- tests/testthat/test-pinball.R | 56 ---- tests/testthat/test-regression.R | 119 -------- tests/testthat/test-rsq.R | 94 ------- tests/testthat/test-weighted_classification.R | 238 ---------------- tests/testthat/test-weighted_regression.R | 101 ------- 43 files changed, 2971 insertions(+), 981 deletions(-) create mode 100644 tests/testthat/test-Accuracy.R create mode 100644 tests/testthat/test-BalancedAccuracy.R create mode 100644 tests/testthat/test-CoefficientOfDetermination.R create mode 100644 tests/testthat/test-CohensKappa.R create mode 100644 tests/testthat/test-ConcordanceCorrelationCoefficient.R create mode 100644 tests/testthat/test-ConfusionMatrix.R create mode 100644 tests/testthat/test-FBetaScore.R create mode
100644 tests/testthat/test-FalseDiscoveryRate.R create mode 100644 tests/testthat/test-FalseOmissionRate.R create mode 100644 tests/testthat/test-FalsePositiveRate.R create mode 100644 tests/testthat/test-FowlkesMallowsIndex.R create mode 100644 tests/testthat/test-HuberLoss.R create mode 100644 tests/testthat/test-JaccardIndex.R create mode 100644 tests/testthat/test-MattewsCorrerlationCoefficient.R create mode 100644 tests/testthat/test-MeanAbsoluteError.R create mode 100644 tests/testthat/test-MeanAbsolutePercentageError.R create mode 100644 tests/testthat/test-MeanPercentageError.R create mode 100644 tests/testthat/test-MeanSquaredError.R create mode 100644 tests/testthat/test-NegativePredictiveValue.R create mode 100644 tests/testthat/test-PinballLoss.R create mode 100644 tests/testthat/test-Precision.R create mode 100644 tests/testthat/test-Recall.R create mode 100644 tests/testthat/test-RelativeAbsoluteError.R create mode 100644 tests/testthat/test-RelativeRootMeanSquaredError.R create mode 100644 tests/testthat/test-RootMeanSquaredError.R create mode 100644 tests/testthat/test-RootMeanSquaredLogarithmicError.R create mode 100644 tests/testthat/test-S3-classification.R create mode 100644 tests/testthat/test-Specificity.R create mode 100644 tests/testthat/test-SymmetricMeanAbsoluteError.R create mode 100644 tests/testthat/test-ZeroOneLoss.R delete mode 100644 tests/testthat/test-aggregated-classification.R delete mode 100644 tests/testthat/test-balanced-classification.R delete mode 100644 tests/testthat/test-confusion_matrix.R delete mode 100644 tests/testthat/test-pinball.R delete mode 100644 tests/testthat/test-regression.R delete mode 100644 tests/testthat/test-rsq.R delete mode 100644 tests/testthat/test-weighted_classification.R delete mode 100644 tests/testthat/test-weighted_regression.R diff --git a/tests/testthat/pytorch.py b/tests/testthat/pytorch.py index 6adc1af..3c06557 100644 --- a/tests/testthat/pytorch.py +++ b/tests/testthat/pytorch.py @@ -5,7 +5,7 @@ from torchmetrics.functional import symmetric_mean_absolute_percentage_error # Classification metrics -def py_huber(actual, predicted, delta=1.0, w=None): +def py_huberloss(actual, predicted, delta=1.0, w=None): actual = torch.tensor(actual, dtype=torch.float64) predicted = torch.tensor(predicted, dtype=torch.float64) diff --git a/tests/testthat/ref-manual.R b/tests/testthat/ref-manual.R index bdc6e45..6b2091a 100644 --- a/tests/testthat/ref-manual.R +++ b/tests/testthat/ref-manual.R @@ -13,9 +13,9 @@ # Concordance Correlation Coefficient # The values have been verified with yardstick and # epiR -py_ccc <- function(actual, predicted, w = NULL, bias = FALSE) { +py_ccc <- function(actual, predicted, w = NULL, correction = FALSE) { - actual <- as.numeric(actual) + actual <- as.numeric(actual) predicted <- as.numeric(predicted) if (is.null(w)) { @@ -39,7 +39,7 @@ py_ccc <- function(actual, predicted, w = NULL, bias = FALSE) { predicted_variance <- cov_matrix$cov[2, 2] covariance <- cov_matrix$cov[1, 2] - if (bias) { + if (correction) { n <- sum(w) actual_variance <- actual_variance * (n - 1) / n predicted_variance <- predicted_variance * (n - 1) / n @@ -55,6 +55,59 @@ py_ccc <- function(actual, predicted, w = NULL, bias = FALSE) { +py_specificity <- function( + actual, + predicted, + average = NULL, + w = NULL, + na.rm = TRUE +) { + + # 1) Construct matrix + conf_mat <- SLmetrics::cmatrix( + actual = actual, + predicted = predicted, + w = w + ) + + TN <- sum(conf_mat) - rowSums(conf_mat) - colSums(conf_mat) + diag(conf_mat) 
+ FP <- colSums(conf_mat) - diag(conf_mat) + + + output <- TN/(TN+FP) + + # 2) calculate values + if (!is.null(average)) { + + average <- as.logical(average == "micro") + + if (average) { + + output <- sum(TN, na.rm = TRUE) / (sum(TN, na.rm = TRUE) + sum(FP, na.rm = TRUE)) + + } else { + + if (!na.rm) { + + output[!is.finite(output)] <- 0 + + } + + output <- mean( + output, + na.rm = na.rm + ) + + } + + } + + return( + output + ) + +} + # False Discovery Rate py_fdr <- function( actual, @@ -395,5 +448,50 @@ ref_prROC <- function(actual, response, thresholds) { } +# Regression Functions +py_rrmse <- function( + actual, + predicted, + w = NULL +) { + + if (is.null(w)) { + w <- rep(1, length(actual)) + } + + sqrt(sum((w * actual - w * predicted)^2) / sum((w * actual - weighted.mean(actual, w = w))^2)) + +} + + +py_rae <- function( + actual, + predicted, + w = NULL) { + + if (is.null(w)) { + w <- rep(1, length(actual)) + } + + sum(abs(actual - predicted)) / sum(abs(actual - weighted.mean(actual, w = w))) +} + + +py_mpe <- function( + predicted, + actual, + w = NULL) { + + if (is.null(w)) { + w <- rep(1, length(actual)) + } + + error <- (actual - predicted) / actual + weighted_mpe <- sum(w * error) / sum(w) + + weighted_mpe +} + + # script end; diff --git a/tests/testthat/scikit-learn.py b/tests/testthat/scikit-learn.py index be2a3c6..9fea69d 100644 --- a/tests/testthat/scikit-learn.py +++ b/tests/testthat/scikit-learn.py @@ -112,14 +112,6 @@ def py_entropy(actual, response, normalize = True, w = None): sample_weight = w ) -def py_specificity(actual, response, average = None, w = None): - return specificity_score( - y_true = actual, - y_pred = response, - average = average, - sample_weight = w - ) - def py_roc(actual, response, pos_label = 1, w = None): return metrics.roc_curve( actual, diff --git a/tests/testthat/setup.R b/tests/testthat/setup.R index 0657948..2dff0b8 100644 --- a/tests/testthat/setup.R +++ b/tests/testthat/setup.R @@ -8,6 +8,9 @@ # # script start; +# 0) set amount of test failures +testthat::set_max_fails(Inf) + # 1) set seed for all # samples set.seed(1903) @@ -113,7 +116,7 @@ set_equal <- function( current, target, tolerance = 1e-9) { - + all.equal( target = target, current = current, @@ -124,5 +127,125 @@ set_equal <- function( } +# 6) load scripts +# globally +reticulate::source_python( + "scikit-learn.py" +) +reticulate::source_python( + "pytorch.py" +) +source("ref-manual.R") + +# 7) define all classification +# functions in {SLmetrics} +sl_classification <- list( + # accuracy + "accuracy" = accuracy, + "baccuracy" = baccuracy, + + # Zero-One Loss + "zerooneloss" = zerooneloss, + + # specificity methods + "specificity" = specificity, + "tnr" = tnr, + "selectivity" = selectivity, + + + # recall methods; + "recall" = recall, + "sensitivity" = sensitivity, + "tpr" = tpr, + + # precision methods + "precision" = precision, + "ppv" = ppv, + + # fbeta methods + "fbeta" = fbeta, + + # likelihood methods + "dor" = dor, + "plr" = plr, + "nlr" = nlr, + + # jaccard methods + "jaccard" = jaccard, + "tscore" = tscore, + "csi" = csi, + + # mcc methods + "mcc" = mcc, + "phi" = phi, + + # false positive + "fpr" = fpr, + "fallout" = fallout, + + # fmi methods + "fmi" = fmi, + + "fdr" = fdr, + "npv" = npv, + "fer" = fer, + + "ckappa" = ckappa + +) + +# 7) define all weighted classification +# functions in {SLmetrics} +sl_wclassification <- list( + # accuracy + "accuracy" = weighted.accuracy, + "baccuracy" = weighted.baccuracy, + + # Zero-One Loss + "zerooneloss" = weighted.zerooneloss, + 
+ # specificity methods + "specificity" = weighted.specificity, + "tnr" = weighted.tnr, + "selectivity" = weighted.selectivity, + + + # recall methods; + "recall" = weighted.recall, + "sensitivity" = weighted.sensitivity, + "tpr" = weighted.tpr, + + # precision methods + "precision" = weighted.precision, + "ppv" = weighted.ppv, + + # fbeta methods + "fbeta" = weighted.fbeta, + + # likelihood methods + "dor" = weighted.dor, + "plr" = weighted.plr, + "nlr" = weighted.nlr, + + # jaccard methods + "jaccard" = weighted.jaccard, + "tscore" = weighted.tscore, + "csi" = weighted.csi, + + # mcc methods + "mcc" = weighted.mcc, + "phi" = weighted.phi, + + # false positive + "fpr" = weighted.fpr, + "fallout" = weighted.fallout, + + "fdr" = weighted.fdr, + "npv" = weighted.npv, + "fer" = weighted.fer, + + "ckappa" = weighted.ckappa + +) # script end; diff --git a/tests/testthat/test-Accuracy.R b/tests/testthat/test-Accuracy.R new file mode 100644 index 0000000..3853243 --- /dev/null +++ b/tests/testthat/test-Accuracy.R @@ -0,0 +1,82 @@ +# objective: Test that Accuracy +# implemented in {SLmetrics} is aligned with +# target functions. + +testthat::test_that( + desc = "Test `accuracy()`-function", code = { + + # 0) construct Balanced Accuracy + # wrapper + wrapped_accuracy <- function( + actual, + predicted, + w = NULL) { + if (is.null(w)) { + accuracy( + actual = actual, + predicted = predicted + ) + } else { + weighted.accuracy( + actual = actual, + predicted = predicted, + w = w + ) + } + } + + for (balanced in c(FALSE, TRUE)) { + + # 1) generate class + # values + actual <- create_factor(balanced = balanced) + predicted <- create_factor(balanced = balanced) + w <- runif(n = length(actual)) + + for (weighted in c(TRUE, FALSE)) { + + # 2.1) generate sensible + # label information + info <- paste( + "Balanced = ", balanced, + "Weighted = ", weighted + ) + + # 2.2) generate score + # from {slmetrics} + score <- wrapped_accuracy( + actual = actual, + predicted = predicted, + w = if (weighted) w else NULL + ) + + # 2.3) test that the values + # are sensible the values + # can be NA + testthat::expect_true(is.numeric(score), info = info) + testthat::expect_true(length(score) == 1, info = info) + + # 2.4) test that the values + # are equal to target value + + # 2.4.1) calculate py_score + py_score <- py_accuracy( + actual = actual, + predicted = predicted, + w = if (weighted) w else NULL + ) + + # 2.4.2) test for equality + testthat::expect_true( + object = set_equal( + current = score, + target = py_score + ), + info = info + ) + + } + + } + } +) \ No newline at end of file diff --git a/tests/testthat/test-BalancedAccuracy.R b/tests/testthat/test-BalancedAccuracy.R new file mode 100644 index 0000000..1fa7381 --- /dev/null +++ b/tests/testthat/test-BalancedAccuracy.R @@ -0,0 +1,91 @@ +# objective: Test that BalancedAccuracy +# implemented in {SLmetrics} is aligned with +# target functions. 
+ +testthat::test_that( + desc = "Test `baccuracy()`-function", code = { + + # 0) construct Balanced Accuracy + # wrapper + wrapped_baccuracy <- function( + actual, + predicted, + w = NULL) { + if (is.null(w)) { + baccuracy( + actual = actual, + predicted = predicted + ) + } else { + weighted.baccuracy( + actual = actual, + predicted = predicted, + w = w + ) + } + } + + for (balanced in c(FALSE, TRUE)) { + + # 1) generate class + # values + actual <- create_factor(balanced = balanced) + predicted <- create_factor(balanced = balanced) + w <- runif(n = length(actual)) + + for (weighted in c(TRUE, FALSE)) { + + # 2) test that the are + # equal to target values + for (adjust in c(TRUE, FALSE)) { + + # 2.1) generate sensible + # label information + info <- paste( + "Balanced = ", balanced, + "Adjusted = ", adjust, + "Weighted = ", weighted + ) + + # 2.2) generate score + # from {slmetrics} + score <- wrapped_baccuracy( + actual = actual, + predicted = predicted, + w = if (weighted) w else NULL + ) + + # 2.3) test that the values + # are sensible the values + # can be NA + testthat::expect_true(is.numeric(score), info = info) + testthat::expect_true(length(score) == 1, info = info) + + # 2.4) test that the values + # are equal to target value + + # 2.4.1) calculate py_score + py_score <- py_baccuracy( + actual = actual, + predicted = predicted, + adjust = adjust, + w = if (weighted) w else NULL + ) + + # 2.4.2) test for equality + testthat::expect_true( + object = set_equal( + current = score, + target = py_score + ), + info = info + ) + + + } + + } + + } + } +) \ No newline at end of file diff --git a/tests/testthat/test-CoefficientOfDetermination.R b/tests/testthat/test-CoefficientOfDetermination.R new file mode 100644 index 0000000..82ec947 --- /dev/null +++ b/tests/testthat/test-CoefficientOfDetermination.R @@ -0,0 +1,82 @@ +# objective: Test that CoefficientOfDetermination +# implemented in {SLmetrics} is aligned with +# target functions. 
+ +testthat::test_that( + desc = "Test `rsq()` function", code = { + + # 0) construct Coefficient Of Determination + # wrapper + wrapped_rsq <- function( + actual, + predicted, + k = NULL, + w = NULL) { + if (is.null(w)) { + rsq( + actual = actual, + predicted = predicted, + k = k + ) + } else { + weighted.rsq( + actual = actual, + predicted = predicted, + k = k, + w = w + ) + } + } + + # 1) prepare + # data + data <- mtcars + actual <- data$mpg + weights <- runif(length(actual)) + + for (weighted in c(TRUE, FALSE)) { + for (adjusted in c(TRUE, FALSE)) { + + # 1) generate sensible + # label information + info <- paste( + "Weighted = ", weighted, + "Adjusted = ", adjusted + ) + + # 2) test that values + # are equal to target values + model <- lm( + formula = mpg ~ ., + data = data, + weights = if (weighted) weights else NULL + ) + + # 2.1) extract values + # from model + target_value <- if (adjusted) { summary(model)$adj.r.squared } else { summary(model)$r.squared } + current_value <- wrapped_rsq( + actual = actual, + predicted = fitted.values(model), + w = if (weighted) weights else NULL, + k = if (adjusted) ncol(model.matrix(model)) - 1 else 0 + ) + + # 2.2) test that the values are sensible + testthat::expect_true(is.numeric(target_value), info = info) + testthat::expect_true(is.numeric(current_value), info = info) + + # 2.3) test for equality + testthat::expect_true( + object = set_equal( + target = target_value, + current = current_value + ), + info = info + ) + + } + + } + } +) \ No newline at end of file diff --git a/tests/testthat/test-CohensKappa.R b/tests/testthat/test-CohensKappa.R new file mode 100644 index 0000000..506be8e --- /dev/null +++ b/tests/testthat/test-CohensKappa.R @@ -0,0 +1,97 @@ +# objective: Test that CohensKappa +# implemented in {SLmetrics} is aligned with +# target functions.
+testthat::test_that( + desc = "Test `ckappa()`-function", code = { + + # 0) construct Balanced Accuracy + # wrapper + wrapped_ckappa <- function( + actual, + predicted, + beta = 0, + w = NULL) { + if (is.null(w)) { + ckappa( + actual = actual, + predicted = predicted, + beta = beta + ) + } else { + weighted.ckappa( + actual = actual, + predicted = predicted, + beta = beta, + w = w + ) + } + } + + for (balanced in c(FALSE, TRUE)) { + + # 1) generate class + # values + actual <- create_factor(balanced = balanced) + predicted <- create_factor(balanced = balanced) + w <- runif(n = length(actual)) + + for (weighted in c(TRUE, FALSE)) { + + # 2) test that the are + # equal to target values + for (beta in c(0, 1, 2)) { + + # 2.1) generate sensible + # label information + info <- paste( + "Balanced = ", balanced, + "Beta = ", beta, + "Weighted = ", weighted + ) + + # 2.2) generate score + # from {slmetrics} + score <- wrapped_ckappa( + actual = actual, + predicted = predicted, + beta = beta, + w = if (weighted) w else NULL + ) + + # 2.3) test that the values + # are sensible the values + # can be NA + testthat::expect_true(is.numeric(score), info = info) + testthat::expect_true(length(score) == 1, info = info) + + # 2.4) test that the values + # are equal to target value + + # 2.4.1) calculate py_score + py_score <- py_ckappa( + actual = actual, + predicted = predicted, + penalty = switch(as.character(beta), + "0" = NULL, + "1" = "linear", + "2" = "quadratic"), + w = if (weighted) w else NULL + ) + + # 2.4.2) test for equality + testthat::expect_true( + object = set_equal( + current = score, + target = py_score + ), + info = info + ) + + + } + + } + + } + } +) \ No newline at end of file diff --git a/tests/testthat/test-ConcordanceCorrelationCoefficient.R b/tests/testthat/test-ConcordanceCorrelationCoefficient.R new file mode 100644 index 0000000..2e47a38 --- /dev/null +++ b/tests/testthat/test-ConcordanceCorrelationCoefficient.R @@ -0,0 +1,90 @@ +# objective: Test that ConcordanceCorrelationCoefficient +# implemented in {SLmetrics} is aligned with +# target functions. 
+testthat::test_that( + desc = "Test `ccc()`-function", code = { + + # 0) construct ccc + # wrapper + wrapped_ccc <- function( + actual, + predicted, + correction, + w = NULL) { + if (is.null(w)) { + ccc( + actual = actual, + predicted = predicted, + correction = correction + ) + } else { + weighted.ccc( + actual = actual, + predicted = predicted, + correction = correction, + w = w + ) + } + } + + # 1) generate regression + # values + values <- create_regression() + actual <- values$actual + predicted <- values$predicted + w <- values$weights + + for (weighted in c(TRUE, FALSE)) { + + # 2) test that they are + # equal to target values + for (correction in c(TRUE, FALSE)) { + + # 2.1) generate sensible + # label information + info <- paste( + "correction = ", correction, + "Weighted = ", weighted + ) + + # 2.2) generate score + # from {slmetrics} + score <- wrapped_ccc( + actual = actual, + predicted = predicted, + correction = correction, + w = if (weighted) w else NULL + ) + + # 2.3) test that the values + # are sensible the values + # can be NA + testthat::expect_true(is.numeric(score), info = info) + testthat::expect_true(length(score) == 1, info = info) + + # 2.4) test that the values + # are equal to target value + + # 2.4.1) calculate py_score + py_score <- py_ccc( + actual = actual, + predicted = predicted, + correction = correction, + w = if (weighted) w else NULL + ) + + # 2.4.2) test for equality + testthat::expect_true( + object = set_equal( + current = score, + target = py_score + ), + info = info + ) + + + } + + } + } +) \ No newline at end of file diff --git a/tests/testthat/test-ConfusionMatrix.R b/tests/testthat/test-ConfusionMatrix.R new file mode 100644 index 0000000..1dd5a81 --- /dev/null +++ b/tests/testthat/test-ConfusionMatrix.R @@ -0,0 +1,64 @@ +# objective: Test that Confusion Matrix +# implemented in {SLmetrics} is aligned with +# target functions. + +testthat::test_that( + desc = "Test `cmatrix()`-function", code = { + + # 2) test that they are + # equal to target values + for (balanced in c(TRUE, FALSE)) { + + # 2.1) generate class + # values and weights + actual <- create_factor(balanced = balanced) + predicted <- create_factor(balanced = balanced) + w <- runif(n = length(actual)) + + for (weighted in c(TRUE, FALSE)) { + + # 2.2) generate sensible + # label information + info <- paste( + "Balanced = ", balanced, + "Weighted = ", weighted + ) + + # 2.3) generate confusion + # matrix + confusion_matrix <- cmatrix( + actual = actual, + predicted = predicted, + w = if (weighted) w else NULL + ) + + # 2.3) test that the values + # are sensible + testthat::expect_true(dim(confusion_matrix)[1] == dim(confusion_matrix)[2], info = info) + testthat::expect_true(dim(confusion_matrix)[1] == length(levels(actual)), info = info) + + # 2.4) test that the values + # are equal to target + py_confusion_matrix <- py_cmatrix( + actual = actual, + predicted = predicted, + w = if (weighted) w else NULL + ) + + # 2.5) test for equality + testthat::expect_true( + object = set_equal( + current = confusion_matrix, + target = py_confusion_matrix + ), + info = info + ) + + + + } + + } + } +) + diff --git a/tests/testthat/test-FBetaScore.R b/tests/testthat/test-FBetaScore.R new file mode 100644 index 0000000..b1a9327 --- /dev/null +++ b/tests/testthat/test-FBetaScore.R @@ -0,0 +1,103 @@ +# objective: Test that FBetaScore +# implemented in {SLmetrics} is aligned with +# target functions.
+ +testthat::test_that( + desc = "Test `fbeta()`-function", code = { + + # 0) construct Balanced Accuracy + # wrapper + wrapped_fbeta <- function( + actual, + predicted, + beta, + w = NULL, + micro = TRUE) { + + if (is.null(w)) { + fbeta( + actual = actual, + predicted = predicted, + beta = beta, + micro = micro + ) + } else { + weighted.fbeta( + actual = actual, + predicted = predicted, + w = w, + beta = beta, + micro = micro + ) + } + } + + for (balanced in c(FALSE, TRUE)) { + + # 1) generate class + # values + actual <- create_factor(balanced = balanced) + predicted <- create_factor(balanced = balanced) + w <- runif(n = length(actual)) + + for (weighted in c(TRUE, FALSE)) { + + # 2) test that the are + # equal to target values + for (beta in c(0, 4, 8)) { + for (micro in c(NA, TRUE, FALSE)) { + + + # 2.1) generate sensible + # label information + info <- paste( + "Balanced = ", balanced, + "beta = ", beta, + "Weighted = ", weighted, + "Micro =", micro + ) + + # 2.2) generate score + # from {slmetrics} + score <- wrapped_fbeta( + actual = actual, + predicted = predicted, + beta = beta, + w = if (weighted) w else NULL, + micro = if (is.na(NA)) { NULL } else micro + ) + + # 2.3) test that the values + # are sensible the values + # can be NA + testthat::expect_true(is.numeric(score), info = info) + + # 2.4) test that the values + # are equal to target value + + # 2.4.1) calculate py_score + py_score <- py_fbeta( + actual = actual, + predicted = predicted, + beta = beta, + average = if (is.na(NA)) { NULL } else ifelse(micro, "micro", "macro"), + w = if (weighted) w else NULL + ) + + # 2.4.2) test for equality + testthat::expect_true( + object = set_equal( + current = score, + target = py_score + ), + info = info + ) + + } + } + + } + + } + } +) \ No newline at end of file diff --git a/tests/testthat/test-FalseDiscoveryRate.R b/tests/testthat/test-FalseDiscoveryRate.R new file mode 100644 index 0000000..b7e1c19 --- /dev/null +++ b/tests/testthat/test-FalseDiscoveryRate.R @@ -0,0 +1,95 @@ +# objective: Test that the metric +# implemented in {SLmetrics} is aligned with +# target functions. 
+ +testthat::test_that( + desc = "Test `fdr()`-function", code = { + + # 0) construct Balanced Accuracy + # wrapper + wrapped_fdr <- function( + actual, + predicted, + w = NULL, + micro = TRUE) { + + if (is.null(w)) { + fdr( + actual = actual, + predicted = predicted, + micro = micro + ) + } else { + weighted.fdr( + actual = actual, + predicted = predicted, + w = w, + micro = micro + ) + } + + } + + for (balanced in c(FALSE, TRUE)) { + + # 1) generate class + # values + actual <- create_factor(balanced = balanced) + predicted <- create_factor(balanced = balanced) + w <- runif(n = length(actual)) + + for (weighted in c(TRUE, FALSE)) { + + # 2) test that the are + # equal to target values + for (micro in c(NA, TRUE, FALSE)) { + + # 2.1) generate sensible + # label information + info <- paste( + "Balanced = ", balanced, + "Weighted = ", weighted, + "Micro = ", micro + ) + + # 2.2) generate score + # from {slmetrics} + score <- wrapped_fdr( + actual = actual, + predicted = predicted, + w = if (weighted) w else NULL, + micro = if (is.na(NA)) { NULL } else micro + ) + + # 2.3) test that the values + # are sensible the values + # can be NA + testthat::expect_true(is.numeric(score), info = info) + + # 2.4) test that the values + # are equal to target value + + # 2.4.1) calculate py_score + py_score <- py_fdr( + actual = actual, + predicted = predicted, + average = if (is.na(NA)) { NULL } else ifelse(micro, "micro", "macro"), + w = if (weighted) w else NULL + ) + + # 2.4.2) test for equality + testthat::expect_true( + object = set_equal( + current = score, + target = py_score + ), + info = info + ) + + } + + } + + } + } +) \ No newline at end of file diff --git a/tests/testthat/test-FalseOmissionRate.R b/tests/testthat/test-FalseOmissionRate.R new file mode 100644 index 0000000..457f38f --- /dev/null +++ b/tests/testthat/test-FalseOmissionRate.R @@ -0,0 +1,95 @@ +# objective: Test that the metric +# implemented in {SLmetrics} is aligned with +# target functions. 
+ +testthat::test_that( + desc = "Test `fer()`-function", code = { + + # 0) construct Balanced Accuracy + # wrapper + wrapped_fer <- function( + actual, + predicted, + w = NULL, + micro = TRUE) { + + if (is.null(w)) { + fer( + actual = actual, + predicted = predicted, + micro = micro + ) + } else { + weighted.fer( + actual = actual, + predicted = predicted, + w = w, + micro = micro + ) + } + + } + + for (balanced in c(FALSE, TRUE)) { + + # 1) generate class + # values + actual <- create_factor(balanced = balanced) + predicted <- create_factor(balanced = balanced) + w <- runif(n = length(actual)) + + for (weighted in c(TRUE, FALSE)) { + + # 2) test that the are + # equal to target values + for (micro in c(NA, TRUE, FALSE)) { + + # 2.1) generate sensible + # label information + info <- paste( + "Balanced = ", balanced, + "Weighted = ", weighted, + "Micro = ", micro + ) + + # 2.2) generate score + # from {slmetrics} + score <- wrapped_fer( + actual = actual, + predicted = predicted, + w = if (weighted) w else NULL, + micro = if (is.na(NA)) { NULL } else micro + ) + + # 2.3) test that the values + # are sensible the values + # can be NA + testthat::expect_true(is.numeric(score), info = info) + + # 2.4) test that the values + # are equal to target value + + # 2.4.1) calculate py_score + py_score <- py_fer( + actual = actual, + predicted = predicted, + average = if (is.na(NA)) { NULL } else ifelse(micro, "micro", "macro"), + w = if (weighted) w else NULL + ) + + # 2.4.2) test for equality + testthat::expect_true( + object = set_equal( + current = score, + target = py_score + ), + info = info + ) + + } + + } + + } + } +) \ No newline at end of file diff --git a/tests/testthat/test-FalsePositiveRate.R b/tests/testthat/test-FalsePositiveRate.R new file mode 100644 index 0000000..dc4b1d5 --- /dev/null +++ b/tests/testthat/test-FalsePositiveRate.R @@ -0,0 +1,95 @@ +# objective: Test that the metric +# implemented in {SLmetrics} is aligned with +# target functions. 
+ +testthat::test_that( + desc = "Test `fpr()`-function", code = { + + # 0) construct Balanced Accuracy + # wrapper + wrapped_fpr <- function( + actual, + predicted, + w = NULL, + micro = TRUE) { + + if (is.null(w)) { + fpr( + actual = actual, + predicted = predicted, + micro = micro + ) + } else { + weighted.fpr( + actual = actual, + predicted = predicted, + w = w, + micro = micro + ) + } + + } + + for (balanced in c(FALSE, TRUE)) { + + # 1) generate class + # values + actual <- create_factor(balanced = balanced) + predicted <- create_factor(balanced = balanced) + w <- runif(n = length(actual)) + + for (weighted in c(TRUE, FALSE)) { + + # 2) test that the are + # equal to target values + for (micro in c(NA, TRUE, FALSE)) { + + # 2.1) generate sensible + # label information + info <- paste( + "Balanced = ", balanced, + "Weighted = ", weighted, + "Micro = ", micro + ) + + # 2.2) generate score + # from {slmetrics} + score <- wrapped_fpr( + actual = actual, + predicted = predicted, + w = if (weighted) w else NULL, + micro = if (is.na(NA)) { NULL } else micro + ) + + # 2.3) test that the values + # are sensible the values + # can be NA + testthat::expect_true(is.numeric(score), info = info) + + # 2.4) test that the values + # are equal to target value + + # 2.4.1) calculate py_score + py_score <- py_fpr( + actual = actual, + predicted = predicted, + average = if (is.na(NA)) { NULL } else ifelse(micro, "micro", "macro"), + w = if (weighted) w else NULL + ) + + # 2.4.2) test for equality + testthat::expect_true( + object = set_equal( + current = score, + target = py_score + ), + info = info + ) + + } + + } + + } + } +) \ No newline at end of file diff --git a/tests/testthat/test-FowlkesMallowsIndex.R b/tests/testthat/test-FowlkesMallowsIndex.R new file mode 100644 index 0000000..a2aead1 --- /dev/null +++ b/tests/testthat/test-FowlkesMallowsIndex.R @@ -0,0 +1,74 @@ +# objective: Test that Fowlks Mallows Index +# implemented in {SLmetrics} is aligned with +# target functions. 
+ +testthat::test_that( + desc = "Test `fmi()`-function", code = { + + # 0) construct fmi + # wrapper + wrapped_fmi <- function( + actual, + predicted) { + + fmi( + actual = actual, + predicted = predicted + ) + + } + + for (balanced in c(FALSE, TRUE)) { + + # 1) generate class + # values + actual <- create_factor(balanced = balanced) + predicted <- create_factor(balanced = balanced) + w <- runif(n = length(actual)) + + for (weighted in c(TRUE, FALSE)) { + + # 2) test that the are + # equal to target values + # 2.1) generate sensible + # label information + info <- paste( + "Balanced = ", balanced + ) + + # 2.2) generate score + # from {slmetrics} + score <- wrapped_fmi( + actual = actual, + predicted = predicted + ) + + # 2.3) test that the values + # are sensible the values + # can be NA + testthat::expect_true(is.numeric(score), info = info) + testthat::expect_true(length(score) == 1, info = info) + + # 2.4) test that the values + # are equal to target value + + # 2.4.1) calculate py_score + py_score <- py_fmi( + actual = actual, + predicted = predicted + ) + + # 2.4.2) test for equality + testthat::expect_true( + object = set_equal( + current = score, + target = py_score + ), + info = info + ) + + } + + } + } +) \ No newline at end of file diff --git a/tests/testthat/test-HuberLoss.R b/tests/testthat/test-HuberLoss.R new file mode 100644 index 0000000..18e873d --- /dev/null +++ b/tests/testthat/test-HuberLoss.R @@ -0,0 +1,90 @@ +# objective: Test that HuberLoss +# implemented in {SLmetrics} is aligned with +# target functions. +testthat::test_that( + desc = "Test `huberloss()`-function", code = { + + # 0) construct Balanced Accuracy + # wrapper + wrapped_huberloss <- function( + actual, + predicted, + delta, + w = NULL) { + if (is.null(w)) { + huberloss( + actual = actual, + predicted = predicted, + delta = delta + ) + } else { + weighted.huberloss( + actual = actual, + predicted = predicted, + delta = delta, + w = w + ) + } + } + + # 1) generate regression + # values + values <- create_regression() + actual <- values$actual + predicted <- values$predicted + w <- values$weights + + for (weighted in c(TRUE, FALSE)) { + + # 2) test that the are + # equal to target values + for (delta in c(1, 2, 3)) { + + # 2.1) generate sensible + # label information + info <- paste( + "delta = ", delta, + "Weighted = ", weighted + ) + + # 2.2) generate score + # from {slmetrics} + score <- wrapped_huberloss( + actual = actual, + predicted = predicted, + delta = delta, + w = if (weighted) w else NULL + ) + + # 2.3) test that the values + # are sensible the values + # can be NA + testthat::expect_true(is.numeric(score), info = info) + testthat::expect_true(length(score) == 1, info = info) + + # 2.4) test that the values + # are equal to target value + + # 2.4.1) calculate py_score + py_score <- py_huberloss( + actual = actual, + predicted = predicted, + delta = delta, + w = if (weighted) w else NULL + ) + + # 2.4.2) test for equality + testthat::expect_true( + object = set_equal( + current = score, + target = py_score + ), + info = info + ) + + + } + + } + } +) \ No newline at end of file diff --git a/tests/testthat/test-JaccardIndex.R b/tests/testthat/test-JaccardIndex.R new file mode 100644 index 0000000..10474ce --- /dev/null +++ b/tests/testthat/test-JaccardIndex.R @@ -0,0 +1,95 @@ +# objective: Test that the metric +# implemented in {SLmetrics} is aligned with +# target functions. 
+ +testthat::test_that( + desc = "Test `jaccard()`-function", code = { + + # 0) construct Balanced Accuracy + # wrapper + wrapped_jaccard <- function( + actual, + predicted, + w = NULL, + micro = TRUE) { + + if (is.null(w)) { + jaccard( + actual = actual, + predicted = predicted, + micro = micro + ) + } else { + weighted.jaccard( + actual = actual, + predicted = predicted, + w = w, + micro = micro + ) + } + + } + + for (balanced in c(FALSE, TRUE)) { + + # 1) generate class + # values + actual <- create_factor(balanced = balanced) + predicted <- create_factor(balanced = balanced) + w <- runif(n = length(actual)) + + for (weighted in c(TRUE, FALSE)) { + + # 2) test that the are + # equal to target values + for (micro in c(NA, TRUE, FALSE)) { + + # 2.1) generate sensible + # label information + info <- paste( + "Balanced = ", balanced, + "Weighted = ", weighted, + "Micro = ", micro + ) + + # 2.2) generate score + # from {slmetrics} + score <- wrapped_jaccard( + actual = actual, + predicted = predicted, + w = if (weighted) w else NULL, + micro = if (is.na(NA)) { NULL } else micro + ) + + # 2.3) test that the values + # are sensible the values + # can be NA + testthat::expect_true(is.numeric(score), info = info) + + # 2.4) test that the values + # are equal to target value + + # 2.4.1) calculate py_score + py_score <- py_jaccard( + actual = actual, + predicted = predicted, + average = if (is.na(NA)) { NULL } else ifelse(micro, "micro", "macro"), + w = if (weighted) w else NULL + ) + + # 2.4.2) test for equality + testthat::expect_true( + object = set_equal( + current = score, + target = py_score + ), + info = info + ) + + } + + } + + } + } +) \ No newline at end of file diff --git a/tests/testthat/test-MattewsCorrerlationCoefficient.R b/tests/testthat/test-MattewsCorrerlationCoefficient.R new file mode 100644 index 0000000..3bfda05 --- /dev/null +++ b/tests/testthat/test-MattewsCorrerlationCoefficient.R @@ -0,0 +1,84 @@ +# objective: Test that Matthews Correlation Coefficient +# implemented in {SLmetrics} is aligned with +# target functions. 
+ +testthat::test_that( + desc = "Test `mcc()`-function", code = { + + # 0) construct mcc + # wrapper + wrapped_mcc <- function( + actual, + predicted, + w = NULL) { + if (is.null(w)) { + mcc( + actual = actual, + predicted = predicted + ) + } else { + weighted.mcc( + actual = actual, + predicted = predicted, + w = w + ) + } + } + + for (balanced in c(FALSE, TRUE)) { + + # 1) generate class + # values + actual <- create_factor(balanced = balanced) + predicted <- create_factor(balanced = balanced) + w <- runif(n = length(actual)) + + for (weighted in c(TRUE, FALSE)) { + + # 2) test that the are + # equal to target values + # 2.1) generate sensible + # label information + info <- paste( + "Balanced = ", balanced, + "Weighted = ", weighted + ) + + # 2.2) generate score + # from {slmetrics} + score <- wrapped_mcc( + actual = actual, + predicted = predicted, + w = if (weighted) w else NULL + ) + + # 2.3) test that the values + # are sensible the values + # can be NA + testthat::expect_true(is.numeric(score), info = info) + testthat::expect_true(length(score) == 1, info = info) + + # 2.4) test that the values + # are equal to target value + + # 2.4.1) calculate py_score + py_score <- py_mcc( + actual = actual, + predicted = predicted, + w = if (weighted) w else NULL + ) + + # 2.4.2) test for equality + testthat::expect_true( + object = set_equal( + current = score, + target = py_score + ), + info = info + ) + + } + + } + } +) \ No newline at end of file diff --git a/tests/testthat/test-MeanAbsoluteError.R b/tests/testthat/test-MeanAbsoluteError.R new file mode 100644 index 0000000..fdb6354 --- /dev/null +++ b/tests/testthat/test-MeanAbsoluteError.R @@ -0,0 +1,74 @@ +# objective: Test that the metric +# implemented in {SLmetrics} is aligned with +# target functions. + +testthat::test_that( + desc = "Test `mae()`-function", code = { + + # 0) construct mae-wrapperr + wrapped_mae <- function( + actual, + predicted, + w = NULL) { + if (is.null(w)) { + mae( + actual = actual, + predicted = predicted + ) + } else { + weighted.mae( + actual = actual, + predicted = predicted, + w = w + ) + } + } + + for (weighted in c(FALSE, TRUE)) { + + # 0) create regression + # for the test + values <- create_regression() + actual <- values$actual + predicted <- values$predicted + w <- if (weighted) NULL else values$weights + + # 1) generate sensible + # label information + info <- paste( + "Weighted = ", weighted + ) + + # 2) generate score + # from {slmetrics} + score <- wrapped_mae( + actual = actual, + predicted = predicted, + w = w + ) + + # 2.1) test that the values + # are sensible + testthat::expect_true(is.numeric(score), info = info) + testthat::expect_true(!is.na(score), info = info) + testthat::expect_true(length(score) == 1, info = info) + + # 2.2) calculate reference value + py_score <- py_mae( + actual = actual, + predicted = predicted, + w = w + ) + + # 2.3) test for equality + testthat::expect_true( + object = set_equal( + current = score, + target = py_score + ), + info = info + ) + + } + } +) \ No newline at end of file diff --git a/tests/testthat/test-MeanAbsolutePercentageError.R b/tests/testthat/test-MeanAbsolutePercentageError.R new file mode 100644 index 0000000..ed3c9bd --- /dev/null +++ b/tests/testthat/test-MeanAbsolutePercentageError.R @@ -0,0 +1,74 @@ +# objective: Test that the metric +# implemented in {SLmetrics} is aligned with +# target functions. 
+ +testthat::test_that( + desc = "Test `mape()`-function", code = { + + # 0) construct mape-wrapperr + wrapped_mape <- function( + actual, + predicted, + w = NULL) { + if (is.null(w)) { + mape( + actual = actual, + predicted = predicted + ) + } else { + weighted.mape( + actual = actual, + predicted = predicted, + w = w + ) + } + } + + for (weighted in c(FALSE, TRUE)) { + + # 0) create regression + # for the test + values <- create_regression() + actual <- values$actual + predicted <- values$predicted + w <- if (weighted) NULL else values$weights + + # 1) generate sensible + # label information + info <- paste( + "Weighted = ", weighted + ) + + # 2) generate score + # from {slmetrics} + score <- wrapped_mape( + actual = actual, + predicted = predicted, + w = w + ) + + # 2.1) test that the values + # are sensible + testthat::expect_true(is.numeric(score), info = info) + testthat::expect_true(!is.na(score), info = info) + testthat::expect_true(length(score) == 1, info = info) + + # 2.2) calculate reference value + py_score <- py_mape( + actual = actual, + predicted = predicted, + w = w + ) + + # 2.3) test for equality + testthat::expect_true( + object = set_equal( + current = score, + target = py_score + ), + info = info + ) + + } + } +) \ No newline at end of file diff --git a/tests/testthat/test-MeanPercentageError.R b/tests/testthat/test-MeanPercentageError.R new file mode 100644 index 0000000..4e8cdbf --- /dev/null +++ b/tests/testthat/test-MeanPercentageError.R @@ -0,0 +1,74 @@ +# objective: Test that the metric +# implemented in {SLmetrics} is aligned with +# target functions. + +testthat::test_that( + desc = "Test `mpe()`-function", code = { + + # 0) construct mpe-wrapperr + wrapped_mpe <- function( + actual, + predicted, + w = NULL) { + if (is.null(w)) { + mpe( + actual = actual, + predicted = predicted + ) + } else { + weighted.mpe( + actual = actual, + predicted = predicted, + w = w + ) + } + } + + for (weighted in c(FALSE, TRUE)) { + + # 0) create regression + # for the test + values <- create_regression() + actual <- values$actual + predicted <- values$predicted + w <- if (weighted) NULL else values$weights + + # 1) generate sensible + # label information + info <- paste( + "Weighted = ", weighted + ) + + # 2) generate score + # from {slmetrics} + score <- wrapped_mpe( + actual = actual, + predicted = predicted, + w = w + ) + + # 2.1) test that the values + # are sensible + testthat::expect_true(is.numeric(score), info = info) + testthat::expect_true(!is.na(score), info = info) + testthat::expect_true(length(score) == 1, info = info) + + # 2.2) calculate reference value + py_score <- py_mpe( + actual = actual, + predicted = predicted, + w = w + ) + + # 2.3) test for equality + testthat::expect_true( + object = set_equal( + current = score, + target = py_score + ), + info = info + ) + + } + } +) \ No newline at end of file diff --git a/tests/testthat/test-MeanSquaredError.R b/tests/testthat/test-MeanSquaredError.R new file mode 100644 index 0000000..4596727 --- /dev/null +++ b/tests/testthat/test-MeanSquaredError.R @@ -0,0 +1,74 @@ +# objective: Test that the metric +# implemented in {SLmetrics} is aligned with +# target functions. 
+ +testthat::test_that( + desc = "Test `mse()`-function", code = { + + # 0) construct mse-wrapperr + wrapped_mse <- function( + actual, + predicted, + w = NULL) { + if (is.null(w)) { + mse( + actual = actual, + predicted = predicted + ) + } else { + weighted.mse( + actual = actual, + predicted = predicted, + w = w + ) + } + } + + for (weighted in c(FALSE, TRUE)) { + + # 0) create regression + # for the test + values <- create_regression() + actual <- values$actual + predicted <- values$predicted + w <- if (weighted) NULL else values$weights + + # 1) generate sensible + # label information + info <- paste( + "Weighted = ", weighted + ) + + # 2) generate score + # from {slmetrics} + score <- wrapped_mse( + actual = actual, + predicted = predicted, + w = w + ) + + # 2.1) test that the values + # are sensible + testthat::expect_true(is.numeric(score), info = info) + testthat::expect_true(!is.na(score), info = info) + testthat::expect_true(length(score) == 1, info = info) + + # 2.2) calculate reference value + py_score <- py_mse( + actual = actual, + predicted = predicted, + w = w + ) + + # 2.3) test for equality + testthat::expect_true( + object = set_equal( + current = score, + target = py_score + ), + info = info + ) + + } + } +) \ No newline at end of file diff --git a/tests/testthat/test-NegativePredictiveValue.R b/tests/testthat/test-NegativePredictiveValue.R new file mode 100644 index 0000000..4981db8 --- /dev/null +++ b/tests/testthat/test-NegativePredictiveValue.R @@ -0,0 +1,95 @@ +# objective: Test that the metric +# implemented in {SLmetrics} is aligned with +# target functions. + +testthat::test_that( + desc = "Test `npv()`-function", code = { + + # 0) construct Balanced Accuracy + # wrapper + wrapped_npv <- function( + actual, + predicted, + w = NULL, + micro = TRUE) { + + if (is.null(w)) { + npv( + actual = actual, + predicted = predicted, + micro = micro + ) + } else { + weighted.npv( + actual = actual, + predicted = predicted, + w = w, + micro = micro + ) + } + + } + + for (balanced in c(FALSE, TRUE)) { + + # 1) generate class + # values + actual <- create_factor(balanced = balanced) + predicted <- create_factor(balanced = balanced) + w <- runif(n = length(actual)) + + for (weighted in c(TRUE, FALSE)) { + + # 2) test that the are + # equal to target values + for (micro in c(NA, TRUE, FALSE)) { + + # 2.1) generate sensible + # label information + info <- paste( + "Balanced = ", balanced, + "Weighted = ", weighted, + "Micro = ", micro + ) + + # 2.2) generate score + # from {slmetrics} + score <- wrapped_npv( + actual = actual, + predicted = predicted, + w = if (weighted) w else NULL, + micro = if (is.na(NA)) { NULL } else micro + ) + + # 2.3) test that the values + # are sensible the values + # can be NA + testthat::expect_true(is.numeric(score), info = info) + + # 2.4) test that the values + # are equal to target value + + # 2.4.1) calculate py_score + py_score <- py_npv( + actual = actual, + predicted = predicted, + average = if (is.na(NA)) { NULL } else ifelse(micro, "micro", "macro"), + w = if (weighted) w else NULL + ) + + # 2.4.2) test for equality + testthat::expect_true( + object = set_equal( + current = score, + target = py_score + ), + info = info + ) + + } + + } + + } + } +) \ No newline at end of file diff --git a/tests/testthat/test-PinballLoss.R b/tests/testthat/test-PinballLoss.R new file mode 100644 index 0000000..0d1ebcd --- /dev/null +++ b/tests/testthat/test-PinballLoss.R @@ -0,0 +1,124 @@ +# objective: Test that PinballLoss +# implemented in {SLmetrics} is 
aligned with +# target functions. +testthat::test_that( + desc = "Test `pinball()`-function", code = { + + # 0) construct Balanced Accuracy + # wrapper + wrapped_pinball <- function( + actual, + predicted, + alpha, + deviance, + w = NULL) { + if (is.null(w)) { + pinball( + actual = actual, + predicted = predicted, + deviance = deviance, + alpha = alpha + ) + } else { + weighted.pinball( + actual = actual, + predicted = predicted, + alpha = alpha, + deviance = deviance, + w = w + ) + } + } + + wrapped_pypinball <- function( + actual, + predicted, + alpha, + deviance, + w = NULL + ) { + + if (deviance) { + py_d2pinball( + actual = actual, + predicted = predicted, + w = w, + alpha = alpha + ) + } else { + py_pinball( + actual = actual, + predicted = predicted, + w = w, + alpha = alpha + ) + } + + } + + # 1) generate regression + # values + values <- create_regression() + actual <- values$actual + predicted <- values$predicted + w <- values$weights + + for (weighted in c(TRUE, FALSE)) { + for (deviance in c(TRUE, FALSE)) { + + # 2) test that the are + # equal to target values + for (alpha in c(0.2, 0.5, 0.8)) { + + # 2.1) generate sensible + # label information + info <- paste( + "Alpha = ", alpha, + "Deviance = ", deviance, + "Weighted = ", weighted + ) + + # 2.2) generate score + # from {slmetrics} + score <- wrapped_pinball( + actual = actual, + predicted = predicted, + alpha = alpha, + deviance = deviance, + w = if (weighted) w else NULL + ) + + # 2.3) test that the values + # are sensible the values + # can be NA + testthat::expect_true(is.numeric(score), info = info) + testthat::expect_true(length(score) == 1, info = info) + + # 2.4) test that the values + # are equal to target value + + # 2.4.1) calculate py_score + py_score <- wrapped_pypinball( + actual = actual, + predicted = predicted, + deviance = deviance, + alpha = alpha, + w = if (weighted) w else NULL + ) + + # 2.4.2) test for equality + testthat::expect_true( + object = set_equal( + current = score, + target = py_score + ), + info = info + ) + + } + + } + + } + } +) diff --git a/tests/testthat/test-Precision.R b/tests/testthat/test-Precision.R new file mode 100644 index 0000000..8149bc2 --- /dev/null +++ b/tests/testthat/test-Precision.R @@ -0,0 +1,95 @@ +# objective: Test that the metric +# implemented in {SLmetrics} is aligned with +# target functions. 
+ +testthat::test_that( + desc = "Test `precision()`-function", code = { + + # 0) construct Balanced Accuracy + # wrapper + wrapped_precision <- function( + actual, + predicted, + w = NULL, + micro = TRUE) { + + if (is.null(w)) { + precision( + actual = actual, + predicted = predicted, + micro = micro + ) + } else { + weighted.precision( + actual = actual, + predicted = predicted, + w = w, + micro = micro + ) + } + + } + + for (balanced in c(FALSE, TRUE)) { + + # 1) generate class + # values + actual <- create_factor(balanced = balanced) + predicted <- create_factor(balanced = balanced) + w <- runif(n = length(actual)) + + for (weighted in c(TRUE, FALSE)) { + + # 2) test that the are + # equal to target values + for (micro in c(NA, TRUE, FALSE)) { + + # 2.1) generate sensible + # label information + info <- paste( + "Balanced = ", balanced, + "Weighted = ", weighted, + "Micro = ", micro + ) + + # 2.2) generate score + # from {slmetrics} + score <- wrapped_precision( + actual = actual, + predicted = predicted, + w = if (weighted) w else NULL, + micro = if (is.na(NA)) { NULL } else micro + ) + + # 2.3) test that the values + # are sensible the values + # can be NA + testthat::expect_true(is.numeric(score), info = info) + + # 2.4) test that the values + # are equal to target value + + # 2.4.1) calculate py_score + py_score <- py_precision( + actual = actual, + predicted = predicted, + average = if (is.na(NA)) { NULL } else ifelse(micro, "micro", "macro"), + w = if (weighted) w else NULL + ) + + # 2.4.2) test for equality + testthat::expect_true( + object = set_equal( + current = score, + target = py_score + ), + info = info + ) + + } + + } + + } + } +) \ No newline at end of file diff --git a/tests/testthat/test-ROC.R b/tests/testthat/test-ROC.R index 69492c2..5be4f92 100644 --- a/tests/testthat/test-ROC.R +++ b/tests/testthat/test-ROC.R @@ -9,15 +9,6 @@ testthat::test_that( desc = "Test that `ROC()`-function works as expected", code = { - # testthat::skip("Skippin ROC()-tests - currently unstable") - # 0) load functions from - # {scikit-learn} and {pytorch} - reticulate::source_python( - "scikit-learn.py" - ) - - source("ref-manual.R") - n <- 1e3 k <- 4 # 1) generate @@ -162,16 +153,6 @@ testthat::test_that( desc = "Test that `prROC()`-function works as expected", code = { - # testthat::skip("Skippin pROC()-tests - currently unstable") - - # 0) load functions from - # {scikit-learn} and {pytorch} - reticulate::source_python( - "scikit-learn.py" - ) - - source("ref-manual.R") - n <- 1e3 k <- 4 # 1) generate diff --git a/tests/testthat/test-Recall.R b/tests/testthat/test-Recall.R new file mode 100644 index 0000000..bd4b465 --- /dev/null +++ b/tests/testthat/test-Recall.R @@ -0,0 +1,95 @@ +# objective: Test that the metric +# implemented in {SLmetrics} is aligned with +# target functions. 
+ +testthat::test_that( + desc = "Test `recall()`-function", code = { + + # 0) construct Balanced Accuracy + # wrapper + wrapped_recall <- function( + actual, + predicted, + w = NULL, + micro = TRUE) { + + if (is.null(w)) { + recall( + actual = actual, + predicted = predicted, + micro = micro + ) + } else { + weighted.recall( + actual = actual, + predicted = predicted, + w = w, + micro = micro + ) + } + + } + + for (balanced in c(FALSE, TRUE)) { + + # 1) generate class + # values + actual <- create_factor(balanced = balanced) + predicted <- create_factor(balanced = balanced) + w <- runif(n = length(actual)) + + for (weighted in c(TRUE, FALSE)) { + + # 2) test that the are + # equal to target values + for (micro in c(NA, TRUE, FALSE)) { + + # 2.1) generate sensible + # label information + info <- paste( + "Balanced = ", balanced, + "Weighted = ", weighted, + "Micro = ", micro + ) + + # 2.2) generate score + # from {slmetrics} + score <- wrapped_recall( + actual = actual, + predicted = predicted, + w = if (weighted) w else NULL, + micro = if (is.na(NA)) { NULL } else micro + ) + + # 2.3) test that the values + # are sensible the values + # can be NA + testthat::expect_true(is.numeric(score), info = info) + + # 2.4) test that the values + # are equal to target value + + # 2.4.1) calculate py_score + py_score <- py_recall( + actual = actual, + predicted = predicted, + average = if (is.na(NA)) { NULL } else ifelse(micro, "micro", "macro"), + w = if (weighted) w else NULL + ) + + # 2.4.2) test for equality + testthat::expect_true( + object = set_equal( + current = score, + target = py_score + ), + info = info + ) + + } + + } + + } + } +) \ No newline at end of file diff --git a/tests/testthat/test-RelativeAbsoluteError.R b/tests/testthat/test-RelativeAbsoluteError.R new file mode 100644 index 0000000..9d9e7e7 --- /dev/null +++ b/tests/testthat/test-RelativeAbsoluteError.R @@ -0,0 +1,74 @@ +# objective: Test that the metric +# implemented in {SLmetrics} is aligned with +# target functions. 
+ +testthat::test_that( + desc = "Test `rae()`-function", code = { + + # 0) construct rae-wrapperr + wrapped_rae <- function( + actual, + predicted, + w = NULL) { + if (is.null(w)) { + rae( + actual = actual, + predicted = predicted + ) + } else { + weighted.rae( + actual = actual, + predicted = predicted, + w = w + ) + } + } + + for (weighted in c(FALSE, TRUE)) { + + # 0) create regression + # for the test + values <- create_regression() + actual <- values$actual + predicted <- values$predicted + w <- if (weighted) NULL else values$weights + + # 1) generate sensible + # label information + info <- paste( + "Weighted = ", weighted + ) + + # 2) generate score + # from {slmetrics} + score <- wrapped_rae( + actual = actual, + predicted = predicted, + w = w + ) + + # 2.1) test that the values + # are sensible + testthat::expect_true(is.numeric(score), info = info) + testthat::expect_true(!is.na(score), info = info) + testthat::expect_true(length(score) == 1, info = info) + + # 2.2) calculate reference value + py_score <- py_rae( + actual = actual, + predicted = predicted, + w = w + ) + + # 2.3) test for equality + testthat::expect_true( + object = set_equal( + current = score, + target = py_score + ), + info = info + ) + + } + } +) \ No newline at end of file diff --git a/tests/testthat/test-RelativeRootMeanSquaredError.R b/tests/testthat/test-RelativeRootMeanSquaredError.R new file mode 100644 index 0000000..3ee9a40 --- /dev/null +++ b/tests/testthat/test-RelativeRootMeanSquaredError.R @@ -0,0 +1,74 @@ +# objective: Test that the metric +# implemented in {SLmetrics} is aligned with +# target functions. + +testthat::test_that( + desc = "Test `rrmse()`-function", code = { + + # 0) construct rrmse-wrapperr + wrapped_rrmse <- function( + actual, + predicted, + w = NULL) { + if (is.null(w)) { + rrmse( + actual = actual, + predicted = predicted + ) + } else { + weighted.rrmse( + actual = actual, + predicted = predicted, + w = w + ) + } + } + + for (weighted in c(FALSE, TRUE)) { + + # 0) create regression + # for the test + values <- create_regression() + actual <- values$actual + predicted <- values$predicted + w <- if (weighted) NULL else values$weights + + # 1) generate sensible + # label information + info <- paste( + "Weighted = ", weighted + ) + + # 2) generate score + # from {slmetrics} + score <- wrapped_rrmse( + actual = actual, + predicted = predicted, + w = w + ) + + # 2.1) test that the values + # are sensible + testthat::expect_true(is.numeric(score), info = info) + testthat::expect_true(!is.na(score), info = info) + testthat::expect_true(length(score) == 1, info = info) + + # 2.2) calculate reference value + py_score <- py_rrmse( + actual = actual, + predicted = predicted, + w = w + ) + + # 2.3) test for equality + testthat::expect_true( + object = set_equal( + current = score, + target = py_score + ), + info = info + ) + + } + } +) \ No newline at end of file diff --git a/tests/testthat/test-RootMeanSquaredError.R b/tests/testthat/test-RootMeanSquaredError.R new file mode 100644 index 0000000..cd6b823 --- /dev/null +++ b/tests/testthat/test-RootMeanSquaredError.R @@ -0,0 +1,74 @@ +# objective: Test that the metric +# implemented in {SLmetrics} is aligned with +# target functions. 
+
+testthat::test_that(
+  desc = "Test `rmse()`-function", code = {
+
+    # 0) construct rmse-wrapper
+    wrapped_rmse <- function(
+      actual,
+      predicted,
+      w = NULL) {
+      if (is.null(w)) {
+        rmse(
+          actual = actual,
+          predicted = predicted
+        )
+      } else {
+        weighted.rmse(
+          actual = actual,
+          predicted = predicted,
+          w = w
+        )
+      }
+    }
+
+    for (weighted in c(FALSE, TRUE)) {
+
+      # 0) create regression
+      # values for the test
+      values <- create_regression()
+      actual <- values$actual
+      predicted <- values$predicted
+      w <- if (weighted) values$weights else NULL
+
+      # 1) generate sensible
+      # label information
+      info <- paste(
+        "Weighted = ", weighted
+      )
+
+      # 2) generate score
+      # from {SLmetrics}
+      score <- wrapped_rmse(
+        actual = actual,
+        predicted = predicted,
+        w = w
+      )
+
+      # 2.1) test that the values
+      # are sensible
+      testthat::expect_true(is.numeric(score), info = info)
+      testthat::expect_true(!is.na(score), info = info)
+      testthat::expect_true(length(score) == 1, info = info)
+
+      # 2.2) calculate reference value
+      py_score <- py_rmse(
+        actual = actual,
+        predicted = predicted,
+        w = w
+      )
+
+      # 2.3) test for equality
+      testthat::expect_true(
+        object = set_equal(
+          current = score,
+          target = py_score
+        ),
+        info = info
+      )
+
+    }
+  }
+)
\ No newline at end of file
diff --git a/tests/testthat/test-RootMeanSquaredLogarithmicError.R b/tests/testthat/test-RootMeanSquaredLogarithmicError.R
new file mode 100644
index 0000000..b6ba632
--- /dev/null
+++ b/tests/testthat/test-RootMeanSquaredLogarithmicError.R
@@ -0,0 +1,74 @@
+# objective: Test that the metric
+# implemented in {SLmetrics} is aligned with
+# target functions.
+
+testthat::test_that(
+  desc = "Test `rmsle()`-function", code = {
+
+    # 0) construct rmsle-wrapper
+    wrapped_rmsle <- function(
+      actual,
+      predicted,
+      w = NULL) {
+      if (is.null(w)) {
+        rmsle(
+          actual = actual,
+          predicted = predicted
+        )
+      } else {
+        weighted.rmsle(
+          actual = actual,
+          predicted = predicted,
+          w = w
+        )
+      }
+    }
+
+    for (weighted in c(FALSE, TRUE)) {
+
+      # 0) create regression
+      # values for the test
+      values <- create_regression()
+      actual <- values$actual
+      predicted <- values$predicted
+      w <- if (weighted) values$weights else NULL
+
+      # 1) generate sensible
+      # label information
+      info <- paste(
+        "Weighted = ", weighted
+      )
+
+      # 2) generate score
+      # from {SLmetrics}
+      score <- wrapped_rmsle(
+        actual = actual,
+        predicted = predicted,
+        w = w
+      )
+
+      # 2.1) test that the values
+      # are sensible
+      testthat::expect_true(is.numeric(score), info = info)
+      testthat::expect_true(!is.na(score), info = info)
+      testthat::expect_true(length(score) == 1, info = info)
+
+      # 2.2) calculate reference value
+      py_score <- py_rmsle(
+        actual = actual,
+        predicted = predicted,
+        w = w
+      )
+
+      # 2.3) test for equality
+      testthat::expect_true(
+        object = set_equal(
+          current = score,
+          target = py_score
+        ),
+        info = info
+      )
+
+    }
+  }
+)
\ No newline at end of file
diff --git a/tests/testthat/test-S3-classification.R b/tests/testthat/test-S3-classification.R
new file mode 100644
index 0000000..23b9917
--- /dev/null
+++ b/tests/testthat/test-S3-classification.R
@@ -0,0 +1,253 @@
+# objective: Test that the S3 methods
+# work and return the same values
+# using factors and a confusion matrix with default calls.
+
+testthat::test_that(
+  desc = "Test that S3 methods return the same values for classification metrics (balanced)", code = {
+
+    # 1) generate class
+    # values
+    actual <- create_factor(balanced = TRUE)
+    predicted <- create_factor(balanced = TRUE)
+    w <- runif(n = length(actual))
+
+    # 2) generate confusion matrix
+    # from {SLmetrics} and {Python}
+    sl_matrix <- cmatrix(
+      actual = actual,
+      predicted = predicted
+    )
+
+    sl_wmatrix <- cmatrix(
+      actual = actual,
+      predicted = predicted,
+      w = w
+    )
+
+    py_matrix <- py_cmatrix(
+      actual = actual,
+      predicted = predicted
+    )
+
+    py_wmatrix <- py_cmatrix(
+      actual = actual,
+      predicted = predicted,
+      w = w
+    )
+
+    # 2.1) test for equality
+    # in content
+    testthat::expect_true(
+      object = set_equal(
+        current = as.table(py_matrix),
+        target = as.table(sl_matrix)
+      )
+    )
+
+    # 2.2) test for equality
+    # in content (weighted)
+    testthat::expect_true(
+      object = set_equal(
+        current = as.table(py_wmatrix),
+        target = as.table(sl_wmatrix)
+      )
+    )
+
+    # 3) test that the functions
+    # return the same value regardless
+    # of method
+    for (i in seq_along(sl_classification)) {
+
+      # 3.1) extract function
+      # and pass into methods
+      .f <- sl_classification[[i]]
+
+      # 3.2) expect these to
+      # be equal
+      testthat::expect_true(
+        object = set_equal(
+          .f(actual, predicted),
+          .f(sl_matrix)
+        ),
+        label = paste(
+          "Class-wise methods in", names(sl_classification)[i], "not equivalent."
+        )
+      )
+
+    }
+
+    # 4) test that the functions
+    # return the same value regardless
+    # of method for weighted classification
+    for (i in seq_along(sl_classification)) {
+      name <- names(sl_classification)[i]
+
+      # Check if the name exists in both lists
+      if (name %in% names(sl_wclassification)) {
+        # Extract corresponding functions
+        .f <- sl_wclassification[[name]]
+        .F <- sl_classification[[name]]
+
+        # Ensure these are expected to be equal
+        testthat::expect_true(
+          object = set_equal(
+            .f(actual, predicted, w = w),
+            .F(sl_wmatrix)
+          ),
+          label = paste(
+            "Weighted class-wise methods in", name, "not equivalent."
+          )
+        )
+      }
+    }
+
+    # 5) test that the weighted functions
+    # return a different value than the
+    # unweighted method
+    for (i in seq_along(sl_classification)) {
+      name <- names(sl_classification)[i]
+
+      # Check if the name exists in both lists
+      if (name %in% names(sl_wclassification)) {
+        # Extract corresponding functions
+        .f <- sl_wclassification[[name]]
+        .F <- sl_classification[[name]]
+
+        # Ensure these are NOT equal
+        # NOTE: if they are not equal it won't return
+        # a logical value.
+        testthat::expect_false(
+          object = is.logical(set_equal(
+            .f(actual, predicted, w = w),
+            .F(sl_matrix)
+          )),
+          label = paste(
+            "Weighted and unweighted class-wise methods in", name, "are equivalent."
+          )
+        )
+      }
+    }
+
+})
+
+
+testthat::test_that(
+  desc = "Test that S3 methods return the same values for classification metrics (imbalanced)", code = {
+
+    # 1) generate class
+    # values
+    actual <- create_factor(balanced = FALSE)
+    predicted <- create_factor(balanced = FALSE)
+    w <- runif(n = length(actual))
+
+    # 2) generate confusion matrix
+    # from {SLmetrics} and {Python}
+    sl_matrix <- cmatrix(
+      actual = actual,
+      predicted = predicted
+    )
+
+    sl_wmatrix <- cmatrix(
+      actual = actual,
+      predicted = predicted,
+      w = w
+    )
+
+    py_matrix <- py_cmatrix(
+      actual = actual,
+      predicted = predicted
+    )
+
+    py_wmatrix <- py_cmatrix(
+      actual = actual,
+      predicted = predicted,
+      w = w
+    )
+
+    # 2.1) test for equality
+    # in content
+    testthat::expect_true(
+      object = set_equal(
+        current = as.table(py_matrix),
+        target = as.table(sl_matrix)
+      )
+    )
+
+    # 3) test that the functions
+    # return the same value regardless
+    # of method
+    for (i in seq_along(sl_classification)) {
+
+      # 3.1) extract function
+      # and pass into methods
+      .f <- sl_classification[[i]]
+
+      # 3.2) expect these to
+      # be equal
+      testthat::expect_true(
+        object = set_equal(
+          .f(actual, predicted),
+          .f(sl_matrix)
+        ),
+        label = paste(
+          "Unweighted class-wise methods in", names(sl_classification)[i], "not equivalent."
+        )
+      )
+
+    }
+
+    # 4) test that the functions
+    # return the same value regardless
+    # of method for weighted classification
+    for (i in seq_along(sl_classification)) {
+      name <- names(sl_classification)[i]
+
+      # Check if the name exists in both lists
+      if (name %in% names(sl_wclassification)) {
+        # Extract corresponding functions
+        .f <- sl_wclassification[[name]]
+        .F <- sl_classification[[name]]
+
+        # Ensure these are expected to be equal
+        testthat::expect_true(
+          object = set_equal(
+            .f(actual, predicted, w = w),
+            .F(sl_wmatrix)
+          ),
+          label = paste(
+            "Weighted class-wise methods in", name, "not equivalent."
+          )
+        )
+      }
+    }
+
+    # 5) test that the weighted functions
+    # return a different value than the
+    # unweighted method
+    for (i in seq_along(sl_classification)) {
+      name <- names(sl_classification)[i]
+
+      # Check if the name exists in both lists
+      if (name %in% names(sl_wclassification)) {
+        # Extract corresponding functions
+        .f <- sl_wclassification[[name]]
+        .F <- sl_classification[[name]]
+
+        # Ensure these are NOT equal
+        # NOTE: if they are not equal it won't return
+        # a logical value.
+        testthat::expect_false(
+          object = is.logical(set_equal(
+            .f(actual, predicted, w = w),
+            .F(sl_matrix)
+          )),
+          label = paste(
+            "Weighted and unweighted class-wise methods in", name, "are equivalent."
+          )
+        )
+      }
+    }
+
+})
+
+
diff --git a/tests/testthat/test-Specificity.R b/tests/testthat/test-Specificity.R
new file mode 100644
index 0000000..495fe7a
--- /dev/null
+++ b/tests/testthat/test-Specificity.R
@@ -0,0 +1,95 @@
+# objective: Test that the metric
+# implemented in {SLmetrics} is aligned with
+# target functions.
+
+testthat::test_that(
+  desc = "Test `specificity()`-function", code = {
+
+    # 0) construct specificity
+    # wrapper
+    wrapped_specificity <- function(
+      actual,
+      predicted,
+      w = NULL,
+      micro = TRUE) {
+
+        if (is.null(w)) {
+          specificity(
+            actual = actual,
+            predicted = predicted,
+            micro = micro
+          )
+        } else {
+          weighted.specificity(
+            actual = actual,
+            predicted = predicted,
+            w = w,
+            micro = micro
+          )
+        }
+
+    }
+
+    for (balanced in c(FALSE, TRUE)) {
+
+      # 1) generate class
+      # values
+      actual <- create_factor(balanced = balanced)
+      predicted <- create_factor(balanced = balanced)
+      w <- runif(n = length(actual))
+
+      for (weighted in c(TRUE, FALSE)) {
+
+        # 2) test that they are
+        # equal to target values
+        for (micro in c(NA, TRUE, FALSE)) {
+
+          # 2.1) generate sensible
+          # label information
+          info <- paste(
+            "Balanced = ", balanced,
+            "Weighted = ", weighted,
+            "Micro = ", micro
+          )
+
+          # 2.2) generate score
+          # from {SLmetrics}
+          score <- wrapped_specificity(
+            actual = actual,
+            predicted = predicted,
+            w = if (weighted) w else NULL,
+            micro = if (is.na(micro)) { NULL } else micro
+          )
+
+          # 2.3) test that the values
+          # are sensible; the values
+          # can be NA
+          testthat::expect_true(is.numeric(score), info = info)
+
+          # 2.4) test that the values
+          # are equal to the target value
+
+          # 2.4.1) calculate py_score
+          py_score <- py_specificity(
+            actual = actual,
+            predicted = predicted,
+            average = if (is.na(micro)) { NULL } else ifelse(micro, "micro", "macro"),
+            w = if (weighted) w else NULL
+          )
+
+          # 2.4.2) test for equality
+          testthat::expect_true(
+            object = set_equal(
+              current = score,
+              target = py_score
+            ),
+            info = info
+          )
+
+        }
+
+      }
+
+    }
+  }
+)
\ No newline at end of file
diff --git a/tests/testthat/test-SymmetricMeanAbsoluteError.R b/tests/testthat/test-SymmetricMeanAbsoluteError.R
new file mode 100644
index 0000000..1b055a7
--- /dev/null
+++ b/tests/testthat/test-SymmetricMeanAbsoluteError.R
@@ -0,0 +1,74 @@
+# objective: Test that the metric
+# implemented in {SLmetrics} is aligned with
+# target functions.
+
+testthat::test_that(
+  desc = "Test `smape()`-function", code = {
+
+    # 0) construct smape-wrapper
+    wrapped_smape <- function(
+      actual,
+      predicted,
+      w = NULL) {
+      if (is.null(w)) {
+        smape(
+          actual = actual,
+          predicted = predicted
+        )
+      } else {
+        weighted.smape(
+          actual = actual,
+          predicted = predicted,
+          w = w
+        )
+      }
+    }
+
+    for (weighted in c(FALSE, TRUE)) {
+
+      # 0) create regression
+      # values for the test
+      values <- create_regression()
+      actual <- values$actual
+      predicted <- values$predicted
+      w <- if (weighted) values$weights else NULL
+
+      # 1) generate sensible
+      # label information
+      info <- paste(
+        "Weighted = ", weighted
+      )
+
+      # 2) generate score
+      # from {SLmetrics}
+      score <- wrapped_smape(
+        actual = actual,
+        predicted = predicted,
+        w = w
+      )
+
+      # 2.1) test that the values
+      # are sensible
+      testthat::expect_true(is.numeric(score), info = info)
+      testthat::expect_true(!is.na(score), info = info)
+      testthat::expect_true(length(score) == 1, info = info)
+
+      # 2.2) calculate reference value
+      py_score <- py_smape(
+        actual = actual,
+        predicted = predicted,
+        w = w
+      )
+
+      # 2.3) test for equality
+      testthat::expect_true(
+        object = set_equal(
+          current = score,
+          target = py_score
+        ),
+        info = info
+      )
+
+    }
+  }
+)
\ No newline at end of file
diff --git a/tests/testthat/test-ZeroOneLoss.R b/tests/testthat/test-ZeroOneLoss.R
new file mode 100644
index 0000000..18dc183
--- /dev/null
+++ b/tests/testthat/test-ZeroOneLoss.R
@@ -0,0 +1,85 @@
+# objective: Test that Zero-One Loss
+# implemented in {SLmetrics} is aligned with
+# target functions.
+
+testthat::test_that(
+  desc = "Test `zerooneloss()`-function", code = {
+
+    # 0) construct zerooneloss
+    # wrapper
+    wrapped_zerooneloss <- function(
+      actual,
+      predicted,
+      w = NULL) {
+      if (is.null(w)) {
+        zerooneloss(
+          actual = actual,
+          predicted = predicted
+        )
+      } else {
+        weighted.zerooneloss(
+          actual = actual,
+          predicted = predicted,
+          w = w
+        )
+      }
+    }
+
+    for (balanced in c(FALSE, TRUE)) {
+
+      # 1) generate class
+      # values
+      actual <- create_factor(balanced = balanced)
+      predicted <- create_factor(balanced = balanced)
+      w <- runif(n = length(actual))
+
+      for (weighted in c(TRUE, FALSE)) {
+
+        # 2) test that they are
+        # equal to target values
+
+        # 2.1) generate sensible
+        # label information
+        info <- paste(
+          "Balanced = ", balanced,
+          "Weighted = ", weighted
+        )
+
+        # 2.2) generate score
+        # from {SLmetrics}
+        score <- wrapped_zerooneloss(
+          actual = actual,
+          predicted = predicted,
+          w = if (weighted) w else NULL
+        )
+
+        # 2.3) test that the values
+        # are sensible; the values
+        # can be NA
+        testthat::expect_true(is.numeric(score), info = info)
+        testthat::expect_true(length(score) == 1, info = info)
+
+        # 2.4) test that the values
+        # are equal to the target value
+
+        # 2.4.1) calculate py_score
+        py_score <- py_zerooneloss(
+          actual = actual,
+          predicted = predicted,
+          w = if (weighted) w else NULL
+        )
+
+        # 2.4.2) test for equality
+        testthat::expect_true(
+          object = set_equal(
+            current = score,
+            target = py_score
+          ),
+          info = info
+        )
+
+      }
+
+    }
+  }
)
\ No newline at end of file
diff --git a/tests/testthat/test-aggregated-classification.R b/tests/testthat/test-aggregated-classification.R
deleted file mode 100644
index 432afab..0000000
--- a/tests/testthat/test-aggregated-classification.R
+++ /dev/null
@@ -1,129 +0,0 @@
-# script: Imbalanced Classification
-# author: Serkan Korkmaz, serkor1@duck.com
-# date: 2024-10-04
-# objective: Test that aggregated
-# classifcation works as in {scikit-learn}
-#
script start; - -testthat::test_that( - desc = "Aggregtetd classification in {SLmetrics} matches that of {scikit-learn} and {pytorch}", - code = { - # 0) load functions from - # {scikit-learn} and {pytorch} - reticulate::source_python( - "scikit-learn.py" - ) - reticulate::source_python( - "pytorch.py" - ) - source("ref-manual.R") - # 1) generate class - # values - actual <- create_factor(balanced = FALSE) - predicted <- create_factor(balanced = FALSE) - - # 2) generate confusion matrix - # from {SLmetrics} and {Python} - sl_matrix <- cmatrix( - actual = actual, - predicted = predicted - ) - py_matrix <- py_cmatrix( - actual = actual, - predicted = predicted - ) - # 2.1) test for equality - # in content - testthat::expect_true( - object = set_equal( - current = as.table(py_matrix), - target = as.table(sl_matrix) - ) - ) - # 3) test that methods - # work and returns the same values - sl_function <- list( - "specificity" = specificity, - "tnr" = tnr, - "selectivity" = selectivity, - # recall methods; - "recall" = recall, - "sensitivity" = sensitivity, - "tpr" = tpr, - # precision methods - "precision" = precision, - "ppv" = ppv, - # fbeta methods - # NOTE: There is a bug in Scikit-learn - # the fbeta score doesn't remove the NaNs - # "fbeta" = fbeta, - # jaccard methods - "jaccard" = jaccard, - "tscore" = tscore, - "csi" = csi, - # fpr methods - "fpr" = fpr, - "fallout" = fallout, - # fmi methods - "fdr" = fdr, - "npv" = npv, - "fer" = fer - ) - - for (i in seq_along(sl_function)) { - .f <- sl_function[[i]] - for (lgl in c(TRUE, FALSE)) { - testthat::expect_true( - object = set_equal( - .f(actual, predicted, micro = lgl, na.rm = TRUE), - .f(sl_matrix, micro = lgl, na.rm = TRUE) - ), - label = paste( - "Aggregated methods in", names(sl_function)[i], "not equivalent." - ) - ) - } - } - - # 4) test that the functions - # returns the same values as - # their python equivalents - py_function <- Filter( - Negate(is.null), - setNames( - lapply(seq_along(sl_function), function(i) { - tryCatch( - get(paste0("py_",names(sl_function)[i])), - error = function(e) { - NULL - } - ) - }), - paste0("py_", names(sl_function)) - ) - ) - sl_function <- sl_function[ - names(sl_function) %in% gsub("py_", "", names(py_function)) - ] - for (i in seq_along(sl_function)) { - .f <- sl_function[[i]] - .F <- py_function[[i]] - - for (lgl in c(TRUE, FALSE)) { - testthat::expect_true( - object = set_equal( - .f(actual, predicted, micro = lgl, na.rm = TRUE), - as.numeric(.F(actual, predicted, average = ifelse(lgl, "micro", "macro"))) - ), - label = paste( - names(sl_function)[i], - paste0("with micro:", lgl, ""), - "not equivalent to {torch} or {scikit-learn}." 
- ) - ) - } - } - } -) - -# script end; diff --git a/tests/testthat/test-balanced-classification.R b/tests/testthat/test-balanced-classification.R deleted file mode 100644 index d085c27..0000000 --- a/tests/testthat/test-balanced-classification.R +++ /dev/null @@ -1,172 +0,0 @@ -# script: Classification Tests -# author: Serkan Korkmaz, serkor1@duck.com -# date: 2024-10-02 -# objective: Test all classification -# metrics against their torch/scikit-learn -# counter part -# script start; - -testthat::test_that( - desc = "Test that all classification metrics are correctly implemented", - code = { - - # 0) load functions from - # {scikit-learn} and {pytorch} - reticulate::source_python( - "scikit-learn.py" - ) - - reticulate::source_python( - "pytorch.py" - ) - - source("ref-manual.R") - - # 1) generate class - # values - actual <- create_factor(balanced = TRUE) - predicted <- create_factor(balanced = TRUE) - - # 2) generate confusion matrix - # from {SLmetrics} and {Python} - sl_matrix <- cmatrix( - actual = actual, - predicted = predicted - ) - - py_matrix <- py_cmatrix( - actual = actual, - predicted = predicted - ) - - # 2.1) test for equality - # in content - testthat::expect_true( - object = set_equal( - current = as.table(py_matrix), - target = as.table(sl_matrix) - ) - ) - - # 3) test that methods - # work and returns the same values - sl_function <- list( - # accuracy - "accuracy" = accuracy, - "baccuracy" = baccuracy, - - # Zero-One Loss - "zerooneloss" = zerooneloss, - - # specificity methods - "specificity" = specificity, - "tnr" = tnr, - "selectivity" = selectivity, - - - # recall methods; - "recall" = recall, - "sensitivity" = sensitivity, - "tpr" = tpr, - - # precision methods - "precision" = precision, - "ppv" = ppv, - - # fbeta methods - "fbeta" = fbeta, - - # likelihood methods - "dor" = dor, - "plr" = plr, - "nlr" = nlr, - - # jaccard methods - "jaccard" = jaccard, - "tscore" = tscore, - "csi" = csi, - - # mcc methods - "mcc" = mcc, - "phi" = phi, - - # fpr methods. Differs by 0.001 against scikit - "fpr" = fpr, - "fallout" = fallout, - - # fmi methods - "fmi" = fmi, - - "fdr" = fdr, - "npv" = npv, - "fer" = fer, - - "ckappa" = ckappa - - - ) - - for (i in seq_along(sl_function)) { - - .f <- sl_function[[i]] - - testthat::expect_true( - object = set_equal( - .f(actual, predicted), - .f(sl_matrix) - ), - label = paste( - "Class-wise metods in", names(sl_function)[i], "not equivalent." - ) - ) - - - } - - # 4) test that the functions - # returns the same values as - # their python equivalents - py_function <- Filter( - Negate(is.null), - - setNames( - lapply(seq_along(sl_function), function(i) { - tryCatch( - get(paste0("py_",names(sl_function)[i])), - error = function(e) { - NULL - } - ) - - }), - paste0("py_", names(sl_function)) - ) - ) - - sl_function <- sl_function[ - names(sl_function) %in% gsub("py_", "", names(py_function)) - ] - - for (i in seq_along(sl_function)) { - - .f <- sl_function[[i]] - .F <- py_function[[i]] - - testthat::expect_true( - object = set_equal( - .f(actual, predicted), - as.numeric(.F(actual, predicted)) - ), - label = paste( - "Class-wise functions in", - names(sl_function)[i], - "not equivalent to {torch} or {scikit-learn}." 
- ) - ) - - } - - } -) - -# script end; diff --git a/tests/testthat/test-confusion_matrix.R b/tests/testthat/test-confusion_matrix.R deleted file mode 100644 index e873cb7..0000000 --- a/tests/testthat/test-confusion_matrix.R +++ /dev/null @@ -1,40 +0,0 @@ -# script: Confusion Matrix -# date: 2024-10-05 -# author: Serkan Korkmaz, serkor1@duck.com -# objective: Test that the associated methods -# of the cmatrix works as intended -# script start; - -testthat::test_that( - desc = "Test that the methods associated with the `cmatrix()`-function works as intended", - code = { - - # 1) generate confusion - # matrix - confusion_matrix <- cmatrix( - actual = create_factor(), - predicted = create_factor() - ) - - # 2) test that summary - # works without any conditions - testthat::expect_no_condition( - invisible( - summary(confusion_matrix) - ) - - ) - - # 3) test that plot - # works without any conditions - testthat::expect_no_condition( - invisible( - plot( - confusion_matrix - ) - ) - ) - } -) - -# script end; diff --git a/tests/testthat/test-pinball.R b/tests/testthat/test-pinball.R deleted file mode 100644 index 778aab1..0000000 --- a/tests/testthat/test-pinball.R +++ /dev/null @@ -1,56 +0,0 @@ -# script: Test Pinball -# date: 2024-10-18 -# author: Serkan Korkmaz, serkor1@duck.com -# objective: -# -# Test that pinball deviance -# is correctly implemmented -# -# script start; - -testthat::test_that( - desc = "Test that `pinball(deviance = TRUE)` is correctly implemented", - code = { - - # 0) load functions from - # {scikit-learn} and {pytorch} - reticulate::source_python( - "scikit-learn.py" - ) - - reticulate::source_python( - "pytorch.py" - ) - - - # 1) generate actual - # and predicted values - values <- create_regression() - actual <- values$actual - predicted <- values$predicted - w <- values$weight - - # 2) test for unweighted deviance - # equality - testthat::expect_true( - object = set_equal( - current = as.numeric(pinball(actual, predicted, deviance = TRUE)), - target = as.numeric(py_d2pinball(actual, predicted)) - ), - label = "(unweighted) Pinball deviance is not equal to {scikit-learn} implementation" - ) - - # 3) test for weighted deviance - # equality - testthat::expect_true( - object = set_equal( - current = as.numeric(weighted.pinball(actual, predicted, w = w, deviance = TRUE)), - target = as.numeric(py_d2pinball(actual, predicted, w = w)) - ), - label = "(weightted) Pinball deviance is not equal to {scikit-learn} implementation" - ) - - } -) - -# script end; diff --git a/tests/testthat/test-regression.R b/tests/testthat/test-regression.R deleted file mode 100644 index 2e8cb51..0000000 --- a/tests/testthat/test-regression.R +++ /dev/null @@ -1,119 +0,0 @@ -# script: Regression Tests -# date: 2024-10-07 -# author: Serkan Korkmaz, serkor1@duck.com -# objective: Test that regression methods -# are consistent with equivalent python methods -# and retrurns sensible values -# script start; - -testthat::test_that( - desc = "Test that all regression metrics are correctly implemented", - code = { - - # 0) load functions from - # {scikit-learn} and {pytorch} - reticulate::source_python( - "scikit-learn.py" - ) - - reticulate::source_python( - "pytorch.py" - ) - - # 1) generate actual - # and predicted values - values <- create_regression() - actual <- values$actual - predicted <- values$predicted - - # 2) generate functions - # lists - sl_function <- list( - "rmse" = rmse, - "mse" = mse, - "rmsle" = rmsle, - "huberloss" = huberloss, - "mpe" = mpe, - "mape" = mape, - "smape" = smape, - 
"rae" = rae, - "rrmse" = rrmse, - "mae" = mae, - "ccc" = ccc, - "pinball" = pinball - ) - - # 3) test that the functions - # runs without errors - for (i in seq_along(sl_function)) { - - .f <- sl_function[[i]] - - testthat::expect_true( - all( - is.numeric(.f(actual, predicted)), - length(.f(actual, predicted)) == 1 - ), - label = paste( - names(sl_function)[i], - "Not all true" - ) - ) - - } - - # 4) test that the functions - # returns the same values as - # their python equivalents - py_function <- Filter( - Negate(is.null), - - setNames( - lapply(seq_along(sl_function), function(i) { - tryCatch( - get(paste0("py_",names(sl_function)[i])), - error = function(e) { - NULL - } - ) - - }), - paste0("py_", names(sl_function)) - ) - ) - - sl_function <- sl_function[ - names(sl_function) %in% gsub("py_", "", names(py_function)) - ] - - - for (i in seq_along(sl_function)) { - - .f <- sl_function[[i]] - .F <- py_function[[i]] - - # 1) replace missing with 0 - # as in python - sl_measure <- .f(actual, predicted); - - py_measure <- as.numeric(reticulate::py_suppress_warnings(.F(actual, predicted))) - - testthat::expect_true( - object = set_equal( - sl_measure, - py_measure - - ), - label = paste( - "Unweighted functions in", - names(sl_function)[i], - "not equivalent to {torch} or {scikit-learn}." - ) - ) - - } - - } -) - -# script end; diff --git a/tests/testthat/test-rsq.R b/tests/testthat/test-rsq.R deleted file mode 100644 index e57a4c3..0000000 --- a/tests/testthat/test-rsq.R +++ /dev/null @@ -1,94 +0,0 @@ -# script: Test R Squared -# date: 2024-10-10 -# author: Serkan Korkmaz, serkor1@duck.com -# objective: This is tested directly -# against LM -# script start; - -testthat::test_that( - desc = "Test that `rsq()`-function is consistent with the `lm()`-function", - code = { - - # 0) run a regression - # on mtcars - model <- lm( - formula = mpg ~ ., - data = mtcars - ) - - # 1) extract rsq - target_rsq <- summary(model)$r.squared - target_adjrsq <- summary(model)$adj.r.squared - - # 2) calculate rsq - # manually - sl_rsq <- rsq( - actual = mtcars$mpg, - predicted = fitted.values(model) - ) - - sl_adjrsq <- rsq( - actual = mtcars$mpg, - predicted = fitted.values(model), - k = ncol(model.matrix(model)) - 1 - ) - - # 3) testtat they are - # all exactly equal - testthat::expect_true( - set_equal( - target = c(target_rsq, target_adjrsq), - current = c(sl_rsq, sl_adjrsq) - ) - ) - - } -) - - -testthat::test_that( - desc = "Test that `weighted.rsq()`-function is consistent with the `lm()`-function", - code = { - - # 0) run a regression - # on mtcars - w <- runif(length(mtcars$mpg)) - - model <- lm( - formula = mpg ~ ., - data = mtcars, - weights = w - ) - - # 1) extract rsq - target_rsq <- summary(model)$r.squared - target_adjrsq <- summary(model)$adj.r.squared - - # 2) calculate rsq - # manually - sl_rsq <- weighted.rsq( - actual = mtcars$mpg, - predicted = fitted.values(model), - w = w - ) - - sl_adjrsq <- weighted.rsq( - actual = mtcars$mpg, - predicted = fitted.values(model), - k = ncol(model.matrix(model)) - 1, - w = w - ) - - # 3) testtat they are - # all exactly equal - testthat::expect_true( - set_equal( - target = c(target_rsq, target_adjrsq), - current = c(sl_rsq, sl_adjrsq) - ) - ) - - } -) - -# script end; diff --git a/tests/testthat/test-weighted_classification.R b/tests/testthat/test-weighted_classification.R deleted file mode 100644 index adad75b..0000000 --- a/tests/testthat/test-weighted_classification.R +++ /dev/null @@ -1,238 +0,0 @@ -# script: weighted classification metrics -# 
date: 2024-12-12 -# author: Serkan Korkmaz, serkor1@duck.com -# objective: Test that the weighted metrics -# corresponds to target values -# script start; - -testthat::test_that( - desc = "Test that the weighted classification metrics equals {scikit-learn}-equivalents", - code = { - - # 0) load functions from - # {scikit-learn} and {pytorch} - reticulate::source_python( - "scikit-learn.py" - ) - - reticulate::source_python( - "pytorch.py" - ) - - # source("ref-manual.R") - - # 1) generate class - # values - actual <- create_factor(balanced = TRUE) - predicted <- create_factor(balanced = TRUE) - weights <- runif(n = length(actual)) - - # 2) generate confusion matrix - # from {SLmetrics} and {Python} - sl_matrix <- cmatrix( - actual = actual, - predicted = predicted, - w = weights - ) - - py_matrix <- py_cmatrix( - actual = actual, - predicted = predicted, - w = weights - ) - - # 2.1) test for equality - # in content - testthat::expect_true( - object = set_equal( - current = as.table(py_matrix), - target = as.table(sl_matrix) - ) - ) - - # 3) test that all metrics - # that supports weighted classifications - # are equal to target metrics and methods - sl_function <- list( - # accuracy - "accuracy" = weighted.accuracy, - "baccuracy" = weighted.baccuracy, - - # Zero-One Loss - "zerooneloss" = weighted.zerooneloss, - - # specificity methods - # Bug in python application. This is incorrect - # make PR - # "specificity" = weighted.specificity, - # "tnr" = weighted.tnr, - # "selectivity" = weighted.selectivity, - - - # recall methods; - "recall" = weighted.recall, - "sensitivity" = weighted.sensitivity, - "tpr" = weighted.tpr, - - # precision methods - "precision" = weighted.precision, - "ppv" = weighted.ppv, - - # fbeta methods - "fbeta" = weighted.fbeta, - - # likelihood methods - "dor" = weighted.dor, - "plr" = weighted.plr, - "nlr" = weighted.nlr, - - # jaccard methods - "jaccard" = weighted.jaccard, - "tscore" = weighted.tscore, - "csi" = weighted.csi, - - # mcc methods: - "mcc" = weighted.mcc, - "phi" = weighted.phi, - - # fpr methods - "fpr" = weighted.fpr, - "fallout" = weighted.fallout, - - "fdr" = weighted.fdr, - "npv" = weighted.npv, - "fer" = weighted.fer, - - # Cohens Kappa: - "ckappa" = weighted.ckappa - - ) - - # 3.1) check if the foo.cmatrix - # gives the same result - # - # NOTE: This has to be produced differently - # to avoid mismatches! - - cmatrix_method <- list( - # accuracy - "accuracy" = accuracy, - "baccuracy" = baccuracy, - - # Zero-One Loss - "zerooneloss" = zerooneloss, - - # specificity methods - # "specificity" = specificity, - # "tnr" = tnr, - # "selectivity" = selectivity, - - - # recall methods; - "recall" = recall, - "sensitivity" = sensitivity, - "tpr" = tpr, - - # precision methods - "precision" = precision, - "ppv" = ppv, - - # fbeta methods - "fbeta" = fbeta, - - # likelihood methods - "dor" = dor, - "plr" = plr, - "nlr" = nlr, - - # jaccard methods - "jaccard" = jaccard, - "tscore" = tscore, - "csi" = csi, - - # mcc methods - "mcc" = mcc, - "phi" = phi, - - # fpr methods. Differs by 0.001 against scikit - "fpr" = fpr, - "fallout" = fallout, - - "fdr" = fdr, - "npv" = npv, - "fer" = fer, - - "ckappa" = ckappa - - - ) - - for (i in seq_along(sl_function)) { - - .f <- sl_function[[i]] - .F <- cmatrix_method[[i]] - - testthat::expect_true( - object = set_equal( - .f(actual, predicted, w = weights), - .F(sl_matrix) - ), - label = paste( - "foo.cmatrix", names(sl_function)[i], "is not equivalent to weighted.foo!" 
- ) - ) - - - } - - # 4) test that the functions - # returns the same values as - # their python equivalents - py_function <- Filter( - Negate(is.null), - - setNames( - lapply(seq_along(sl_function), function(i) { - tryCatch( - get(paste0("py_",names(sl_function)[i])), - error = function(e) { - NULL - } - ) - - }), - paste0("py_", names(sl_function)) - ) - ) - - sl_function <- sl_function[ - names(sl_function) %in% gsub("py_", "", names(py_function)) - ] - - for (i in seq_along(sl_function)) { - - .f <- sl_function[[i]] - .F <- py_function[[i]] - - testthat::expect_true( - object = set_equal( - .f(actual, predicted, w = weights), - as.numeric(.F(actual, predicted, w = weights)) - ), - label = paste( - "Class-wise functions in", - names(sl_function)[i], - "not equivalent to {torch} or {scikit-learn}." - ) - ) - - - } - - - - - } -) - -# script end; \ No newline at end of file diff --git a/tests/testthat/test-weighted_regression.R b/tests/testthat/test-weighted_regression.R deleted file mode 100644 index c4e7764..0000000 --- a/tests/testthat/test-weighted_regression.R +++ /dev/null @@ -1,101 +0,0 @@ -# script: Regression Tests -# date: 2024-12-18 -# author: Serkan Korkmaz, serkor1@duck.com -# objective: Test that weighted regression methods -# are consistent with equivalent python methods -# and retrurns sensible values -# script start; - -testthat::test_that( - desc = "Test that all weighted regression metrics are correctly implemented", - code = { - - # 0) load functions from - # {scikit-learn} and {pytorch} - reticulate::source_python( - "scikit-learn.py" - ) - - reticulate::source_python( - "pytorch.py" - ) - - # 1) generate actual - # and predicted values - values <- create_regression() - actual <- values$actual - predicted <- values$predicted - w <- values$weight - - # 2) generate functions - # lists - sl_function <- list( - "rmse" = weighted.rmse, - "rmsle" = weighted.rmsle, - "rrmse" = weighted.rrmse, - "rae" = weighted.rae, - "mape" = weighted.mape, - "smape" = weighted.smape, - "mpe" = weighted.mpe, - "mse" = weighted.mse, - "ccc" = weighted.ccc, - "huberloss" = weighted.huberloss, - "mae" = weighted.mae, - "pinball" = weighted.pinball - ) - - # 3) test that the functions - # returns the same values as - # their python equivalents - py_function <- Filter( - Negate(is.null), - - setNames( - lapply(seq_along(sl_function), function(i) { - tryCatch( - get(paste0("py_",names(sl_function)[i])), - error = function(e) { - NULL - } - ) - - }), - paste0("py_", names(sl_function)) - ) - ) - - sl_function <- sl_function[ - names(sl_function) %in% gsub("py_", "", names(py_function)) - ] - - - for (i in seq_along(sl_function)) { - - .f <- sl_function[[i]] - .F <- py_function[[i]] - - # 1) replace missing with 0 - # as in python - sl_measure <- .f(actual, predicted, w = w); - - py_measure <- as.numeric(reticulate::py_suppress_warnings(.F(actual, predicted, w = w))) - - testthat::expect_true( - object = set_equal( - sl_measure, - py_measure - - ), - label = paste( - "Weighted functions in", - names(sl_function)[i], - "not equivalent to {torch} or {scikit-learn}." - ) - ) - - } - - } -) - -# script end;