A complete overhaul of unit-tests 🔨 (#22)
* [UNIT-TEST] Consolidated Setup 🔨

* All reference functions are now called from the setup script.
* All (unweighted) classification functions are now called from the setup script.

* [UNIT-TEST] S3 methods for classification 🔨

* All S3 methods are now tested separately for weighted and unweighted classification. The tests cover both balanced and imbalanced classification.

* [UNIT-TEST] Completely Rewritten (See message) 🔨

* To account for edge cases, corner cases, and any other case that may arise, ALL unit tests have been rewritten to capture these errors.

* The specificity function has been rewritten, as the {scikit-learn} implementation is misbehaving.
serkor1 authored Dec 21, 2024
1 parent 9aec082 commit a90db17
Showing 43 changed files with 2,971 additions and 981 deletions.
2 changes: 1 addition & 1 deletion tests/testthat/pytorch.py
@@ -5,7 +5,7 @@
from torchmetrics.functional import symmetric_mean_absolute_percentage_error

# Regression metrics
def py_huber(actual, predicted, delta=1.0, w=None):
def py_huberloss(actual, predicted, delta=1.0, w=None):

actual = torch.tensor(actual, dtype=torch.float64)
predicted = torch.tensor(predicted, dtype=torch.float64)
104 changes: 101 additions & 3 deletions tests/testthat/ref-manual.R
@@ -13,9 +13,9 @@
# Concordance Correlation Coefficient
# The values have been verified with yardstick and
# epiR
py_ccc <- function(actual, predicted, w = NULL, bias = FALSE) {
py_ccc <- function(actual, predicted, w = NULL, correction = FALSE) {

actual <- as.numeric(actual)
predicted <- as.numeric(predicted)

if (is.null(w)) {
@@ -39,7 +39,7 @@ py_ccc <- function(actual, predicted, w = NULL, bias = FALSE) {
predicted_variance <- cov_matrix$cov[2, 2]
covariance <- cov_matrix$cov[1, 2]

if (bias) {
if (correction) {
n <- sum(w)
actual_variance <- actual_variance * (n - 1) / n
predicted_variance <- predicted_variance * (n - 1) / n
@@ -55,6 +55,59 @@



py_specificity <- function(
actual,
predicted,
average = NULL,
w = NULL,
na.rm = TRUE
) {

# 1) Construct matrix
conf_mat <- SLmetrics::cmatrix(
actual = actual,
predicted = predicted,
w = w
)

TN <- sum(conf_mat) - rowSums(conf_mat) - colSums(conf_mat) + diag(conf_mat)
FP <- colSums(conf_mat) - diag(conf_mat)


output <- TN/(TN+FP)

# 2) calculate values
if (!is.null(average)) {

average <- as.logical(average == "micro")

if (average) {

output <- sum(TN, na.rm = TRUE) / (sum(TN, na.rm = TRUE) + sum(FP, na.rm = TRUE))

} else {

if (!na.rm) {

output[!is.finite(output)] <- 0

}

output <- mean(
output,
na.rm = na.rm
)

}

}

return(
output
)

}
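
# A minimal usage sketch (illustration only, not part of the commit);
# the factors below are assumed for demonstration:
if (FALSE) {
  actual    <- factor(c("a", "b", "b", "c", "a", "c"))
  predicted <- factor(c("a", "b", "a", "c", "c", "c"))

  py_specificity(actual, predicted)                     # class-wise specificity
  py_specificity(actual, predicted, average = "micro")  # pooled TN / (TN + FP)
  py_specificity(actual, predicted, average = "macro")  # mean of class-wise values
}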

# False Discovery Rate
py_fdr <- function(
actual,
@@ -395,5 +448,50 @@ ref_prROC <- function(actual, response, thresholds) {

}

# Regression Functions
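# Relative Root Mean Squared Error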
py_rrmse <- function(
actual,
predicted,
w = NULL
) {

if (is.null(w)) {
w <- rep(1, length(actual))
}

sqrt(sum((w * actual - w * predicted)^2) / sum((w * actual - weighted.mean(actual, w = w))^2))

}


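# Relative Absolute Error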
py_rae <- function(
actual,
predicted,
w = NULL) {

if (is.null(w)) {
w <- rep(1, length(actual))
}

sum(abs(actual - predicted)) / sum(abs(actual - weighted.mean(actual, w = w)))
}


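# Mean Percentage Error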
py_mpe <- function(
predicted,
actual,
w = NULL) {

if (is.null(w)) {
w <- rep(1, length(actual))
}

error <- (actual - predicted) / actual
weighted_mpe <- sum(w * error) / sum(w)

weighted_mpe
}
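
# A minimal usage sketch (illustration only, not part of the commit);
# the vectors below are assumed for demonstration. Note that py_mpe()
# takes `predicted` before `actual`, so named arguments are used:
if (FALSE) {
  actual    <- c(1.2, 2.4, 3.1, 4.8)
  predicted <- c(1.0, 2.5, 3.3, 4.5)
  w         <- c(1, 2, 1, 2)

  py_rrmse(actual = actual, predicted = predicted, w = w)
  py_rae(actual = actual, predicted = predicted, w = w)
  py_mpe(predicted = predicted, actual = actual, w = w)
}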



# script end;
8 changes: 0 additions & 8 deletions tests/testthat/scikit-learn.py
@@ -112,14 +112,6 @@ def py_entropy(actual, response, normalize = True, w = None):
sample_weight = w
)

def py_specificity(actual, response, average = None, w = None):
return specificity_score(
y_true = actual,
y_pred = response,
average = average,
sample_weight = w
)

def py_roc(actual, response, pos_label = 1, w = None):
return metrics.roc_curve(
actual,
125 changes: 124 additions & 1 deletion tests/testthat/setup.R
@@ -8,6 +8,9 @@
#
# script start;

# 0) set the maximum number of test failures
testthat::set_max_fails(Inf)

# 1) set seed for all
# samples
set.seed(1903)
@@ -113,7 +116,7 @@ set_equal <- function(
current,
target,
tolerance = 1e-9) {

all.equal(
target = target,
current = current,
@@ -124,5 +127,125 @@

}

# 6) load scripts
# globally
reticulate::source_python(
"scikit-learn.py"
)
reticulate::source_python(
"pytorch.py"
)
source("ref-manual.R")

# 7) define all classification
# functions in {SLmetrics}
sl_classification <- list(
# accuracy
"accuracy" = accuracy,
"baccuracy" = baccuracy,

# Zero-One Loss
"zerooneloss" = zerooneloss,

# specificity methods
"specificity" = specificity,
"tnr" = tnr,
"selectivity" = selectivity,


# recall methods;
"recall" = recall,
"sensitivity" = sensitivity,
"tpr" = tpr,

# precision methods
"precision" = precision,
"ppv" = ppv,

# fbeta methods
"fbeta" = fbeta,

# likelihood methods
"dor" = dor,
"plr" = plr,
"nlr" = nlr,

# jaccard methods
"jaccard" = jaccard,
"tscore" = tscore,
"csi" = csi,

# mcc methods
"mcc" = mcc,
"phi" = phi,

# false positive
"fpr" = fpr,
"fallout" = fallout,

# fmi methods
"fmi" = fmi,

"fdr" = fdr,
"npv" = npv,
"fer" = fer,

"ckappa" = ckappa

)

# 8) define all weighted classification
# functions in {SLmetrics}
sl_wclassification <- list(
# accuracy
"accuracy" = weighted.accuracy,
"baccuracy" = weighted.baccuracy,

# Zero-One Loss
"zerooneloss" = weighted.zerooneloss,

# specificity methods
"specificity" = weighted.specificity,
"tnr" = weighted.tnr,
"selectivity" = weighted.selectivity,


# recall methods;
"recall" = weighted.recall,
"sensitivity" = weighted.sensitivity,
"tpr" = weighted.tpr,

# precision methods
"precision" = weighted.precision,
"ppv" = weighted.ppv,

# fbeta methods
"fbeta" = weighted.fbeta,

# likelihood methods
"dor" = weighted.dor,
"plr" = weighted.plr,
"nlr" = weighted.nlr,

# jaccard methods
"jaccard" = weighted.jaccard,
"tscore" = weighted.tscore,
"csi" = weighted.csi,

# mcc methods
"mcc" = weighted.mcc,
"phi" = weighted.phi,

# false positive
"fpr" = weighted.fpr,
"fallout" = weighted.fallout,

"fdr" = weighted.fdr,
"npv" = weighted.npv,
"fer" = weighted.fer,

"ckappa" = weighted.ckappa

)
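
# A sketch (illustration only, not part of the commit) of how the paired
# lists could drive a shared sanity check; it assumes the create_factor()
# helper defined earlier in this script, and that every listed function
# accepts (actual, predicted[, w]) with sensible defaults:
if (FALSE) {
  actual    <- create_factor(balanced = TRUE)
  predicted <- create_factor(balanced = TRUE)
  w         <- rep(1, length(actual))

  for (name in names(sl_classification)) {
    # with unit weights, the weighted variant should
    # reproduce the unweighted value
    stopifnot(
      isTRUE(
        set_equal(
          current = sl_wclassification[[name]](actual, predicted, w = w),
          target  = sl_classification[[name]](actual, predicted)
        )
      )
    )
  }
}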

# script end;
82 changes: 82 additions & 0 deletions tests/testthat/test-Accuracy.R
@@ -0,0 +1,82 @@
# objective: Test that Accuracy
# implemented in {SLmetrics} is aligned with
# target functions.

testthat::test_that(
desc = "Test `accuracy()`-function", code = {

# 0) construct Accuracy
# wrapper
wrapped_accuracy <- function(
actual,
predicted,
w = NULL) {
if (is.null(w)) {
accuracy(
actual = actual,
predicted = predicted
)
} else {
weighted.accuracy(
actual = actual,
predicted = predicted,
w = w
)
}
}

for (balanced in c(FALSE, TRUE)) {

# 1) generate class
# values
actual <- create_factor(balanced = balanced)
predicted <- create_factor(balanced = balanced)
w <- runif(n = length(actual))

for (weighted in c(TRUE, FALSE)) {

# 2.1) generate sensible
# label information
info <- paste(
"Balanced = ", balanced,
"Weighted = ", weighted
)

# 2.2) generate score
# from {SLmetrics}
score <- wrapped_accuracy(
actual = actual,
predicted = predicted,
w = if (weighted) w else NULL
)

# 2.3) test that the values
# are sensible; the values
# can be NA
testthat::expect_true(is.numeric(score), info = info)
testthat::expect_true(length(score) == 1, info = info)

# 2.4) test that the values
# are equal to target value

# 2.4.1) calculate py_score
py_score <- py_accuracy(
actual = actual,
predicted = predicted,
w = if (weighted) w else NULL
)

# 2.4.2) test for equality
testthat::expect_true(
object = set_equal(
current = score,
target = py_score
),
info = info
)

}

}
}
)