Skip to content

Commit

Permalink
a single \donttest{} is enough
Browse files Browse the repository at this point in the history
  • Loading branch information
bernhard-da committed May 17, 2024
1 parent d517632 commit 9793865
Showing 1 changed file with 50 additions and 51 deletions.
101 changes: 50 additions & 51 deletions R/dUtility.R
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,11 @@
#'
#' The standardised distances of the perturbed data values to the original ones
#' are measured. The following measures are available:
#' - `"IL1"`: sum of absolute distances between original and perturbed variables
#' - `"IL1"`: sum of absolute distances between original and perturbed variables
#' scaled by absolute values of the original variables
#' - `"IL1s"`: measures the absolute distances between original
#' and perturbed ones, scaled by the standard deviation of original variables times
#' the square root of `2`.
#' and perturbed ones, scaled by the standard deviation of original variables times
#' the square root of `2`.
#' - `"eigen"`: compares the eigenvalues of original and perturbed data
#' - `"robeigen"`: compares robust eigenvalues of original and perturbed data
#' @name dUtility
Expand All @@ -22,7 +22,7 @@
#' @return data utility or modified entry for data utility the [sdcMicroObj-class].
#' @author Matthias Templ
#' @seealso [dRisk()], [dRiskRMD()]
#' @references for IL1 and IL1s: see Mateo-Sanz, Sebe, Domingo-Ferrer.
#' @references for IL1 and IL1s: see Mateo-Sanz, Sebe, Domingo-Ferrer.
#' Outlier Protection in Continuous Microdata Masking.
#' International Workshop on Privacy in Statistical Databases.
#' PSD 2004: Privacy in Statistical Databases pp 201-215.
Expand Down Expand Up @@ -59,7 +59,6 @@
#' keyVars=c('urbrur','roof','walls','water','electcon','relat','sex'),
#' numVars=c('expend','income','savings'), w='sampling_weight')
#' ## this is already done internally, so you don't need to run this:
#' \donttest{
#' sdc <- dUtility(sdc)
#' }
dUtility <- function(obj, ...) {
Expand Down Expand Up @@ -96,7 +95,7 @@ dUtilityWORK <- function(x, xm, method = "IL1s") {
if (method == "IL1") {
a <- x
for (i in 1:dim(x)[2]) {
a[[i]] <- 100 * abs(x[[i]] - xm[[i]]) / (abs(x[[i]]))
a[[i]] <- 100 * abs(x[[i]] - xm[[i]]) / (abs(x[[i]]))
}
a[a == "Inf"] <- NA
infLoss1 <- sum(a, na.rm = TRUE)
Expand Down Expand Up @@ -140,30 +139,30 @@ dUtilityWORK <- function(x, xm, method = "IL1s") {


#' Additional Information-Loss measures
#'
#'
#' Measures [IL_correl()] and [IL_variables()] were proposed by Andrzej Mlodak and are (theoretically) bounded between `0` and `1`.
#'
#'
#' - `IL_correl()`: is an information-loss measure that can be applied to common numerically scaled variables in `x` and `xm`. It is based
#' on diagonal entries of inverse correlation matrices in the original and perturbed data.
#' - `IL_variables()`: for common-variables in `x` and `xm` the individual distance-functions depend on the class of the variable;
#' on diagonal entries of inverse correlation matrices in the original and perturbed data.
#' - `IL_variables()`: for common-variables in `x` and `xm` the individual distance-functions depend on the class of the variable;
#' specifically these functions are different for numeric variables, ordered-factors and character/factor variables. The individual distances
#' are summed up and scaled by `n * m` with `n` being the number of records and `m` being the number of (common) variables.
#' are summed up and scaled by `n * m` with `n` being the number of records and `m` being the number of (common) variables.
#' @author Bernhard Meindl <bernhard.meindl@@statistik.gv.at>
#' @details Details can be found in the references below
#'
#' The implementation of [IL_correl()] differs slightly from the original proposal in Mlodak, A. (2020) as
#'
#' The implementation of [IL_correl()] differs slightly from the original proposal in Mlodak, A. (2020) as
#' the constant multiplier was changed to `1 / sqrt(2)` instead of `1/2` for better efficiency and interpretability
#' of the measure.
#'
#'
#' @param x an object coercible to a `data.frame` representing the original dataset
#' @param xm an object coercible to a `data.frame` representing the perturbed, modified dataset
#' @param digits number of digits used for rounding when displaying results
#' @param ... additional parameter for print-methods; currently ignored
#' @references Mlodak, A. (2020). Information loss resulting from statistical disclosure control of output data,
#' Wiadomosci Statystyczne. The Polish Statistician, 2020, 65(9), 7-27, DOI: 10.5604/01.3001.0014.4121
#'
#' Mlodak, A. (2019). Using the Complex Measure in an Assessment of the Information Loss Due to the Microdata Disclosure Control,
#' Przegląd Statystyczny, 2019, 66(1), 7-26,
#'
#' Mlodak, A. (2019). Using the Complex Measure in an Assessment of the Information Loss Due to the Microdata Disclosure Control,
#' Przegląd Statystyczny, 2019, 66(1), 7-26,
#' DOI: 10.5604/01.3001.0013.8285
#' @return the corresponding information-loss measure
#' @export
Expand All @@ -173,8 +172,8 @@ dUtilityWORK <- function(x, xm, method = "IL1s") {
#' data("Tarragona", package = "sdcMicro")
#' res1 <- addNoise(obj = Tarragona, variables = colnames(Tarragona), noise = 100)
#' IL_correl(x = as.data.frame(res1$x), xm = as.data.frame(res1$xm))
#'
#' res2 <- addNoise(obj = Tarragona, variables = colnames(Tarragona), noise = 25)
#'
#' res2 <- addNoise(obj = Tarragona, variables = colnames(Tarragona), noise = 25)
#' IL_correl(x = as.data.frame(res2$x), xm = as.data.frame(res2$xm))
IL_correl <- function(x, xm) {
# compute inverse of correlation matrix and return diagonal entries
Expand All @@ -185,44 +184,44 @@ IL_correl <- function(x, xm) {
}
diag(inv_mat)
}

stopifnot(is.data.frame(x), is.data.frame(xm))

x <- data.table::setDT(x)
xm <- data.table::setDT(xm)

cn <- intersect(names(x), names(xm))
if (length(cn) == 0) {
stop("please supply data.frames with overlapping variable names", call. = FALSE)
}

# compute overlapping numeric variables
numerics_x <- sort(cn[sapply(cn, function(k) {
is.numeric(x[[k]])
})])

numerics_y <- sort(cn[sapply(cn, function(k) {
is.numeric(xm[[k]])
})])

stopifnot(all.equal(numerics_x, numerics_y))

df_o <- x[, numerics_x, with = FALSE, drop = FALSE]
df_p <- xm[, numerics_y, with = FALSE, drop = FALSE]

# compute diagonal entries
diags_o <- .get_inverse_diags(df = df_o)
diags_p <- .get_inverse_diags(df = df_p)

# compute inner denominator in formula
denom_o <- sqrt(sum(diags_o^2));
denom_o <- sqrt(sum(diags_o^2));
denom_p <- sqrt(sum(diags_p^2))

# compute lambda
res <- sqrt(sum(((diags_o / denom_o) - (diags_p / denom_p))^2))
res <- sqrt(sum(((diags_o / denom_o) - (diags_p / denom_p))^2))
#res <- 0.5 * res # original proposal
res <- res * (1 / sqrt(2))

attr(res, "nr_vars") <- ncol(df_o)
attr(res, "n_x") <- nrow(x)
attr(res, "n_y") <- nrow(xm)
Expand All @@ -244,7 +243,7 @@ print.il_correl <- function(x, digits = 3, ...) {
#' @export
#' @rdname il_additional
#' @examples
#'
#'
#' # creating test-inputs
#' n <- 150
#' x <- xm <- data.frame(
Expand All @@ -266,14 +265,14 @@ IL_variables <- function(x, xm) {
stop("`p` must be an (ordered) factor or character", call. = FALSE)
}
stopifnot(length(x) == length(p))

if (is.factor(x)) x <- as.character(x)
if (is.factor(p)) p <- as.character(p)

dists <- rep(1, length(x))
na_x <- is.na(x)
na_p <- is.na(p)

dists <- rep(1, length(x))
dists[x == p] <- 0
dists[na_x | na_p] <- 1
Expand All @@ -284,9 +283,9 @@ IL_variables <- function(x, xm) {
stopifnot(is.factor(x)); stopifnot("ordered" %in% class(x))
stopifnot(is.factor(p)); stopifnot("ordered" %in% class(p))
stopifnot(all.equal(levels(x), levels(p)))

max_diff <- length(levels(x)) - 1

# single NA -> max_diff
# both NA -> 0
d <- abs(as.numeric(x) - as.numeric(p))
Expand All @@ -297,43 +296,43 @@ IL_variables <- function(x, xm) {
dist_num <- function(x, p) {
  # Summed distance between an original numeric vector `x` and its perturbed
  # counterpart `p`; each element-wise absolute difference is mapped into
  # [0, 1) via (2 / pi) * atan(.) before summing.
  stopifnot(is.numeric(x), is.numeric(p))

  # largest gap that can occur between the observed ranges of `p` and `x`
  rng_p <- range(p, na.rm = TRUE)
  rng_x <- range(x, na.rm = TRUE)
  worst <- max(abs(rng_p - rev(rng_x)))

  miss_x <- is.na(x)
  miss_p <- is.na(p)

  gaps <- abs(x - p)
  # exactly one side missing -> penalise with the largest possible gap
  gaps[xor(miss_x, miss_p)] <- worst
  # both sides missing -> treated as "no difference"
  gaps[miss_x & miss_p] <- 0

  sum(atan(gaps) * (2 / pi))
}

stopifnot(is.data.frame(x), is.data.frame(xm))
x <- data.table::setDT(x)
xm <- data.table::setDT(xm)
stopifnot(nrow(x) == nrow(xm))

cn <- intersect(names(x), names(xm))
if (length(cn) == 0) {
stop("please supply data.frames with overlapping variable names", call. = FALSE)
}

cl_o <- sapply(cn, function(v) {
class(x[[v]])
})
cl_p <- sapply(cn, function(v) {
class(xm[[v]])
})

# classes of common-variables must match
# NOTE: `all.equal()` returns a character description (not `FALSE`) when its
# arguments differ, so negating it directly fails with "invalid argument type";
# wrapping it in `isTRUE()` makes the intended error message reachable.
if (!isTRUE(all.equal(cl_o, cl_p))) {
  stop("classes of common-variables in `x` and `xm` must be identical", call. = FALSE)
}

inp <- data.frame(v = cn, type = "nominal")
inp[sapply(cn, function(v) inherits(x[[v]], "numeric")), "type"] <- "numeric"
inp[sapply(cn, function(v) inherits(x[[v]], "ordered")), "type"] <- "ordered"
# compute distances
inp[sapply(cn, function(v) inherits(x[[v]], "ordered")), "type"] <- "ordered"

# compute distances
inp$dists <- sapply(1:nrow(inp), function(y) {
v <- inp$v[y]
vals_o <- x[[v]]
Expand All @@ -346,14 +345,14 @@ IL_variables <- function(x, xm) {
dist_nom(x = vals_o, p = vals_p)
}
})

# aggregate distances to compute final utility measure
lambda <- sum(inp$dists) / (length(cn) * nrow(x))

# individual contributions
indiv_contr <- inp$dists / nrow(x)
names(indiv_contr) <- inp$v

attr(lambda, "indiv_distances") <- indiv_contr
attr(lambda, "n") <- nrow(x)
class(lambda) <- "il_variables"
Expand Down

0 comments on commit 9793865

Please sign in to comment.