diff --git a/NAMESPACE b/NAMESPACE
index c51ca12c..f28bce3e 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -12,6 +12,7 @@ export(fetch_401k)
 export(fetch_bonus)
 export(make_iivm_data)
 export(make_irm_data)
+export(make_pliv_BCCH2012)
 export(make_pliv_CHS2015)
 export(make_pliv_multiway_cluster_CKMS2021)
 export(make_plr_CCDDHNR2018)
diff --git a/R/datasets.R b/R/datasets.R
index a1f0487d..bedbf7de 100644
--- a/R/datasets.R
+++ b/R/datasets.R
@@ -552,6 +552,155 @@ make_pliv_CHS2015 = function(n_obs, alpha = 1, dim_x = 200, dim_z = 150,
   return(data)
 }
 
+#' @title Generates data from a partially linear IV regression model used in
+#' Belloni et al. (2012).
+#'
+#' @description
+#' Generates data from a linear IV regression model used in
+#' Belloni et al. (2012). The data generating process
+#' is defined as
+#'
+#' \eqn{y_i = \beta d_i + e_i,}
+#'
+#' \eqn{d_i = z_i'\Pi + v_i,}
+#'
+#' with i.i.d.
+#'
+#' \eqn{(e_i, v_i) \sim \mathcal{N} \left(0, \left( \begin{array}{cc}
+#' \sigma^2_e & \sigma_{ev} \\ \sigma_{ev} & \sigma^2_v \end{array}
+#' \right) \right),}
+#'
+#' with \eqn{\beta} being the parameter of interest and
+#' \eqn{\Pi = C \cdot \left(\pi_0^0, \pi_0^1, \pi_0^2, \ldots, \pi_0^{p_z - 1}
+#' \right)}. The instrumental variables \eqn{z_i = (z_{i1}, \ldots, z_{ip_z})}
+#' are drawn from a normal distribution \eqn{N(0, \Sigma_Z)} with covariance
+#' matrix \eqn{\Sigma_Z} such that \eqn{E[z^2_{ih}] = \sigma^2_z} and
+#' \eqn{Corr(z_{ih}, z_{ij}) = \rho^{j-h}}.
+#' The sparsity parameter `s` can be used to set all coefficients in \eqn{\Pi}
+#' with index \eqn{j > s} exactly to zero, i.e.,
+#' \eqn{\Pi = C \cdot \left(\pi_0^0, \pi_0^1, \pi_0^2, \ldots, \pi_0^{s}, 0,
+#' \ldots , 0 \right)}; for \eqn{s = 0} no sparsity is imposed. The constant
+#' \eqn{C} is calibrated internally such that the concentration parameter
+#' \eqn{\mu^2} equals the value specified via `mu2`.
+#'
+#' Default values are set to \eqn{\rho = 0.5}, \eqn{\sigma^2_e = 1},
+#' \eqn{\sigma^2_z = 1} and \eqn{Corr(e,v) = 0.6}. For the coefficient vectors,
+#' defaults are set such that \eqn{\beta = 1}, \eqn{\mu^2 = 30} and
+#' \eqn{\pi_0 = 0.7}.
+#'
+#' @references Belloni, A., Chen, D., Chernozhukov, V., and Hansen, C. (2012),
+#' Sparse Models and Methods for Optimal Instruments with an Application to
+#' Eminent Domain. Econometrica, 80 (6): 2369-2429.
+#'
+#' @param n_obs (`integer(1)`) \cr
+#' The number of observations to simulate.
+#'
+#' @param beta (`numeric(1)`) \cr
+#' The value of the causal parameter.
+#'
+#' @param dim_z (`integer(1)`) \cr
+#' The number of instruments.
+#'
+#' @param pi_0 (`numeric(1)`) \cr
+#' Base coefficient of the first-stage equation.
+#'
+#' @param s (`integer(1)`) \cr
+#' Sparsity index. If \eqn{s > 0}, all first-stage coefficients with index
+#' \eqn{j > s} are set to zero; \eqn{s = 0} (default) imposes no sparsity.
+#'
+#' @param mu2 (`numeric(1)`) \cr
+#' Value of the concentration parameter used for calibration of the
+#' constant \eqn{C}.
+#'
+#' @param rho (`numeric(1)`) \cr
+#' Coefficient determining the correlation between instruments.
+#'
+#' @param sigma_z (`numeric(1)`) \cr
+#' Standard deviation of the instruments.
+#'
+#' @param corr (`numeric(1)`) \cr
+#' Correlation between the errors \eqn{e} and \eqn{v}.
+#'
+#' @param sigma_e (`numeric(1)`) \cr
+#' Standard deviation of the error \eqn{e}.
+#'
+#' @param return_type (`character(1)`) \cr
+#' If `"DoubleMLData"`, returns a `DoubleMLData` object.
+#' If `"data.frame"` returns a `data.frame()`.
+#' If `"data.table"` returns a `data.table()`.
+#' If `"matrix"` a named `list()` with entries `y`, `d` and
+#' `z` is returned.
+#' Every entry in the list is a `matrix()` object. Default is `"DoubleMLData"`.
+#'
+#' @return A data object according to the choice of `return_type`.
+#'
+#' @export
+make_pliv_BCCH2012 = function(n_obs = 100, beta = 1, dim_z = 100, pi_0 = 0.7,
+                              s = 0, mu2 = 30,
+                              rho = 0.5, sigma_z = 1,
+                              corr = 0.6, sigma_e = 1,
+                              return_type = "DoubleMLData") {
+  # based on https://www.econometricsociety.org/content/supplement-sparse-models-and-methods-optimal-instruments-application-eminent-domain-1 and
+  # http://qed.econ.queensu.ca/jae/datasets/spindler001/
+
+  assert_count(n_obs)
+  assert_numeric(beta, len = 1)
+  assert_count(dim_z)
+  assert_numeric(pi_0, len = 1)
+  assert_count(s, positive = FALSE)
+  assert_numeric(mu2, len = 1)
+  assert_numeric(rho, len = 1)
+  assert_numeric(sigma_z, len = 1)
+  assert_numeric(corr, len = 1)
+  assert_numeric(sigma_e, len = 1)
+  assert_choice(
+    return_type,
+    c("data.table", "matrix", "data.frame", "DoubleMLData"))
+
+  # covariance matrix of the instruments; named sigma_z_mat so that the
+  # sigma_z argument is not shadowed and actually takes effect
+  sigma_z_mat = sigma_z^2 * toeplitz(rho^(0:(dim_z - 1)))
+  mu_z = rep(0, dim_z)
+  z = rmvnorm(n = n_obs, mean = mu_z, sigma = sigma_z_mat)
+  pi = pi_0^(0:(dim_z - 1))
+  if (s > 0) {
+    # impose sparsity before calibrating C so that the concentration
+    # parameter is computed from the coefficients actually used
+    pi[(s + 1):dim_z] = 0
+  }
+
+  scale = c(sqrt(mu2 / ((n_obs + mu2) * pi %*% sigma_z_mat %*% pi)))
+  sigma_v = sqrt(1 - (scale^2) * t(pi) %*% sigma_z_mat %*% pi)
+  sev = corr * sigma_e * sigma_v
+
+  sigma_e_v = matrix(c(sigma_e^2, sev, sev, sigma_v^2), ncol = 2)
+  mu_e_v = rep(0, 2)
+  e_v = rmvnorm(n = n_obs, mean = mu_e_v, sigma = sigma_e_v)
+  e = e_v[, 1]
+  v = e_v[, 2]
+
+  d = scale * z %*% pi + v
+  y = beta * d + e
+
+  if (return_type == "matrix") {
+    return(list("y" = y, "d" = d, "z" = z))
+  } else {
+    colnames(z) = paste0("Z", 1:dim_z)
+    colnames(y) = "y"
+    colnames(d) = "d"
+
+    if (return_type == "data.frame") {
+      data = data.frame(y, d, z)
+      return(data)
+    } else if (return_type == "data.table") {
+      data = data.table(y, d, z)
+      return(data)
+    } else if (return_type == "DoubleMLData") {
+      dt = data.table(y, d, z)
+      data = DoubleMLData$new(dt,
+        y_col = "y", d_cols = "d",
+        x_cols = NULL,
+        z_cols = colnames(z))
+      return(data)
+    }
+  }
+}
+
 #' @title Generates data from a interactive regression (IRM) model.
 #'
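Editor's note: a minimal usage sketch of the new generator (illustrative only, not part of the patch):

```r
library(DoubleML)

set.seed(1111)
# Default return type: a DoubleMLData object with treatment d and
# instruments Z1, ..., Z100 (no covariate columns).
dml_data = make_pliv_BCCH2012(n_obs = 100, dim_z = 100)
print(dml_data)

# Raw matrices instead of a DoubleMLData object
raw = make_pliv_BCCH2012(n_obs = 100, dim_z = 100, return_type = "matrix")
str(raw)  # named list with matrix entries y, d and z
```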
diff --git a/R/double_ml_pliv.R b/R/double_ml_pliv.R
index 0fc09e86..aa5307ce 100644
--- a/R/double_ml_pliv.R
+++ b/R/double_ml_pliv.R
@@ -257,7 +257,6 @@ DoubleMLPLIV = R6Class("DoubleMLPLIV",
       return(res)
     },
-
     ml_nuisance_and_score_elements_partialX = function(smpls, ...) {
 
       g_hat = dml_cv_predict(self$learner$ml_g,
@@ -447,28 +446,74 @@ DoubleMLPLIV = R6Class("DoubleMLPLIV",
     ml_nuisance_and_score_elements_partialZ = function(smpls, ...)
     {
       # nuisance r
-
-      r_hat = dml_cv_predict(self$learner$ml_r,
-        c(
-          self$data$x_cols,
-          self$data$other_treat_cols,
-          self$data$z_cols),
-        self$data$treat_col,
-        self$data$data_model,
-        nuisance_id = "nuis_r",
-        smpls = smpls,
-        est_params = self$get_params("ml_r"),
-        return_train_preds = FALSE,
-        learner_class = private$learner_class$ml_r,
-        fold_specific_params = private$fold_specific_params)
-
       d = self$data$data_model[[self$data$treat_col]]
       y = self$data$data_model[[self$data$y_col]]
 
+      if (test_character(self$data$x_cols, len = 0)) {
+        r_hat = dml_cv_predict(self$learner$ml_r,
+          c(
+            self$data$x_cols,
+            self$data$other_treat_cols,
+            self$data$z_cols),
+          self$data$treat_col,
+          self$data$data_model,
+          nuisance_id = "nuis_r",
+          smpls = smpls,
+          est_params = self$get_params("ml_r"),
+          return_train_preds = FALSE,
+          learner_class = private$learner_class$ml_r,
+          fold_specific_params =
+            private$fold_specific_params)
+      } else {
+        # Partial out Xs from y and d by using linear regression
+        task_part_y = initiate_task("lm_part_out_y", self$data$data_model,
+          target = self$data$y_col,
+          select_cols = c(
+            self$data$x_cols,
+            self$data$other_treat_cols),
+          "LearnerRegr")
+        learner_lm = LearnerRegrLM$new()
+        resampling_part_y = rsmp("insample")$instantiate(task_part_y)
+        r_part_y = resample(task_part_y, learner_lm, resampling_part_y,
+          store_models = TRUE)
+        y_tilde = y - as.data.table(r_part_y$prediction())$response
+
+        task_part_d = initiate_task("lm_part_out_d", self$data$data_model,
+          target = self$data$treat_col,
+          select_cols = c(
+            self$data$x_cols,
+            self$data$other_treat_cols),
+          "LearnerRegr")
+        resampling_part_d = rsmp("insample")$instantiate(task_part_d)
+        r_part_d = resample(task_part_d, learner_lm, resampling_part_d,
+          store_models = TRUE)
+        d_tilde = d - as.data.table(r_part_d$prediction())$response
+
+        data_aux = data.table(self$data$data_model, "d_tilde" = d_tilde)
+        r_hat = dml_cv_predict(self$learner$ml_r,
+          c(
+            self$data$x_cols,
+            self$data$other_treat_cols,
+            self$data$z_cols),
+          "d_tilde",
+          data_aux,
+          nuisance_id = "nuis_r",
+          smpls = smpls,
+          est_params = self$get_params("ml_r"),
+          return_train_preds = FALSE,
+          learner_class = private$learner_class$ml_r,
+          fold_specific_params =
+            private$fold_specific_params)
+      }
 
       if (is.character(self$score)) {
         if (self$score == "partialling out") {
-          psi_a = -r_hat * d
-          psi_b = r_hat * y
+          if (test_character(self$data$x_cols, len = 0)) {
+            psi_a = -r_hat * d
+            psi_b = r_hat * y
+          } else {
+            psi_a = -r_hat * d_tilde
+            psi_b = r_hat * y_tilde
+          }
         }
         res = list(psi_a = psi_a, psi_b = psi_b)
       } else if (is.function(self$score)) {
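The branch above keeps the same linear score but evaluates it on residualized variables: when covariates are present, y and d are first projected off X by in-sample OLS (the `rsmp("insample")` residualization), and psi_a, psi_b use d_tilde and y_tilde. A self-contained sketch of that logic outside the R6 machinery (simulated data and plain `lm()` calls are assumptions of the sketch, standing in for the package's cross-fitted learners):

```r
set.seed(42)
n = 200
x = matrix(rnorm(n * 3), ncol = 3)  # covariates X
z = matrix(rnorm(n * 5), ncol = 5)  # instruments Z
d = drop(x %*% rep(0.5, 3) + z %*% rep(0.3, 5) + rnorm(n))
y = drop(1 * d + x %*% rep(0.5, 3) + rnorm(n))  # true theta = 1

# Step 1: partial out X from y and d by in-sample linear regression
y_tilde = residuals(lm(y ~ x))
d_tilde = residuals(lm(d ~ x))

# Step 2: nuisance r_hat = E[d_tilde | X, Z]; OLS stands in for ml_r
r_hat = fitted(lm(d_tilde ~ x + z))

# Step 3: "partialling out" score elements as in the else-branch above
psi_a = -r_hat * d_tilde
psi_b = r_hat * y_tilde
theta_hat = -mean(psi_b) / mean(psi_a)  # solves mean(psi_a * theta + psi_b) = 0
theta_hat  # close to 1
```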
@@ -656,27 +701,61 @@ DoubleMLPLIV = R6Class("DoubleMLPLIV",
           params = tuning_result_r$params))
       return(tuning_result)
     },
-
     ml_nuisance_tuning_partialZ = function(smpls, param_set, tune_settings,
       tune_on_folds, ...)
     {
-      if (!tune_on_folds) {
-        data_tune_list = list(self$data$data_model)
+      if (test_character(self$data$x_cols, len = 0)) {
+        if (!tune_on_folds) {
+          data_tune_list = list(self$data$data_model)
+        } else {
+          data_tune_list = lapply(
+            smpls$train_ids,
+            function(x) extract_training_data(self$data$data_model, x))
+        }
+        tuning_result_r = dml_tune(self$learner$ml_r,
+          c(
+            self$data$x_cols,
+            self$data$other_treat_cols,
+            self$data$z_cols),
+          self$data$treat_col, data_tune_list,
+          nuisance_id = "nuis_r",
+          param_set$ml_r, tune_settings,
+          tune_settings$measure$ml_r,
+          private$learner_class$ml_r)
       } else {
-        data_tune_list = lapply(
-          smpls$train_ids,
-          function(x) extract_training_data(self$data$data_model, x))
-      }
+        # Partial out Xs from d by using linear regression
+        task_part_d = initiate_task("lm_part_out_d", self$data$data_model,
+          target = self$data$treat_col,
+          select_cols = c(
+            self$data$x_cols,
+            self$data$other_treat_cols),
+          "LearnerRegr")
+        resampling_part_d = rsmp("insample")$instantiate(task_part_d)
+        learner_lm = LearnerRegrLM$new()
+        r_part_d = resample(task_part_d, learner_lm, resampling_part_d,
+          store_models = TRUE)
+        d_tilde = self$data$data_model[[self$data$treat_col]] -
+          as.data.table(r_part_d$prediction())$response
+        data_aux = data.table(self$data$data_model, "d_tilde" = d_tilde)
 
-      tuning_result_r = dml_tune(self$learner$ml_r,
-        c(
-          self$data$x_cols,
-          self$data$other_treat_cols,
-          self$data$z_cols),
-        self$data$treat_col, data_tune_list,
-        nuisance_id = "nuis_r",
-        param_set$ml_r, tune_settings,
-        tune_settings$measure$ml_r,
-        private$learner_class$ml_r)
+        if (!tune_on_folds) {
+          data_tune_list = list(data_aux)
+        } else {
+          data_tune_list = lapply(
+            smpls$train_ids,
+            function(x) extract_training_data(data_aux, x))
+        }
+
+        tuning_result_r = dml_tune(self$learner$ml_r,
+          c(
+            self$data$x_cols,
+            self$data$other_treat_cols,
+            self$data$z_cols),
+          "d_tilde", data_tune_list,
+          nuisance_id = "nuis_r",
+          param_set$ml_r, tune_settings,
+          tune_settings$measure$ml_r,
+          private$learner_class$ml_r)
+      }
 
       tuning_result = list("ml_r" = list(tuning_result_r,
         params = tuning_result_r$params))
diff --git a/_pkgdown.yml b/_pkgdown.yml
index fa33621b..a00bdfae 100644
--- a/_pkgdown.yml
+++ b/_pkgdown.yml
@@ -62,6 +62,7 @@ reference:
 - title: Datasets generators
   contents:
   - make_plr_CCDDHNR2018
+  - make_pliv_BCCH2012
   - make_pliv_CHS2015
   - make_irm_data
   - make_iivm_data
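From the user's perspective the tuning call is unchanged; for partialZ only ml_r is tuned, against d_tilde whenever covariates are present. A hypothetical end-to-end tuning call (the grid, measure and settings below are illustrative assumptions in the style of the tuning tests further down, not documented defaults):

```r
library(DoubleML)
library(mlr3)
library(mlr3tuning)
library(paradox)

set.seed(2)
dml_data = make_pliv_BCCH2012(n_obs = 100, dim_z = 20)
obj = DoubleMLPLIV.partialZ(dml_data, ml_r = lrn("regr.rpart"), n_folds = 2)

# Tune only the nuisance part ml_r used by partialZ
param_grid = list(ml_r = ParamSet$new(list(
  ParamDbl$new("cp", lower = 0.01, upper = 0.1),
  ParamInt$new("minsplit", lower = 1, upper = 10))))
tune_settings = list(
  terminator = trm("evals", n_evals = 5),
  algorithm = tnr("grid_search", resolution = 5),
  rsmp_tune = rsmp("cv", folds = 3),
  measure = list(ml_r = msr("regr.mse")))

obj$tune(param_set = param_grid, tune_on_folds = FALSE,
  tune_settings = tune_settings)
obj$fit()
obj$summary()
```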
diff --git a/man/make_pliv_BCCH2012.Rd b/man/make_pliv_BCCH2012.Rd
new file mode 100644
index 00000000..a3ec3b6e
--- /dev/null
+++ b/man/make_pliv_BCCH2012.Rd
@@ -0,0 +1,101 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/datasets.R
+\name{make_pliv_BCCH2012}
+\alias{make_pliv_BCCH2012}
+\title{Generates data from a partially linear IV regression model used in
+Belloni et al. (2012).}
+\usage{
+make_pliv_BCCH2012(
+  n_obs = 100,
+  beta = 1,
+  dim_z = 100,
+  pi_0 = 0.7,
+  s = 0,
+  mu2 = 30,
+  rho = 0.5,
+  sigma_z = 1,
+  corr = 0.6,
+  sigma_e = 1,
+  return_type = "DoubleMLData"
+)
+}
+\arguments{
+\item{n_obs}{(\code{integer(1)}) \cr
+The number of observations to simulate.}
+
+\item{beta}{(\code{numeric(1)}) \cr
+The value of the causal parameter.}
+
+\item{dim_z}{(\code{integer(1)}) \cr
+The number of instruments.}
+
+\item{pi_0}{(\code{numeric(1)}) \cr
+Base coefficient of the first-stage equation.}
+
+\item{s}{(\code{integer(1)}) \cr
+Sparsity index. If \eqn{s > 0}, all first-stage coefficients with index
+\eqn{j > s} are set to zero; \eqn{s = 0} (default) imposes no sparsity.}
+
+\item{mu2}{(\code{numeric(1)}) \cr
+Value of the concentration parameter used for calibration of the
+constant \eqn{C}.}
+
+\item{rho}{(\code{numeric(1)}) \cr
+Coefficient determining the correlation between instruments.}
+
+\item{sigma_z}{(\code{numeric(1)}) \cr
+Standard deviation of the instruments.}
+
+\item{corr}{(\code{numeric(1)}) \cr
+Correlation between the errors \eqn{e} and \eqn{v}.}
+
+\item{sigma_e}{(\code{numeric(1)}) \cr
+Standard deviation of the error \eqn{e}.}
+
+\item{return_type}{(\code{character(1)}) \cr
+If \code{"DoubleMLData"}, returns a \code{DoubleMLData} object.
+If \code{"data.frame"} returns a \code{data.frame()}.
+If \code{"data.table"} returns a \code{data.table()}.
+If \code{"matrix"} a named \code{list()} with entries \code{y}, \code{d} and
+\code{z} is returned.
+Every entry in the list is a \code{matrix()} object. Default is \code{"DoubleMLData"}.}
+}
+\value{
+A data object according to the choice of \code{return_type}.
+}
+\description{
+Generates data from a linear IV regression model used in
+Belloni et al. (2012). The data generating process
+is defined as
+
+\eqn{y_i = \beta d_i + e_i,}
+
+\eqn{d_i = z_i'\Pi + v_i,}
+
+with i.i.d.
+
+\eqn{(e_i, v_i) \sim \mathcal{N} \left(0, \left( \begin{array}{cc}
+\sigma^2_e & \sigma_{ev} \\ \sigma_{ev} & \sigma^2_v \end{array}
+\right) \right),}
+
+with \eqn{\beta} being the parameter of interest and
+\eqn{\Pi = C \cdot \left(\pi_0^0, \pi_0^1, \pi_0^2, \ldots, \pi_0^{p_z - 1}
+\right)}. The instrumental variables \eqn{z_i = (z_{i1}, \ldots, z_{ip_z})}
+are drawn from a normal distribution \eqn{N(0, \Sigma_Z)} with covariance
+matrix \eqn{\Sigma_Z} such that \eqn{E[z^2_{ih}] = \sigma^2_z} and
+\eqn{Corr(z_{ih}, z_{ij}) = \rho^{j-h}}.
+The sparsity parameter \code{s} can be used to set all coefficients in \eqn{\Pi}
+with index \eqn{j > s} exactly to zero, i.e.,
+\eqn{\Pi = C \cdot \left(\pi_0^0, \pi_0^1, \pi_0^2, \ldots, \pi_0^{s}, 0,
+\ldots , 0 \right)}; for \eqn{s = 0} no sparsity is imposed. The constant
+\eqn{C} is calibrated internally such that the concentration parameter
+\eqn{\mu^2} equals the value specified via \code{mu2}.
+
+Default values are set to \eqn{\rho = 0.5}, \eqn{\sigma^2_e = 1},
+\eqn{\sigma^2_z = 1} and \eqn{Corr(e,v) = 0.6}. For the coefficient vectors,
+defaults are set such that \eqn{\beta = 1}, \eqn{\mu^2 = 30} and
+\eqn{\pi_0 = 0.7}.
+}
+\references{
+Belloni, A., Chen, D., Chernozhukov, V., and Hansen, C. (2012),
+Sparse Models and Methods for Optimal Instruments with an Application to
+Eminent Domain. Econometrica, 80 (6): 2369-2429.
+}
diff --git a/tests/testthat/helper-03-dgp.R b/tests/testthat/helper-03-dgp.R
index 53017363..d7ce84b5 100644
--- a/tests/testthat/helper-03-dgp.R
+++ b/tests/testthat/helper-03-dgp.R
@@ -133,7 +133,8 @@ dgp1_toeplitz = function(n, p, betamax = 4, decay = 0.99, threshold = 0, noiseva
   return(data)
 }
 
-make_data_pliv_partialZ = function(n_obs, alpha = 1, dim_x = 5, dim_z = 150) {
+make_data_pliv_partialZ = function(n_obs, alpha = 1, dim_x = 5, dim_z = 150,
+                                   return_x_vars = TRUE) {
 
   sigma_e_u = matrix(c(1, 0.6, 0.6, 1), ncol = 2)
   mu_e_u = rep(0, 2)
@@ -161,12 +162,16 @@ make_data_pliv_partialZ = function(n_obs, alpha = 1, dim_x = 5, dim_z = 150) {
   d = x %*% gamma + z %*% delta + u
   y = alpha * d + x %*% beta + epsilon
 
-
   colnames(x) = paste0("X", 1:dim_x)
   colnames(z) = paste0("Z", 1:dim_z)
   colnames(y) = "y"
   colnames(d) = "d"
 
-  data = data.frame(x, y, d, z)
+  if (return_x_vars) {
+    data = data.frame(x, y, d, z)
+  } else {
+    data = data.frame(y, d, z)
+  }
+
   return(data)
 }
diff --git a/tests/testthat/helper-04-simdata.R b/tests/testthat/helper-04-simdata.R
index fef84990..4d79a0c5 100644
--- a/tests/testthat/helper-04-simdata.R
+++ b/tests/testthat/helper-04-simdata.R
@@ -103,11 +103,27 @@ dim_z = 150
 df = make_data_pliv_partialZ(
   setting$n,
   alpha = setting$theta,
-  dim_x = 5)
-Xnames = names(df)[names(df) %in% c("y", "d", paste0("Z", 1:dim_z)) == FALSE]
+  dim_x = 5,
+  return_x_vars = FALSE)
+Xnames = NULL
 dml_data = double_ml_data_from_data_frame(df,
   y_col = "y",
   d_cols = "d", x_cols = Xnames, z_cols = paste0("Z", 1:dim_z))
 data_pliv_partialZ = list(
   df = df,
   dml_data = dml_data)
+
+set.seed(1282)
+dim_z = 150
+df = make_data_pliv_partialZ(
+  setting$n,
+  alpha = setting$theta,
+  dim_x = 5,
+  return_x_vars = TRUE)
+Xnames = names(df)[names(df) %in% c("y", "d", paste0("Z", 1:dim_z)) == FALSE]
+dml_data = double_ml_data_from_data_frame(df,
+  y_col = "y",
+  d_cols = "d", x_cols = Xnames, z_cols = paste0("Z", 1:dim_z))
+data_pliv_partialZ_with_X = list(
+  df = df,
+  dml_data = dml_data)
diff --git a/tests/testthat/helper-14-dml_pliv_partial_z.R b/tests/testthat/helper-14-dml_pliv_partial_z.R
index c5e673de..bbde349b 100644
--- a/tests/testthat/helper-14-dml_pliv_partial_z.R
+++ b/tests/testthat/helper-14-dml_pliv_partial_z.R
@@ -12,6 +12,17 @@ dml_pliv_partial_z = function(data, y, d, z,
   all_thetas = all_ses = rep(NA_real_, n_rep)
   all_preds = list()
 
+  # check whether data contains Xs
+  x_indx = names(data)[!(names(data) %in% c(y, d, z))]
+  if (length(x_indx) != 0) {
+    formula_rhs = paste0(x_indx, collapse = " + ")
+    lm_y = lm(paste0(y, " ~ ", formula_rhs), data)
+    data$y_tilde = data[, y] - predict(lm_y)
+
+    lm_d = lm(paste0(d, " ~ ", formula_rhs), data)
+    data$d_tilde = data[, d] - predict(lm_d)
+  }
+
   for (i_rep in 1:n_rep) {
     this_smpl = smpls[[i_rep]]
 
@@ -26,8 +37,14 @@ dml_pliv_partial_z = function(data, y, d, z,
       this_smpl,
       all_preds[[i_rep]])
     r_hat = residuals$r_hat
-    D = data[, d]
-    Y = data[, y]
+
+    if (all(!(names(data) %in% c("y_tilde", "d_tilde")))) {
+      D = data[, d]
+      Y = data[, y]
+    } else {
+      D = data[, "d_tilde"]
+      Y = data[, "y_tilde"]
+    }
 
     # DML 1
     if (dml_procedure == "dml1") {
@@ -75,7 +92,8 @@ dml_pliv_partial_z = function(data, y, d, z,
   res = list(
     coef = theta, se = se, t = t, pval = pval,
     thetas = all_thetas, ses = all_ses,
-    all_preds = all_preds, smpls = smpls)
+    all_preds = all_preds, smpls = smpls,
+    data_with_res = data)
 
   return(res)
 }
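For reference, both branches of this helper feed the same linear score psi(W; theta) = psi_a * theta + psi_b with psi_a = -r_hat * D and psi_b = r_hat * Y (the tilde versions once X has been partialled out). A compact sketch of the two aggregation rules behind the dml1/dml2 branches (`fold_ids` is an assumed fold-membership vector, not an object from the helper):

```r
# DML1: solve the score on each fold separately, then average
dml1_theta = function(psi_a, psi_b, fold_ids) {
  fold_thetas = tapply(seq_along(psi_a), fold_ids, function(idx) {
    -mean(psi_b[idx]) / mean(psi_a[idx])
  })
  mean(fold_thetas)
}

# DML2: solve the score once on the pooled sample
dml2_theta = function(psi_a, psi_b) {
  -mean(psi_b) / mean(psi_a)
}

# With r_hat, D, Y as in the helper: psi_a = -r_hat * D; psi_b = r_hat * Y,
# so dml2_theta(psi_a, psi_b) equals mean(r_hat * Y) / mean(r_hat * D).
```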
@@ -88,23 +106,39 @@ fit_nuisance_pliv_partial_z = function(data, y, d, z,
 
   train_ids = smpls$train_ids
   test_ids = smpls$test_ids
 
-  # nuisance r: E[D|X]
-  r_indx = names(data) != y
-  data_r = data[, r_indx, drop = FALSE]
-  task_r = mlr3::TaskRegr$new(id = paste0("nuis_r_", d), backend = data_r, target = d)
-  if (!is.null(params_r)) {
-    ml_r$param_set$values = params_r
-  }
+  if (all(!(names(data) %in% c("y_tilde", "d_tilde")))) {
+    # case without Xs
 
-  resampling_r = mlr3::rsmp("custom")
-  resampling_r$instantiate(task_r, train_ids, test_ids)
+    # nuisance r: E[D|X]
+    r_indx = names(data) != y
+    data_r = data[, r_indx, drop = FALSE]
+    task_r = mlr3::TaskRegr$new(id = paste0("nuis_r_", d), backend = data_r, target = d)
+    if (!is.null(params_r)) {
+      ml_r$param_set$values = params_r
+    }
 
-  r_r = mlr3::resample(task_r, ml_r, resampling_r, store_models = TRUE)
-  r_hat_list = lapply(r_r$predictions(), function(x) x$response)
+    resampling_r = mlr3::rsmp("custom")
+    resampling_r$instantiate(task_r, train_ids, test_ids)
+
+    r_r = mlr3::resample(task_r, ml_r, resampling_r, store_models = TRUE)
+    r_hat_list = lapply(r_r$predictions(), function(x) x$response)
+  } else {
+    r_indx = !(names(data) %in% c(y, d, "y_tilde"))
+    data_r = data[, r_indx, drop = FALSE]
+    task_r = mlr3::TaskRegr$new(id = paste0("nuis_r_", d), backend = data_r, target = "d_tilde")
+    if (!is.null(params_r)) {
+      ml_r$param_set$values = params_r
+    }
+
+    resampling_r = mlr3::rsmp("custom")
+    resampling_r$instantiate(task_r, train_ids, test_ids)
+
+    r_r = mlr3::resample(task_r, ml_r, resampling_r, store_models = TRUE)
+    r_hat_list = lapply(r_r$predictions(), function(x) x$response)
+  }
 
   all_preds = list(
     r_hat_list = r_hat_list)
-
   return(all_preds)
 }
@@ -153,8 +187,14 @@ bootstrap_pliv_partial_z = function(theta, se, data, y, d, z, n_folds, smpls,
       smpls[[i_rep]],
       all_preds[[i_rep]])
     r_hat = residuals$r_hat
-    D = data[, d]
-    Y = data[, y]
+
+    if (all(!(names(data) %in% c("y_tilde", "d_tilde")))) {
+      D = data[, d]
+      Y = data[, y]
+    } else {
+      D = data[, "d_tilde"]
+      Y = data[, "y_tilde"]
+    }
 
     psi = (Y - D * theta[i_rep]) * r_hat
     psi_a = -r_hat * D
diff --git a/tests/testthat/test-double_ml_data.R b/tests/testthat/test-double_ml_data.R
index 4db052c3..55b28dfc 100644
--- a/tests/testthat/test-double_ml_data.R
+++ b/tests/testthat/test-double_ml_data.R
@@ -39,14 +39,6 @@ test_that("Unit tests for DoubleMLData", {
   expect_equal(D1_multZ$data, multZ_dt1)
   expect_equal(D1_multZ$data_model, multZ_dt1)
 
-  # No X
-  D1b_multZ = double_ml_data_from_matrix(X = NULL, y, d, z_mult)
-  multZ_dt1b = as.data.table(
-    data.frame(data, "z1" = z, "z2" = d2[, 2]))[, c("y", "d", "z1", "z2"),
-    with = FALSE]
-  expect_equal(D1_multZ$data, multZ_dt1)
-  expect_equal(D1_multZ$data_model, multZ_dt1)
-
   # No X
   D1b_multZ = double_ml_data_from_matrix(X = NULL, y, d, z_mult)
   multZ_dt1b = as.data.table(
diff --git a/tests/testthat/test-double_ml_datasets.R b/tests/testthat/test-double_ml_datasets.R
index aba35204..5f39a7a9 100644
--- a/tests/testthat/test-double_ml_datasets.R
+++ b/tests/testthat/test-double_ml_datasets.R
@@ -40,6 +40,18 @@ patrick::with_parameters_test_that("Unit tests for datasets functionalities:",
     expect_is(df$z, "matrix")
   }
 
+  # Test BCCH2012
+  df = make_pliv_BCCH2012(return_type = return_type)
+  if (return_type != "matrix") {
+    expect_is(df, return_type)
+  } else {
+    expect_is(df, "list")
+    expect_is(df$y, "matrix")
+    expect_is(df$d, "matrix")
+    expect_is(df$z, "matrix")
+  }
+
   # Test CKMS2019
   N = 10
   M = 10
diff --git a/tests/testthat/test-double_ml_pliv_partial_z_parameter_passing.R b/tests/testthat/test-double_ml_pliv_partial_z_parameter_passing.R
index 06afe5e1..1c6985f5 100644
--- a/tests/testthat/test-double_ml_pliv_partial_z_parameter_passing.R
+++ b/tests/testthat/test-double_ml_pliv_partial_z_parameter_passing.R
@@ -8,12 +8,14 @@ test_cases = expand.grid(
   learner = "regr.rpart",
   dml_procedure = c("dml1", "dml2"),
   score = "partialling out",
+  with_x = c(TRUE, FALSE),
   stringsAsFactors = FALSE)
 
 test_cases_nocf = expand.grid(
   learner = "regr.rpart",
   dml_procedure = "dml1",
   score = "partialling out",
+  with_x = c(TRUE, FALSE),
   stringsAsFactors = FALSE)
 
 test_cases[".test_name"] = apply(test_cases, 1, paste, collapse = "_")
@@ -28,6 +30,11 @@ patrick::with_parameters_test_that("Unit tests for parameter passing of PLIV.par
     learner_pars = get_default_mlmethod_pliv(learner)
 
     df = data_pliv$df
+    if (!with_x) {
+      x_indx = grep("X", names(df))
+      df = df[, -x_indx, drop = FALSE]
+    }
+
     set.seed(3141)
     pliv_hat = dml_pliv_partial_z(df,
       y = "y", d = "d", z = c("z", "z2"),
@@ -38,8 +45,15 @@ patrick::with_parameters_test_that("Unit tests for parameter passing of PLIV.par
     theta = pliv_hat$coef
     se = pliv_hat$se
 
+    if (!with_x) {
+      df_boot = df
+    } else {
+      df_boot = pliv_hat$data_with_res
+    }
+
+    set.seed(3141)
     boot_theta = bootstrap_pliv_partial_z(pliv_hat$thetas, pliv_hat$ses,
-      df,
+      df_boot,
       y = "y", d = "d", z = c("z", "z2"),
       n_folds = n_folds, n_rep = n_rep, smpls = pliv_hat$smpls,
@@ -70,6 +84,7 @@ patrick::with_parameters_test_that("Unit tests for parameter passing of PLIV.par
     se_obj = dml_pliv_obj$se
 
     # bootstrap
+    set.seed(3141)
     dml_pliv_obj$bootstrap(method = "normal", n_rep = n_rep_boot)
     boot_theta_obj = dml_pliv_obj$boot_coef
 
@@ -87,6 +102,11 @@ patrick::with_parameters_test_that("Unit tests for parameter passing of PLIV.par
     learner_pars = get_default_mlmethod_pliv(learner)
 
     df = data_pliv$df
+    if (!with_x) {
+      x_indx = grep("X", names(df))
+      df = df[, -x_indx, drop = FALSE]
+    }
+
     # Passing for non-cross-fitting case
     set.seed(3141)
     my_task = Task$new("help task", "regr", data_pliv$df)
@@ -187,7 +207,7 @@ patrick::with_parameters_test_that("Unit tests for parameter passing of PLIV.par
   }
 )
 
-patrick::with_parameters_test_that("Unit tests for parameter passing of PLIV.partialXZ (default vs explicit)",
+patrick::with_parameters_test_that("Unit tests for parameter passing of PLIV.partialZ (default vs explicit)",
   .cases = test_cases, {
     n_folds = 2
     n_rep = 3
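An end-to-end sketch of the covariate-aware path that the new test file below exercises (hypothetical: `make_data_pliv_partialZ` is the test helper extended above and is not exported by the package):

```r
library(DoubleML)
library(mlr3)
library(mlr3learners)

set.seed(3141)
df = make_data_pliv_partialZ(500, alpha = 1, dim_x = 5, dim_z = 150,
  return_x_vars = TRUE)
dml_data = double_ml_data_from_data_frame(df,
  y_col = "y", d_cols = "d",
  x_cols = paste0("X", 1:5), z_cols = paste0("Z", 1:150))

# With non-empty x_cols, partialZ now residualizes y and d on X
# internally before learning E[d_tilde | X, Z].
obj = DoubleMLPLIV.partialZ(dml_data, ml_r = lrn("regr.glmnet"), n_folds = 5)
obj$fit()
obj$summary()
```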
diff --git a/tests/testthat/test-double_ml_pliv_partial_z_with_x.R b/tests/testthat/test-double_ml_pliv_partial_z_with_x.R
new file mode 100644
index 00000000..2b2e23f3
--- /dev/null
+++ b/tests/testthat/test-double_ml_pliv_partial_z_with_x.R
@@ -0,0 +1,76 @@
+context("Unit tests for PLIV.partialZ with covariates X")
+
+lgr::get_logger("mlr3")$set_threshold("warn")
+
+skip_on_cran()
+
+test_cases = expand.grid(
+  learner = c("regr.lm", "regr.glmnet"),
+  dml_procedure = c("dml1", "dml2"),
+  score = "partialling out",
+  stringsAsFactors = FALSE)
+test_cases[".test_name"] = apply(test_cases, 1, paste, collapse = "_")
+
+patrick::with_parameters_test_that("Unit tests for PLIV.partialZ:",
+  .cases = test_cases, {
+    learner_pars = get_default_mlmethod_pliv(learner)
+    n_rep_boot = 498
+
+    set.seed(3141)
+    dim_z = 150
+    pliv_hat = dml_pliv_partial_z(data_pliv_partialZ_with_X$df,
+      y = "y", d = "d", z = paste0("Z", 1:dim_z),
+      n_folds = 5,
+      ml_r = learner_pars$ml_r$clone(),
+      dml_procedure = dml_procedure, score = score)
+    theta = pliv_hat$coef
+    se = pliv_hat$se
+
+    # data with residuals
+    data_with_res = pliv_hat$data_with_res
+
+    set.seed(3141)
+    boot_theta = bootstrap_pliv_partial_z(pliv_hat$thetas, pliv_hat$ses,
+      data_with_res,
+      y = "y", d = "d", z = paste0("Z", 1:dim_z),
+      n_folds = 5, smpls = pliv_hat$smpls,
+      all_preds = pliv_hat$all_preds,
+      bootstrap = "normal", n_rep_boot = n_rep_boot)$boot_coef
+
+    set.seed(3141)
+    double_mlpliv_obj = DoubleMLPLIV.partialZ(data_pliv_partialZ_with_X$dml_data,
+      ml_r = learner_pars$ml_r$clone(),
+      n_folds = 5,
+      score = score,
+      dml_procedure = dml_procedure)
+
+    double_mlpliv_obj$fit()
+    theta_obj = double_mlpliv_obj$coef
+    se_obj = double_mlpliv_obj$se
+
+    # bootstrap
+    set.seed(3141)
+    double_mlpliv_obj$bootstrap(method = "normal", n_rep = n_rep_boot)
+    boot_theta_obj = double_mlpliv_obj$boot_coef
+
+    # at the moment the object result comes without a name
+    expect_equal(theta, theta_obj, tolerance = 1e-8)
+    expect_equal(se, se_obj, tolerance = 1e-8)
+    expect_equal(as.vector(boot_theta), as.vector(boot_theta_obj), tolerance = 1e-8)
+  }
+)
+
+test_that("Unit tests for PLIV.partialZ invalid score", {
+  msg = paste(
+    "Callable score not implemented for DoubleMLPLIV with",
+    "partialX=FALSE and partialZ=TRUE.")
+  double_mlpliv_obj = DoubleMLPLIV.partialZ(
+    data_pliv_partialZ$dml_data,
+    ml_r = mlr3::lrn("regr.rpart"),
+    score = function(x) {
+      return(mean(x))
+    })
+  expect_error(double_mlpliv_obj$fit(),
+    regexp = msg)
+}
+)
diff --git a/tests/testthat/test-double_ml_pliv_tuning.R b/tests/testthat/test-double_ml_pliv_tuning.R
index 0687d250..aa17f289 100644
--- a/tests/testthat/test-double_ml_pliv_tuning.R
+++ b/tests/testthat/test-double_ml_pliv_tuning.R
@@ -52,7 +52,6 @@ patrick::with_parameters_test_that("Unit tests for tuning of PLIV",
   .cases = test_cases, {
     # TBD: Functional Test Case
-    set.seed(3141)
     n_folds = 2
     n_rep_boot = 498
 
@@ -70,6 +69,7 @@ patrick::with_parameters_test_that("Unit tests for tuning of PLIV",
     z_cols = z_vars[[z_indx]]
     set.seed(3141)
     df = data_pliv$df
+
     Xnames = names(df)[names(df) %in% c("y", "d", "z", "z2") == FALSE]
     data_ml = double_ml_data_from_data_frame(df,
       y_col = "y",
@@ -127,6 +127,33 @@ patrick::with_parameters_test_that("Unit tests for tuning of PLIV",
     #   }
 
     #   if (data_ml$n_instr > 1) {
+
+    # Case without X's
+    set.seed(3141)
+    data_ml_noX = double_ml_data_from_data_frame(df,
+      y_col = "y",
+      d_cols = "d", x_cols = character(0), z_cols = z_cols)
+    double_mlpliv_obj_tuned_Z_noX = DoubleMLPLIV.partialZ(data_ml_noX,
+      n_folds = n_folds,
+      ml_r = learner,
+      dml_procedure = dml_procedure,
+      score = score,
+      n_rep = n_rep)
+    param_grid_r = list("ml_r" = param_grid[["ml_r"]])
+    tune_settings_r = tune_settings
+    tune_settings_r$measure$ml_g = tune_settings_r$measure$ml_m = NULL
+    double_mlpliv_obj_tuned_Z_noX$tune(
+      param_set = param_grid_r, tune_on_folds = tune_on_folds,
+      tune_settings = tune_settings_r)
+    double_mlpliv_obj_tuned_Z_noX$fit()
+
+    theta_obj_tuned_Z_noX = double_mlpliv_obj_tuned_Z_noX$coef
+    se_obj_tuned_Z_noX = double_mlpliv_obj_tuned_Z_noX$se
+
+    expect_is(theta_obj_tuned_Z_noX, "numeric")
+    expect_is(se_obj_tuned_Z_noX, "numeric")
+
+    # Case with X's
+    set.seed(3141)
     double_mlpliv_obj_tuned_Z = DoubleMLPLIV.partialZ(data_ml,
       n_folds = n_folds,