Merge pull request #62 from NSAPH-Software/release_0.2.1

Release 0.2.1
NSAPH-Software · Mar 15, 2023 · 737a88a · 737a88a
2 parents 3cf62a5 + 4ea1be3
commit 737a88a
Show file tree

Hide file tree

Showing 155 changed files with 2,915 additions and 1,943 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: GPCERF
 Title: Gaussian Processes for Estimating Causal Exposure Response Curves
-Version: 0.2.0
+Version: 0.2.1
 Authors@R: c(
     person("Naeem", "Khoshnevis", email = "[email protected]",
            role=c("aut","cre"), 
@@ -31,9 +31,11 @@ Imports:
     Rcpp,
     RcppArmadillo,
     ggplot2,
+    cowplot,
     rlang,
     Rfast,
-    SuperLearner
+    SuperLearner,
+    wCorr
 Encoding: UTF-8
 Roxygen: list(markdown = TRUE)
 RoxygenNote: 7.2.1

diff --git a/NAMESPACE b/NAMESPACE
@@ -13,10 +13,10 @@ export(compute_rl_deriv_nn)
 export(compute_w_corr)
 export(estimate_cerf_gp)
 export(estimate_cerf_nngp)
+export(estimate_gps)
 export(generate_synthetic_data)
 export(get_logger)
 export(set_logger)
-export(train_gps)
 import(MASS)
 import(Rcpp)
 import(RcppArmadillo)

diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,15 @@
+# GPCERF 0.2.1 (2023-01-15)
+
+## Changed
+- `full GP` --> `standard GP`
+- `plot`s of exposure response function objects include covariate balance.
+- `formula` is no longer needed in nn functions.
+- `estimate_gps` now returns the used exposure level, too. 
+- `train_gps` --> `estimate_gps` 
+- The nearest neighbor approach no longer takes `expand` as an input parameter (`n_neighbor` * `expand` --> `n_neighbor`).
+- The weighted covariate balance is now computed using the wCorr package.
+
+
 # GPCERF 0.2.0 (2023-01-22)
 
 ## Changed

diff --git a/R/calc_ac.R b/R/calc_ac.R
diff --git a/R/compute_deriv_nn.R b/R/compute_deriv_nn.R
@@ -7,19 +7,24 @@
 #'
 #' @param w A scalar of exposure level of interest.
 #' @param w_obs A vector of observed exposure levels of all samples.
-#' @param GPS_m A data.frame of GPS vectors. Including:
-#'   - Column 1: GPS values.
-#'   - Column 2: Prediction of exposure for covariate of each data sample (e_gps_pred).
-#'   - Column 3: Standard deviation of  e_gps (e_gps_std).
+#' @param gps_m An S3 gps object including:
+#'   gps: A data.frame of GPS vectors.
+#'     - Column 1: GPS
+#'     - Column 2: Prediction of exposure for covariate of each data sample
+#'     (e_gps_pred).
+#'     - Column 3: Standard deviation of  e_gps (e_gps_std)
+#'   used_params:
+#'     - dnorm_log: TRUE or FALSE
 #' @param y_obs A vector of observed outcome values.
 #' @param hyperparam A vector of hyper-parameters in the GP model.
-#' @param n_neighbor The number of nearest neighbors on one side (see also \code{expand}).
-#' @param expand Scaling factor to determine the total number of nearest
-#' neighbors. The total is \code{2 * expand * n_neighbor}.
-#' @param block_size The number of samples included in a computation block. Mainly used to
-#' balance the speed and memory requirement. Larger \code{block_size} is faster, but requires more memory.
-#' @param kernel_fn The covariance function. The input is the square of Euclidean distance.
-#' @param kernel_deriv_fn The partial derivative of the covariance function. The input is the square of Euclidean distance.
+#' @param n_neighbor The number of nearest neighbors on one side.
+#' @param block_size The number of samples included in a computation block.
+#' Mainly used to balance the speed and memory requirement. Larger
+#' \code{block_size} is faster, but requires more memory.
+#' @param kernel_fn The covariance function. The input is the square of
+#' Euclidean distance.
+#' @param kernel_deriv_fn The partial derivative of the covariance function.
+#' The input is the square of Euclidean distance.
 #'
 #' @return
 #' A scalar of estimated derivative of CERF at \code{w} in nnGP.
@@ -28,14 +33,13 @@
 #'
 compute_deriv_nn <- function(w,
                              w_obs,
-                             GPS_m,
+                             gps_m,
                              y_obs,
                              hyperparam,
                              n_neighbor,
-                             expand,
                              block_size,
                              kernel_fn = function(x) exp(-x),
-                             kernel_deriv_fn = function(x) -exp(-x)){
+                             kernel_deriv_fn = function(x) -exp(-x)) {
 
 
   # Get hyperparameters
@@ -45,54 +49,60 @@ compute_deriv_nn <- function(w,
 
 
   # Get gps and helper functions
-  GPS <- GPS_m$GPS
-  e_gps_pred <- GPS_m$e_gps_pred
-  e_gps_std <- GPS_m$e_gps_std
+  gps <- gps_m$gps$gps
+  e_gps_pred <- gps_m$gps$e_gps_pred
+  e_gps_std <- gps_m$gps$e_gps_std
+  dnorm_log <- gps_m$used_params$dnorm_log
 
+  gps_w <- dnorm(w, mean = e_gps_pred, sd = e_gps_std, log = dnorm_log)
 
-  GPS_w <- dnorm(w, mean = e_gps_pred, sd = e_gps_std, log = TRUE)
+  n <- length(gps_w)
+  n_block <- ceiling(n / block_size)
+  obs_raw <- cbind(w_obs, gps)
+  obs_ord <- obs_raw[order(obs_raw[, 1]), ]
+  y_obs_ord <- y_obs[order(obs_raw[, 1])]
 
-  n <- length(GPS_w)
-  n_block <- ceiling(n/block_size)
-  obs_raw <- cbind(w_obs, GPS)
-  obs_ord <- obs_raw[order(obs_raw[,1]),]
-  y_obs_ord <- y_obs[order(obs_raw[,1])]
 
-
-  if(w >= obs_ord[nrow(obs_ord),1]){
-    idx_all <- seq( nrow(obs_ord) - expand*n_neighbor + 1, nrow(obs_ord), 1)
-  }else{
-    idx_anchor <- which.max(obs_ord[,1]>=w)
-    idx_start <- max(1, idx_anchor - n_neighbor*expand)
-    idx_end <- min(nrow(obs_ord), idx_anchor + n_neighbor*expand)
-    if(idx_end == nrow(obs_ord)){
-      idx_all <- seq(idx_end - n_neighbor*2*expand + 1, idx_end, 1)
-    }else{
-      idx_all <- seq(idx_start, idx_start+n_neighbor*2*expand-1, 1)
+  if (w >= obs_ord[nrow(obs_ord), 1]) {
+    idx_all <- seq(nrow(obs_ord) - n_neighbor + 1, nrow(obs_ord), 1)
+  } else {
+    idx_anchor <- which.max(obs_ord[, 1] >= w)
+    idx_start <- max(1, idx_anchor - n_neighbor)
+    idx_end <- min(nrow(obs_ord), idx_anchor + n_neighbor)
+    if (idx_end == nrow(obs_ord)) {
+      idx_all <- seq(idx_end - n_neighbor * 2 + 1, idx_end, 1)
+    } else {
+      idx_all <- seq(idx_start, idx_start + n_neighbor * 2 - 1, 1)
     }
   }
 
-  obs_use <- t(t(obs_ord[idx_all,])*(1/sqrt(c(alpha, beta))))
+  obs_use <- t(t(obs_ord[idx_all, ]) * (1 / sqrt(c(beta, alpha))))
   y_use <- y_obs_ord[idx_all]
 
-  obs_new <- t(t(cbind(w, GPS_w))*(1/sqrt(c(alpha, beta))))
-  id_all <- split(1:n, ceiling(seq_along(1:n)/n_block))
-  Sigma_obs <- g_sigma*kernel_fn(as.matrix(dist(obs_use))^2) + diag(nrow(obs_use))
-  Sigma_obs_inv <- chol2inv(chol(Sigma_obs))
+  obs_new <- t(t(cbind(w, gps_w)) * (1 / sqrt(c(beta, alpha))))
+  id_all <- split(1:n, ceiling(seq_along(1:n) / n_block))
+  # sigma refers to capital Sigma in the paper.
+  sigma_obs <- g_sigma * kernel_fn(as.matrix(dist(obs_use)) ^ 2) +
+               diag(nrow(obs_use))
+  sigma_obs_inv <- chol2inv(chol(sigma_obs))
+
+  all_weights <- sapply(id_all, function(id_ind) {
 
-  all_weights <- sapply(id_all, function(id.ind){
+    # TODO: change index to column name.
+    cross_dist <- spatstat.geom::crossdist(obs_new[id_ind, 1],
+                                           obs_new[id_ind, 2],
+                                           obs_use[, 1], obs_use[, 2])
 
-    cross_dist <- spatstat.geom::crossdist(obs_new[id.ind,1], obs_new[id.ind,2],
-                                          obs_use[,1], obs_use[,2])
 
-    Sigma_cross <- g_sigma*(1/alpha)*(2*outer(rep(w,length(id.ind))*(1/alpha),
-                                              obs_use[,1], "-"))*
-                                              kernel_deriv_fn(cross_dist^2)
+    sigma_cross <- g_sigma * (1 / beta) *
+                               (2 * outer(rep(w, length(id_ind)) * (1 / beta),
+                                              obs_use[, 1], "-")) *
+                                              kernel_deriv_fn(cross_dist ^ 2)
     #mean
-    wght <- Sigma_cross%*%Sigma_obs_inv
+    wght <- sigma_cross %*% sigma_obs_inv
     colSums(wght)
   })
-  weights <- rowSums(all_weights)/n
+  weights <- rowSums(all_weights) / n
 
-  return(weights%*%y_use)
+  return(weights %*% y_use)
 }
diff --git a/R/compute_deriv_weights_gp.R b/R/compute_deriv_weights_gp.R
@@ -7,11 +7,14 @@
 #'
 #' @param w A scalar of exposure level of interest.
 #' @param w_obs A vector of observed exposure levels of all samples.
-#' @param GPS_m A data.frame of GPS vectors. Including:
-#'   - Column 1: GPS values.
-#'   - Column 2: Prediction of exposure for covariate of each data
-#'   sample (e_gps_pred).
-#'   - Column 3: Standard deviation of  e_gps (e_gps_std)
+#' @param gps_m An S3 gps object including:
+#'   gps: A data.frame of GPS vectors.
+#'     - Column 1: GPS
+#'     - Column 2: Prediction of exposure for covariate of each data sample
+#'     (e_gps_pred).
+#'     - Column 3: Standard deviation of  e_gps (e_gps_std)
+#'   used_params:
+#'     - dnorm_log: TRUE or FALSE
 #' @param hyperparam A vector of hyper-parameters in the GP model.
 #' @param kernel_fn The covariance function.
 #' @param kernel_deriv_fn The partial derivative of the covariance function.
@@ -24,36 +27,41 @@
 #'
 compute_deriv_weights_gp <- function(w,
                                      w_obs,
-                                     GPS_m,
+                                     gps_m,
                                      hyperparam,
                                      kernel_fn = function(x) exp(-x),
-                                     kernel_deriv_fn = function(x) -exp(-x)){
+                                     kernel_deriv_fn = function(x) -exp(-x)) {
 
 
   alpha <- hyperparam[[1]]
   beta <- hyperparam[[2]]
   g_sigma <- hyperparam[[3]]
 
 
-  GPS <- GPS_m$GPS
-  e_gps_pred <- GPS_m$e_gps_pred
-  e_gps_std <- GPS_m$e_gps_std
+  gps <- gps_m$gps$gps
+  e_gps_pred <- gps_m$gps$e_gps_pred
+  e_gps_std <- gps_m$gps$e_gps_std
+  dnorm_log <- gps_m$used_params$dnorm_log
 
+  gps_w <- dnorm(w, mean = e_gps_pred, sd = e_gps_std, log = dnorm_log)
+  n <- length(gps_w)
 
-  GPS_w <- dnorm(w, mean = e_gps_pred, sd = e_gps_std, log = TRUE)
-  n <- length(GPS_w)
+  obs_use <- cbind(w_obs * sqrt(1 / beta), gps * sqrt(1 / alpha))
+  colnames(obs_use) <- c("w_sc_obs", "gps_sc_obs")
 
-  obs_use <- cbind(w_obs * sqrt(1 / alpha), GPS * sqrt(1 / beta))
-  obs_new <- cbind(w * sqrt(1 / alpha), GPS_w * sqrt(1 / beta))
-  Sigma_obs <- g_sigma * kernel_fn(as.matrix(dist(obs_use)) ^ 2) +
+  obs_new <- cbind(w * sqrt(1 / beta), gps_w * sqrt(1 / alpha))
+  colnames(obs_new) <- c("w_sc_for_w", "gps_sc_for_w")
+
+  sigma_obs <- g_sigma * kernel_fn(as.matrix(dist(obs_use)) ^ 2) +
                diag(nrow(obs_use))
-  cross_dist <- spatstat.geom::crossdist(obs_new[, 1], obs_new[, 2],
-                                         obs_use[, 1], obs_use[, 2])
+  cross_dist <- spatstat.geom::crossdist(obs_new[, "w_sc_for_w"],
+                                         obs_new[, "gps_sc_for_w"],
+                                         obs_use[, "w_sc_obs"],
+                                         obs_use[, "gps_sc_obs"])
 
-  #TODO: Needs refactoring. `outer` function uses significant amount of memory.
-  Sigma_cross <- g_sigma * sqrt(1 / alpha) * kernel_deriv_fn(cross_dist ^ 2) *
+  sigma_cross <- g_sigma * sqrt(1 / beta) * kernel_deriv_fn(cross_dist ^ 2) *
                          (2 * outer(rep(w, n), w_obs, "-"))
-  weights_all <- Sigma_cross %*% chol2inv(chol(Sigma_obs))
+  weights_all <- sigma_cross %*% chol2inv(chol(sigma_obs))
 
   return(colMeans(weights_all))
 }
diff --git a/R/compute_inverse.R b/R/compute_inverse.R
@@ -18,13 +18,17 @@ compute_inverse <- function(mtrx) {
                 "Current format: ", class(mtrx)[1]))
   }
 
-  if (nrow(mtrx) != ncol(mtrx)){
+  if (nrow(mtrx) != ncol(mtrx)) {
     stop(paste0("The input mtrx should be a square matrix. ",
                 "Current dimension: nrow: ",
                 nrow(mtrx), ", ncol: ", ncol(mtrx)))
   }
 
+  t_1 <- proc.time()
   inv_mtrx <- chol2inv(chol(mtrx))
-
+  t_2 <- proc.time()
+  logger::log_debug("Wall clock time to compute inverse matrix ",
+                    "({nrow(mtrx)}, {ncol(mtrx)}): ",
+                    " {t_2[[3]] - t_1[[3]]} s.")
   return(inv_mtrx)
 }