merge with master
mnwright committed Apr 30, 2024
2 parents 81bf7d8 + 24a24bf commit 29f3ca0
Showing 32 changed files with 607 additions and 97 deletions.
4 changes: 2 additions & 2 deletions DESCRIPTION
@@ -1,8 +1,8 @@
Package: ranger
Type: Package
Title: A Fast Implementation of Random Forests
Version: 0.15.3
Date: 2023-07-19
Version: 0.16.0
Date: 2023-11-09
Author: Marvin N. Wright [aut, cre], Stefan Wager [ctb], Philipp Probst [ctb]
Maintainer: Marvin N. Wright <[email protected]>
Description: A fast implementation of Random Forests, particularly suited for high
7 changes: 7 additions & 0 deletions NEWS
@@ -1,4 +1,11 @@

##### Version 0.16.0
* Add node.stats option to save node statistics of all nodes
* Add time.interest option to restrict unique survival times (faster and saves memory)
* Fix min bucket option in C++ version
* Fix memory error for always.split.variables in certain settings
* Fix quantile regression for factor variables in "order" mode

##### Version 0.15.0
* Switch to C++14 standard
* Add min.bucket parameter to restrict terminal node size
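
A minimal usage sketch of the two new options listed above, assuming the veteran data from the survival package; the forest below restricts the survival estimates to a grid of at most 50 unique event times and saves per-node statistics in the fitted object. The choice of 50 time points and 100 trees is purely illustrative.

library(ranger)
library(survival)  # provides the veteran example data

rf <- ranger(Surv(time, status) ~ ., data = veteran, num.trees = 100,
             time.interest = 50,   # grid of at most 50 observed event times
             node.stats = TRUE)    # keep prediction, size and split statistics per node
length(rf$unique.death.times)      # should not exceed 50
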
7 changes: 7 additions & 0 deletions NEWS.md
@@ -1,4 +1,11 @@

# ranger 0.16.0
* New CRAN version

# ranger 0.15.4
* Add node.stats option to save node statistics of all nodes
* Add time.interest option to restrict unique survival times (faster and saves memory)

# ranger 0.15.3
* Fix min bucket option in C++ version

4 changes: 2 additions & 2 deletions R/RcppExports.R
@@ -1,8 +1,8 @@
# Generated by using Rcpp::compileAttributes() -> do not edit by hand
# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393

rangerCpp <- function(treetype, input_x, input_y, variable_names, mtry, num_trees, verbose, seed, num_threads, write_forest, importance_mode_r, min_node_size, min_bucket, split_select_weights, use_split_select_weights, always_split_variable_names, use_always_split_variable_names, prediction_mode, loaded_forest, snp_data, sample_with_replacement, probability, unordered_variable_names, use_unordered_variable_names, save_memory, splitrule_r, case_weights, use_case_weights, class_weights, predict_all, keep_inbag, sample_fraction, alpha, minprop, holdout, prediction_type_r, num_random_splits, sparse_x, use_sparse_data, order_snps, oob_error, max_depth, inbag, use_inbag, regularization_factor, use_regularization_factor, regularization_usedepth) {
.Call(`_ranger_rangerCpp`, treetype, input_x, input_y, variable_names, mtry, num_trees, verbose, seed, num_threads, write_forest, importance_mode_r, min_node_size, min_bucket, split_select_weights, use_split_select_weights, always_split_variable_names, use_always_split_variable_names, prediction_mode, loaded_forest, snp_data, sample_with_replacement, probability, unordered_variable_names, use_unordered_variable_names, save_memory, splitrule_r, case_weights, use_case_weights, class_weights, predict_all, keep_inbag, sample_fraction, alpha, minprop, holdout, prediction_type_r, num_random_splits, sparse_x, use_sparse_data, order_snps, oob_error, max_depth, inbag, use_inbag, regularization_factor, use_regularization_factor, regularization_usedepth)
rangerCpp <- function(treetype, input_x, input_y, variable_names, mtry, num_trees, verbose, seed, num_threads, write_forest, importance_mode_r, min_node_size, min_bucket, split_select_weights, use_split_select_weights, always_split_variable_names, use_always_split_variable_names, prediction_mode, loaded_forest, snp_data, sample_with_replacement, probability, unordered_variable_names, use_unordered_variable_names, save_memory, splitrule_r, case_weights, use_case_weights, class_weights, predict_all, keep_inbag, sample_fraction, alpha, minprop, holdout, prediction_type_r, num_random_splits, sparse_x, use_sparse_data, order_snps, oob_error, max_depth, inbag, use_inbag, regularization_factor, use_regularization_factor, regularization_usedepth, node_stats, time_interest, use_time_interest) {
.Call(`_ranger_rangerCpp`, treetype, input_x, input_y, variable_names, mtry, num_trees, verbose, seed, num_threads, write_forest, importance_mode_r, min_node_size, min_bucket, split_select_weights, use_split_select_weights, always_split_variable_names, use_always_split_variable_names, prediction_mode, loaded_forest, snp_data, sample_with_replacement, probability, unordered_variable_names, use_unordered_variable_names, save_memory, splitrule_r, case_weights, use_case_weights, class_weights, predict_all, keep_inbag, sample_fraction, alpha, minprop, holdout, prediction_type_r, num_random_splits, sparse_x, use_sparse_data, order_snps, oob_error, max_depth, inbag, use_inbag, regularization_factor, use_regularization_factor, regularization_usedepth, node_stats, time_interest, use_time_interest)
}

numSmaller <- function(values, reference) {
6 changes: 5 additions & 1 deletion R/predict.R
@@ -243,6 +243,9 @@ predict.ranger.forest <- function(object, data, predict.all = FALSE,
regularization.factor <- c(0, 0)
use.regularization.factor <- FALSE
regularization.usedepth <- FALSE
node.stats <- FALSE
time.interest <- c(0, 0)
use.time.interest <- FALSE

## Use sparse matrix
if (inherits(x, "dgCMatrix")) {
@@ -266,7 +269,8 @@
predict.all, keep.inbag, sample.fraction, alpha, minprop, holdout,
prediction.type, num.random.splits, sparse.x, use.sparse.data,
order.snps, oob.error, max.depth, inbag, use.inbag,
regularization.factor, use.regularization.factor, regularization.usedepth)
regularization.factor, use.regularization.factor, regularization.usedepth,
node.stats, time.interest, use.time.interest)

if (length(result) == 0) {
stop("User interrupt or internal error.")
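
For orientation, a hedged sketch of prediction with the new options in place: the placeholders above are only set because rangerCpp expects them, and prediction itself works as before, returning survival estimates on the time points stored in the trained forest (assumed behaviour; the data set and settings are illustrative).

library(ranger)
library(survival)
rf <- ranger(Surv(time, status) ~ ., data = veteran, num.trees = 50, time.interest = 20)
pred <- predict(rf, data = veteran)
dim(pred$chf)                      # rows: observations, columns: stored time points
length(rf$unique.death.times)      # at most 20
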
77 changes: 70 additions & 7 deletions R/ranger.R
@@ -65,6 +65,22 @@
##' This importance measure can be combined with the methods to estimate p-values in \code{\link{importance_pvalues}}.
##' We recommend not to use the 'impurity_corrected' importance when making predictions since the feature permutation step might reduce predictive performance (a warning is raised when predicting on new data).
##'
##' Note that ranger has different default values than other packages.
##' For example, our default for \code{mtry} is the square root of the number of variables for all tree types, whereas other packages use different values for regression.
##' Also, changing one hyperparameter does not change other hyperparameters (where possible).
##' For example, \code{splitrule="extratrees"} uses randomized splitting but does not disable bagging as in Geurts et al. (2006).
##' To disable bagging, use \code{replace = FALSE, sample.fraction = 1}.
##' This can also be used to grow a single decision tree without bagging and feature subsetting: \code{ranger(..., num.trees = 1, mtry = p, replace = FALSE, sample.fraction = 1)}, where p is the number of independent variables.
##'
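
A sketch of the single-tree setup described above, using iris, where the number of independent variables is p = 4:

## One tree, no bagging, no feature subsetting
single_tree <- ranger(Species ~ ., data = iris,
                      num.trees = 1, mtry = 4,
                      replace = FALSE, sample.fraction = 1)
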
##' While random forests are known for their robustness, the default hyperparameters do not always work well.
##' For example, for high dimensional data, increasing the \code{mtry} value and the number of trees \code{num.trees} is recommended.
##' For more details and recommendations, see Probst et al. (2019).
##' To find the best hyperparameters, consider hyperparameter tuning with the \code{tuneRanger} or \code{mlr3} packages.
##'
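
A hedged illustration of that recommendation; x_highdim and y_highdim stand in for a high-dimensional predictor matrix and its response (both names are hypothetical), and the raised values are arbitrary starting points rather than tuned settings.

p <- ncol(x_highdim)                              # assumed high-dimensional predictors
rf_hd <- ranger(x = x_highdim, y = y_highdim,
                num.trees = 2000,                 # more trees than the default 500
                mtry = floor(p / 3))              # larger than the default floor(sqrt(p))
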
##' Out-of-bag prediction error is calculated as the misclassification rate (proportion of misclassified observations) for classification, as the Brier score for probability estimation, as mean squared error (MSE) for regression and as one minus Harrell's C-index for survival.
##' Harrell's C-index is calculated based on the sum of the cumulative hazard function (CHF) over all timepoints, i.e., \code{rowSums(chf)}, where \code{chf} is the out-of-bag CHF; for details, see Ishwaran et al. (2008).
##' Calculation of the out-of-bag prediction error can be turned off with \code{oob.error = FALSE}.
##'
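
A sketch of that relationship, assuming the concordance() function from the survival package; rowSums(chf) is treated as a risk score (higher risk implies shorter survival, hence reverse = TRUE), and the result should roughly match the reported prediction error up to differences in tie handling.

library(ranger)
library(survival)
rf_surv <- ranger(Surv(time, status) ~ ., data = veteran, num.trees = 500)
risk <- rowSums(rf_surv$chf)                      # OOB CHF summed over all time points
cidx <- concordance(Surv(time, status) ~ risk, data = veteran, reverse = TRUE)$concordance
c(1 - cidx, rf_surv$prediction.error)             # expected to be close
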
##' Regularization works by penalizing new variables by multiplying the splitting criterion by a factor, see Deng & Runger (2012) for details.
##' If \code{regularization.usedepth=TRUE}, \eqn{f^d} is used, where \emph{f} is the regularization factor and \emph{d} the depth of the node.
##' If regularization is used, multithreading is deactivated because all trees need access to the list of variables that are already included in the model.
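
A hedged sketch of regularized splitting (the factor 0.5 is purely illustrative); as noted above, multithreading is switched off while regularization is active.

rf_reg <- ranger(Species ~ ., data = iris, num.trees = 100,
                 regularization.factor = 0.5,     # penalty for introducing new variables
                 regularization.usedepth = TRUE)  # use f^d, i.e. a depth-dependent penalty
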
@@ -115,10 +131,12 @@
##' @param inbag Manually set observations per tree. List of size num.trees, containing inbag counts for each observation. Can be used for stratified sampling.
##' @param holdout Hold-out mode. Hold-out all samples with case weight 0 and use these for variable importance and prediction error.
##' @param quantreg Prepare quantile prediction as in quantile regression forests (Meinshausen 2006). Regression only. Set \code{keep.inbag = TRUE} to prepare out-of-bag quantile prediction.
##' @param time.interest Time points of interest (survival only). Can be \code{NULL} (default, use all observed time points), a vector of time points or a single number to use that many time points (a grid over the observed time points).
##' @param oob.error Compute OOB prediction error. Set to \code{FALSE} to save computation time, e.g. for large survival forests.
##' @param num.threads Number of threads. Default is number of CPUs available.
##' @param save.memory Use memory saving (but slower) splitting mode. No effect for survival and GWAS data. Warning: This option slows down the tree growing, use only if you encounter memory problems.
##' @param verbose Show computation status and estimated runtime.
##' @param node.stats Save node statistics. Set to \code{TRUE} to save prediction, number of observations and split statistics for each node.
##' @param seed Random seed. Default is \code{NULL}, which generates the seed from \code{R}. Set to \code{0} to ignore the \code{R} seed.
##' @param na.action Handling of missing values. Set to "na.learn" to internally handle missing values (default, see below), to "na.omit" to omit observations with missing values and to "na.fail" to stop if missing values are found.
##' @param dependent.variable.name Name of dependent variable, needed if no formula given. For survival forests this is the time variable.
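
Two hedged parameter sketches: a quantile regression forest with keep.inbag = TRUE as required for out-of-bag quantile prediction, and the two accepted forms of time.interest (an explicit vector of time points versus a single number requesting a grid of that size). The data sets (mtcars, veteran) and the chosen quantiles/time points are illustrative only.

library(ranger)
library(survival)
## Quantile regression forest (regression only)
rf_q <- ranger(mpg ~ ., data = mtcars, quantreg = TRUE, keep.inbag = TRUE)
predict(rf_q, data = mtcars, type = "quantiles", quantiles = c(0.1, 0.5, 0.9))

## time.interest: explicit time points vs. a grid of at most 20 observed event times
ranger(Surv(time, status) ~ ., data = veteran, time.interest = c(30, 90, 180, 365))
ranger(Surv(time, status) ~ ., data = veteran, time.interest = 20)
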
@@ -129,12 +147,12 @@
##' @param ... Further arguments passed to or from other methods (currently ignored).
##' @return Object of class \code{ranger} with elements
##' \item{\code{forest}}{Saved forest (If write.forest set to TRUE). Note that the variable IDs in the \code{split.varIDs} object do not necessarily represent the column number in R.}
##' \item{\code{predictions}}{Predicted classes/values, based on out of bag samples (classification and regression only).}
##' \item{\code{predictions}}{Predicted classes/values, based on out-of-bag samples (classification and regression only).}
##' \item{\code{variable.importance}}{Variable importance for each independent variable.}
##' \item{\code{variable.importance.local}}{Variable importance for each independent variable and each sample, if \code{local.importance} is set to TRUE and \code{importance} is set to 'permutation'.}
##' \item{\code{prediction.error}}{Overall out of bag prediction error. For classification this is the fraction of missclassified samples, for probability estimation the Brier score, for regression the mean squared error and for survival one minus Harrell's C-index.}
##' \item{\code{r.squared}}{R squared. Also called explained variance or coefficient of determination (regression only). Computed on out of bag data.}
##' \item{\code{confusion.matrix}}{Contingency table for classes and predictions based on out of bag samples (classification only).}
##' \item{\code{prediction.error}}{Overall out-of-bag prediction error. For classification this is the misclassification rate (proportion of misclassified observations), for probability estimation the Brier score, for regression the mean squared error and for survival one minus Harrell's C-index.}
##' \item{\code{r.squared}}{R squared. Also called explained variance or coefficient of determination (regression only). Computed on out-of-bag data.}
##' \item{\code{confusion.matrix}}{Contingency table for classes and predictions based on out-of-bag samples (classification only).}
##' \item{\code{unique.death.times}}{Unique death times (survival only).}
##' \item{\code{chf}}{Estimated cumulative hazard function for each sample (survival only).}
##' \item{\code{survival}}{Estimated survival function for each sample (survival only).}
@@ -147,6 +165,8 @@
##' \item{\code{importance.mode}}{Importance mode used.}
##' \item{\code{num.samples}}{Number of samples.}
##' \item{\code{inbag.counts}}{Number of times the observations are in-bag in the trees.}
##' \item{\code{dependent.variable.name}}{Name of the dependent variable. This is NULL when x/y interface is used.}
##' \item{\code{status.variable.name}}{Name of the status variable (survival only). This is NULL when x/y interface is used.}
##' @examples
##' ## Classification forest with default settings
##' ranger(Species ~ ., data = iris)
@@ -203,6 +223,7 @@
##' \item Sandri, M. & Zuccolotto, P. (2008). A bias correction algorithm for the Gini variable importance measure in classification trees. J Comput Graph Stat, 17:611-628. \doi{10.1198/106186008X344522}.
##' \item Coppersmith D., Hong S. J., Hosking J. R. (1999). Partitioning nominal attributes in decision trees. Data Min Knowl Discov 3:197-217. \doi{10.1023/A:1009869804967}.
##' \item Deng & Runger (2012). Feature selection via regularized trees. The 2012 International Joint Conference on Neural Networks (IJCNN), Brisbane, Australia. \doi{10.1109/IJCNN.2012.6252640}.
##' \item Probst, P., Wright, M. N. & Boulesteix, A-L. (2019). Hyperparameters and tuning strategies for random forest. WIREs Data Mining Knowl Discov 9:e1301. \doi{10.1002/widm.1301}.
##' }
##' @seealso \code{\link{predict.ranger}}
##' @useDynLib ranger, .registration = TRUE
@@ -223,9 +244,9 @@ ranger <- function(formula = NULL, data = NULL, num.trees = 500, mtry = NULL,
local.importance = FALSE,
regularization.factor = 1, regularization.usedepth = FALSE,
keep.inbag = FALSE, inbag = NULL, holdout = FALSE,
quantreg = FALSE, oob.error = TRUE,
quantreg = FALSE, time.interest = NULL, oob.error = TRUE,
num.threads = NULL, save.memory = FALSE,
verbose = TRUE, seed = NULL, na.action = "na.learn",
verbose = TRUE, node.stats = FALSE, seed = NULL, na.action = "na.learn",
dependent.variable.name = NULL, status.variable.name = NULL,
classification = NULL, x = NULL, y = NULL, ...) {

@@ -276,7 +297,14 @@ ranger <- function(formula = NULL, data = NULL, num.trees = 500, mtry = NULL,
if (!inherits(formula, "formula")) {
stop("Error: Invalid formula.")
}
if (ncol(data) > 10000) {
warning("Avoid the formula interface for high-dimensional data. If ranger is slow or you get a 'protection stack overflow' error, consider the x/y or dependent.variable.name interface (see examples).")
}
data.selected <- parse.formula(formula, data, env = parent.frame())
dependent.variable.name <- all.vars(formula)[1]
if (inherits(data.selected[, 1], "Surv")) {
status.variable.name <- all.vars(formula)[2]
}
y <- data.selected[, 1]
x <- data.selected[, -1, drop = FALSE]
}
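
A sketch of the alternative interfaces referred to in the warning above, assuming a data.frame dat whose outcome column is named "y" (both names are hypothetical):

## x/y interface: no formula machinery involved
rf1 <- ranger(x = dat[, setdiff(names(dat), "y")], y = dat$y)

## dependent.variable.name interface
rf2 <- ranger(data = dat, dependent.variable.name = "y")
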
@@ -646,6 +674,9 @@ ranger <- function(formula = NULL, data = NULL, num.trees = 500, mtry = NULL,
if (length(inbag) != num.trees) {
stop("Error: Size of inbag list not equal to number of trees.")
}
if (any(sapply(inbag, length) != nrow(x))) {
stop("Error: Size of at least one element in inbag not equal to number of samples.")
}
} else {
stop("Error: Invalid inbag, expects list of vectors of size num.trees.")
}
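
A hedged sketch of an inbag argument that passes these checks: a list with one integer vector per tree, each of length nrow(x), giving the in-bag count of every observation (the 0.632 draw below is only illustrative).

n <- nrow(iris)
num.trees <- 100
inbag <- replicate(num.trees,
                   as.integer(rbinom(n, size = 1, prob = 0.632)),  # 0/1 in-bag counts
                   simplify = FALSE)
rf_inbag <- ranger(Species ~ ., data = iris, num.trees = num.trees, inbag = inbag)
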
@@ -837,6 +868,32 @@ ranger <- function(formula = NULL, data = NULL, num.trees = 500, mtry = NULL,
}
}

## Time of interest
if (is.null(time.interest)) {
time.interest <- c(0, 0)
use.time.interest <- FALSE
} else {
use.time.interest <- TRUE
if (treetype != 5) {
stop("Error: time.interest only applicable to survival forests.")
}
if (is.numeric(time.interest) & length(time.interest) == 1) {
if (time.interest < 1) {
stop("Error: time.interest must be a positive integer.")
}
# Grid over observed time points
nocens <- y[, 2] > 0
time <- sort(unique(y[nocens, 1]))
if (length(time) <= time.interest) {
time.interest <- time
} else {
time.interest <- time[unique(round(seq.int(1, length(time), length.out = time.interest)))]
}
} else {
time.interest <- sort(unique(time.interest))
}
}

## Prediction mode always false. Use predict.ranger() method.
prediction.mode <- FALSE
predict.all <- FALSE
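
A standalone sketch of the grid construction above: when time.interest is a single number, it is replaced by (at most) that many of the observed uncensored event times, picked at evenly spaced indices.

## Toy reproduction of the grid selection, outside of ranger
time <- c(5, 8, 12, 20, 33, 47, 60, 75, 90, 120)   # sorted unique event times
k <- 4                                              # time.interest given as a single number
time[unique(round(seq.int(1, length(time), length.out = k)))]
## -> 5 20 60 120
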
@@ -888,7 +945,8 @@ ranger <- function(formula = NULL, data = NULL, num.trees = 500, mtry = NULL,
predict.all, keep.inbag, sample.fraction, alpha, minprop, holdout, prediction.type,
num.random.splits, sparse.x, use.sparse.data, order.snps, oob.error, max.depth,
inbag, use.inbag,
regularization.factor, use.regularization.factor, regularization.usedepth)
regularization.factor, use.regularization.factor, regularization.usedepth,
node.stats, time.interest, use.time.interest)

if (length(result) == 0) {
stop("User interrupt or internal error.")
@@ -989,6 +1047,11 @@ ranger <- function(formula = NULL, data = NULL, num.trees = 500, mtry = NULL,
}
}

## Dependent (and status) variable name
## will be NULL only when x/y interface is used
result$dependent.variable.name <- dependent.variable.name
result$status.variable.name <- status.variable.name

class(result) <- "ranger"

## Prepare quantile prediction
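
A brief illustration of the new return elements: with the formula (or dependent.variable.name) interface the name is recorded, with the x/y interface it stays NULL.

rf_f  <- ranger(Species ~ ., data = iris, num.trees = 10)
rf_xy <- ranger(x = iris[, 1:4], y = iris$Species, num.trees = 10)
rf_f$dependent.variable.name    # "Species"
rf_xy$dependent.variable.name   # NULL
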
