From 73f5d9f3d7fa026345e96a478d65190be9f5d0ba Mon Sep 17 00:00:00 2001 From: Michael Mayer Date: Fri, 24 Mar 2023 18:48:50 +0100 Subject: [PATCH 1/7] update readme and clean up docu --- DESCRIPTION | 5 +- NEWS.md | 17 +++++- README.md | 16 +++--- cran-comments.md | 26 +++------ packaging.R | 11 +--- tests/testthat/test-generateNA.R | 1 - tests/testthat/test-imputeUnivariate.R | 2 - tests/testthat/test-pmm.R | 3 -- vignettes/missRanger.Rmd | 74 +++++++++++++------------- vignettes/multiple_imputation.Rmd | 29 ++++++++-- vignettes/working_with_censoring.Rmd | 55 +++++++++++++++---- 11 files changed, 141 insertions(+), 98 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 9d3a13d..9c72b50 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: missRanger Title: Fast Imputation of Missing Values -Version: 2.1.5.9000 +Version: 2.2.0 Authors@R: person(given = "Michael", family = "Mayer", @@ -31,9 +31,6 @@ Imports: stats, utils Suggests: - mice, - dplyr, - survival, knitr, rmarkdown, testthat (>= 3.0.0) diff --git a/NEWS.md b/NEWS.md index 117fb91..ed274e9 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,4 +1,17 @@ -# missRanger 2.1.5 +# missRanger 2.2.0 + +## Less dependencies + +- Removed {mice} from "suggested" packages. +- Removed {dplyr} from "suggested" packages. +- Removed {survival} from "suggested" packages. + +## Maintenance + +- Adding Github pages. +- Introduction of Github actions. + +# missRanger 2.1.5 (not on CRAN) Maintenance release, @@ -6,7 +19,7 @@ Maintenance release, - changing the package structure, and - bringing vignettes into right order. -# missRanger 2.1.4 +# missRanger 2.1.4 (not on CRAN) ## Minor changes diff --git a/README.md b/README.md index ddc9ea2..2ae2be8 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,7 @@ ## Overview -The {missRanger} package uses the {ranger} package to do fast missing value imputation by chained random forest. As such, it serves as an alternative implementation of the beautiful 'MissForest' algorithm, see vignette. +{missRanger} uses the {ranger} package to do fast missing value imputation by chained random forest. As such, it serves as an alternative implementation of the beautiful 'MissForest' algorithm, see vignette. The main function `missRanger()` offers the option to combine random forest imputation with predictive mean matching. This firstly avoids the generation of values not present in the original data (like a value 0.3334 in a 0-1 coded variable). Secondly, this step tends to raise the variance in the resulting conditional distributions to a realistic level, a crucial element to apply multiple imputation frameworks. @@ -38,7 +38,7 @@ library(missRanger) # Generate data with missing values in all columns irisWithNA <- generateNA(iris, seed = 347) -# Impute missing values with missRanger +# Impute missing values irisImputed <- missRanger(irisWithNA, pmm.k = 3, num.trees = 100) # Check results @@ -46,7 +46,7 @@ head(irisImputed) head(irisWithNA) head(iris) -# With extra trees algorithm +# Replace random forest by extremely randomized trees irisImputed_et <- missRanger( irisWithNA, pmm.k = 3, @@ -54,12 +54,10 @@ irisImputed_et <- missRanger( num.trees = 100 ) -# With "dplyr" syntax -library(dplyr) - -iris %>% - generateNA() %>% - missRanger(verbose = 0, pmm.k = 5) %>% +# Using the pipe... +iris |> + generateNA() |> + missRanger(pmm.k = 5, verbose = 0) |> head() ``` diff --git a/cran-comments.md b/cran-comments.md index 26aa153..757c37e 100644 --- a/cran-comments.md +++ b/cran-comments.md @@ -1,25 +1,15 @@ -This is a maintenance release, switching to +# missRanger 2.2.0 -- testthat 3, -- modifying vignette order, -- improving the way how the package is being updated/generated. +- removed suggested dependencies dplyr, mice, survival +- improved docu -## R CMD check results seem okay +## R CMD check -checking for unstated dependencies in examples ... OK +??? checking for unstated dependencies in examples ... OK WARNING 'qpdf' is needed for checks on size reduction of PDFs -## Online check results seem okay (2 notes below) +??? checking for future file timestamps ... NOTE + unable to verify current time -- check_win_devel() -- check_rhub() - -Found the following (possibly) invalid DOIs: - DOI: 10.1093/bioinformatics/btr597 - From: DESCRIPTION - Status: Forbidden - Message: 403 -* checking for detritus in the temp directory ... NOTE -Found the following files/directories: - 'lastMiKTeXException' +## \ No newline at end of file diff --git a/packaging.R b/packaging.R index ff5ae9d..04b0f19 100644 --- a/packaging.R +++ b/packaging.R @@ -15,7 +15,7 @@ library(usethis) use_description( fields = list( Title = "Fast Imputation of Missing Values", - Version = "2.1.5.9000", + Version = "2.2.0", Description = "Alternative implementation of the beautiful 'MissForest' algorithm used to impute mixed-type data sets by chaining random forests, introduced by Stekhoven, D.J. and Buehlmann, P. (2012) . Under the hood, it uses the @@ -38,10 +38,6 @@ use_package("ranger", "Imports") use_package("stats", "Imports") use_package("utils", "Imports") -use_package("dplyr", "Suggests") -use_package("mice", "Suggests") -use_package("survival", "Suggests") - use_gpl_license(2) use_github_links() # use this if this project is on github @@ -77,9 +73,6 @@ use_logo("logo.png") # If package goes to CRAN: infos (check results etc.) for CRAN use_cran_comments() - -# Build website -use_pkgdown() # Github actions use_github_action("document") use_github_action("check-standard") @@ -95,7 +88,7 @@ library(devtools) document() test() -build_vignettes() +# build_vignettes() check(manual = TRUE, cran = TRUE) build() # build(binary = TRUE) diff --git a/tests/testthat/test-generateNA.R b/tests/testthat/test-generateNA.R index ca24c6d..5f634ae 100644 --- a/tests/testthat/test-generateNA.R +++ b/tests/testthat/test-generateNA.R @@ -48,4 +48,3 @@ test_that("it works for data.frame with one row", { expect_true(!anyNA(generateNA(x, p = 0.5))) expect_true(all(is.na(generateNA(x, p = 0.55)))) }) - diff --git a/tests/testthat/test-imputeUnivariate.R b/tests/testthat/test-imputeUnivariate.R index 0443f46..44fa518 100644 --- a/tests/testthat/test-imputeUnivariate.R +++ b/tests/testthat/test-imputeUnivariate.R @@ -57,5 +57,3 @@ test_that("it fails when all values in a column are missing", { x <- rep(NA, 10L) expect_error(imputeUnivariate(x)) }) - - diff --git a/tests/testthat/test-pmm.R b/tests/testthat/test-pmm.R index cc5016a..23087fe 100644 --- a/tests/testthat/test-pmm.R +++ b/tests/testthat/test-pmm.R @@ -54,6 +54,3 @@ test_that("pmm works with NA in xtrain and NA in ytrain", { test_that("pmm gives error when xtest contains NA", { expect_error(pmm(c(1, 1, 2), NA, ytrain = c(0, 1, 2))) }) - - - diff --git a/vignettes/missRanger.Rmd b/vignettes/missRanger.Rmd index 6b7ca30..a20d784 100644 --- a/vignettes/missRanger.Rmd +++ b/vignettes/missRanger.Rmd @@ -19,13 +19,13 @@ knitr::opts_chunk$set( ) ``` -## Introduction +## Overview -The aim of this vignette is to introduce the R package `missRanger` for imputation of missing values and to explain how to use it for multiple imputation. +The aim of this vignette is to introduce {missRanger} for imputation of missing values and to explain how to use it for multiple imputation. -`missRanger` uses the `ranger` package [@wright] to do fast missing value imputation by chained random forest. As such, it can be used as an alternative to `missForest`, a beautiful algorithm introduced in [@stekhoven]. Basically, each variable is imputed by predictions from a random forest using all other variables as covariables. `missRanger` iterates multiple times over all variables until the average out-of-bag prediction error of the models stops to improve. +{missRanger} uses the {ranger} package [@wright] to do fast missing value imputation by chained random forest. As such, it can be used as an alternative to {missForest}, a beautiful algorithm introduced in [@stekhoven]. Basically, each variable is imputed by predictions from a random forest using all other variables as covariables. The main function `missRanger()` iterates multiple times over all variables until the average out-of-bag prediction error of the models stops to improve. -Why should you consider `missRanger`? +Why should you consider {missRanger}? - It is fast. @@ -33,34 +33,30 @@ Why should you consider `missRanger`? - It can deal with most realistic variable types, even dates and times without destroying the original data structure. -- It combines random forest imputation with predictive mean matching. This generates realistic variability and avoids "new" values like 0.3334 in a 0-1 coded variable. Like this, `missRanger` can be used for realistic multiple imputation scenarios, see e.g. [@rubin] for the statistical background. +- It combines random forest imputation with predictive mean matching. This generates realistic variability and avoids "new" values like 0.3334 in a 0-1 coded variable. Like this, `missRanger()` can be used for realistic multiple imputation scenarios, see e.g. [@rubin] for the statistical background. -In the examples below, we will meet two functions from the `missRanger` package: +In the examples below, we will meet two functions from {missRanger}: -- `generateNA`: To replace values in a data set by missing values. +- `generateNA()`: To replace values in a data set by missing values. -- `missRanger`: To impute missing values in a data frame. +- `missRanger()`: To impute missing values in a data frame. ## Installation -From CRAN: -``` +```r +# From CRAN install.packages("missRanger") -``` -Latest version from github: -``` -library(devtools) -install_github("mayer79/missRanger") +# Development version +devtools::install_github("mayer79/missRanger") ``` -## Examples +## Usage We first generate a data set with about 20% missing values per column and fill them again by `missRanger`. ``` {r} library(missRanger) -library(dplyr) set.seed(84553) @@ -82,7 +78,7 @@ irisImputed <- missRanger(irisWithNA, pmm.k = 3, num.trees = 100, verbose = 0) head(irisImputed) ``` -Note that `missRanger` offers a `...` argument to pass options to `ranger`, e.g. `num.trees` or `min.node.size`. How would we use its "extra trees" variant with 50 trees? +Note that `missRanger()` offers a `...` argument to pass options to `ranger()`, e.g. `num.trees` or `min.node.size`. How would we use its "extremely randomized trees" variant with 50 trees? ``` {r} irisImputed_et <- missRanger( @@ -97,22 +93,23 @@ head(irisImputed_et) It is as simple! -Further note that `missRanger` does not rely on `tidyverse` but you can embed it into a `dplyr` pipeline (without `group_by`). +{missRanger} also plays well together with the pipe: -``` {r} -iris %>% - generateNA() %>% - missRanger(verbose = 0) %>% +```r +iris |> + generateNA() |> + missRanger(verbose = 0) |> head() ``` -By default `missRanger` uses all columns in the data set to impute all columns with missings. To override this behaviour, you can use an intuitive formula interface: The left hand side specifies the variables to be imputed (variable names separated by a `+`), while the right hand side lists the variables used for imputation. +By default `missRanger()` uses all columns in the data set to impute all columns with missings. To override this behaviour, you can use an intuitive formula interface: The left hand side specifies the variables to be imputed (variable names separated by a `+`), while the right hand side lists the variables used for imputation. ``` {r} # Impute all variables with all (default behaviour). Note that variables without # missing values will be skipped from the left hand side of the formula. -m <- missRanger(irisWithNA, formula = . ~ ., - pmm.k = 3, num.trees = 10, seed = 1, verbose = 0) +m <- missRanger( + irisWithNA, formula = . ~ ., pmm.k = 3, num.trees = 10, seed = 1, verbose = 0 +) head(m) # Same @@ -120,19 +117,20 @@ m <- missRanger(irisWithNA, pmm.k = 3, num.trees = 10, seed = 1, verbose = 0) head(m) # Impute all variables with all except Species -m <- missRanger(irisWithNA, . ~ . - Species, - pmm.k = 3, num.trees = 10, verbose = 0) +m <- missRanger(irisWithNA, . ~ . - Species, pmm.k = 3, num.trees = 10, verbose = 0) head(m) # Impute Sepal.Width by Species -m <- missRanger(irisWithNA, Sepal.Width ~ Species, - pmm.k = 3, num.trees = 10, verbose = 0) +m <- missRanger( + irisWithNA, Sepal.Width ~ Species, pmm.k = 3, num.trees = 10, verbose = 0 +) head(m) # No success. Why? Species contains missing values and thus can only # be used for imputation if it is being imputed as well -m <- missRanger(irisWithNA, Sepal.Width + Species ~ Species, - pmm.k = 3, num.trees = 10, verbose = 0) +m <- missRanger( + irisWithNA, Sepal.Width + Species ~ Species, pmm.k = 3, num.trees = 10, verbose = 0 +) head(m) # Impute all variables univariatly @@ -142,7 +140,7 @@ head(m) ## Imputation takes too much time. What can I do? -`missRanger` is based on iteratively fitting random forests for each variable with missing values. Since the underlying random forest implementation `ranger` uses 500 trees per default, a huge number of trees might be calculated. For larger data sets, the overall process can take very long. +`missRanger()` is based on iteratively fitting random forests for each variable with missing values. Since the underlying random forest implementation `ranger()` uses 500 trees per default, a huge number of trees might be calculated. For larger data sets, the overall process can take very long. Here are tweaks to make things faster: @@ -160,7 +158,7 @@ Here are tweaks to make things faster: ### Examples evaluated on a normal laptop (not run here) -``` r +```r library(ggplot2) # for diamonds data dim(diamonds) # 53940 10 @@ -183,8 +181,7 @@ system.time( # Takes 9 seconds system.time( - m <- missRanger(diamonds_with_NA, pmm.k = 3, num.trees = 50, - sample.fraction = 0.1) + m <- missRanger(diamonds_with_NA, pmm.k = 3, num.trees = 50, sample.fraction = 0.1) ) ``` @@ -204,8 +201,9 @@ m <- missRanger(irisWithNA, num.trees = 20, pmm.k = 3, seed = 5, verbose = 0) head(m) # Weighted by number of non-missing values per row. -m <- missRanger(irisWithNA, num.trees = 20, pmm.k = 3, seed = 5, verbose = 0, - case.weights = non_miss) +m <- missRanger( + irisWithNA, num.trees = 20, pmm.k = 3, seed = 5, verbose = 0, case.weights = non_miss +) head(m) ``` diff --git a/vignettes/multiple_imputation.Rmd b/vignettes/multiple_imputation.Rmd index 9859f20..66c09fb 100644 --- a/vignettes/multiple_imputation.Rmd +++ b/vignettes/multiple_imputation.Rmd @@ -19,17 +19,17 @@ knitr::opts_chunk$set( ) ``` -## How to use `missRanger` for multiple imputation? +## How to use {missRanger} for multiple imputation? For machine learning tasks, imputation is typically seen as a fixed data preparation step like dummy coding. There, multiple imputation is rarely applied as it adds another level of complexity to the analysis. This might be fine since a good validation schema will account for variation introduced by imputation. For tasks with focus on statistical inference (p values, standard errors, confidence intervals, estimation of effects), the extra variability introduced by imputation has to be accounted for except if only very few missing values appear. One of the standard approaches is to impute the data set multiple times, generating e.g. 10 or 100 versions of a complete data set. Then, the intended analysis (t-test, linear model etc.) is applied independently to each of the complete data sets. Their results are combined afterward in a pooling step, usually by Rubin's rule [@rubin]. For parameter estimates, averages are taken. Their variance is basically a combination of the average squared standard errors plus the variance of the parameter estimates across the imputed data sets, leading to inflated standard errors and thus larger p values and wider confidence intervals. -The package `mice` [@buuren] takes case of this pooling step. The creation of multiple complete data sets can be done by `mice` or also by `missRanger`. In the latter case, in order to keep the variance of imputed values at a realistic level, we suggest to use predictive mean matching on top of the random forest imputations. +The package {mice} [@buuren] takes care of this pooling step. The creation of multiple complete data sets can be done by {mice} or also by {missRanger}. In the latter case, in order to keep the variance of imputed values at a realistic level, we suggest to use predictive mean matching on top of the random forest imputation. The following example shows how easy such workflow looks like. -``` {r} +```r library(missRanger) library(mice) @@ -50,9 +50,32 @@ models <- lapply(filled, function(x) lm(Sepal.Length ~ ., x)) # Pool the results by mice summary(pooled_fit <- pool(models)) +# term estimate std.error statistic df p.value +# 1 (Intercept) 2.5366092 0.3575478 7.0944612 74.48225 6.365362e-10 +# 2 Sepal.Width 0.4262516 0.1104055 3.8607804 81.52526 2.253823e-04 +# 3 Petal.Length 0.7311306 0.0895942 8.1604670 60.04758 2.595957e-11 +# 4 Petal.Width -0.1840820 0.1856190 -0.9917193 68.08826 3.248458e-01 +# 5 Speciesversicolor -0.6755016 0.2907406 -2.3233824 82.80105 2.261132e-02 +# 6 Speciesvirginica -0.8584752 0.3970706 -2.1620217 81.93105 3.353349e-02 + # Compare with model on original data summary(lm(Sepal.Length ~ ., data = iris)) +# Coefficients: +# Estimate Std. Error t value Pr(>|t|) +# (Intercept) 2.17127 0.27979 7.760 1.43e-12 *** +# Sepal.Width 0.49589 0.08607 5.761 4.87e-08 *** +# Petal.Length 0.82924 0.06853 12.101 < 2e-16 *** +# Petal.Width -0.31516 0.15120 -2.084 0.03889 * +# Speciesversicolor -0.72356 0.24017 -3.013 0.00306 ** +# Speciesvirginica -1.02350 0.33373 -3.067 0.00258 ** +# --- +# Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 +# +# Residual standard error: 0.3068 on 144 degrees of freedom +# Multiple R-squared: 0.8673, Adjusted R-squared: 0.8627 +# F-statistic: 188.3 on 5 and 144 DF, p-value: < 2.2e-16 + ``` The standard errors and p values of the multiple imputation are larger than of the original data set. This reflects the additional uncertainty introduced by the presence of missing values in a realistic way. diff --git a/vignettes/working_with_censoring.Rmd b/vignettes/working_with_censoring.Rmd index 824dd72..1c9d776 100644 --- a/vignettes/working_with_censoring.Rmd +++ b/vignettes/working_with_censoring.Rmd @@ -25,13 +25,13 @@ There is no obvious way of how to deal with survival variables as covariables in Options discussed in [@white] include: -- Use both status variable s and (censored) time variable t +- Use both status variable $s$ and (censored) time variable $t$ -- s and log(t) +- $s$ and $\log(t)$ -- surv(t), and, optionally s +- $\text{surv}(t)$, and, optionally $s$ -By surv(t), we denote the Nelson-Aalen survival estimate at each value of t. The third option is the most elegant one as it explicitly deals with censoring information. We provide some additional details on it in the example +By $\text{surv}(t)$, we denote the Nelson-Aalen survival estimate at each value of $t$. The third option is the most elegant one as it explicitly deals with censoring information. We provide some additional details on it in the example. ### Example @@ -42,16 +42,25 @@ A reasonable way to estimate the covariable adjusted treatment effect is the fol 1. Add Nelson-Aalen survival estimates "surv" to the dataset. 2. Use "surv" as well as the covariables to impute missing values in the covariables multiple times. 3. Perform the intended Cox regression for each of the imputed data sets. -4. Pool their results by Rubin's rule [@rubin], using package `mice` [@buuren]. +4. Pool their results by Rubin's rule [@rubin], using package {mice} [@buuren]. -``` {r} +```r library(missRanger) library(survival) library(mice) + set.seed(65) head(veteran) +# trt celltype time status karno diagtime age prior +# 1 1 squamous 72 1 60 7 69 0 +# 2 1 squamous 411 1 70 5 64 10 +# 3 1 squamous 228 1 60 3 38 0 +# 4 1 squamous 126 1 60 9 63 10 +# 5 1 squamous 118 1 70 11 65 10 +# 6 1 squamous 10 1 20 5 49 0 + # 1. Calculate Nelson-Aalen survival probabilities for each time point nelson_aalen <- summary( survfit(Surv(time, status) ~ 1, data = veteran), @@ -66,8 +75,16 @@ veteran2 <- merge(veteran, nelson_aalen, all.x = TRUE) veteran2 <- generateNA(veteran2, p = c(age = 0.1, karno = 0.1, diagtime = 0.1)) # 2. Generate 20 complete data sets, representing "time" and "status" by "surv" -filled <- replicate(20, missRanger(veteran2, . ~ . - time - status, - verbose = 0, pmm.k = 3, num.trees = 25), simplify = FALSE) +filled <- replicate( + 20, + missRanger( + veteran2, . ~ . - time - status, + verbose = 0, + pmm.k = 3, + num.trees = 25 + ), + simplify = FALSE +) # 3. Run a Cox regression for each of the completed data sets models <- lapply(filled, function(x) coxph(Surv(time, status) ~ . - surv, x)) @@ -75,8 +92,28 @@ models <- lapply(filled, function(x) coxph(Surv(time, status) ~ . - surv, x)) # 4. Pool the results by mice summary(pooled_fit <- pool(models)) -# On the original +# term estimate std.error statistic df p.value +# 1 trt 0.238408881 0.213416156 1.1171079 108.30303 2.664203e-01 +# 2 celltypesmallcell 0.801088770 0.286383107 2.7972626 112.17712 6.066665e-03 +# 3 celltypeadeno 1.134351839 0.308977998 3.6713030 110.65791 3.731780e-04 +# 4 celltypelarge 0.327092592 0.291069423 1.1237614 109.29555 2.635765e-01 +# 5 karno -0.031250557 0.005786704 -5.4004073 99.60711 4.529695e-07 +# 6 diagtime 0.002889092 0.009020319 0.3202872 106.17585 7.493801e-01 +# 7 age -0.007620985 0.009632902 -0.7911411 97.27459 4.307864e-01 +# 8 prior 0.003954604 0.023537476 0.1680131 111.98360 8.668760e-01 + +# Compare with the results on the original data summary(coxph(Surv(time, status) ~ ., veteran))$coefficients +# coef exp(coef) se(coef) z Pr(>|z|) +# trt 2.946028e-01 1.3425930 0.207549604 1.419433313 1.557727e-01 +# celltypesmallcell 8.615605e-01 2.3668512 0.275284474 3.129709606 1.749792e-03 +# celltypeadeno 1.196066e+00 3.3070825 0.300916994 3.974738536 7.045662e-05 +# celltypelarge 4.012917e-01 1.4937529 0.282688638 1.419553530 1.557377e-01 +# karno -3.281533e-02 0.9677173 0.005507757 -5.958020093 2.553121e-09 +# diagtime 8.132051e-05 1.0000813 0.009136062 0.008901046 9.928981e-01 +# age -8.706475e-03 0.9913313 0.009300299 -0.936149992 3.491960e-01 +# prior 7.159360e-03 1.0071850 0.023230538 0.308187441 7.579397e-01 ``` + ## References From 993aa6f69c258995686833ed08a5d98570eeb5a1 Mon Sep 17 00:00:00 2001 From: Michael Mayer Date: Fri, 24 Mar 2023 19:29:51 +0100 Subject: [PATCH 2/7] update docu --- R/generateNA.R | 4 +- R/imputeUnivariate.R | 6 ++- R/missRanger.R | 108 ++++++++++++++++------------------------ R/pmm.R | 15 ++++-- cran-comments.md | 9 +++- man/generateNA.Rd | 4 +- man/imputeUnivariate.Rd | 6 ++- man/missRanger.Rd | 105 ++++++++++++++++---------------------- man/pmm.Rd | 15 ++++-- 9 files changed, 132 insertions(+), 140 deletions(-) diff --git a/R/generateNA.R b/R/generateNA.R index a577e83..b64cc0c 100644 --- a/R/generateNA.R +++ b/R/generateNA.R @@ -3,7 +3,9 @@ #' Takes a vector, matrix or \code{data.frame} and replaces some values by \code{NA}. #' #' @param x A vector, matrix or \code{data.frame}. -#' @param p Proportion of missing values to add to \code{x}. In case \code{x} is a \code{data.frame}, \code{p} can also be a vector of probabilities per column or a named vector (see examples). +#' @param p Proportion of missing values to add to \code{x}. +#' In case \code{x} is a \code{data.frame}, \code{p} can also be a vector of +#' probabilities per column or a named vector (see examples). #' @param seed An integer seed. #' #' @return \code{x} with missing values. diff --git a/R/imputeUnivariate.R b/R/imputeUnivariate.R index 0dde202..98bf7c4 100644 --- a/R/imputeUnivariate.R +++ b/R/imputeUnivariate.R @@ -1,9 +1,11 @@ #' Univariate Imputation #' -#' Fills missing values of a vector, matrix or data frame by sampling with replacement from the non-missing values. For data frames, this sampling is done within column. +#' Fills missing values of a vector, matrix or data frame by sampling with replacement +#' from the non-missing values. For data frames, this sampling is done within column. #' #' @param x A vector, matrix or data frame. -#' @param v A character vector of column names to impute (only relevant if \code{x} is a data frame). The default \code{NULL} imputes all columns. +#' @param v A character vector of column names to impute (only relevant if \code{x} +#' is a data frame). The default \code{NULL} imputes all columns. #' @param seed An integer seed. #' #' @return \code{x} with imputed values. diff --git a/R/missRanger.R b/R/missRanger.R index 34b44d8..bd08d35 100644 --- a/R/missRanger.R +++ b/R/missRanger.R @@ -1,27 +1,58 @@ #' Fast Imputation of Missing Values by Chained Random Forests #' -#' Uses the "ranger" package (Wright & Ziegler) to do fast missing value imputation by chained random forests, see Stekhoven & Buehlmann and Van Buuren & Groothuis-Oudshoorn. -#' Between the iterative model fitting, it offers the option of predictive mean matching. This firstly avoids imputation with values not present in the original data (like a value 0.3334 in a 0-1 coded variable). Secondly, predictive mean matching tries to raise the variance in the resulting conditional distributions to a realistic level. This allows to do multiple imputation when repeating the call to missRanger(). -#' The iterative chaining stops as soon as \code{maxiter} is reached or if the average out-of-bag estimate of performance stops improving. In the latter case, except for the first iteration, the second last (i.e. best) imputed data is returned. +#' Uses the "ranger" package (Wright & Ziegler) to do fast missing value imputation by +#' chained random forests, see Stekhoven & Buehlmann and Van Buuren & Groothuis-Oudshoorn. +#' Between the iterative model fitting, it offers the option of predictive mean matching. +#' This firstly avoids imputation with values not present in the original data +#' (like a value 0.3334 in a 0-1 coded variable). +#' Secondly, predictive mean matching tries to raise the variance in the resulting +#' conditional distributions to a realistic level. This allows to do multiple imputation +#' when repeating the call to \code{missRanger()}. +#' The iterative chaining stops as soon as \code{maxiter} is reached or if the average +#' out-of-bag estimate of performance stops improving. +#' In the latter case, except for the first iteration, the second last (i.e. best) +#' imputed data is returned. #' -#' A note on `mtry`: Be careful when passing a non-default `mtry` to `ranger()` because the number of available covariables might be growing during the first iteration, depending on the missing pattern. Values \code{NULL} (default) and 1 are safe choices. Additionally, recent versions of `ranger()` allow `mtry` to be a single-argument function of the number of available covariables, e.g. `mtry = function(m) max(1, m %/% 3)`. +#' A note on \code{mtry}: Be careful when passing a non-default \code{mtry} to +#' \code{ranger()} because the number of available covariates might be growing during +#' the first iteration, depending on the missing pattern. +#' Values \code{NULL} (default) and 1 are safe choices. +#' Additionally, recent versions of \code{ranger()} allow \code{mtry} to be a +#' single-argument function of the number of available covariables, +#' e.g. \code{mtry = function(m) max(1, m %/% 3)}. #' #' @importFrom stats var reformulate terms.formula predict setNames #' @importFrom ranger ranger #' @importFrom utils setTxtProgressBar txtProgressBar #' @param data A \code{data.frame} or \code{tibble} with missing values to impute. -#' @param formula A two-sided formula specifying variables to be imputed (left hand side) and variables used to impute (right hand side). Defaults to . ~ ., i.e. use all variables to impute all variables. -#' If e.g. all variables (with missings) should be imputed by all variables except variable "ID", use . ~ . - ID. Note that a "." is evaluated separately for each side of the formula. Further note that variables -#' with missings must appear in the left hand side if they should be used on the right hand side. -#' @param pmm.k Number of candidate non-missing values to sample from in the predictive mean matching steps. 0 to avoid this step. +#' @param formula A two-sided formula specifying variables to be imputed +#' (left hand side) and variables used to impute (right hand side). +#' Defaults to \code{. ~ .}, i.e. use all variables to impute all variables. +#' If e.g. all variables (with missings) should be imputed by all variables +#' except variable "ID", use \code{. ~ . - ID}. Note that a "." is evaluated +#' separately for each side of the formula. Further note that variables with missings +#' must appear in the left hand side if they should be used on the right hand side. +#' @param pmm.k Number of candidate non-missing values to sample from in the +#' predictive mean matching steps. 0 to avoid this step. #' @param maxiter Maximum number of chaining iterations. #' @param seed Integer seed to initialize the random generator. -#' @param verbose Controls how much info is printed to screen. 0 to print nothing. 1 (default) to print a progress bar per iteration, 2 to print the OOB prediction error per iteration and variable (1 minus R-squared for regression). -#' Furthermore, if \code{verbose} is positive, the variables used for imputation are listed as well as the variables to be imputed (in the imputation order). This will be useful to detect if some variables are unexpectedly skipped. -#' @param returnOOB Logical flag. If TRUE, the final average out-of-bag prediction error is added to the output as attribute "oob". This does not work in the special case when the variables are imputed univariately. +#' @param verbose Controls how much info is printed to screen. +#' 0 to print nothing. 1 (default) to print a progress bar per iteration, +#' 2 to print the OOB prediction error per iteration and variable +#' (1 minus R-squared for regression). +#' Furthermore, if \code{verbose} is positive, the variables used for imputation are +#' listed as well as the variables to be imputed (in the imputation order). +#' This will be useful to detect if some variables are unexpectedly skipped. +#' @param returnOOB Logical flag. If TRUE, the final average out-of-bag prediction error +#' is added to the output as attribute "oob". This does not work in the special case +#' when the variables are imputed univariately. #' @param case.weights Vector with non-negative case weights. -#' @param ... Arguments passed to \code{ranger()}. If the data set is large, better use less trees (e.g. \code{num.trees = 20}) and/or a low value of \code{sample.fraction}. -#' The following arguments are e.g. incompatible with \code{ranger}: \code{write.forest}, \code{probability}, \code{split.select.weights}, \code{dependent.variable.name}, and \code{classification}. +#' @param ... Arguments passed to \code{ranger()}. If the data set is large, +#' better use less trees (e.g. \code{num.trees = 20}) and/or a low value of +#' \code{sample.fraction}. +#' The following arguments are e.g. incompatible: +#' \code{write.forest}, \code{probability}, \code{split.select.weights}, +#' \code{dependent.variable.name}, and \code{classification}. #' #' @return An imputed \code{data.frame}. #' @@ -38,57 +69,6 @@ #' irisImputed <- missRanger(irisWithNA, pmm.k = 3, num.trees = 100) #' head(irisImputed) #' head(irisWithNA) -#' -#' \dontrun{ -#' # With extra trees algorithm -#' irisImputed_et <- missRanger(irisWithNA, pmm.k = 3, num.trees = 100, splitrule = "extratrees") -#' head(irisImputed_et) -#' -#' # Passing `mtry` as a function of the number of covariables -# irisImputed_mtry <- missRanger(irisWithNA, pmm.k = 3, num.trees = 100, -# mtry = function(m) max(1, m %/% 3)) -# head(irisImputed_mtry) -#' -#' # Do not impute Species. Note: Since this variable contains missings, it won't be used -#' # for imputing other variables. -#' head(irisImputed <- missRanger(irisWithNA, . - Species ~ ., pmm.k = 3, num.trees = 100)) -#' -#' # Impute univariately only. -#' head(irisImputed <- missRanger(irisWithNA, . ~ 1)) -#' -#' # Use Species and Petal.Length to impute Species and Petal.Length. -#' head(irisImputed <- missRanger(irisWithNA, Species + Petal.Length ~ Species + Petal.Length, -#' pmm.k = 3, num.trees = 100)) -#' -#' # Multiple imputation: Fill data 20 times, run 20 analyses and pool their results. -#' require(mice) -#' filled <- replicate(20, missRanger(irisWithNA, verbose = 0, num.trees = 100, pmm.k = 5), -#' simplify = FALSE) -#' models <- lapply(filled, function(x) lm(Sepal.Length ~ ., x)) -#' summary(pooled_fit <- pool(models)) # Realistically inflated standard errors and p values -#' -#' # A data set with logicals, numerics, characters and factors. -#' n <- 100 -#' X <- data.frame(x1 = seq_len(n), -#' x2 = log(seq_len(n)), -#' x3 = sample(LETTERS[1:3], n, replace = TRUE), -#' x4 = factor(sample(LETTERS[1:3], n, replace = TRUE)), -#' x5 = seq_len(n) > 50) -#' head(X) -#' X_NA <- generateNA(X, p = seq(0, 0.8, by = .2)) -#' head(X_NA) -#' -#' head(X_imp <- missRanger(X_NA)) -#' head(X_imp <- missRanger(X_NA, pmm = 3)) -#' head(X_imp <- missRanger(X_NA, pmm = 3, verbose = 0)) -#' head(X_imp <- missRanger(X_NA, pmm = 3, verbose = 2, returnOOB = TRUE)) -#' attr(X_imp, "oob") # OOB prediction errors per column. -#' -#' # The formula interface -#' head(X_imp <- missRanger(X_NA, x2 ~ x2 + x3, pmm = 3)) # Does not use x3 because of NAs -#' head(X_imp <- missRanger(X_NA, x2 + x3 ~ x2 + x3, pmm = 3)) -#' head(X_imp <- missRanger(X_NA, x2 + x3 ~ 1, pmm = 3)) # Univariate imputation -#' } missRanger <- function(data, formula = . ~ ., pmm.k = 0L, maxiter = 10L, seed = NULL, verbose = 1, returnOOB = FALSE, case.weights = NULL, ...) { diff --git a/R/pmm.R b/R/pmm.R index e654689..f6f75f7 100644 --- a/R/pmm.R +++ b/R/pmm.R @@ -1,13 +1,20 @@ #' Predictive Mean Matching #' -#' For each value in the prediction vector \code{xtest}, one of the closest \code{k} values in the prediction vector \code{xtrain} is randomly chosen and its observed value in \code{ytrain} is returned. +#' For each value in the prediction vector \code{xtest}, one of the closest \code{k} +#' values in the prediction vector \code{xtrain} is randomly chosen and its observed +#' value in \code{ytrain} is returned. #' #' @importFrom stats rmultinom #' @importFrom FNN knnx.index #' -#' @param xtrain Vector with predicted values in the training data. Can be of type logical, numeric, character, or factor. -#' @param xtest Vector as \code{xtrain} with predicted values in the test data. Missing values are not allowed. -#' @param ytrain Vector of the observed values in the training data. Must be of same length as \code{xtrain}. Missing values in either of \code{xtrain} or \code{ytrain} will be dropped in a pairwise manner. +#' @param xtrain Vector with predicted values in the training data. +#' Can be of type logical, numeric, character, or factor. +#' @param xtest Vector as \code{xtrain} with predicted values in the test data. +#' Missing values are not allowed. +#' @param ytrain Vector of the observed values in the training data. +#' Must be of same length as \code{xtrain}. +#' Missing values in either of \code{xtrain} or \code{ytrain} will be dropped +#' in a pairwise manner. #' @param k Number of nearest neighbours to sample from. #' @param seed Integer random seed. #' diff --git a/cran-comments.md b/cran-comments.md index 757c37e..f0b64d8 100644 --- a/cran-comments.md +++ b/cran-comments.md @@ -12,4 +12,11 @@ ??? checking for future file timestamps ... NOTE unable to verify current time -## \ No newline at end of file +## RHub + +Found the following files/directories: + + 'lastMiKTeXException' + +## Winbuilder + diff --git a/man/generateNA.Rd b/man/generateNA.Rd index 4ae37e9..9e6b610 100644 --- a/man/generateNA.Rd +++ b/man/generateNA.Rd @@ -9,7 +9,9 @@ generateNA(x, p = 0.1, seed = NULL) \arguments{ \item{x}{A vector, matrix or \code{data.frame}.} -\item{p}{Proportion of missing values to add to \code{x}. In case \code{x} is a \code{data.frame}, \code{p} can also be a vector of probabilities per column or a named vector (see examples).} +\item{p}{Proportion of missing values to add to \code{x}. +In case \code{x} is a \code{data.frame}, \code{p} can also be a vector of +probabilities per column or a named vector (see examples).} \item{seed}{An integer seed.} } diff --git a/man/imputeUnivariate.Rd b/man/imputeUnivariate.Rd index 1177d96..4fcdf1d 100644 --- a/man/imputeUnivariate.Rd +++ b/man/imputeUnivariate.Rd @@ -9,7 +9,8 @@ imputeUnivariate(x, v = NULL, seed = NULL) \arguments{ \item{x}{A vector, matrix or data frame.} -\item{v}{A character vector of column names to impute (only relevant if \code{x} is a data frame). The default \code{NULL} imputes all columns.} +\item{v}{A character vector of column names to impute (only relevant if \code{x} +is a data frame). The default \code{NULL} imputes all columns.} \item{seed}{An integer seed.} } @@ -17,7 +18,8 @@ imputeUnivariate(x, v = NULL, seed = NULL) \code{x} with imputed values. } \description{ -Fills missing values of a vector, matrix or data frame by sampling with replacement from the non-missing values. For data frames, this sampling is done within column. +Fills missing values of a vector, matrix or data frame by sampling with replacement +from the non-missing values. For data frames, this sampling is done within column. } \examples{ imputeUnivariate(c(NA, 0, 1, 0, 1)) diff --git a/man/missRanger.Rd b/man/missRanger.Rd index a8af8c0..b559bc9 100644 --- a/man/missRanger.Rd +++ b/man/missRanger.Rd @@ -19,90 +19,73 @@ missRanger( \arguments{ \item{data}{A \code{data.frame} or \code{tibble} with missing values to impute.} -\item{formula}{A two-sided formula specifying variables to be imputed (left hand side) and variables used to impute (right hand side). Defaults to . ~ ., i.e. use all variables to impute all variables. -If e.g. all variables (with missings) should be imputed by all variables except variable "ID", use . ~ . - ID. Note that a "." is evaluated separately for each side of the formula. Further note that variables -with missings must appear in the left hand side if they should be used on the right hand side.} +\item{formula}{A two-sided formula specifying variables to be imputed +(left hand side) and variables used to impute (right hand side). +Defaults to \code{. ~ .}, i.e. use all variables to impute all variables. +If e.g. all variables (with missings) should be imputed by all variables +except variable "ID", use \code{. ~ . - ID}. Note that a "." is evaluated +separately for each side of the formula. Further note that variables with missings +must appear in the left hand side if they should be used on the right hand side.} -\item{pmm.k}{Number of candidate non-missing values to sample from in the predictive mean matching steps. 0 to avoid this step.} +\item{pmm.k}{Number of candidate non-missing values to sample from in the +predictive mean matching steps. 0 to avoid this step.} \item{maxiter}{Maximum number of chaining iterations.} \item{seed}{Integer seed to initialize the random generator.} -\item{verbose}{Controls how much info is printed to screen. 0 to print nothing. 1 (default) to print a progress bar per iteration, 2 to print the OOB prediction error per iteration and variable (1 minus R-squared for regression). -Furthermore, if \code{verbose} is positive, the variables used for imputation are listed as well as the variables to be imputed (in the imputation order). This will be useful to detect if some variables are unexpectedly skipped.} +\item{verbose}{Controls how much info is printed to screen. +0 to print nothing. 1 (default) to print a progress bar per iteration, +2 to print the OOB prediction error per iteration and variable +(1 minus R-squared for regression). +Furthermore, if \code{verbose} is positive, the variables used for imputation are +listed as well as the variables to be imputed (in the imputation order). +This will be useful to detect if some variables are unexpectedly skipped.} -\item{returnOOB}{Logical flag. If TRUE, the final average out-of-bag prediction error is added to the output as attribute "oob". This does not work in the special case when the variables are imputed univariately.} +\item{returnOOB}{Logical flag. If TRUE, the final average out-of-bag prediction error +is added to the output as attribute "oob". This does not work in the special case +when the variables are imputed univariately.} \item{case.weights}{Vector with non-negative case weights.} -\item{...}{Arguments passed to \code{ranger()}. If the data set is large, better use less trees (e.g. \code{num.trees = 20}) and/or a low value of \code{sample.fraction}. -The following arguments are e.g. incompatible with \code{ranger}: \code{write.forest}, \code{probability}, \code{split.select.weights}, \code{dependent.variable.name}, and \code{classification}.} +\item{...}{Arguments passed to \code{ranger()}. If the data set is large, +better use less trees (e.g. \code{num.trees = 20}) and/or a low value of +\code{sample.fraction}. +The following arguments are e.g. incompatible: +\code{write.forest}, \code{probability}, \code{split.select.weights}, +\code{dependent.variable.name}, and \code{classification}.} } \value{ An imputed \code{data.frame}. } \description{ -Uses the "ranger" package (Wright & Ziegler) to do fast missing value imputation by chained random forests, see Stekhoven & Buehlmann and Van Buuren & Groothuis-Oudshoorn. -Between the iterative model fitting, it offers the option of predictive mean matching. This firstly avoids imputation with values not present in the original data (like a value 0.3334 in a 0-1 coded variable). Secondly, predictive mean matching tries to raise the variance in the resulting conditional distributions to a realistic level. This allows to do multiple imputation when repeating the call to missRanger(). -The iterative chaining stops as soon as \code{maxiter} is reached or if the average out-of-bag estimate of performance stops improving. In the latter case, except for the first iteration, the second last (i.e. best) imputed data is returned. +Uses the "ranger" package (Wright & Ziegler) to do fast missing value imputation by +chained random forests, see Stekhoven & Buehlmann and Van Buuren & Groothuis-Oudshoorn. +Between the iterative model fitting, it offers the option of predictive mean matching. +This firstly avoids imputation with values not present in the original data +(like a value 0.3334 in a 0-1 coded variable). +Secondly, predictive mean matching tries to raise the variance in the resulting +conditional distributions to a realistic level. This allows to do multiple imputation +when repeating the call to \code{missRanger()}. +The iterative chaining stops as soon as \code{maxiter} is reached or if the average +out-of-bag estimate of performance stops improving. +In the latter case, except for the first iteration, the second last (i.e. best) +imputed data is returned. } \details{ -A note on \code{mtry}: Be careful when passing a non-default \code{mtry} to \code{ranger()} because the number of available covariables might be growing during the first iteration, depending on the missing pattern. Values \code{NULL} (default) and 1 are safe choices. Additionally, recent versions of \code{ranger()} allow \code{mtry} to be a single-argument function of the number of available covariables, e.g. \code{mtry = function(m) max(1, m \%/\% 3)}. +A note on \code{mtry}: Be careful when passing a non-default \code{mtry} to +\code{ranger()} because the number of available covariates might be growing during +the first iteration, depending on the missing pattern. +Values \code{NULL} (default) and 1 are safe choices. +Additionally, recent versions of \code{ranger()} allow \code{mtry} to be a +single-argument function of the number of available covariables, +e.g. \code{mtry = function(m) max(1, m \%/\% 3)}. } \examples{ irisWithNA <- generateNA(iris, seed = 34) irisImputed <- missRanger(irisWithNA, pmm.k = 3, num.trees = 100) head(irisImputed) head(irisWithNA) - -\dontrun{ -# With extra trees algorithm -irisImputed_et <- missRanger(irisWithNA, pmm.k = 3, num.trees = 100, splitrule = "extratrees") -head(irisImputed_et) - -# Passing `mtry` as a function of the number of covariables - -# Do not impute Species. Note: Since this variable contains missings, it won't be used -# for imputing other variables. -head(irisImputed <- missRanger(irisWithNA, . - Species ~ ., pmm.k = 3, num.trees = 100)) - -# Impute univariately only. -head(irisImputed <- missRanger(irisWithNA, . ~ 1)) - -# Use Species and Petal.Length to impute Species and Petal.Length. -head(irisImputed <- missRanger(irisWithNA, Species + Petal.Length ~ Species + Petal.Length, - pmm.k = 3, num.trees = 100)) - -# Multiple imputation: Fill data 20 times, run 20 analyses and pool their results. -require(mice) -filled <- replicate(20, missRanger(irisWithNA, verbose = 0, num.trees = 100, pmm.k = 5), - simplify = FALSE) -models <- lapply(filled, function(x) lm(Sepal.Length ~ ., x)) -summary(pooled_fit <- pool(models)) # Realistically inflated standard errors and p values - -# A data set with logicals, numerics, characters and factors. -n <- 100 -X <- data.frame(x1 = seq_len(n), - x2 = log(seq_len(n)), - x3 = sample(LETTERS[1:3], n, replace = TRUE), - x4 = factor(sample(LETTERS[1:3], n, replace = TRUE)), - x5 = seq_len(n) > 50) -head(X) -X_NA <- generateNA(X, p = seq(0, 0.8, by = .2)) -head(X_NA) - -head(X_imp <- missRanger(X_NA)) -head(X_imp <- missRanger(X_NA, pmm = 3)) -head(X_imp <- missRanger(X_NA, pmm = 3, verbose = 0)) -head(X_imp <- missRanger(X_NA, pmm = 3, verbose = 2, returnOOB = TRUE)) -attr(X_imp, "oob") # OOB prediction errors per column. - -# The formula interface -head(X_imp <- missRanger(X_NA, x2 ~ x2 + x3, pmm = 3)) # Does not use x3 because of NAs -head(X_imp <- missRanger(X_NA, x2 + x3 ~ x2 + x3, pmm = 3)) -head(X_imp <- missRanger(X_NA, x2 + x3 ~ 1, pmm = 3)) # Univariate imputation -} } \references{ \enumerate{ diff --git a/man/pmm.Rd b/man/pmm.Rd index 9166e04..33cac05 100644 --- a/man/pmm.Rd +++ b/man/pmm.Rd @@ -7,11 +7,16 @@ pmm(xtrain, xtest, ytrain, k = 1L, seed = NULL) } \arguments{ -\item{xtrain}{Vector with predicted values in the training data. Can be of type logical, numeric, character, or factor.} +\item{xtrain}{Vector with predicted values in the training data. +Can be of type logical, numeric, character, or factor.} -\item{xtest}{Vector as \code{xtrain} with predicted values in the test data. Missing values are not allowed.} +\item{xtest}{Vector as \code{xtrain} with predicted values in the test data. +Missing values are not allowed.} -\item{ytrain}{Vector of the observed values in the training data. Must be of same length as \code{xtrain}. Missing values in either of \code{xtrain} or \code{ytrain} will be dropped in a pairwise manner.} +\item{ytrain}{Vector of the observed values in the training data. +Must be of same length as \code{xtrain}. +Missing values in either of \code{xtrain} or \code{ytrain} will be dropped +in a pairwise manner.} \item{k}{Number of nearest neighbours to sample from.} @@ -21,7 +26,9 @@ pmm(xtrain, xtest, ytrain, k = 1L, seed = NULL) Vector of the same length as \code{xtest} with values from \code{xtrain}. } \description{ -For each value in the prediction vector \code{xtest}, one of the closest \code{k} values in the prediction vector \code{xtrain} is randomly chosen and its observed value in \code{ytrain} is returned. +For each value in the prediction vector \code{xtest}, one of the closest \code{k} +values in the prediction vector \code{xtrain} is randomly chosen and its observed +value in \code{ytrain} is returned. } \examples{ pmm(xtrain = c(0.2, 0.2, 0.8), xtest = 0.3, ytrain = c(0, 0, 1)) # 0 From a4c46bcc073370bb4ae7d679664ce5a37558b03a Mon Sep 17 00:00:00 2001 From: Michael Mayer Date: Fri, 24 Mar 2023 20:28:43 +0100 Subject: [PATCH 3/7] update 2023-03-24 20:28:43 --- cran-comments.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/cran-comments.md b/cran-comments.md index f0b64d8..88d2ac1 100644 --- a/cran-comments.md +++ b/cran-comments.md @@ -1,22 +1,22 @@ # missRanger 2.2.0 - removed suggested dependencies dplyr, mice, survival -- improved docu +- improved documentation ## R CMD check -??? checking for unstated dependencies in examples ... OK - WARNING +checking for unstated dependencies in examples ... OK + +WARNING 'qpdf' is needed for checks on size reduction of PDFs -??? checking for future file timestamps ... NOTE +checking for future file timestamps ... NOTE unable to verify current time -## RHub +## RHub -Found the following files/directories: +Note: Found the following files/directories: 'lastMiKTeXException' -## Winbuilder From f519bd0dad9824d7b27bc6fae86e18dbd91adf2c Mon Sep 17 00:00:00 2001 From: Michael Mayer Date: Fri, 24 Mar 2023 20:37:22 +0100 Subject: [PATCH 4/7] reverse dependency checks --- .Rbuildignore | 1 + .gitignore | 1 + cran-comments.md | 12 +++++++++++- packaging.R | 6 ++++++ 4 files changed, 19 insertions(+), 1 deletion(-) diff --git a/.Rbuildignore b/.Rbuildignore index b526e8b..f16d2e5 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -10,3 +10,4 @@ ^.*\.Rproj$ ^\.Rproj\.user$ ^\.github$ +^revdep$ diff --git a/.gitignore b/.gitignore index 36f5b4f..325ec1c 100644 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,4 @@ Meta /doc/ /Meta/ inst/doc +revdep diff --git a/cran-comments.md b/cran-comments.md index 88d2ac1..e36d37e 100644 --- a/cran-comments.md +++ b/cran-comments.md @@ -19,4 +19,14 @@ Note: Found the following files/directories: 'lastMiKTeXException' - +## Reverse dependency check of 7 packages + +??? hdImpute 0.1.1 ?????? E: 0 | W: 0 | N: 0 +??? marginaleffects 0.11.0 ?????? E: 0 | W: 0 | N: 0 +??? mlim 0.3.0 ?????? E: 0 | W: 0 | N: 0 +??? NADIA 0.4.2 ?????? E: 0 | W: 0 | N: 1 +??? outForest 0.1.2 ?????? E: 0 | W: 0 | N: 0 +??? wiseR 1.0.1 ?????? E: 0 | W: 0 | N: 3 +??? worcs 0.1.10 ?????? E: 0 | W: 0 | N: 0 +OK: 7 +BROKEN: 0 diff --git a/packaging.R b/packaging.R index 04b0f19..f974dc7 100644 --- a/packaging.R +++ b/packaging.R @@ -79,6 +79,9 @@ use_github_action("check-standard") use_github_action("test-coverage") use_github_action("pkgdown") +# Revdep + +use_revdep() #============================================================================= # Finish package building (can use fresh session) @@ -98,6 +101,9 @@ install() if (FALSE) { check_win_devel() check_rhub() + + # Takes long + revdepcheck::revdep_check() # Wait until above checks are passed without relevant notes/warnings # then submit to CRAN From 50248bcb89cd3e020872516f3ddfa8653104cf62 Mon Sep 17 00:00:00 2001 From: Michael Mayer Date: Fri, 24 Mar 2023 20:37:47 +0100 Subject: [PATCH 5/7] 4 workers --- packaging.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packaging.R b/packaging.R index f974dc7..8ccadf1 100644 --- a/packaging.R +++ b/packaging.R @@ -103,7 +103,7 @@ if (FALSE) { check_rhub() # Takes long - revdepcheck::revdep_check() + revdepcheck::revdep_check(num_workers = 4) # Wait until above checks are passed without relevant notes/warnings # then submit to CRAN From 7213048cfb0727fc7f705715bf603ab86550dc61 Mon Sep 17 00:00:00 2001 From: Michael Mayer Date: Fri, 24 Mar 2023 20:48:01 +0100 Subject: [PATCH 6/7] update cran comments --- cran-comments.md | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/cran-comments.md b/cran-comments.md index e36d37e..e5b93e2 100644 --- a/cran-comments.md +++ b/cran-comments.md @@ -15,18 +15,11 @@ checking for future file timestamps ... NOTE ## RHub -Note: Found the following files/directories: - - 'lastMiKTeXException' +Note: lastMiKTeXException ## Reverse dependency check of 7 packages -??? hdImpute 0.1.1 ?????? E: 0 | W: 0 | N: 0 -??? marginaleffects 0.11.0 ?????? E: 0 | W: 0 | N: 0 -??? mlim 0.3.0 ?????? E: 0 | W: 0 | N: 0 -??? NADIA 0.4.2 ?????? E: 0 | W: 0 | N: 1 -??? outForest 0.1.2 ?????? E: 0 | W: 0 | N: 0 -??? wiseR 1.0.1 ?????? E: 0 | W: 0 | N: 3 -??? worcs 0.1.10 ?????? E: 0 | W: 0 | N: 0 +- hdImpute 0.1.1 -- E: 0 | W: 0 | N: 0 - marginaleffects 0.11.0 -- E: 0 | W: 0 | N: 0 - mlim 0.3.0 -- E: 0 | W: 0 | N: 0 - NADIA 0.4.2 -- E: 0 | W: 0 | N: 1 - outForest 0.1.2 -- E: 0 | W: 0 | N: 0 - wiseR 1.0.1 -- E: 0 | W: 0 | N: 3 +- worcs 0.1.10 -- E: 0 | W: 0 | N: 0 OK: 7 BROKEN: 0 From 741596e9652a91617047c59e2c7dc23fb8619a5b Mon Sep 17 00:00:00 2001 From: Michael Mayer Date: Fri, 24 Mar 2023 20:50:52 +0100 Subject: [PATCH 7/7] CRAN candidate --- .Rbuildignore | 1 + CRAN-SUBMISSION | 3 +++ 2 files changed, 4 insertions(+) create mode 100644 CRAN-SUBMISSION diff --git a/.Rbuildignore b/.Rbuildignore index f16d2e5..a597561 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -11,3 +11,4 @@ ^\.Rproj\.user$ ^\.github$ ^revdep$ +^CRAN-SUBMISSION$ diff --git a/CRAN-SUBMISSION b/CRAN-SUBMISSION new file mode 100644 index 0000000..8bb114e --- /dev/null +++ b/CRAN-SUBMISSION @@ -0,0 +1,3 @@ +Version: 2.2.0 +Date: 2023-03-24 19:48:56 UTC +SHA: 7213048cfb0727fc7f705715bf603ab86550dc61