From 716169a879edd7258397b92890dc39a2874ef8d4 Mon Sep 17 00:00:00 2001 From: DominiqueMakowski Date: Fri, 10 Nov 2017 23:17:25 +0100 Subject: [PATCH 1/2] 0.0.3 --- DESCRIPTION | 5 +- NAMESPACE | 10 + NEWS.md | 9 + R/format_p.R | 2 +- R/n_factors.R | 177 ++++++++++++++++++ man/n_factors.Rd | 39 ++++ tests/testthat/test-n_factors.R | 12 ++ vignettes/overview.Rmd | 38 +++- vignettes/overview.html | 170 ++++++++++++++--- .../figure-html/unnamed-chunk-21-1.png | Bin 6238 -> 0 bytes 10 files changed, 433 insertions(+), 29 deletions(-) create mode 100644 R/n_factors.R create mode 100644 man/n_factors.Rd create mode 100644 tests/testthat/test-n_factors.R delete mode 100644 vignettes/overview_files/figure-html/unnamed-chunk-21-1.png diff --git a/DESCRIPTION b/DESCRIPTION index c2e3f59..df08f76 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: psycho Type: Package Title: Efficient and Publishing-Oriented Workflow for Psychological Science -Version: 0.0.2 +Version: 0.0.3 Authors@R: c( person("Dominique", "Makowski", @@ -33,6 +33,9 @@ Imports: tidyverse, rtf, psych, + MASS, + nFactors, + qgraph, ppcor, corrplot, rstanarm, diff --git a/NAMESPACE b/NAMESPACE index b3684d6..94ceb7c 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -13,6 +13,7 @@ export(find_season) export(format_digit) export(format_p) export(interpret_d) +export(n_factors) export(normalize) export(values) import(corrplot) @@ -23,10 +24,19 @@ import(ppcor) import(purrr) import(rstanarm) import(tidyr) +importFrom(MASS,ginv) +importFrom(MASS,mvrnorm) importFrom(MuMIn,r.squaredGLMM) importFrom(MuMIn,std.coef) +importFrom(nFactors,moreStats) +importFrom(nFactors,nScree) +importFrom(psych,VSS) importFrom(psych,corr.p) importFrom(psych,corr.test) +importFrom(qgraph,cor_auto) +importFrom(stats,cov) +importFrom(stats,dnorm) importFrom(stats,ecdf) importFrom(stats,na.omit) +importFrom(stats,qnorm) importFrom(stats,quantile) diff --git a/NEWS.md b/NEWS.md index c45322e..3d41d8b 100644 --- a/NEWS.md +++ b/NEWS.md 
@@ -1,3 +1,12 @@ +# 0.0.3 (2017-11-10) + +### Breaking changes +### New functions / parameters +- `n_factors`: How many factors to retain for PCA or factor analysis? +### Major changes +### Minor changes + + # 0.0.2 (2017-10-12) ### Breaking changes diff --git a/R/format_p.R b/R/format_p.R index 0638e35..31bfeb8 100644 --- a/R/format_p.R +++ b/R/format_p.R @@ -9,6 +9,6 @@ format_p <- function(pvalues) { ifelse(pvalues < 0.001, "< .001***", ifelse(pvalues < 0.01, "< .01**", ifelse(pvalues < 0.05, "< .05*", - ifelse(pvalues < 0.1, paste0(round(pvalues, 2), "\xB0"), + ifelse(pvalues < 0.1, paste0("= ", round(pvalues, 2), "\xB0"), "> .1")))) } diff --git a/R/n_factors.R b/R/n_factors.R new file mode 100644 index 0000000..c33a12c --- /dev/null +++ b/R/n_factors.R @@ -0,0 +1,177 @@ +#' Find Optimal Factor Number. +#' +#' Find optimal factor number using various solutions. +#' +#' @param df The dataframe +#' @param rotate What rotation to use c("none", "varimax", "oblimin","promax") +#' @param fm Factoring method – fm="pa" Principal Axis Factor Analysis, +#' fm = "minres" minimum residual (OLS) factoring fm="mle" Maximum Likelihood FA, +#' fm="pc" Principal Components" +#' @param n_max How many factors to test. 
+#' +#' @return output +#' +#' @examples +#' df <- dplyr::select_if(attitude, is.numeric) +#' results <- psycho::n_factors(df) +#' +#' summary(results) +#' plot(results) +#' +#' # See details on methods +#' psycho::values(results)$methods +#' +#' @author \href{https://dominiquemakowski.github.io/}{Dominique Makowski} +#' +#' @importFrom qgraph cor_auto +#' @importFrom psych VSS +#' @importFrom MASS mvrnorm +#' @importFrom MASS ginv +#' @importFrom nFactors moreStats +#' @importFrom nFactors nScree +#' @importFrom stats cov +#' @importFrom stats dnorm +#' @importFrom stats qnorm +#' @export +n_factors <- function(df, rotate="varimax", fm="minres", n_max=8){ + + # Copy the parallel function from nFactors to correct the use of mvrnorm + parallel <- function (subject = 100, var = 10, rep = 100, cent = 0.05, quantile = cent, + model = "components", sd = diag(1, var), ...) + { + r <- subject + c <- var + y <- matrix(c(1:r * c), nrow = r, ncol = c) + ycor <- matrix(c(1:c * c), nrow = c, ncol = c) + evpea <- NULL + leg.txt <- "Pearson" + for (k in c(1:rep)) { + y <- MASS::mvrnorm(n = r, mu = rep(0, var), Sigma = sd, empirical = FALSE) + corY <- cov(y, ...) 
+ if (model == "components") + diag(corY) <- diag(sd) + if (model == "factors") + corY <- corY - MASS::ginv(diag(diag(MASS::ginv(corY)))) + evpea <- rbind(evpea, eigen(corY)[[1]]) + } + SEcentile <- function(sd, n = 100, p = 0.95) { + return(sd/sqrt(n) * sqrt(p * (1 - p))/dnorm(qnorm(p))) + } + sprob <- c(cent) + mevpea <- sapply(as.data.frame(evpea), mean) + sevpea <- sapply(as.data.frame(evpea), sd) + qevpea <- nFactors::moreStats(evpea, quantile = quantile)[3, ] + sqevpea <- sevpea + sqevpea <- sapply(as.data.frame(sqevpea), SEcentile, n = rep, + p = cent) + result <- list(eigen = data.frame(mevpea, sevpea, qevpea, + sqevpea), subject = r, variables = c, centile = cent) + class(result) <- "parallel" + return(result) + } + + + cor <- qgraph::cor_auto(df) + + ap <- parallel(subject=nrow(df), var=ncol(df)) + nS <- nFactors::nScree(x=eigen(cor)$values, aparallel=ap$eigen$qevpea) + + # Eigeinvalues data + eigenvalues <- nS$Analysis %>% + dplyr::select_("Eigenvalues", + "Exp.Variance"="Prop", + "Cum.Variance"="Cumu") %>% + mutate_("n.Factors"= ~1:nrow(nS$Analysis)) + + + # Processing + # ------------------- + results <- data.frame(Method=c("Optimal Coordinates", "Acceleration Factor", "Parallel Analysis", "Eigenvalues (Kaiser Criterion)"), n_optimal=as.numeric(nS$Components[1,])) + + + vss <- psych::VSS(cor, n=n_max, n.obs=nrow(df), rotate=rotate, fm=fm, plot=F) # fm can be "pa", "pc", "minres", "mle" + stats <- vss$vss.stats + stats$map <- vss$map + stats$n_factors <- 1:nrow(stats) + + results2 <- data.frame(Method=c("Velicer MAP", + "BIC", + "Sample Size Adjusted BIC"), + n_optimal=c(na.omit(stats[stats$map==min(stats$map, na.rm = T),])$n_factors, + na.omit(stats[stats$BIC==min(stats$BIC, na.rm = T),])$n_factors, + na.omit(stats[stats$SABIC==min(stats$SABIC, na.rm = T),])$n_factors)) + results <- rbind(results, results2) + + + cfits <- vss[grep("cfit", names(vss))] + for (name in names(cfits)){ + cfit <- cfits[[name]] + + cfit <- data.frame(cfit=cfit, 
n_factors=1:length(cfit)) + + result3 <- data.frame(Method=c(gsub("cfit.", "VSS Complexity ", name)), + n_optimal=c(na.omit(cfit[cfit$cfit==max(cfit$cfit, na.rm = T),])$n_factors)) + + results <- rbind(results, result3) + } + + + eigenvalues <- results %>% + group_by_("n_optimal") %>% + summarise_("n_method"=~n()) %>% + mutate_("n_optimal"=~factor(n_optimal, levels=1:nrow(eigenvalues))) %>% + complete_("n_optimal", fill=list(n_method = 0)) %>% + arrange_("n_optimal") %>% + rename_("n.Factors"="n_optimal", + "n.Methods"="n_method") %>% + mutate_("n.Factors"=~as.integer(n.Factors)) %>% + left_join(eigenvalues, by="n.Factors") + + + # Values + # ------------- + values <- list(eigenvalues=eigenvalues, methods=results) + + # Summary + # ------------- + summary <- eigenvalues + + # Text + # ------------- + text <- "Not implemented yet :(" + + + # Plot + # ------------- + plot_data <- eigenvalues + plot_data$n.Methods.Ratio <- plot_data$n.Methods/sum(plot_data$n.Methods) + plot_data$n.Methods.Ratio <- plot_data$n.Methods.Ratio*(1/max(plot_data$n.Methods.Ratio)) + plot_data$area <- plot_data$n.Methods.Ratio/(max(plot_data$n.Methods.Ratio) / max(plot_data$Eigenvalues)) + plot_data$var <- plot_data$Cum.Variance/(max(plot_data$Cum.Variance) / max(plot_data$Eigenvalues)) + + plot <- plot_data %>% + ggplot(aes_string(x="n.Factors", y="Eigenvalues")) + + geom_area(aes_string(y="area"), + fill="#FFC107", + alpha=0.5) + + geom_line(colour="#E91E63", + size=1) + + geom_hline(yintercept = 1, linetype="dashed", colour="#607D8B") + + geom_line(aes_string(y = "var"), + colour="#2196F3", + size=1) + + scale_y_continuous(sec.axis = sec_axis(trans= ~.*(max(eigenvalues$Cum.Variance) / max(eigenvalues$Eigenvalues)), + name = 'Cumulative Variance\n')) + + ylab("Eigenvalues\n") + + xlab("\nNumber of Factors") + + theme_minimal() + plot + + # Output + # ------------- + output <- list(text = text, plot = plot, summary = summary, values = values) + + class(output) <- c("psychobject", "list") + 
return(output) + +} diff --git a/man/n_factors.Rd b/man/n_factors.Rd new file mode 100644 index 0000000..83f3d39 --- /dev/null +++ b/man/n_factors.Rd @@ -0,0 +1,39 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/n_factors.R +\name{n_factors} +\alias{n_factors} +\title{Find Optimal Factor Number.} +\usage{ +n_factors(df, rotate = "varimax", fm = "minres", n_max = 8) +} +\arguments{ +\item{df}{The dataframe} + +\item{rotate}{What rotation to use c("none", "varimax", "oblimin","promax")} + +\item{fm}{Factoring method – fm="pa" Principal Axis Factor Analysis, +fm = "minres" minimum residual (OLS) factoring fm="mle" Maximum Likelihood FA, +fm="pc" Principal Components"} + +\item{n_max}{How many factors to test.} +} +\value{ +output +} +\description{ +Find optimal factor number using various solutions. +} +\examples{ +df <- dplyr::select_if(attitude, is.numeric) +results <- psycho::n_factors(df) + +summary(results) +plot(results) + +# See details on methods +psycho::values(results)$methods + +} +\author{ +\href{https://dominiquemakowski.github.io/}{Dominique Makowski} +} diff --git a/tests/testthat/test-n_factors.R b/tests/testthat/test-n_factors.R new file mode 100644 index 0000000..9e60768 --- /dev/null +++ b/tests/testthat/test-n_factors.R @@ -0,0 +1,12 @@ +context("n_factors") + +test_that("Correct Value", { + results <- attitude %>% + select_if(is.numeric) %>% + psycho::n_factors() + + testthat::expect_equal(nrow(summary(results)), 7) + testthat::expect_equal(nrow(psycho::values(results)$methods), 9) + testthat::expect_equal(length(plot(results)), 9) + +}) diff --git a/vignettes/overview.Rmd b/vignettes/overview.Rmd index 82a14b6..cbfca74 100644 --- a/vignettes/overview.Rmd +++ b/vignettes/overview.Rmd @@ -8,7 +8,7 @@ author: date: "`r Sys.Date()`" tags: [r, psychology, neuroscience] abstract: | - Psycho is an R package that aims at providing tools for psychologists, neuropsychologists and neuroscientists, to transform statistical 
outputs into something readable that can be, almost directly, copied and pasted into a report. It also implements various functions useful in psychological science, such as correlation matrices, assessment plot creation or normalization. The package revolves around the psychobject. Main functions from the package return this type, and the analyze() function transforms other R objects (for now, only stan_lmer type) into psychobjects. Four functions can then be applied on a psychobject: summary(), print(), plot() and values(). Contrary to many other packages which goal is to produce statistical analyzes, `psycho`'s goal is to fill the gap between statistical R output and statistical report writing, with a focus on APA formatting guidelines. Complex outputs, such as those of Bayesian linear models, are automatically transformed into readable text, important values are extracted and plots are drawn to illustrate the effects. Thus, the results can easily be incorporated into shareable reports and publications, saving time and preventing errors for better, reproducible, science. + Psycho is an R package that aims at providing tools for psychologists, neuropsychologists and neuroscientists, to transform statistical outputs into something readable that can be, almost directly, copied and pasted into a report. It also implements various functions useful in psychological science, such as correlation matrices, assessment plot creation or normalization. The package revolves around the psychobject. Main functions from the package return this type, and the `analyze()` function transforms other R objects into psychobjects. Four functions can then be applied on a psychobject: `summary()`, `print()`, `plot()` and `values()`. 
Contrary to many other packages which goal is to produce statistical analyzes, `psycho` aims at filling the gap between statistical R outputs and statistical report writing, with a focus on APA formatting guidelines, to enhance the standardization of results reporting. Complex outputs, such as those of Bayesian and frequentist mixed models, are automatically transformed into readable text, tables, and plots that illustrate the effects. Thus, the results can easily be incorporated into shareable reports and publications, promoting data exploration, saving time and preventing errors for better, reproducible, science. vignette: > %\VignetteIndexEntry{Overview} %\VignetteEngine{knitr::rmarkdown} @@ -149,7 +149,43 @@ print(results) plot(results) ``` +------ + +## How many factors/components to retain? + +The `n_factors()` function is useful before running principal component (PCA) or factor (FA) analysis. As many statistical methods exist for that purpose, this function gathers them together and gives an overview of the most frequent result. It also draws a nice plot with the eigenvalues and the proportion of explained variance. 
+ + +```{r echo=FALSE, message=FALSE, warning=FALSE, results='hide'} +results <- attitude %>% + select_if(is.numeric) %>% + psycho::n_factors() + +# Get a summary +summary(results) +``` + +```{r echo=FALSE, message=FALSE, warning=FALSE} +kable(summary(results)) +``` +We can also extract the final result (the optimal number of factors) for each method: + +```{r echo=FALSE, message=FALSE, warning=FALSE, results='hide'} +psycho::values(results)$methods +``` + +```{r echo=FALSE, message=FALSE, warning=FALSE} +kable(psycho::values(results)$methods) +``` + +And, of course, plot it :) + +```{r, fig.width=7, fig.height=4.5, eval = TRUE, results='markup', fig.align='center'} +plot(results) +``` + + ------ ## Analyze the Mixed-Modelling Framework diff --git a/vignettes/overview.html b/vignettes/overview.html index 560c6ed..7a415b7 100644 --- a/vignettes/overview.html +++ b/vignettes/overview.html @@ -12,7 +12,7 @@ - + psycho for R @@ -80,10 +80,10 @@

psycho for R

Dominique Makowski

-

2017-10-13

+

2017-11-10

Abstract

-

Psycho is an R package that aims at providing tools for psychologists, neuropsychologists and neuroscientists, to transform statistical outputs into something readable that can be, almost directly, copied and pasted into a report. It also implements various functions useful in psychological science, such as correlation matrices, assessment plot creation or normalization. The package revolves around the psychobject. Main functions from the package return this type, and the analyze() function transforms other R objects (for now, only stan_lmer type) into psychobjects. Four functions can then be applied on a psychobject: summary(), print(), plot() and values(). Contrary to many other packages which goal is to produce statistical analyzes, psycho’s goal is to fill the gap between statistical R output and statistical report writing, with a focus on APA formatting guidelines. Complex outputs, such as those of Bayesian linear models, are automatically transformed into readable text, important values are extracted and plots are drawn to illustrate the effects. Thus, the results can easily be incorporated into shareable reports and publications, saving time and preventing errors for better, reproducible, science.

+

Psycho is an R package that aims at providing tools for psychologists, neuropsychologists and neuroscientists, to transform statistical outputs into something readable that can be, almost directly, copied and pasted into a report. It also implements various functions useful in psychological science, such as correlation matrices, assessment plot creation or normalization. The package revolves around the psychobject. Main functions from the package return this type, and the analyze() function transforms other R objects into psychobjects. Four functions can then be applied on a psychobject: summary(), print(), plot() and values(). Contrary to many other packages which goal is to produce statistical analyzes, psycho aims at filling the gap between statistical R outputs and statistical report writing, with a focus on APA formatting guidelines, to enhance the standardization of results reporting. Complex outputs, such as those of Bayesian and frequentist mixed models, are automatically transformed into readable text, tables, and plots that illustrate the effects. Thus, the results can easily be incorporated into shareable reports and publications, promoting data exploration, saving time and preventing errors for better, reproducible, science.

@@ -100,6 +100,7 @@

2017-10-13

  • Normalize / Z-score / Scale
  • Assess
  • +
  • How many factors/components to retain?
  • Analyze the Mixed-Modelling Framework