diff --git a/DESCRIPTION b/DESCRIPTION index c2e3f59..df08f76 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: psycho Type: Package Title: Efficient and Publishing-Oriented Workflow for Psychological Science -Version: 0.0.2 +Version: 0.0.3 Authors@R: c( person("Dominique", "Makowski", @@ -33,6 +33,9 @@ Imports: tidyverse, rtf, psych, + MASS, + nFactors, + qgraph, ppcor, corrplot, rstanarm, diff --git a/NAMESPACE b/NAMESPACE index b3684d6..94ceb7c 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -13,6 +13,7 @@ export(find_season) export(format_digit) export(format_p) export(interpret_d) +export(n_factors) export(normalize) export(values) import(corrplot) @@ -23,10 +24,19 @@ import(ppcor) import(purrr) import(rstanarm) import(tidyr) +importFrom(MASS,ginv) +importFrom(MASS,mvrnorm) importFrom(MuMIn,r.squaredGLMM) importFrom(MuMIn,std.coef) +importFrom(nFactors,moreStats) +importFrom(nFactors,nScree) +importFrom(psych,VSS) importFrom(psych,corr.p) importFrom(psych,corr.test) +importFrom(qgraph,cor_auto) +importFrom(stats,cov) +importFrom(stats,dnorm) importFrom(stats,ecdf) importFrom(stats,na.omit) +importFrom(stats,qnorm) importFrom(stats,quantile) diff --git a/NEWS.md b/NEWS.md index c45322e..3d41d8b 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,12 @@ +# 0.0.3 (2017-11-10) + +### Breaking changes +### New functions / parameters +- `n_factors`: How many factors to retain for PCA or factor analysis? 
+### Major changes +### Minor changes + + # 0.0.2 (2017-10-12) ### Breaking changes diff --git a/R/format_p.R b/R/format_p.R index 0638e35..31bfeb8 100644 --- a/R/format_p.R +++ b/R/format_p.R @@ -9,6 +9,6 @@ format_p <- function(pvalues) { ifelse(pvalues < 0.001, "< .001***", ifelse(pvalues < 0.01, "< .01**", ifelse(pvalues < 0.05, "< .05*", - ifelse(pvalues < 0.1, paste0(round(pvalues, 2), "\xB0"), + ifelse(pvalues < 0.1, paste0("= ", round(pvalues, 2), "\xB0"), "> .1")))) } diff --git a/R/n_factors.R b/R/n_factors.R new file mode 100644 index 0000000..c33a12c --- /dev/null +++ b/R/n_factors.R @@ -0,0 +1,177 @@ +#' Find Optimal Factor Number. +#' +#' Find optimal factor number using various solutions. +#' +#' @param df The dataframe +#' @param rotate What rotation to use c("none", "varimax", "oblimin","promax") +#' @param fm Factoring method – fm="pa" Principal Axis Factor Analysis, +#' fm = "minres" minimum residual (OLS) factoring fm="mle" Maximum Likelihood FA, +#' fm="pc" Principal Components" +#' @param n_max How many factors to test. +#' +#' @return output +#' +#' @examples +#' df <- dplyr::select_if(attitude, is.numeric) +#' results <- psycho::n_factors(df) +#' +#' summary(results) +#' plot(results) +#' +#' # See details on methods +#' psycho::values(results)$methods +#' +#' @author \href{https://dominiquemakowski.github.io/}{Dominique Makowski} +#' +#' @importFrom qgraph cor_auto +#' @importFrom psych VSS +#' @importFrom MASS mvrnorm +#' @importFrom MASS ginv +#' @importFrom nFactors moreStats +#' @importFrom nFactors nScree +#' @importFrom stats cov +#' @importFrom stats dnorm +#' @importFrom stats qnorm +#' @export +n_factors <- function(df, rotate="varimax", fm="minres", n_max=8){ + + # Copy the parallel function from nFactors to correct the use of mvrnorm + parallel <- function (subject = 100, var = 10, rep = 100, cent = 0.05, quantile = cent, + model = "components", sd = diag(1, var), ...) 
+ { + r <- subject + c <- var + y <- matrix(c(1:r * c), nrow = r, ncol = c) + ycor <- matrix(c(1:c * c), nrow = c, ncol = c) + evpea <- NULL + leg.txt <- "Pearson" + for (k in c(1:rep)) { + y <- MASS::mvrnorm(n = r, mu = rep(0, var), Sigma = sd, empirical = FALSE) + corY <- cov(y, ...) + if (model == "components") + diag(corY) <- diag(sd) + if (model == "factors") + corY <- corY - MASS::ginv(diag(diag(MASS::ginv(corY)))) + evpea <- rbind(evpea, eigen(corY)[[1]]) + } + SEcentile <- function(sd, n = 100, p = 0.95) { + return(sd/sqrt(n) * sqrt(p * (1 - p))/dnorm(qnorm(p))) + } + sprob <- c(cent) + mevpea <- sapply(as.data.frame(evpea), mean) + sevpea <- sapply(as.data.frame(evpea), sd) + qevpea <- nFactors::moreStats(evpea, quantile = quantile)[3, ] + sqevpea <- sevpea + sqevpea <- sapply(as.data.frame(sqevpea), SEcentile, n = rep, + p = cent) + result <- list(eigen = data.frame(mevpea, sevpea, qevpea, + sqevpea), subject = r, variables = c, centile = cent) + class(result) <- "parallel" + return(result) + } + + + cor <- qgraph::cor_auto(df) + + ap <- parallel(subject=nrow(df), var=ncol(df)) + nS <- nFactors::nScree(x=eigen(cor)$values, aparallel=ap$eigen$qevpea) + + # Eigeinvalues data + eigenvalues <- nS$Analysis %>% + dplyr::select_("Eigenvalues", + "Exp.Variance"="Prop", + "Cum.Variance"="Cumu") %>% + mutate_("n.Factors"= ~1:nrow(nS$Analysis)) + + + # Processing + # ------------------- + results <- data.frame(Method=c("Optimal Coordinates", "Acceleration Factor", "Parallel Analysis", "Eigenvalues (Kaiser Criterion)"), n_optimal=as.numeric(nS$Components[1,])) + + + vss <- psych::VSS(cor, n=n_max, n.obs=nrow(df), rotate=rotate, fm=fm, plot=F) # fm can be "pa", "pc", "minres", "mle" + stats <- vss$vss.stats + stats$map <- vss$map + stats$n_factors <- 1:nrow(stats) + + results2 <- data.frame(Method=c("Velicer MAP", + "BIC", + "Sample Size Adjusted BIC"), + n_optimal=c(na.omit(stats[stats$map==min(stats$map, na.rm = T),])$n_factors, + 
na.omit(stats[stats$BIC==min(stats$BIC, na.rm = T),])$n_factors, + na.omit(stats[stats$SABIC==min(stats$SABIC, na.rm = T),])$n_factors)) + results <- rbind(results, results2) + + + cfits <- vss[grep("cfit", names(vss))] + for (name in names(cfits)){ + cfit <- cfits[[name]] + + cfit <- data.frame(cfit=cfit, n_factors=1:length(cfit)) + + result3 <- data.frame(Method=c(gsub("cfit.", "VSS Complexity ", name)), + n_optimal=c(na.omit(cfit[cfit$cfit==max(cfit$cfit, na.rm = T),])$n_factors)) + + results <- rbind(results, result3) + } + + + eigenvalues <- results %>% + group_by_("n_optimal") %>% + summarise_("n_method"=~n()) %>% + mutate_("n_optimal"=~factor(n_optimal, levels=1:nrow(eigenvalues))) %>% + complete_("n_optimal", fill=list(n_method = 0)) %>% + arrange_("n_optimal") %>% + rename_("n.Factors"="n_optimal", + "n.Methods"="n_method") %>% + mutate_("n.Factors"=~as.integer(n.Factors)) %>% + left_join(eigenvalues, by="n.Factors") + + + # Values + # ------------- + values <- list(eigenvalues=eigenvalues, methods=results) + + # Summary + # ------------- + summary <- eigenvalues + + # Text + # ------------- + text <- "Not implemented yet :(" + + + # Plot + # ------------- + plot_data <- eigenvalues + plot_data$n.Methods.Ratio <- plot_data$n.Methods/sum(plot_data$n.Methods) + plot_data$n.Methods.Ratio <- plot_data$n.Methods.Ratio*(1/max(plot_data$n.Methods.Ratio)) + plot_data$area <- plot_data$n.Methods.Ratio/(max(plot_data$n.Methods.Ratio) / max(plot_data$Eigenvalues)) + plot_data$var <- plot_data$Cum.Variance/(max(plot_data$Cum.Variance) / max(plot_data$Eigenvalues)) + + plot <- plot_data %>% + ggplot(aes_string(x="n.Factors", y="Eigenvalues")) + + geom_area(aes_string(y="area"), + fill="#FFC107", + alpha=0.5) + + geom_line(colour="#E91E63", + size=1) + + geom_hline(yintercept = 1, linetype="dashed", colour="#607D8B") + + geom_line(aes_string(y = "var"), + colour="#2196F3", + size=1) + + scale_y_continuous(sec.axis = sec_axis(trans= ~.*(max(eigenvalues$Cum.Variance) / 
max(eigenvalues$Eigenvalues)), + name = 'Cumulative Variance\n')) + + ylab("Eigenvalues\n") + + xlab("\nNumber of Factors") + + theme_minimal() + plot + + # Output + # ------------- + output <- list(text = text, plot = plot, summary = summary, values = values) + + class(output) <- c("psychobject", "list") + return(output) + +} diff --git a/man/n_factors.Rd b/man/n_factors.Rd new file mode 100644 index 0000000..83f3d39 --- /dev/null +++ b/man/n_factors.Rd @@ -0,0 +1,39 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/n_factors.R +\name{n_factors} +\alias{n_factors} +\title{Find Optimal Factor Number.} +\usage{ +n_factors(df, rotate = "varimax", fm = "minres", n_max = 8) +} +\arguments{ +\item{df}{The dataframe} + +\item{rotate}{What rotation to use c("none", "varimax", "oblimin","promax")} + +\item{fm}{Factoring method – fm="pa" Principal Axis Factor Analysis, +fm = "minres" minimum residual (OLS) factoring fm="mle" Maximum Likelihood FA, +fm="pc" Principal Components"} + +\item{n_max}{How many factors to test.} +} +\value{ +output +} +\description{ +Find optimal factor number using various solutions. 
+} +\examples{ +df <- dplyr::select_if(attitude, is.numeric) +results <- psycho::n_factors(df) + +summary(results) +plot(results) + +# See details on methods +psycho::values(results)$methods + +} +\author{ +\href{https://dominiquemakowski.github.io/}{Dominique Makowski} +} diff --git a/tests/testthat/test-n_factors.R b/tests/testthat/test-n_factors.R new file mode 100644 index 0000000..9e60768 --- /dev/null +++ b/tests/testthat/test-n_factors.R @@ -0,0 +1,12 @@ +context("n_factors") + +test_that("Correct Value", { + results <- attitude %>% + select_if(is.numeric) %>% + psycho::n_factors() + + testthat::expect_equal(nrow(summary(results)), 7) + testthat::expect_equal(nrow(psycho::values(results)$methods), 9) + testthat::expect_equal(length(plot(results)), 9) + +}) diff --git a/vignettes/overview.Rmd b/vignettes/overview.Rmd index 82a14b6..cbfca74 100644 --- a/vignettes/overview.Rmd +++ b/vignettes/overview.Rmd @@ -8,7 +8,7 @@ author: date: "`r Sys.Date()`" tags: [r, psychology, neuroscience] abstract: | - Psycho is an R package that aims at providing tools for psychologists, neuropsychologists and neuroscientists, to transform statistical outputs into something readable that can be, almost directly, copied and pasted into a report. It also implements various functions useful in psychological science, such as correlation matrices, assessment plot creation or normalization. The package revolves around the psychobject. Main functions from the package return this type, and the analyze() function transforms other R objects (for now, only stan_lmer type) into psychobjects. Four functions can then be applied on a psychobject: summary(), print(), plot() and values(). Contrary to many other packages which goal is to produce statistical analyzes, `psycho`'s goal is to fill the gap between statistical R output and statistical report writing, with a focus on APA formatting guidelines. 
Complex outputs, such as those of Bayesian linear models, are automatically transformed into readable text, important values are extracted and plots are drawn to illustrate the effects. Thus, the results can easily be incorporated into shareable reports and publications, saving time and preventing errors for better, reproducible, science. + Psycho is an R package that aims at providing tools for psychologists, neuropsychologists and neuroscientists, to transform statistical outputs into something readable that can be, almost directly, copied and pasted into a report. It also implements various functions useful in psychological science, such as correlation matrices, assessment plot creation or normalization. The package revolves around the psychobject. Main functions from the package return this type, and the `analyze()` function transforms other R objects into psychobjects. Four functions can then be applied on a psychobject: `summary()`, `print()`, `plot()` and `values()`. Contrary to many other packages whose goal is to produce statistical analyses, `psycho` aims at filling the gap between statistical R outputs and statistical report writing, with a focus on APA formatting guidelines, to enhance the standardization of results reporting. Complex outputs, such as those of Bayesian and frequentist mixed models, are automatically transformed into readable text, tables, and plots that illustrate the effects. Thus, the results can easily be incorporated into shareable reports and publications, promoting data exploration, saving time and preventing errors for better, reproducible, science. vignette: > %\VignetteIndexEntry{Overview} %\VignetteEngine{knitr::rmarkdown} @@ -149,7 +149,43 @@ print(results) plot(results) ``` +------ + +## How many factors/components to retain? + +The `n_factors()` function is useful before running principal component (PCA) or factor (FA) analysis. 
As many statistical methods exist for that purpose, this function gathers them together and gives an overview of the most frequent result. It also draws a nice plot with the eigenvalues and the proportion of explained variance. + + +```{r echo=FALSE, message=FALSE, warning=FALSE, results='hide'} +results <- attitude %>% + select_if(is.numeric) %>% + psycho::n_factors() + +# Get a summary +summary(results) +``` + +```{r echo=FALSE, message=FALSE, warning=FALSE} +kable(summary(results)) +``` +We can also extract the final result (the optimal number of factors) for each method: + +```{r echo=FALSE, message=FALSE, warning=FALSE, results='hide'} +psycho::values(results)$methods +``` + +```{r echo=FALSE, message=FALSE, warning=FALSE} +kable(psycho::values(results)$methods) +``` + +And, of course, plot it :) + +```{r, fig.width=7, fig.height=4.5, eval = TRUE, results='markup', fig.align='center'} +plot(results) +``` + + ------ ## Analyze the Mixed-Modelling Framework diff --git a/vignettes/overview.html b/vignettes/overview.html index 560c6ed..7a415b7 100644 --- a/vignettes/overview.html +++ b/vignettes/overview.html @@ -12,7 +12,7 @@ - +
Abstract
-Psycho is an R package that aims at providing tools for psychologists, neuropsychologists and neuroscientists, to transform statistical outputs into something readable that can be, almost directly, copied and pasted into a report. It also implements various functions useful in psychological science, such as correlation matrices, assessment plot creation or normalization. The package revolves around the psychobject. Main functions from the package return this type, and the analyze() function transforms other R objects (for now, only stan_lmer type) into psychobjects. Four functions can then be applied on a psychobject: summary(), print(), plot() and values(). Contrary to many other packages which goal is to produce statistical analyzes, psycho
’s goal is to fill the gap between statistical R output and statistical report writing, with a focus on APA formatting guidelines. Complex outputs, such as those of Bayesian linear models, are automatically transformed into readable text, important values are extracted and plots are drawn to illustrate the effects. Thus, the results can easily be incorporated into shareable reports and publications, saving time and preventing errors for better, reproducible, science.
Psycho is an R package that aims at providing tools for psychologists, neuropsychologists and neuroscientists, to transform statistical outputs into something readable that can be, almost directly, copied and pasted into a report. It also implements various functions useful in psychological science, such as correlation matrices, assessment plot creation or normalization. The package revolves around the psychobject. Main functions from the package return this type, and the analyze()
function transforms other R objects into psychobjects. Four functions can then be applied on a psychobject: summary()
, print()
, plot()
and values()
. Contrary to many other packages whose goal is to produce statistical analyses, psycho
aims at filling the gap between statistical R outputs and statistical report writing, with a focus on APA formatting guidelines, to enhance the standardization of results reporting. Complex outputs, such as those of Bayesian and frequentist mixed models, are automatically transformed into readable text, tables, and plots that illustrate the effects. Thus, the results can easily be incorporated into shareable reports and publications, promoting data exploration, saving time and preventing errors for better, reproducible, science.
You can save this correlation matrix using write.csv(print(cor), "correlation_table.csv")
. That is very useful to copy/paste it from excel to a paper or a report :)
You can also draw a quick visualization:
cor$plot()
## [1] "The participant (score = 124) is positioned at 1.6 standard deviations from the mean (M = 100, SD = 15). The participant's score is greater than 94.54 % of the general population."
+## [1] "The participant (score = 124) is positioned at 1.6 standard deviations from the mean (M = 100, SD = 15). The participant's score is greater than 94.44 % of the general population."
# Plot it
plot(results)
The n_factors()
function is useful before running principal component (PCA) or factor (FA) analysis. As many statistical methods exist for that purpose, this function gathers them together and gives an overview of the most frequent result. It also draws a nice plot with the eigenvalues and the proportion of explained variance.
n.Factors | +n.Methods | +Eigenvalues | +Exp.Variance | +Cum.Variance | +
---|---|---|---|---|
1 | +5 | +3.7163758 | +0.5309108 | +0.5309108 | +
2 | +3 | +1.1409219 | +0.1629888 | +0.6938997 | +
3 | +1 | +0.8471915 | +0.1210274 | +0.8149270 | +
4 | +0 | +0.6128697 | +0.0875528 | +0.9024798 | +
5 | +0 | +0.3236728 | +0.0462390 | +0.9487188 | +
6 | +0 | +0.2185306 | +0.0312187 | +0.9799375 | +
7 | +0 | +0.1404378 | +0.0200625 | +1.0000000 | +
We can also extract the final result (the optimal number of factors) for each method:
+Method | +n_optimal | +
---|---|
Optimal Coordinates | +1 | +
Acceleration Factor | +1 | +
Parallel Analysis | +1 | +
Eigenvalues (Kaiser Criterion) | +2 | +
Velicer MAP | +1 | +
BIC | +2 | +
Sample Size Adjusted BIC | +3 | +
VSS Complexity 1 | +1 | +
VSS Complexity 2 | +2 | +
And, of course, plot it :)
+plot(results)
print(results)
## [1] "We fitted a Markov Chain Monte Carlo [type] model to predict[Y] with [X] (formula = RT ~ Condition + (1 | Participant) + (1 | Item)).Priors were set as follow: [INSERT INFO ABOUT PRIORS]."
-## [2] "Concerning the effect of (Intercept), there is a probability of 65.3% that its coefficient is between 0 and 1.58 (Median = 0.1, MAD = 0.25, Mean = 0.1, SD = 0.3, 95% CI [-0.5, 0.7])."
-## [3] "Based on Cohen (1988) recommandations, there is a probability of 0.22% that this effect size is very large, 1.62% that this effect size is large, 5.35% that this effect size is medium, 27.45% that this effect size is small, 30.65% that this effect is very small and 34.7% that it has an opposite direction(between 0 and 2e-04)."
-## [4] "Concerning the effect of ConditionB, there is a probability of 79.92% that its coefficient is between -0.87 and 0 (Median = -0.16, MAD = 0.2, Mean = -0.16, SD = 0.2, 95% CI [-0.54, 0.23])."
-## [5] "Based on Cohen (1988) recommandations, there is a probability of 0% that this effect size is very large, 0.18% that this effect size is large, 3.92% that this effect size is medium, 38.38% that this effect size is small, 37.45% that this effect is very small and 20.08% that it has an opposite direction(between 0 and 0.55)."
+## [1] "We fitted a Markov Chain Monte Carlo [type] model to predict[Y] with [X] (formula = RT ~ Condition + (1 | Participant) + (1 | Item)).Priors were set as follow: [INSERT INFO ABOUT PRIORS]."
+## [2] "Concerning the effect of (Intercept), there is a probability of 64.55% that its coefficient is between 0 and 1.62 (Median = 0.092, MAD = 0.26, Mean = 0.089, SD = 0.31, 95% CI [-0.53, 0.71])."
+## [3] "Based on Cohen (1988) recommandations, there is a probability of 0.2% that this effect size is very large, 1.55% that this effect size is large, 5.1% that this effect size is medium, 26.82% that this effect size is small, 30.88% that this effect is very small and 35.45% that it has an opposite direction(between 0 and 0.00025)."
+## [4] "Concerning the effect of ConditionB, there is a probability of 79.38% that its coefficient is between -0.93 and 0 (Median = -0.16, MAD = 0.2, Mean = -0.16, SD = 0.19, 95% CI [-0.53, 0.22])."
+## [5] "Based on Cohen (1988) recommandations, there is a probability of 0% that this effect size is very large, 0.05% that this effect size is large, 3.43% that this effect size is medium, 38.88% that this effect size is small, 37.03% that this effect is very small and 20.62% that it has an opposite direction(between 0 and 0.56)."
We can also plot the effects:
plot(results)
Obviously, you need to learn more about Bayesian analyses before running them. You can find more information in the rstanarm’s vignettes.