diff --git a/.Rbuildignore b/.Rbuildignore index 03e6c10..fd1746c 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -1,5 +1,9 @@ ^.*\.Rproj$ ^\.Rproj\.user$ .github +.vscode metadata.yaml -LICENSE.md \ No newline at end of file +LICENSE.md +RCMD-CHECK.R +^doc$ +^Meta$ diff --git a/.gitignore b/.gitignore index 03b14f4..11accbe 100644 --- a/.gitignore +++ b/.gitignore @@ -8,3 +8,5 @@ *.Rproj* .Rproj.user inst/doc +/doc/ +/Meta/ diff --git a/DESCRIPTION b/DESCRIPTION index d202974..ee934b5 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: ApplyPolygenicScore Type: Package Title: Utilities for the Application of a Polygenic Score to a VCF -Version: 0.1.0 +Version: 1.0.0 Authors@R: c( person('Paul', 'Boutros', role = 'cre', email = 'PBoutros@mednet.ucla.edu'), person('Nicole', 'Zeltser', role = 'aut', comment = c(ORCID = '000-0001-7246-2771')), diff --git a/NEWS.md b/NEWS.md index 1049c53..3354caa 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,7 @@ -# ApplyPolygenicScore (0.1.0) +# ApplyPolygenicScore 1.0.0 (2024-07-25) + +* First release + +# ApplyPolygenicScore 0.1.0 (2024-07-25) * INITIAL FEATURES diff --git a/R/apply-pgs.R b/R/apply-pgs.R index 959ef5e..54a9254 100644 --- a/R/apply-pgs.R +++ b/R/apply-pgs.R @@ -98,9 +98,9 @@ validate.phenotype.data.input <- function(phenotype.data, phenotype.analysis.col #' @param n.percentiles An integer indicating the number of percentiles to calculate for the PGS. Default is \code{NULL}. #' @param analysis.source.pgs A character string indicating the source PGS for percentile calculation and regression analyses. Options are "mean.dosage", "normalize", or "none". #' When not specified, defaults to \code{missing.genotype.method} choice and if more than one PGS missing genotype method is chosen, calculation defaults to the first selection. -#' @param validate.inputs.only A logical indicating whether to only perform input data validation checks without running PGS application. 
If no errors are triggered, a message is printed and TRUE is returned. Default is \code{FALSE}. +#' @param validate.inputs.only A logical indicating whether to only perform input data validation checks without running PGS application. +#' If no errors are triggered, a message is printed and TRUE is returned. Default is \code{FALSE}. #' @return A list containing per-sample PGS output and per-phenotype regression output if phenotype analysis columns are provided. -#' #' \strong{Output Structure} #' #' The outputed list contains the following elements: @@ -154,7 +154,7 @@ validate.phenotype.data.input <- function(phenotype.data, phenotype.analysis.col #' Where \emph{P} is the ploidy and has the value \code{2} and \eqn{M_{non-missing}} is the number of non-missing genotypes. #' #' \code{mean.dosage}: Missing genotype dosages are replaced by the mean population dosage of the variant which is calculated as the product of the effect allele frequency \emph{EAF} and the ploidy of a diploid genome: -#' \deqn{\overline{dosage_{k}} = EAF_k * P}} +#' \deqn{\overline{dosage_{k}} = EAF_k * P} #' where \emph{k} is a PGS component variant that is missing in between 1 and n-1 individuals in the cohort and \emph{P} = ploidy = 2 #' This dosage calculation holds under assumptions of Hardy-Weinberg equilibrium. #' By default, the effect allele frequency is calculated from the provided VCF data. 
@@ -224,7 +224,6 @@ validate.phenotype.data.input <- function(phenotype.data, phenotype.analysis.col #' pgs.weight.data = pgs.import$pgs.weight.data, #' validate.inputs.only = TRUE #' ); -#' #' @export apply.polygenic.score <- function( vcf.data, diff --git a/README.md b/README.md index a2ebe7c..388a249 100644 --- a/README.md +++ b/README.md @@ -18,6 +18,11 @@ To install the latest development version from GitHub: # install.packages("devtools") devtools::install_github("uclahs-cds/package-ApplyPolygenicScore") + +# To access vignettes, make sure to add the `build_vignettes` argument: + +devtools::install_github("uclahs-cds/package-ApplyPolygenicScore", build_vignettes = TRUE) + ``` ## Getting Started @@ -72,6 +77,19 @@ If you wish to apply a PGS to a cohort, we recommend that genotypes for the whol For more step-by-step instructions, check out our vignettes. ## Resources +This package is in the process of being submitted to CRAN, where the manual and vignettes will be readily available. In the meantime, if you have installed the package from GitHub with `build_vignettes = TRUE`, you may view the vignette by running the following: + +``` +vignette('UserGuide', package = 'ApplyPolygenicScore') +``` +Or by simply opening the rendered file that will be automatically written to the `doc` folder in your local package directory. + +View function-specific documentation using `?`: + +``` +?apply.polygenic.score +``` + ## Getting Help Looking for guidance or support with ApplyPolygenicScore? Check out our [Discussions](https://github.com/uclahs-cds/package-ApplyPolygenicScore/discussions) page. 
diff --git a/man/apply.polygenic.score.Rd b/man/apply.polygenic.score.Rd index 2c7fb7e..d307fda 100644 --- a/man/apply.polygenic.score.Rd +++ b/man/apply.polygenic.score.Rd @@ -45,10 +45,76 @@ Provide allele frequency as a column is \code{pgs.weight.data} named \code{allel \item{analysis.source.pgs}{A character string indicating the source PGS for percentile calculation and regression analyses. Options are "mean.dosage", "normalize", or "none". When not specified, defaults to \code{missing.genotype.method} choice and if more than one PGS missing genotype method is chosen, calculation defaults to the first selection.} -\item{validate.inputs.only}{A logical indicating whether to only perform input data validation checks without running PGS application. If no errors are triggered, a message is printed and TRUE is returned. Default is \code{FALSE}.} +\item{validate.inputs.only}{A logical indicating whether to only perform input data validation checks without running PGS application. +If no errors are triggered, a message is printed and TRUE is returned. Default is \code{FALSE}.} } \value{ +A list containing per-sample PGS output and per-phenotype regression output if phenotype analysis columns are provided. +\strong{Output Structure} +The outputted list contains the following elements: +\itemize{ +\item pgs.output: A data.frame containing the PGS per sample and optional phenotype data. +\item regression.output: A data.frame containing the results of the regression analysis if phenotype.analysis.columns are provided, otherwise \code{NULL}. +} + +pgs.output columns: +\itemize{ +\item \code{Indiv}: A character string indicating the sample ID. +\item \code{PGS}: A numeric vector indicating the PGS per sample. (only if missing.genotype.method includes "none") +\item \code{PGS.with.normalized.missing}: A numeric vector indicating the PGS per sample with missing genotypes normalized.
(only if missing.genotype.method includes "normalize") +\item \code{PGS.with.replaced.missing}: A numeric vector indicating the PGS per sample with missing genotypes replaced by mean dosage. (only if missing.genotype.method includes "mean.dosage") +\item \code{percentile}: A numeric vector indicating the percentile rank of the PGS. +\item \code{decile}: A numeric vector indicating the decile rank of the PGS. +\item \code{quartile}: A numeric vector indicating the quartile rank of the PGS. +\item \code{percentile.X}: A numeric vector indicating the user-specified percentile rank of the PGS where "X" is substituted by \code{n.percentiles}. (only if \code{n.percentiles} is specified) +\item \code{n.missing.genotypes}: A numeric vector indicating the number of missing genotypes per sample. +\item \code{percent.missing.genotypes}: A numeric vector indicating the percentage of missing genotypes per sample. +\item All columns in \code{phenotype.data} if provided. +} + +regression.output columns: +\itemize{ +\item \code{phenotype}: A character vector of phenotype names. +\item \code{model}: A character vector indicating the regression model used. One of "logistic.regression" or "linear.regression". +\item \code{beta}: A numeric vector indicating the beta coefficient of the regression analysis. +\item \code{se}: A numeric vector indicating the standard error of the beta coefficient. +\item \code{p.value}: A numeric vector indicating the p-value of the beta coefficient. +\item \code{r.squared}: A numeric vector indicating the r-squared value of linear regression analysis. NA for logistic regression. +\item \code{AUC}: A numeric vector indicating the area under the curve of logistic regression analysis. NA for linear regression.
+} + +\strong{PGS Calculation} + +PGS for each individual \emph{i} is calculated as the sum of the product of the dosage and beta coefficient for each variant in the PGS: +\deqn{PGS_i = \sum_{m=1}^{M} \left( \beta_m \times dosage_{im} \right)} +Where \emph{m} is a PGS component variant out of a total \emph{M} variants. + +\strong{Missing Genotype Handling} + +Missing genotypes are handled by three methods: + +\code{none}: Missing genotype dosages are excluded from the PGS calculation. +This is equivalent to assuming that all missing genotypes are homozygous for the non-effect allele, resulting in a dosage of 0. + +\code{normalize}: Missing genotypes are excluded from score calculation but the final score is normalized by the number of non-missing alleles. +The calculation assumes a diploid genome: +\deqn{PGS_i = \dfrac{\sum \left( \beta_m \times dosage_{im} \right)}{P_i * M_{non-missing}}} +Where \emph{P} is the ploidy and has the value \code{2} and \eqn{M_{non-missing}} is the number of non-missing genotypes. + +\code{mean.dosage}: Missing genotype dosages are replaced by the mean population dosage of the variant which is calculated as the product of the effect allele frequency \emph{EAF} and the ploidy of a diploid genome: +\deqn{\overline{dosage_{k}} = EAF_k * P} +where \emph{k} is a PGS component variant that is missing in between 1 and n-1 individuals in the cohort and \emph{P} = ploidy = 2. +This dosage calculation holds under assumptions of Hardy-Weinberg equilibrium. +By default, the effect allele frequency is calculated from the provided VCF data. +For variants that are missing in all individuals, dosage is assumed to be zero (homozygous for the non-effect allele) for all individuals. +An external allele frequency can be provided in the \code{pgs.weight.data} as a column named \code{allelefrequency_effect} and by setting \code{use.external.effect.allele.frequency} to \code{TRUE}.
+ +\strong{Multiallelic Site Handling} + +VCF genotype data are matched to PGS data by chromosome, position, and effect allele. If a PGS weight file provides weights for multiple effect alleles, the appropriate dosage is calculated for the +alleles that each individual carries. It is assumed that multiallelic variants are encoded in the same row in the VCF data. This is known as "merged" format. Split multiallelic sites are not accepted. +VCF data can be formatted to merged format using external tools for VCF file manipulation. } \description{ Apply a polygenic score to VCF data. @@ -110,5 +176,4 @@ apply.polygenic.score( pgs.weight.data = pgs.import$pgs.weight.data, validate.inputs.only = TRUE ); - } diff --git a/metadata.yaml b/metadata.yaml index eb9a6dd..fcc6015 100644 --- a/metadata.yaml +++ b/metadata.yaml @@ -1,8 +1,8 @@ --- -Category: '' # shoule be one of docker/pipeline/project/template/tool/training/users -Description: '' # Description of why the repository exists -Maintainers: ['someone@mednet.ucla.edu', 'someoneelse@mednet.ucla.edu'] # email address of maintainers -Contributors: 'Xavier Hernandez' # Full names of contributors -Languages: ['R', 'perl', 'nextflow'] # programming languages used -Dependencies: 'BPG' # packages, tools that repo needs to run +Category: 'tool' # should be one of docker/pipeline/project/template/tool/training/users +Description: 'Utilities for matching a polygenic score coordinate file to a VCF input and the subsequent calculation of the provided polygenic score in each individual.' # Description of why the repository exists +Maintainers: ['nzeltser@mednet.ucla.edu'] # email address of maintainers +Contributors: ['Nicole Zeltser', 'Rachel Dang'] # Full names of contributors +Languages: ['R'] # programming languages used +Dependencies: ['BPG', 'vcfR', 'pROC', 'data.table', 'reshape2'] # packages, tools that repo needs to run References: '' # is the tool/dependencies published, is there a confluence page