From cdf5dd5a1909d27b3b4a50f766df9f3ee4774e86 Mon Sep 17 00:00:00 2001 From: Nicole Zeltser Date: Tue, 23 Jul 2024 10:47:04 -0700 Subject: [PATCH 1/5] bracket mismatch in manual --- R/apply-pgs.R | 7 ++-- man/apply.polygenic.score.Rd | 69 ++++++++++++++++++++++++++++++++++-- 2 files changed, 70 insertions(+), 6 deletions(-) diff --git a/R/apply-pgs.R b/R/apply-pgs.R index 959ef5e..54a9254 100644 --- a/R/apply-pgs.R +++ b/R/apply-pgs.R @@ -98,9 +98,9 @@ validate.phenotype.data.input <- function(phenotype.data, phenotype.analysis.col #' @param n.percentiles An integer indicating the number of percentiles to calculate for the PGS. Default is \code{NULL}. #' @param analysis.source.pgs A character string indicating the source PGS for percentile calculation and regression analyses. Options are "mean.dosage", "normalize", or "none". #' When not specified, defaults to \code{missing.genotype.method} choice and if more than one PGS missing genotype method is chosen, calculation defaults to the first selection. -#' @param validate.inputs.only A logical indicating whether to only perform input data validation checks without running PGS application. If no errors are triggered, a message is printed and TRUE is returned. Default is \code{FALSE}. +#' @param validate.inputs.only A logical indicating whether to only perform input data validation checks without running PGS application. +#' If no errors are triggered, a message is printed and TRUE is returned. Default is \code{FALSE}. #' @return A list containing per-sample PGS output and per-phenotype regression output if phenotype analysis columns are provided. -#' #' \strong{Output Structure} #' #' The outputed list contains the following elements: @@ -154,7 +154,7 @@ validate.phenotype.data.input <- function(phenotype.data, phenotype.analysis.col #' Where \emph{P} is the ploidy and has the value \code{2} and \eqn{M_{non-missing}} is the number of non-missing genotypes. 
#' #' \code{mean.dosage}: Missing genotype dosages are replaced by the mean population dosage of the variant which is calculated as the product of the effect allele frequency \emph{EAF} and the ploidy of a diploid genome: -#' \deqn{\overline{dosage_{k}} = EAF_k * P}} +#' \deqn{\overline{dosage_{k}} = EAF_k * P} #' where \emph{k} is a PGS component variant that is missing in between 1 and n-1 individuals in the cohort and \emph{P} = ploidy = 2 #' This dosage calculation holds under assumptions of Hardy-Weinberg equilibrium. #' By default, the effect allele frequency is calculated from the provided VCF data. @@ -224,7 +224,6 @@ validate.phenotype.data.input <- function(phenotype.data, phenotype.analysis.col #' pgs.weight.data = pgs.import$pgs.weight.data, #' validate.inputs.only = TRUE #' ); -#' #' @export apply.polygenic.score <- function( vcf.data, diff --git a/man/apply.polygenic.score.Rd b/man/apply.polygenic.score.Rd index 2c7fb7e..d307fda 100644 --- a/man/apply.polygenic.score.Rd +++ b/man/apply.polygenic.score.Rd @@ -45,10 +45,76 @@ Provide allele frequency as a column is \code{pgs.weight.data} named \code{allel \item{analysis.source.pgs}{A character string indicating the source PGS for percentile calculation and regression analyses. Options are "mean.dosage", "normalize", or "none". When not specified, defaults to \code{missing.genotype.method} choice and if more than one PGS missing genotype method is chosen, calculation defaults to the first selection.} -\item{validate.inputs.only}{A logical indicating whether to only perform input data validation checks without running PGS application. If no errors are triggered, a message is printed and TRUE is returned. Default is \code{FALSE}.} +\item{validate.inputs.only}{A logical indicating whether to only perform input data validation checks without running PGS application. +If no errors are triggered, a message is printed and TRUE is returned. 
Default is \code{FALSE}.} } \value{ +A list containing per-sample PGS output and per-phenotype regression output if phenotype analysis columns are provided. +\strong{Output Structure} +The outputed list contains the following elements: +\itemize{ +\item pgs.output: A data.frame containing the PGS per sample and optional phenotype data. +\item regression.output: A data.frame containing the results of the regression analysis if phenotype.analysis.columns are provided, otherwise \code{NULL}. +} + +pgs.output columns: +\itemize{ +\item \code{Indiv}: A character string indicating the sample ID. +\item \code{PGS}: A numeric vector indicating the PGS per sample. (only if missing.genotype.method includes "none") +\item \code{PGS.with.normalized.missing}: A numeric vector indicating the PGS per sample with missing genotypes normalized. (only if missing.genotype.method includes "normalize") +\item \code{PGS.with.replaced.missing}: A numeric vector indicating the PGS per sample with missing genotypes replaced by mean dosage. (only if missing.genotype.method includes "mean.dosage") +\item \code{percentile}: A numeric vector indicating the percentile rank of the PGS. +\item \code{decile}: A numeric vector indicating the decile rank of the PGS. +\item \code{quartile}: A numeric vector indicating the quartile rank of the PGS. +\item \code{percentile.X:} A numeric vector indicating the user-specified percentile rank of the PGS where "X" is substituted by \code{n.percentiles}. (only if \code{n.percentiles} is specified) +\item \code{n.missing.genotypes}: A numeric vector indicating the number of missing genotypes per sample. +\item \code{percent.missing.genotypes}: A numeric vector indicating the percentage of missing genotypes per sample. +\item All columns in \code{phenotype.data} if provided. +} + +regression.output columns: +\itemize{ +\item phenotype: A character vector of phenotype names. +\item \code{model}: A character vector indicating the regression model used. 
One of "logistic.regression" or "linear.regression". +\item \code{beta}: A numeric vector indicating the beta coefficient of the regression analysis. +\item \code{se}: A numeric vector indicating the standard error of the beta coefficient. +\item \code{p.value}: A numeric vector indicating the p-value of the beta coefficient. +\item \code{r.squared}: A numeric vector indicating the r-squared value of linear regression analysis. NA for logistic regression. +\item \code{AUC}: A numeric vector indicating the area under the curve of logistic regression analysis. NA for linear regression. +} + +\strong{PGS Calculation} + +PGS for each individual \emph{i} is calculated as the sum of the product of the dosage and beta coefficient for each variant in the PGS: +\deqn{PGS_i = \sum_{m=1}^{M} \left( \beta_m \times dosage_{im} \right)} +Where \emph{m} is a PGS component variant out of a total \emph{M} variants. + +\strong{Missing Genotype Handling} + +Missing genotypes are handled by three methods: + +\code{none}: Missing genotype dosages are excluded from the PGS calculation. +This is equivalent to assuming that all missing genotypes are homozygous for the non-effect allele, resulting in a dosage of 0. + +\code{normalize}: Missing genotypes are excluded from score calculation but the final score is normalized by the number of non-missing alleles. +The calculation assumes a diploid genome: +\deqn{PGS_i = \dfrac{\sum \left( \beta_m \times dosage_{im} \right)}{P_i * M_{non-missing}}} +Where \emph{P} is the ploidy and has the value \code{2} and \eqn{M_{non-missing}} is the number of non-missing genotypes. 
+ +\code{mean.dosage}: Missing genotype dosages are replaced by the mean population dosage of the variant which is calculated as the product of the effect allele frequency \emph{EAF} and the ploidy of a diploid genome: +\deqn{\overline{dosage_{k}} = EAF_k * P} +where \emph{k} is a PGS component variant that is missing in between 1 and n-1 individuals in the cohort and \emph{P} = ploidy = 2 +This dosage calculation holds under assumptions of Hardy-Weinberg equilibrium. +By default, the effect allele frequency is calculated from the provided VCF data. +For variants that are missing in all individuals, dosage is assumed to be zero (homozygous non-reference) for all individuals. +An external allele frequency can be provided in the \code{pgs.weight.data} as a column named \code{allelefrequency_effect} and by setting \code{use.external.effect.allele.frequency} to \code{TRUE}. + +\strong{Multiallelic Site Handling} + +VCF genotype data are matched to PGS data by chromosome, position, and effect allele. If a PGS weight file provides weights for multiple effect alleles, the appropriate dosage is calculated for the +alleles that each individual carries. It is assumed that multiallelic variants are encoded in the same row in the VCF data. This is known as "merged" format. Split multiallelic sites are not accepted. +VCF data can be formatted to merged format using external tools for VCF file manipulation. } \description{ Apply a polygenic score to VCF data. 
@@ -110,5 +176,4 @@ apply.polygenic.score( pgs.weight.data = pgs.import$pgs.weight.data, validate.inputs.only = TRUE ); - } From 177bdb2665f1df6cdfd34686f93b3bd13762df68 Mon Sep 17 00:00:00 2001 From: Nicole Zeltser Date: Thu, 25 Jul 2024 17:19:54 -0700 Subject: [PATCH 2/5] fix NEWS formatting --- .Rbuildignore | 4 +++- NEWS.md | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/.Rbuildignore b/.Rbuildignore index 03e6c10..53bd77b 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -1,5 +1,7 @@ ^.*\.Rproj$ ^\.Rproj\.user$ .github +.vscode metadata.yaml -LICENSE.md \ No newline at end of file +LICENSE.md +RCMD-CHECK.R diff --git a/NEWS.md b/NEWS.md index 1049c53..cfb2cc1 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,3 @@ -# ApplyPolygenicScore (0.1.0) +# ApplyPolygenicScore 0.1.0 (Date) * INITIAL FEATURES From cec95f379543b51921d018eda4c4448f51ceac6c Mon Sep 17 00:00:00 2001 From: Nicole Zeltser Date: Thu, 25 Jul 2024 19:08:35 -0700 Subject: [PATCH 3/5] update version --- DESCRIPTION | 2 +- NEWS.md | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index d202974..ee934b5 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: ApplyPolygenicScore Type: Package Title: Utilities for the Application of a Polygenic Score to a VCF -Version: 0.1.0 +Version: 1.0.0 Authors@R: c( person('Paul', 'Boutros', role = 'cre', email = 'PBoutros@mednet.ucla.edu'), person('Nicole', 'Zeltser', role = 'aut', comment = c(ORCID = '000-0001-7246-2771')), diff --git a/NEWS.md b/NEWS.md index cfb2cc1..3354caa 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,7 @@ -# ApplyPolygenicScore 0.1.0 (Date) +# ApplyPolygenicScore 1.0.0 (2024-07-25) + +* First release + +# ApplyPolygenicScore 0.1.0 (2024-07-25) * INITIAL FEATURES From 4a5e783b0adc2273f3556efea42075f3d0557351 Mon Sep 17 00:00:00 2001 From: Nicole Zeltser Date: Thu, 25 Jul 2024 19:12:53 -0700 Subject: [PATCH 4/5] update metadata yaml --- metadata.yaml | 12 
++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/metadata.yaml b/metadata.yaml index eb9a6dd..fcc6015 100644 --- a/metadata.yaml +++ b/metadata.yaml @@ -1,8 +1,8 @@ --- -Category: '' # shoule be one of docker/pipeline/project/template/tool/training/users -Description: '' # Description of why the repository exists -Maintainers: ['someone@mednet.ucla.edu', 'someoneelse@mednet.ucla.edu'] # email address of maintainers -Contributors: 'Xavier Hernandez' # Full names of contributors -Languages: ['R', 'perl', 'nextflow'] # programming languages used -Dependencies: 'BPG' # packages, tools that repo needs to run +Category: 'tool' # should be one of docker/pipeline/project/template/tool/training/users +Description: 'Utilities for matching a polygenic score coordinate file to a VCF input and the subsequent calculation of the provided polygenic score in each individual.' # Description of why the repository exists +Maintainers: ['nzeltser@mednet.ucla.edu'] # email address of maintainers +Contributors: ['Nicole Zeltser', 'Rachel Dang'] # Full names of contributors +Languages: ['R'] # programming languages used +Dependencies: ['BPG', 'vcfR', 'pROC', 'data.table', 'reshape2'] # packages, tools that repo needs to run References: '' # is the tool/dependencies published, is there a confluence page From 6d8930ad93548a91e348c2cc68a276922494e74b Mon Sep 17 00:00:00 2001 From: Nicole Zeltser Date: Fri, 26 Jul 2024 11:22:46 -0700 Subject: [PATCH 5/5] update README with resources --- .Rbuildignore | 2 ++ .gitignore | 2 ++ README.md | 18 ++++++++++++++++++ 3 files changed, 22 insertions(+) diff --git a/.Rbuildignore b/.Rbuildignore index 53bd77b..fd1746c 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -5,3 +5,5 @@ metadata.yaml LICENSE.md RCMD-CHECK.R +^doc$ +^Meta$ diff --git a/.gitignore b/.gitignore index 03b14f4..11accbe 100644 --- a/.gitignore +++ b/.gitignore @@ -8,3 +8,5 @@ *.Rproj* .Rproj.user inst/doc +/doc/ +/Meta/ diff --git a/README.md b/README.md 
index a2ebe7c..388a249 100644 --- a/README.md +++ b/README.md @@ -18,6 +18,11 @@ To install the latest development version from GitHub: # install.packages("devtools") devtools::install_github("uclahs-cds/package-ApplyPolygenicScore") + +# To access vignettes, make sure to add the `build_vignettes` argument: + +devtools::install_github("uclahs-cds/package-ApplyPolygenicScore", build_vignettes = TRUE) + ``` ## Getting Started @@ -72,6 +77,19 @@ If you wish to apply a PGS to a cohort, we recommend that genotypes for the whol For more step-by-step instructions, check out our vignettes. ## Resources +This package is in the process of being submitted to CRAN, where the manual and vignettes will be readily available. In the meantime, if you have installed the package from GitHub with `build_vignettes = TRUE`, you may view the vignette by running the following: + +``` +vignette('UserGuide', package = 'ApplyPolygenicScore') +``` +Alternatively, you can open the rendered vignette file, which is automatically written to the `doc` folder in your local package directory. + +View function-specific documentation using `?`: + +``` +?apply.polygenic.score +``` + ## Getting Help Looking for guidance or support with ApplyPolygenicScore? Check out our [Discussions](https://github.com/uclahs-cds/package-ApplyPolygenicScore/discussions) page.