Skip to content

Commit

Permalink
Merge pull request #92 from opain/dev
Browse files Browse the repository at this point in the history
Dev
  • Loading branch information
opain authored Apr 30, 2024
2 parents da8acad + a02f8c8 commit ef91489
Show file tree
Hide file tree
Showing 13 changed files with 2,762 additions and 1,797 deletions.
36 changes: 24 additions & 12 deletions Scripts/pipeline_reports/indiv_report_creator.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -100,11 +100,12 @@ if(imp_incl){
cat0("Individual: ", paste(unlist(strsplit(params$name, "_|-|\\.")), collapse = ' '), "\n\n")
} else {
target_psam <- fread(paste0(outdir, '/', params$name, '/geno/', params$name, '.ref.chr', CHROMS[1], '.psam'))
id_tmp <- target_psam[paste0(target_psam$V1,'.',target_psam$V2) == params$id,]
names(target_psam)[1]<-'FID'
id_tmp <- target_psam[paste0(target_psam$FID,'.',target_psam$IID) == params$id,]
cat0("Sample: ", params$name, "\n\n")
cat0("Family ID: ", id_tmp$V1, "\n\n")
cat0("Individual ID: ", id_tmp$V2, "\n\n")
cat0("Family ID: ", id_tmp$FID, "\n\n")
cat0("Individual ID: ", id_tmp$IID, "\n\n")
}
cat0("</font>\n\n")
Expand Down Expand Up @@ -244,12 +245,13 @@ cat0("\n\n")
cat0("<details><summary>Show principal component plots</summary>\n\n")
cat0(paste0("![Principal Component Scores Compared to Reference Populations](", normalizePath(outdir, mustWork = FALSE), '/', params$name, '/ancestry/', params$name, ".Ancestry.pc_plot.png)\n\n"))
cat0(paste0("![](", normalizePath(outdir, mustWork = FALSE), '/', params$name, '/ancestry/', params$name, ".Ancestry.pc_plot.png)"))
cat0("Note. Black circle indicates the target individual. \n\n")
cat0("**Target Principal Component Scores Compared to Reference Populations.**\n\n")
# Caption note for the principal component plot. This report describes a
# single target individual, so the wording is singular.
cat0("**Note.** Black circle indicates the target individual.\n\n")
cat0("</details> \n\n")
cat0("</details>\n\n")
cat0("***\n\n")
cat0("*** \n\n")
```

Expand All @@ -269,6 +271,10 @@ if(!pgs_incl){
cat0("- ", ifelse(is.null(gwas_list), 0, nrow(gwas_list)), " GWAS summary statistics were provided for polygenic scoring.\n")
cat0("- ", length(pgs_methods_list), " PGS methods were applied, including (", paste0(pgs_method_labels$label[pgs_method_labels$method %in% pgs_methods_list], collapse = ', '), ").\n")
# Warn that `ldpred2` and `sbayesr` only cover EUR GWAS when a non-EUR GWAS
# is listed alongside either method. `&&` is used because `if` takes a
# single logical value; both operands are already reduced by `any()`.
if (any(gwas_list$population != 'EUR') &&
    any(c('ldpred2', 'sbayesr') %in% pgs_methods_list)) {
  cat0(" - **Note.** `ldpred2` and `sbayesr` are currently only implemented for GWAS of EUR populations.\n\n")
}
if(is.null(score_list)){
cat0("- No external score files were provided in score_list.\n\n")
} else {
Expand All @@ -294,11 +300,12 @@ for(gwas in gwas_list$name) {
sumstat_qc <- rbind(sumstat_qc, data.frame(
name = gwas,
label = gwas_list$label[gwas_list$name == gwas],
population = gwas_list$population[gwas_list$name == gwas],
orig_n = orig_n,
final_n = final_n))
}
names(sumstat_qc) <- c('Name', 'Label', 'NSNP Original', 'NSNP Final')
names(sumstat_qc) <- c('Name', 'Label', 'Population', 'NSNP Original', 'NSNP Final')
datatable(sumstat_qc,
rownames = FALSE,
Expand Down Expand Up @@ -355,6 +362,8 @@ datatable(score_qc,
width = '100%',
selection = 'none')
cat0("**Note.** The `Pass` column indicates whether a sufficient number of variants within the score file were present in the reference data.\n\n")
cat0("***\n\n")
```
Expand Down Expand Up @@ -681,8 +690,10 @@ for(pgs_method_i in unique(pgs_dat$pgs_method[pgs_dat$gwas == gwas_i])){
add_header_above(c(" " = 1, "PGS Descriptives" = 2, "Distribution in General\nPopulation"=3, "Distribution in People\nLike You" = 4)) %>%
kable_styling(bootstrap_options = c("striped", "hover", "responsive")))
cat('\n\n')
cat('**Note.** PGS R-squared/AUC is estimated via pseudovalidation, and may be inaccurate. To specify alternative parameters, check out our [interactive tool](https://opain.github.io/GenoPred/PRS_to_Abs_tool.html) for converting polygenic scores to the absolute scale.\n\n')
cat('***\n\n')
for(gwas_i in unique(pgs_dat$gwas[pgs_dat$pgs_method != 'external'])){
cat0('#### GWAS: ',gwas_list$label[gwas_list$name == gwas_i],' {.tabset .tabset-fade}\n')
Expand All @@ -700,13 +711,14 @@ for(pgs_method_i in unique(pgs_dat$pgs_method[pgs_dat$gwas == gwas_i])){
if(is.na(gwas_list$prevalence[gwas_list$name == gwas_i])){
cat0('- Assuming the PGS explains ', round(100*pseudoval$pseudoval_r2obs[pseudoval$gwas == gwas_i],1),'% of the variance in ', gwas_list$label[gwas_list$name == gwas_i],',\n and the mean and SD of ', gwas_list$label[gwas_list$name == gwas_i],' in the general population is ',gwas_list$mean[gwas_list$name == gwas_i],' and ',gwas_list$sd[gwas_list$name == gwas_i],' respectively,\n on average people with your PGS have a ', gwas_list$label[gwas_list$name == gwas_i],' of ', round(abs_res_i$mean,2), " (95% CI = ",round(lowCI,2),' - ',round(highCI,2),").\n\n")
cat0('- **Note.** These predictions are estimated using a range of assumptions and should be interpreted with caution. This report is merely a demonstration of how polygenic scores can be interpreted.\n\n')
} else {
# Binary-outcome interpretation: reports the pseudovalidated AUC, the assumed
# population prevalence, and the percentage of people with this PGS expected
# to have the outcome.
cat0('- Assuming the PGS has an AUC of ', round(pseudoval$pseudoval_auc[pseudoval$gwas == gwas_i],3),', and the prevalence of ', gwas_list$label[gwas_list$name == gwas_i],' in the general population is ',gwas_list$prevalence[gwas_list$name == gwas_i],',\n on average ',round(100*abs_res_i$abs_prob,1),'% of people with your PGS have ', gwas_list$label[gwas_list$name == gwas_i],'.\n\n')
cat0('- **Note.** These predictions are estimated using a range of assumptions and should be interpreted with caution. This report is merely a demonstration of how polygenic scores can be interpreted.\n\n')
}
}
}
Expand Down
17 changes: 13 additions & 4 deletions Scripts/pipeline_reports/samp_report_creator.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -157,11 +157,13 @@ if(is.na(refdir)){
cat0("Note. AFR = African, AMR = American, EAS = East Asian, EUR = European, CSA = Central and South Asian, MID = Middle Eastern.")
}
cat0("<details><summary>Show principal component plots</summary>")
cat0("</br>\n\n")
cat0("<details><summary>Show principal component plots</summary>\n\n")
cat0(paste0("![Principal Component Scores Compared to Reference Populations](", normalizePath(outdir, mustWork = FALSE), '/', params$name, '/ancestry/', params$name, ".Ancestry.pc_plot.png)"))
cat0(paste0("![](", normalizePath(outdir, mustWork = FALSE), '/', params$name, '/ancestry/', params$name, ".Ancestry.pc_plot.png)"))
cat0("Note. Black circles indicate target sample individuals. \n\n")
cat0("**Target Principal Component Scores Compared to Reference Populations.**\n\n")
cat0("**Note.** Black circles indicate target sample individuals.\n\n")
cat0("</details> \n\n")
cat0("*** \n\n")
Expand All @@ -175,6 +177,10 @@ cat0("# Polygenic Scores \n\n")
cat0("- ", ifelse(is.null(gwas_list), 0, nrow(gwas_list)), " GWAS summary statistics were provided for polygenic scoring.\n")
cat0("- ", length(pgs_methods_list), " PGS methods were applied, including (", paste0(pgs_methods_list, collapse = ', '), ").\n")
# Warn that `ldpred2` and `sbayesr` only cover EUR GWAS when a non-EUR GWAS
# is listed alongside either method. `&&` is used because `if` takes a
# single logical value; both operands are already reduced by `any()`.
if (any(gwas_list$population != 'EUR') &&
    any(c('ldpred2', 'sbayesr') %in% pgs_methods_list)) {
  cat0(" - **Note.** `ldpred2` and `sbayesr` are currently only implemented for GWAS of EUR populations.\n\n")
}
if(is.null(score_list)){
cat0("- No external score files were provided in score_list.\n\n")
} else {
Expand All @@ -200,11 +206,12 @@ for(gwas in gwas_list$name) {
sumstat_qc <- rbind(sumstat_qc, data.frame(
name = gwas,
label = gwas_list$label[gwas_list$name == gwas],
population = gwas_list$population[gwas_list$name == gwas],
orig_n = orig_n,
final_n = final_n))
}
names(sumstat_qc) <- c('Name', 'Label', 'NSNP Original', 'NSNP Final')
names(sumstat_qc) <- c('Name', 'Label', 'Population', 'NSNP Original', 'NSNP Final')
datatable(sumstat_qc,
rownames = FALSE,
Expand Down Expand Up @@ -262,6 +269,8 @@ datatable(score_qc,
width = '100%',
selection = 'none')
cat0("**Note.** The `Pass` column indicates whether a sufficient number of variants within the score file were present in the reference data.\n\n")
cat0("***\n\n")
```
Expand Down
19 changes: 16 additions & 3 deletions docs/pipeline_readme.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ This document will provides instructions for the GenoPred pipeline. It covers th
- [Requesting outputs](#requesting-outputs)
- [Computational resources](#computational-resources)
- [Additional parameters](#additional-parameters)
- [Running in offline environment](#running-in-offline-environment)

For a general overview of the GenoPred pipeline, click [here](pipeline_overview.html).
For technical details of the GenoPred pipeline, click [here](pipeline_technical.html).
Expand Down Expand Up @@ -177,9 +178,9 @@ The pipeline is configured using a configfile, which tell the pipeline what to d

## `configfile`

By default, when you run snakemake it reads the `config.yaml` file in the snakemake directory, which lists various parameters, telling it what to do.
Snakemake reads the default `config.yaml` file located in the `pipeline` directory to obtain its default parameters. When using your own data, it’s recommended to create a new `configfile` rather than modifying the default one. You can then specify this custom `configfile` when running Snakemake using the `--configfile` option.

When using your own data, instead of updating the default `config.yaml` file, I would recommend creating a new config file, and then telling snakemake to use that config file, using the `--configfile` parameter. This allows you to use the GenoPred pipeline with multiple configurations.
This approach allows you to use the GenoPred pipeline with multiple configurations. Importantly, only parameters that differ from the defaults need to be included in your custom `configfile`. Any parameter not explicitly defined in the custom `configfile` will be automatically sourced from the default `pipeline/config.yaml` file. This ensures that Snakemake only overrides the parameters you specify, while continuing to use the default settings for all others.

```{bash}
snakemake -j1 --use-conda --configfile=misc/23andMe/config.yaml output_all
Expand Down Expand Up @@ -220,7 +221,7 @@ config <- list(
pgs_methods = list(
description = 'List of polygenic scoring methods to run',
example = "`['ptclump','dbslmm']`",
note = "Options are: `ptclump`, `dbslmm`, `prscs`, `sbayesr`, `lassosum`, `ldpred2`, `megaprs`"
note = "Options are: `ptclump`, `dbslmm`, `prscs`, `sbayesr`, `lassosum`, `ldpred2`, `megaprs`. **Note.** `sbayesr` and `ldpred2` are only implemented for GWAS of EUR ancestry."
),
testing = list(
description = 'Controls testing mode',
Expand Down Expand Up @@ -1250,6 +1251,18 @@ By default, individuals are assigned to a reference super population if the prob

***

## Specify resources directory

The GenoPred pipeline automatically downloads resources from the internet. By default these resources are stored within the `resources` folder of the `GenoPred/pipeline` folder. However, the `resdir` parameter in the `configfile` can be used to specify a different folder to store the downloaded resources. This is mainly used when running the pipeline in an offline environment, as then previously downloaded resources can be used by the pipeline.

***

# Running in offline environment

See [here](running_offline.html) if you would like to run the GenoPred pipeline in an environment that does not have access to the internet. In brief, the user must download the resources required by GenoPred and transfer them to their offline environment.

***

# Troubleshooting

Please post questions as an issue on the GenoPred GitHub repo [here](https://github.com/opain/GenoPred/issues). If errors occur while running the pipeline, log files will be saved in the `GenoPred/pipeline/logs` folder. If running interactively (i.e. -j1), the error should be printed on the screen.
Expand Down
56 changes: 43 additions & 13 deletions docs/pipeline_readme.html
Original file line number Diff line number Diff line change
Expand Up @@ -429,6 +429,8 @@ <h1 class="title toc-ignore">GenoPred Pipeline - Instructions</h1>
<li><a href="#requesting-outputs">Requesting outputs</a></li>
<li><a href="#computational-resources">Computational resources</a></li>
<li><a href="#additional-parameters">Additional parameters</a></li>
<li><a href="#running-in-offline-environment">Running in offline
environment</a></li>
</ul>
<p>For a general overview of the GenoPred pipeline, click <a
href="pipeline_overview.html">here</a>. For technical details of the
Expand Down Expand Up @@ -580,25 +582,31 @@ <h1>Pipeline configuration</h1>
<hr />
<div id="configfile" class="section level2">
<h2><code>configfile</code></h2>
<p>By default, when you run snakemake it reads the
<code>config.yaml</code> file in the snakemake directory, which lists
various parameters, telling it what to do.</p>
<p>When using your own data, instead of updating the default
<code>config.yaml</code> file, I would recommend creating a new config
file, and then telling snakemake to use that config file, using the
<code>--configfile</code> parameter. This allows you to use the GenoPred
pipeline with multiple configurations.</p>
<p>Snakemake reads the default <code>config.yaml</code> file located in
the <code>pipeline</code> directory to obtain its default parameters.
When using your own data, it’s recommended to create a new
<code>configfile</code> rather than modifying the default one. You can
then specify this custom <code>configfile</code> when running Snakemake
using the <code>--configfile</code> option.</p>
<p>This approach allows you to use the GenoPred pipeline with multiple
configurations. Importantly, only parameters that differ from the
defaults need to be included in your custom <code>configfile</code>. Any
parameter not explicitly defined in the custom <code>configfile</code>
will be automatically sourced from the default
<code>pipeline/config.yaml</code> file. This ensures that Snakemake only
overrides the parameters you specify, while continuing to use the
default settings for all others.</p>
<pre class="bash"><code> snakemake -j1 --use-conda --configfile=misc/23andMe/config.yaml output_all</code></pre>
<details>
<summary>
View configfile parameters
</summary>
<table>
<colgroup>
<col width="6%" />
<col width="26%" />
<col width="15%" />
<col width="51%" />
<col width="5%" />
<col width="20%" />
<col width="11%" />
<col width="62%" />
</colgroup>
<thead>
<tr class="header">
Expand Down Expand Up @@ -652,7 +660,9 @@ <h2><code>configfile</code></h2>
<td align="left"><code>['ptclump','dbslmm']</code></td>
<td align="left">Options are: <code>ptclump</code>, <code>dbslmm</code>,
<code>prscs</code>, <code>sbayesr</code>, <code>lassosum</code>,
<code>ldpred2</code>, <code>megaprs</code></td>
<code>ldpred2</code>, <code>megaprs</code>. <strong>Note.</strong>
<code>sbayesr</code> and <code>ldpred2</code> are only implemented for
GWAS of EUR ancestry.</td>
</tr>
<tr class="odd">
<td align="left"><code>testing</code></td>
Expand Down Expand Up @@ -1984,6 +1994,26 @@ <h2>Altering ancestry threshold</h2>
config file.</p>
<hr />
</div>
<div id="specify-resources-directory" class="section level2">
<h2>Specify resources directory</h2>
<p>The GenoPred pipeline automatically downloads resources from the
internet. By default these resources are stored within the
<code>resources</code> folder of the <code>GenoPred/pipeline</code>
folder. However, the <code>resdir</code> parameter in the
<code>configfile</code> can be used to specify a different folder to
store the downloaded resources. This is mainly used when running the
pipeline in an offline environment, as then previously downloaded
resources can be used by the pipeline.</p>
<hr />
</div>
</div>
<div id="running-in-offline-environment" class="section level1">
<h1>Running in offline environment</h1>
<p>See <a href="running_offline.html">here</a> if you would like to run
the GenoPred pipeline in an environment that does not have access to the
internet. In brief, the user must download the resources required by
GenoPred and transfer them to their offline environment.</p>
<hr />
</div>
<div id="troubleshooting" class="section level1">
<h1>Troubleshooting</h1>
Expand Down
Loading

0 comments on commit ef91489

Please sign in to comment.