Merge pull request #92 from opain/dev

Dev
opain · Apr 30, 2024 · ef91489 · ef91489
2 parents da8acad + a02f8c8
commit ef91489
Show file tree

Hide file tree

Showing 13 changed files with 2,762 additions and 1,797 deletions.
diff --git a/Scripts/pipeline_reports/indiv_report_creator.Rmd b/Scripts/pipeline_reports/indiv_report_creator.Rmd
@@ -100,11 +100,12 @@ if(imp_incl){
   cat0("Individual: ", paste(unlist(strsplit(params$name, "_|-|\\.")), collapse = ' '), "\n\n")
 } else {
   target_psam <- fread(paste0(outdir, '/', params$name, '/geno/', params$name, '.ref.chr', CHROMS[1], '.psam'))
-  id_tmp <- target_psam[paste0(target_psam$V1,'.',target_psam$V2) == params$id,] 
+  names(target_psam)[1]<-'FID'
+  id_tmp <- target_psam[paste0(target_psam$FID,'.',target_psam$IID) == params$id,] 
   
   cat0("Sample: ", params$name, "\n\n")
-  cat0("Family ID: ", id_tmp$V1, "\n\n")
-  cat0("Individual ID: ", id_tmp$V2, "\n\n")
+  cat0("Family ID: ", id_tmp$FID, "\n\n")
+  cat0("Individual ID: ", id_tmp$IID, "\n\n")
 }
 cat0("</font>\n\n")
 
@@ -244,12 +245,13 @@ cat0("\n\n")
 
 cat0("<details><summary>Show principal component plots</summary>\n\n")
 
-cat0(paste0("![Principal Component Scores Compared to Reference Populations](", normalizePath(outdir, mustWork = FALSE), '/', params$name, '/ancestry/', params$name, ".Ancestry.pc_plot.png)\n\n"))
+cat0(paste0("![](", normalizePath(outdir, mustWork = FALSE), '/', params$name, '/ancestry/', params$name, ".Ancestry.pc_plot.png)"))
 
-cat0("Note. Black circle indicates the target individual. \n\n")
+cat0("**Target Principal Component Scores Compared to Reference Populations.**\n\n")
+cat0("**Note.** Black circle indicates the target individual.\n\n") # individual report: exactly one target point is plotted
+cat0("</details> \n\n")
 
-cat0("</details>\n\n")
-cat0("***\n\n")
+cat0("*** \n\n")
 
 ```
 
@@ -269,6 +271,10 @@ if(!pgs_incl){
 cat0("- ", ifelse(is.null(gwas_list), 0, nrow(gwas_list)), " GWAS summary statistics were provided for polygenic scoring.\n")
 cat0("- ", length(pgs_methods_list), " PGS methods were applied, including (", paste0(pgs_method_labels$label[pgs_method_labels$method %in% pgs_methods_list], collapse = ', '), ").\n")
 
+if(any(gwas_list$population != 'EUR') & any(c('ldpred2','sbayesr') %in% pgs_methods_list)){
+  cat0("    - **Note.** `ldpred2` and `sbayesr` are currently only implemented for GWAS of EUR populations.\n\n")
+}
+
 if(is.null(score_list)){
   cat0("- No external score files were provided in score_list.\n\n")
 } else {
@@ -294,11 +300,12 @@ for(gwas in gwas_list$name) {
     sumstat_qc <- rbind(sumstat_qc, data.frame(
       name = gwas,
       label = gwas_list$label[gwas_list$name == gwas],
+      population = gwas_list$population[gwas_list$name == gwas],
       orig_n = orig_n,
       final_n = final_n))
 }
 
-names(sumstat_qc) <- c('Name', 'Label', 'NSNP Original', 'NSNP Final')
+names(sumstat_qc) <- c('Name', 'Label', 'Population', 'NSNP Original', 'NSNP Final')
 
 datatable(sumstat_qc, 
           rownames = FALSE,
@@ -355,6 +362,8 @@ datatable(score_qc,
           width = '100%',
           selection = 'none')
 
+cat0("**Note.** The `Pass` column indicates whether a sufficient number of variants within the score file were present in the reference data.\n\n")
+
 cat0("***\n\n")
 
 ```
@@ -681,8 +690,10 @@ for(pgs_method_i in unique(pgs_dat$pgs_method[pgs_dat$gwas == gwas_i])){
     add_header_above(c(" " = 1, "PGS Descriptives" = 2, "Distribution in General\nPopulation"=3, "Distribution in People\nLike You" = 4)) %>%
     kable_styling(bootstrap_options = c("striped", "hover", "responsive")))
   
-  cat('\n\n')
-  
+  cat('**Note.** PGS R-squared/AUC is estimated via pseudovalidation, and may be inaccurate. To specify alternative parameters, check out our [interactive tool](https://opain.github.io/GenoPred/PRS_to_Abs_tool.html) for converting polygenic scores to the absolute scale.\n\n')
+
+  cat('***\n\n')
+
   for(gwas_i in unique(pgs_dat$gwas[pgs_dat$pgs_method != 'external'])){
     cat0('#### GWAS: ',gwas_list$label[gwas_list$name == gwas_i],' {.tabset .tabset-fade}\n')
     
@@ -700,13 +711,14 @@ for(pgs_method_i in unique(pgs_dat$pgs_method[pgs_dat$gwas == gwas_i])){
     if(is.na(gwas_list$prevalence[gwas_list$name == gwas_i])){
       
       cat0('- Assuming the PGS explains ', round(100*pseudoval$pseudoval_r2obs[pseudoval$gwas == gwas_i],1),'% of the variance in ', gwas_list$label[gwas_list$name == gwas_i],',\n and the mean and SD of ', gwas_list$label[gwas_list$name == gwas_i],' in the general population is ',gwas_list$mean[gwas_list$name == gwas_i],' and ',gwas_list$sd[gwas_list$name == gwas_i],' respectively,\n on average people with your PGS have a ', gwas_list$label[gwas_list$name == gwas_i],' of ', round(abs_res_i$mean,2), " (95% CI = ",round(lowCI,2),' - ',round(highCI,2),").\n\n")
+      cat0('- **Note.** These predictions are estimated using a range of assumptions and should be interpreted with caution. This report is merely a demonstration of how polygenic scores can be interpreted.\n\n')
 
     } else {
         
       cat0('- Assuming the PGS has an AUC of ', round(pseudoval$pseudoval_auc[pseudoval$gwas == gwas_i],3),', and the prevelance of ', gwas_list$label[gwas_list$name == gwas_i],' in the general population is ',gwas_list$prevalence[gwas_list$name == gwas_i],',\n on average ',round(100*abs_res_i$abs_prob,1),'% of people with your PGS have ', gwas_list$label[gwas_list$name == gwas_i],'.\n\n')
-      
+      cat0('- **Note.** These predictions are estimated using a range of assumptions and should be interpreted with caution. This report is merely a demonstration of how polygenic scores can be interpreted.\n\n')
+
     }
-  
   }
 }
 

diff --git a/Scripts/pipeline_reports/samp_report_creator.Rmd b/Scripts/pipeline_reports/samp_report_creator.Rmd
@@ -157,11 +157,13 @@ if(is.na(refdir)){
   cat0("Note. AFR = African, AMR = American, EAS = East Asian, EUR = European, CSA = Central and South Asian, MID = Middle Eastern.")
 }
 
-cat0("<details><summary>Show principal component plots</summary>")
+cat0("</br>\n\n")
+cat0("<details><summary>Show principal component plots</summary>\n\n")
 
-cat0(paste0("![Principal Component Scores Compared to Reference Populations](", normalizePath(outdir, mustWork = FALSE), '/', params$name, '/ancestry/', params$name, ".Ancestry.pc_plot.png)"))
+cat0(paste0("![](", normalizePath(outdir, mustWork = FALSE), '/', params$name, '/ancestry/', params$name, ".Ancestry.pc_plot.png)"))
 
-cat0("Note. Black circles indicate target sample individuals. \n\n")
+cat0("**Target Principal Component Scores Compared to Reference Populations.**\n\n")
+cat0("**Note.** Black circles indicate target sample individuals.\n\n")
 cat0("</details> \n\n")
 
 cat0("*** \n\n")
@@ -175,6 +177,10 @@ cat0("# Polygenic Scores \n\n")
 cat0("- ", ifelse(is.null(gwas_list), 0, nrow(gwas_list)), " GWAS summary statistics were provided for polygenic scoring.\n")
 cat0("- ", length(pgs_methods_list), " PGS methods were applied, including (", paste0(pgs_methods_list, collapse = ', '), ").\n")
 
+if(any(gwas_list$population != 'EUR') & any(c('ldpred2','sbayesr') %in% pgs_methods_list)){
+  cat0("    - **Note.** `ldpred2` and `sbayesr` are currently only implemented for GWAS of EUR populations.\n\n")
+}
+
 if(is.null(score_list)){
   cat0("- No external score files were provided in score_list.\n\n")
 } else {
@@ -200,11 +206,12 @@ for(gwas in gwas_list$name) {
     sumstat_qc <- rbind(sumstat_qc, data.frame(
       name = gwas,
       label = gwas_list$label[gwas_list$name == gwas],
+      population = gwas_list$population[gwas_list$name == gwas],
       orig_n = orig_n,
       final_n = final_n))
 }
 
-names(sumstat_qc) <- c('Name', 'Label', 'NSNP Original', 'NSNP Final')
+names(sumstat_qc) <- c('Name', 'Label', 'Population', 'NSNP Original', 'NSNP Final')
 
 datatable(sumstat_qc, 
           rownames = FALSE,
@@ -262,6 +269,8 @@ datatable(score_qc,
           width = '100%',
           selection = 'none')
 
+cat0("**Note.** The `Pass` column indicates whether a sufficient number of variants within the score file were present in the reference data.\n\n")
+
 cat0("***\n\n")
 
 ```

diff --git a/docs/pipeline_readme.Rmd b/docs/pipeline_readme.Rmd
@@ -29,6 +29,7 @@ This document will provides instructions for the GenoPred pipeline. It covers th
 -   [Requesting outputs](#requesting-outputs)
 -   [Computational resources](#computational-resources)
 -   [Additional parameters](#additional-parameters)
+-   [Running in offline environment](#running-in-offline-environment)
 
 For a general overview of the GenoPred pipeline, click [here](pipeline_overview.html).
 For a technical details of the GenoPred pipeline, click [here](pipeline_technical.html).
@@ -177,9 +178,9 @@ The pipeline is configured using a configfile, which tell the pipeline what to d
 
 ## `configfile`
 
-By default, when you run snakemake it reads the `config.yaml` file in the snakemake directory, which lists various parameters, telling it what to do. 
+Snakemake reads the default `config.yaml` file located in the `pipeline` directory to obtain its default parameters. When using your own data, it’s recommended to create a new `configfile` rather than modifying the default one. You can then specify this custom `configfile` when running Snakemake using the `--configfile` option.
 
-When using your own data, instead of updating the default `config.yaml` file, I would recommend creating a new config file, and then telling snakemake to use that config file, using the `--configfile` parameter. This allows you to use the GenoPred pipeline with multiple configurations.
+This approach allows you to use the GenoPred pipeline with multiple configurations. Importantly, only parameters that differ from the defaults need to be included in your custom `configfile`. Any parameter not explicitly defined in the custom `configfile` will be automatically sourced from the default `pipeline/config.yaml` file. This ensures that Snakemake only overrides the parameters you specify, while continuing to use the default settings for all others.
 
 ```{bash}
  snakemake -j1 --use-conda --configfile=misc/23andMe/config.yaml output_all
@@ -220,7 +221,7 @@ config <- list(
     pgs_methods = list(
       description = 'List of polygenic scoring methods to run',
       example = "`['ptclump','dbslmm']`",
-      note = "Options are: `ptclump`, `dbslmm`, `prscs`, `sbayesr`, `lassosum`, `ldpred2`, `megaprs`"
+      note = "Options are: `ptclump`, `dbslmm`, `prscs`, `sbayesr`, `lassosum`, `ldpred2`, `megaprs`. **Note.** `sbayesr` and `ldpred2` are only implemented for GWAS of EUR ancestry."
     ),
     testing = list(
       description = 'Controls testing mode',
@@ -1250,6 +1251,18 @@ By default, individuals are assigned to a reference super population if the prob
 
 ***
 
+## Specify resources directory
+
+The GenoPred pipeline automatically downloads resources from the internet. By default, these resources are stored within the `resources` folder of the `GenoPred/pipeline` folder. However, the `resdir` parameter in the `configfile` can be used to specify a different folder to store the downloaded resources. This is mainly useful when running the pipeline in an offline environment, as previously downloaded resources can then be used by the pipeline.
+
+***
+
+# Running in offline environment
+
+See [here](running_offline.html) if you would like to run the GenoPred pipeline in an environment that does not have access to the internet. In brief, the user must download the resources required by GenoPred and transfer them to their offline environment.
+
+***
+
 # Troubleshooting
 
 Please post questions as an issue on the GenoPred GitHub repo [here](https://github.com/opain/GenoPred/issues). If errors occur while running the pipeline, log files will be saved in the `GenoPred/pipeline/logs` folder. If running interactively (i.e. -j1), the error should be printed on the screen.

diff --git a/docs/pipeline_readme.html b/docs/pipeline_readme.html
@@ -429,6 +429,8 @@ <h1 class="title toc-ignore">GenoPred Pipeline - Instructions</h1>
 <li><a href="#requesting-outputs">Requesting outputs</a></li>
 <li><a href="#computational-resources">Computational resources</a></li>
 <li><a href="#additional-parameters">Additional parameters</a></li>
+<li><a href="#running-in-offline-environment">Running in offline
+environment</a></li>
 </ul>
 <p>For a general overview of the GenoPred pipeline, click <a
 href="pipeline_overview.html">here</a>. For a technical details of the
@@ -580,25 +582,31 @@ <h1>Pipeline configuration</h1>
 <hr />
 <div id="configfile" class="section level2">
 <h2><code>configfile</code></h2>
-<p>By default, when you run snakemake it reads the
-<code>config.yaml</code> file in the snakemake directory, which lists
-various parameters, telling it what to do.</p>
-<p>When using your own data, instead of updating the default
-<code>config.yaml</code> file, I would recommend creating a new config
-file, and then telling snakemake to use that config file, using the
-<code>--configfile</code> parameter. This allows you to use the GenoPred
-pipeline with multiple configurations.</p>
+<p>Snakemake reads the default <code>config.yaml</code> file located in
+the <code>pipeline</code> directory to obtain its default parameters.
+When using your own data, it’s recommended to create a new
+<code>configfile</code> rather than modifying the default one. You can
+then specify this custom <code>configfile</code> when running Snakemake
+using the <code>--configfile</code> option.</p>
+<p>This approach allows you to use the GenoPred pipeline with multiple
+configurations. Importantly, only parameters that differ from the
+defaults need to be included in your custom <code>configfile</code>. Any
+parameter not explicitly defined in the custom <code>configfile</code>
+will be automatically sourced from the default
+<code>pipeline/config.yaml</code> file. This ensures that Snakemake only
+overrides the parameters you specify, while continuing to use the
+default settings for all others.</p>
 <pre class="bash"><code> snakemake -j1 --use-conda --configfile=misc/23andMe/config.yaml output_all</code></pre>
 <details>
 <summary>
 View configfile parameters
 </summary>
 <table>
 <colgroup>
-<col width="6%" />
-<col width="26%" />
-<col width="15%" />
-<col width="51%" />
+<col width="5%" />
+<col width="20%" />
+<col width="11%" />
+<col width="62%" />
 </colgroup>
 <thead>
 <tr class="header">
@@ -652,7 +660,9 @@ <h2><code>configfile</code></h2>
 <td align="left"><code>['ptclump','dbslmm']</code></td>
 <td align="left">Options are: <code>ptclump</code>, <code>dbslmm</code>,
 <code>prscs</code>, <code>sbayesr</code>, <code>lassosum</code>,
-<code>ldpred2</code>, <code>megaprs</code></td>
+<code>ldpred2</code>, <code>megaprs</code>. <strong>Note.</strong>
+<code>sbayesr</code> and <code>ldpred2</code> are only implemented for
+GWAS of EUR ancestry.</td>
 </tr>
 <tr class="odd">
 <td align="left"><code>testing</code></td>
@@ -1984,6 +1994,26 @@ <h2>Altering ancestry threshold</h2>
 config file.</p>
 <hr />
 </div>
+<div id="specify-resources-directory" class="section level2">
+<h2>Specify resources directory</h2>
+<p>The GenoPred pipeline automatically downloads resources from the
+internet. By default, these resources are stored within the
+<code>resources</code> folder of the <code>GenoPred/pipeline</code>
+folder. However, the <code>resdir</code> parameter in the
+<code>configfile</code> can be used to specify a different folder to
+store the downloaded resources. This is mainly useful when running the
+pipeline in an offline environment, as previously downloaded resources
+can then be used by the pipeline.</p>
+<hr />
+</div>
+</div>
+<div id="running-in-offline-environment" class="section level1">
+<h1>Running in offline environment</h1>
+<p>See <a href="running_offline.html">here</a> if you would like to run
+the GenoPred pipeline in an environment that does not have access to the
+internet. In brief, the user must download the resources required by
+GenoPred and transfer them to their offline environment.</p>
+<hr />
 </div>
 <div id="troubleshooting" class="section level1">
 <h1>Troubleshooting</h1>