Merge pull request #65 from PGScatalog/dev
Release v1.3
nebfield authored Nov 21, 2022
2 parents 7e11510 + 86fa8e3 commit 94d054e
Showing 28 changed files with 316 additions and 137 deletions.
109 changes: 59 additions & 50 deletions bin/report.Rmd
@@ -25,13 +25,6 @@ read_log <- function(path) {
return(
log %>%
mutate(
accession = ifelse(
stringr::str_detect(accession, "^PGS"),
stringr::str_extract(accession, "^PGS[0-9]{6}"),
accession
)
) %>%
mutate(
is_multiallelic = factor(is_multiallelic, levels = c("false", "true")),
ambiguous = factor(ambiguous, levels = c("false", "true"))
@@ -69,53 +62,69 @@ cat command.txt
### Scoring files

```{r, echo = FALSE}
call_api <- function(log) {
accessions <- trimws(unique(log$accession))
pgs_ids <- paste(accessions, collapse = ",")
api_url <- paste0("https://www.pgscatalog.org/rest/score/search?pgs_ids=", pgs_ids)
res <- GET(api_url)
return(fromJSON(rawToChar(res$content)))
}
add_urls <- function(df) {
df$accession <- df$pgs_id
df$pgs_id <- paste0('<a href="https://www.pgscatalog.org/score/', df$pgs_id, '">', df$pgs_id, '</a>')
df$pgp_id <- paste0('<a href="https://www.pgscatalog.org/publication/', df$id, '">', df$id, '</a>')
df$id <- NULL
return(df)
}
combine_mapped_traits <- function(json_efo_entry){
mapped_html <- paste0('<a href="', json_efo_entry$url, '">', json_efo_entry$label, '</a>')
return(paste(mapped_html, collapse=', '))
}
make_publication_table <- function(log) {
json <- call_api(log)
if (json$size == 0) {
return(data.frame(accession=NA, pgp_id = NA, Publication = NA, reported = NA, mapped=NA, genome_build=NA,pgs_id=NA))
json_scorefiles <- read_json('log_combined.json', simplifyVector=TRUE)
parse_json2rows <- function(score_header){
result <- list(pgs_id=NA, Publication = NA, Trait=NA, n_variants = as.integer(score_header[['variants_number']]), genome_build=NA)
# pgs_id
pgs_id <- ''
if(is.null(score_header[['pgs_id']]) == FALSE){
pgs_id <- paste0('<a href="https://www.pgscatalog.org/score/', score_header[['pgs_id']], '">', score_header[['pgs_id']], '</a>')
}
# Add name
if(is.null(score_header[['pgs_name']]) == FALSE){
if(pgs_id == ''){
pgs_id <- score_header[['pgs_name']]
}else{
pgs_id <- paste0(pgs_id, '<br><small>(', score_header[['pgs_name']], ')</small>')
}
}
result[['pgs_id']] <- pgs_id
df <- data.frame(pgs_id = json$results$id)
pubs <- json$results$publication[, c("id", "firstauthor", "journal", "date_publication")]
traits <- data.frame(reported = json$results$trait_reported,
mapped = sapply(json$results$trait_efo, combine_mapped_traits))
unformatted <- dplyr::bind_cols(df, pubs, traits)
# pgp_id
pgp_id <- ''
if(is.null(score_header[['pgp_id']]) == FALSE){
pgp_id <- paste0('<a href="https://www.pgscatalog.org/publication/', score_header[['pgp_id']], '">', score_header[['pgp_id']], '</a>')
}
if(is.null(score_header[['citation']]) == FALSE){
if(pgp_id == ''){
pgp_id <- paste0('<small>', score_header[['citation']], '</small>')
}else{
pgp_id <- paste(pgp_id, paste0('<br><small>', score_header[['citation']], '</small>'))
}
}
if(pgp_id != ''){
result[['Publication']] <- pgp_id
}
# Trait
trait_mapped <- NA
if(length(score_header[['trait_efo']]) > 0){
efo <- data.frame(label = score_header[['trait_mapped']], id = score_header[['trait_efo']])
efo$display <- paste0('<a href="http://www.ebi.ac.uk/efo/', efo$id, '">', efo$label,'</a>')
trait_mapped <- paste0('<u>Mapped trait</u>: ', paste(efo$display, collapse=" "))
}
if(is.null(score_header[['trait_reported']])){
result[['Trait']] <- trait_mapped
}else if(is.na(trait_mapped)){
result[['Trait']] <- paste0('<u>Reported trait</u>: ', score_header[['trait_reported']])
}else{
result[['Trait']] <- paste0('<u>Reported trait</u>: ', score_header[['trait_reported']], "<br>", trait_mapped)
}
unformatted$genome_build <- json$results$variants_genomebuild
# Genome build
build_info <- paste0('<u>Reported</u>: ', score_header[['genome_build']])
if(score_header[['use_harmonised']] == TRUE){
build_info <- paste0(build_info, '<br><u>Harmonized Build</u>: ', score_header[['HmPOS_build']])
}
result[['genome_build']] <- build_info
return(add_urls(tidyr::unite(unformatted, "Publication", firstauthor:date_publication, sep = " ")))
}
return(result)
}
df_scorefiles <- do.call(rbind.data.frame, lapply(json_scorefiles, parse_json2rows))
df_scorefiles$accession <- rownames(df_scorefiles)
log %>%
count(accession, wt = count, name = "n_variants") %>%
left_join(make_publication_table(log), by = "accession") %>%
tidyr::unite("Publication", c(pgp_id, Publication), sep = " | ") %>%
mutate(reported = paste0("<u>Reported trait</u>: ", reported),
mapped = paste0("<u>Mapped trait</u>: ", mapped)) %>%
tidyr::unite("Trait", c(reported, mapped), sep = "<br>") %>%
df_scorefiles %>%
select(accession, pgs_id, Publication, Trait, n_variants, genome_build) %>%
DT::datatable(
rownames = FALSE,
@@ -124,7 +133,7 @@ log %>%
"Scoring file" = "accession",
"Polygenic Score ID" = "pgs_id",
"Number of variants" = "n_variants",
"Genome build (reported)" = "genome_build"
"Genome build" = "genome_build"
),
extensions = 'Buttons',
options = list(dom = 'Bfrtip',
15 changes: 9 additions & 6 deletions conf/modules.config
@@ -11,13 +11,21 @@
*/

process {
withName: 'PGSCATALOG_GET|SCORE_REPORT|SCORE_AGGREGATE|MATCH_VARIANTS' {
withName: 'PGSCATALOG_GET|SCORE_REPORT|SCORE_AGGREGATE' {
publishDir = [
path: { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" },
mode: 'copy'
]
}

withName: 'MATCH_VARIANTS|MATCH_COMBINE' {
publishDir = [
path: { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" },
pattern: '*.{gz,csv,yml}',
mode: 'copy'
]
}

withLabel: copy_genomes {
publishDir = [
path: { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" },
@@ -30,7 +38,6 @@ process {
ext.args = '-v'
}


withName: PLINK2_VCF {
ext.args = "--new-id-max-allele-len 100 missing"
}
@@ -43,10 +50,6 @@ process {
ext.args = "--new-id-max-allele-len 100 missing --allow-extra-chr"
}

withName: MATCH_VARIANTS {
ext.args = "--min_overlap " + params.min_overlap
}

withName: PLINK2_SCORE {
ext.args2 = "zs" // compress .sscore with zstd by default
}
1 change: 1 addition & 0 deletions docs/_templates/globaltoc.html
@@ -8,6 +8,7 @@ <h3>Contents</h3>
<ul>
<li><a href="{{ pathto('reference/params') }}">Input Parameters/Flags</a></li>
<li><a href="{{ pathto('reference/input') }}">Samplesheet schema</a></li>
<li><a href="{{ pathto('reference/containers') }}">Containers</a></li>
</ul>
<li><a href="{{ pathto('output') }}">Outputs & results</a></li>
<li><a href="{{ pathto('troubleshooting') }}">Troubleshooting</a></li>
42 changes: 35 additions & 7 deletions docs/changelog.rst
@@ -1,5 +1,5 @@
:orphan:

Changelog
---------

@@ -8,6 +8,31 @@ will only occur in major versions with changes noted in this changelog.

.. _`semantic versioning`: https://semver.org/

pgsc_calc v1.3.0 (2022-11-21)
-----------------------------

This release is focused on improving scalability.

Features
~~~~~~~~

- Variant matching is made more efficient using a split-apply-combine approach
when the data is split across chromosomes. This supports parallel PGS
calculation for the largest traits in the PGS Catalog (e.g. cancer: 418 PGS,
averaging ~261,000 variants per score) on big datasets such as UK Biobank.

- Better support for running in offline environments:

- Internet access is only required to download scores by ID. Scores can be
pre-downloaded using the pgscatalog-utils package
(https://pypi.org/project/pgscatalog-utils/); see the sketch below this list

- Scoring file metadata is read from headers and displayed in the report
(removed API calls during report generation)

- Implemented a flag (``--efo_direct``) to return only PGS tagged with the exact
EFO term (i.e. excluding PGS for child/descendant terms in the ontology), as
shown in the example below
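
For example, combining ``--efo_direct`` with the documented ``--trait_efo``
selector (a sketch; EFO_0001645 is the coronary artery disease term used in the
how-to guides):

.. code-block:: console

   # return only scores annotated with EFO_0001645 itself,
   # not scores tagged with child/descendant terms
   --trait_efo EFO_0001645 --efo_direct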
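
A minimal sketch of the offline workflow, assuming the ``download_scorefiles``
entry point from pgscatalog-utils and the pipeline's ``--scorefile`` parameter
(check the package and pipeline documentation for the exact flag names):

.. code-block:: console

   # on a machine with internet access: fetch scoring files by PGS ID
   pip install pgscatalog-utils
   download_scorefiles -i PGS001229 -o scorefiles/

   # on the offline system: point pgsc_calc at the local scoring files
   nextflow run pgscatalog/pgsc_calc -profile singularity \
       --input samplesheet.csv \
       --scorefile "scorefiles/*.txt.gz"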

pgsc_calc v1.2.0 (2022-10-11)
-----------------------------

@@ -24,12 +49,15 @@ Features
- Improvements to use less storage space:

- All intermediate files are now compressed by default

- Add parameter to support zstd compressed input files

- Improved memory usage when matching variants (``pgscatalog_utils=v0.1.2``
https://github.com/PGScatalog/pgscatalog_utils)

- Revised interface to select scores from the PGS Catalog using flags:
``--trait_efo`` (EFO ID / traits), ``--pgp_id`` (PGP ID / publications), ``--pgs_id`` (PGS ID, individual scores).

.. _samplesheet: https://pgsc-calc.readthedocs.io/en/dev/reference/input.html
.. _durable caching: https://pgsc-calc.readthedocs.io/en/dev/reference/params.html#parameter-schema

@@ -78,16 +106,16 @@ Bug fixes

- Implemented a more robust prioritisation procedure if a variant has multiple
candidate matches or duplicated IDs

- Fixed processing multiple samplesets in parallel (e.g. 1000 Genomes + UK
Biobank)

- When combining multiple scoring files, all variants are now kept to reflect the
correct denominator for % matching statistics.

- When trying to correct for strand flips, the matched effect allele wasn't
being complemented correctly

pgsc_calc v1.0.0 (2022-05-24)
--------------------------------

@@ -117,7 +145,7 @@ Features
- Simplified JSON input processes
- Add first draft of documentation
- Add JSON schemas for validating input data (mostly for web platform)

pgsc_calc v0.1.2dev (2022-01-17)
--------------------------------

6 changes: 5 additions & 1 deletion docs/how-to/bigjob.rst
@@ -76,7 +76,6 @@ allocations (e.g. ``process_low``). Here's an example for an LSF cluster:
.. code-block:: text
process {
executor = 'lsf'
queue = 'short'
clusterOptions = ''
scratch = true
@@ -93,6 +92,11 @@ allocations (e.g. ``process_low``). Here's an example for an LSF cluster:
}
}
executor {
name = 'lsf'
jobName = { "$task.hash" }
}
In SLURM, queue is equivalent to a partition. Specific cluster parameters can be
provided by modifying ``clusterOptions``. You should change ``cpus``,
``memory``, and ``time`` to match the amount of resources used. Assuming the
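
A minimal SLURM counterpart to the LSF snippet above (a sketch: the partition
name and the resource values are placeholders to adapt to your cluster):

.. code-block:: text

   process {
       executor = 'slurm'
       queue = 'standard'       // a SLURM partition (placeholder)
       clusterOptions = ''
       scratch = true

       withLabel:process_low {
           cpus = 2
           memory = 8.GB
           time = 1.h
       }
   }

   executor {
       name = 'slurm'
       jobName = { "$task.hash" }
   }
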
12 changes: 6 additions & 6 deletions docs/how-to/calculate_pgscatalog.rst
@@ -59,12 +59,12 @@ Traits
~~~~~~

If you would like to calculate every polygenic score in the Catalog for a
`trait`_, like `coronary artery disease`_, then you can use the ``--trait``
`trait`_, like `coronary artery disease`_, then you can use the ``--trait_efo``
parameter:

.. code-block:: console
--trait EFO_0001645
--trait_efo EFO_0001645
Multiple traits can be set by using a comma separated list.
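
For example, scores for more than one trait can be requested in a single run
(a sketch; the second EFO identifier below is only a placeholder to swap for a
real term):

.. code-block:: console

   --trait_efo EFO_0001645,EFO_0001234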

@@ -76,11 +76,11 @@ Publications
~~~~~~~~~~~~

If you would like to calculate every polygenic score associated with a
`publication`_ in the PGS Catalog, you can use the ``--publication`` parameter:
`publication`_ in the PGS Catalog, you can use the ``--pgp_id`` parameter:

.. code-block:: console
--publication PGP000001
--pgp_id PGP000001
Multiple publications can be set by using a comma separated list.
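
For example (a sketch; PGP000001 comes from above, and PGP000002 stands in as
an illustrative second publication identifier):

.. code-block:: console

   --pgp_id PGP000001,PGP000002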

@@ -98,8 +98,8 @@ Multiple traits can be set by using a comma separated list.
-profile <docker/singularity/conda> \
--input samplesheet.csv \
--pgs_id PGS001229 \
--trait EFO_0001645 \
--publication PGP000001
--trait_efo EFO_0001645 \
--pgp_id PGP000001
.. note:: For more details about calculating multiple scores, see :ref:`multiple`

6 changes: 3 additions & 3 deletions docs/output.rst
@@ -175,9 +175,9 @@ variant was matched against the target genomes:
* - ``duplicate_ID``
- True/False flag indicating whether multiple scoring file variants match a single target ID.
* - ``match_status``
- Indicates whether the variant is matched (included in the final scoring file), excluded (matched but removed
based on variant filters), not_best (a different match candidate was selected for this scoring file variant),
or unmatched.
- Indicates whether the variant is *matched* (included in the final scoring file), *excluded* (matched but removed
based on variant filters), *not_best* (a different match candidate was selected for this scoring file variant),
or *unmatched*.
* - ``dataset``
- Name of the sampleset/genotyping data.

16 changes: 10 additions & 6 deletions docs/reference/containers.rst
@@ -7,13 +7,10 @@ software that we need to calculate scores. Below is a list of container images
for reference, which might be helpful if you'd like to download and inspect them
manually.

.. note:: 4 containers are currently required to run ``pgsc_calc``

Docker
-------------
Software
--------

.. include:: ../_build/docker_containers.txt
:literal:
.. note:: 4 containers are currently required to run ``pgsc_calc``


Some containers are made by `Biocontainers`_, and hosted on their container
@@ -36,6 +33,13 @@ Other containers are hosted on a Gitlab container registry:
.. _`plink2 2.00a3.3`: https://www.cog-genomics.org/plink/2.0/
.. _`multiqc`: https://quay.io/repository/biocontainers/multiqc?tab=info
.. _`pgscatalog_utils`: https://github.com/PGScatalog/pgscatalog_utils


Docker
------

.. include:: ../_build/docker_containers.txt
:literal:


Choosing a container architecture