Merge pull request #65 from PGScatalog/dev
Release v1.3
nebfield authored Nov 21, 2022
2 parents 7e11510 + 86fa8e3 commit 94d054e
Showing 28 changed files with 316 additions and 137 deletions.
109 changes: 59 additions & 50 deletions bin/report.Rmd
@@ -25,13 +25,6 @@ read_log <- function(path) {
return(
log %>%
mutate(
accession = ifelse(
stringr::str_detect(accession, "^PGS"),
stringr::str_extract(accession, "^PGS[0-9]{6}"),
accession
)
) %>%
mutate(
is_multiallelic = factor(is_multiallelic, levels = c("false", "true")),
ambiguous = factor(ambiguous, levels = c("false", "true"))
@@ -69,53 +62,69 @@ cat command.txt
### Scoring files

```{r, echo = FALSE}
call_api <- function(log) {
accessions <- trimws(unique(log$accession))
pgs_ids <- paste(accessions, collapse = ",")
api_url <- paste0("https://www.pgscatalog.org/rest/score/search?pgs_ids=", pgs_ids)
res <- GET(api_url)
return(fromJSON(rawToChar(res$content)))
}
add_urls <- function(df) {
df$accession <- df$pgs_id
df$pgs_id <- paste0('<a href="https://www.pgscatalog.org/score/', df$pgs_id, '">', df$pgs_id, '</a>')
df$pgp_id <- paste0('<a href="https://www.pgscatalog.org/publication/', df$id, '">', df$id, '</a>')
df$id <- NULL
return(df)
}
combine_mapped_traits <- function(json_efo_entry){
mapped_html <- paste0('<a href="', json_efo_entry$url, '">', json_efo_entry$label, '</a>')
return(paste(mapped_html, collapse=', '))
}
make_publication_table <- function(log) {
json <- call_api(log)
if (json$size == 0) {
return(data.frame(accession=NA, pgp_id = NA, Publication = NA, reported = NA, mapped=NA, genome_build=NA,pgs_id=NA))
json_scorefiles <- read_json('log_combined.json', simplifyVector=TRUE)
parse_json2rows <- function(score_header){
result <- list(pgs_id=NA, Publication = NA, Trait=NA, n_variants = as.integer(score_header[['variants_number']]), genome_build=NA)
# pgs_id
pgs_id <- ''
if(is.null(score_header[['pgs_id']]) == FALSE){
pgs_id <- paste0('<a href="https://www.pgscatalog.org/score/', score_header[['pgs_id']], '">', score_header[['pgs_id']], '</a>')
}
# Add name
if(is.null(score_header[['pgs_name']]) == FALSE){
if(pgs_id == ''){
pgs_id <- score_header[['pgs_name']]
}else{
pgs_id <- paste0(pgs_id, '<br><small>(', score_header[['pgs_name']], ')</small>')
}
}
result[['pgs_id']] <- pgs_id
df <- data.frame(pgs_id = json$results$id)
pubs <- json$results$publication[, c("id", "firstauthor", "journal", "date_publication")]
traits <- data.frame(reported = json$results$trait_reported,
mapped = sapply(json$results$trait_efo, combine_mapped_traits))
unformatted <- dplyr::bind_cols(df, pubs, traits)
# pgp_id
pgp_id <- ''
if(is.null(score_header[['pgp_id']]) == FALSE){
pgp_id <- paste0('<a href="https://www.pgscatalog.org/publication/', score_header[['pgp_id']], '">', score_header[['pgp_id']], '</a>')
}
if(is.null(score_header[['citation']]) == FALSE){
if(pgp_id == ''){
pgp_id <- paste0('<small>', score_header[['citation']], '</small>')
}else{
pgp_id <- paste(pgp_id, paste0('<br><small>', score_header[['citation']], '</small>'))
}
}
if(pgp_id != ''){
result[['Publication']] <- pgp_id
}
# Trait
trait_mapped <- NA
if(length(score_header[['trait_efo']]) > 0){
efo <- data.frame(label = score_header[['trait_mapped']], id = score_header[['trait_efo']])
efo$display <- paste0('<a href="http://www.ebi.ac.uk/efo/', efo$id, '">', efo$label,'</a>')
trait_mapped <- paste0('<u>Mapped trait</u>: ', paste(efo$display, collapse=" "))
}
if(is.null(score_header[['trait_reported']])){
result[['Trait']] <- trait_mapped
}else if(is.na(trait_mapped)){
result[['Trait']] <- paste0('<u>Reported trait</u>: ', score_header[['trait_reported']])
}else{
result[['Trait']] <- paste0('<u>Reported trait</u>: ', score_header[['trait_reported']], "<br>", trait_mapped)
}
unformatted$genome_build <- json$results$variants_genomebuild
# Genome build
build_info <- paste0('<u>Reported</u>: ', score_header[['genome_build']])
if(score_header[['use_harmonised']] == TRUE){
build_info <- paste0(build_info, '<br><u>Harmonized Build</u>: ', score_header[['HmPOS_build']])
}
result[['genome_build']] <- build_info
return(add_urls(tidyr::unite(unformatted, "Publication", firstauthor:date_publication, sep = " ")))
}
return(result)
}
df_scorefiles <- do.call(rbind.data.frame, lapply(json_scorefiles, parse_json2rows))
df_scorefiles$accession <- rownames(df_scorefiles)
log %>%
count(accession, wt = count, name = "n_variants") %>%
left_join(make_publication_table(log), by = "accession") %>%
tidyr::unite("Publication", c(pgp_id, Publication), sep = " | ") %>%
mutate(reported = paste0("<u>Reported trait</u>: ", reported),
mapped = paste0("<u>Mapped trait</u>: ", mapped)) %>%
tidyr::unite("Trait", c(reported, mapped), sep = "<br>") %>%
df_scorefiles %>%
select(accession, pgs_id, Publication, Trait, n_variants, genome_build) %>%
DT::datatable(
rownames = FALSE,
@@ -124,7 +133,7 @@ log %>%
"Scoring file" = "accession",
"Polygenic Score ID" = "pgs_id",
"Number of variants" = "n_variants",
"Genome build (reported)" = "genome_build"
"Genome build" = "genome_build"
),
extensions = 'Buttons',
options = list(dom = 'Bfrtip',
15 changes: 9 additions & 6 deletions conf/modules.config
@@ -11,13 +11,21 @@
*/

process {
withName: 'PGSCATALOG_GET|SCORE_REPORT|SCORE_AGGREGATE|MATCH_VARIANTS' {
withName: 'PGSCATALOG_GET|SCORE_REPORT|SCORE_AGGREGATE' {
publishDir = [
path: { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" },
mode: 'copy'
]
}

withName: 'MATCH_VARIANTS|MATCH_COMBINE' {
publishDir = [
path: { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" },
pattern: '*.{gz,csv,yml}',
mode: 'copy'
]
}

withLabel: copy_genomes {
publishDir = [
path: { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" },
@@ -30,7 +38,6 @@ process {
ext.args = '-v'
}


withName: PLINK2_VCF {
ext.args = "--new-id-max-allele-len 100 missing"
}
@@ -43,10 +50,6 @@ process {
ext.args = "--new-id-max-allele-len 100 missing --allow-extra-chr"
}

withName: MATCH_VARIANTS {
ext.args = "--min_overlap " + params.min_overlap
}

withName: PLINK2_SCORE {
ext.args2 = "zs" // compress .sscore with zstd by default
}
1 change: 1 addition & 0 deletions docs/_templates/globaltoc.html
@@ -8,6 +8,7 @@ <h3>Contents</h3>
<ul>
<li><a href="{{ pathto('reference/params') }}">Input Parameters/Flags</a></li>
<li><a href="{{ pathto('reference/input') }}">Samplesheet schema</a></li>
<li><a href="{{ pathto('reference/containers') }}">Containers</a></li>
</ul>
<li><a href="{{ pathto('output') }}">Outputs & results</a></li>
<li><a href="{{ pathto('troubleshooting') }}">Troubleshooting</a></li>
42 changes: 35 additions & 7 deletions docs/changelog.rst
@@ -1,5 +1,5 @@
:orphan:

Changelog
---------

@@ -8,6 +8,31 @@ will only occur in major versions with changes noted in this changelog.

.. _`semantic versioning`: https://semver.org/

pgsc_calc v1.3.0 (2022-11-21)
-----------------------------

This release is focused on improving scalability.

Features
~~~~~~~~

- Variant matching is made more efficient using a split-apply-combine approach
when the data is split across chromosomes. This supports parallel PGS
calculation for the largest traits in the PGS Catalog (e.g. cancer: 418 PGS,
averaging ~261,000 variants per score) on big datasets such as UK Biobank.

- Better support for running in offline environments:

- Internet access is only required to download scores by ID. Scores can be
pre-downloaded using the pgscatalog-utils package
(https://pypi.org/project/pgscatalog-utils/); see the sketch below this list

- Scoring file metadata is read from headers and displayed in the report
(removed API calls during report generation)

- Implemented a flag (``--efo_direct``) to return only PGS tagged with the exact
EFO term (i.e. excluding PGS for child/descendant terms in the ontology), as
shown in the example below
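
For example, combining ``--efo_direct`` with the documented ``--trait_efo``
selector (a sketch; EFO_0001645 is the coronary artery disease term used in the
how-to guides):

.. code-block:: console

   # return only scores annotated with EFO_0001645 itself,
   # not scores tagged with child/descendant terms
   --trait_efo EFO_0001645 --efo_direct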
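
A minimal sketch of the offline workflow, assuming the ``download_scorefiles``
entry point from pgscatalog-utils and the pipeline's ``--scorefile`` parameter
(check the package and pipeline documentation for the exact flag names):

.. code-block:: console

   # on a machine with internet access: fetch scoring files by PGS ID
   pip install pgscatalog-utils
   download_scorefiles -i PGS001229 -o scorefiles/

   # on the offline system: point pgsc_calc at the local scoring files
   nextflow run pgscatalog/pgsc_calc -profile singularity \
       --input samplesheet.csv \
       --scorefile "scorefiles/*.txt.gz"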

pgsc_calc v1.2.0 (2022-10-11)
-----------------------------

@@ -24,12 +49,15 @@ Features
- Improvements to use less storage space:

- All intermediate files are now compressed by default

- Add parameter to support zstd compressed input files

- Improved memory usage when matching variants (``pgscatalog_utils=v0.1.2``
https://github.com/PGScatalog/pgscatalog_utils)

- Revised interface to select scores from the PGS Catalog using flags:
``--trait_efo`` (EFO ID / traits), ``--pgp_id`` (PGP ID / publications), ``--pgs_id`` (PGS ID, individual scores).

.. _samplesheet: https://pgsc-calc.readthedocs.io/en/dev/reference/input.html
.. _durable caching: https://pgsc-calc.readthedocs.io/en/dev/reference/params.html#parameter-schema

@@ -78,16 +106,16 @@ Bug fixes

- Implemented a more robust prioritisation procedure if a variant has multiple
candidate matches or duplicated IDs

- Fixed processing multiple samplesets in parallel (e.g. 1000 Genomes + UK
Biobank)

- When combining multiple scoring files, all variants are now kept to reflect the
correct denominator for % matching statistics.

- When trying to correct for strand flips, the matched effect allele wasn't
being complemented correctly

pgsc_calc v1.0.0 (2022-05-24)
--------------------------------

@@ -117,7 +145,7 @@ Features
- Simplified JSON input processes
- Add first draft of documentation
- Add JSON schemas for validating input data (mostly for web platform)

pgsc_calc v0.1.2dev (2022-01-17)
--------------------------------

6 changes: 5 additions & 1 deletion docs/how-to/bigjob.rst
@@ -76,7 +76,6 @@ allocations (e.g. ``process_low``). Here's an example for an LSF cluster:
.. code-block:: text
process {
executor = 'lsf'
queue = 'short'
clusterOptions = ''
scratch = true
@@ -93,6 +92,11 @@ allocations (e.g. ``process_low``). Here's an example for an LSF cluster:
}
}
executor {
name = 'lsf'
jobName = { "$task.hash" }
}
In SLURM, queue is equivalent to a partition. Specific cluster parameters can be
provided by modifying ``clusterOptions``. You should change ``cpus``,
``memory``, and ``time`` to match the amount of resources used. Assuming the
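
A minimal SLURM counterpart to the LSF snippet above (a sketch: the partition
name and the resource values are placeholders to adapt to your cluster):

.. code-block:: text

   process {
       executor = 'slurm'
       queue = 'standard'       // a SLURM partition (placeholder)
       clusterOptions = ''
       scratch = true

       withLabel:process_low {
           cpus = 2
           memory = 8.GB
           time = 1.h
       }
   }

   executor {
       name = 'slurm'
       jobName = { "$task.hash" }
   }
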
12 changes: 6 additions & 6 deletions docs/how-to/calculate_pgscatalog.rst
@@ -59,12 +59,12 @@ Traits
~~~~~~

If you would like to calculate every polygenic score in the Catalog for a
`trait`_, like `coronary artery disease`_, then you can use the ``--trait``
`trait`_, like `coronary artery disease`_, then you can use the ``--trait_efo``
parameter:

.. code-block:: console
--trait EFO_0001645
--trait_efo EFO_0001645
Multiple traits can be set by using a comma separated list.
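
For example, scores for more than one trait can be requested in a single run
(a sketch; the second EFO identifier below is only a placeholder to swap for a
real term):

.. code-block:: console

   --trait_efo EFO_0001645,EFO_0001234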

@@ -76,11 +76,11 @@ Publications
~~~~~~~~~~~~

If you would like to calculate every polygenic score associated with a
`publication`_ in the PGS Catalog, you can use the ``--publication`` parameter:
`publication`_ in the PGS Catalog, you can use the ``--pgp_id`` parameter:

.. code-block:: console
--publication PGP000001
--pgp_id PGP000001
Multiple publications can be set by using a comma separated list.
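
For example (a sketch; PGP000001 comes from above, and PGP000002 stands in as
an illustrative second publication identifier):

.. code-block:: console

   --pgp_id PGP000001,PGP000002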

@@ -98,8 +98,8 @@ Multiple traits can be set by using a comma separated list.
-profile <docker/singularity/conda> \
--input samplesheet.csv \
--pgs_id PGS001229 \
--trait EFO_0001645 \
--publication PGP000001
--trait_efo EFO_0001645 \
--pgp_id PGP000001
.. note:: For more details about calculating multiple scores, see :ref:`multiple`

6 changes: 3 additions & 3 deletions docs/output.rst
@@ -175,9 +175,9 @@ variant was matched against the target genomes:
* - ``duplicate_ID``
- True/False flag indicating whether multiple scoring file variants match a single target ID.
* - ``match_status``
- Indicates whether the variant is matched (included in the final scoring file), excluded (matched but removed
based on variant filters), not_best (a different match candidate was selected for this scoring file variant),
or unmatched.
- Indicates whether the variant is *matched* (included in the final scoring file), *excluded* (matched but removed
based on variant filters), *not_best* (a different match candidate was selected for this scoring file variant),
or *unmatched*.
* - ``dataset``
- Name of the sampleset/genotyping data.

16 changes: 10 additions & 6 deletions docs/reference/containers.rst
@@ -7,13 +7,10 @@ software that we need to calculate scores. Below is a list of container images
for reference, which might be helpful if you'd like to download and inspect them
manually.

.. note:: 4 containers are currently required to run ``pgsc_calc``

Docker
-------------
Software
--------

.. include:: ../_build/docker_containers.txt
:literal:
.. note:: 4 containers are currently required to run ``pgsc_calc``


Some containers are made by `Biocontainers`_, and hosted on their container
@@ -36,6 +33,13 @@ Other containers are hosted on a Gitlab container registry:
.. _`plink2 2.00a3.3`: https://www.cog-genomics.org/plink/2.0/
.. _`multiqc`: https://quay.io/repository/biocontainers/multiqc?tab=info
.. _`pgscatalog_utils`: https://github.com/PGScatalog/pgscatalog_utils


Docker
------

.. include:: ../_build/docker_containers.txt
:literal:


Choosing a container architecture