From 43e7aeeb5ad22afba9751713046abf98b237a57d Mon Sep 17 00:00:00 2001 From: rafael buono <77321541+rabuono@users.noreply.github.com> Date: Wed, 29 May 2024 10:51:29 +0200 Subject: [PATCH 01/18] add metadata --- data-analysis/pathogen-characterisation.md | 79 ++++++++++++++++++---- 1 file changed, 65 insertions(+), 14 deletions(-) diff --git a/data-analysis/pathogen-characterisation.md b/data-analysis/pathogen-characterisation.md index 2c2018bc..7e390de6 100644 --- a/data-analysis/pathogen-characterisation.md +++ b/data-analysis/pathogen-characterisation.md @@ -1,27 +1,78 @@ --- title: Pathogen characterisation description: Generic workflows for different data types. -contributors: [] -no_robots: true -search_exclude: true -sitemap: false +contributors: [Francesco Messina, Rafael Andrade Buono] page_id: pc_data_analysis redirect_from: /pathogen-characterisation/data-analysis -rdmkit: - - name: - url: +related_pages: + showcase: [covid19_galaxy_project] training: - - name: - registry: - url: + - name: SARS-CoV-2 data analysis + registry: Carpentries + url: https://gallantries.github.io/video-library/modules/covid-analysis + - name: SARS-CoV-2, viruses and bacteria data analysis + registry: Carpentries + url: https://gallantries.github.io/video-library/modules/one-health + - name: Pathway analysis with the MINERVA Platform + registry: Other + url: https://gxy.io/GTN:T00437 +rdmkit: + - name: “Your tasks: Data Analysis” + url: https://rdmkit.elixir-europe.org/data_analysis +faircookbook: + - name: + url: +fairsharing: + - name: + url: + # More information on how to fill in this metadata section can be found here https://www.infectious-diseases-toolkit.org/contribute/page-metadata --- -**We are still working on the content for this page.** If you are interested in adding to the page, then: + + +## Introduction + +Provide first a short introduction to the subject covered on this page (up to 3 sentences). 
This could be, for example, data analysis methods relevant to pathogen characterisation. This would be the key sentences for the readers to capture the main idea of the page. + +If needed be, add a further introduction to the topic that is general. Concrete topic introductions should go below. + + +## Concrete topic 1 + +Short explanation of what this topic is about and why it is important, with an emphasis on infectious diseases and the category that you selected e.g. pathogen characterisation. + +### Considerations + +Using a bullet point style list format as much as possible, describe what should be taken into account (i.e. what considerations you should have) for the topic being covered on this page, specifically with regard to infectious diseases in the broader category selected (e.g. pathogen characterisation). This could be, for example; which features are important to consider when selecting data sources? What capabilities are important when defining tools to be used for quality control? What are general characteristics that you should look for in data standards for human biomolecular data. The considerations provided here should help to justify the existing approaches/solutions described in the next section. + +Please avoid replicating 'generic' guidelines, i.e. those not specific to infectious diseases, here. Add links to RDMkit in the metadata above, if any are needed. Links to other sources can also be provided in text as needed. + +### Existing approaches + +Using a bullet point list style as much as possible, describe when, why and for what purpose a specific tool or resource should be used. + +Please avoid replicating 'generic' guidelines, i.e. those not specific to infectious diseases. + +Avoid making long lists of links to tools and resources. The IDTk does not aim to list all possible approaches and solutions. The focus is on contextualised best practices approaches. 
+ +The tools or resources inserted in this section do not have to be considered a 'final' or 'perfect' solution, but should be something that is used by the wider community working in this area or topic. The existing approaches should also reflect the considerations mentioned in the “Considerations” subheading. + +Make sure to add to the Tools and resources list table all of the tools and resources mentioned in the text + +## Concrete topic 2 + +Follow the same guidelines as in Concrete topic 1 + +### Considerations + +Follow the same guidelines as in Concrete topic 1 + +### Existing approaches -[Feel free to contribute](/contribute/){: class="btn btn-primary btn-lg rounded-pill"} +Follow the same guidelines as in Concrete topic 1 -This is a community-driven website, so contributions are welcome! You will, of course, be listed as a contributor on the page. + -New content is announced on the [home page](/) and [news page](/about/news), so please check for updates there. You can also watch for changes on this page by using a free service like [Visual Ping](https://visualping.io/) or [Distill Web Monitor](https://distill.io/), or by using a [browser add-on](https://chrome.google.com/webstore/detail/distill-web-monitor/inlikjemeeknofckkjolnjbpehgadgge?hl=en). 
+ From ebbafd02e6b6134e876305db89980afa34c4b979 Mon Sep 17 00:00:00 2001 From: bedroesb Date: Thu, 30 May 2024 10:04:31 +0200 Subject: [PATCH 02/18] adding tools --- _data/tool_and_resource_list.yml | 58 ++++++++++++++++++++++++++++++-- 1 file changed, 55 insertions(+), 3 deletions(-) diff --git a/_data/tool_and_resource_list.yml b/_data/tool_and_resource_list.yml index b31e1ddd..70b4275d 100644 --- a/_data/tool_and_resource_list.yml +++ b/_data/tool_and_resource_list.yml @@ -286,12 +286,12 @@ biotools: Flye tess: Flye url: https://github.com/fenderglass/Flye -- description: FreeBayes is a Bayesian genetic variant detector designed to find small polymorphisms, specifically SNPs, indels, MNPs, and complex events smaller than the length of a short-read sequencing alignment. +- description: freebayes is a Bayesian genetic variant detector designed to find small polymorphisms, specifically SNPs, indels, MNPs, and complex events smaller than the length of a short-read sequencing alignment. id: freebayes - name: FreeBayes + name: freebayes registry: biotools: freebayes - tess: FreeBayes + tess: freebayes url: https://github.com/freebayes/freebayes - description: The metadata model for GA4GH, an international coalition of both public and private interested parties, formed to enable the sharing of genomic and clinical data. id: ga4gh @@ -909,3 +909,55 @@ id: graf-sex name: GRAF sex url: https://www.ncbi.nlm.nih.gov/projects/gap/cgi-bin/GRAF_README.html +- description: Velvet is an algorithm package that has been designed to deal with de novo genome assembly and short read sequencing alignments. 
+ id: velvet + name: Velvet + url: https://github.com/dzerbino/velvet +- description: A tool for Phylogenetic Analysis and Post-Analysis of Large Phylogenies + id: raxml + name: RAxML + url: https://github.com/stamatak/standard-RAxML +- description: IQ-TREE is designed to efficiently handle large phylogenomic datasets, utilize multicore and distributed parallel computing for faster analysis, and automatically resume interrupted analyses through checkpointing. + id: iqtree + name: IQ-TREE + url: https://github.com/iqtree/iqtree2 +- description: MrBayes is a program for Bayesian inference and model choice across a wide range of phylogenetic and evolutionary models. MrBayes uses Markov chain Monte Carlo (MCMC) methods to estimate the posterior distribution of model parameters. + id: mrbayes + name: MrBayes + url: https://nbisweden.github.io/MrBayes/ +- description: BEAST is a cross-platform program for Bayesian phylogenetic analysis, estimating rooted, time-measured phylogenies using strict or relaxed molecular clock models. It uses Markov chain Monte Carlo (MCMC) to average over tree space and includes a graphical user interface for setting up analyses and tools for result analysis. + id: beast + name: BEAST + url: https://www.beast2.org/ +- description: Rapid haploid variant calling and core genome alignment. + id: snippy + name: Snippy + url: https://github.com/tseemann/snippy +- description: Convert Thermo Finnigan RAW mass spectrometry files to the mzXML format. + id: readw + name: ReAdW + url: https://github.com/PedrioliLab/ReAdW +- description: X! Tandem open source is software that can match tandem mass spectra with peptide sequences, in a process that has come to be known as protein identification. + id: x-tandem + name: X! Tandem + url: https://www.thegpm.org/TANDEM/ +- description: OMSSA (Open Mass Spectrometry Search Algorithm) is a tool to identify peptides in tandem mass spectrometry (MS/MS) data. 
The OMSSA algorithm uses a classic probability score to compute specificity. See also The NCBI C++ Toolkit and The NCBI C++ Toolkit Book. + id: omssa + name: OMSSA + url: https://ftp.ncbi.nlm.nih.gov/pub/lewisg/omssa/ +- description: MaxQuant is a quantitative proteomics software package designed for analyzing large mass-spectrometric data sets. It is specifically aimed at high-resolution MS data. + id: maxquant + name: MAXQUANT + url: https://www.maxquant.org/ +- description: + id: + name: + url: +- description: + id: + name: + url: +- description: + id: + name: + url: \ No newline at end of file From 1885b5ff7057cdeabd1de0c1ff7503dc27995fd5 Mon Sep 17 00:00:00 2001 From: rafael buono <77321541+rabuono@users.noreply.github.com> Date: Thu, 30 May 2024 11:56:29 +0200 Subject: [PATCH 03/18] add intro --- data-analysis/pathogen-characterisation.md | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/data-analysis/pathogen-characterisation.md b/data-analysis/pathogen-characterisation.md index 7e390de6..92068c16 100644 --- a/data-analysis/pathogen-characterisation.md +++ b/data-analysis/pathogen-characterisation.md @@ -33,9 +33,17 @@ fairsharing: ## Introduction -Provide first a short introduction to the subject covered on this page (up to 3 sentences). This could be, for example, data analysis methods relevant to pathogen characterisation. This would be the key sentences for the readers to capture the main idea of the page. +Data analysis for pathogen characterization allows us to understand the evolution of pathogens, and the relationship among different strains and provides insights on host-pathogen interactions and drug resistance. The tasks can involve processing data collected from a diverse spectrum of sources, from both clinical and environmental samples. As in every data analysis procedure, the general workflow involves: -If needed be, add a further introduction to the topic that is general. 
Concrete topic introductions should go below. +- Preprocessing: Includes the initial steps required to prepare data, genomic or otherwise, for further analysis. + +- Analysis: The core stage where the actual detection and characterisation of pathogens occur. This stage employs many techniques for pathogen characterisation, such as Next-Generation Sequencing (NGS). + +- Postprocessing: Includes interpreting and validating the data obtained from the analysis stage, as well as integrating it into broader contexts. Moreover, this is often followed by reporting and communication, and archiving and data management. + + +Each stage is crucial for the accurate and comprehensive characterisation of pathogens, from the initial handling of samples to the final reporting and data management, and will be detailed below. +Scalable and reproducible data analysis activities enable rapid surveillance of infectious epidemics of emerging and re-emerging pathogens in foodborne, hospital settings, and local community outbreaks. Ensuring reproducibility is critical for the usability of the analysis results. Following community-recognised best practices and the FAIR principles (Findability, Accessibility, Interoperability, and Reusability) is fundamental for guaranteeing the trustworthiness of the results and enabling collaboration and sharing of information. 
## Concrete topic 1 From 684af13d0a3de5eb0b73aa71e006c7f578e0312b Mon Sep 17 00:00:00 2001 From: rafael buono <77321541+rabuono@users.noreply.github.com> Date: Thu, 30 May 2024 11:59:18 +0200 Subject: [PATCH 04/18] add general considerations --- data-analysis/pathogen-characterisation.md | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/data-analysis/pathogen-characterisation.md b/data-analysis/pathogen-characterisation.md index 92068c16..bed9d797 100644 --- a/data-analysis/pathogen-characterisation.md +++ b/data-analysis/pathogen-characterisation.md @@ -46,6 +46,25 @@ Each stage is crucial for the accurate and comprehensive characterisation of pat Scalable and reproducible data analysis activities enable rapid surveillance of infectious epidemics of emerging and re-emerging pathogens in foodborne, hospital settings, and local community outbreaks. Ensuring reproducibility is critical for the usability of the analysis results. Following community-recognised best practices and the FAIR principles (Findability, Accessibility, Interoperability, and Reusability) is fundamental for guaranteeing the trustworthiness of the results and enabling collaboration and sharing of information. +### General considerations + +When analysing pathogen data involved in a health emergency or epidemic outbreak, the key considerations are: +- Define the pathogen and specific aspects to be investigated, e.g. genomic features of interest +- Collect suitable reference data about the pathogen of interest, preferably from community-accepted repositories, e.g. ENA, GISAID. It is worth noting that the right reference should be chosen taking into account mutation features, time of isolation, classification, phenotype, and genomic structure. +- Before analysing the data, define which specific aspect of the pathogen’s variability will be investigated. For example, if your aim is to describe the whole variability along the genome, the data should be compared with the whole reference genome. 
+- Define the type of data you are using, e.g. DNA or RNAseq for viral genome characterisation +- Select the tools best suited for the analysis of your data +- Estimate the computing resources needed +- Define which computing infrastructure is most suitable, e.g. cluster or cloud +- Ensure to follow the FAIR principles when handling data +- Guarantee findability of the data and tools for all collaborators for reproducibility by providing your: + - Code + - Execution environment + - Workflows + - Data analysis execution, including parameters used + - Accompanied by documentation that lists all parameters and other relevant information to reproduce the findings + + ## Concrete topic 1 Short explanation of what this topic is about and why it is important, with an emphasis on infectious diseases and the category that you selected e.g. pathogen characterisation. From 40d6cb7bd3a9ca7496fad916ea0d4edd726e8910 Mon Sep 17 00:00:00 2001 From: bedroesb Date: Wed, 5 Jun 2024 14:59:23 +0200 Subject: [PATCH 05/18] remove empty ones --- _data/tool_and_resource_list.yml | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/_data/tool_and_resource_list.yml b/_data/tool_and_resource_list.yml index 3800c38c..4996347e 100644 --- a/_data/tool_and_resource_list.yml +++ b/_data/tool_and_resource_list.yml @@ -985,15 +985,3 @@ id: maxquant name: MAXQUANT url: https://www.maxquant.org/ -- description: - id: - name: - url: -- description: - id: - name: - url: -- description: - id: - name: - url: From 6956b6b9f7def59eac46e9cb12a17d12415ecb9c Mon Sep 17 00:00:00 2001 From: bedroesb Date: Fri, 21 Jun 2024 16:46:21 +0200 Subject: [PATCH 06/18] extra tools --- _data/tool_and_resource_list.yml | 72 ++++++++++++++++++++---- data-analysis/human-biomolecular-data.md | 2 +- 2 files changed, 62 insertions(+), 12 deletions(-) diff --git a/_data/tool_and_resource_list.yml b/_data/tool_and_resource_list.yml index 4996347e..99bfc32f 100644 --- a/_data/tool_and_resource_list.yml +++ 
b/_data/tool_and_resource_list.yml @@ -205,8 +205,7 @@ id: dragen-gatk name: Dragen-GATK url: https://gatk.broadinstitute.org/hc/en-us/articles/360045944831 -- description: 'Dryad is an open-source, community-led data curation, publishing, and preservation platform for CC0 publicly available research data. Dryad has a long-term data preservation strategy, and is a Core Trust Seal Certified Merritt repository with storage in US and EU at the San Diego Supercomputing Center, DANS, and Zenodo. While data is undergoing peer review, it is embargoed if the related journal requires / allows this. Dryad is an independent non-profit that works directly with: researchers to publish datasets utilising best practices for discovery and reuse; publishers to support the integration of data availability statements and data citations into their workflows; and institutions to enable scalable campus support for research data management best practices at low cost. Costs are covered by institutional, publisher, and funder members, otherwise a one-time fee of $120 for authors to cover cost of curation and preservation. Dryad also receives direct funder support through - grants.' +- description: Dryad is an open-source, community-led data curation, publishing, and preservation platform for CC0 publicly available research data. id: dryad name: Dryad registry: @@ -233,7 +232,7 @@ fairsharing: mya1ff tess: European Genome-phenome Archive (EGA) url: https://ega-archive.org/ -- description: 'The European Language Social Science Thesaurus (ELSST) is a broad-based, multilingual thesaurus for the social sciences. It is owned and published by the Consortium of European Social Science Data Archives (CESSDA) and its national Service Providers. The thesaurus consists of over 3,300 concepts and covers the core social science disciplines: politics, sociology, economics, education, law, crime, demography, health, employment, information, communication technology, and environmental science. 
ELSST is used for data discovery within CESSDA and facilitates access to data resources across Europe, independent of domain, resource, language, or vocabulary. ELSST is currently available in 16 languages: Danish, Dutch, Czech, English, Finnish, French, German, Greek, Hungarian, Icelandic, Lithuanian, Norwegian, Romanian, Slovenian, Spanish, and Swedish' +- description: The European Language Social Science Thesaurus (ELSST) is a broad-based, multilingual thesaurus for the social sciences. It is owned and published by the Consortium of European Social Science Data Archives (CESSDA) and its national Service Providers. id: european-language-social-science-thesaurus name: European Language Social Science Thesaurus (ELSST) registry: @@ -273,14 +272,14 @@ fairsharing: dj8nt8 tess: European Nucleotide Archive (ENA) url: https://www.ebi.ac.uk/ena/browser/home -- description: FAIRsharing is a FAIR-supporting resource that provides an informative and educational registry on data standards, databases, repositories and policy, alongside search and visualization tools and services that interoperate with other FAIR-enabling resources. fairsharing guides consumers to discover, select and use standards, databases, repositories and policy with confidence, and producers to make their resources more discoverable, more widely adopted and cited. Each record in fairsharing is curated in collaboration with the maintainers of the resource themselves, ensuring that the metadata in the fairsharing registry is accurate and timely. Every record is manually reviewed at least once a year. Records can be collated into collections, based on a project, society or organisation, or Recommendations, where they are collated around a policy, such as a journal or funder data policy. 
+- description: FAIRsharing is a FAIR-supporting resource that provides an informative and educational registry on data standards, databases, repositories and policy, alongside search and visualization tools and services that interoperate with other FAIR-enabling resources. FAIRsharing guides consumers to discover, select and use standards, databases, repositories and policy with confidence, and producers to make their resources more discoverable, more widely adopted and cited. Each record in fairsharing is curated in collaboration with the maintainers of the resource themselves, ensuring that the metadata in the fairsharing registry is accurate and timely. id: fairsharing name: FAIRsharing registry: fairsharing: 2abjs5 tess: FAIRsharing url: https://fairsharing.org/ -- description: Figshare is a generalist, subject-agnostic repository for many different types of digital objects that can be used without cost to researchers. Data can be submitted to the central figshare repository (described here), or institutional repositories using the figshare software can be installed locally, e.g. by universities and publishers. Metadata in figshare is licenced under is CC0. figshare has also partnered with DuraSpace and Chronopolis to offer further assurances that public data will be archived under the stewardship of Chronopolis. figshare is supported through Institutional, Funder, and Governmental service subscriptions. +- description: Figshare is a generalist, subject-agnostic repository for many different types of digital objects that can be used without cost to researchers. Data can be submitted to the central figshare repository (described here), or institutional repositories using the figshare software can be installed locally, e.g. by universities and publishers. 
id: figshare name: Figshare registry: @@ -671,14 +670,13 @@ registry: biotools: wtdbg2 url: https://github.com/ruanjue/wtdbg2 -- description: Metabolomic and lipidomic platform - id: xcms - name: XCMS +- description: A systems biology tool for analyzing metabolomic data. It automatically superimposes raw metabolomic data onto metabolic pathways and integrates it with transcriptomic and proteomic data. + id: xcms-online + name: XCMS Online registry: - biotools: xcms - tess: XCMS + biotools: xcms_online url: https://xcmsonline.scripps.edu/landing_page.php?pgcontent=mainPage -- description: Zenodo is a generalist research data repository built and developed by OpenAIRE and CERN. It was developed to aid Open Science and is built on open source code. Zenodo helps researchers receive credit by making the research results citable and through OpenAIRE integrates them into existing reporting lines to funding agencies like the European Commission. Citation information is also passed to DataCite and onto the scholarly aggregators. Content is available publicly under any one of 400 open licences (from opendefinition.org and spdx.org). Restricted and Closed content is also supported. Free for researchers below 50 GB/dataset. Content is both online on disk and offline on tape as part of a long-term preservation policy. Zenodo supports managed access (with an access request workflow) as well as embargoing generally and during peer review. The base infrastructure of Zenodo is provided by CERN, a non-profit IGO. Projects are funded through grants. +- description: Zenodo is a generalist research data repository built and developed by OpenAIRE and CERN. id: zenodo name: Zenodo registry: @@ -985,3 +983,55 @@ id: maxquant name: MAXQUANT url: https://www.maxquant.org/ +- description: Absolute protein expression Quantitative Proteomics Tool, is a free and open source Java implementation of the APEX technique for the quantitation of proteins based on standard LC- MS/MS proteomics data. 
+ id: apex + name: apex + url: http://sourceforge.net/projects/apexqpt/ + registry: + biotools: apex +- description: Framework for processing and visualization of chromatographically separated and single-spectra mass spectral data. + id: xcms + name: xcms + url: http://bioconductor.org/packages/release/bioc/html/xcms.html + registry: + biotools: xcms +- description: A Meta-Search Peptide Identification Platform for Tandem Mass Spectra + id: peparml + name: PepArML + url: https://peparml.sourceforge.net/ + registry: + biotools: peparml +- description: A commercial software package for NMR spectral processing that offers a semi-automated tool for spectral deconvolution, enabling interactive fitting of metabolite peaks to reference spectra and quantifying their concentrations. + id: chenomx + name: Chenomx + url: https://www.chenomx.com/ +- description: ResFinder identifies acquired genes and/or finds chromosomal mutations mediating antimicrobial resistance in total or partial DNA sequence of bacteria. + id: resfinder + name: ResFinder + url: http://genepi.food.dtu.dk/resfinder + registry: + biotools: resfinder +- description: Pathogenwatch provides species and taxonomy prediction for over 60,000 variants of bacteria, viruses, and fungi. + id: pathogenwatch + name: Pathogenwatch + url: https://pathogen.watch/ +- description: CellDesigner is a structured diagram editor for drawing gene-regulatory and biochemical networks. + id: celldesigner + name: CellDesigner + url: https://www.celldesigner.org/ +- description: "A curated database containing nearly all published HIV RT and protease sequences: a resource designed for researchers studying evolutionary and drug-related variation in the molecular targets of anti-HIV therapy." + id: hivdb-stanford + name: Stanford HIV Drug Resistance Database (HIVDB) + url: https://hivdb.stanford.edu/ +- description: Nextstrain is an open-source project to harness the scientific and public health potential of pathogen genome data. 
+ id: nextstrain + name: Nextstrain + url: http://nextstrain.org + registry: + biotools: nextstrain.org +- description: g:GOSt performs functional enrichment analysis, also known as over-representation analysis (ORA) or gene set enrichment analysis, on input gene list. + id: g-profiler + name: g:Profiler + url: https://biit.cs.ut.ee/gprofiler/gost + registry: + biotools: gprofiler diff --git a/data-analysis/human-biomolecular-data.md b/data-analysis/human-biomolecular-data.md index 49d9f913..1d544d90 100644 --- a/data-analysis/human-biomolecular-data.md +++ b/data-analysis/human-biomolecular-data.md @@ -112,7 +112,7 @@ There are several types of analysis that can be performed on human biomolecular - *Interaction databases*: {% tool "biogrid" %} and {% tool "intact" %} - *Network analysis*: {% tool "cytoscape" %} and {% tool "genemania" %} - **Metabolomics analysis**: This involves measuring the levels of small molecules (metabolites) in biological samples and comparing them across different conditions or groups of samples. This can help to identify biomarkers of disease or drug response. 
- *Data processing*: {% tool "xcms" %}, {% tool "mzmine" %} and {% tool "openms" %} + - *Data processing*: {% tool "xcms-online" %}, {% tool "mzmine" %} and {% tool "openms" %} - *Statistical analysis*: {% tool "metaboanalyst" %} and {% tool "metsign" %} ## Postprocessing From a4711c833824249fee16a99a68e6303bd89aa9ad Mon Sep 17 00:00:00 2001 From: rabuono <77321541+rabuono@users.noreply.github.com> Date: Tue, 17 Sep 2024 17:59:17 +0200 Subject: [PATCH 07/18] moved more content from doc --- _data/tool_and_resource_list.yml | 10 +++ data-analysis/pathogen-characterisation.md | 79 +++++++++++++++++++++- 2 files changed, 88 insertions(+), 1 deletion(-) diff --git a/_data/tool_and_resource_list.yml b/_data/tool_and_resource_list.yml index 99bfc32f..4788f12a 100644 --- a/_data/tool_and_resource_list.yml +++ b/_data/tool_and_resource_list.yml @@ -1035,3 +1035,13 @@ url: https://biit.cs.ut.ee/gprofiler/gost registry: biotools: gprofiler +- description: EuroHPC Joint Undertaking is a joint initiative between the EU, European countries and private partners to develop a World Class Supercomputing Ecosystem in Europe. + id: eurohpc + name: EuroHPC + url: https://eurohpc-ju.europa.eu/ + registry: +- description: BEAUti is a graphical user-interface (GUI) application for generating BEAST XML files. + id: beauti + name: BEAUti + url: https://beast.community/beauti.html + registry: diff --git a/data-analysis/pathogen-characterisation.md b/data-analysis/pathogen-characterisation.md index bed9d797..ab97d248 100644 --- a/data-analysis/pathogen-characterisation.md +++ b/data-analysis/pathogen-characterisation.md @@ -1,7 +1,7 @@ --- title: Pathogen characterisation description: Generic workflows for different data types. 
-contributors: [Francesco Messina, Rafael Andrade Buono] +contributors: [Eva Garcia Alvarez, Francesco Messina, Fotis Psomopoulos, Rafael Andrade Buono] page_id: pc_data_analysis redirect_from: /pathogen-characterisation/data-analysis related_pages: @@ -65,6 +65,83 @@ When analysing pathogen data involved in a health emergency or epidemic outbreak - Accompanied by documentation that lists all parameters and other relevant information to reproduce the findings +### Existing approaches +- **Container and environments**: Consider using containers and environments to collect and isolate dependencies for tools and pipelines. Environment management systems, such as Conda, help with reproducibility but are not inherently portable across platforms. Containers provide a higher level of portability, being able to encapsulate both the software and its dependencies. +- **Web-based code collaboration platform**: Consider using a centralised location for software developers to store, manage, collaborate, and share their code. For instance, {% tool "github" %}, {% tool "gitlab" %}, or {% tool "bitbucket" %}. +- **Workflow management systems**: Allow you to formalise your workflows in a standardised format and execute them locally or on a remote computer infrastructure. Popular systems are {% tool "nextflow" %} and {% tool "snakemake" %}. +- **Workflow platforms**: Allow users to manage data, run formalised workflows, and review their results. Platforms, such as {% tool "galaxy" %}, may offer multiple interfaces, e.g. web, GUI, and APIs. +- **Reference databases**: Collect the suitable reference data about pathogens to be investigated. {% tool "european-nucleotide-archive" %} and {% tool "gisaid" %} are examples of genomic databases to which researchers share their data. In this context, the European Pathogens Portal aggregates databases relating to pathogens, as well as hosts and their vectors. Other countries host their own instance of the {% tool "pathogens-portal" %}, e.g. 
see the {% tool "swedish-pathogens-portal" %} [showcase](https://www.infectious-diseases-toolkit.org/showcase/swedish-pathogens-portal). +- **Workflow registries**: Register workflows in platforms, such as {% tool "workflowhub" %}, that facilitate sharing, versioning, and authorship attribution of the pipelines. + + +For more general information and solutions on data analysis, you may have a look at the content available on the +[RDMkit data analysis page](https://rdmkit.elixir-europe.org/data_analysis#what-are-the-best-practices-for-data-analysis). +While the examples on this page focus on the genomic characterisation of pathogens, similar principles apply to other data types. + +## Preprocessing + +Data preprocessing is an initial step in data analysis involving the preparation of raw data for the main analysis. It is an important factor in quality control, and involves steps for the cleaning of the data, with the identification of inconsistencies, errors, and missing values. Preprocessing may also include data conversion and transformation steps to get the data in a format compatible with the expected inputs of the chosen analysis pipelines. + +### Considerations + +Some typical considerations involved in this step: +- **Data cleaning**: Finds and corrects errors in the data. For example, eliminating duplicates, removing genomic reads that are too short, and trimming information that is not useful, such as contaminating host data. +- **Quality control checks**: Should be conducted at each step to ensure that the data is suitable for the intended analysis. +- **Exclusion of low-quality samples**: Samples with low-quality scores should be marked and removed. In genomics studies, samples with missing values, low sequencing depth, or contamination might be removed. + +### Existing approaches + +Preprocessing steps may depend on the technology used and the pathogen being studied and thus should be adjusted accordingly. 
Some common approaches in genomics studies include: + +- Raw sequences quality check: {% tool "fastqc" %} +- Trimming out adapters and low-quality sequences: {% "trimmomatic" %} +- Quality checks: further information can be found on the [Quality control - Pathogen characterisation](/quality-control/pathogen-characterisation) page. + +## Analysis + +The analysis of data to characterise a pathogen of interest can involve methodologies from different fields. While genomics approaches are of common interest, analysis of other data types, such as proteomics and metabolomics, and their combination can be of special importance. + +### Considerations + +- **The computational resources**: Verify that the appropriate computational resources are available. Depending on the volume and complexity of the data, you might need to make use of large computing clusters or cloud computing resources. +- **The location of your data**: Ensure that the chosen computing infrastructure and platforms have access to the data. It is important to consider the distance between the data storage and computing, as it can significantly impact transfer times and costs. +- **Document the steps**: Report every step of the data analysis process. Including software versions employed, parameters utilised, the computing environment employed, reference genome used, as well as any “manual” data curation steps. More information on recording provenance can be found on the [Provenance pages](/provenance/) +- **Collaborative analysis**: it is important that partners have access to the data, tools, and workflows. It is crucial that systems are in place to track changes to the tools and workflows used, and that the history of modifications is accessible to all collaborators. + +### Existing approaches + +There are several types of analysis that can be performed on pathogen-related data, depending on the specific research question and type of data being analysed. 
Here are some solutions: +- Consider using the available computational infrastructure to scale up your analysis capabilities. This may include applying for access to large computing cluster resources with e.g. {% tool ëurohpc"%} or making use of public Galaxy servers such as {% tool "galaxy-europe" %}. +- **Genomic analysis**: Including whole genome sequencing (WGS), this analysis allows the interpretation of genetic information encoded along the genome (DNA or RNA). Genomic analysis can be used for a wide range of applications to characterise many aspects of pathogen variability, such as Variants of Concern (VOC) and antimicrobial resistance profiles in bacteria (AMR). Examples of tools that allow us to take into account the genomic characteristics of pathogens (e.g. genomic structure and size, gene annotations, mobile genetic elements) are: + - Sequence Alignment + - {% tool "bowtie2" %} + - {% tool "bwa" %} + - {% tool "samtools" %} + - Genome Assembly + - {% tool "canu"%} + - {% tool "velvet" %} + - {% tool "spades" %} + - Phylogenetic Analysis + - {% tool "clustalw" %} + - {% tool "muscle" %} + - {% tool "mafft" %} + - {% tool "raxml" %} + - {% tool "iqtree" %} + - Molecular Clock + - {% tool "mrbayes" %} + - {% tool "beast" %} + - {% tool "beauti" %} + - Variant calling + - {% tool "dragen-gatk" %} + - {% tool "freebayes" %} + - {% tool "varscan" %} + - Annotation + - {% tool "annovar" %} + - {% tool "snpeff" %} + - {% tool "vep" %} + - {% tool "dbnsfp" %} + - All-in-one Bioinformatic Tools + - {% tool "snippy" %} ## Concrete topic 1 Short explanation of what this topic is about and why it is important, with an emphasis on infectious diseases and the category that you selected e.g. pathogen characterisation. 
From 19368d03e543805e9da3874bf6f1cd7acd6bdd7f Mon Sep 17 00:00:00 2001 From: rabuono <77321541+rabuono@users.noreply.github.com> Date: Tue, 17 Sep 2024 18:02:57 +0200 Subject: [PATCH 08/18] typo in fix in metadata --- data-analysis/pathogen-characterisation.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data-analysis/pathogen-characterisation.md b/data-analysis/pathogen-characterisation.md index ab97d248..bb3bd575 100644 --- a/data-analysis/pathogen-characterisation.md +++ b/data-analysis/pathogen-characterisation.md @@ -17,7 +17,7 @@ training: registry: Other url: https://gxy.io/GTN:T00437 rdmkit: - - name: “Your tasks: Data Analysis” + - name: Data Analysis url: https://rdmkit.elixir-europe.org/data_analysis faircookbook: - name: From 8f78aa7bc384e46a7770cdcc2643fa58ba35a3fd Mon Sep 17 00:00:00 2001 From: rabuono <77321541+rabuono@users.noreply.github.com> Date: Tue, 17 Sep 2024 18:05:39 +0200 Subject: [PATCH 09/18] typo in tool entry --- data-analysis/pathogen-characterisation.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data-analysis/pathogen-characterisation.md b/data-analysis/pathogen-characterisation.md index bb3bd575..63173a0a 100644 --- a/data-analysis/pathogen-characterisation.md +++ b/data-analysis/pathogen-characterisation.md @@ -94,7 +94,7 @@ Some typical considerations involved in this step: Preprocessing steps may depend on the technology used and the pathogen being studied and thus should be adjusted accordingly. Some common approaches in genomics studies include: - Raw sequences quality check: {% tool "fastqc" %} -- Trimming out adapters and low-quality sequences: {% "trimmomatic" %} +- Trimming out adapters and low-quality sequences: {% tool "trimmomatic" %} - Quality checks: further information can be found on the [Quality control - Pathogen characterisation](/quality-control/pathogen-characterisation) page. 
## Analysis From d3bfbc8158035a3896bcd2031333fe6f337ed500 Mon Sep 17 00:00:00 2001 From: rabuono <77321541+rabuono@users.noreply.github.com> Date: Wed, 18 Sep 2024 09:35:14 +0200 Subject: [PATCH 10/18] typo fix --- data-analysis/pathogen-characterisation.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data-analysis/pathogen-characterisation.md b/data-analysis/pathogen-characterisation.md index 63173a0a..dfc31be8 100644 --- a/data-analysis/pathogen-characterisation.md +++ b/data-analysis/pathogen-characterisation.md @@ -111,7 +111,7 @@ The analysis of data to characterise a pathogen of interest can involve methodol ### Existing approaches There are several types of analysis that can be performed on pathogen-related data, depending on the specific research question and type of data being analysed. Here are some solutions: -- Consider using the available computational infrastructure to scale up your analysis capabilities. This may include applying for access to large computing cluster resources with e.g. {% tool ëurohpc"%} or making use of public Galaxy servers such as {% tool "galaxy-europe" %}. +- Consider using the available computational infrastructure to scale up your analysis capabilities. This may include applying for access to large computing cluster resources with e.g. {% tool "eurohpc"%} or making use of public Galaxy servers such as {% tool "galaxy-europe" %}. - **Genomic analysis**: Including whole genome sequencing (WGS), this analysis allows the interpretation of genetic information encoded along the genome (DNA or RNA). Genomic analysis can be used for a wide range of applications to characterise many aspects of pathogen variability, such as Variants of Concern (VOC) and antimicrobial resistance profiles in bacteria (AMR). Examples of tools that allow us to take into account the genomic characteristics of pathogens (e.g. 
genomic structure and size, gene annotations, mobile genetic elements) are: - Sequence Alignment - {% tool "bowtie2" %} From 52d460bff677c6fee6cb0941e95918bd3b3f1585 Mon Sep 17 00:00:00 2001 From: rabuono <77321541+rabuono@users.noreply.github.com> Date: Wed, 18 Sep 2024 13:21:58 +0200 Subject: [PATCH 11/18] more content and tools --- _data/tool_and_resource_list.yml | 24 ++++- data-analysis/pathogen-characterisation.md | 100 +++++++++++++++------ 2 files changed, 94 insertions(+), 30 deletions(-) diff --git a/_data/tool_and_resource_list.yml b/_data/tool_and_resource_list.yml index 01e26a27..0f1ffd22 100644 --- a/_data/tool_and_resource_list.yml +++ b/_data/tool_and_resource_list.yml @@ -1119,4 +1119,26 @@ id: beauti name: BEAUti url: https://beast.community/beauti.html - regsitry: \ No newline at end of file + regsitry: +- description: QIIME 2 is a powerful, extensible, and decentralized microbiome analysis package with a focus on data and analysis transparency. + id: qiime2 + name: QIIME 2 + url: https://docs.qiime2.org/ + regsitry: +- description: MEGAHIT is an ultra-fast and memory-efficient NGS assembler optimized for metagenomes. + id: megahit + name: MEGAHIT + url: https://github.com/voutcn/megahit + regsitry: + biotools: megahit +- description: A taxonomic classification system using exact k-mer matches to achieve high accuracy and fast classification speeds. + id: kraken2 + name: Kraken 2 + url: https://ccb.jhu.edu/software/kraken2/ + regsitry: + biotools: kraken2 +- description: The COVID-19 Disease Map is an assembly of molecular interaction diagrams, established based on literature evidence. 
+ id: covid19map + name: COVID19 Disease Map + url: https://covid19map.elixir-luxembourg.org/ + regsitry: diff --git a/data-analysis/pathogen-characterisation.md b/data-analysis/pathogen-characterisation.md index dfc31be8..9e55df8d 100644 --- a/data-analysis/pathogen-characterisation.md +++ b/data-analysis/pathogen-characterisation.md @@ -142,41 +142,83 @@ There are several types of analysis that can be performed on pathogen-related da - {% tool "dbnsfp" %} - All-in-one Bioinformatic Tools - {% tool "snippy" %} -## Concrete topic 1 -Short explanation of what this topic is about and why it is important, with an emphasis on infectious diseases and the category that you selected e.g. pathogen characterisation. - -### Considerations - -Using a bullet point style list format as much as possible, describe what should be taken into account (i.e. what considerations you should have) for the topic being covered on this page, specifically with regard to infectious diseases in the broader category selected (e.g. pathogen characterisation). This could be, for example; which features are important to consider when selecting data sources? What capabilities are important when defining tools to be used for quality control? What are general characteristics that you should look for in data standards for human biomolecular data. The considerations provided here should help to justify the existing approaches/solutions described in the next section. - -Please avoid replicating 'generic' guidelines, i.e. those not specific to infectious diseases, here. Add links to RDMkit in the metadata above, if any are needed. Links to other sources can also be provided in text as needed. - -### Existing approaches - -Using a bullet point list style as much as possible, describe when, why and for what purpose a specific tool or resource should be used. - -Please avoid replicating 'generic' guidelines, i.e. those not specific to infectious diseases. 
- -Avoid making long lists of links to tools and resources. The IDTk does not aim to list all possible approaches and solutions. The focus is on contextualised best practices approaches. - -The tools or resources inserted in this section do not have to be considered a 'final' or 'perfect' solution, but should be something that is used by the wider community working in this area or topic. The existing approaches should also reflect the considerations mentioned in the “Considerations” subheading. - -Make sure to add to the Tools and resources list table all of the tools and resources mentioned in the text - -## Concrete topic 2 +- **Metagenomics analysis**: Sequencing all genetic material in a sample can provide comprehensive data about the composition of the microbial community. In the context of infectious diseases, it can aid in identifying multiple pathogens simultaneously in clinical, as well as environmental samples. Examples of tools in this type of analysis are: + - 16S rRNA sequencing + - {% tool "qiime2" %} + - Shotgun sequencing + - {% tool "spades" %} + - {% tool "megahit" %} + - Assigning taxonomic labels + - {% tool "kraken2" %} + +- **Proteomics analysis**: Proteomics, primarily utilising mass spectrometry techniques, offers a powerful tools for examining proteins and their interplay. This can provide valuable insights into irregularities associated with infectious diseases and potentially uncover mechanisms of drug resistance. 
Examples of tools in this type of analysis are: + - Mass Spectrometry Data Extraction Software + - {% tool "readw" %} + - Search Algorithms + - {% tool "x-tandem" %} + - {% tool "omssa" %} + - {% tool "maxquant" %} + - Statistical Validation + - {% tool "peparml" %} + - Quantitative Tools + - {% tool "apex" %} + - {% tool "maxquant" %} + +- **Metabolomics analysis**: This involves measuring the levels of small molecules (metabolites) produced by specific pathogens in biological samples, comparing them across different conditions or groups of samples. Examples of tools in this type of analysis are: + - Mass Spectrometry Software: + - {% tool "xcms" %} + - {% tool "metaboanalyst" %} + - NMR Spectroscopy Software: + - {% tool "chenomx" %} + - Data Processing: + - {% tool "xcms" %} + - {% tool "mzmine" %} + - {% tool "openms" %} + +## Postprocessing +In pathogen characterisation, the postprocessing steps are crucial to evaluate and interpret the results. These steps are important to identify strain relationships and specific molecular variation patterns linked to peculiar phenotypes of pathogens (e.g. drug resistance, virulence, and transmission rate). Such results must be biologically meaningful and reproducible, considering also the clinical aspects and treatment implications. -Follow the same guidelines as in Concrete topic 1 +### Considerations -### Considerations +Some considerations about postprocessing steps in pathogen characterization include: +- **Interpretation**: it is important to interpret the results in a biologically meaningful context. This should consider the following aspects: report the variability of specific pathogens; find out new strains that could become concerning; identify specific genes or mutations associated with pathogenic variation. +- **Transformation**: Consider having postprocessing steps to ensure that outputs are transformed or converted into interoperable and open formats.
This ensures that subsequent pipelines and collaborators can readily make use of the results. +- **Visualisation**: To support a clear interpretation in clinical practice, it is important to visualise the results in a way that is understandable to all professionals involved. -Follow the same guidelines as in Concrete topic 1 +### Existing approaches -### Existing approaches +- **Spatial-temporal analysis and visualisation**: using a combined approach of phylogenetic, spatial distribution, and molecular clock, this approach aids in designing strategies to control and prevent the spread of infectious diseases, as well as in the development of effective treatments, and vaccines. + - Spatial distribution of strain: {% tool "nextstrain" %} +- **Drug resistance characterisation**: genomic analysis can be used to characterise pathogens for specific resistance against drugs and help develop strategies to fight the spread of drug-resistant strains. + - Antimicrobial resistance (AMR): + - {% tool "resfinder" %} + - {% tool "pathogenwatch" %} + - Viral drug resistance: + - {% tool "hivdb-stanford" %} +- **Interaction analysis and functional enrichment analysis**: placing the identified protein interactions and regulatory networks in the context of the affected biological pathways allows for a better understanding of disease mechanisms and potential drug targets. + - Network analysis: + - {% tool "cytoscape" %} + - {% tool "celldesigner" %} + - Gene enrichment analysis: + - {% tool "enrichr" %} + - {% tool "go" %} + - {% tool "g-profiler" %} + - Interaction Databases: + - {% tool "biogrid" %} + - {% tool "intact" %} + - Integrative diagrams: + - A [disease map](https://disease-maps.org/) can be used to represent a conceptual model of the molecular mechanisms of a disease. An example is the {% tool "covid19map" %}.
+ +## Data analysis of wastewater surveillance for infectious diseases + +Wastewater surveillance has emerged as a valuable tool for monitoring infectious diseases, providing a non-invasive method to track the spread of pathogens within communities. This approach has gained significant attention during the COVID-19 pandemic, particularly for detecting and analysing SARS-CoV-2 variants. By analysing wastewater samples, researchers can identify the presence and prevalence of infectious agents, offering insights into public health trends. Here we focus on the analysis of wastewater with an emphasis on SARS-CoV-2. -Follow the same guidelines as in Concrete topic 1 +### Considerations - +Even though the considerations for this specific field are very similar to the ones described in the previous paragraphs, there are some approaches that are used in the context of wastewater surveillance. - +### Existing approaches +Several tools and workflows have been developed or adapted for the analysis of wastewater data, especially in the context of SARS-CoV-2 surveillance: + - Specific Tools for SARS-CoV-2 \ No newline at end of file From 30519ed113da52d579d4aff18e23c447a929a0f5 Mon Sep 17 00:00:00 2001 From: rabuono <77321541+rabuono@users.noreply.github.com> Date: Thu, 19 Sep 2024 11:39:38 +0200 Subject: [PATCH 12/18] add more content from doc --- _data/tool_and_resource_list.yml | 54 ++++++++++++++++++++++ data-analysis/pathogen-characterisation.md | 22 ++++++++- 2 files changed, 75 insertions(+), 1 deletion(-) diff --git a/_data/tool_and_resource_list.yml b/_data/tool_and_resource_list.yml index 0f1ffd22..593cfc7d 100644 --- a/_data/tool_and_resource_list.yml +++ b/_data/tool_and_resource_list.yml @@ -1142,3 +1142,57 @@ name: COVID19 Disease Map url: https://covid19map.elixir-luxembourg.org/ regsitry: +- description: Freyja is a tool to recover relative lineage abundances from mixed SARS-CoV-2 samples from a sequencing dataset (BAM aligned to the Hu-1 reference). 
+ id: freyja + name: Freyja + url: https://github.com/andersen-lab/Freyja + registry: + biotools: freyja +- description: The cojac package comprises a set of command-line tools to analyse co-occurrence of mutations on amplicons. + id: cojac + name: COJAC + url: https://github.com/cbg-ethz/cojac + registry: + biotools: cojac +- description: Lineagespot is a framework written in R, and aims to identify SARS-CoV-2 related mutations based on a single (or a list) of variant(s) file(s). + id: lineagespot + name: Lineagespot + url: https://github.com/BiodataAnalysisGroup/lineagespot + registry: + biotools: lineagespot +- description: Kallisto is a program for quantifying abundances of transcripts from bulk and single-cell RNA-Seq data, or more generally of target sequences using high-throughput sequencing reads. + id: kallisto + name: Kallisto + url: https://pachterlab.github.io/kallisto/about.html + registry: + biotools: kallisto +- description: PiGx SARS-CoV-2 is a pipeline for analysing data from sequenced wastewater samples and identifying given lineages of SARS-CoV-2. + id: pigxs + name: PiGx SARS-CoV-2 Wastewater Sequencing Pipeline + url: https://github.com/BIMSBbioinfo/pigx_sars-cov-2 + registry: +- description: A GitHub repository from the CBG-ETHZ group offering tools for detecting SARS-CoV-2 variants in Switzerland. + id: cowwid + name: COWWID + url: https://github.com/cbg-ethz/cowwid + registry: +- description: A SARS-CoV-2 Contextual Data Specification from PHA4GE. + id: sars-pha4ge + name: SARS-CoV-2 Contextual Data Specification + url: https://github.com/pha4ge/SARS-CoV-2-Contextual-Data-Specification + registry: +- description: A data model to improve wastewater surveillance through interoperable data. + id: phes-odm + name: PHES-ODM + url: https://github.com/Big-Life-Lab/PHES-ODM + registry: +- description: A pipeline for lineage abundance estimation from wastewater sequencing data.
+ id: vlq + name: VLQ + url: https://github.com/baymlab/wastewater_analysis + registry: +- description: CFSAN Wastewater Analysis Pipeline to estimate the percentage of SARS-CoV-2 variants in a sample. + id: c-wap + name: C-WAP + url: https://github.com/CFSAN-Biostatistics/C-WAP + registry: \ No newline at end of file diff --git a/data-analysis/pathogen-characterisation.md b/data-analysis/pathogen-characterisation.md index 9e55df8d..ede2f421 100644 --- a/data-analysis/pathogen-characterisation.md +++ b/data-analysis/pathogen-characterisation.md @@ -221,4 +221,24 @@ Even though the considerations for this specific field are very similar to the o ### Existing approaches Several tools and workflows have been developed or adapted for the analysis of wastewater data, especially in the context of SARS-CoV-2 surveillance: - - Specific Tools for SARS-CoV-2 \ No newline at end of file + - Specific Tools for SARS-CoV-2: Certain tools (such as {% tool "freyja"%}, {% tool "cojac"%}, and {% tool "lineagespot" %}) are specifically designed for analysing SARS-CoV-2 data, providing capabilities such as variant detection and lineage tracking. + - Repurposed Tools: Originally developed for other types of genomic data, tools like {% tool "kallisto" %} or {% tool "kraken2" %}, have been successfully applied to wastewater data analysis, offering high performance in read alignment and taxonomic classification. +- In addition, here are several bioinformatics protocols and solutions that could be used in the context of wastewater NGS data analysis. + - Tools and repositories: + - {% tool "pigxs" %}: provides a comprehensive solution for sequencing and analysing SARS-CoV-2 in wastewater.
+ - Detection of SARS-CoV-2 variants in Switzerland by genomic analysis of wastewater samples [medRxiv](https://www.medrxiv.org/content/10.1101/2021.01.08.21249379v2): COWWID: A GitHub repository from the CBG-ETHZ group offering tools for detecting SARS-CoV-2 variants in Switzerland + - [CDC Module 2.7](https://www.cdc.gov/amd/training/covid-toolkit/module2-7.html): Wastewater based variant tracking for SARS-CoV-2 + - The Public Health Alliance for Genomic Epidemiology GitHub organization makes available a mapping to the {% tool "european-nucleotide-archive" %}: {% tool "sars-pha4ge" %} + - {% tool "phes-odm" %} as an open data model for wastewater surveillance + - Viral Lineage Quantification (VLQ), Kallisto-Approach: [Lineage abundance estimation for SARS-CoV-2 in wastewater using transcriptome quantification techniques](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-022-02805-9) and corresponding repository at {% tool "vlq" %} + - [Performance benchmark of tools](https://peerj.com/articles/14596/), evaluating tools like Kraken2, Kallisto, and Freyja, implemented in the {% tool "c-wap" %} pipeline + - Wastewater quality control workflow in GalaxyTrakr [(SSquAWK4)](https://dx.doi.org/10.17504/protocols.io.kxygxzk5dv8j/v9).
Further quality control aspects are discussed in the [Quality Control - Pathogen Characterisation page](/quality-control/pathogen-characterisation) + - ECDC [Guidance document](https://www.ecdc.europa.eu/sites/default/files/documents/Guidance-for-representative-and-targeted-genomic-SARS-CoV-2-monitoring-updated-with%20erratum-20-May-2021.pdf) for representative and targeted genomic SARS-CoV-2 monitoring + + + + + + + + From 70de5b5e51a646746f9e8f007a945aa5b5603c00 Mon Sep 17 00:00:00 2001 From: rabuono <77321541+rabuono@users.noreply.github.com> Date: Thu, 19 Sep 2024 11:57:46 +0200 Subject: [PATCH 13/18] reformat tools --- data-analysis/pathogen-characterisation.md | 119 ++++++--------------- 1 file changed, 33 insertions(+), 86 deletions(-) diff --git a/data-analysis/pathogen-characterisation.md b/data-analysis/pathogen-characterisation.md index ede2f421..b8a78ab9 100644 --- a/data-analysis/pathogen-characterisation.md +++ b/data-analysis/pathogen-characterisation.md @@ -26,11 +26,8 @@ fairsharing: - name: url: -# More information on how to fill in this metadata section can be found here https://www.infectious-diseases-toolkit.org/contribute/page-metadata ---- - - +--- ## Introduction Data analysis for pathogen characterization allows us to understand the evolution of pathogens, and the relationship among different strains and provides insights on host-pathogen interactions and drug resistance. The tasks can involve processing data collected from a diverse spectrum of sources, from both clinical and environmental samples. As in every data analysis procedure, the general workflow involves: @@ -50,7 +47,7 @@ Scalable and reproducible data analysis activities enable rapid surveillance of When analysing pathogen data involved in a health emergency or epidemic outbreak are: - Define the pathogen and specific aspects to be investigated, e.g. 
genomic features of interest -- Collect the suitable reference data about the pathogen of interest, preferentially from community-accepted repositories, e.g. ENA, GISAID. It is worth noting that the right reference should be chosen taking into account mutation features, time of isolation, classification, phenotype, and genomic structure. +- Collect the suitable reference data about the pathogen of interest, preferentially from community-accepted repositories, e.g. {% tool "european-nucleotide-archive" %} and {% tool "gisaid" %}. It is worth noting that the right reference should be chosen taking into account mutation features, time of isolation, classification, phenotype, and genomic structure. - Before analysing the data, define which specific aspect of the pathogen’s variability will be investigated. For example, if your aim is to describe the whole variability along the genome, the data should be compared with the whole reference genome. - Define the type of data you are using, e.g. DNA or RNAseq for viral genome characterisation - Select the tools best suited for the analysis of your data @@ -113,68 +110,29 @@ The analysis of data to characterise a pathogen of interest can involve methodol There are several types of analysis that can be performed on pathogen-related data, depending on the specific research question and type of data being analysed. Here are some solutions: - Consider using the available computational infrastructure to scale up your analysis capabilities. This may include applying for access to large computing cluster resources with e.g. {% tool "eurohpc"%} or making use of public Galaxy servers such as {% tool "galaxy-europe" %}. - **Genomic analysis**: Including whole genome sequencing (WGS), this analysis allows the interpretation of genetic information encoded along the genome (DNA or RNA). 
Genomic analysis can be used for a wide range of applications to characterise many aspects of pathogen variability, such as Variants of Concern (VOC) and antimicrobial resistance profiles in bacteria (AMR). Examples of tools that allow us to take into account the genomic characteristics of pathogens (e.g. genomic structure and size, gene annotations, mobile genetic elements) are: - - Sequence Alignment - - {% tool "bowtie2" %} - - {% tool "bwa" %} - - {% tool "samtools" %} - - Genome Assembly - - {% tool "canu"%} - - {% tool "velvet" %} - - {% tool "spades" %} - - Phylogenetic Analysis - - {% tool "clustalw" %} - - {% tool "muscle" %} - - {% tool "mafft" %} - - {% tool "raxml" %} - - {% tool "iqtree" %} - - Molecular Clock - - {% tool "mrbayes" %} - - {% tool "beast" %} - - {% tool "beauti" %} - - Variant calling - - {% tool "dragen-gatk" %} - - {% tool "freebayes" %} - - {% tool "varscan" %} - - Annotation - - {% tool "annovar" %} - - {% tool "snpeff" %} - - {% tool "vep" %} - - {% tool "dbnsfp" %} - - All-in-one Bioinformatic Tools - - {% tool "snippy" %} + - Sequence Alignment: {% tool "bowtie2" %}, {% tool "bwa" %} and {% tool "samtools" %} + - Genome Assembly: {% tool "canu"%}, {% tool "velvet" %} and {% tool "spades" %} + - Phylogenetic Analysis: {% tool "clustalw" %}, {% tool "muscle" %}, {% tool "mafft" %}, {% tool "raxml" %} and {% tool "iqtree" %} + - Molecular Clock: {% tool "mrbayes" %}, {% tool "beast" %} and {% tool "beauti" %} + - Variant calling: {% tool "dragen-gatk" %}, {% tool "freebayes" %} and {% tool "varscan" %} + - Annotation: {% tool "annovar" %}, {% tool "snpeff" %}, {% tool "vep" %} and {% tool "dbnsfp" %} + - All-in-one Bioinformatic Tools: {% tool "snippy" %} - **Metagenomics analysis**: Sequencing all genetic material in a sample can provide comprehensive data about the composition of the microbial community. 
In the context of infectious diseases, it can aid in identifying multiple pathogens simultaneously in clinical, as well as environmental samples. Examples of tools in this type of analysis are: - - 16S rRNA sequencing - - {% tool "qiime2" %} - - Shotgun sequencing - - {% tool "spades" %} - - {% tool "megahit" %} - - Assigning taxonomic labels - - {% tool "kraken2" %} + - 16S rRNA sequencing: {% tool "qiime2" %} + - Shotgun sequencing: {% tool "spades" %}, and {% tool "megahit" %} + - Assigning taxonomic labels: {% tool "kraken2" %} - **Proteomics analysis**: Proteomics, primarily utilising mass spectrometry techniques, offers a powerful tools for examining proteins and their interplay. This can provide valuable insights into irregularities associated with infectious diseases and potentially uncover mechanisms of drug resistance. Examples of tools in this type of analysis are: - - Mass Spectrometry Data Extraction Software - - {% tool "readw" %} - - Search Algorithms - - {% tool "x-tandem" %} - - {% tool "omssa" %} - - {% tool "maxquant" %} - - Statistical Validation - - {% tool "peparml" %} - - Quantitative Tools - - {% tool "apex" %} - - {% tool "maxquant" %} + - Mass Spectrometry Data Extraction Software: {% tool "readw" %} + - Search Algorithms: {% tool "x-tandem" %}, {% tool "omssa" %} and {% tool "maxquant" %} + - Statistical Validation: {% tool "peparml" %} + - Quantitative Tools: {% tool "apex" %} and {% tool "maxquant" %} - **Metabolomics analysis**: This involves measuring the levels of small molecules (metabolites) produced by specific pathogens in biological samples, comparing them across different conditions or groups of samples. 
Examples of tools in this type of analysis are: - - Mass Spectrometry Software: - - {% tool "xcms" %} - - {% tool "metaboanalyst" %} - - NMR Spectroscopy Software: - - {% tool "chenomx" %} - - Data Processing: - - {% tool "xcms" %} - - {% tool "mzmine" %} - - {% tool "openms" %} + - Mass Spectrometry Software: {% tool "xcms" %} and {% tool "metaboanalyst" %} + - NMR Spectroscopy Software: {% tool "chenomx" %} + - Data Processing: {% tool "xcms" %}, {% tool "mzmine" %} and {% tool "openms" %} ## Postprocessing In pathogen characterisation, the postprocessing steps are crucial to evaluate and interpret the results. These steps are important to identify strain relationships and specific molecular variation patterns linked to peculiar phenotypes of pathogens (e.g. drug resistance, virulence, and transmission rate). Such results must be biologically meaningful and reproducible, considering also the clinical aspects and treatment implications. @@ -191,22 +149,12 @@ Some considerations about postprocessing steps in pathogen characterization incl - **Spatial-temporal analysis and visualisation**: using a combined approach of phylogenetic, spatial distribution, and molecular clock, this approach aids in designing strategies to control and prevent the spread of infectious diseases, as well as in the development of effective treatments, and vaccines. - Spatial distribution of strain: {% tool "nextstrain" %} - **Drug resistance characterisation**: genomic analysis can be used to characterise pathogens for specific resistance against drugs and help develop strategies to fight the spread of drug-resistant strains. 
- - Antimicrobial resistance (AMR): - - {% tool "resfinder" %} - - {% tool "pathogenwatch" %} - - Viral drug resistance: - - {% tool "hivdb-stanford" %} + - Antimicrobial resistance (AMR): {% tool "resfinder" %} and {% tool "pathogenwatch" %} + - Viral drug resistance: {% tool "hivdb-stanford" %} - **Interaction analysis and functional enrichment analysis**: placing the identified protein interactions and regulatory networks in the context of the affected biological pathways allows for a better understanding of disease mechanisms and potential drug targets. - - Network analysis: - - {% tool "cytoscape" %} - - {% tool "celldesigner" %} - - Gene enrichment analysis: - - {% tool "enrichr" %} - - {% tool "go" %} - - {% tool "g-profiler" %} - - Interaction Databases: - - {% tool "biogrid" %} - - {% tool "intact" %} + - Network analysis: {% tool "cytoscape" %} and {% tool "celldesigner" %} + - Gene enrichment analysis: {% tool "enrichr" %}, {% tool "go" %} and {% tool "g-profiler" %} + - Interaction Databases: {% tool "biogrid" %} and {% tool "intact" %} - Integrative diagrams: - A [disease map](https://disease-maps.org/) can be used to represent a conceptual model of the molecular mechanisms of a disease. An example is the {% tool "covid19map" %}. @@ -221,16 +169,15 @@ Even though the considerations for this specific field are very similar to the o ### Existing approaches Several tools and workflows have been developed or adapted for the analysis of wastewater data, especially in the context of SARS-CoV-2 surveillance: - - Specific Tools for SARS-CoV-2: Certain tools (such as {% tool "freyja"%}, {% tool "cojac"%}, and {% tool "lineagespot" %}) are specifically designed for analysing SARS-CoV-2 data, providing capabilities such as variant detection and lineage tracking. 
- - Repurposed Tools: Originally developed for other types of genomic data, tools like {% tool "kallisto" %} or {% tool "kraken2" %}, have been successfully applied to wastewater data analysis, offering high performance in read alignment and taxonomic classification. -- In addition, here are several bioinformatics protocols and solutions that could be used in the context of wastewater NGS data analysis. - - Tools and repositories: - - {% tool "pigxs" %}: provides a comprehensive solution for sequencing and analysing SARS-CoV-2 in wastewater. - - Detection of SARS-CoV-2 variants in Switzerland by genomic analysis of wastewater samples [medRxiv](https://www.medrxiv.org/content/10.1101/2021.01.08.21249379v2): COWWID: A GitHub repository from the CBG-ETHZ group offering tools for detecting SARS-CoV-2 variants in Switzerland - - [CDC Module 2.7](https://www.cdc.gov/amd/training/covid-toolkit/module2-7.html): Wastewater based variant tracking for SARS-CoV-2 - - The Public Health Alliance for Genomic Epidemiology GitHub organization makes available a mapping to the {% tool "european-nucleotide-archive" %}: {% tool "sars-pha4ge" %} - - {% tool "phes-odm" %} as an open data model for wastewater surveillance - - Viral Lineage Quantification (VLQ), Kallisto-Approach: [Lineage abundance estimation for SARS-CoV-2 in wastewater using transcriptome quantification techniques](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-022-02805-9) and corresponding repository at {% tool "vlq" %} + - **Specific Tools for SARS-CoV-2**: Certain tools (such as {% tool "freyja"%}, {% tool "cojac"%}, and {% tool "lineagespot" %}) are specifically designed for analysing SARS-CoV-2 data, providing capabilities such as variant detection and lineage tracking. 
+ - **Repurposed Tools**: Originally developed for other types of genomic data, tools like {% tool "kallisto" %} or {% tool "kraken2" %} have been successfully applied to wastewater data analysis, offering high performance in read alignment and taxonomic classification. +- In addition, here are **several bioinformatics protocols and solutions** that could be used in the context of wastewater next-generation sequencing (NGS) data analysis. + - {% tool "pigxs" %}: provides a comprehensive solution for sequencing and analysing SARS-CoV-2 in wastewater. + - Detection of SARS-CoV-2 variants in Switzerland by genomic analysis of wastewater samples [medRxiv](https://www.medrxiv.org/content/10.1101/2021.01.08.21249379v2): COWWID: A GitHub repository from the CBG-ETHZ group offering tools for detecting SARS-CoV-2 variants in Switzerland + - [CDC Module 2.7](https://www.cdc.gov/amd/training/covid-toolkit/module2-7.html): Wastewater based variant tracking for SARS-CoV-2 + - The Public Health Alliance for Genomic Epidemiology GitHub organization makes available a mapping to the {% tool "european-nucleotide-archive" %}: {% tool "sars-pha4ge" %} + - {% tool "phes-odm" %} as an open data model for wastewater surveillance + - Viral Lineage Quantification (VLQ), Kallisto-Approach: [Lineage abundance estimation for SARS-CoV-2 in wastewater using transcriptome quantification techniques](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-022-02805-9) and corresponding repository at {% tool "vlq" %} - [Performance benchmark of tools](https://peerj.com/articles/14596/), evaluating tools like Kraken2, Kallisto, Freyja, implemented in C-WAP pipeline. , implemented in {% tool "c-wap" %} - Wastewater quality control workflow in GalaxyTrakr [(SSquAWK4)](https://dx.doi.org/10.17504/protocols.io.kxygxzk5dv8j/v9). 
Further quality control aspects are discussed in the [Quality Control - Pathogen Characterisation page](/quality-control/pathogen-characterisation) - ECDC [Guidance document](https://www.ecdc.europa.eu/sites/default/files/documents/Guidance-for-representative-and-targeted-genomic-SARS-CoV-2-monitoring-updated-with%20erratum-20-May-2021.pdf) for representative and targeted genomic SARS-CoV-2 monitoring From 5b9a017f23dc47597ff131a87524a19312332c1d Mon Sep 17 00:00:00 2001 From: rabuono <77321541+rabuono@users.noreply.github.com> Date: Thu, 19 Sep 2024 12:52:30 +0200 Subject: [PATCH 14/18] typo fix --- data-analysis/pathogen-characterisation.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data-analysis/pathogen-characterisation.md b/data-analysis/pathogen-characterisation.md index b8a78ab9..4d9ff0c7 100644 --- a/data-analysis/pathogen-characterisation.md +++ b/data-analysis/pathogen-characterisation.md @@ -178,7 +178,7 @@ Several tools and workflows have been developed or adapted for the analysis of w - The Public Health Alliance for Genomic Epidemiology GitHub organization makes available a mapping to the {% tool "european-nucleotide-archive" %}: {% tool "sars-pha4ge" %} - {% tool "phes-odm" %} as an open data model for wastewater surveillance - Viral Lineage Quantification (VLQ), Kallisto-Approach: [Lineage abundance estimation for SARS-CoV-2 in wastewater using transcriptome quantification techniques](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-022-02805-9) and corresponding repository at {% tool "vlq" %} - - [Performance benchmark of tools](https://peerj.com/articles/14596/), evaluating tools like Kraken2, Kallisto, Freyja, implemented in C-WAP pipeline. 
, implemented in {% tool "c-wap" %} + - [Performance benchmark of tools](https://peerj.com/articles/14596/), evaluating tools like Kraken2, Kallisto, Freyja, implemented in {% tool "c-wap" %} - Wastewater quality control workflow in GalaxyTrakr [(SSquAWK4)](https://dx.doi.org/10.17504/protocols.io.kxygxzk5dv8j/v9). Further quality control aspects are discussed in the [Quality Control - Pathogen Characterisation page](/quality-control/pathogen-characterisation) - ECDC [Guidance document](https://www.ecdc.europa.eu/sites/default/files/documents/Guidance-for-representative-and-targeted-genomic-SARS-CoV-2-monitoring-updated-with%20erratum-20-May-2021.pdf) for representative and targeted genomic SARS-CoV-2 monitoring From 2f24d5ea08bf93c2047cbf72925302f31fa770d1 Mon Sep 17 00:00:00 2001 From: rabuono <77321541+rabuono@users.noreply.github.com> Date: Thu, 19 Sep 2024 12:55:18 +0200 Subject: [PATCH 15/18] add news item --- _data/news.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/_data/news.yml b/_data/news.yml index 0a924bfb..c841df21 100755 --- a/_data/news.yml +++ b/_data/news.yml @@ -138,3 +138,7 @@ date: 2024-09-05 linked_pr: 339 description: A showcase page was added about an open source workflow, integrating biological databases for FAIR data compliant Knowledge Graphs, in the Showcase section. [Discover the page here](/showcase/knowledge-graph-generator) +- name: "New page: Data Analysis of Pathogen Characterisation data" + date: 2024-09-19 + linked_pr: 308 + description: Content was added to the Pathogen Characterisation page on Data Analysis. 
[Discover the page here](/data-analysis/pathogen-characterisation) \ No newline at end of file From 1afb2c15a1f4c4304a8b684d42123d200bdfb02d Mon Sep 17 00:00:00 2001 From: bedroesb Date: Thu, 19 Sep 2024 16:31:55 +0200 Subject: [PATCH 16/18] add to sidebar --- _data/sidebars/main.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/_data/sidebars/main.yml b/_data/sidebars/main.yml index 420d1def..70abf4f5 100644 --- a/_data/sidebars/main.yml +++ b/_data/sidebars/main.yml @@ -14,6 +14,8 @@ subitems: subitems: - title: Human biomolecular data url: /data-analysis/human-biomolecular-data + - title: Pathogen characterisation + url: /data-analysis/pathogen-characterisation - title: Data communication url: /data-communication/ From b7bf9fb25359fa9ec5c9cb39ee8a65dd74b6e526 Mon Sep 17 00:00:00 2001 From: bedroesb Date: Thu, 19 Sep 2024 16:32:01 +0200 Subject: [PATCH 17/18] remove placeholders --- data-analysis/pathogen-characterisation.md | 8 -------- 1 file changed, 8 deletions(-) diff --git a/data-analysis/pathogen-characterisation.md b/data-analysis/pathogen-characterisation.md index 4d9ff0c7..64f3243c 100644 --- a/data-analysis/pathogen-characterisation.md +++ b/data-analysis/pathogen-characterisation.md @@ -19,14 +19,6 @@ training: rdmkit: - name: Data Analysis url: https://rdmkit.elixir-europe.org/data_analysis -faircookbook: - - name: - url: -fairsharing: - - name: - url: - - --- ## Introduction From 6713033cbb6284a95b746839d737a60e120744c6 Mon Sep 17 00:00:00 2001 From: bedroesb Date: Thu, 19 Sep 2024 16:34:06 +0200 Subject: [PATCH 18/18] add contributor --- _data/CONTRIBUTORS.yaml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/_data/CONTRIBUTORS.yaml b/_data/CONTRIBUTORS.yaml index caab7015..27f29a9b 100644 --- a/_data/CONTRIBUTORS.yaml +++ b/_data/CONTRIBUTORS.yaml @@ -289,5 +289,10 @@ Reagon Karki: email: reagon.karki@itmp.fraunhofer.de orcid: https://orcid.org/0000-0002-1815-0037 affiliation: Fraunhofer ITMP/EU-OpenScreen +Francesco Messina: + orcid: 
0000-0001-8076-7217 + git: INMIbioinfo + affiliation: IRCCS (INMI) + email: francesco.messina@inmi.it