From 3461bb3e7b1648cf5cb9e60d9c0d62fbb13ff3c2 Mon Sep 17 00:00:00 2001 From: Seung-been Lee Date: Mon, 4 Apr 2022 07:21:35 +0900 Subject: [PATCH 01/32] Bump up version number --- CHANGELOG.rst | 3 +++ pypgx/version.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index f5959baf..0224f5ad 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,6 +1,9 @@ Changelog ********* +0.15.0 (in development) +----------------------- + 0.14.0 (2022-04-03) ------------------- diff --git a/pypgx/version.py b/pypgx/version.py index ef919940..a842d05a 100644 --- a/pypgx/version.py +++ b/pypgx/version.py @@ -1 +1 @@ -__version__ = '0.14.0' +__version__ = '0.15.0' From 809f4352b3a2b22277497f3ae354178268c38cb7 Mon Sep 17 00:00:00 2001 From: "Seung-been \"Steven\" Lee" Date: Mon, 4 Apr 2022 16:20:12 +0900 Subject: [PATCH 02/32] Update `prepare-depth-of-coverage`: * Add new optional arguments ``--genes`` and ``--exclude`` to :command:`prepare-depth-of-coverage` command. --- CHANGELOG.rst | 2 ++ docs/cli.rst | 33 +++++++++++++++----------- pypgx/api/utils.py | 9 +++++-- pypgx/cli/prepare_depth_of_coverage.py | 17 ++++++++++++- 4 files changed, 44 insertions(+), 17 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 0224f5ad..1665930d 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -4,6 +4,8 @@ Changelog 0.15.0 (in development) ----------------------- +* Add new optional arguments ``--genes`` and ``--exclude`` to :command:`prepare-depth-of-coverage` command. + 0.14.0 (2022-04-03) ------------------- diff --git a/docs/cli.rst b/docs/cli.rst index 1b22d6c0..a83208de 100644 --- a/docs/cli.rst +++ b/docs/cli.rst @@ -703,6 +703,7 @@ prepare-depth-of-coverage $ pypgx prepare-depth-of-coverage -h usage: pypgx prepare-depth-of-coverage [-h] [--assembly TEXT] [--bed PATH] + [--genes TEXT [TEXT ...]] [--exclude] depth-of-coverage bams [bams ...] Prepare a depth of coverage file for all target genes with SV from BAM files. 
@@ -713,22 +714,26 @@ prepare-depth-of-coverage have star alleles defined only by SNVs/indels (e.g. CYP3A5). Positional arguments: - depth-of-coverage Output archive file with the semantic type - CovFrame[DepthOfCoverage]. - bams One or more input BAM files. Alternatively, you can - provide a text file (.txt, .tsv, .csv, or .list) - containing one BAM file per line. + depth-of-coverage Output archive file with the semantic type + CovFrame[DepthOfCoverage]. + bams One or more input BAM files. Alternatively, you can + provide a text file (.txt, .tsv, .csv, or .list) + containing one BAM file per line. Optional arguments: - -h, --help Show this help message and exit. - --assembly TEXT Reference genome assembly (default: 'GRCh37') - (choices: 'GRCh37', 'GRCh38'). - --bed PATH By default, the input data is assumed to be WGS. If - it's targeted sequencing, you must provide a BED file - to indicate probed regions. Note that the 'chr' prefix - in contig names (e.g. 'chr1' vs. '1') will be - automatically added or removed as necessary to match - the input BAM's contig names. + -h, --help Show this help message and exit. + --assembly TEXT Reference genome assembly (default: 'GRCh37') + (choices: 'GRCh37', 'GRCh38'). + --bed PATH By default, the input data is assumed to be WGS. If + it's targeted sequencing, you must provide a BED file + to indicate probed regions. Note that the 'chr' prefix + in contig names (e.g. 'chr1' vs. '1') will be + automatically added or removed as necessary to match + the input BAM's contig names. + --genes TEXT [TEXT ...] + List of genes to include. + --exclude Exclude specified genes. Ignored when --genes is not + used. 
[Example] From WGS data: $ pypgx prepare-depth-of-coverage \ diff --git a/pypgx/api/utils.py b/pypgx/api/utils.py index ae3007c8..099d49b9 100644 --- a/pypgx/api/utils.py +++ b/pypgx/api/utils.py @@ -1184,7 +1184,7 @@ def predict_cnv(copy_number, cnv_caller=None): return sdk.Archive(metadata, data) def prepare_depth_of_coverage( - bams, assembly='GRCh37', bed=None + bams, assembly='GRCh37', bed=None, genes=None, exclude=False ): """ Prepare a depth of coverage file for all target genes with SV from BAM @@ -1208,6 +1208,10 @@ def prepare_depth_of_coverage( Note that the 'chr' prefix in contig names (e.g. 'chr1' vs. '1') will be automatically added or removed as necessary to match the input BAM's contig names. + genes : list, optional + List of genes to include. + exclude : bool, default: False + Exclude specified genes. Ignored when ``genes=None``. Returns ------- @@ -1220,7 +1224,8 @@ def prepare_depth_of_coverage( } regions = create_regions_bed( - merge=True, sv_genes=True, assembly=assembly, + merge=True, sv_genes=True, assembly=assembly, genes=genes, + exclude=exclude ).to_regions() cf = pycov.CovFrame.from_bam(bams, regions=regions, zero=True) diff --git a/pypgx/cli/prepare_depth_of_coverage.py b/pypgx/cli/prepare_depth_of_coverage.py index bf066162..987be753 100644 --- a/pypgx/cli/prepare_depth_of_coverage.py +++ b/pypgx/cli/prepare_depth_of_coverage.py @@ -71,9 +71,24 @@ def create_parser(subparsers): automatically added or removed as necessary to match the input BAM's contig names.""" ) + parser.add_argument( + '--genes', + metavar='TEXT', + nargs='+', + help= +"""List of genes to include.""" + ) + parser.add_argument( + '--exclude', + action='store_true', + help= +"""Exclude specified genes. 
Ignored when --genes is not +used.""" + ) def main(args): archive = utils.prepare_depth_of_coverage( - args.bams, assembly=args.assembly, bed=args.bed + args.bams, assembly=args.assembly, bed=args.bed, genes=args.genes, + exclude=args.exclude ) archive.to_file(args.depth_of_coverage) From 231a7e229b3661e075744b4bdabe977b510b87c3 Mon Sep 17 00:00:00 2001 From: "Seung-been \"Steven\" Lee" Date: Wed, 6 Apr 2022 14:33:46 +0900 Subject: [PATCH 03/32] Add new command `slice-bam` --- CHANGELOG.rst | 1 + README.rst | 1 + docs/cli.rst | 27 ++++++++++++++++++++ pypgx/__init__.py | 1 + pypgx/api/utils.py | 23 +++++++++++++++++ pypgx/cli/slice_bam.py | 58 ++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 111 insertions(+) create mode 100644 pypgx/cli/slice_bam.py diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 1665930d..f9517e4c 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -5,6 +5,7 @@ Changelog ----------------------- * Add new optional arguments ``--genes`` and ``--exclude`` to :command:`prepare-depth-of-coverage` command. +* Add new command :command:`slice-bam`. 0.14.0 (2022-04-03) ------------------- diff --git a/README.rst b/README.rst index a3fd0334..2994d70b 100644 --- a/README.rst +++ b/README.rst @@ -669,6 +669,7 @@ For getting help on the CLI: run-long-read-pipeline Run genotyping pipeline for long-read sequencing data. run-ngs-pipeline Run genotyping pipeline for NGS data. + slice-bam Slice BAM file for all genes used by PyPGx. test-cnv-caller Test CNV caller for target gene. train-cnv-caller Train CNV caller for target gene. diff --git a/docs/cli.rst b/docs/cli.rst index a83208de..91ecee79 100644 --- a/docs/cli.rst +++ b/docs/cli.rst @@ -60,6 +60,7 @@ For getting help on the CLI: run-long-read-pipeline Run genotyping pipeline for long-read sequencing data. run-ngs-pipeline Run genotyping pipeline for NGS data. + slice-bam Slice BAM file for all genes used by PyPGx. test-cnv-caller Test CNV caller for target gene. 
train-cnv-caller Train CNV caller for target gene. @@ -929,6 +930,32 @@ run-ngs-pipeline --control-statistcs control-statistics-VDR.zip \ --platform Targeted +slice-bam +========= + +.. code-block:: text + + $ pypgx slice-bam -h + usage: pypgx slice-bam [-h] [--assembly TEXT] [--genes TEXT [TEXT ...]] + [--exclude] + input output + + Slice BAM file for all genes used by PyPGx. + + Positional arguments: + input Input BAM file. It must be already indexed to allow + random access. + output Output BAM file. + + Optional arguments: + -h, --help Show this help message and exit. + --assembly TEXT Reference genome assembly (default: 'GRCh37') + (choices: 'GRCh37', 'GRCh38'). + --genes TEXT [TEXT ...] + List of genes to include. + --exclude Exclude specified genes. Ignored when --genes is not + used. + test-cnv-caller =============== diff --git a/pypgx/__init__.py b/pypgx/__init__.py index 9214d35f..f9690bca 100644 --- a/pypgx/__init__.py +++ b/pypgx/__init__.py @@ -54,6 +54,7 @@ predict_cnv, prepare_depth_of_coverage, print_metadata, + slice_bam, test_cnv_caller, train_cnv_caller, ) diff --git a/pypgx/api/utils.py b/pypgx/api/utils.py index 099d49b9..ce71efc5 100644 --- a/pypgx/api/utils.py +++ b/pypgx/api/utils.py @@ -1265,6 +1265,29 @@ def print_metadata(input): with zf.open(f'{parent}/metadata.txt') as f: print(f.read().decode('utf-8').strip()) +def slice_bam( + input, output, assembly='GRCh37', genes=None, exclude=False +): + """ + Slice BAM file for all genes used by PyPGx. + + Parameters + ---------- + input + Input BAM file. It must be already indexed to allow random access. + output : str + Output BAM file. + assembly : {'GRCh37', 'GRCh38'}, default: 'GRCh37' + Reference genome assembly. + genes : list, optional + List of genes to include. + exclude : bool, default: False + Exclude specified genes. Ignored when ``genes=None``. 
+ """ + bf = create_regions_bed(merge=True, assembly=assembly, + genes=genes, exclude=exclude) + pybam.slice(input, bf, path=output) + def test_cnv_caller( cnv_caller, copy_number, cnv_calls, confusion_matrix=None ): diff --git a/pypgx/cli/slice_bam.py b/pypgx/cli/slice_bam.py new file mode 100644 index 00000000..37278cb8 --- /dev/null +++ b/pypgx/cli/slice_bam.py @@ -0,0 +1,58 @@ +import sys + +from ..api import utils + +import fuc +import pysam + +description = f""" +Slice BAM file for all genes used by PyPGx. +""" + +def create_parser(subparsers): + parser = fuc.api.common._add_parser( + subparsers, + fuc.api.common._script_name(), + description=description, + help= +"""Slice BAM file for all genes used by PyPGx.""" + ) + parser.add_argument( + 'input', + help= +"""Input BAM file. It must be already indexed to allow +random access.""" + ) + parser.add_argument( + 'output', + help= +"""Output BAM file.""" + ) + parser.add_argument( + '--assembly', + metavar='TEXT', + default='GRCh37', + help= +"""Reference genome assembly (default: 'GRCh37') +(choices: 'GRCh37', 'GRCh38').""" + ) + parser.add_argument( + '--genes', + metavar='TEXT', + nargs='+', + help= +"""List of genes to include.""" + ) + parser.add_argument( + '--exclude', + action='store_true', + help= +"""Exclude specified genes. 
Ignored when --genes is not +used.""" + ) + +def main(args): + utils.slice_bam( + args.input, args.output, assembly=args.assembly, genes=args.genes, + exclude=args.exclude + ) From b8c02d2c6424985112529c742d9de2899902e280 Mon Sep 17 00:00:00 2001 From: "Seung-been \"Steven\" Lee" Date: Thu, 7 Apr 2022 11:00:01 +0900 Subject: [PATCH 04/32] Add new command `print-data` --- CHANGELOG.rst | 1 + README.rst | 1 + docs/cli.rst | 17 +++++++++++++++++ pypgx/__init__.py | 1 + pypgx/api/utils.py | 20 ++++++++++++++++++++ pypgx/cli/print_data.py | 27 +++++++++++++++++++++++++++ 6 files changed, 67 insertions(+) create mode 100644 pypgx/cli/print_data.py diff --git a/CHANGELOG.rst b/CHANGELOG.rst index f9517e4c..0c6cc7cc 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -6,6 +6,7 @@ Changelog * Add new optional arguments ``--genes`` and ``--exclude`` to :command:`prepare-depth-of-coverage` command. * Add new command :command:`slice-bam`. +* Add new command :command:`print-data`. 0.14.0 (2022-04-03) ------------------- diff --git a/README.rst b/README.rst index 2994d70b..6bf5778c 100644 --- a/README.rst +++ b/README.rst @@ -664,6 +664,7 @@ For getting help on the CLI: prepare-depth-of-coverage Prepare a depth of coverage file for all target genes with SV from BAM files. + print-data Print the main data of specified archive. print-metadata Print the metadata of specified archive. run-chip-pipeline Run genotyping pipeline for chip data. run-long-read-pipeline diff --git a/docs/cli.rst b/docs/cli.rst index 91ecee79..f35edd08 100644 --- a/docs/cli.rst +++ b/docs/cli.rst @@ -55,6 +55,7 @@ For getting help on the CLI: prepare-depth-of-coverage Prepare a depth of coverage file for all target genes with SV from BAM files. + print-data Print the main data of specified archive. print-metadata Print the metadata of specified archive. run-chip-pipeline Run genotyping pipeline for chip data. 
run-long-read-pipeline @@ -747,6 +748,22 @@ prepare-depth-of-coverage bam.list \ --bed probes.bed +print-data +========== + +.. code-block:: text + + $ pypgx print-data -h + usage: pypgx print-data [-h] input + + Print the main data of specified archive. + + Positional arguments: + input Input archive file. + + Optional arguments: + -h, --help Show this help message and exit. + print-metadata ============== diff --git a/pypgx/__init__.py b/pypgx/__init__.py index f9690bca..5b78bf50 100644 --- a/pypgx/__init__.py +++ b/pypgx/__init__.py @@ -53,6 +53,7 @@ predict_alleles, predict_cnv, prepare_depth_of_coverage, + print_data, print_metadata, slice_bam, test_cnv_caller, diff --git a/pypgx/api/utils.py b/pypgx/api/utils.py index ce71efc5..a3c15a3d 100644 --- a/pypgx/api/utils.py +++ b/pypgx/api/utils.py @@ -1251,6 +1251,26 @@ def prepare_depth_of_coverage( return sdk.Archive(metadata, cf) +def print_data(input): + """ + Print the main data of specified archive. + + Parameters + ---------- + input : pypgx.Archive + Archive file. + """ + archive = sdk.Archive.from_file(input) + if 'SampleTable' in archive.type: + data = archive.data.to_csv(sep='\t') + elif 'CovFrame' in archive.type: + data = archive.data.to_string() + elif 'VcfFrame' in archive.type: + data = archive.data.to_string() + else: + raise ValueError(f"Data cannot be printed for {archive.type}") + print(data, end='') + def print_metadata(input): """ Print the metadata of specified archive. diff --git a/pypgx/cli/print_data.py b/pypgx/cli/print_data.py new file mode 100644 index 00000000..70ba9478 --- /dev/null +++ b/pypgx/cli/print_data.py @@ -0,0 +1,27 @@ +import sys + +from ..api import utils + +import fuc +import pysam + +description = f""" +Print the main data of specified archive. 
+""" + +def create_parser(subparsers): + parser = fuc.api.common._add_parser( + subparsers, + fuc.api.common._script_name(), + description=description, + help= +"""Print the main data of specified archive.""" + ) + parser.add_argument( + 'input', + help= +"""Input archive file.""" + ) + +def main(args): + utils.print_data(args.input) From 25e081d1ebb8dde4b9df1fb0ca4cd4f5b5c6150b Mon Sep 17 00:00:00 2001 From: "Seung-been \"Steven\" Lee" Date: Thu, 7 Apr 2022 11:17:19 +0900 Subject: [PATCH 05/32] Update docs --- README.rst | 11 ++++++++++- docs/create.py | 11 ++++++++++- 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/README.rst b/README.rst index 6bf5778c..2b4e41f1 100644 --- a/README.rst +++ b/README.rst @@ -374,7 +374,8 @@ Wroking with archive files -------------------------- To demonstrate how easy it is to work with PyPGx archive files, below we will -show some examples. First, download an archive: +show some examples. First, download an archive to play with, which has +``SampleTable[Results]`` as semantic type: .. code-block:: text @@ -389,6 +390,14 @@ Let's print its metadata: Assembly=GRCh37 SemanticType=SampleTable[Results] +Now print its main data (but display first sample only): + +.. code-block:: text + + $ pypgx print-data grch37-CYP2D6-results.zip | head -n 2 + Genotype Phenotype Haplotype1 Haplotype2 AlternativePhase VariantData CNV + HG00276_PyPGx *4/*5 Poor Metabolizer *4;*10;*74;*2; *10;*74;*2; ; *4:22-42524947-C-T:0.913;*10:22-42526694-G-A,22-42523943-A-G:1.0,1.0;*74:22-42525821-G-T:1.0;*2:default; DeletionHet + We can unzip it to extract files inside (note that ``tmpcty4c_cr`` is the original folder name): diff --git a/docs/create.py b/docs/create.py index 2fbeefe9..dfdecfc0 100644 --- a/docs/create.py +++ b/docs/create.py @@ -401,7 +401,8 @@ -------------------------- To demonstrate how easy it is to work with PyPGx archive files, below we will -show some examples. First, download an archive: +show some examples. 
First, download an archive to play with, which has +``SampleTable[Results]`` as semantic type: .. code-block:: text @@ -416,6 +417,14 @@ Assembly=GRCh37 SemanticType=SampleTable[Results] +Now print its main data (but display first sample only): + +.. code-block:: text + + $ pypgx print-data grch37-CYP2D6-results.zip | head -n 2 + Genotype Phenotype Haplotype1 Haplotype2 AlternativePhase VariantData CNV + HG00276_PyPGx *4/*5 Poor Metabolizer *4;*10;*74;*2; *10;*74;*2; ; *4:22-42524947-C-T:0.913;*10:22-42526694-G-A,22-42523943-A-G:1.0,1.0;*74:22-42525821-G-T:1.0;*2:default; DeletionHet + We can unzip it to extract files inside (note that ``tmpcty4c_cr`` is the original folder name): From 6a05b85e3ee0ef1d6f29f7432826a6c0277cd286 Mon Sep 17 00:00:00 2001 From: "Seung-been \"Steven\" Lee" Date: Fri, 8 Apr 2022 12:37:45 +0900 Subject: [PATCH 06/32] Update docs --- docs/tutorials.rst | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/docs/tutorials.rst b/docs/tutorials.rst index 307ebd5e..7b692828 100644 --- a/docs/tutorials.rst +++ b/docs/tutorials.rst @@ -54,6 +54,11 @@ those from: Please visit the :ref:`readme:Pipelines` page for details on how to generate the input files. +In case you are interested in creating above input files on your own, I have +also prepared "mini" BAM files where the original BAM files from GeT-RM have +been sliced to only contain genes used by PyPGx. You can download them `here +`__. + Let's look at the metadata for some of these files: .. 
code-block:: text From a1a11c3c036b8df6645f7b30102f7948483c299f Mon Sep 17 00:00:00 2001 From: Seung-been Lee Date: Sat, 9 Apr 2022 11:05:45 +0900 Subject: [PATCH 07/32] Update docs --- docs/tutorials.rst | 77 ++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 64 insertions(+), 13 deletions(-) diff --git a/docs/tutorials.rst b/docs/tutorials.rst index 7b692828..54862990 100644 --- a/docs/tutorials.rst +++ b/docs/tutorials.rst @@ -35,10 +35,13 @@ available for download and use from the `European Nucleotide Archive `__. We will be using this WGS dataset throughout the tutorial. -Because downloading the entire WGS dataset is not feasible for most users due -to its file size (i.e. a 30x WGS sample ≈ 90 GB), I have prepared input files -ranging from 2 KB to 17.6 MB, for both GRCh37 and GRCh38. You can download -those from: +Obtaining input files +--------------------- + +Because downloading the entire WGS dataset is probably not feasible for most +users due to large file size (i.e. a 30x WGS sample ≈ 90 GB), I have prepared +input files ranging from 2 KB to 25.5 MB, for both GRCh37 and GRCh38. You can +easily download these with: .. code-block:: text @@ -51,14 +54,6 @@ those from: $ wget https://raw.githubusercontent.com/sbslee/pypgx-data/main/getrm-wgs-tutorial/grch38-depth-of-coverage.zip $ wget https://raw.githubusercontent.com/sbslee/pypgx-data/main/getrm-wgs-tutorial/grch38-control-statistics-VDR.zip -Please visit the :ref:`readme:Pipelines` page for details on how to generate -the input files. - -In case you are interested in creating above input files on your own, I have -also prepared "mini" BAM files where the original BAM files from GeT-RM have -been sliced to only contain genes used by PyPGx. You can download them `here -`__. - Let's look at the metadata for some of these files: .. code-block:: text @@ -67,19 +62,75 @@ Let's look at the metadata for some of these files: Assembly=GRCh37 SemanticType=CovFrame[DepthOfCoverage] Platform=WGS + +.. 
code-block:: text + $ pypgx print-metadata grch38-control-statistics-VDR.zip Control=VDR Assembly=GRCh38 SemanticType=SampleTable[Statistics] Platform=WGS +At this point, you are now ready to move on to the next step. + +Optionally, in case you are interested in creating above input files on your +own, I have also prepared "mini" BAM files for GRCh37 where the original +sequencing data from GeT-RM have been sliced to contain genes used by PyPGx +only. You can download them `here `__. You will also need reference +FASTA when creating input VCF, which can be downloaded from `here +`__. + +Once you are finished downloading the mini BAM files and the reference FASTA +file, let's first create input VCF: + +.. code-block:: text + + $ pypgx create-input-vcf \ + grch37-variants.vcf.gz \ + /path/to/genome.fa \ + grch37-bam.list + +Note that this step can take some time to run. For example, it takes about 1 +hour to finish using my personal MacBook Air (M1, 2020) with 8 GB of memory. + +Next, we will compute depth of coverage for genes that are known to have SV: + +.. code-block:: text + + $ pypgx prepare-depth-of-coverage \ + grch37-depth-of-coverage.zip \ + grch37-bam.list + +This step should be quick. It finishes in less than 30 seconds with my laptop. + +.. code-block:: text + + $ pypgx compute-control-statistics \ + VDR \ + grch37-control-statistics-VDR.zip \ + grch37-bam.list + +Finally, we can compute control statistics using the VDR gene as control +locus, which will be used when converting read depth to copy number: + +.. code-block:: text + + $ pypgx compute-control-statistics \ + VDR \ + grch37-control-statistics-VDR.zip \ + grch37-bam.list + +This step should be quick as well. It finishes in less than 5 seconds with my +laptop. + Genotyping genes with SV ------------------------ The first gene we are going to genotype is CYP2D6, which has almost 150 star alleles including those with SV (e.g. gene deletions, duplications, and hybrids). 
To this end, we will run PyPGx's next-generation sequencing (NGS) -pipeline: +pipeline (see :ref:`readme:NGS pipeline` for more details): .. code-block:: text From 5392a5c21747aca4dd2c171affae3486749e6f56 Mon Sep 17 00:00:00 2001 From: Seung-been Lee Date: Sat, 9 Apr 2022 11:27:08 +0900 Subject: [PATCH 08/32] Update docs --- README.rst | 6 ++++++ docs/create.py | 6 ++++++ docs/tutorials.rst | 11 ++--------- pypgx/api/core.py | 4 ++-- 4 files changed, 16 insertions(+), 11 deletions(-) diff --git a/README.rst b/README.rst index 2b4e41f1..70ed10ee 100644 --- a/README.rst +++ b/README.rst @@ -524,6 +524,9 @@ HaplotypeCaller). See the `Variant caller choice `__ section for detailed discussion on when to use either option. +Check out the `GeT-RM WGS tutorial `__ to see this pipeline in action. + Chip pipeline ------------- @@ -543,6 +546,9 @@ The pipeline currently does not support SV detection. Please post a GitHub issue if you want to contribute your development skills and/or data for devising an SV detection algorithm. +Check out the `Coriell Affy tutorial `__ to see this pipeline in action. + Long-read pipeline ------------------ diff --git a/docs/create.py b/docs/create.py index dfdecfc0..71a8bd80 100644 --- a/docs/create.py +++ b/docs/create.py @@ -551,6 +551,9 @@ io/en/latest/faq.html#variant-caller-choice>`__ section for detailed discussion on when to use either option. +Check out the `GeT-RM WGS tutorial `__ to see this pipeline in action. + Chip pipeline ------------- @@ -570,6 +573,9 @@ issue if you want to contribute your development skills and/or data for devising an SV detection algorithm. +Check out the `Coriell Affy tutorial `__ to see this pipeline in action. + Long-read pipeline ------------------ diff --git a/docs/tutorials.rst b/docs/tutorials.rst index 54862990..17952bce 100644 --- a/docs/tutorials.rst +++ b/docs/tutorials.rst @@ -104,15 +104,8 @@ Next, we will compute depth of coverage for genes that are known to have SV: This step should be quick. 
It finishes in less than 30 seconds with my laptop. -.. code-block:: text - - $ pypgx compute-control-statistics \ - VDR \ - grch37-control-statistics-VDR.zip \ - grch37-bam.list - -Finally, we can compute control statistics using the VDR gene as control -locus, which will be used when converting read depth to copy number: +Finally, we will compute control statistics using the VDR gene as control +locus, which is required when converting read depth to copy number: .. code-block:: text diff --git a/pypgx/api/core.py b/pypgx/api/core.py index 9e1916b8..90cdc201 100644 --- a/pypgx/api/core.py +++ b/pypgx/api/core.py @@ -1123,7 +1123,7 @@ def predict_phenotype(gene, a, b): gene deletion, duplication, and tandem arrangement. For detailed implementation, please see the `Phenotype prediction - `__ section. Parameters @@ -1199,7 +1199,7 @@ def predict_score(gene, allele): activity score system. For detailed implementation, please see the `Phenotype prediction - `__ section. Parameters From 3119a069a719417da75fd03fbd997e0e6ab6d682 Mon Sep 17 00:00:00 2001 From: Seung-been Lee Date: Sun, 10 Apr 2022 09:53:46 +0900 Subject: [PATCH 09/32] Update `print-data` command to avoid BrokenPipeError --- pypgx/api/utils.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/pypgx/api/utils.py b/pypgx/api/utils.py index a3c15a3d..9cccaa1f 100644 --- a/pypgx/api/utils.py +++ b/pypgx/api/utils.py @@ -8,6 +8,7 @@ import zipfile import subprocess import os +import sys import pickle import warnings @@ -1269,7 +1270,14 @@ def print_data(input): data = archive.data.to_string() else: raise ValueError(f"Data cannot be printed for {archive.type}") - print(data, end='') + + # https://docs.python.org/3/library/signal.html#note-on-sigpipe + try: + print(data, end='') + except BrokenPipeError: + devnull = os.open(os.devnull, os.O_WRONLY) + os.dup2(devnull, sys.stdout.fileno()) + sys.exit(1) def print_metadata(input): """ From d8f17fc3212b2adcb2095eebc634f97b7e65e754 Mon Sep 17 
00:00:00 2001 From: Seung-been Lee Date: Sun, 10 Apr 2022 10:30:23 +0900 Subject: [PATCH 10/32] Update docs --- docs/tutorials.rst | 81 ++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 78 insertions(+), 3 deletions(-) diff --git a/docs/tutorials.rst b/docs/tutorials.rst index 17952bce..79f0aea1 100644 --- a/docs/tutorials.rst +++ b/docs/tutorials.rst @@ -54,7 +54,12 @@ easily download these with: $ wget https://raw.githubusercontent.com/sbslee/pypgx-data/main/getrm-wgs-tutorial/grch38-depth-of-coverage.zip $ wget https://raw.githubusercontent.com/sbslee/pypgx-data/main/getrm-wgs-tutorial/grch38-control-statistics-VDR.zip -Let's look at the metadata for some of these files: +Let's take a look at the metadata for some of these files. If you're not +familiar with what metadata is, please visit `Archive file, semantic type, +and metadata `__. The first one we'll +look at is an archive file with the semantic type +``CovFrame[DepthOfCoverage]``: .. code-block:: text @@ -63,6 +68,26 @@ Let's look at the metadata for some of these files: SemanticType=CovFrame[DepthOfCoverage] Platform=WGS +We can see that above archive was created using WGS data aligned to GRCh37. +It has following data structure: + +.. 
code-block:: text + + $ pypgx print-data grch37-depth-of-coverage.zip | head + Chromosome Position NA18519_PyPGx HG01190_PyPGx NA12006_PyPGx NA18484_PyPGx NA07055_PyPGx NA18980_PyPGx NA19213_PyPGx NA12813_PyPGx NA19003_PyPGx NA10831_PyPGx NA18524_PyPGx NA10851_PyPGx NA18966_PyPGx HG00589_PyPGx NA18855_PyPGx NA18544_PyPGx NA18518_PyPGx NA18973_PyPGx NA19143_PyPGx NA18992_PyPGx NA12873_PyPGx NA19207_PyPGx NA18942_PyPGx NA19178_PyPGx NA19789_PyPGx NA19122_PyPGx NA19174_PyPGx NA18868_PyPGx HG00436_PyPGx HG00276_PyPGx NA19239_PyPGx NA19109_PyPGx NA20509_PyPGx NA10854_PyPGx NA19226_PyPGx NA10847_PyPGx NA18552_PyPGx NA18526_PyPGx NA07029_PyPGx NA06991_PyPGx NA11832_PyPGx NA21781_PyPGx NA12145_PyPGx NA19007_PyPGx NA18861_PyPGx NA12156_PyPGx NA18952_PyPGx NA18565_PyPGx NA19920_PyPGx NA12003_PyPGx NA20296_PyPGx NA07019_PyPGx NA07056_PyPGx NA11993_PyPGx NA19147_PyPGx NA19819_PyPGx NA07000_PyPGx NA18540_PyPGx NA19095_PyPGx NA18509_PyPGx NA19917_PyPGx NA18617_PyPGx NA07357_PyPGx NA19176_PyPGx NA18959_PyPGx NA07348_PyPGx NA18564_PyPGx NA19908_PyPGx NA11839_PyPGx NA12717_PyPGx + chr1 110227417 17 0 9 12 12 13 10 0 0 0 0 1 14 10 4 26 7 6 0 0 4 19 8 6 0 15 0 17 20 0 0 15 10 11 0 7 18 0 0 0 0 22 11 0 6 0 0 0 24 17 17 12 19 0 14 0 0 13 15 8 0 24 0 10 + chr1 110227418 17 0 9 12 12 13 10 0 0 0 0 1 14 10 4 26 8 8 0 0 4 19 9 6 0 15 0 18 20 0 0 16 10 11 0 8 18 0 0 0 0 22 11 0 6 0 0 0 24 17 17 12 20 0 14 0 0 13 15 8 0 24 0 10 + chr1 110227419 17 0 10 12 12 13 10 0 0 0 0 1 14 10 4 27 8 8 0 0 5 19 9 6 0 16 0 18 20 0 0 16 11 11 0 8 18 0 0 0 0 22 12 0 6 0 0 0 24 17 17 12 20 0 14 0 0 14 15 8 0 24 0 10 + chr1 110227420 17 0 10 13 13 12 10 0 0 0 0 1 14 10 3 27 8 8 0 0 5 18 9 6 0 15 0 18 19 0 0 16 11 11 0 8 16 0 0 0 0 22 12 0 6 0 0 0 24 19 17 11 19 0 13 0 0 14 15 8 0 23 0 10 + chr1 110227421 17 0 10 13 13 12 10 0 0 0 0 1 13 10 3 27 8 8 0 0 5 18 8 7 0 15 0 19 19 0 0 16 11 11 0 8 15 0 0 0 0 22 12 0 6 0 0 0 25 20 17 11 19 0 13 0 0 15 15 8 0 23 0 10 + chr1 110227422 18 0 10 13 13 12 10 0 0 0 0 1 13 10 
3 27 8 8 0 0 5 18 9 7 0 15 0 19 19 0 0 17 11 11 0 8 15 0 0 0 0 21 12 0 6 0 0 0 25 20 18 11 19 0 13 0 0 16 15 9 0 23 0 10 + chr1 110227423 18 0 10 13 13 12 10 0 0 0 0 1 13 10 3 25 8 8 0 0 5 18 9 7 0 15 0 19 18 0 0 17 11 11 0 9 15 0 0 0 0 21 13 0 6 0 0 0 25 20 18 11 19 0 13 0 0 17 15 9 0 23 0 10 + chr1 110227424 18 0 10 13 13 12 10 0 0 0 0 1 13 10 3 25 8 8 0 0 5 18 9 7 0 15 0 19 18 0 0 17 11 11 0 9 15 0 0 0 0 21 13 0 6 0 0 0 26 20 18 11 19 0 14 0 0 16 15 9 0 23 0 10 + chr1 110227425 19 0 11 13 13 12 10 0 0 0 0 1 13 10 3 25 8 8 0 0 5 18 9 8 0 15 0 20 18 0 0 17 11 11 0 9 15 0 0 0 0 21 13 0 6 0 0 0 26 20 18 13 19 0 15 0 0 16 15 9 0 23 0 10 + +The second one is an archive file with the semantic type +``SampleTable[Statistics]``: + .. code-block:: text $ pypgx print-metadata grch38-control-statistics-VDR.zip @@ -71,8 +96,41 @@ Let's look at the metadata for some of these files: SemanticType=SampleTable[Statistics] Platform=WGS +Note that this archive was created using WGS data aligned to GRCh38 and the +VDR gene as control locus, and has following data structure: + +.. code-block:: text + + $ pypgx print-data grch38-control-statistics-VDR.zip | head + count mean std min 25% 50% 75% max + NA19213_PyPGx 69459.0 40.464317079140216 7.416070659882781 5.0 35.0 40.0 45.0 67.0 + HG00436_PyPGx 69459.0 39.05070617198635 7.041075412533929 3.0 34.0 39.0 44.0 66.0 + NA12006_PyPGx 69459.0 44.49780446018514 7.565078889270334 6.0 39.0 44.0 50.0 73.0 + NA12156_PyPGx 69459.0 39.53788565916584 7.463158820634827 3.0 34.0 39.0 44.0 66.0 + NA12813_PyPGx 69459.0 37.33543529276264 6.920597209929764 7.0 33.0 37.0 42.0 67.0 + NA19207_PyPGx 69459.0 40.59959112570005 7.042408883522744 4.0 36.0 41.0 45.0 63.0 + NA07029_PyPGx 69459.0 38.69389136037086 7.075488283784741 2.0 34.0 39.0 44.0 67.0 + NA18980_PyPGx 69459.0 34.79616752328712 6.685174389736681 1.0 30.0 35.0 39.0 59.0 + NA18973_PyPGx 69459.0 36.43840251083373 7.0885860461926296 3.0 32.0 37.0 41.0 66.0 + +Finally, we'll look at the input VCF. 
Note that it's not an archive file per +se, but we can still peek at its data: + +.. code-block:: text + + $ zcat grch37-variants.vcf.gz | grep "#CHROM" -A 5 + #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA18519_PyPGx HG01190_PyPGx NA12006_PyPGx NA18484_PyPGx NA07055_PyPGx NA18980_PyPGx NA19213_PyPGx NA12813_PyPGx NA19003_PyPGx NA10831_PyPGx NA18524_PyPGx NA10851_PyPGx NA18966_PyPGx HG00589_PyPGx NA18855_PyPGx NA18544_PyPGx NA18518_PyPGx NA18973_PyPGx NA19143_PyPGx NA18992_PyPGx NA12873_PyPGx NA19207_PyPGx NA18942_PyPGx NA19178_PyPGx NA19789_PyPGx NA19122_PyPGx NA19174_PyPGx NA18868_PyPGx HG00436_PyPGx HG00276_PyPGx NA19239_PyPGx NA19109_PyPGx NA20509_PyPGx NA10854_PyPGx NA19226_PyPGx NA10847_PyPGx NA18552_PyPGx NA18526_PyPGx NA07029_PyPGx NA06991_PyPGx NA11832_PyPGx NA21781_PyPGx NA12145_PyPGx NA19007_PyPGx NA18861_PyPGx NA12156_PyPGx NA18952_PyPGx NA18565_PyPGx NA19920_PyPGx NA12003_PyPGx NA20296_PyPGx NA07019_PyPGx NA07056_PyPGx NA11993_PyPGx NA19147_PyPGx NA19819_PyPGx NA07000_PyPGx NA18540_PyPGx NA19095_PyPGx NA18509_PyPGx NA19917_PyPGx NA18617_PyPGx NA07357_PyPGx NA19176_PyPGx NA18959_PyPGx NA07348_PyPGx NA18564_PyPGx NA19908_PyPGx NA11839_PyPGx NA12717_PyPGx + chr1 47261780 . 
T C 235.707 PASS DP=1519;VDB=0.326231;SGB=-40.8249;RPBZ=0.398415;MQBZ=-15.2308;MQSBZ=0.889911;BQBZ=-10.8447;SCBZ=0.105486;FS=0;MQ0F=0;AC=120;AN=140;DP4=205,13,1153,122;MQ=49 GT:PL:AD 0/0:0,57,255:19,0 0/1:204,0,172:10,11 1/1:240,45,0:0,15 0/1:147,0,165:11,10 1/1:246,54,0:0,18 1/1:255,66,0:0,22 0/1:134,0,182:15,9 1/1:255,87,0:0,29 1/1:231,54,0:0,18 1/1:224,57,0:0,19 1/1:248,36,0:0,12 0/1:120,0,176:9,7 1/1:255,54,0:0,18 1/1:198,75,0:0,25 0/1:168,0,127:7,12 1/1:255,57,0:0,19 0/1:105,0,183:9,5 1/1:223,51,0:0,17 1/1:255,63,0:0,21 1/1:255,80,0:1,31 1/1:189,60,0:0,20 0/1:148,0,214:10,12 1/1:191,45,0:0,15 0/1:98,0,175:15,6 1/1:255,69,0:0,23 0/1:158,0,100:7,16 0/1:161,0,114:5,12 0/1:255,0,138:9,14 1/1:247,81,0:0,27 1/1:227,57,0:0,19 1/1:255,63,0:0,21 1/1:255,69,0:0,23 1/1:255,75,0:0,25 1/1:255,84,0:0,28 0/1:202,0,190:14,15 1/1:224,69,0:0,23 1/1:255,66,0:0,22 1/1:255,63,0:0,21 1/1:255,39,0:0,13 1/1:255,51,0:0,17 1/1:255,72,0:0,24 1/1:231,63,0:0,21 1/1:255,78,0:0,26 1/1:255,75,0:0,25 0/1:145,0,227:16,10 1/1:200,72,0:0,24 1/1:205,72,0:0,24 1/1:207,66,0:0,22 0/1:109,0,172:12,8 0/1:174,0,135:9,14 1/1:255,66,0:0,22 1/1:255,45,0:0,15 1/1:249,54,0:0,18 1/1:255,54,0:0,18 1/1:230,72,0:0,24 1/1:247,63,0:0,21 1/1:211,81,0:0,27 1/1:255,54,0:0,18 0/1:167,0,193:13,13 1/1:255,72,0:0,24 0/1:76,0,159:11,4 1/1:236,66,0:0,22 1/1:255,78,0:0,26 1/1:218,45,0:0,15 1/1:255,60,0:0,20 1/1:255,66,0:0,22 1/1:202,78,0:0,26 1/1:255,81,0:0,27 0/1:181,0,176:16,11 1/1:231,33,0:0,11 + chr1 47261821 . 
G A 174.846 PASS DP=1722;VDB=0.413935;SGB=-18.2343;RPBZ=0.238211;MQBZ=-1.89867;MQSBZ=6.49061;BQBZ=1.3413;SCBZ=0.173613;FS=0;MQ0F=0;AC=1;AN=140;DP4=1407,277,14,2;MQ=52 GT:PL:AD 0/0:0,81,255:27,0 0/0:0,84,255:28,0 0/0:0,60,255:20,0 0/0:0,90,239:30,0 0/0:0,60,221:20,0 0/0:0,84,255:28,0 0/0:0,84,241:28,0 0/0:0,81,255:27,0 0/0:0,63,190:21,0 0/1:200,0,127:11,110/0:0,63,255:21,0 0/0:0,75,255:25,0 0/0:0,63,255:21,0 0/0:0,63,215:21,0 0/0:0,69,216:23,0 0/0:0,75,255:25,0 0/0:0,54,244:18,0 0/0:0,57,212:19,0 0/0:0,90,255:30,0 0/0:0,96,255:32,0 0/0:0,72,241:24,0 0/0:0,72,223:24,0 0/0:0,54,191:18,0 0/0:0,75,223:25,0 0/0:0,75,255:25,0 0/0:0,90,222:30,0 0/0:0,54,180:18,0 0/0:0,99,255:33,0 0/0:0,93,255:31,0 0/0:0,66,212:22,0 0/0:0,72,255:24,0 0/0:0,75,243:25,0 0/0:0,72,255:24,0 0/0:0,69,255:27,1 0/0:0,102,250:34,0 0/0:0,81,186:27,0 0/0:0,66,255:22,0 0/0:0,72,255:24,0 0/0:0,50,236:21,1 0/0:0,60,255:20,0 0/0:0,75,255:25,0 0/0:0,54,182:18,0 0/0:0,75,255:25,0 0/0:0,78,255:26,0 0/0:0,81,233:27,0 0/0:0,78,153:26,0 0/0:0,75,180:25,0 0/0:0,60,174:20,0 0/0:0,51,189:17,0 0/0:0,84,234:28,0 0/0:0,63,255:21,0 0/0:0,48,210:16,0 0/0:0,63,231:21,0 0/0:0,69,255:23,0 0/0:0,81,252:27,0 0/0:0,69,178:23,0 0/0:0,69,221:23,0 0/0:0,57,255:19,0 0/0:0,75,217:25,0 0/0:0,93,255:31,0 0/0:0,54,231:18,0 0/0:0,96,211:32,0 0/0:0,93,255:31,0 0/0:0,54,211:18,0 0/0:0,66,243:22,0 0/0:0,72,222:24,0 0/0:0,90,236:30,0 0/0:0,78,242:26,0 0/0:0,87,255:29,0 0/0:0,45,255:15,0 + chr1 47261822 . 
A T 232.856 PASS DP=1729;VDB=0.568499;SGB=-11.6626;RPBZ=-0.581723;MQBZ=-14.8734;MQSBZ=6.53808;BQBZ=1.09344;SCBZ=1.03879;FS=0;MQ0F=0;AC=88;AN=140;DP4=544,110,864,174;MQ=52 GT:PL:AD 0/0:0,81,255:27,0 0/1:255,0,226:12,17 1/1:255,60,0:0,20 0/0:0,87,255:29,0 0/0:0,63,255:21,0 0/1:152,0,255:15,11 0/1:182,0,223:17,11 1/1:255,81,0:0,27 0/1:128,0,189:13,8 1/1:255,69,0:0,23 1/1:255,66,0:0,22 0/1:246,0,193:11,14 1/1:255,60,0:0,20 1/1:255,60,0:0,20 0/0:0,66,255:22,0 1/1:255,75,0:0,25 0/0:0,54,255:18,0 1/1:255,54,0:0,18 0/1:209,0,255:19,10 0/1:255,0,255:16,161/1:255,72,0:0,24 0/1:145,0,248:15,10 0/1:113,0,170:9,6 0/1:153,0,206:16,8 1/1:255,69,0:0,23 0/0:0,87,255:29,0 0/1:149,0,187:9,10 0/1:255,0,171:12,20 0/1:176,0,255:16,13 0/1:218,0,145:11,130/1:221,0,218:14,10 0/1:237,0,184:11,15 1/1:255,72,0:0,24 1/1:255,84,0:0,28 0/1:254,0,194:16,181/1:255,75,0:0,25 1/1:255,60,0:0,20 1/1:255,69,0:0,23 0/0:0,69,255:23,0 0/0:0,60,255:20,0 1/1:255,72,0:0,24 1/1:236,54,0:0,18 1/1:255,75,0:0,25 0/1:155,0,255:18,10 0/0:0,81,255:27,0 1/1:212,75,0:0,25 0/1:196,0,133:10,15 0/1:171,0,155:9,11 0/1:105,0,188:10,7 0/1:182,0,219:14,131/1:255,63,0:0,21 1/1:255,48,0:0,16 1/1:255,63,0:0,21 1/1:255,72,0:0,24 1/1:255,78,0:0,26 1/1:232,63,0:0,21 0/0:0,66,255:22,0 0/1:150,0,215:10,7 0/1:180,0,178:13,12 0/1:243,0,190:12,180/1:106,0,222:11,6 0/1:212,0,193:13,19 1/1:255,87,0:0,29 1/1:255,57,0:0,19 0/1:203,0,189:9,13 1/1:255,69,0:0,23 0/1:233,0,146:9,20 0/0:0,81,255:27,0 0/1:180,0,249:19,9 1/1:255,45,0:0,15 + chr1 47261869 . 
C T 235.707 PASS DP=1863;VDB=0.677143;SGB=5.02317;RPBZ=-2.55997;MQBZ=-8.87433;MQSBZ=3.1481;BQBZ=26.6865;SCBZ=0.647961;FS=0;MQ0F=0;AC=88;AN=140;DP4=522,174,834,311;MQ=56 GT:PL:AD 0/0:0,84,255:28,0 0/1:255,0,194:12,20 1/1:255,69,0:0,23 0/0:0,93,255:31,0 0/0:0,69,255:23,0 0/1:216,0,255:17,11 0/1:218,0,238:14,14 1/1:255,90,0:0,30 0/1:190,0,148:10,9 1/1:255,60,0:0,20 1/1:255,81,0:0,27 0/1:255,0,192:13,13 1/1:255,81,0:0,27 1/1:255,81,0:1,31 0/0:0,65,255:26,1 1/1:255,78,0:0,26 0/0:0,51,255:17,0 1/1:255,63,0:0,21 0/1:240,0,244:17,12 0/1:255,0,255:17,171/1:255,69,0:0,23 0/1:186,0,239:16,11 0/1:247,0,255:15,12 0/1:231,0,221:17,11 1/1:255,69,0:0,23 0/0:0,87,255:29,0 0/1:196,0,198:11,11 0/1:255,0,213:16,20 0/1:232,0,238:16,13 0/1:255,0,175:13,150/1:223,0,245:22,11 0/1:255,0,255:15,16 1/1:255,81,0:0,27 1/1:255,99,0:0,33 0/1:255,0,209:16,161/1:255,87,0:0,29 1/1:255,75,0:0,25 1/1:255,75,0:0,25 0/0:0,66,255:22,0 0/0:0,63,255:21,0 1/1:255,78,0:0,26 1/1:218,54,0:0,18 1/1:255,78,0:0,26 0/1:173,0,255:23,12 0/0:0,72,255:24,0 1/1:255,75,0:0,25 0/1:213,0,168:11,13 0/1:247,0,188:11,12 0/1:195,0,124:6,9 0/1:173,0,205:16,121/1:255,66,0:0,22 1/1:255,72,0:0,24 1/1:255,54,0:0,18 1/1:255,93,0:0,31 1/1:255,84,0:0,28 1/1:255,66,0:0,22 0/0:0,48,255:21,1 0/1:190,0,255:13,8 0/1:255,0,173:9,13 0/1:255,0,214:16,180/1:202,0,179:12,11 0/1:255,0,218:16,17 1/1:255,84,0:0,28 1/1:255,81,0:0,27 0/1:255,0,111:7,18 1/1:255,69,0:0,23 0/1:255,0,213:13,19 0/0:0,66,255:22,0 0/1:253,0,247:21,13 1/1:255,75,0:0,25 + chr1 47261936 . 
C T 232.857 PASS DP=2179;VDB=0.991573;SGB=71.95;RPBZ=0.621331;MQBZ=0.919674;MQSBZ=-0.0215108;BQBZ=10.1541;SCBZ=0.212854;FS=0;MQ0F=0;AC=17;AN=140;DP4=1145,745,173,83;MQ=59 GT:PL:AD 0/0:0,87,255:29,0 0/0:0,117,255:39,0 0/0:0,72,255:24,0 0/0:0,105,255:35,0 0/1:205,0,189:10,160/1:255,0,230:10,15 0/0:0,96,255:32,0 0/0:0,96,255:32,0 0/1:225,0,222:13,12 0/0:0,69,255:23,0 0/0:0,105,255:35,0 0/0:0,78,255:26,0 0/0:0,114,255:38,0 0/0:0,123,255:41,0 0/1:210,0,255:18,100/0:0,105,255:35,0 0/0:0,78,255:26,0 0/0:0,90,255:30,0 0/0:0,96,255:32,0 0/0:0,108,255:36,0 0/0:0,84,255:28,0 0/0:0,75,255:25,0 0/1:255,0,255:15,13 0/0:0,93,255:31,0 0/0:0,84,255:28,0 0/0:0,87,255:29,0 0/0:0,81,255:27,0 0/0:0,111,255:37,0 0/1:255,0,183:10,16 0/1:255,0,251:15,170/0:0,108,255:36,0 0/0:0,99,255:33,0 0/0:0,102,255:34,0 0/0:0,99,255:33,0 0/0:0,105,255:35,0 0/0:0,117,255:39,0 0/0:0,78,255:26,0 0/0:0,102,255:34,0 1/1:255,75,0:0,25 1/1:255,99,0:0,33 0/0:0,78,255:26,0 0/0:0,66,255:22,0 0/0:0,96,255:32,0 0/0:0,87,255:29,0 0/0:0,81,255:27,0 0/0:0,93,255:31,0 0/1:224,0,252:15,13 0/0:0,96,255:32,0 0/0:0,81,255:27,0 0/0:0,102,255:34,0 0/0:0,87,255:29,0 0/0:0,108,255:36,0 0/0:0,69,255:23,0 0/0:0,96,255:32,0 0/0:0,96,255:32,0 0/0:0,93,255:31,0 1/1:255,99,0:0,33 0/0:0,81,255:27,0 0/0:0,87,255:29,0 0/0:0,102,255:34,0 0/0:0,81,255:27,0 0/1:255,0,255:20,17 0/0:0,93,255:31,0 0/0:0,84,255:28,0 0/1:100,0,255:22,6 0/0:0,87,255:29,0 0/1:255,0,255:24,19 0/0:0,78,255:26,0 0/0:0,102,255:34,0 0/0:0,66,255:22,0 + At this point, you are now ready to move on to the next step. +(Optional) Creating input files +------------------------------- + Optionally, in case you are interested in creating above input files on your own, I have also prepared "mini" BAM files for GRCh37 where the original sequencing data from GeT-RM have been sliced to contain genes used by PyPGx @@ -82,13 +140,30 @@ FASTA when creating input VCF, which can be downloaded from `here `__. 
Once you are finished downloading the mini BAM files and the reference FASTA -file, let's first create input VCF: +file, first create a text file (.txt, .tsv, .csv, or .list) containing one +BAM file per line such that: + +.. code-block:: text + + $ cat grch37-bam.list | head + /path/to/grch37-bam/NA18519_PyPGx.sorted.markdup.recal.bam + /path/to/grch37-bam/HG01190_PyPGx.sorted.markdup.recal.bam + /path/to/grch37-bam/NA12006_PyPGx.sorted.markdup.recal.bam + /path/to/grch37-bam/NA18484_PyPGx.sorted.markdup.recal.bam + /path/to/grch37-bam/NA07055_PyPGx.sorted.markdup.recal.bam + /path/to/grch37-bam/NA18980_PyPGx.sorted.markdup.recal.bam + /path/to/grch37-bam/NA19213_PyPGx.sorted.markdup.recal.bam + /path/to/grch37-bam/NA12813_PyPGx.sorted.markdup.recal.bam + /path/to/grch37-bam/NA19003_PyPGx.sorted.markdup.recal.bam + /path/to/grch37-bam/NA10831_PyPGx.sorted.markdup.recal.bam + +Now we can create input VCF: .. code-block:: text $ pypgx create-input-vcf \ grch37-variants.vcf.gz \ - /path/to/genome.fa \ + /path/to/GRCh37/genome.fa \ grch37-bam.list Note that this step can take some time to run. For example, it takes about 1 From 8e2e48862578e4aa7839c5b22cbaf5b65a0aab69 Mon Sep 17 00:00:00 2001 From: "Seung-been \"Steven\" Lee" Date: Mon, 11 Apr 2022 09:24:04 +0900 Subject: [PATCH 11/32] Update docs --- docs/tutorials.rst | 66 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 65 insertions(+), 1 deletion(-) diff --git a/docs/tutorials.rst b/docs/tutorials.rst index 79f0aea1..9ac2d75e 100644 --- a/docs/tutorials.rst +++ b/docs/tutorials.rst @@ -227,7 +227,71 @@ Above will create a number of archive files: In addition to these files, PyPGx will have also created two directories called ``copy-number-profile`` and ``allele-fraction-profile``. -Now let's make sure the genotype results are correct by comparing them with the validation data: +Let's take a look at the results: + +.. 
code-block:: text + + $ pypgx print-data grch37-CYP2D6-pipeline/results.zip | head + Genotype Phenotype Haplotype1 Haplotype2 AlternativePhase VariantData CNV + HG00589_PyPGx *1/*21 Intermediate Metabolizer *21;*2; *1; ; *21:22-42524213-C-CG:0.378;*1:22-42522613-G-C,22-42523943-A-G:0.645,0.625;*2:default; Normal + NA07019_PyPGx *1/*4 Intermediate Metabolizer *1; *4;*10;*74;*2; ; *4:22-42524947-C-T:0.452;*10:22-42523943-A-G,22-42526694-G-A:1.0,0.448;*74:22-42525821-G-T:0.424;*1:22-42522613-G-C,22-42523943-A-G:0.361,1.0;*2:default; Normal + NA10851_PyPGx *1/*4 Intermediate Metabolizer *1; *4;*10;*74;*2; ; *4:22-42524947-C-T:0.467;*10:22-42523943-A-G,22-42526694-G-A:0.95,0.421;*74:22-42525821-G-T:0.447;*1:22-42522613-G-C,22-42523943-A-G:0.486,0.95;*2:default; Normal + NA18484_PyPGx *1/*17 Normal Metabolizer *1; *17;*2; ; *17:22-42525772-G-A:0.6;*1:22-42522613-G-C,22-42523943-A-G:0.625,0.391;*2:default; Normal + NA12006_PyPGx *4/*41 Intermediate Metabolizer *41;*2; *4;*10;*2; *69; *69:22-42526694-G-A,22-42523805-C-T:0.473,0.528;*4:22-42524947-C-T:0.448;*10:22-42523943-A-G,22-42526694-G-A:0.545,0.473;*41:22-42523805-C-T:0.528;*2:default; Normal + HG00436_PyPGx *2x2/*71 Indeterminate *71;*1; *2; ; *71:22-42526669-C-T:0.433;*1:22-42522613-G-C,22-42523943-A-G:0.462,0.353;*2:default; Duplication + NA19213_PyPGx *1/*1 Normal Metabolizer *1; *1; ; *1:22-42522613-G-C,22-42523943-A-G:1.0,1.0; Normal + NA19207_PyPGx *2x2/*10 Normal Metabolizer *10;*2; *2; ; *10:22-42523943-A-G,22-42526694-G-A:0.366,0.25;*2:default; Duplication + NA07029_PyPGx *1/*35 Normal Metabolizer *35;*2; *1; ; *1:22-42522613-G-C,22-42523943-A-G:0.596,0.476;*35:22-42526763-C-T:0.405;*2:default; Normal + +You can read :ref:`readme:Results interpretation` for details on how to +interpret the PyPGx results. + +Next, we can manually inspect SV calls by visualizing copy number and allele +fraction for the CYP2D6 locus (read :ref:`readme:Structural variation +detection` for details). 
For example, the above results indicate that the samples +``HG00589_PyPGx`` and ``HG00436_PyPGx`` have ``Normal`` and ``Duplication`` +as CNV calls, respectively: + +.. list-table:: + :header-rows: 1 + :widths: 10 45 45 + + * - Sample + - Copy Number + - Allele Fraction + * - HG00589_PyPGx + - .. image:: https://raw.githubusercontent.com/sbslee/pypgx-data/main/getrm-wgs-tutorial/HG00589-copy-number.png + - .. image:: https://raw.githubusercontent.com/sbslee/pypgx-data/main/getrm-wgs-tutorial/HG00589-allele-fraction.png + * - HG00436_PyPGx + - .. image:: https://raw.githubusercontent.com/sbslee/pypgx-data/main/getrm-wgs-tutorial/HG00436-copy-number.png + - .. image:: https://raw.githubusercontent.com/sbslee/pypgx-data/main/getrm-wgs-tutorial/HG00436-allele-fraction.png + +If you want to prepare publication-quality figures, it's strongly recommended +to combine copy number and allele fraction profiles together: + +.. code-block:: text + + $ pypgx plot-cn-af \ + grch37-CYP2D6-pipeline/copy-number.zip \ + grch37-CYP2D6-pipeline/imported-variants.zip \ + --samples HG00589_PyPGx HG00436_PyPGx + +.. list-table:: + :header-rows: 1 + :widths: 10 90 + + * - Sample + - Profile + * - HG00589_PyPGx + - .. image:: https://raw.githubusercontent.com/sbslee/pypgx-data/main/getrm-wgs-tutorial/HG00589-combined.png + * - HG00436_PyPGx + - .. image:: https://raw.githubusercontent.com/sbslee/pypgx-data/main/getrm-wgs-tutorial/HG00436-combined.png + +Note that the above command also adds a fitted line on top of each copy number profile to +display what the SV classifier actually "sees". + +Now let's make sure the genotype results are correct by comparing them with +the validation data: ..
code-block:: text From 5dab4161442e826c418e4183f2ae012cf4c9e161 Mon Sep 17 00:00:00 2001 From: "Seung-been \"Steven\" Lee" Date: Tue, 12 Apr 2022 14:46:16 +0900 Subject: [PATCH 12/32] Update docs --- docs/tutorials.rst | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/docs/tutorials.rst b/docs/tutorials.rst index 9ac2d75e..1c656f92 100644 --- a/docs/tutorials.rst +++ b/docs/tutorials.rst @@ -134,10 +134,11 @@ At this point, you are now ready to move on to the next step. Optionally, in case you are interested in creating above input files on your own, I have also prepared "mini" BAM files for GRCh37 where the original sequencing data from GeT-RM have been sliced to contain genes used by PyPGx -only. You can download them `here `__. You will also need reference -FASTA when creating input VCF, which can be downloaded from `here -`__. +only. You can download them from the shared OneDrive folder `sbslee-bucket +`__. There, you +can navigate to ``sbslee-bucket`` > ``pypgx`` > ``getrm-wgs-tutorial`` > +``grch37-bam``. You will also need reference FASTA when creating input VCF, +which can be downloaded from ``sbslee-bucket`` > ``ref`` > ``grch37``. Once you are finished downloading the mini BAM files and the reference FASTA file, first create a text file (.txt, .tsv, .csv, or .list) containing one From 805c92a81bb0db5bda4762eb1be743e1f163118a Mon Sep 17 00:00:00 2001 From: "Seung-been \"Steven\" Lee" Date: Wed, 20 Apr 2022 17:20:19 +0900 Subject: [PATCH 13/32] Update docs --- docs/tutorials.rst | 50 +++++++++++++++++++++++----------------------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/docs/tutorials.rst b/docs/tutorials.rst index 1c656f92..fe296e23 100644 --- a/docs/tutorials.rst +++ b/docs/tutorials.rst @@ -126,7 +126,8 @@ se, but we can still peek at its data: chr1 47261869 . 
C T 235.707 PASS DP=1863;VDB=0.677143;SGB=5.02317;RPBZ=-2.55997;MQBZ=-8.87433;MQSBZ=3.1481;BQBZ=26.6865;SCBZ=0.647961;FS=0;MQ0F=0;AC=88;AN=140;DP4=522,174,834,311;MQ=56 GT:PL:AD 0/0:0,84,255:28,0 0/1:255,0,194:12,20 1/1:255,69,0:0,23 0/0:0,93,255:31,0 0/0:0,69,255:23,0 0/1:216,0,255:17,11 0/1:218,0,238:14,14 1/1:255,90,0:0,30 0/1:190,0,148:10,9 1/1:255,60,0:0,20 1/1:255,81,0:0,27 0/1:255,0,192:13,13 1/1:255,81,0:0,27 1/1:255,81,0:1,31 0/0:0,65,255:26,1 1/1:255,78,0:0,26 0/0:0,51,255:17,0 1/1:255,63,0:0,21 0/1:240,0,244:17,12 0/1:255,0,255:17,171/1:255,69,0:0,23 0/1:186,0,239:16,11 0/1:247,0,255:15,12 0/1:231,0,221:17,11 1/1:255,69,0:0,23 0/0:0,87,255:29,0 0/1:196,0,198:11,11 0/1:255,0,213:16,20 0/1:232,0,238:16,13 0/1:255,0,175:13,150/1:223,0,245:22,11 0/1:255,0,255:15,16 1/1:255,81,0:0,27 1/1:255,99,0:0,33 0/1:255,0,209:16,161/1:255,87,0:0,29 1/1:255,75,0:0,25 1/1:255,75,0:0,25 0/0:0,66,255:22,0 0/0:0,63,255:21,0 1/1:255,78,0:0,26 1/1:218,54,0:0,18 1/1:255,78,0:0,26 0/1:173,0,255:23,12 0/0:0,72,255:24,0 1/1:255,75,0:0,25 0/1:213,0,168:11,13 0/1:247,0,188:11,12 0/1:195,0,124:6,9 0/1:173,0,205:16,121/1:255,66,0:0,22 1/1:255,72,0:0,24 1/1:255,54,0:0,18 1/1:255,93,0:0,31 1/1:255,84,0:0,28 1/1:255,66,0:0,22 0/0:0,48,255:21,1 0/1:190,0,255:13,8 0/1:255,0,173:9,13 0/1:255,0,214:16,180/1:202,0,179:12,11 0/1:255,0,218:16,17 1/1:255,84,0:0,28 1/1:255,81,0:0,27 0/1:255,0,111:7,18 1/1:255,69,0:0,23 0/1:255,0,213:13,19 0/0:0,66,255:22,0 0/1:253,0,247:21,13 1/1:255,75,0:0,25 chr1 47261936 . 
C T 232.857 PASS DP=2179;VDB=0.991573;SGB=71.95;RPBZ=0.621331;MQBZ=0.919674;MQSBZ=-0.0215108;BQBZ=10.1541;SCBZ=0.212854;FS=0;MQ0F=0;AC=17;AN=140;DP4=1145,745,173,83;MQ=59 GT:PL:AD 0/0:0,87,255:29,0 0/0:0,117,255:39,0 0/0:0,72,255:24,0 0/0:0,105,255:35,0 0/1:205,0,189:10,160/1:255,0,230:10,15 0/0:0,96,255:32,0 0/0:0,96,255:32,0 0/1:225,0,222:13,12 0/0:0,69,255:23,0 0/0:0,105,255:35,0 0/0:0,78,255:26,0 0/0:0,114,255:38,0 0/0:0,123,255:41,0 0/1:210,0,255:18,100/0:0,105,255:35,0 0/0:0,78,255:26,0 0/0:0,90,255:30,0 0/0:0,96,255:32,0 0/0:0,108,255:36,0 0/0:0,84,255:28,0 0/0:0,75,255:25,0 0/1:255,0,255:15,13 0/0:0,93,255:31,0 0/0:0,84,255:28,0 0/0:0,87,255:29,0 0/0:0,81,255:27,0 0/0:0,111,255:37,0 0/1:255,0,183:10,16 0/1:255,0,251:15,170/0:0,108,255:36,0 0/0:0,99,255:33,0 0/0:0,102,255:34,0 0/0:0,99,255:33,0 0/0:0,105,255:35,0 0/0:0,117,255:39,0 0/0:0,78,255:26,0 0/0:0,102,255:34,0 1/1:255,75,0:0,25 1/1:255,99,0:0,33 0/0:0,78,255:26,0 0/0:0,66,255:22,0 0/0:0,96,255:32,0 0/0:0,87,255:29,0 0/0:0,81,255:27,0 0/0:0,93,255:31,0 0/1:224,0,252:15,13 0/0:0,96,255:32,0 0/0:0,81,255:27,0 0/0:0,102,255:34,0 0/0:0,87,255:29,0 0/0:0,108,255:36,0 0/0:0,69,255:23,0 0/0:0,96,255:32,0 0/0:0,96,255:32,0 0/0:0,93,255:31,0 1/1:255,99,0:0,33 0/0:0,81,255:27,0 0/0:0,87,255:29,0 0/0:0,102,255:34,0 0/0:0,81,255:27,0 0/1:255,0,255:20,17 0/0:0,93,255:31,0 0/0:0,84,255:28,0 0/1:100,0,255:22,6 0/0:0,87,255:29,0 0/1:255,0,255:24,19 0/0:0,78,255:26,0 0/0:0,102,255:34,0 0/0:0,66,255:22,0 -At this point, you are now ready to move on to the next step. +At this point, you are now ready to move on to the next step: +:ref:`tutorials:Genotyping genes with SV`. (Optional) Creating input files ------------------------------- @@ -134,38 +135,37 @@ At this point, you are now ready to move on to the next step. 
Optionally, in case you are interested in creating above input files on your own, I have also prepared "mini" BAM files for GRCh37 where the original sequencing data from GeT-RM have been sliced to contain genes used by PyPGx -only. You can download them from the shared OneDrive folder `sbslee-bucket -`__. There, you -can navigate to ``sbslee-bucket`` > ``pypgx`` > ``getrm-wgs-tutorial`` > -``grch37-bam``. You will also need reference FASTA when creating input VCF, -which can be downloaded from ``sbslee-bucket`` > ``ref`` > ``grch37``. +only: -Once you are finished downloading the mini BAM files and the reference FASTA -file, first create a text file (.txt, .tsv, .csv, or .list) containing one -BAM file per line such that: +.. code-block:: text + + $ mkdir grch37-bam + $ wget https://storage.googleapis.com/sbslee-bucket/pypgx/getrm-wgs-tutorial/grch37-bam.list + $ head -n 6 grch37-bam.list + https://storage.googleapis.com/sbslee-bucket/pypgx/getrm-wgs-tutorial/grch37-bam/HG00276_PyPGx.sorted.markdup.recal.bai + https://storage.googleapis.com/sbslee-bucket/pypgx/getrm-wgs-tutorial/grch37-bam/HG00276_PyPGx.sorted.markdup.recal.bam + https://storage.googleapis.com/sbslee-bucket/pypgx/getrm-wgs-tutorial/grch37-bam/HG00436_PyPGx.sorted.markdup.recal.bai + https://storage.googleapis.com/sbslee-bucket/pypgx/getrm-wgs-tutorial/grch37-bam/HG00436_PyPGx.sorted.markdup.recal.bam + https://storage.googleapis.com/sbslee-bucket/pypgx/getrm-wgs-tutorial/grch37-bam/HG00589_PyPGx.sorted.markdup.recal.bai + https://storage.googleapis.com/sbslee-bucket/pypgx/getrm-wgs-tutorial/grch37-bam/HG00589_PyPGx.sorted.markdup.recal.bam + $ wget -i grch37-bam.list -P grch37-bam + +You will also need a reference FASTA file when creating the input VCF: ..
code-block:: text - $ cat grch37-bam.list | head - /path/to/grch37-bam/NA18519_PyPGx.sorted.markdup.recal.bam - /path/to/grch37-bam/HG01190_PyPGx.sorted.markdup.recal.bam - /path/to/grch37-bam/NA12006_PyPGx.sorted.markdup.recal.bam - /path/to/grch37-bam/NA18484_PyPGx.sorted.markdup.recal.bam - /path/to/grch37-bam/NA07055_PyPGx.sorted.markdup.recal.bam - /path/to/grch37-bam/NA18980_PyPGx.sorted.markdup.recal.bam - /path/to/grch37-bam/NA19213_PyPGx.sorted.markdup.recal.bam - /path/to/grch37-bam/NA12813_PyPGx.sorted.markdup.recal.bam - /path/to/grch37-bam/NA19003_PyPGx.sorted.markdup.recal.bam - /path/to/grch37-bam/NA10831_PyPGx.sorted.markdup.recal.bam + $ wget https://storage.googleapis.com/sbslee-bucket/ref/grch37/genome.fa + $ wget https://storage.googleapis.com/sbslee-bucket/ref/grch37/genome.fa.fai -Now we can create input VCF: +Once you are finished downloading the mini BAM files and the reference FASTA +file, we can create the input VCF: .. code-block:: text $ pypgx create-input-vcf \ grch37-variants.vcf.gz \ - /path/to/GRCh37/genome.fa \ - grch37-bam.list + genome.fa \ + grch37-bam/*.bam Note that this step can take some time to run. For example, it takes about 1 hour to finish using my personal MacBook Air (M1, 2020) with 8 GB of memory. @@ -176,7 +176,7 @@ Next, we will compute depth of coverage for genes that are known to have SV: $ pypgx prepare-depth-of-coverage \ grch37-depth-of-coverage.zip \ - grch37-bam.list + grch37-bam/*.bam This step should be quick. It finishes in less than 30 seconds with my laptop. @@ -188,7 +188,7 @@ locus, which is required when converting read depth to copy number: $ pypgx compute-control-statistics \ VDR \ grch37-control-statistics-VDR.zip \ - grch37-bam.list + grch37-bam/*.bam This step should be quick as well. It finishes in less than 5 seconds with my laptop.
From 21cf90aa6624d98bc76de86877ded6cda598483e Mon Sep 17 00:00:00 2001 From: "Seung-been \"Steven\" Lee" Date: Mon, 25 Apr 2022 17:08:01 +0900 Subject: [PATCH 14/32] Update CNV data for CYP2A6 --- CHANGELOG.rst | 2 ++ docs/genes.rst | 24 +++++++++++++++++++++++- pypgx/api/data/cnv-table.csv | 2 ++ 3 files changed, 27 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 0c6cc7cc..0721e15d 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -7,6 +7,8 @@ Changelog * Add new optional arguments ``--genes`` and ``--exclude`` to :command:`prepare-depth-of-coverage` command. * Add new command :command:`slice-bam`. * Add new command :command:`print-data`. +* Improve CNV caller for CYP2A6. +* Add new CNV calls for CYP2A6: ``Deletion2Hom`` and ``Hybrid5``. 0.14.0 (2022-04-03) ------------------- diff --git a/docs/genes.rst b/docs/genes.rst index 741871ea..e3946c2f 100644 --- a/docs/genes.rst +++ b/docs/genes.rst @@ -533,7 +533,7 @@ Below is a summary table: - `chr4:68640596-68676652 `__ - * - :ref:`genes:UGT2B17` - - + - - ✅ - - @@ -725,6 +725,17 @@ Below is comprehensive summary of SV described from real NGS studies: - - - + * - \*4 + - Deletion2Hom + - \*4/\*4 + - + - :download:`Model ` + - :download:`Profile ` + - :download:`Profile ` + - WGS + - `1KGP `__ + - NA21093 + - * - \*4 - Deletion3Het - \*4/\*9 @@ -824,6 +835,17 @@ Below is comprehensive summary of SV described from real NGS studies: - `1KGP `__ - NA20515 - + * - + - Hybrid5 + - Indeterminate + - + - :download:`Model ` + - :download:`Profile ` + - :download:`Profile ` + - WGS + - `1KGP `__ + - HG00155 + - * - - PseudogeneDuplication - \*1/\*18 diff --git a/pypgx/api/data/cnv-table.csv b/pypgx/api/data/cnv-table.csv index be638710..bba5785b 100644 --- a/pypgx/api/data/cnv-table.csv +++ b/pypgx/api/data/cnv-table.csv @@ -14,6 +14,8 @@ CYP2A6,Duplication1 CYP2A6,Duplication2 CYP2A6,Duplication3 CYP2A6,Tandem +CYP2A6,Deletion2Hom +CYP2A6,Hybrid5 CYP2B6,Normal CYP2B6,Hybrid CYP2B6,Duplication 
From 9c1fc08cd2a67f8400f90d0094851f0d3f164aae Mon Sep 17 00:00:00 2001 From: "Seung-been \"Steven\" Lee" Date: Tue, 26 Apr 2022 16:16:26 +0900 Subject: [PATCH 15/32] Update CNV data for CYP2A6: * Add new CNV calls for CYP2A6: ``Deletion2Hom``, ``Hybrid5``, ``Hybrid6``. --- CHANGELOG.rst | 2 +- docs/genes.rst | 11 +++++++++++ pypgx/api/data/cnv-table.csv | 1 + 3 files changed, 13 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 0721e15d..ba7dcc6e 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -8,7 +8,7 @@ Changelog * Add new command :command:`slice-bam`. * Add new command :command:`print-data`. * Improve CNV caller for CYP2A6. -* Add new CNV calls for CYP2A6: ``Deletion2Hom`` and ``Hybrid5``. +* Add new CNV calls for CYP2A6: ``Deletion2Hom``, ``Hybrid5``, ``Hybrid6``. 0.14.0 (2022-04-03) ------------------- diff --git a/docs/genes.rst b/docs/genes.rst index e3946c2f..7371ac89 100644 --- a/docs/genes.rst +++ b/docs/genes.rst @@ -846,6 +846,17 @@ Below is comprehensive summary of SV described from real NGS studies: - `1KGP `__ - HG00155 - + * - + - Hybrid6 + - Indeterminate + - + - :download:`Model ` + - :download:`Profile ` + - :download:`Profile ` + - WGS + - `1KGP `__ + - HG00141 + - * - - PseudogeneDuplication - \*1/\*18 diff --git a/pypgx/api/data/cnv-table.csv b/pypgx/api/data/cnv-table.csv index bba5785b..eba47c8e 100644 --- a/pypgx/api/data/cnv-table.csv +++ b/pypgx/api/data/cnv-table.csv @@ -16,6 +16,7 @@ CYP2A6,Duplication3 CYP2A6,Tandem CYP2A6,Deletion2Hom CYP2A6,Hybrid5 +CYP2A6,Hybrid6 CYP2B6,Normal CYP2B6,Hybrid CYP2B6,Duplication From 8915da1c50b2b371556579eab47e0dd386ef1bfe Mon Sep 17 00:00:00 2001 From: "Seung-been \"Steven\" Lee" Date: Wed, 27 Apr 2022 16:34:34 +0900 Subject: [PATCH 16/32] Update CNV data for CYP2E1 --- CHANGELOG.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index ba7dcc6e..cf898813 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -7,7 +7,7 @@ 
Changelog * Add new optional arguments ``--genes`` and ``--exclude`` to :command:`prepare-depth-of-coverage` command. * Add new command :command:`slice-bam`. * Add new command :command:`print-data`. -* Improve CNV caller for CYP2A6. +* Improve CNV caller for CYP2A6 and CYP2E1. * Add new CNV calls for CYP2A6: ``Deletion2Hom``, ``Hybrid5``, ``Hybrid6``. 0.14.0 (2022-04-03) From cbdbf72a7f38b7893e0e97c10d608c6e968a59c5 Mon Sep 17 00:00:00 2001 From: "Seung-been \"Steven\" Lee" Date: Wed, 27 Apr 2022 16:34:46 +0900 Subject: [PATCH 17/32] Fix typo in docs --- docs/genes.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/genes.rst b/docs/genes.rst index 7371ac89..92f4aeba 100644 --- a/docs/genes.rst +++ b/docs/genes.rst @@ -823,7 +823,7 @@ Below is comprehensive summary of SV described from real NGS studies: - WGS - `1KGP `__ - NA18516 - - \*34 has axons 1-4 of CYP2A7 origin and axons 5-9 of CYP2A6 origin (breakpoint in intron 4). + - \*34 has exons 1-4 of CYP2A7 origin and exons 5-9 of CYP2A6 origin (breakpoint in intron 4). 
* - - Hybrid4 - Indeterminate From 7c5577913581688f532568b0e9c8be50b68093f1 Mon Sep 17 00:00:00 2001 From: "Seung-been \"Steven\" Lee" Date: Wed, 27 Apr 2022 16:37:49 +0900 Subject: [PATCH 18/32] Update docs --- docs/genes.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/genes.rst b/docs/genes.rst index 92f4aeba..b156a188 100644 --- a/docs/genes.rst +++ b/docs/genes.rst @@ -839,7 +839,7 @@ Below is comprehensive summary of SV described from real NGS studies: - Hybrid5 - Indeterminate - - - :download:`Model ` + - :download:`Model ` - :download:`Profile ` - :download:`Profile ` - WGS From c1bca76e57026d8e01a6bf9656d641846baea75b Mon Sep 17 00:00:00 2001 From: Seung-been Lee Date: Wed, 27 Apr 2022 21:55:27 +0900 Subject: [PATCH 19/32] Update CNV data for SULT1A1 --- CHANGELOG.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index cf898813..7f18401d 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -7,7 +7,7 @@ Changelog * Add new optional arguments ``--genes`` and ``--exclude`` to :command:`prepare-depth-of-coverage` command. * Add new command :command:`slice-bam`. * Add new command :command:`print-data`. -* Improve CNV caller for CYP2A6 and CYP2E1. +* Improve CNV caller for CYP2A6, CYP2E1, SULT1A1. * Add new CNV calls for CYP2A6: ``Deletion2Hom``, ``Hybrid5``, ``Hybrid6``. 
0.14.0 (2022-04-03) From 1904ad98dd5dd2105695fe0ffb344859635afb39 Mon Sep 17 00:00:00 2001 From: "Seung-been \"Steven\" Lee" Date: Thu, 28 Apr 2022 10:02:14 +0900 Subject: [PATCH 20/32] Update CNV data for GSTM1 --- CHANGELOG.rst | 3 ++- docs/genes.rst | 11 +++++++++++ pypgx/api/data/cnv-table.csv | 1 + pypgx/api/genotype.py | 2 +- 4 files changed, 15 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 7f18401d..f4290b4e 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -7,8 +7,9 @@ Changelog * Add new optional arguments ``--genes`` and ``--exclude`` to :command:`prepare-depth-of-coverage` command. * Add new command :command:`slice-bam`. * Add new command :command:`print-data`. -* Improve CNV caller for CYP2A6, CYP2E1, SULT1A1. +* Improve CNV caller for CYP2A6, CYP2E1, GSTM1, SULT1A1. * Add new CNV calls for CYP2A6: ``Deletion2Hom``, ``Hybrid5``, ``Hybrid6``. +* Add new CNV call for GSTM1: ``Normal,Deletion2``. 0.14.0 (2022-04-03) ------------------- diff --git a/docs/genes.rst b/docs/genes.rst index b156a188..086a9da8 100644 --- a/docs/genes.rst +++ b/docs/genes.rst @@ -1813,6 +1813,17 @@ Below is comprehensive summary of SV described from real NGS studies: - `GeT-RM `__ - NA18855 - + * - \*0 + - Normal,Deletion2 + - \*0/\*A + - + - :download:`Model ` + - :download:`Profile ` + - :download:`Profile ` + - WGS + - `GeT-RM `__ + - NA21097 + - * - \*0 - DeletionHom - \*0/\*0 diff --git a/pypgx/api/data/cnv-table.csv b/pypgx/api/data/cnv-table.csv index eba47c8e..a2e95aee 100644 --- a/pypgx/api/data/cnv-table.csv +++ b/pypgx/api/data/cnv-table.csv @@ -57,6 +57,7 @@ GSTM1,UpstreamDeletionHet GSTM1,"DeletionHet,UpstreamDeletionHet" GSTM1,PartialDuplication GSTM1,"DeletionHet,Deletion2" +GSTM1,"Normal,Deletion2" GSTT1,Normal GSTT1,DeletionHet GSTT1,DeletionHom diff --git a/pypgx/api/genotype.py b/pypgx/api/genotype.py index 2e5a2f4c..33ea825b 100644 --- a/pypgx/api/genotype.py +++ b/pypgx/api/genotype.py @@ -374,7 +374,7 @@ def one_row(self, r): 
s1, s2 = core.sort_alleles([a1, a2], by='priority', gene=self.gene, assembly=self.assembly) if r.CNV in ['Normal', 'AssumeNormal', 'UpstreamDeletionHet']: result = [a1, a2] - elif r.CNV in ['DeletionHet', 'DeletionHet,UpstreamDeletionHet']: + elif r.CNV in ['DeletionHet', 'DeletionHet,UpstreamDeletionHet', 'Normal,Deletion2']: result = [s1, '*0'] elif r.CNV in ['DeletionHom', 'DeletionHet,Deletion2']: result = ['*0', '*0'] From f5fa3fe88f59a09ff54a48054175dc3811aaa152 Mon Sep 17 00:00:00 2001 From: "Seung-been \"Steven\" Lee" Date: Thu, 28 Apr 2022 11:00:22 +0900 Subject: [PATCH 21/32] Update CNV data for UGT1A4 --- CHANGELOG.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index f4290b4e..70dffab3 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -7,7 +7,7 @@ Changelog * Add new optional arguments ``--genes`` and ``--exclude`` to :command:`prepare-depth-of-coverage` command. * Add new command :command:`slice-bam`. * Add new command :command:`print-data`. -* Improve CNV caller for CYP2A6, CYP2E1, GSTM1, SULT1A1. +* Improve CNV caller for CYP2A6, CYP2E1, GSTM1, SULT1A1, UGT1A4. * Add new CNV calls for CYP2A6: ``Deletion2Hom``, ``Hybrid5``, ``Hybrid6``. * Add new CNV call for GSTM1: ``Normal,Deletion2``. From a74da8089b20fb9e5ee5a0de8b4b8592d6f6ec90 Mon Sep 17 00:00:00 2001 From: "Seung-been \"Steven\" Lee" Date: Thu, 28 Apr 2022 14:24:59 +0900 Subject: [PATCH 22/32] Update CNV data for UGT2B15 --- CHANGELOG.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 70dffab3..3990dcaa 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -7,7 +7,7 @@ Changelog * Add new optional arguments ``--genes`` and ``--exclude`` to :command:`prepare-depth-of-coverage` command. * Add new command :command:`slice-bam`. * Add new command :command:`print-data`. -* Improve CNV caller for CYP2A6, CYP2E1, GSTM1, SULT1A1, UGT1A4. 
+* Improve CNV caller for CYP2A6, CYP2E1, GSTM1, SULT1A1, UGT1A4, UGT2B15. * Add new CNV calls for CYP2A6: ``Deletion2Hom``, ``Hybrid5``, ``Hybrid6``. * Add new CNV call for GSTM1: ``Normal,Deletion2``. From fb7a74dd0cf1cb96b8143f42f6721fc6c68388f8 Mon Sep 17 00:00:00 2001 From: "Seung-been \"Steven\" Lee" Date: Thu, 28 Apr 2022 15:35:44 +0900 Subject: [PATCH 23/32] Update CNV data for UGT2B17 --- CHANGELOG.rst | 3 ++- docs/genes.rst | 11 +++++++++++ pypgx/api/data/cnv-table.csv | 1 + 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 3990dcaa..96f66d8e 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -7,9 +7,10 @@ Changelog * Add new optional arguments ``--genes`` and ``--exclude`` to :command:`prepare-depth-of-coverage` command. * Add new command :command:`slice-bam`. * Add new command :command:`print-data`. -* Improve CNV caller for CYP2A6, CYP2E1, GSTM1, SULT1A1, UGT1A4, UGT2B15. +* Improve CNV caller for CYP2A6, CYP2E1, GSTM1, SULT1A1, UGT1A4, UGT2B15, UGT2B17. * Add new CNV calls for CYP2A6: ``Deletion2Hom``, ``Hybrid5``, ``Hybrid6``. * Add new CNV call for GSTM1: ``Normal,Deletion2``. +* Add new CNV call for UGT2B17: ``Deletion,PartialDeletion3``. 
0.14.0 (2022-04-03) ------------------- diff --git a/docs/genes.rst b/docs/genes.rst index 086a9da8..c5b358fa 100644 --- a/docs/genes.rst +++ b/docs/genes.rst @@ -2598,6 +2598,17 @@ Below is comprehensive summary of SV described from real NGS studies: - `1KGP `__ - NA19189 - + * - \*2 + - Deletion,PartialDeletion3 + - Indeterminate + - + - :download:`Model ` + - :download:`Profile ` + - :download:`Profile ` + - WGS + - `1KGP `__ + - NA21090 + - * - - Normal,PartialDeletion3 - Indeterminate diff --git a/pypgx/api/data/cnv-table.csv b/pypgx/api/data/cnv-table.csv index a2e95aee..8108adbb 100644 --- a/pypgx/api/data/cnv-table.csv +++ b/pypgx/api/data/cnv-table.csv @@ -90,3 +90,4 @@ UGT2B17,"Deletion,Deletion" UGT2B17,"Deletion,PartialDeletion1" UGT2B17,"Deletion,PartialDeletion2" UGT2B17,"Normal,PartialDeletion3" +UGT2B17,"Deletion,PartialDeletion3" From c29c893fab27bd34c2a09150c7d70c8b57192e83 Mon Sep 17 00:00:00 2001 From: "Seung-been \"Steven\" Lee" Date: Thu, 28 Apr 2022 17:05:21 +0900 Subject: [PATCH 24/32] Update CNV data for CYP2D6 --- CHANGELOG.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 96f66d8e..0a670503 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -7,7 +7,7 @@ Changelog * Add new optional arguments ``--genes`` and ``--exclude`` to :command:`prepare-depth-of-coverage` command. * Add new command :command:`slice-bam`. * Add new command :command:`print-data`. -* Improve CNV caller for CYP2A6, CYP2E1, GSTM1, SULT1A1, UGT1A4, UGT2B15, UGT2B17. +* Improve CNV caller for CYP2A6, CYP2D6, CYP2E1, GSTM1, SULT1A1, UGT1A4, UGT2B15, UGT2B17. * Add new CNV calls for CYP2A6: ``Deletion2Hom``, ``Hybrid5``, ``Hybrid6``. * Add new CNV call for GSTM1: ``Normal,Deletion2``. * Add new CNV call for UGT2B17: ``Deletion,PartialDeletion3``. 
From c1a146ab47cfcd0429c9dd2acc3c18ce28e7808a Mon Sep 17 00:00:00 2001 From: Seung-been Lee Date: Thu, 28 Apr 2022 22:47:26 +0900 Subject: [PATCH 25/32] Update CNV data for CYP2B6 --- CHANGELOG.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 0a670503..07964b25 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -7,7 +7,7 @@ Changelog * Add new optional arguments ``--genes`` and ``--exclude`` to :command:`prepare-depth-of-coverage` command. * Add new command :command:`slice-bam`. * Add new command :command:`print-data`. -* Improve CNV caller for CYP2A6, CYP2D6, CYP2E1, GSTM1, SULT1A1, UGT1A4, UGT2B15, UGT2B17. +* Improve CNV caller for CYP2A6, CYP2B6, CYP2D6, CYP2E1, GSTM1, SULT1A1, UGT1A4, UGT2B15, UGT2B17. * Add new CNV calls for CYP2A6: ``Deletion2Hom``, ``Hybrid5``, ``Hybrid6``. * Add new CNV call for GSTM1: ``Normal,Deletion2``. * Add new CNV call for UGT2B17: ``Deletion,PartialDeletion3``. From 1f0d98f3155c7f06f093c2216e23283955c48aa2 Mon Sep 17 00:00:00 2001 From: "Seung-been \"Steven\" Lee" Date: Fri, 29 Apr 2022 11:50:10 +0900 Subject: [PATCH 26/32] Update CNV data for CYP2A6 --- CHANGELOG.rst | 2 +- docs/genes.rst | 11 +++++++++++ pypgx/api/data/cnv-table.csv | 1 + 3 files changed, 13 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 07964b25..28e431f2 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -8,7 +8,7 @@ Changelog * Add new command :command:`slice-bam`. * Add new command :command:`print-data`. * Improve CNV caller for CYP2A6, CYP2B6, CYP2D6, CYP2E1, GSTM1, SULT1A1, UGT1A4, UGT2B15, UGT2B17. -* Add new CNV calls for CYP2A6: ``Deletion2Hom``, ``Hybrid5``, ``Hybrid6``. +* Add new CNV calls for CYP2A6: ``Deletion2Hom``, ``Hybrid5``, ``Hybrid6``, ``PseudogeneDeletion``. * Add new CNV call for GSTM1: ``Normal,Deletion2``. * Add new CNV call for UGT2B17: ``Deletion,PartialDeletion3``. 
diff --git a/docs/genes.rst b/docs/genes.rst index c5b358fa..835a5140 100644 --- a/docs/genes.rst +++ b/docs/genes.rst @@ -879,6 +879,17 @@ Below is comprehensive summary of SV described from real NGS studies: - `1KGP `__ - NA20828 - + * - + - PseudogeneDeletion + - Indeterminate + - + - :download:`Model ` + - :download:`Profile ` + - :download:`Profile ` + - WGS + - `1KGP `__ + - HG00625 + - Filtered alleles for CYP2A6 --------------------------- diff --git a/pypgx/api/data/cnv-table.csv b/pypgx/api/data/cnv-table.csv index 8108adbb..64347d83 100644 --- a/pypgx/api/data/cnv-table.csv +++ b/pypgx/api/data/cnv-table.csv @@ -17,6 +17,7 @@ CYP2A6,Tandem CYP2A6,Deletion2Hom CYP2A6,Hybrid5 CYP2A6,Hybrid6 +CYP2A6,PseudogeneDeletion CYP2B6,Normal CYP2B6,Hybrid CYP2B6,Duplication From 0692b55fe401272d7e66a6907bcee08d1be58e0d Mon Sep 17 00:00:00 2001 From: "Seung-been \"Steven\" Lee" Date: Fri, 29 Apr 2022 13:46:46 +0900 Subject: [PATCH 27/32] Update CNV data for SLC22A2 --- CHANGELOG.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 28e431f2..e4cb4d7c 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -7,7 +7,7 @@ Changelog * Add new optional arguments ``--genes`` and ``--exclude`` to :command:`prepare-depth-of-coverage` command. * Add new command :command:`slice-bam`. * Add new command :command:`print-data`. -* Improve CNV caller for CYP2A6, CYP2B6, CYP2D6, CYP2E1, GSTM1, SULT1A1, UGT1A4, UGT2B15, UGT2B17. +* Improve CNV caller for CYP2A6, CYP2B6, CYP2D6, CYP2E1, GSTM1, SLC22A2, SULT1A1, UGT1A4, UGT2B15, UGT2B17. * Add new CNV calls for CYP2A6: ``Deletion2Hom``, ``Hybrid5``, ``Hybrid6``, ``PseudogeneDeletion``. * Add new CNV call for GSTM1: ``Normal,Deletion2``. * Add new CNV call for UGT2B17: ``Deletion,PartialDeletion3``. 
From 3faa553669feb9275154f34995c6d4a3f3b4c710 Mon Sep 17 00:00:00 2001 From: "Seung-been \"Steven\" Lee" Date: Fri, 29 Apr 2022 15:58:01 +0900 Subject: [PATCH 28/32] Fix typo "statistcs" --- CHANGELOG.rst | 1 + README.rst | 4 ++-- docs/cli.rst | 16 ++++++++-------- docs/create.py | 4 ++-- pypgx/api/pipeline.py | 2 +- pypgx/api/utils.py | 4 ++-- pypgx/cli/compute_control_statistics.py | 4 ++-- pypgx/cli/compute_copy_number.py | 6 +++--- pypgx/cli/run_ngs_pipeline.py | 8 ++++---- 9 files changed, 25 insertions(+), 24 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index e4cb4d7c..700f0eef 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -7,6 +7,7 @@ Changelog * Add new optional arguments ``--genes`` and ``--exclude`` to :command:`prepare-depth-of-coverage` command. * Add new command :command:`slice-bam`. * Add new command :command:`print-data`. +* Fix the typo "statistcs" to "statistics" throughout the package. * Improve CNV caller for CYP2A6, CYP2B6, CYP2D6, CYP2E1, GSTM1, SLC22A2, SULT1A1, UGT1A4, UGT2B15, UGT2B17. * Add new CNV calls for CYP2A6: ``Deletion2Hom``, ``Hybrid5``, ``Hybrid6``, ``PseudogeneDeletion``. * Add new CNV call for GSTM1: ``Normal,Deletion2``. diff --git a/README.rst b/README.rst index 70ed10ee..d5717011 100644 --- a/README.rst +++ b/README.rst @@ -357,7 +357,7 @@ currently defined semantic types: - ``SampleTable[Results]`` * TSV file for storing various results for each sample. * Requires following metadata: ``Gene``, ``Assembly``, ``SemanticType``. -- ``SampleTable[Statistcs]`` +- ``SampleTable[Statistics]`` * TSV file for storing control gene's various statistics on read depth for each sample. Used for converting target gene's read depth to copy number. * Requires following metadata: ``Control``, ``Assembly``, ``SemanticType``, ``Platform``. 
- ``VcfFrame[Consolidated]`` @@ -509,7 +509,7 @@ input data is from whole genome sequencing (WGS) or targeted sequencing This pipeline supports SV detection based on copy number analysis for genes that are known to have SV. Therefore, if the target gene is associated with SV (e.g. CYP2D6) it's strongly recommended to provide a -``CovFrame[DepthOfCoverage]`` file and a ``SampleTable[Statistcs]`` file in +``CovFrame[DepthOfCoverage]`` file and a ``SampleTable[Statistics]`` file in addtion to a VCF file containing SNVs/indels. If the target gene is not associated with SV (e.g. CYP3A5) providing a VCF file alone is enough. You can visit the `Genes `__ page diff --git a/docs/cli.rst b/docs/cli.rst index f35edd08..e09639ef 100644 --- a/docs/cli.rst +++ b/docs/cli.rst @@ -203,13 +203,13 @@ compute-control-statistics [Example] For the VDR gene from WGS data: $ pypgx compute-control-statistics \ VDR \ - control-statistcs.zip \ + control-statistics.zip \ 1.bam 2.bam [Example] For a custom region from targeted sequencing data: $ pypgx compute-control-statistics \ chr1:100-200 \ - control-statistcs.zip \ + control-statistics.zip \ bam.list \ --bed probes.bed @@ -220,7 +220,7 @@ compute-copy-number $ pypgx compute-copy-number -h usage: pypgx compute-copy-number [-h] [--samples-without-sv TEXT [TEXT ...]] - read-depth control-statistcs copy-number + read-depth control-statistics copy-number Compute copy number from read depth for target gene. @@ -235,7 +235,7 @@ compute-copy-number Positional arguments: read-depth Input archive file with the semantic type CovFrame[ReadDepth]. - control-statistcs Input archive file with the semantic type + control-statistics Input archive file with the semantic type SampleTable[Statistics]. copy-number Output archive file with the semantic type CovFrame[CopyNumber]. @@ -899,7 +899,7 @@ run-ngs-pipeline CovFrame[DepthOfCoverage]. --control-statistics PATH Archive file with the semantic type - SampleTable[Statistcs]. + SampleTable[Statistics]. 
--platform TEXT Genotyping platform (default: 'WGS') (choices: 'WGS', 'Targeted') --assembly TEXT Reference genome assembly (default: 'GRCh37') @@ -920,7 +920,7 @@ run-ngs-pipeline Do not plot copy number profile. --do-not-plot-allele-fraction Do not plot allele fraction profile. - --cnv-caller PATH Archive file with the semantic type Model[CNV]. By + --cnv-caller PATH Archive file with the semantic type Model[CNV]. By default, a pre-trained CNV caller in the ~/pypgx-bundle directory will be used. @@ -936,7 +936,7 @@ run-ngs-pipeline CYP2D6-pipeline \ --variants variants.vcf.gz \ --depth-of-coverage depth-of-coverage.tsv \ - --control-statistcs control-statistics-VDR.zip + --control-statistics control-statistics-VDR.zip [Example] To genotype the CYP2D6 gene from targeted sequencing data: $ pypgx run-ngs-pipeline \ @@ -944,7 +944,7 @@ run-ngs-pipeline CYP2D6-pipeline \ --variants variants.vcf.gz \ --depth-of-coverage depth-of-coverage.tsv \ - --control-statistcs control-statistics-VDR.zip \ + --control-statistics control-statistics-VDR.zip \ --platform Targeted slice-bam diff --git a/docs/create.py b/docs/create.py index 71a8bd80..c33b0b98 100644 --- a/docs/create.py +++ b/docs/create.py @@ -384,7 +384,7 @@ - ``SampleTable[Results]`` * TSV file for storing various results for each sample. * Requires following metadata: ``Gene``, ``Assembly``, ``SemanticType``. -- ``SampleTable[Statistcs]`` +- ``SampleTable[Statistics]`` * TSV file for storing control gene's various statistics on read depth for each sample. Used for converting target gene's read depth to copy number. * Requires following metadata: ``Control``, ``Assembly``, ``SemanticType``, ``Platform``. - ``VcfFrame[Consolidated]`` @@ -536,7 +536,7 @@ This pipeline supports SV detection based on copy number analysis for genes that are known to have SV. Therefore, if the target gene is associated with SV (e.g. 
CYP2D6) it's strongly recommended to provide a -``CovFrame[DepthOfCoverage]`` file and a ``SampleTable[Statistcs]`` file in +``CovFrame[DepthOfCoverage]`` file and a ``SampleTable[Statistics]`` file in addtion to a VCF file containing SNVs/indels. If the target gene is not associated with SV (e.g. CYP3A5) providing a VCF file alone is enough. You can visit the `Genes `__ page diff --git a/pypgx/api/pipeline.py b/pypgx/api/pipeline.py index c1b3f858..548a4466 100644 --- a/pypgx/api/pipeline.py +++ b/pypgx/api/pipeline.py @@ -262,7 +262,7 @@ def run_ngs_pipeline( depth_of_coverage.check_metadata('Assembly', assembly) if control_statistics is None: - raise ValueError('SV detection requires SampleTable[Statistcs]') + raise ValueError('SV detection requires SampleTable[Statistics]') if isinstance(control_statistics, str): control_statistics = sdk.Archive.from_file(control_statistics) diff --git a/pypgx/api/utils.py b/pypgx/api/utils.py index 9cccaa1f..8f6b1a23 100644 --- a/pypgx/api/utils.py +++ b/pypgx/api/utils.py @@ -367,7 +367,7 @@ def compute_control_statistics( Returns ------- pypgx.Archive - Archive object with the semantic type SampleTable[Statistcs]. + Archive object with the semantic type SampleTable[Statistics]. """ gene_table = core.load_gene_table() @@ -427,7 +427,7 @@ def compute_copy_number( ---------- read_depth : str or pypgx.Archive Archive file or object with the semantic type CovFrame[ReadDepth]. - control_statistcs : str or pypgx.Archive + control_statistics : str or pypgx.Archive Archive file or object with the semandtic type SampleTable[Statistics]. 
samples_without_sv : list, optional diff --git a/pypgx/cli/compute_control_statistics.py b/pypgx/cli/compute_control_statistics.py index 007499aa..b0257094 100644 --- a/pypgx/cli/compute_control_statistics.py +++ b/pypgx/cli/compute_control_statistics.py @@ -17,13 +17,13 @@ [Example] For the VDR gene from WGS data: $ pypgx {fuc.api.common._script_name()} \\ VDR \\ - control-statistcs.zip \\ + control-statistics.zip \\ 1.bam 2.bam [Example] For a custom region from targeted sequencing data: $ pypgx {fuc.api.common._script_name()} \\ chr1:100-200 \\ - control-statistcs.zip \\ + control-statistics.zip \\ bam.list \\ --bed probes.bed """ diff --git a/pypgx/cli/compute_copy_number.py b/pypgx/cli/compute_copy_number.py index 9b084fab..892412ff 100644 --- a/pypgx/cli/compute_copy_number.py +++ b/pypgx/cli/compute_copy_number.py @@ -33,8 +33,8 @@ def create_parser(subparsers): CovFrame[ReadDepth].""" ) parser.add_argument( - 'control_statistcs', - metavar='control-statistcs', + 'control_statistics', + metavar='control-statistics', help= """Input archive file with the semantic type SampleTable[Statistics].""" @@ -56,7 +56,7 @@ def create_parser(subparsers): def main(args): result = utils.compute_copy_number( - args.read_depth, args.control_statistcs, + args.read_depth, args.control_statistics, samples_without_sv=args.samples_without_sv ) result.to_file(args.copy_number) diff --git a/pypgx/cli/run_ngs_pipeline.py b/pypgx/cli/run_ngs_pipeline.py index 03273333..2adf18df 100644 --- a/pypgx/cli/run_ngs_pipeline.py +++ b/pypgx/cli/run_ngs_pipeline.py @@ -26,7 +26,7 @@ CYP2D6-pipeline \\ --variants variants.vcf.gz \\ --depth-of-coverage depth-of-coverage.tsv \\ - --control-statistcs control-statistics-VDR.zip + --control-statistics control-statistics-VDR.zip [Example] To genotype the CYP2D6 gene from targeted sequencing data: $ pypgx {fuc.api.common._script_name()} \\ @@ -34,7 +34,7 @@ CYP2D6-pipeline \\ --variants variants.vcf.gz \\ --depth-of-coverage depth-of-coverage.tsv \\ - 
--control-statistcs control-statistics-VDR.zip \\ + --control-statistics control-statistics-VDR.zip \\ --platform Targeted """ @@ -78,7 +78,7 @@ def create_parser(subparsers): metavar='PATH', help= """Archive file with the semantic type -SampleTable[Statistcs].""" +SampleTable[Statistics].""" ) parser.add_argument( '--platform', @@ -150,7 +150,7 @@ def create_parser(subparsers): '--cnv-caller', metavar='PATH', help= -"""Archive file with the semantic type Model[CNV]. By +"""Archive file with the semantic type Model[CNV]. By default, a pre-trained CNV caller in the ~/pypgx-bundle directory will be used.""" ) From f63dfa7dd0e7b5d85c4d107dac3060f31d2b3b86 Mon Sep 17 00:00:00 2001 From: Seung-been Lee Date: Sat, 30 Apr 2022 21:30:43 +0900 Subject: [PATCH 29/32] Update CNV data for SULT1A1 --- CHANGELOG.rst | 1 + docs/genes.rst | 11 +++++++++++ pypgx/api/data/cnv-table.csv | 1 + 3 files changed, 13 insertions(+) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 700f0eef..4d967a62 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -11,6 +11,7 @@ Changelog * Improve CNV caller for CYP2A6, CYP2B6, CYP2D6, CYP2E1, GSTM1, SLC22A2, SULT1A1, UGT1A4, UGT2B15, UGT2B17. * Add new CNV calls for CYP2A6: ``Deletion2Hom``, ``Hybrid5``, ``Hybrid6``, ``PseudogeneDeletion``. * Add new CNV call for GSTM1: ``Normal,Deletion2``. +* Add new CNV call for SULT1A1: ``Unknown1``. * Add new CNV call for UGT2B17: ``Deletion,PartialDeletion3``. 
0.14.0 (2022-04-03) diff --git a/docs/genes.rst b/docs/genes.rst index 835a5140..a8145bf1 100644 --- a/docs/genes.rst +++ b/docs/genes.rst @@ -2286,6 +2286,17 @@ Below is comprehensive summary of SV described from real NGS studies: - `GeT-RM `__ - NA19143 - + * - + - Unknown1 + - Indeterminate + - + - :download:`Model ` + - :download:`Profile ` + - :download:`Profile ` + - WGS + - `GeT-RM `__ + - HG01085 + - TBXAS1 ====== diff --git a/pypgx/api/data/cnv-table.csv b/pypgx/api/data/cnv-table.csv index 64347d83..66df1aae 100644 --- a/pypgx/api/data/cnv-table.csv +++ b/pypgx/api/data/cnv-table.csv @@ -73,6 +73,7 @@ SULT1A1,DeletionHom SULT1A1,Duplication SULT1A1,Multiplication1 SULT1A1,Multiplication2 +SULT1A1,Unknown1 UGT1A4,Normal UGT1A4,Intron1DeletionA UGT1A4,Intron1DeletionB From 734fadbf38780fd235ec1190d22e1626c61110bb Mon Sep 17 00:00:00 2001 From: Seung-been Lee Date: Mon, 2 May 2022 17:10:26 +0900 Subject: [PATCH 30/32] Update CNV data for CYP2D6; update `sdk.utils.simulate_copy_number` --- CHANGELOG.rst | 2 ++ docs/genes.rst | 11 +++++++++++ pypgx/api/data/cnv-table.csv | 1 + pypgx/sdk/utils.py | 9 ++++++++- 4 files changed, 22 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 4d967a62..46bdf74d 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -8,8 +8,10 @@ Changelog * Add new command :command:`slice-bam`. * Add new command :command:`print-data`. * Fix the typo "statistcs" to "statistics" throughout the package. +* Update :meth:`sdk.utils.simulate_copy_number` method to automatically handle duplicate sample names. * Improve CNV caller for CYP2A6, CYP2B6, CYP2D6, CYP2E1, GSTM1, SLC22A2, SULT1A1, UGT1A4, UGT2B15, UGT2B17. * Add new CNV calls for CYP2A6: ``Deletion2Hom``, ``Hybrid5``, ``Hybrid6``, ``PseudogeneDeletion``. +* Add new CNV call for CYP2D6: ``Tandem2F``. * Add new CNV call for GSTM1: ``Normal,Deletion2``. * Add new CNV call for SULT1A1: ``Unknown1``. * Add new CNV call for UGT2B17: ``Deletion,PartialDeletion3``. 
diff --git a/docs/genes.rst b/docs/genes.rst index a8145bf1..c883c20d 100644 --- a/docs/genes.rst +++ b/docs/genes.rst @@ -1213,6 +1213,17 @@ Below is comprehensive summary of SV described from real NGS studies: - - - + * - + - Tandem2F + - Indeterminate + - + - :download:`Model ` + - :download:`Profile ` + - :download:`Profile ` + - WGS + - `1KGP `__ + - HG00458 + - * - \*13+\*1 - Tandem3 - \*1/\*13+\*1 diff --git a/pypgx/api/data/cnv-table.csv b/pypgx/api/data/cnv-table.csv index 66df1aae..06e409a5 100644 --- a/pypgx/api/data/cnv-table.csv +++ b/pypgx/api/data/cnv-table.csv @@ -39,6 +39,7 @@ CYP2D6,Unknown1 CYP2D6,Unknown2 CYP2D6,PseudogeneDeletion CYP2D6,PseudogeneDownstreamDel +CYP2D6,Tandem2F CYP2E1,Normal CYP2E1,Duplication1 CYP2E1,Duplication2 diff --git a/pypgx/sdk/utils.py b/pypgx/sdk/utils.py index b02a93bc..54ac7c82 100644 --- a/pypgx/sdk/utils.py +++ b/pypgx/sdk/utils.py @@ -314,7 +314,14 @@ def simulate_copy_number( s = data - noise s[data == 0] = 0 s[s < 0] = 0 - target.data.df[f'{sv}_{i+1}'] = s + + j = 1 + name = f'{sv}_{i+j}' + while name in target.data.samples: + j += 1 + name = f'{sv}_{i+j}' + + target.data.df[name] = s return target From abef16a149e85cb3360a88a1dc66fbb9179db3f4 Mon Sep 17 00:00:00 2001 From: Seung-been Lee Date: Tue, 3 May 2022 14:48:21 +0900 Subject: [PATCH 31/32] Update docs --- CHANGELOG.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 46bdf74d..99ed501d 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,8 +1,8 @@ Changelog ********* -0.15.0 (in development) ------------------------ +0.15.0 (2022-05-03) +------------------- * Add new optional arguments ``--genes`` and ``--exclude`` to :command:`prepare-depth-of-coverage` command. * Add new command :command:`slice-bam`. 
From 83ef1faf789cf120c56308c8997b50d202b44f79 Mon Sep 17 00:00:00 2001 From: Seung-been Lee Date: Tue, 3 May 2022 16:54:37 +0900 Subject: [PATCH 32/32] Update docs --- CHANGELOG.rst | 2 +- README.rst | 2 +- docs/create.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 99ed501d..318dbb75 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -7,7 +7,7 @@ Changelog * Add new optional arguments ``--genes`` and ``--exclude`` to :command:`prepare-depth-of-coverage` command. * Add new command :command:`slice-bam`. * Add new command :command:`print-data`. -* Fix the typo "statistcs" to "statistics" throughout the package. +* Fix typo "statistcs" to "statistics" throughout the package. * Update :meth:`sdk.utils.simulate_copy_number` method to automatically handle duplicate sample names. * Improve CNV caller for CYP2A6, CYP2B6, CYP2D6, CYP2E1, GSTM1, SLC22A2, SULT1A1, UGT1A4, UGT2B15, UGT2B17. * Add new CNV calls for CYP2A6: ``Deletion2Hom``, ``Hybrid5``, ``Hybrid6``, ``PseudogeneDeletion``. diff --git a/README.rst b/README.rst index d5717011..26b0be65 100644 --- a/README.rst +++ b/README.rst @@ -370,7 +370,7 @@ currently defined semantic types: * VcfFrame for storing target gene's phased variant data. * Requires following metadata: ``Platform``, ``Gene``, ``Assembly``, ``SemanticType``, ``Program``. -Wroking with archive files +Working with archive files -------------------------- To demonstrate how easy it is to work with PyPGx archive files, below we will diff --git a/docs/create.py b/docs/create.py index c33b0b98..e16dc502 100644 --- a/docs/create.py +++ b/docs/create.py @@ -397,7 +397,7 @@ * VcfFrame for storing target gene's phased variant data. * Requires following metadata: ``Platform``, ``Gene``, ``Assembly``, ``SemanticType``, ``Program``. -Wroking with archive files +Working with archive files -------------------------- To demonstrate how easy it is to work with PyPGx archive files, below we will