diff --git a/CHANGELOG.rst b/CHANGELOG.rst index d6d2c743..09a70986 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,6 +1,30 @@ Changelog ********* +0.13.0 (2022-03-01) +------------------- + +* Add new genotyping platform, ``LongRead``, to :command:`import-variants` command. +* Add new command :command:`run-long-read-pipeline`. +* Remove ``Code`` column from ``cnv-table.csv`` file. From now on, CNV codes will be generated on the fly. +* Add new method :meth:`api.core.load_cpic_table`. +* Move following errors from ``api.core`` submodule to ``sdk.utils`` submodule: :class:`AlleleNotFoundError`, :class:`GeneNotFoundError`, :class:`NotTargetGeneError`, :class:`PhenotypeNotFoundError`, :class:`VariantNotFoundError`. +* Combine optional arguments ``--bam`` and ``--fn`` into single positional argument ``bams`` for following commands: :command:`compute-control-statistics`, :command:`compute-target-depth`, :command:`prepare-depth-of-coverage`. +* Rename ``output`` argument to ``copy-number`` for :command:`compute-copy-number` command. +* Rename ``output`` argument to ``read-depth`` for :command:`compute-read-depth` command. +* Combine optional arguments ``--gene`` and ``--region`` into single positional argument ``gene`` for :command:`compute-control-statistics` command. +* Deprecate :meth:`sdk.utils.parse_input_bams` method. +* Update :meth:`api.utils.predict_alleles` method to match ``0.31.0`` version of ``fuc`` package. +* Fix bug in :command:`filter-samples` command when ``--exclude`` argument is used for archive files with SampleTable type. +* Remove unnecessary optional argument ``assembly`` from :meth:`api.core.get_ref_allele`. +* Improve CNV caller for CYP2A6, CYP2B6, CYP2D6, CYP2E1, CYP4F2, GSTM1, SLC22A2, SULT1A1, UGT1A4, UGT2B15, and UGT2B17. +* Add a new CNV call for CYP2D6: ``PseudogeneDeletion``. +* In CYP2E1 CNV nomenclature, ``PartialDuplication`` has been renamed to ``PartialDuplicationHet`` and a new CNV call ``PartialDuplicationHom`` has been added. Furthermore, calling algorithm for CYP2E1\*S1 allele has been updated. When partial duplication is present, from now on the algorithm requires only \*7 to call \*S1 instead of both \*7 and \*4. +* Add a new CNV call for SLC22A2: ``Intron9Deletion,Exon11Deletion``. +* Add a new CNV call for UGT1A4: ``Intron1PartialDup``. +* Add new CNV calls for UGT2B15: ``PartialDeletion3`` and ``Deletion``. +* Add a new CNV call for UGT2B17: ``Deletion,PartialDeletion2``. Additionally, several CNV calls have been renamed: ``Normal`` → ``Normal,Normal``; ``DeletionHet`` → ``Normal,Deletion``; ``DeletionHom`` → ``Deletion,Deletion``; ``PartialDeletionHet`` → ``Deletion,PartialDeletion1``. + 0.12.0 (2022-01-29) ------------------- @@ -21,7 +45,7 @@ Changelog * Fix minor bug in :command:`compute-copy-number` command. * Update :command:`plot-cn-af` command to check input files more rigorously. * Improve CNV caller for CYP2A6, CYP2D6, and SLC22A2. -* Add new method :meth:`sdk.utils.add_cn_samples` method. +* Add new method :meth:`sdk.utils.add_cn_samples`. * Update :command:`compare-genotypes` command to output CNV comparisonw results as well. * Update :command:`estimate-phase-beagle` command. From now on, the 'chr' prefix in contig names (e.g. 'chr1' vs. '1') will be automatically added or removed as necessary to match the reference VCF’s contig names. * Add index files for 1KGP reference haplotype panels. 
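The entries above introduce the ``LongRead`` genotyping platform and the new :command:`run-long-read-pipeline` command. As a point of reference, here is a minimal sketch of the corresponding API call described later in this diff (``pypgx.pipeline.run_long_read_pipeline``); the positional arguments mirror the CLI example for CYP3A5 and the ``assembly`` keyword follows the ``assembly='GRCh38'`` usage noted in the README. Treat it as an illustration rather than the definitive signature.

.. code:: python3

    from pypgx.pipeline import run_long_read_pipeline

    # Sketch only: mirrors the CLI example
    #   $ pypgx run-long-read-pipeline CYP3A5 CYP3A5-pipeline variants.vcf.gz
    # The input VCF must be BGZF compressed (.gz), indexed (.tbi), and
    # phased with a read-backed tool such as WhatsHap.
    run_long_read_pipeline(
        'CYP3A5',            # target gene
        'CYP3A5-pipeline',   # output directory
        'variants.vcf.gz',   # phased input VCF
        assembly='GRCh37',   # GRCh37 is always the default
    )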
diff --git a/README.rst b/README.rst index 4f0f7176..24654a56 100644 --- a/README.rst +++ b/README.rst @@ -33,6 +33,9 @@ The package is written in Python, and supports both command line interface (CLI) and application programming interface (API) whose documentations are available at the `Read the Docs `_. +PyPGx can be used to predict PGx genotypes and phenotypes using various +genomic data, including data from next-generation sequencing (NGS), single +nucleotide polymorphism (SNP) array, and long-read sequencing. Importantly, PyPGx is compatible with both of the Genome Reference Consortium Human (GRCh) builds, GRCh37 (hg19) and GRCh38 (hg38). @@ -172,7 +175,7 @@ directory in order for PyPGx to correctly access the moved files: .. code-block:: text $ cd ~ - $ git clone --branch 0.12.0 --depth 1 https://github.com/sbslee/pypgx-bundle + $ git clone --branch 0.13.0 --depth 1 https://github.com/sbslee/pypgx-bundle This is undoubtedly annoying, but absolutely necessary for portability reasons because PyPGx has been growing exponentially in file size due to the @@ -189,35 +192,43 @@ sv>`__ such as gene deletions, duplications, and hybrids. You can visit the `Genes `__ page to see the list of genes with SV. -Some of the SV events can be quite challenging to detect accurately with -next-generation sequencing (NGS) data due to misalignment of sequence reads -caused by sequence homology with other gene family members (e.g. CYP2D6 and -CYP2D7). PyPGx attempts to address this issue by training a `support vector -machine (SVM) `__-based multiclass classifier using the `one-vs-rest -strategy `__ for each gene for each GRCh build. Each -classifier is trained using copy number profiles of real NGS samples as well -as simulated ones. +Some of the SV events can be quite challenging to detect accurately with NGS +data due to misalignment of sequence reads caused by sequence homology with +other gene family members (e.g. CYP2D6 and CYP2D7). PyPGx attempts to address +this issue by training a `support vector machine (SVM) `__-based multiclass +classifier using the `one-vs-rest strategy `__ for each +gene for each GRCh build. Each classifier is trained using copy number +profiles of real NGS samples as well as simulated ones. You can plot copy number profile and allele fraction profile with PyPGx to visually inspect SV calls. Below are CYP2D6 examples: .. list-table:: :header-rows: 1 - :widths: 20 80 + :widths: 10 30 60 * - SV Name + - Gene Model - Profile * - Normal + - .. image:: https://raw.githubusercontent.com/sbslee/pypgx-data/main/dpsv/gene-model-CYP2D6-1.png - .. image:: https://raw.githubusercontent.com/sbslee/pypgx-data/main/dpsv/GRCh37-CYP2D6-8.png * - DeletionHet + - .. image:: https://raw.githubusercontent.com/sbslee/pypgx-data/main/dpsv/gene-model-CYP2D6-2.png - .. image:: https://raw.githubusercontent.com/sbslee/pypgx-data/main/dpsv/GRCh37-CYP2D6-1.png + * - DeletionHom + - .. image:: https://raw.githubusercontent.com/sbslee/pypgx-data/main/dpsv/gene-model-CYP2D6-3.png + - .. image:: https://raw.githubusercontent.com/sbslee/pypgx-data/main/dpsv/GRCh37-CYP2D6-6.png * - Duplication + - .. image:: https://raw.githubusercontent.com/sbslee/pypgx-data/main/dpsv/gene-model-CYP2D6-4.png - .. image:: https://raw.githubusercontent.com/sbslee/pypgx-data/main/dpsv/GRCh37-CYP2D6-2.png * - Tandem3 + - .. image:: https://raw.githubusercontent.com/sbslee/pypgx-data/main/dpsv/gene-model-CYP2D6-11.png - .. image:: https://raw.githubusercontent.com/sbslee/pypgx-data/main/dpsv/GRCh37-CYP2D6-9.png * - Tandem2C + - .. 
image:: https://raw.githubusercontent.com/sbslee/pypgx-data/main/dpsv/gene-model-CYP2D6-10.png - .. image:: https://raw.githubusercontent.com/sbslee/pypgx-data/main/dpsv/GRCh37-CYP2D6-7.png GRCh37 vs. GRCh38 @@ -229,10 +240,10 @@ may be tempted to use tools like ``LiftOver`` to convert GRCh37 to GRCh38, or vice versa, but deep down you know it's going to be a mess (and please don't do this). The good news is, PyPGx supports both of the builds! -In many of the PyPGx actions, you can simply indicate which human genome -build to use. For example, you can use ``assembly`` for the API and -``--assembly`` for the CLI. **Note that GRCh37 will always be the default.** -Below is an example of using the API: +In many PyPGx actions, you can simply indicate which genome build to use. For +example, for GRCh38 data you can use ``--assembly GRCh38`` in CLI and +``assembly='GRCh38'`` in API. **Note that GRCh37 will always be the +default.** Below is an example of using the API: .. code:: python3 @@ -300,7 +311,7 @@ as pairs of ``=``-separated keys and values (e.g. ``Assembly=GRCh37``): - ``CYP2D6``, ``GSTT1`` * - ``Platform`` - Genotyping platform. - - ``WGS``, ``Targeted``, ``Chip`` + - ``WGS``, ``Targeted``, ``Chip``, ``LongRead`` * - ``Program`` - Name of the phasing program. - ``Beagle``, ``SHAPEIT`` @@ -411,16 +422,69 @@ input and outputs a ``SampleTable[Phenotypes]`` file: Pipelines ========= -PyPGx provides two pipelines for performing PGx genotype analysis: NGS pipeline and chip pipeline. +PyPGx currently provides three pipelines for performing PGx genotype analysis +of a single gene for one or multiple samples: NGS pipeline, chip pipeline, and +long-read pipeline. In addition to genotyping, each pipeline will perform +phenotype prediction based on genotype results. All pipelines are compatible +with both GRCh37 and GRCh38 (e.g. for GRCh38 use ``--assembly GRCh38`` in CLI +and ``assembly='GRCh38'`` in API). -**NGS pipeline** +NGS pipeline +------------ .. image:: https://raw.githubusercontent.com/sbslee/pypgx-data/main/flowchart-ngs-pipeline.png -**Chip pipeline** +Implemented as ``pypgx run-ngs-pipeline`` in CLI and +``pypgx.pipeline.run_ngs_pipeline`` in API, this pipeline is designed for +processing short-read data (e.g. Illumina). Users must specify whether the +input data is from whole genome sequencing (WGS) or targeted sequencing +(custom targeted panel sequencing or whole exome sequencing). + +This pipeline supports SV detection based on copy number analysis for genes +that are known to have SV. Therefore, if the target gene is associated with +SV (e.g. CYP2D6), it's strongly recommended to provide a +``CovFrame[DepthOfCoverage]`` file and a ``SampleTable[Statistics]`` file in +addition to a VCF file containing SNVs/indels. If the target gene is not +associated with SV (e.g. CYP3A5), providing a VCF file alone is enough. You can +visit the `Genes `__ page +to see the full list of genes with SV. For details on the SV detection algorithm, +please see the `Structural variation detection `__ section. + +Chip pipeline +------------- .. image:: https://raw.githubusercontent.com/sbslee/pypgx-data/main/flowchart-chip-pipeline.png +Implemented as ``pypgx run-chip-pipeline`` in CLI and +``pypgx.pipeline.run_chip_pipeline`` in API, this pipeline is designed for +DNA chip data (e.g. Global Screening Array from Illumina). It's recommended +to perform variant imputation on the input VCF prior to feeding it to the +pipeline using a large reference haplotype panel (e.g. `TOPMed Imputation +Server `__). 
+Alternatively, it's possible to perform variant imputation with the 1000 +Genomes Project (1KGP) data as reference within PyPGx using ``--impute`` in +CLI and ``impute=True`` in API. + +The pipeline currently does not support SV detection. Please post a GitHub +issue if you want to contribute your development skills and/or data for +devising an SV detection algorithm. + +Long-read pipeline +------------------ + +.. image:: https://raw.githubusercontent.com/sbslee/pypgx-data/main/flowchart-long-read-pipeline.png + +Implemented as ``pypgx run-long-read-pipeline`` in CLI and +``pypgx.pipeline.run_long_read_pipeline`` in API, this pipeline is designed +for long-read data (e.g. Pacific Biosciences and Oxford Nanopore +Technologies). The input VCF must be phased using a read-backed haplotype +phasing tool such as `WhatsHap `__. + +The pipeline currently does not support SV detection. Please post a GitHub +issue if you want to contribute your development skills and/or data for +devising an SV detection algorithm. + Getting help ============ @@ -437,50 +501,50 @@ For getting help on the CLI: positional arguments: COMMAND - call-genotypes Call genotypes for the target gene. - call-phenotypes Call phenotypes for the target gene. - combine-results Combine various results for the target gene. + call-genotypes Call genotypes for target gene. + call-phenotypes Call phenotypes for target gene. + combine-results Combine various results for target gene. compare-genotypes Calculate concordance between two genotype results. compute-control-statistics - Compute summary statistics for the control gene from - BAM files. + Compute summary statistics for control gene from BAM + files. compute-copy-number - Compute copy number from read depth for the target - gene. + Compute copy number from read depth for target gene. compute-target-depth - Compute read depth for the target gene from BAM files. + Compute read depth for target gene from BAM files. create-consolidated-vcf Create a consolidated VCF file. - create-regions-bed Create a BED file which contains all regions used by + create-regions-bed Create a BED file which contains all regions used by PyPGx. estimate-phase-beagle - Estimate haplotype phase of observed variants with + Estimate haplotype phase of observed variants with the Beagle program. filter-samples Filter Archive file for specified samples. - import-read-depth Import read depth data for the target gene. - import-variants Import variant (SNV/indel) data for the target gene + import-read-depth Import read depth data for target gene. + import-variants Import SNV/indel data for target gene. plot-bam-copy-number Plot copy number profile from CovFrame[CopyNumber]. plot-bam-read-depth Plot read depth profile with BAM data. - plot-cn-af Plot both copy number profile and allele fraction + plot-cn-af Plot both copy number profile and allele fraction profile in one figure. plot-vcf-allele-fraction Plot allele fraction profile with VCF data. plot-vcf-read-depth Plot read depth profile with VCF data. - predict-alleles Predict candidate star alleles based on observed + predict-alleles Predict candidate star alleles based on observed variants. - predict-cnv Predict CNV for the target gene based on copy number - data. + predict-cnv Predict CNV from copy number data for target gene. prepare-depth-of-coverage - Prepare a depth of coverage file for all target - genes with SV. + Prepare a depth of coverage file for all target + genes with SV from BAM files. print-metadata Print the metadata of specified archive. 
- run-chip-pipeline Run PyPGx's genotyping pipeline for chip data. - run-ngs-pipeline Run PyPGx's genotyping pipeline for NGS data. - test-cnv-caller Test a CNV caller for the target gene. - train-cnv-caller Train a CNV caller for the target gene. + run-chip-pipeline Run genotyping pipeline for chip data. + run-long-read-pipeline + Run genotyping pipeline for long-read sequencing data. + run-ngs-pipeline Run genotyping pipeline for NGS data. + test-cnv-caller Test CNV caller for target gene. + train-cnv-caller Train CNV caller for target gene. optional arguments: -h, --help Show this help message and exit. diff --git a/docs/cli.rst b/docs/cli.rst index 7beb0a68..06c9218e 100644 --- a/docs/cli.rst +++ b/docs/cli.rst @@ -20,50 +20,50 @@ For getting help on the CLI: positional arguments: COMMAND - call-genotypes Call genotypes for the target gene. - call-phenotypes Call phenotypes for the target gene. - combine-results Combine various results for the target gene. + call-genotypes Call genotypes for target gene. + call-phenotypes Call phenotypes for target gene. + combine-results Combine various results for target gene. compare-genotypes Calculate concordance between two genotype results. compute-control-statistics - Compute summary statistics for the control gene from - BAM files. + Compute summary statistics for control gene from BAM + files. compute-copy-number - Compute copy number from read depth for the target - gene. + Compute copy number from read depth for target gene. compute-target-depth - Compute read depth for the target gene from BAM files. + Compute read depth for target gene from BAM files. create-consolidated-vcf Create a consolidated VCF file. - create-regions-bed Create a BED file which contains all regions used by + create-regions-bed Create a BED file which contains all regions used by PyPGx. estimate-phase-beagle - Estimate haplotype phase of observed variants with + Estimate haplotype phase of observed variants with the Beagle program. filter-samples Filter Archive file for specified samples. - import-read-depth Import read depth data for the target gene. - import-variants Import variant (SNV/indel) data for the target gene + import-read-depth Import read depth data for target gene. + import-variants Import SNV/indel data for target gene. plot-bam-copy-number Plot copy number profile from CovFrame[CopyNumber]. plot-bam-read-depth Plot read depth profile with BAM data. - plot-cn-af Plot both copy number profile and allele fraction + plot-cn-af Plot both copy number profile and allele fraction profile in one figure. plot-vcf-allele-fraction Plot allele fraction profile with VCF data. plot-vcf-read-depth Plot read depth profile with VCF data. - predict-alleles Predict candidate star alleles based on observed + predict-alleles Predict candidate star alleles based on observed variants. - predict-cnv Predict CNV for the target gene based on copy number - data. + predict-cnv Predict CNV from copy number data for target gene. prepare-depth-of-coverage - Prepare a depth of coverage file for all target - genes with SV. + Prepare a depth of coverage file for all target + genes with SV from BAM files. print-metadata Print the metadata of specified archive. - run-chip-pipeline Run PyPGx's genotyping pipeline for chip data. - run-ngs-pipeline Run PyPGx's genotyping pipeline for NGS data. - test-cnv-caller Test a CNV caller for the target gene. - train-cnv-caller Train a CNV caller for the target gene. + run-chip-pipeline Run genotyping pipeline for chip data. 
+ run-long-read-pipeline + Run genotyping pipeline for long-read sequencing data. + run-ngs-pipeline Run genotyping pipeline for NGS data. + test-cnv-caller Test CNV caller for target gene. + train-cnv-caller Train CNV caller for target gene. optional arguments: -h, --help Show this help message and exit. @@ -83,16 +83,18 @@ call-genotypes $ pypgx call-genotypes -h usage: pypgx call-genotypes [-h] [--alleles PATH] [--cnv-calls PATH] genotypes - Call genotypes for the target gene. + Call genotypes for target gene. Positional arguments: - genotypes Archive file with the semantic type + genotypes Output archive file with the semantic type SampleTable[Genotypes]. Optional arguments: -h, --help Show this help message and exit. - --alleles PATH Archive file with the semantic type SampleTable[Alleles]. - --cnv-calls PATH Archive file with the semantic type SampleTable[CNVCalls]. + --alleles PATH Input archive file with the semantic type + SampleTable[Alleles]. + --cnv-calls PATH Input archive file with the semantic type + SampleTable[CNVCalls]. call-phenotypes =============== @@ -102,11 +104,13 @@ call-phenotypes $ pypgx call-phenotypes -h usage: pypgx call-phenotypes [-h] genotypes phenotypes - Call phenotypes for the target gene. + Call phenotypes for target gene. Positional arguments: - genotypes Archive file with the semantic type SampleTable[Genotypes]. - phenotypes Archive file with the semantic type SampleTable[Phenotypes]. + genotypes Input archive file with the semantic type + SampleTable[Genotypes]. + phenotypes Output archive file with the semantic type + SampleTable[Phenotypes]. Optional arguments: -h, --help Show this help message and exit. @@ -121,20 +125,21 @@ combine-results [--alleles PATH] [--cnv-calls PATH] results - Combine various results for the target gene. + Combine various results for target gene. Positional arguments: - results Archive file with the semantic type SampleTable[Results]. + results Output archive file with the semantic type + SampleTable[Results]. Optional arguments: -h, --help Show this help message and exit. - --genotypes PATH Archive file with the semantic type + --genotypes PATH Input archive file with the semantic type SampleTable[Genotypes]. - --phenotypes PATH Archive file with the semantic type + --phenotypes PATH Input archive file with the semantic type SampleTable[Phenotypes]. - --alleles PATH Archive file with the semantic type + --alleles PATH Input archive file with the semantic type SampleTable[Alleles]. - --cnv-calls PATH Archive file with the semantic type + --cnv-calls PATH Input archive file with the semantic type SampleTable[CNVCalls]. compare-genotypes @@ -151,14 +156,14 @@ compare-genotypes concordance for genotype calls as well as CNV calls. Positional arguments: - first First archive file with the semantic type + first First archive file with the semantic type SampleTable[Results]. - second Second archive file with the semantic type + second Second archive file with the semantic type SampleTable[Results]. Optional arguments: -h, --help Show this help message and exit. - --verbose Whether to print the verbose version of output, including + --verbose Whether to print the verbose version of output, including discordant calls. compute-control-statistics @@ -167,48 +172,45 @@ compute-control-statistics .. 
code-block:: text $ pypgx compute-control-statistics -h - usage: pypgx compute-control-statistics [-h] [--bam PATH [PATH ...]] - [--fn PATH] [--gene TEXT] - [--region TEXT] [--assembly TEXT] - [--bed PATH] - control-statistics + usage: pypgx compute-control-statistics [-h] [--assembly TEXT] [--bed PATH] + gene control-statistics bams + [bams ...] - Compute summary statistics for the control gene from BAM files. + Compute summary statistics for control gene from BAM files. + + Note that for the arguments gene and --bed, the 'chr' prefix in contig names + (e.g. 'chr1' vs. '1') will be automatically added or removed as necessary to + match the input BAM's contig names. Positional arguments: - control-statistics Archive file with the semantic type - SampleTable[Statistics]. + gene Control gene (recommended choices: 'EGFR', 'RYR1', + 'VDR'). Alternatively, you can provide a custom region + (format: chrom:start-end). + control-statistics Output archive file with the semantic type + SampleTable[Statistics]. + bams One or more input BAM files. Alternatively, you can + provide a text file (.txt, .tsv, .csv, or .list) + containing one BAM file per line. Optional arguments: - -h, --help Show this help message and exit. - --bam PATH [PATH ...] - One or more BAM files. Cannot be used with --fn. - --fn PATH File containing one BAM file per line. Cannot be - used with --bam. - --gene TEXT Control gene (recommended choices: 'EGFR', 'RYR1', - 'VDR'). Cannot be used with --region. - --region TEXT Custom region to use as control gene - ('chrom:start-end'). Cannot be used with --gene. - --assembly TEXT Reference genome assembly (default: 'GRCh37') - (choices: 'GRCh37', 'GRCh38'). - --bed PATH By default, the input data is assumed to be WGS. If - it's targeted sequencing, you must provide a BED file - to indicate probed regions. Note that the 'chr' - prefix in BED contig names (e.g. 'chr1' vs. '1') will - be automatically added or removed as necessary to - match the BAM contig names. - - [Example] To compute summary statistics for the VDR gene from WGS data: + -h, --help Show this help message and exit. + --assembly TEXT Reference genome assembly (default: 'GRCh37') + (choices: 'GRCh37', 'GRCh38'). + --bed PATH By default, the input data is assumed to be WGS. If + it's targeted sequencing, you must provide a BED file + to indicate probed regions. + + [Example] For the VDR gene from WGS data: $ pypgx compute-control-statistics \ - control-statistcs-VDR.zip \ - --gene VDR \ - --bam A.bam B.bam + VDR \ + control-statistcs.zip \ + 1.bam 2.bam [Example] For a custom region from targeted sequencing data: $ pypgx compute-control-statistics \ - control-statistcs-VDR.zip \ - --gene chr1:100-200 \ - --fn bam.list \ + chr1:100-200 \ + control-statistcs.zip \ + bam.list \ --bed probes.bed compute-copy-number @@ -218,9 +220,9 @@ compute-copy-number $ pypgx compute-copy-number -h usage: pypgx compute-copy-number [-h] [--samples-without-sv TEXT [TEXT ...]] - read-depth control-statistcs output + read-depth control-statistcs copy-number - Compute copy number from read depth for the target gene. + Compute copy number from read depth for target gene. The command will convert read depth to copy number by performing intra-sample normalization using summary statistics from the control gene. @@ -231,11 +233,11 @@ compute-copy-number without SV using --samples-without-sv. Positional arguments: - read-depth Archive file with the semantic type + read-depth Input archive file with the semantic type CovFrame[ReadDepth]. 
- control-statistcs Archive file with the semantic type + control-statistcs Input archive file with the semantic type SampleTable[Statistics]. - output Archive file with the semantic type + copy-number Output archive file with the semantic type CovFrame[CopyNumber]. Optional arguments: @@ -249,40 +251,38 @@ compute-target-depth .. code-block:: text $ pypgx compute-target-depth -h - usage: pypgx compute-target-depth [-h] [--bam PATH [PATH ...]] [--fn PATH] - [--assembly TEXT] [--bed PATH] - gene output + usage: pypgx compute-target-depth [-h] [--assembly TEXT] [--bed PATH] + gene read-depth bams [bams ...] - Compute read depth for the target gene from BAM files. + Compute read depth for target gene from BAM files. Positional arguments: - gene Target gene. - output Archive file with the semantic type - CovFrame[ReadDepth]. + gene Target gene. + read-depth Output archive file with the semantic type + CovFrame[ReadDepth]. + bams One or more input BAM files. Alternatively, you can + provide a text file (.txt, .tsv, .csv, or .list) + containing one BAM file per line. Optional arguments: - -h, --help Show this help message and exit. - --bam PATH [PATH ...] - One or more BAM files. Cannot be used with --fn. - --fn PATH File containing one BAM file per line. Cannot be - used with --bam. - --assembly TEXT Reference genome assembly (default: 'GRCh37') - (choices: 'GRCh37', 'GRCh38'). - --bed PATH By default, the input data is assumed to be WGS. If it - is targeted sequencing, you must provide a BED file to - indicate probed regions. + -h, --help Show this help message and exit. + --assembly TEXT Reference genome assembly (default: 'GRCh37') + (choices: 'GRCh37', 'GRCh38'). + --bed PATH By default, the input data is assumed to be WGS. If it + is targeted sequencing, you must provide a BED file to + indicate probed regions. [Example] For the CYP2D6 gene from WGS data: $ pypgx compute-target-depth \ CYP2D6 \ read-depth.zip \ - --bam A.bam B.bam + 1.bam 2.bam [Example] For the CYP2D6 gene from targeted sequencing data: $ pypgx compute-target-depth \ CYP2D6 \ read-depth.zip \ - --fn bam.txt \ + bam.list \ --bed probes.bed create-consolidated-vcf @@ -298,12 +298,12 @@ create-consolidated-vcf Create a consolidated VCF file. Positional arguments: - imported-variants Archive file with the semantic type + imported-variants Input archive file with the semantic type VcfFrame[Imported]. - phased-variants Archive file with the semantic type + phased-variants Input archive file with the semantic type VcfFrame[Phased]. consolidated-variants - Archive file with the semantic type + Output archive file with the semantic type VcfFrame[Consolidated]. Optional arguments: @@ -322,10 +322,10 @@ create-regions-bed Optional arguments: -h, --help Show this help message and exit. - --assembly TEXT Reference genome assembly (default: 'GRCh37') + --assembly TEXT Reference genome assembly (default: 'GRCh37') (choices: 'GRCh37', 'GRCh38'). --add-chr-prefix Whether to add the 'chr' string in contig names. - --merge Whether to merge overlapping intervals (gene names will + --merge Whether to merge overlapping intervals (gene names will be removed too). --sv-genes Whether to only return genes with SV. @@ -340,19 +340,20 @@ estimate-phase-beagle Estimate haplotype phase of observed variants with the Beagle program. - The 'chr' prefix in contig names (e.g. 'chr1' vs. '1') in the input VCF will - be automatically added or removed as necessary to match that of the reference - VCF. 
- Positional arguments: - imported-variants Archive file with the semantic type VcfFrame[Imported]. - phased-variants Archive file with the semantic type VcfFrame[Phased]. + imported-variants Input archive file with the semantic type + VcfFrame[Imported]. The 'chr' prefix in contig names + (e.g. 'chr1' vs. '1') will be automatically added or + removed as necessary to match the reference VCF's contig + names. + phased-variants Output archive file with the semantic type + VcfFrame[Phased]. Optional arguments: -h, --help Show this help message and exit. - --panel PATH VCF file corresponding to a reference haplotype panel - (compressed or uncompressed). By default, the 1KGP panel - in the ~/pypgx-bundle directory will be used. + --panel PATH VCF file (compressed or uncompressed) corresponding to a + reference haplotype panel. By default, the 1KGP panel in + the ~/pypgx-bundle directory will be used. --impute Perform imputation of missing genotypes. filter-samples @@ -369,9 +370,9 @@ filter-samples Positional arguments: input Input archive file. output Output archive file. - samples Specify which samples should be included for analysis - by providing a text file (.txt, .tsv, .csv, or .list) - containing one sample per line. Alternatively, you can + samples Specify which samples should be included for analysis + by providing a text file (.txt, .tsv, .csv, or .list) + containing one sample per line. Alternatively, you can provide a list of samples. Optional arguments: @@ -387,20 +388,21 @@ import-read-depth usage: pypgx import-read-depth [-h] [--samples TEXT [TEXT ...]] [--exclude] gene depth-of-coverage read-depth - Import read depth data for the target gene. + Import read depth data for target gene. Positional arguments: gene Target gene. - depth-of-coverage Archive file with the semantic type + depth-of-coverage Input archive file with the semantic type CovFrame[DepthOfCoverage]. - read-depth Archive file with the semantic type CovFrame[ReadDepth]. + read-depth Output archive file with the semantic type + CovFrame[ReadDepth]. Optional arguments: -h, --help Show this help message and exit. --samples TEXT [TEXT ...] - Specify which samples should be included for analysis - by providing a text file (.txt, .tsv, .csv, or .list) - containing one sample per line. Alternatively, you can + Specify which samples should be included for analysis + by providing a text file (.txt, .tsv, .csv, or .list) + containing one sample per line. Alternatively, you can provide a list of samples. --exclude Exclude specified samples. @@ -414,31 +416,39 @@ import-variants [--samples TEXT [TEXT ...]] [--exclude] gene vcf imported-variants - Import variant (SNV/indel) data for the target gene. + Import SNV/indel data for target gene. - The command will first slice input VCF for the target gene and then assess - whether every genotype call in the sliced VCF is haplotype phased. It will - return an archive file with the semantic type VcfFrame[Consolidated] if the - VCF is fully phased or otherwise VcfFrame[Imported]. + The command will slice the input VCF for the target gene to create an archive + file with the semantic type VcfFrame[Imported] or VcfFrame[Consolidated]. Positional arguments: gene Target gene. - vcf Input VCF file must be already BGZF compressed (.gz) and - indexed (.tbi) to allow random access. - imported-variants Archive file with the semantic type VcfFrame[Imported] - or VcfFrame[Consolidated]. + vcf Input VCF file must be already BGZF compressed (.gz) + and indexed (.tbi) to allow random access. 
+ imported-variants Output archive file with the semantic type + VcfFrame[Imported] or VcfFrame[Consolidated]. Optional arguments: -h, --help Show this help message and exit. - --assembly TEXT Reference genome assembly (default: 'GRCh37') (choices: - 'GRCh37', 'GRCh38'). - --platform TEXT Genotyping platform (default: 'WGS') (choices: 'WGS', - 'Targeted', 'Chip'). + --assembly TEXT Reference genome assembly (default: 'GRCh37') + (choices: 'GRCh37', 'GRCh38'). + --platform TEXT Genotyping platform used (default: 'WGS') (choices: + 'WGS', 'Targeted', 'Chip', 'LongRead'). When the + platform is 'WGS', 'Targeted', or 'Chip', the command + will assess whether every genotype call in the sliced + VCF is haplotype phased (e.g. '0|1'). If the sliced + VCF is fully phased, the command will return + VcfFrame[Consolidated] or otherwise + VcfFrame[Imported]. When the platform is 'LongRead', + the command will return VcfFrame[Consolidated] after + applying the phase-extension algorithm to estimate + haplotype phase of any variants that could not be + resolved by read-backed phasing. --samples TEXT [TEXT ...] - Specify which samples should be included for analysis - by providing a text file (.txt, .tsv, .csv, or .list) - containing one sample per line. Alternatively, you can - provide a list of samples. + Specify which samples should be included for analysis + by providing a text file (.txt, .tsv, .csv, or .list) + containing one sample per line. Alternatively, you + can provide a list of samples. --exclude Exclude specified samples. plot-bam-copy-number @@ -455,7 +465,7 @@ plot-bam-copy-number Plot copy number profile from CovFrame[CopyNumber]. Positional arguments: - copy-number Archive file with the semantic type + copy-number Input archive file with the semantic type CovFrame[CopyNumber]. Optional arguments: @@ -463,9 +473,9 @@ plot-bam-copy-number --fitted Show the fitted line as well. --path PATH Create plots in this directory. --samples TEXT [TEXT ...] - Specify which samples should be included for analysis - by providing a text file (.txt, .tsv, .csv, or .list) - containing one sample per line. Alternatively, you can + Specify which samples should be included for analysis + by providing a text file (.txt, .tsv, .csv, or .list) + containing one sample per line. Alternatively, you can provide a list of samples. --ymin FLOAT Y-axis bottom (default: -0.3). --ymax FLOAT Y-axis top (default: 6.3). @@ -480,21 +490,21 @@ plot-bam-read-depth usage: pypgx plot-bam-read-depth [-h] [--path PATH] [--samples TEXT [TEXT ...]] [--ymin FLOAT] [--ymax FLOAT] [--fontsize FLOAT] - read_depth + read-depth Plot read depth profile with BAM data. Positional arguments: - read_depth Archive file with the semantic type + read-depth Input archive file with the semantic type CovFrame[ReadDepth]. Optional arguments: -h, --help Show this help message and exit. --path PATH Create plots in this directory. --samples TEXT [TEXT ...] - Specify which samples should be included for analysis - by providing a text file (.txt, .tsv, .csv, or .list) - containing one sample per line. Alternatively, you can + Specify which samples should be included for analysis + by providing a text file (.txt, .tsv, .csv, or .list) + containing one sample per line. Alternatively, you can provide a list of samples. --ymin FLOAT Y-axis bottom. --ymax FLOAT Y-axis top. @@ -513,18 +523,18 @@ plot-cn-af Plot both copy number profile and allele fraction profile in one figure. 
Positional arguments: - copy-number Archive file with the semantic type + copy-number Input archive file with the semantic type CovFrame[CopyNumber]. - imported-variants Archive file with the semantic type + imported-variants Input archive file with the semantic type VcfFrame[Imported]. Optional arguments: -h, --help Show this help message and exit. --path PATH Create plots in this directory. --samples TEXT [TEXT ...] - Specify which samples should be included for analysis - by providing a text file (.txt, .tsv, .csv, or .list) - containing one sample per line. Alternatively, you can + Specify which samples should be included for analysis + by providing a text file (.txt, .tsv, .csv, or .list) + containing one sample per line. Alternatively, you can provide a list of samples. --ymin FLOAT Y-axis bottom (default: -0.3). --ymax FLOAT Y-axis top (default: 6.3). @@ -544,16 +554,16 @@ plot-vcf-allele-fraction Plot allele fraction profile from VcfFrame[Imported]. Positional arguments: - imported-variants Archive file with the semantic type + imported-variants Input archive file with the semantic type VcfFrame[Imported]. Optional arguments: -h, --help Show this help message and exit. --path PATH Create plots in this directory. --samples TEXT [TEXT ...] - Specify which samples should be included for analysis - by providing a text file (.txt, .tsv, .csv, or .list) - containing one sample per line. Alternatively, you can + Specify which samples should be included for analysis + by providing a text file (.txt, .tsv, .csv, or .list) + containing one sample per line. Alternatively, you can provide a list of samples. --fontsize FLOAT Text fontsize (default: 25). @@ -572,17 +582,17 @@ plot-vcf-read-depth Positional arguments: gene Target gene. - vcf VCF file. + vcf Input VCF file. Optional arguments: -h, --help Show this help message and exit. - --assembly TEXT Reference genome assembly (default: 'GRCh37') + --assembly TEXT Reference genome assembly (default: 'GRCh37') (choices: 'GRCh37', 'GRCh38'). --path PATH Create plots in this directory. --samples TEXT [TEXT ...] - Specify which samples should be included for analysis - by providing a text file (.txt, .tsv, .csv, or .list) - containing one sample per line. Alternatively, you can + Specify which samples should be included for analysis + by providing a text file (.txt, .tsv, .csv, or .list) + containing one sample per line. Alternatively, you can provide a list of samples. --ymin FLOAT Y-axis bottom. --ymax FLOAT Y-axis top. @@ -599,9 +609,9 @@ predict-alleles Positional arguments: consolidated-variants - Archive file with the semantic type + Input archive file with the semantic type VcfFrame[Consolidated]. - alleles Archive file with the semantic type + alleles Output archive file with the semantic type SampleTable[Alleles]. Optional arguments: @@ -615,20 +625,21 @@ predict-cnv $ pypgx predict-cnv -h usage: pypgx predict-cnv [-h] [--cnv-caller PATH] copy-number cnv-calls - Predict CNV for the target gene based on copy number data. + Predict CNV from copy number data for target gene. Genomic positions that are missing copy number because, for example, the input data is targeted sequencing will be imputed with forward filling. Positional arguments: - copy-number Archive file with the semantic type CovFrame[CopyNumber]. - cnv-calls Archive file with the semantic type + copy-number Input archive file with the semantic type + CovFrame[CopyNumber]. + cnv-calls Output archive file with the semantic type SampleTable[CNVCalls]. 
Optional arguments: -h, --help Show this help message and exit. - --cnv-caller PATH Archive file with the semantic type Model[CNV]. By - default, a pre-trained CNV caller in the ~/pypgx-bundle + --cnv-caller PATH Archive file with the semantic type Model[CNV]. By + default, a pre-trained CNV caller in the ~/pypgx-bundle directory will be used. prepare-depth-of-coverage @@ -637,41 +648,38 @@ prepare-depth-of-coverage .. code-block:: text $ pypgx prepare-depth-of-coverage -h - usage: pypgx prepare-depth-of-coverage [-h] [--bam PATH [PATH ...]] - [--fn PATH] [--assembly TEXT] - [--bed PATH] - depth-of-coverage + usage: pypgx prepare-depth-of-coverage [-h] [--assembly TEXT] [--bed PATH] + depth-of-coverage bams [bams ...] - Prepare a depth of coverage file for all target genes with SV. + Prepare a depth of coverage file for all target genes with SV from BAM files. Positional arguments: - depth-of-coverage Archive file with the semantic type - CovFrame[DepthOfCoverage]. + depth-of-coverage Output archive file with the semantic type + CovFrame[DepthOfCoverage]. + bams One or more input BAM files. Alternatively, you can + provide a text file (.txt, .tsv, .csv, or .list) + containing one BAM file per line. Optional arguments: - -h, --help Show this help message and exit. - --bam PATH [PATH ...] - One or more BAM files. Cannot be used with --fn. - --fn PATH File containing one BAM file per line. Cannot be used - with --bam. - --assembly TEXT Reference genome assembly (default: 'GRCh37') - (choices: 'GRCh37', 'GRCh38'). - --bed PATH By default, the input data is assumed to be WGS. If - it's targeted sequencing, you must provide a BED file - to indicate probed regions. Note that the 'chr' - prefix in BED contig names (e.g. 'chr1' vs. '1') will - be automatically added or removed as necessary to - match the BAM contig names. - - [Example] When the input data is WGS: + -h, --help Show this help message and exit. + --assembly TEXT Reference genome assembly (default: 'GRCh37') + (choices: 'GRCh37', 'GRCh38'). + --bed PATH By default, the input data is assumed to be WGS. If + it's targeted sequencing, you must provide a BED file + to indicate probed regions. Note that the 'chr' prefix + in contig names (e.g. 'chr1' vs. '1') will be + automatically added or removed as necessary to match + the input BAM's contig names. + + [Example] From WGS data: $ pypgx prepare-depth-of-coverage \ depth-of-coverage.zip \ - --bam A.bam B.bam + 1.bam 2.bam - [Example] When the input data is targeted sequencing: + [Example] From targeted sequencing data: $ pypgx prepare-depth-of-coverage \ depth-of-coverage.zip \ - --fn bam.txt \ + bam.list \ --bed probes.bed print-metadata @@ -685,7 +693,7 @@ print-metadata Print the metadata of specified archive. Positional arguments: - input Archive file. + input Input archive file. Optional arguments: -h, --help Show this help message and exit. @@ -701,30 +709,31 @@ run-chip-pipeline [--samples TEXT [TEXT ...]] [--exclude] gene output variants - Run PyPGx's genotyping pipeline for chip data. + Run genotyping pipeline for chip data. Positional arguments: gene Target gene. output Output directory. - variants Input VCF file must be already BGZF compressed (.gz) - and indexed (.tbi) to allow random access. Statistical - haplotype phasing will be skipped if input VCF is - already fully phased. + variants Input VCF file must be already BGZF compressed (.gz) + and indexed (.tbi) to allow random access. + Statistical haplotype phasing will be skipped if + input VCF is already fully phased. 
Optional arguments: -h, --help Show this help message and exit. - --assembly TEXT Reference genome assembly (default: 'GRCh37') (choices: - 'GRCh37', 'GRCh38'). - --panel PATH VCF file corresponding to a reference haplotype panel - (compressed or uncompressed). By default, the 1KGP panel - in the ~/pypgx-bundle directory will be used. + --assembly TEXT + Reference genome assembly (default: 'GRCh37') + (choices: 'GRCh37', 'GRCh38'). + --panel PATH VCF file corresponding to a reference haplotype panel + (compressed or uncompressed). By default, the 1KGP + panel in the ~/pypgx-bundle directory will be used. --impute Perform imputation of missing genotypes. --force Overwrite output directory if it already exists. --samples TEXT [TEXT ...] - Specify which samples should be included for analysis - by providing a text file (.txt, .tsv, .csv, or .list) - containing one sample per line. Alternatively, you can - provide a list of samples. + Specify which samples should be included for analysis + by providing a text file (.txt, .tsv, .csv, or .list) + containing one sample per line. Alternatively, you + can provide a list of samples. --exclude Exclude specified samples. [Example] To genotype the CYP3A5 gene from chip data: @@ -733,6 +742,42 @@ run-chip-pipeline CYP3A5-pipeline \ variants.vcf.gz +run-long-read-pipeline +====================== + +.. code-block:: text + + $ pypgx run-long-read-pipeline -h + usage: pypgx run-long-read-pipeline [-h] [--assembly TEXT] [--force] + [--samples TEXT [TEXT ...]] [--exclude] + gene output variants + + Run genotyping pipeline for long-read sequencing data. + + Positional arguments: + gene Target gene. + output Output directory. + variants Input VCF file must be already BGZF compressed (.gz) + and indexed (.tbi) to allow random access. + + Optional arguments: + -h, --help Show this help message and exit. + --assembly TEXT Reference genome assembly (default: 'GRCh37') + (choices: 'GRCh37', 'GRCh38'). + --force Overwrite output directory if it already exists. + --samples TEXT [TEXT ...] + Specify which samples should be included for analysis + by providing a text file (.txt, .tsv, .csv, or .list) + containing one sample per line. Alternatively, you + can provide a list of samples. + --exclude Exclude specified samples. + + [Example] To genotype the CYP3A5 gene from long-read sequencing data: + $ pypgx run-long-read-pipeline \ + CYP3A5 \ + CYP3A5-pipeline \ + variants.vcf.gz + run-ngs-pipeline ================ @@ -750,7 +795,7 @@ run-ngs-pipeline [--cnv-caller PATH] gene output - Run PyPGx's genotyping pipeline for NGS data. + Run genotyping pipeline for NGS data. During copy number analysis, if the input data is targeted sequencing, the command will apply inter-sample normalization using summary statistics across @@ -763,28 +808,28 @@ run-ngs-pipeline Optional arguments: -h, --help Show this help message and exit. - --variants PATH Input VCF file must be already BGZF compressed (.gz) - and indexed (.tbi) to allow random access. - Statistical haplotype phasing will be skipped if + --variants PATH Input VCF file must be already BGZF compressed (.gz) + and indexed (.tbi) to allow random access. + Statistical haplotype phasing will be skipped if input VCF is already fully phased. --depth-of-coverage PATH - Archive file with the semantic type + Archive file with the semantic type CovFrame[DepthOfCoverage]. --control-statistics PATH - Archive file with the semantic type + Archive file with the semantic type SampleTable[Statistcs]. 
- --platform TEXT Genotyping platform (default: 'WGS') (choices: 'WGS', + --platform TEXT Genotyping platform (default: 'WGS') (choices: 'WGS', 'Targeted') - --assembly TEXT Reference genome assembly (default: 'GRCh37') + --assembly TEXT Reference genome assembly (default: 'GRCh37') (choices: 'GRCh37', 'GRCh38'). - --panel PATH VCF file corresponding to a reference haplotype panel - (compressed or uncompressed). By default, the 1KGP panel + --panel PATH VCF file corresponding to a reference haplotype panel + (compressed or uncompressed). By default, the 1KGP panel in the ~/pypgx-bundle directory will be used. --force Overwrite output directory if it already exists. --samples TEXT [TEXT ...] - Specify which samples should be included for analysis - by providing a text file (.txt, .tsv, .csv, or .list) - containing one sample per line. Alternatively, you + Specify which samples should be included for analysis + by providing a text file (.txt, .tsv, .csv, or .list) + containing one sample per line. Alternatively, you can provide a list of samples. --exclude Exclude specified samples. --samples-without-sv TEXT [TEXT ...] @@ -794,7 +839,7 @@ run-ngs-pipeline --do-not-plot-allele-fraction Do not plot allele fraction profile. --cnv-caller PATH Archive file with the semantic type Model[CNV]. By - default, a pre-trained CNV caller in the ~/pypgx-bundle + default, a pre-trained CNV caller in the ~/pypgx-bundle directory will be used. [Example] To genotype the CYP3A5 gene, which does not have SV, from WGS data: @@ -829,19 +874,21 @@ test-cnv-caller usage: pypgx test-cnv-caller [-h] [--confusion-matrix PATH] cnv-caller copy-number cnv-calls - Test a CNV caller for the target gene. + Test CNV caller for target gene. Positional arguments: - cnv-caller Archive file with the semantic type Model[CNV]. - copy-number Archive file with the semantic type + cnv-caller Input archive file with the semantic type Model[CNV]. + copy-number Input archive file with the semantic type CovFrame[CopyNumber]. - cnv-calls Archive file with the semantic type + cnv-calls Input archive file with the semantic type SampleTable[CNVCalls]. Optional arguments: -h, --help Show this help message and exit. --confusion-matrix PATH - Write the confusion matrix as a CSV file. + Write the confusion matrix as a CSV file where rows + indicate actual class and columns indicate prediction + class. train-cnv-caller ================ @@ -852,20 +899,22 @@ train-cnv-caller usage: pypgx train-cnv-caller [-h] [--confusion-matrix PATH] copy-number cnv-calls cnv-caller - Train a CNV caller for the target gene. + Train CNV caller for target gene. This command will return a SVM-based multiclass classifier that makes CNV calls using the one-vs-rest strategy. Positional arguments: - copy-number Archive file with the semantic type + copy-number Input archive file with the semantic type CovFrame[CopyNumber]. - cnv-calls Archive file with the semantic type + cnv-calls Input archive file with the semantic type SampleTable[CNVCalls]. - cnv-caller Archive file with the semantic type Model[CNV]. + cnv-caller Output archive file with the semantic type Model[CNV]. Optional arguments: -h, --help Show this help message and exit. --confusion-matrix PATH - Write the confusion matrix as a CSV file. + Write the confusion matrix as a CSV file where rows + indicate actual class and columns indicate prediction + class. 
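For genes with SV, the :command:`run-ngs-pipeline` help above lists the extra inputs (``CovFrame[DepthOfCoverage]`` and ``SampleTable[Statistics]`` archives on top of the VCF). Below is a hedged sketch of the equivalent ``pypgx.pipeline.run_ngs_pipeline`` call; the keyword names are assumed to mirror the CLI flags (``--variants``, ``--depth-of-coverage``, ``--control-statistics``) and are not verified against the API reference.

.. code:: python3

    from pypgx.pipeline import run_ngs_pipeline

    # Sketch only: keyword names below are assumed from the CLI flags.
    # CYP2D6 is a gene with SV, so copy number inputs are provided in
    # addition to the VCF containing SNVs/indels.
    run_ngs_pipeline(
        'CYP2D6',                                     # target gene
        'CYP2D6-pipeline',                            # output directory
        variants='variants.vcf.gz',                   # BGZF compressed and indexed
        depth_of_coverage='depth-of-coverage.zip',    # CovFrame[DepthOfCoverage]
        control_statistics='control-statistics.zip',  # SampleTable[Statistics]
        assembly='GRCh37',                            # GRCh37 is the default
    )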
diff --git a/docs/create.py b/docs/create.py index fc0d6856..97c095a8 100644 --- a/docs/create.py +++ b/docs/create.py @@ -60,6 +60,9 @@ (CLI) and application programming interface (API) whose documentations are available at the `Read the Docs `_. +PyPGx can be used to predict PGx genotypes and phenotypes using various +genomic data, including data from next-generation sequencing (NGS), single +nucleotide polymorphism (SNP) array, and long-read sequencing. Importantly, PyPGx is compatible with both of the Genome Reference Consortium Human (GRCh) builds, GRCh37 (hg19) and GRCh38 (hg38). @@ -199,7 +202,7 @@ .. code-block:: text $ cd ~ - $ git clone --branch 0.12.0 --depth 1 https://github.com/sbslee/pypgx-bundle + $ git clone --branch 0.13.0 --depth 1 https://github.com/sbslee/pypgx-bundle This is undoubtedly annoying, but absolutely necessary for portability reasons because PyPGx has been growing exponentially in file size due to the @@ -216,35 +219,43 @@ `Genes `__ page to see the list of genes with SV. -Some of the SV events can be quite challenging to detect accurately with -next-generation sequencing (NGS) data due to misalignment of sequence reads -caused by sequence homology with other gene family members (e.g. CYP2D6 and -CYP2D7). PyPGx attempts to address this issue by training a `support vector -machine (SVM) `__-based multiclass classifier using the `one-vs-rest -strategy `__ for each gene for each GRCh build. Each -classifier is trained using copy number profiles of real NGS samples as well -as simulated ones. +Some of the SV events can be quite challenging to detect accurately with NGS +data due to misalignment of sequence reads caused by sequence homology with +other gene family members (e.g. CYP2D6 and CYP2D7). PyPGx attempts to address +this issue by training a `support vector machine (SVM) `__-based multiclass +classifier using the `one-vs-rest strategy `__ for each +gene for each GRCh build. Each classifier is trained using copy number +profiles of real NGS samples as well as simulated ones. You can plot copy number profile and allele fraction profile with PyPGx to visually inspect SV calls. Below are CYP2D6 examples: .. list-table:: :header-rows: 1 - :widths: 20 80 + :widths: 10 30 60 * - SV Name + - Gene Model - Profile * - Normal + - .. image:: https://raw.githubusercontent.com/sbslee/pypgx-data/main/dpsv/gene-model-CYP2D6-1.png - .. image:: https://raw.githubusercontent.com/sbslee/pypgx-data/main/dpsv/GRCh37-CYP2D6-8.png * - DeletionHet + - .. image:: https://raw.githubusercontent.com/sbslee/pypgx-data/main/dpsv/gene-model-CYP2D6-2.png - .. image:: https://raw.githubusercontent.com/sbslee/pypgx-data/main/dpsv/GRCh37-CYP2D6-1.png + * - DeletionHom + - .. image:: https://raw.githubusercontent.com/sbslee/pypgx-data/main/dpsv/gene-model-CYP2D6-3.png + - .. image:: https://raw.githubusercontent.com/sbslee/pypgx-data/main/dpsv/GRCh37-CYP2D6-6.png * - Duplication + - .. image:: https://raw.githubusercontent.com/sbslee/pypgx-data/main/dpsv/gene-model-CYP2D6-4.png - .. image:: https://raw.githubusercontent.com/sbslee/pypgx-data/main/dpsv/GRCh37-CYP2D6-2.png * - Tandem3 + - .. image:: https://raw.githubusercontent.com/sbslee/pypgx-data/main/dpsv/gene-model-CYP2D6-11.png - .. image:: https://raw.githubusercontent.com/sbslee/pypgx-data/main/dpsv/GRCh37-CYP2D6-9.png * - Tandem2C + - .. image:: https://raw.githubusercontent.com/sbslee/pypgx-data/main/dpsv/gene-model-CYP2D6-10.png - .. image:: https://raw.githubusercontent.com/sbslee/pypgx-data/main/dpsv/GRCh37-CYP2D6-7.png GRCh37 vs. 
GRCh38 @@ -256,10 +267,10 @@ vice versa, but deep down you know it's going to be a mess (and please don't do this). The good news is, PyPGx supports both of the builds! -In many of the PyPGx actions, you can simply indicate which human genome -build to use. For example, you can use ``assembly`` for the API and -``--assembly`` for the CLI. **Note that GRCh37 will always be the default.** -Below is an example of using the API: +In many PyPGx actions, you can simply indicate which genome build to use. For +example, for GRCh38 data you can use ``--assembly GRCh38`` in CLI and +``assembly='GRCh38'`` in API. **Note that GRCh37 will always be the +default.** Below is an example of using the API: .. code:: python3 @@ -327,7 +338,7 @@ - ``CYP2D6``, ``GSTT1`` * - ``Platform`` - Genotyping platform. - - ``WGS``, ``Targeted``, ``Chip`` + - ``WGS``, ``Targeted``, ``Chip``, ``LongRead`` * - ``Program`` - Name of the phasing program. - ``Beagle``, ``SHAPEIT`` @@ -438,16 +449,69 @@ Pipelines ========= -PyPGx provides two pipelines for performing PGx genotype analysis: NGS pipeline and chip pipeline. +PyPGx currently provides three pipelines for performing PGx genotype analysis +of a single gene for one or multiple samples: NGS pipeline, chip pipeline, and +long-read pipeline. In addition to genotyping, each pipeline will perform +phenotype prediction based on genotype results. All pipelines are compatible +with both GRCh37 and GRCh38 (e.g. for GRCh38 use ``--assembly GRCh38`` in CLI +and ``assembly='GRCh38'`` in API). -**NGS pipeline** +NGS pipeline +------------ .. image:: https://raw.githubusercontent.com/sbslee/pypgx-data/main/flowchart-ngs-pipeline.png -**Chip pipeline** +Implemented as ``pypgx run-ngs-pipeline`` in CLI and +``pypgx.pipeline.run_ngs_pipeline`` in API, this pipeline is designed for +processing short-read data (e.g. Illumina). Users must specify whether the +input data is from whole genome sequencing (WGS) or targeted sequencing +(custom targeted panel sequencing or whole exome sequencing). + +This pipeline supports SV detection based on copy number analysis for genes +that are known to have SV. Therefore, if the target gene is associated with +SV (e.g. CYP2D6), it's strongly recommended to provide a +``CovFrame[DepthOfCoverage]`` file and a ``SampleTable[Statistics]`` file in +addition to a VCF file containing SNVs/indels. If the target gene is not +associated with SV (e.g. CYP3A5), providing a VCF file alone is enough. You can +visit the `Genes `__ page +to see the full list of genes with SV. For details on the SV detection algorithm, +please see the `Structural variation detection `__ section. + +Chip pipeline +------------- .. image:: https://raw.githubusercontent.com/sbslee/pypgx-data/main/flowchart-chip-pipeline.png +Implemented as ``pypgx run-chip-pipeline`` in CLI and +``pypgx.pipeline.run_chip_pipeline`` in API, this pipeline is designed for +DNA chip data (e.g. Global Screening Array from Illumina). It's recommended +to perform variant imputation on the input VCF prior to feeding it to the +pipeline using a large reference haplotype panel (e.g. `TOPMed Imputation +Server `__). +Alternatively, it's possible to perform variant imputation with the 1000 +Genomes Project (1KGP) data as reference within PyPGx using ``--impute`` in +CLI and ``impute=True`` in API. + +The pipeline currently does not support SV detection. Please post a GitHub +issue if you want to contribute your development skills and/or data for +devising an SV detection algorithm. 
+ +Long-read pipeline +------------------ + +.. image:: https://raw.githubusercontent.com/sbslee/pypgx-data/main/flowchart-long-read-pipeline.png + +Implemented as ``pypgx run-long-read-pipeline`` in CLI and +``pypgx.pipeline.run_long_read_pipeline`` in API, this pipeline is designed +for long-read data (e.g. Pacific Biosciences and Oxford Nanopore +Technologies). The input VCF must be phased using a read-backed haplotype +phasing tool such as `WhatsHap `__. + +The pipeline currently does not support SV detection. Please post a GitHub +issue if you want to contribute your development skills and/or data for +devising an SV detection algorithm. + Getting help ============ diff --git a/docs/genes.rst b/docs/genes.rst index 0aca4afc..30489025 100644 --- a/docs/genes.rst +++ b/docs/genes.rst @@ -584,6 +584,7 @@ Below is comprehensive summary of SV described from real NGS studies: - SV Name - Genotype - Reference + - Gene Model - GRCh37 - GRCh38 - Data Type @@ -594,6 +595,7 @@ Below is comprehensive summary of SV described from real NGS studies: - Normal - \*1/\*2 - + - :download:`Model ` - :download:`Profile ` - :download:`Profile ` - WGS @@ -604,6 +606,7 @@ Below is comprehensive summary of SV described from real NGS studies: - Deletion1Het - \*1/\*4 - + - :download:`Model ` - :download:`Profile ` - :download:`Profile ` - WGS @@ -614,6 +617,7 @@ Below is comprehensive summary of SV described from real NGS studies: - Deletion1Hom - \*4/\*4 - + - :download:`Model ` - :download:`Profile ` - :download:`Profile ` - WGS @@ -624,6 +628,7 @@ Below is comprehensive summary of SV described from real NGS studies: - Deletion2Het - \*1/\*4 - + - :download:`Model ` - :download:`Profile ` - :download:`Profile ` - WGS @@ -634,6 +639,7 @@ Below is comprehensive summary of SV described from real NGS studies: - Deletion3Het - \*4/\*9 - + - :download:`Model ` - :download:`Profile ` - :download:`Profile ` - WGS @@ -644,6 +650,7 @@ Below is comprehensive summary of SV described from real NGS studies: - Duplication1 - \*1x2/\*25 - `Lee et al., 2019 `__ + - :download:`Model ` - :download:`Profile ` - :download:`Profile ` - WGS @@ -654,6 +661,7 @@ Below is comprehensive summary of SV described from real NGS studies: - Duplication2 - \*1x2/\*2 - + - :download:`Model ` - :download:`Profile ` - :download:`Profile ` - WGS @@ -661,9 +669,10 @@ Below is comprehensive summary of SV described from real NGS studies: - NA12342 - * - \*1x2 - - Duplication2 + - Duplication3 - \*1x2/\*17 - + - :download:`Model ` - :download:`Profile ` - :download:`Profile ` - WGS @@ -674,6 +683,7 @@ Below is comprehensive summary of SV described from real NGS studies: - Hybrid1 - Indeterminate - + - - :download:`Profile ` - :download:`Profile ` - WGS @@ -684,6 +694,7 @@ Below is comprehensive summary of SV described from real NGS studies: - Hybrid2 - \*1/\*12 - + - :download:`Model ` - :download:`Profile ` - :download:`Profile ` - WGS @@ -694,6 +705,7 @@ Below is comprehensive summary of SV described from real NGS studies: - Hybrid3 - \*1/\*34 - + - :download:`Model ` - :download:`Profile ` - :download:`Profile ` - WGS @@ -704,6 +716,7 @@ Below is comprehensive summary of SV described from real NGS studies: - PseudogeneDuplication - \*1/\*18 - + - :download:`Model ` - :download:`Profile ` - :download:`Profile ` - WGS @@ -752,6 +765,7 @@ Below is comprehensive summary of SV described from real NGS studies: - SV Name - Genotype - Reference + - Gene Model - GRCh37 - GRCh38 - Data Type @@ -762,6 +776,7 @@ Below is comprehensive summary of SV described 
from real NGS studies: - Normal - \*1/\*2 - + - :download:`Model ` - :download:`Profile ` - :download:`Profile ` - WGS @@ -772,16 +787,18 @@ Below is comprehensive summary of SV described from real NGS studies: - Hybrid - \*6/\*29 - + - :download:`Model ` - :download:`Profile ` - :download:`Profile ` - WGS - `GeT-RM `__ - NA19178 - - + - \*29 has exons 1-4 of CYP2B7 origin and exons 5-9 of CYP2A6 origin (breakpoint in intron 4). * - \*22x2 - Duplication - \*6/\*22x2 - + - :download:`Model ` - :download:`Profile ` - :download:`Profile ` - WGS @@ -904,6 +921,7 @@ Below is comprehensive summary of SV described from real NGS studies: - SV Name - Genotype - Reference + - Gene Model - GRCh37 - GRCh38 - Data Type @@ -914,6 +932,7 @@ Below is comprehensive summary of SV described from real NGS studies: - Normal - \*1/\*2 - + - :download:`Model ` - :download:`Profile ` - :download:`Profile ` - WGS @@ -924,6 +943,7 @@ Below is comprehensive summary of SV described from real NGS studies: - DeletionHet - \*5/\*29 - + - :download:`Model ` - :download:`Profile ` - :download:`Profile ` - WGS @@ -934,6 +954,7 @@ Below is comprehensive summary of SV described from real NGS studies: - DeletionHom - \*5/\*5 - + - :download:`Model ` - :download:`Profile ` - :download:`Profile ` - WGS @@ -944,6 +965,7 @@ Below is comprehensive summary of SV described from real NGS studies: - Duplication - \*2/\*4x2 - + - :download:`Model ` - :download:`Profile ` - :download:`Profile ` - WGS @@ -954,6 +976,7 @@ Below is comprehensive summary of SV described from real NGS studies: - Multiplication - \*1x3/\*10 - + - :download:`Model ` - :download:`Profile ` - :download:`Profile ` - WGS @@ -964,6 +987,7 @@ Below is comprehensive summary of SV described from real NGS studies: - Tandem1A - \*139/\*68+\*4 - + - :download:`Model ` - :download:`Profile ` - :download:`Profile ` - WGS @@ -974,6 +998,7 @@ Below is comprehensive summary of SV described from real NGS studies: - Tandem1B - \*68+\*4/\*68+\*4 - + - :download:`Model ` - :download:`Profile ` - :download:`Profile ` - WGS @@ -984,6 +1009,7 @@ Below is comprehensive summary of SV described from real NGS studies: - Tandem2A - \*2/\*36+\*10 - + - :download:`Model ` - :download:`Profile ` - :download:`Profile ` - WGS @@ -994,6 +1020,7 @@ Below is comprehensive summary of SV described from real NGS studies: - Tandem2B - \*1/\*36x2+\*10 - + - :download:`Model ` - :download:`Profile ` - :download:`Profile ` - WGS @@ -1004,6 +1031,7 @@ Below is comprehensive summary of SV described from real NGS studies: - Tandem2C - \*1/\*36x3+\*10 - + - :download:`Model ` - :download:`Profile ` - :download:`Profile ` - WGS @@ -1014,6 +1042,7 @@ Below is comprehensive summary of SV described from real NGS studies: - Tandem3 - \*1/\*13+\*1 - + - :download:`Model ` - :download:`Profile ` - :download:`Profile ` - WGS @@ -1024,6 +1053,7 @@ Below is comprehensive summary of SV described from real NGS studies: - Duplication,Tandem1A - \*2x2/\*68+\*4 - + - :download:`Model ` - :download:`Profile ` - :download:`Profile ` - WGS @@ -1034,6 +1064,7 @@ Below is comprehensive summary of SV described from real NGS studies: - DeletionHet,Tandem1A - \*5/\*68+\*4 - + - :download:`Model ` - :download:`Profile ` - :download:`Profile ` - WGS @@ -1044,12 +1075,24 @@ Below is comprehensive summary of SV described from real NGS studies: - Unknown1 - Indeterminate - + - - :download:`Profile ` - :download:`Profile ` - WGS - `1KGP `__ - NA18555 - + * - + - PseudogeneDeletion + - \*2/\*41 + - + - :download:`Model ` + - 
:download:`Profile ` + - :download:`Profile ` + - WGS + - `1KGP `__ + - NA19316 + - Phenotype summary for CYP2D6 ---------------------------- @@ -1129,6 +1172,7 @@ Below is comprehensive summary of SV described from real NGS studies: - SV Name - Genotype - Reference + - Gene Model - GRCh37 - GRCh38 - Data Type @@ -1139,6 +1183,7 @@ Below is comprehensive summary of SV described from real NGS studies: - Normal - \*1/\*7 - `Lee et al., 2019 `__ + - :download:`Model ` - :download:`Profile ` - :download:`Profile ` - WGS @@ -1146,19 +1191,32 @@ Below is comprehensive summary of SV described from real NGS studies: - NA10831 - * - \*S1 - - PartialDuplication + - PartialDuplicationHet - \*1/\*S1 - `Lee et al., 2019 `__ + - :download:`Model ` - :download:`Profile ` - :download:`Profile ` - WGS - `GeT-RM `__ - NA19920 + - \*S1 is linked to \*7. + * - \*S1 + - PartialDuplicationHom + - \*S1/\*S1 + - + - :download:`Model ` + - :download:`Profile ` + - :download:`Profile ` + - WGS + - `1KGP `__ + - NA19309 - * - \*1x2 - Duplication1 - \*1/\*1x2 - + - :download:`Model ` - :download:`Profile ` - :download:`Profile ` - WGS @@ -1169,6 +1227,7 @@ Below is comprehensive summary of SV described from real NGS studies: - Duplication1 - \*1/\*7x2 - `Lee et al., 2019 `__ + - :download:`Model ` - :download:`Profile ` - :download:`Profile ` - WGS @@ -1179,6 +1238,7 @@ Below is comprehensive summary of SV described from real NGS studies: - Duplication2 - \*1x2/\*7 - + - :download:`Model ` - :download:`Profile ` - :download:`Profile ` - WGS @@ -1189,6 +1249,7 @@ Below is comprehensive summary of SV described from real NGS studies: - Multiplication - \*7/\*7x3 - + - :download:`Model ` - :download:`Profile ` - :download:`Profile ` - WGS @@ -1328,6 +1389,7 @@ Below is comprehensive summary of SV described from real NGS studies: - SV Name - Genotype - Reference + - Gene Model - GRCh37 - GRCh38 - Data Type @@ -1338,6 +1400,7 @@ Below is comprehensive summary of SV described from real NGS studies: - Normal - \*1/\*3 - + - :download:`Model ` - :download:`Profile ` - :download:`Profile ` - WGS @@ -1348,6 +1411,7 @@ Below is comprehensive summary of SV described from real NGS studies: - DeletionHet - \*1/\*DEL - + - :download:`Model ` - :download:`Profile ` - :download:`Profile ` - WGS @@ -1456,6 +1520,7 @@ Below is comprehensive summary of SV described from real NGS studies: - SV Name - Genotype - Reference + - Gene Model - GRCh37 - GRCh38 - Data Type @@ -1466,6 +1531,7 @@ Below is comprehensive summary of SV described from real NGS studies: - Female - \*B/\*B - + - :download:`Model ` - :download:`Profile ` - :download:`Profile ` - WGS @@ -1476,6 +1542,7 @@ Below is comprehensive summary of SV described from real NGS studies: - Male - \*B/\*MALE - + - :download:`Model ` - :download:`Profile ` - :download:`Profile ` - WGS @@ -1501,6 +1568,7 @@ Below is comprehensive summary of SV described from real NGS studies: - SV Name - Genotype - Reference + - Gene Model - GRCh37 - GRCh38 - Data Type @@ -1511,6 +1579,7 @@ Below is comprehensive summary of SV described from real NGS studies: - Normal - \*A/\*B - + - :download:`Model ` - :download:`Profile ` - :download:`Profile ` - WGS @@ -1521,6 +1590,7 @@ Below is comprehensive summary of SV described from real NGS studies: - DeletionHet - \*0/\*A - `Lee et al., 2019 `__ + - :download:`Model ` - :download:`Profile ` - :download:`Profile ` - WGS @@ -1531,6 +1601,7 @@ Below is comprehensive summary of SV described from real NGS studies: - DeletionHom - \*0/\*0 - `Lee et al., 2019 `__ + - 
:download:`Model ` - :download:`Profile ` - :download:`Profile ` - WGS @@ -1541,6 +1612,7 @@ Below is comprehensive summary of SV described from real NGS studies: - Duplication - \*A/\*Ax2 - `Lee et al., 2019 `__ + - :download:`Model ` - :download:`Profile ` - :download:`Profile ` - WGS @@ -1551,6 +1623,7 @@ Below is comprehensive summary of SV described from real NGS studies: - Duplication - \*A/\*Bx2 - + - :download:`Model ` - :download:`Profile ` - :download:`Profile ` - WGS @@ -1561,6 +1634,7 @@ Below is comprehensive summary of SV described from real NGS studies: - UpstreamDeletionHet - \*A/\*B - + - :download:`Model ` - :download:`Profile ` - :download:`Profile ` - WGS @@ -1571,6 +1645,7 @@ Below is comprehensive summary of SV described from real NGS studies: - DeletionHet,UpstreamDeletionHet - \*0/\*A - + - :download:`Model ` - :download:`Profile ` - :download:`Profile ` - WGS @@ -1604,6 +1679,7 @@ Below is comprehensive summary of SV described from real NGS studies: - SV Name - Genotype - Reference + - Gene Model - GRCh37 - GRCh38 - Data Type @@ -1614,6 +1690,7 @@ Below is comprehensive summary of SV described from real NGS studies: - Normal - \*A/\*A - + - :download:`Model ` - :download:`Profile ` - :download:`Profile ` - WGS @@ -1624,6 +1701,7 @@ Below is comprehensive summary of SV described from real NGS studies: - DeletionHet - \*0/\*A - `Lee et al., 2019 `__ + - :download:`Model ` - :download:`Profile ` - :download:`Profile ` - WGS @@ -1634,6 +1712,7 @@ Below is comprehensive summary of SV described from real NGS studies: - DeletionHom - \*0/\*0 - `Lee et al., 2019 `__ + - :download:`Model ` - :download:`Profile ` - :download:`Profile ` - WGS @@ -1750,6 +1829,7 @@ Below is comprehensive summary of SV described from real NGS studies: - SV Name - Genotype - Reference + - Gene Model - GRCh37 - GRCh38 - Data Type @@ -1760,6 +1840,7 @@ Below is comprehensive summary of SV described from real NGS studies: - Normal - \*1/\*3 - + - :download:`Model ` - :download:`Profile ` - :download:`Profile ` - WGS @@ -1770,6 +1851,7 @@ Below is comprehensive summary of SV described from real NGS studies: - Intron9Deletion - \*1/\*S1 - `Lee et al., 2019 `__ + - :download:`Model ` - :download:`Profile ` - :download:`Profile ` - WGS @@ -1780,12 +1862,24 @@ Below is comprehensive summary of SV described from real NGS studies: - Exon11Deletion - \*1/\*S2 - `Lee et al., 2019 `__ + - :download:`Model ` - :download:`Profile ` - :download:`Profile ` - WGS - `GeT-RM `__ - NA19819 - + * - \*S1, \*S2 + - Intron9Deletion,Exon11Deletion + - \*S1/\*S2 + - + - :download:`Model ` + - :download:`Profile ` + - :download:`Profile ` + - WGS + - `1KGP `__ + - NA19030 + - SLCO1B1 ======= @@ -1835,6 +1929,7 @@ Below is comprehensive summary of SV described from real NGS studies: - SV Name - Genotype - Reference + - Gene Model - GRCh37 - GRCh38 - Data Type @@ -1845,6 +1940,7 @@ Below is comprehensive summary of SV described from real NGS studies: - Normal - \*1/\*2 - + - :download:`Model ` - :download:`Profile ` - :download:`Profile ` - WGS @@ -1855,6 +1951,7 @@ Below is comprehensive summary of SV described from real NGS studies: - DeletionHet - \*1/\*DEL - + - :download:`Model ` - :download:`Profile ` - :download:`Profile ` - WGS @@ -1865,6 +1962,7 @@ Below is comprehensive summary of SV described from real NGS studies: - Duplication - \*1x2/\*2 - + - :download:`Model ` - :download:`Profile ` - :download:`Profile ` - WGS @@ -1875,6 +1973,7 @@ Below is comprehensive summary of SV described from real NGS studies: - 
Multiplication1 - \*1x3/\*2 - + - :download:`Model ` - :download:`Profile ` - :download:`Profile ` - WGS @@ -1885,6 +1984,7 @@ Below is comprehensive summary of SV described from real NGS studies: - Multiplication2 - \*1x4/\*2 - + - :download:`Model ` - :download:`Profile ` - :download:`Profile ` - WGS @@ -1895,6 +1995,7 @@ Below is comprehensive summary of SV described from real NGS studies: - Multiplication2 - \*1x3/\*2x2 - + - :download:`Model ` - :download:`Profile ` - :download:`Profile ` - WGS @@ -1981,6 +2082,7 @@ Below is comprehensive summary of SV described from real NGS studies: - SV Name - Genotype - Reference + - Gene Model - GRCh37 - GRCh38 - Data Type @@ -1991,6 +2093,7 @@ Below is comprehensive summary of SV described from real NGS studies: - Normal - \*1/\*2 - + - :download:`Model ` - :download:`Profile ` - :download:`Profile ` - WGS @@ -2001,6 +2104,7 @@ Below is comprehensive summary of SV described from real NGS studies: - Intron1DeletionA - \*1/\*S1 - + - :download:`Model ` - :download:`Profile ` - :download:`Profile ` - WGS @@ -2011,12 +2115,24 @@ Below is comprehensive summary of SV described from real NGS studies: - Intron1DeletionB - \*1/\*S2 - + - :download:`Model ` - :download:`Profile ` - :download:`Profile ` - WGS - - - + * - \*S3 + - Intron1PartialDup + - \*1/\*S3 + - + - :download:`Model ` + - :download:`Profile ` + - :download:`Profile ` + - WGS + - `1KGP `__ + - NA18632 + - UGT2B15 ======= @@ -2033,6 +2149,7 @@ Below is comprehensive summary of SV described from real NGS studies: - SV Name - Genotype - Reference + - Gene Model - GRCh37 - GRCh38 - Data Type @@ -2043,6 +2160,7 @@ Below is comprehensive summary of SV described from real NGS studies: - Normal - \*1/\*2 - `Lee et al., 2019 `__ + - :download:`Model ` - :download:`Profile ` - :download:`Profile ` - WGS @@ -2053,6 +2171,7 @@ Below is comprehensive summary of SV described from real NGS studies: - PartialDeletion1 - \*4/\*S1 - `Lee et al., 2019 `__ + - :download:`Model ` - :download:`Profile ` - :download:`Profile ` - WGS @@ -2063,12 +2182,35 @@ Below is comprehensive summary of SV described from real NGS studies: - PartialDeletion2 - \*2/\*S2 - + - :download:`Model ` - :download:`Profile ` - :download:`Profile ` - WGS - `1KGP `__ - NA19160 - + * - \*S3 + - PartialDeletion3 + - \*1/\*S3 + - + - :download:`Model ` + - :download:`Profile ` + - :download:`Profile ` + - WGS + - `1KGP `__ + - NA19189 + - + * - \*S4 + - Deletion + - \*2/\*S4 + - + - :download:`Model ` + - :download:`Profile ` + - :download:`Profile ` + - WGS + - `1KGP `__ + - NA19024 + - UGT2B17 ======= @@ -2088,6 +2230,7 @@ Below is comprehensive summary of SV described from real NGS studies: - SV Name - Genotype - Reference + - Gene Model - GRCh37 - GRCh38 - Data Type @@ -2095,9 +2238,10 @@ Below is comprehensive summary of SV described from real NGS studies: - Coriell ID - Description * - - - Normal + - Normal,Normal - \*1/\*1 - `Lee et al., 2019 `__ + - :download:`Model ` - :download:`Profile ` - :download:`Profile ` - WGS @@ -2105,9 +2249,10 @@ Below is comprehensive summary of SV described from real NGS studies: - NA19178 - * - \*2 - - DeletionHet + - Normal,Deletion - \*1/\*2 - `Lee et al., 2019 `__ + - :download:`Model ` - :download:`Profile ` - :download:`Profile ` - WGS @@ -2115,9 +2260,10 @@ Below is comprehensive summary of SV described from real NGS studies: - NA18855 - * - \*2 - - DeletionHom + - Deletion,Deletion - \*2/\*2 - `Lee et al., 2019 `__ + - :download:`Model ` - :download:`Profile ` - :download:`Profile ` - WGS @@ 
-2125,12 +2271,24 @@ Below is comprehensive summary of SV described from real NGS studies: - NA18617 - * - \*2, \*S1 - - PartialDeletionHet + - Deletion,PartialDeletion1 - \*2/\*S1 - + - :download:`Model ` - :download:`Profile ` - :download:`Profile ` - WGS - `1KGP `__ - NA19160 - + * - \*2, \*S2 + - Deletion,PartialDeletion2 + - \*2/\*S2 + - + - :download:`Model ` + - :download:`Profile ` + - :download:`Profile ` + - WGS + - `1KGP `__ + - NA19189 + - diff --git a/docs/glossary.rst b/docs/glossary.rst index 7955cf1c..dcd755f1 100644 --- a/docs/glossary.rst +++ b/docs/glossary.rst @@ -6,6 +6,15 @@ Introduction This page describes glossaries for PyPGx. +1000 Genomes Project (1KGP) +=========================== + +The `1000 Genomes Project `__ created a +catalogue of common human genetic variation, using openly consented samples +from people who declared themselves to be healthy. The reference data +resources generated by the project remain heavily used by the biomedical +science community. + Clinical Pharmacogenetics Implementation Consortium (CPIC) ========================================================== @@ -37,6 +46,12 @@ the-g-standaard-the-medicines-standard-in-healthcare>`__ was established in multidisciplinary and includes clinical pharmacists, physicians, clinical pharmacologists, clinical chemists, epidemiologists, and toxicologists. +Next-generation sequencing (NGS) +================================ + +Next-generation sequencing (NGS) is a massively parallel sequencing +technology that offers ultra-high throughput, scalability, and speed. + Pharmacogenomics (PGx) ====================== @@ -81,3 +96,9 @@ translocations or genomic imbalances (duplications and deletions), commonly referred to as copy number variants (CNVs). Additionally, for pharmacogenes that are known to have one or more pseudogenes (e.g. CYP2D6), SV can be often found in the form of hybrid genes (e.g. CYP2D6/CYP2D7). + +Whole genome sequencing (WGS) +============================= + +WGS is a comprehensive method for analyzing entire genomes, as opposed to +selectively sequencing targeted regions (e.g. whole exome sequencing). diff --git a/pypgx/__init__.py b/pypgx/__init__.py index 58487c72..117b4732 100644 --- a/pypgx/__init__.py +++ b/pypgx/__init__.py @@ -24,6 +24,7 @@ list_variants, load_allele_table, load_cnv_table, + load_cpic_table, load_diplotype_table, load_equation_table, load_gene_table, @@ -69,6 +70,8 @@ ) from .api.pipeline import ( + run_chip_pipeline, + run_long_read_pipeline, run_ngs_pipeline, ) diff --git a/pypgx/api/core.py b/pypgx/api/core.py index a8e2d4ed..7466ea71 100644 --- a/pypgx/api/core.py +++ b/pypgx/api/core.py @@ -7,6 +7,8 @@ from io import BytesIO import warnings +from .. import sdk + import numpy as np import pandas as pd from fuc import pyvcf, common @@ -32,21 +34,6 @@ 'Class IV (Normal)', ] -class AlleleNotFoundError(Exception): - """Raise if specified allele is not present in the allele table.""" - -class GeneNotFoundError(Exception): - """Raise if specified gene is not present in the gene table.""" - -class NotTargetGeneError(Exception): - """Raise if specified gene is not one of the target genes.""" - -class PhenotypeNotFoundError(Exception): - """Raise if specified phenotype is not present in the phenotype table.""" - -class VariantNotFoundError(Exception): - """Raise if specified variant is not present in the variant table.""" - def build_definition_table(gene, assembly='GRCh37'): """ Build the definition table of star alleles for specified gene. 
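The ``core.py`` hunks above and below move ``AlleleNotFoundError``,
``GeneNotFoundError``, ``NotTargetGeneError``, ``PhenotypeNotFoundError``,
and ``VariantNotFoundError`` out of ``api.core`` and raise them through
``sdk.utils`` instead (via ``from .. import sdk``). Below is a minimal
sketch, not part of this diff, of how calling code might catch the errors at
their new location; it assumes ``pypgx.sdk.utils`` is importable from the
installed package (as the updated ``raise`` sites imply), and ``BRCA1`` is
only a placeholder for any gene outside the PyPGx target list.

.. code-block:: python

    # Hypothetical downstream usage: the relocated errors are now caught
    # from pypgx.sdk.utils rather than pypgx.api.core.
    import pypgx
    from pypgx import sdk

    try:
        # 'BRCA1' stands in for any gene that is not a PyPGx target gene.
        pypgx.list_variants('BRCA1')
    except sdk.utils.NotTargetGeneError as error:
        print(f'Not a target gene: {error}')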
@@ -83,7 +70,7 @@ def build_definition_table(gene, assembly='GRCh37'): 1 19 15897578 rs3093105 A C . . VI=W12G GT 1 0 """ if not is_target_gene(gene): - raise NotTargetGeneError(gene) + raise sdk.utils.NotTargetGeneError(gene) other = 'GRCh38' if assembly == 'GRCh37' else 'GRCh37' @@ -175,7 +162,7 @@ def has_phenotype(gene): False """ if not is_target_gene(gene): - raise NotTargetGeneError(gene) + raise sdk.utils.NotTargetGeneError(gene) df = load_gene_table() @@ -205,7 +192,7 @@ def has_score(gene): False """ if not is_target_gene(gene): - raise NotTargetGeneError(gene) + raise sdk.utils.NotTargetGeneError(gene) df = load_gene_table() @@ -300,7 +287,7 @@ def get_exon_ends(gene, assembly='GRCh37'): List of end positions. """ if gene not in list_genes(mode='all'): - raise GeneNotFoundError(gene) + raise sdk.utils.GeneNotFoundError(gene) df = load_gene_table() df = df[df.Gene == gene] s = df[f'{assembly}ExonEnds'].values[0] @@ -323,7 +310,7 @@ def get_exon_starts(gene, assembly='GRCh37'): List of start positions. """ if gene not in list_genes(mode='all'): - raise GeneNotFoundError(gene) + raise sdk.utils.GeneNotFoundError(gene) df = load_gene_table() df = df[df.Gene == gene] s = df[f'{assembly}ExonStarts'].values[0] @@ -361,13 +348,13 @@ def get_function(gene, allele): nan """ if not is_target_gene(gene): - raise NotTargetGeneError(gene) + raise sdk.utils.NotTargetGeneError(gene) df = load_allele_table() df = df[(df.Gene == gene) & (df.StarAllele == allele)] if df.empty: - raise AlleleNotFoundError(gene, allele) + raise sdk.utils.AlleleNotFoundError(gene, allele) return df.Function.values[0] @@ -431,10 +418,10 @@ def get_priority(gene, phenotype): 'Normal/Routine/Low Risk' """ if not is_target_gene(gene): - raise NotTargetGeneError(gene) + raise sdk.utils.NotTargetGeneError(gene) if phenotype not in list_phenotypes(): - raise PhenotypeNotFoundError(phenotype) + raise sdk.utils.PhenotypeNotFoundError(phenotype) df = load_phenotype_table() i = (df.Gene == gene) & (df.Phenotype == phenotype) @@ -449,8 +436,6 @@ def get_ref_allele(gene, assembly='GRCh37'): ---------- gene : str Target gene. - assembly : {'GRCh37', 'GRCh38'}, default: 'GRCh37' - Reference genome assembly. Returns ------- @@ -487,7 +472,7 @@ def get_region(gene, assembly='GRCh37'): Requested region. """ if gene not in list_genes(mode='all'): - raise GeneNotFoundError(gene) + raise sdk.utils.GeneNotFoundError(gene) df = load_gene_table() df = df[df.Gene == gene] @@ -529,7 +514,7 @@ def get_score(gene, allele): nan """ if not is_target_gene(gene): - raise NotTargetGeneError(gene) + raise sdk.utils.NotTargetGeneError(gene) if not has_score(gene): return np.nan @@ -538,7 +523,7 @@ def get_score(gene, allele): df = df[(df.Gene == gene) & (df.StarAllele == allele)] if df.empty: - raise AlleleNotFoundError(gene, allele) + raise sdk.utils.AlleleNotFoundError(gene, allele) return df.ActivityScore.values[0] @@ -557,7 +542,7 @@ def get_strand(gene): '+' or '-'. 
""" if not is_target_gene(gene): - raise NotTargetGeneError(gene) + raise sdk.utils.NotTargetGeneError(gene) df = load_gene_table() df = df[df.Gene == gene] @@ -588,17 +573,17 @@ def get_variant_impact(variant): 'Splice Defect' >>> pypgx.get_variant_impact('22-42524435-T-A') # Intron variant '' - >>> pypgx.get_variant_impact('22-42524435-T-C') # Does not exist + >>> pypgx.get_variant_impact('22-42524435-T-C') Traceback (most recent call last): File "", line 1, in - File "/Users/sbslee/Desktop/pypgx/pypgx/api/core.py", line 489, in get_variant_impact - raise VariantNotFoundError(variant) - pypgx.api.core.VariantNotFoundError: 22-42524435-T-C + File "/Users/sbslee/Desktop/pypgx/pypgx/api/core.py", line 588, in get_variant_impact + raise sdk.utils.VariantNotFoundError(variant) + pypgx.sdk.utils.VariantNotFoundError: 22-42524435-T-C """ df = load_variant_table() df = df[(df.GRCh37Name == variant) | (df.GRCh38Name == variant)] if df.empty: - raise VariantNotFoundError(variant) + raise sdk.utils.VariantNotFoundError(variant) impact = df.Impact.values[0] if pd.isna(impact): impact = '' @@ -783,7 +768,7 @@ def list_phenotypes(gene=None): if gene is not None: if not is_target_gene(gene): - raise NotTargetGeneError(gene) + raise sdk.utils.NotTargetGeneError(gene) df = df[df.Gene == gene] return sorted(list(df.Phenotype.unique())) @@ -834,7 +819,7 @@ def list_variants(gene, alleles=None, mode='all', assembly='GRCh37'): ['19-41495755-T-C', '19-41496461-T-C'] """ if not is_target_gene(gene): - raise NotTargetGeneError(gene) + raise sdk.utils.NotTargetGeneError(gene) allele_table = load_allele_table() allele_table = allele_table[allele_table.Gene == gene] @@ -852,7 +837,7 @@ def list_variants(gene, alleles=None, mode='all', assembly='GRCh37'): df = allele_table[allele_table.StarAllele == allele] if df.empty: - raise AlleleNotFoundError(gene, allele) + raise sdk.utils.AlleleNotFoundError(gene, allele) c = df[f'{assembly}Core'].values[0] t = df[f'{assembly}Tag'].values[0] @@ -888,6 +873,13 @@ def load_allele_table(): >>> import pypgx >>> df = pypgx.load_allele_table() + >>> df.head() + Gene StarAllele ActivityScore Function GRCh37Core GRCh37Tag GRCh38Core GRCh38Tag SV + 0 ABCB1 *1 NaN Normal Function 7-87138645-A-G,7-87160618-A-C,7-87179601-A-G NaN 7-87509329-A-G,7-87531302-A-C,7-87550285-A-G NaN False + 1 ABCB1 *2 NaN Increased Function NaN NaN NaN NaN False + 2 CACNA1S Reference NaN Normal Function NaN NaN NaN NaN False + 3 CACNA1S c.520C>T NaN Malignant Hyperthermia Associated 1-201061121-G-A NaN 1-201091993-G-A NaN False + 4 CACNA1S c.3257G>A NaN Malignant Hyperthermia Associated 1-201029943-C-T NaN 1-201060815-C-T NaN False """ b = BytesIO(pkgutil.get_data(__name__, 'data/allele-table.csv')) return pd.read_csv(b) @@ -906,10 +898,42 @@ def load_cnv_table(): >>> import pypgx >>> df = pypgx.load_cnv_table() + >>> df.head() + Gene Name + 0 CYP2A6 Normal + 1 CYP2A6 Deletion1Het + 2 CYP2A6 Deletion1Hom + 3 CYP2A6 Deletion2Het + 4 CYP2A6 Deletion3Het """ b = BytesIO(pkgutil.get_data(__name__, 'data/cnv-table.csv')) return pd.read_csv(b) +def load_cpic_table(): + """ + Load the CPIC table. + + Returns + ------- + pandas.DataFrame + Requested table. + + Examples + -------- + + >>> import pypgx + >>> df = pypgx.load_cpic_table() + >>> df.head() + Gene Drug Guideline CPICLevel CPICLevelStatus PharmGKBLevel FDALabel PMID + 0 HLA-B abacavir https://cpicpgx.org/guidelines/guideline-for-a... A Final 1A Testing required 24561393;22378157 + 1 HLA-B allopurinol https://cpicpgx.org/guidelines/guideline-for-a... 
A Final 1A Testing recommended 23232549;26094938 + 2 MT-RNR1 amikacin https://cpicpgx.org/guidelines/cpic-guideline-... A Final 3 NaN 34032273 + 3 CYP2C19 amitriptyline https://cpicpgx.org/guidelines/guideline-for-t... A Final 1A NaN 23486447;27997040 + 4 CYP2D6 amitriptyline https://cpicpgx.org/guidelines/guideline-for-t... A Final 1A Actionable PGx 23486447;27997040 + """ + b = BytesIO(pkgutil.get_data(__name__, 'data/cpic-table.csv')) + return pd.read_csv(b) + def load_diplotype_table(): """ Load the diplotype table. @@ -924,6 +948,13 @@ def load_diplotype_table(): >>> import pypgx >>> df = pypgx.load_diplotype_table() + >>> df.head() + Gene Diplotype Phenotype + 0 CACNA1S Reference/Reference Uncertain Susceptibility + 1 CACNA1S Reference/c.520C>T Malignant Hyperthermia Susceptibility + 2 CACNA1S Reference/c.3257G>A Malignant Hyperthermia Susceptibility + 3 CACNA1S c.520C>T/c.520C>T Malignant Hyperthermia Susceptibility + 4 CACNA1S c.520C>T/c.3257G>A Malignant Hyperthermia Susceptibility """ b = BytesIO(pkgutil.get_data(__name__, 'data/diplotype-table.csv')) return pd.read_csv(b) @@ -942,6 +973,13 @@ def load_equation_table(): >>> import pypgx >>> df = pypgx.load_equation_table() + >>> df.head() + Gene Phenotype Equation + 0 CYP2C9 Poor Metabolizer 0 <= score < 1 + 1 CYP2C9 Intermediate Metabolizer 1 <= score < 2 + 2 CYP2C9 Normal Metabolizer 2 == score + 3 CYP2D6 Poor Metabolizer 0 <= score < 0.25 + 4 CYP2D6 Intermediate Metabolizer 0.25 <= score < 1.25 """ b = BytesIO(pkgutil.get_data(__name__, 'data/equation-table.csv')) return pd.read_csv(b) @@ -960,6 +998,13 @@ def load_gene_table(): >>> import pypgx >>> df = pypgx.load_gene_table() + >>> df.head() + Gene Target Control Paralog Variants SV PhenotypeMethod RefAllele GRCh37Default GRCh38Default Strand GRCh37Region GRCh38Region GRCh37ExonStarts GRCh37ExonEnds GRCh38ExonStarts GRCh38ExonEnds + 0 ABCB1 True False NaN True False NaN *1 *2 *2 - 7:87130178-87345639 7:87500862-87716323 87133178,87135212,87138590,87144546,87145824,8... 87133765,87135359,87138797,87144744,87145981,8... 87503862,87505896,87509274,87515230,87516508,8... 87504449,87506043,87509481,87515428,87516665,8... + 1 CACNA1S True False NaN True False Diplotype Reference Reference Reference - 1:201005639-201084694 1:201036511-201115426 201008639,201009358,201009749,201010631,201012... 201009210,201009502,201009841,201010717,201012... 201039511,201040230,201040621,201041503,201043... 201040082,201040374,201040713,201041589,201043... + 2 CFTR True False NaN True False Diplotype Reference Reference Reference + 7:117117016-117311719 7:117477024-117671665 117120016,117144306,117149087,117170952,117174... 117120201,117144417,117149196,117171168,117174... 117480024,117504252,117509033,117530898,117534... 117480147,117504363,117509142,117531114,117534... + 3 CYP1A1 True False NaN True False NaN *1 *1 *1 - 15:75008882-75020951 15:74716541-74728528 75011882,75013307,75013539,75013754,75013931,7... 75013115,75013394,75013663,75013844,75014058,7... 74719541,74720966,74721198,74721413,74721590,7... 74720774,74721053,74721322,74721503,74721717,7... + 4 CYP1A2 True False NaN True False NaN *1A *1A *1A + 15:75038183-75051941 15:74745844-74759607 75041183,75042070,75043529,75044105,75044464,7... 75041238,75042910,75043650,75044195,75044588,7... 74748844,74749729,74751188,74751764,74752123,7... 74748897,74750569,74751309,74751854,74752247,7... 
""" b = BytesIO(pkgutil.get_data(__name__, 'data/gene-table.csv')) return pd.read_csv(b) @@ -978,6 +1023,13 @@ def load_phenotype_table(): >>> import pypgx >>> df = pypgx.load_phenotype_table() + >>> df.head() + Gene Phenotype Priority + 0 CACNA1S Uncertain Susceptibility Normal Risk + 1 CACNA1S Malignant Hyperthermia Susceptibility Abnormal/Priority/High Risk + 2 CFTR Favorable Response None + 3 CFTR Unfavorable Response None + 4 CFTR Indeterminate None """ b = BytesIO(pkgutil.get_data(__name__, 'data/phenotype-table.csv')) return pd.read_csv(b) @@ -996,6 +1048,13 @@ def load_variant_table(): >>> import pypgx >>> df = pypgx.load_phenotype_table() + >>> df.head() + Gene Phenotype Priority + 0 CACNA1S Uncertain Susceptibility Normal Risk + 1 CACNA1S Malignant Hyperthermia Susceptibility Abnormal/Priority/High Risk + 2 CFTR Favorable Response None + 3 CFTR Unfavorable Response None + 4 CFTR Indeterminate None """ b = BytesIO(pkgutil.get_data(__name__, 'data/variant-table.csv')) df = pd.read_csv(b) @@ -1037,7 +1096,7 @@ def predict_phenotype(gene, a, b): 'Rapid Metabolizer' """ if not is_target_gene(gene): - raise NotTargetGeneError(gene) + raise sdk.utils.NotTargetGeneError(gene) df = load_gene_table() phenotype_method = df[df.Gene == gene].PhenotypeMethod.values[0] @@ -1140,7 +1199,7 @@ def predict_score(gene, allele): nan """ if not is_target_gene(gene): - raise NotTargetGeneError(gene) + raise sdk.utils.NotTargetGeneError(gene) if not has_score(gene): return np.nan @@ -1241,7 +1300,7 @@ def func1(allele): if gene is None: raise ValueError('Gene is required when sorting by priority') if not is_target_gene(gene): - raise NotTargetGeneError(gene) + raise sdk.utils.NotTargetGeneError(gene) function = get_function(gene, allele) a = FUNCTION_ORDER.index(function) core_variants = list_variants(gene, alleles=allele, assembly=assembly, mode='core') diff --git a/pypgx/api/data/cnv-table.csv b/pypgx/api/data/cnv-table.csv index c27bf6f2..b615cee6 100644 --- a/pypgx/api/data/cnv-table.csv +++ b/pypgx/api/data/cnv-table.csv @@ -1,66 +1,73 @@ -Gene,Name,Code -CYP2A6,Normal,0 -CYP2A6,Deletion1Het,1 -CYP2A6,Deletion1Hom,2 -CYP2A6,Deletion2Het,3 -CYP2A6,Deletion3Het,4 -CYP2A6,Hybrid1,5 -CYP2A6,Hybrid2,6 -CYP2A6,Hybrid3,7 -CYP2A6,PseudogeneDuplication,8 -CYP2A6,Duplication1,9 -CYP2A6,Duplication2,10 -CYP2A6,Duplication3,11 -CYP2B6,Normal,0 -CYP2B6,Hybrid,1 -CYP2B6,Duplication,2 -CYP2D6,Normal,0 -CYP2D6,DeletionHet,1 -CYP2D6,DeletionHom,2 -CYP2D6,Duplication,3 -CYP2D6,Multiplication,4 -CYP2D6,Tandem1A,5 -CYP2D6,Tandem1B,6 -CYP2D6,Tandem2A,7 -CYP2D6,Tandem2B,8 -CYP2D6,Tandem2C,9 -CYP2D6,Tandem3,10 -CYP2D6,"DeletionHet,Tandem1A",11 -CYP2D6,"Duplication,Tandem1A",12 -CYP2D6,Unknown1,13 -CYP2E1,Normal,0 -CYP2E1,Duplication1,1 -CYP2E1,Duplication2,2 -CYP2E1,PartialDuplication,3 -CYP2E1,Multiplication,4 -CYP4F2,Normal,0 -CYP4F2,DeletionHet,1 -G6PD,Female,0 -G6PD,Male,1 -GSTM1,Normal,0 -GSTM1,DeletionHet,1 -GSTM1,DeletionHom,2 -GSTM1,Duplication,3 -GSTM1,UpstreamDeletionHet,4 -GSTM1,"DeletionHet,UpstreamDeletionHet",5 -GSTT1,Normal,0 -GSTT1,DeletionHet,1 -GSTT1,DeletionHom,2 -SLC22A2,Normal,0 -SLC22A2,Exon11Deletion,1 -SLC22A2,Intron9Deletion,2 -SULT1A1,Normal,0 -SULT1A1,DeletionHet,1 -SULT1A1,Duplication,2 -SULT1A1,Multiplication1,3 -SULT1A1,Multiplication2,4 -UGT1A4,Normal,0 -UGT1A4,Intron1DeletionA,1 -UGT1A4,Intron1DeletionB,2 -UGT2B15,Normal,0 -UGT2B15,PartialDeletion1,1 -UGT2B15,PartialDeletion2,2 -UGT2B17,Normal,0 -UGT2B17,DeletionHet,1 -UGT2B17,DeletionHom,2 -UGT2B17,PartialDeletionHet,3 \ No newline at end 
of file +Gene,Name +CYP2A6,Normal +CYP2A6,Deletion1Het +CYP2A6,Deletion1Hom +CYP2A6,Deletion2Het +CYP2A6,Deletion3Het +CYP2A6,Hybrid1 +CYP2A6,Hybrid2 +CYP2A6,Hybrid3 +CYP2A6,PseudogeneDuplication +CYP2A6,Duplication1 +CYP2A6,Duplication2 +CYP2A6,Duplication3 +CYP2B6,Normal +CYP2B6,Hybrid +CYP2B6,Duplication +CYP2D6,Normal +CYP2D6,DeletionHet +CYP2D6,DeletionHom +CYP2D6,Duplication +CYP2D6,Multiplication +CYP2D6,Tandem1A +CYP2D6,Tandem1B +CYP2D6,Tandem2A +CYP2D6,Tandem2B +CYP2D6,Tandem2C +CYP2D6,Tandem3 +CYP2D6,"DeletionHet,Tandem1A" +CYP2D6,"Duplication,Tandem1A" +CYP2D6,Unknown1 +CYP2D6,PseudogeneDeletion +CYP2E1,Normal +CYP2E1,Duplication1 +CYP2E1,Duplication2 +CYP2E1,PartialDuplicationHet +CYP2E1,PartialDuplicationHom +CYP2E1,Multiplication +CYP4F2,Normal +CYP4F2,DeletionHet +G6PD,Female +G6PD,Male +GSTM1,Normal +GSTM1,DeletionHet +GSTM1,DeletionHom +GSTM1,Duplication +GSTM1,UpstreamDeletionHet +GSTM1,"DeletionHet,UpstreamDeletionHet" +GSTT1,Normal +GSTT1,DeletionHet +GSTT1,DeletionHom +SLC22A2,Normal +SLC22A2,Exon11Deletion +SLC22A2,Intron9Deletion +SLC22A2,"Intron9Deletion,Exon11Deletion" +SULT1A1,Normal +SULT1A1,DeletionHet +SULT1A1,Duplication +SULT1A1,Multiplication1 +SULT1A1,Multiplication2 +UGT1A4,Normal +UGT1A4,Intron1DeletionA +UGT1A4,Intron1DeletionB +UGT1A4,Intron1PartialDup +UGT2B15,Normal +UGT2B15,PartialDeletion1 +UGT2B15,PartialDeletion2 +UGT2B15,PartialDeletion3 +UGT2B15,Deletion +UGT2B17,"Normal,Normal" +UGT2B17,"Normal,Deletion" +UGT2B17,"Deletion,Deletion" +UGT2B17,"Deletion,PartialDeletion1" +UGT2B17,"Deletion,PartialDeletion2" diff --git a/pypgx/api/data/cpic-table.csv b/pypgx/api/data/cpic-table.csv new file mode 100644 index 00000000..45d90871 --- /dev/null +++ b/pypgx/api/data/cpic-table.csv @@ -0,0 +1,444 @@ +Gene,Drug,Guideline,CPICLevel,CPICLevelStatus,PharmGKBLevel,FDALabel,PMID +HLA-B,abacavir,https://cpicpgx.org/guidelines/guideline-for-abacavir-and-hla-b/,A,Final,1A,Testing required,24561393;22378157 +HLA-B,allopurinol,https://cpicpgx.org/guidelines/guideline-for-allopurinol-and-hla-b/,A,Final,1A,Testing recommended,23232549;26094938 +MT-RNR1,amikacin,https://cpicpgx.org/guidelines/cpic-guideline-for-aminoglycosides-and-mt-rnr1/,A,Final,3,,34032273 +CYP2C19,amitriptyline,https://cpicpgx.org/guidelines/guideline-for-tricyclic-antidepressants-and-cyp2d6-and-cyp2c19/,A,Final,1A,,23486447;27997040 +CYP2D6,amitriptyline,https://cpicpgx.org/guidelines/guideline-for-tricyclic-antidepressants-and-cyp2d6-and-cyp2c19/,A,Final,1A,Actionable PGx,23486447;27997040 +UGT1A1,atazanavir,https://cpicpgx.org/guidelines/guideline-for-atazanavir-and-ugt1a1/,A,Final,1A,,26417955 +CYP2D6,atomoxetine,https://cpicpgx.org/guidelines/cpic-guideline-for-atomoxetine-based-on-cyp2d6-genotype/,A,Final,1A,Actionable PGx,30801677 +NUDT15,azathioprine,https://cpicpgx.org/guidelines/guideline-for-thiopurines-and-tpmt/,A,Final,1A,Testing recommended,21270794;23422873;30447069 +TPMT,azathioprine,https://cpicpgx.org/guidelines/guideline-for-thiopurines-and-tpmt/,A,Final,1A,Testing recommended,21270794;23422873;30447069 +DPYD,capecitabine,https://cpicpgx.org/guidelines/guideline-for-fluoropyrimidines-and-dpyd/,A,Final,1A,Actionable PGx,23988873;29152729 +HLA-A,carbamazepine,https://cpicpgx.org/guidelines/guideline-for-carbamazepine-and-hla-b/,A,Final,1A,Actionable PGx,23695185;29392710 +HLA-B,carbamazepine,https://cpicpgx.org/guidelines/guideline-for-carbamazepine-and-hla-b/,A,Final,1A,Testing required,23695185;29392710 
+CYP2C9,celecoxib,https://cpicpgx.org/guidelines/cpic-guideline-for-nsaids-based-on-cyp2c9-genotype/,A,Final,1A,Actionable PGx,32189324 +CYP2C19,citalopram,https://cpicpgx.org/guidelines/guideline-for-selective-serotonin-reuptake-inhibitors-and-cyp2d6-and-cyp2c19/,A,Final,1A,Actionable PGx,25974703 +CYP2C19,clopidogrel,https://cpicpgx.org/guidelines/guideline-for-clopidogrel-and-cyp2c19/,A,Final,1A,Actionable PGx,21716271;23698643 +CYP2D6,codeine,https://cpicpgx.org/guidelines/guideline-for-codeine-and-cyp2d6/,A,Final,1A,Actionable PGx,22205192;24458010;33387367 +CACNA1S,desflurane,https://cpicpgx.org/guidelines/cpic-guideline-for-ryr1-and-cacna1s/,A,Final,1A,Actionable PGx,30499100 +RYR1,desflurane,https://cpicpgx.org/guidelines/cpic-guideline-for-ryr1-and-cacna1s/,A,Final,1A,Actionable PGx,30499100 +CYP2B6,efavirenz,https://cpicpgx.org/guidelines/cpic-guideline-for-efavirenz-based-on-cyp2b6-genotype/,A,Final,1A,Actionable PGx,31006110 +CACNA1S,enflurane,https://cpicpgx.org/guidelines/cpic-guideline-for-ryr1-and-cacna1s/,A,Final,1A,Actionable PGx,30499100 +RYR1,enflurane,https://cpicpgx.org/guidelines/cpic-guideline-for-ryr1-and-cacna1s/,A,Final,1A,Actionable PGx,30499100 +CYP2C19,escitalopram,https://cpicpgx.org/guidelines/guideline-for-selective-serotonin-reuptake-inhibitors-and-cyp2d6-and-cyp2c19/,A,Final,1A,Actionable PGx,25974703 +DPYD,fluorouracil,https://cpicpgx.org/guidelines/guideline-for-fluoropyrimidines-and-dpyd/,A,Final,1A,Actionable PGx,23988873;29152729 +CYP2C9,flurbiprofen,https://cpicpgx.org/guidelines/cpic-guideline-for-nsaids-based-on-cyp2c9-genotype/,A,Final,1A,Actionable PGx,32189324 +CYP2C9,fosphenytoin,https://cpicpgx.org/guidelines/guideline-for-phenytoin-and-cyp2c9-and-hla-b/,A,Final,,Actionable PGx,25099164;32779747 +HLA-B,fosphenytoin,https://cpicpgx.org/guidelines/guideline-for-phenytoin-and-cyp2c9-and-hla-b/,A,Final,,Actionable PGx,25099164;32779747 +MT-RNR1,gentamicin,https://cpicpgx.org/guidelines/cpic-guideline-for-aminoglycosides-and-mt-rnr1/,A,Final,1A,,34032273 +CACNA1S,halothane,https://cpicpgx.org/guidelines/cpic-guideline-for-ryr1-and-cacna1s/,A,Final,1A,,30499100 +RYR1,halothane,https://cpicpgx.org/guidelines/cpic-guideline-for-ryr1-and-cacna1s/,A,Final,1A,,30499100 +CYP2C9,ibuprofen,https://cpicpgx.org/guidelines/cpic-guideline-for-nsaids-based-on-cyp2c9-genotype/,A,Final,1A,,32189324 +UGT1A1,irinotecan,,A,Provisional,1A,Actionable PGx, +CACNA1S,isoflurane,https://cpicpgx.org/guidelines/cpic-guideline-for-ryr1-and-cacna1s/,A,Final,1A,Actionable PGx,30499100 +RYR1,isoflurane,https://cpicpgx.org/guidelines/cpic-guideline-for-ryr1-and-cacna1s/,A,Final,1A,Actionable PGx,30499100 +CFTR,ivacaftor,https://cpicpgx.org/guidelines/guideline-for-ivacaftor-and-cftr/,A,Final,1A,Testing required,24598717 +MT-RNR1,kanamycin,https://cpicpgx.org/guidelines/cpic-guideline-for-aminoglycosides-and-mt-rnr1/,A,Final,3,,34032273 +CYP2C19,lansoprazole,https://cpicpgx.org/guidelines/cpic-guideline-for-proton-pump-inhibitors-and-cyp2c19/,A,Final,1A,Informative PGx,32770672 +CYP2C9,lornoxicam,https://cpicpgx.org/guidelines/cpic-guideline-for-nsaids-based-on-cyp2c9-genotype/,A,Final,1A,,32189324 +CYP2C9,meloxicam,https://cpicpgx.org/guidelines/cpic-guideline-for-nsaids-based-on-cyp2c9-genotype/,A,Final,1A,Actionable PGx,32189324 +NUDT15,mercaptopurine,https://cpicpgx.org/guidelines/guideline-for-thiopurines-and-tpmt/,A,Final,1A,Testing recommended,21270794;23422873;30447069 
+TPMT,mercaptopurine,https://cpicpgx.org/guidelines/guideline-for-thiopurines-and-tpmt/,A,Final,1A,Testing recommended,21270794;23422873;30447069 +CACNA1S,methoxyflurane,https://cpicpgx.org/guidelines/cpic-guideline-for-ryr1-and-cacna1s/,A,Final,1A,,30499100 +RYR1,methoxyflurane,https://cpicpgx.org/guidelines/cpic-guideline-for-ryr1-and-cacna1s/,A,Final,1A,,30499100 +CYP2D6,nortriptyline,https://cpicpgx.org/guidelines/guideline-for-tricyclic-antidepressants-and-cyp2d6-and-cyp2c19/,A,Final,1A,Actionable PGx,23486447;27997040 +CYP2C19,omeprazole,https://cpicpgx.org/guidelines/cpic-guideline-for-proton-pump-inhibitors-and-cyp2c19/,A,Final,1A,Actionable PGx,32770672 +CYP2D6,ondansetron,https://cpicpgx.org/guidelines/guideline-for-ondansetron-and-tropisetron-and-cyp2d6-genotype/,A,Final,1A,Informative PGx,28002639 +HLA-B,oxcarbazepine,https://cpicpgx.org/guidelines/guideline-for-carbamazepine-and-hla-b/,A,Final,1A,Testing recommended,29392710 +CYP2C19,pantoprazole,https://cpicpgx.org/guidelines/cpic-guideline-for-proton-pump-inhibitors-and-cyp2c19/,A,Final,1A,Actionable PGx,32770672 +MT-RNR1,paromomycin,https://cpicpgx.org/guidelines/cpic-guideline-for-aminoglycosides-and-mt-rnr1/,A,Final,,,34032273 +CYP2D6,paroxetine,https://cpicpgx.org/guidelines/guideline-for-selective-serotonin-reuptake-inhibitors-and-cyp2d6-and-cyp2c19/,A,Final,1A,Informative PGx,25974703 +IFNL3,peginterferon alfa-2a,https://cpicpgx.org/guidelines/guideline-for-peg-interferon-alpha-based-regimens-and-ifnl3/,A,Final,1A,,24096968 +IFNL4,peginterferon alfa-2a,https://cpicpgx.org/guidelines/guideline-for-peg-interferon-alpha-based-regimens-and-ifnl3/,A,Final,1A,, +IFNL3,peginterferon alfa-2b,https://cpicpgx.org/guidelines/guideline-for-peg-interferon-alpha-based-regimens-and-ifnl3/,A,Final,1A,Actionable PGx,24096968 +IFNL4,peginterferon alfa-2b,https://cpicpgx.org/guidelines/guideline-for-peg-interferon-alpha-based-regimens-and-ifnl3/,A,Final,1A,, +CYP2C9,phenytoin,https://cpicpgx.org/guidelines/guideline-for-phenytoin-and-cyp2c9-and-hla-b/,A,Final,1A,Actionable PGx,25099164;32779747 +HLA-B,phenytoin,https://cpicpgx.org/guidelines/guideline-for-phenytoin-and-cyp2c9-and-hla-b/,A,Final,1A,Actionable PGx,25099164;32779747 +CYP2C9,piroxicam,https://cpicpgx.org/guidelines/cpic-guideline-for-nsaids-based-on-cyp2c9-genotype/,A,Final,1A,Actionable PGx,32189324 +CYP2D6,pitolisant,,A,Provisional,,Actionable PGx, +MT-RNR1,plazomicin,https://cpicpgx.org/guidelines/cpic-guideline-for-aminoglycosides-and-mt-rnr1/,A,Final,,,34032273 +G6PD,rasburicase,https://cpicpgx.org/guidelines/guideline-for-rasburicase-and-g6pd/,A,Final,1A,Testing required,24787449 +CACNA1S,sevoflurane,https://cpicpgx.org/guidelines/cpic-guideline-for-ryr1-and-cacna1s/,A,Final,1A,Actionable PGx,30499100 +RYR1,sevoflurane,https://cpicpgx.org/guidelines/cpic-guideline-for-ryr1-and-cacna1s/,A,Final,1A,Actionable PGx,30499100 +SLCO1B1,simvastatin,https://cpicpgx.org/guidelines/guideline-for-simvastatin-and-slco1b1/,A,Final,1A,Informative PGx,22617227;24918167 +CYP2C9,siponimod,,A,Provisional,1A,Testing required, +MT-RNR1,streptomycin,https://cpicpgx.org/guidelines/cpic-guideline-for-aminoglycosides-and-mt-rnr1/,A,Final,1A,,34032273 +CACNA1S,succinylcholine,https://cpicpgx.org/guidelines/cpic-guideline-for-ryr1-and-cacna1s/,A,Final,1A,Actionable PGx,30499100 +RYR1,succinylcholine,https://cpicpgx.org/guidelines/cpic-guideline-for-ryr1-and-cacna1s/,A,Final,1A,Actionable PGx,30499100 
+CYP3A5,tacrolimus,https://cpicpgx.org/guidelines/guideline-for-tacrolimus-and-cyp3a5/,A,Final,1A,,25801146 +G6PD,tafenoquine,,A,Provisional,,Testing required, +CYP2D6,tamoxifen,https://cpicpgx.org/guidelines/cpic-guideline-for-tamoxifen-based-on-cyp2d6-genotype/,A,Final,1A,Actionable PGx,29385237 +CYP2C9,tenoxicam,https://cpicpgx.org/guidelines/cpic-guideline-for-nsaids-based-on-cyp2c9-genotype/,A,Final,1A,,32189324 +NUDT15,thioguanine,https://cpicpgx.org/guidelines/guideline-for-thiopurines-and-tpmt/,A,Final,3,Testing recommended,21270794;23422873;30447069 +TPMT,thioguanine,https://cpicpgx.org/guidelines/guideline-for-thiopurines-and-tpmt/,A,Final,3,Testing recommended,21270794;23422873;30447069 +MT-RNR1,tobramycin,https://cpicpgx.org/guidelines/cpic-guideline-for-aminoglycosides-and-mt-rnr1/,A,Final,3,,34032273 +CYP2D6,tramadol,https://cpicpgx.org/guidelines/guideline-for-codeine-and-cyp2d6/,A,Final,1A,Actionable PGx,33387367 +CYP2D6,tropisetron,https://cpicpgx.org/guidelines/guideline-for-ondansetron-and-tropisetron-and-cyp2d6-genotype/,A,Final,1A,,28002639 +CYP2C19,voriconazole,https://cpicpgx.org/guidelines/guideline-for-voriconazole-and-cyp2c19/,A,Final,1A,Actionable PGx,27981572 +CYP2C9,warfarin,https://cpicpgx.org/guidelines/guideline-for-warfarin-and-cyp2c9-and-vkorc1/,A,Final,1A,Actionable PGx,21900891;28198005 +CYP4F2,warfarin,https://cpicpgx.org/guidelines/guideline-for-warfarin-and-cyp2c9-and-vkorc1/,A,Final,1A,,21900891;28198005 +VKORC1,warfarin,https://cpicpgx.org/guidelines/guideline-for-warfarin-and-cyp2c9-and-vkorc1/,A,Final,1A,Actionable PGx,21900891;28198005 +G6PD,aspirin,,A/B,Provisional,3,, +POLG,divalproex sodium,,A/B,Provisional,,Testing required, +CYP2D6,eliglustat,,A/B,Provisional,,Testing required, +NAT2,hydralazine,,A/B,Provisional,3,, +CYP2D6,oliceridine,,A/B,Provisional,,Actionable PGx, +CYP2D6,pimozide,,A/B,Provisional,3,Testing required, +CYP2D6,tetrabenazine,,A/B,Provisional,,Testing required, +POLG,valproic acid,,A/B,Provisional,3,Testing required, +GBA,velaglucerase alfa,,A/B,Provisional,,Testing required, +CYP2D6,venlafaxine,,A/B,Provisional,1A,Actionable PGx, +CYP2D6,vortioxetine,,A/B,Provisional,3,Actionable PGx, +CYP2C9,acenocoumarol,,B,Provisional,1B,, +CYP4F2,acenocoumarol,,B,Provisional,2A,, +CYP2D6,aripiprazole,,B,Provisional,1A,Actionable PGx, +UGT1A1,belinostat,,B,Provisional,,Actionable PGx, +CYP2C19,brivaracetam,,B,Provisional,3,Actionable PGx, +SCN1A,carbamazepine,,B,Provisional,2B,, +NAGS,carglumic acid,,B,Provisional,,Testing required, +G6PD,chloramphenicol,,B,Provisional,3,, +G6PD,chlorpropamide,,B,Provisional,,Actionable PGx, +G6PD,ciprofloxacin,,B,Provisional,3,, +CYP2C19,clomipramine,https://cpicpgx.org/guidelines/guideline-for-tricyclic-antidepressants-and-cyp2d6-and-cyp2c19/,B,Final,1A,,23486447;27997040 +CYP2D6,clomipramine,https://cpicpgx.org/guidelines/guideline-for-tricyclic-antidepressants-and-cyp2d6-and-cyp2c19/,B,Final,1A,Actionable PGx,23486447;27997040 +G6PD,dapsone,,B,Provisional,4,Actionable PGx, +CYP2D6,desipramine,https://cpicpgx.org/guidelines/guideline-for-tricyclic-antidepressants-and-cyp2d6-and-cyp2c19/,B,Final,1A,Actionable PGx,23486447;27997040 +CYP2C19,dexlansoprazole,https://cpicpgx.org/guidelines/cpic-guideline-for-proton-pump-inhibitors-and-cyp2c19/,B,Final,1A,Actionable PGx,32770672 +G6PD,dimercaprol,,B,Provisional,3,, +CYP2C19,doxepin,https://cpicpgx.org/guidelines/guideline-for-tricyclic-antidepressants-and-cyp2d6-and-cyp2c19/,B,Final,1A,Actionable PGx,23486447;27997040 
+CYP2D6,doxepin,https://cpicpgx.org/guidelines/guideline-for-tricyclic-antidepressants-and-cyp2d6-and-cyp2c19/,B,Final,1A,Actionable PGx,23486447;27997040 +CYP2D6,fluvoxamine,https://cpicpgx.org/guidelines/guideline-for-selective-serotonin-reuptake-inhibitors-and-cyp2d6-and-cyp2c19/,B,Final,1A,Actionable PGx,25974703 +G6PD,glibenclamide,,B,Provisional,3,Actionable PGx, +G6PD,glimepiride,,B,Provisional,,Actionable PGx, +G6PD,glipizide,,B,Provisional,,Actionable PGx, +CYP2D6,hydrocodone,https://cpicpgx.org/guidelines/guideline-for-codeine-and-cyp2d6/,B,Final,1A,,33387367 +CYP2C19,imipramine,https://cpicpgx.org/guidelines/guideline-for-tricyclic-antidepressants-and-cyp2d6-and-cyp2c19/,B,Final,1A,,23486447;27997040 +CYP2D6,imipramine,https://cpicpgx.org/guidelines/guideline-for-tricyclic-antidepressants-and-cyp2d6-and-cyp2c19/,B,Final,1A,Actionable PGx,23486447;27997040 +G6PD,mafenide,,B,Provisional,,Actionable PGx, +G6PD,mesalazine,,B,Provisional,,, +CYP2B6,methadone,,B,Provisional,2A,, +G6PD,methylene blue,,B,Provisional,3,Actionable PGx, +G6PD,moxifloxacin,,B,Provisional,,, +HPRT1,mycophenolic acid,,B,Provisional,,Actionable PGx, +G6PD,nalidixic acid,,B,Provisional,,Actionable PGx, +G6PD,nitrofurantoin,,B,Provisional,3,Actionable PGx, +G6PD,norfloxacin,,B,Provisional,,Actionable PGx, +G6PD,pegloticase,,B,Provisional,3,Testing required, +G6PD,phenazopyridine,,B,Provisional,3,, +CYP4F2,phenprocoumon,,B,Provisional,3,, +SCN1A,phenytoin,,B,Provisional,3,, +G6PD,primaquine,,B,Provisional,3,Testing required, +G6PD,probenecid,,B,Provisional,,Actionable PGx, +G6PD,quinine,,B,Provisional,,Actionable PGx, +CYP2D6,risperidone,,B,Provisional,1A,Informative PGx, +ABCG2,rosuvastatin,,B,Provisional,2A,, +CYP2C19,sertraline,https://cpicpgx.org/guidelines/guideline-for-selective-serotonin-reuptake-inhibitors-and-cyp2d6-and-cyp2c19/,B,Final,1A,,25974703 +G6PD,sodium nitrite,,B,Provisional,,Actionable PGx, +G6PD,sulfacetamide,,B,Provisional,,, +G6PD,sulfadiazine,,B,Provisional,,Actionable PGx, +G6PD,sulfamethoxazole / trimethoprim,,B,Provisional,4,Actionable PGx, +G6PD,sulfasalazine,,B,Provisional,3,Actionable PGx, +G6PD,sulfisoxazole,,B,Provisional,,, +CYP2C19,trimipramine,https://cpicpgx.org/guidelines/guideline-for-tricyclic-antidepressants-and-cyp2d6-and-cyp2c19/,B,Final,1A,,23486447;27997040 +CYP2D6,trimipramine,https://cpicpgx.org/guidelines/guideline-for-tricyclic-antidepressants-and-cyp2d6-and-cyp2c19/,B,Final,1A,Actionable PGx,23486447;27997040 +ABL2,valproic acid,,B,Provisional,,, +ASL,valproic acid,,B,Provisional,,, +ASS1,valproic acid,,B,Provisional,,, +CPS1,valproic acid,,B,Provisional,,, +NAGS,valproic acid,,B,Provisional,,, +OTC,valproic acid,,B,Provisional,,Actionable PGx, +HLA-A,allopurinol,,B/C,Provisional,2B,, +NAT2,amifampridine,,B/C,Provisional,,Actionable PGx, +NAT2,amifampridine phosphate,,B/C,Provisional,,Actionable PGx, +CYP2D6,amoxapine,,B/C,Provisional,,Actionable PGx, +CYP2D6,amphetamine,,B/C,Provisional,,Informative PGx, +CYP2D6,aripiprazole lauroxil,,B/C,Provisional,,Actionable PGx, +ADRB1,atenolol,,B/C,Provisional,,, +GRK5,atenolol,,B/C,Provisional,,, +CYP2C9,avatrombopag,,B/C,Provisional,,Informative PGx, +CYP2D6,brexpiprazole,,B/C,Provisional,,Actionable PGx, +ADRB1,bucindolol,,B/C,Provisional,3,, +CYP2B6,bupropion,,B/C,Provisional,2A,, +CYP2C19,carisoprodol,,B/C,Provisional,4,Actionable PGx, +ADRB1,carvedilol,,B/C,Provisional,3,, +CYP2D6,carvedilol,,B/C,Provisional,3,Actionable PGx, +GRK5,carvedilol,,B/C,Provisional,,, +CYP2D6,cevimeline,,B/C,Provisional,,Actionable PGx, 
+SLC6A4,citalopram,,B/C,Provisional,3,, +CYP2C19,clobazam,,B/C,Provisional,3,Actionable PGx, +CYP2D6,clozapine,,B/C,Provisional,,Actionable PGx, +G6PD,dabrafenib,,B/C,Provisional,,Actionable PGx, +SLC28A3,daunorubicin,,B/C,Provisional,,, +CYP2D6,deutetrabenazine,,B/C,Provisional,,Actionable PGx, +CYP2D6,dextromethorphan,,B/C,Provisional,3,, +CYP2C19,diazepam,,B/C,Provisional,3,Actionable PGx, +UGT1A1,dolutegravir,,B/C,Provisional,,Actionable PGx, +CYP2D6,donepezil,,B/C,Provisional,3,Actionable PGx, +SLC28A3,doxorubicin,,B/C,Provisional,,, +CYP2C9,dronabinol,,B/C,Provisional,,Actionable PGx, +SLCO1B1,elagolix,,B/C,Provisional,,Actionable PGx, +CYP2C9,erdafitinib,,B/C,Provisional,,Actionable PGx, +SLC6A4,escitalopram,,B/C,Provisional,3,, +CYP2D6,flecainide,,B/C,Provisional,1A,, +CYP2D6,gefitinib,,B/C,Provisional,3,Actionable PGx, +CYP2D6,haloperidol,,B/C,Provisional,1A,, +G6PD,hydroxychloroquine,,B/C,Provisional,,Actionable PGx, +CYP2D6,iloperidone,,B/C,Provisional,3,Actionable PGx, +CYP2D6,labetalol,,B/C,Provisional,,, +HLA-DRB1,lapatinib,,B/C,Provisional,3,Actionable PGx, +CYP2C9,lesinurad,,B/C,Provisional,,Actionable PGx, +G6PD,lidocaine,,B/C,Provisional,,, +MTHFR,l-methylfolate,,B/C,Provisional,3,, +CYP2D6,lofexidine,,B/C,Provisional,,Actionable PGx, +CYP2D6,meclizine,,B/C,Provisional,,Actionable PGx, +CYP2D6,metoclopramide,,B/C,Provisional,,Actionable PGx, +ADRB1,metoprolol,,B/C,Provisional,3,, +CYP2D6,metoprolol,,B/C,Provisional,1A,Informative PGx, +GRK5,metoprolol,,B/C,Provisional,,, +CYP2D6,mirabegron,,B/C,Provisional,,Actionable PGx, +CYP2D6,mirtazapine,,B/C,Provisional,2A,, +BCHE,mivacurium,,B/C,Provisional,,, +CYP2D6,nebivolol,,B/C,Provisional,,Informative PGx, +CYP2B6,nevirapine,,B/C,Provisional,2A,, +UGT1A1,nilotinib,,B/C,Provisional,3,Actionable PGx, +HLA-B,pazopanib,,B/C,Provisional,,Actionable PGx, +UGT1A1,pazopanib,,B/C,Provisional,3,Actionable PGx, +CYP2D6,perphenazine,,B/C,Provisional,,Actionable PGx, +NAT2,procainamide,,B/C,Provisional,,Informative PGx, +CYP2D6,propafenone,,B/C,Provisional,1A,Actionable PGx, +CYP2D6,propranolol,,B/C,Provisional,4,Informative PGx, +CYP2D6,protriptyline,,B/C,Provisional,,Actionable PGx, +UGT1A1,raltegravir,,B/C,Provisional,,Informative PGx, +BCHE,succinylcholine,,B/C,Provisional,3,Actionable PGx, +NAT2,sulfamethoxazole / trimethoprim,,B/C,Provisional,,Actionable PGx, +NAT2,sulfasalazine,,B/C,Provisional,,Actionable PGx, +CYP2D6,tamsulosin,,B/C,Provisional,,Actionable PGx, +CYP2D6,thioridazine,,B/C,Provisional,3,Actionable PGx, +CYP2D6,timolol,,B/C,Provisional,3,, +CYP2D6,valbenazine,,B/C,Provisional,,Actionable PGx, +CYP2D6,zuclopenthixol,,B/C,Provisional,1A,, +CYP2C9,aceclofenac,https://cpicpgx.org/guidelines/cpic-guideline-for-nsaids-based-on-cyp2c9-genotype/,C,Final,,,32189324 +TNF,adalimumab,,C,Provisional,,, +COMT,alfentanil,https://cpicpgx.org/guidelines/guideline-for-codeine-and-cyp2d6/,C,Final,,,33387367 +OPRM1,alfentanil,https://cpicpgx.org/guidelines/guideline-for-codeine-and-cyp2d6/,C,Final,3,,33387367 +HLA-C,allopurinol,,C,Provisional,2B,, +MC4R,amisulpride,,C,Provisional,3,, +MC4R,aripiprazole,,C,Provisional,3,, +CYP2C9,aspirin,https://cpicpgx.org/guidelines/cpic-guideline-for-nsaids-based-on-cyp2c9-genotype/,C,Final,,,32189324 +HLA-DPB1,aspirin,,C,Provisional,2B,, +CFTR,ataluren,,C,Provisional,3,, +COMT,buprenorphine,https://cpicpgx.org/guidelines/guideline-for-codeine-and-cyp2d6/,C,Final,3,,33387367 +OPRM1,buprenorphine,https://cpicpgx.org/guidelines/guideline-for-codeine-and-cyp2d6/,C,Final,3,,33387367 
+HLA-B,carbimazole,,C,Provisional,3,, +MTHFR,carboplatin,,C,Provisional,3,, +G6PD,chloroquine,,C,Provisional,4,Actionable PGx, +COMT,citalopram,,C,Provisional,,, +GRIK4,citalopram,,C,Provisional,,, +HTR2A,citalopram,,C,Provisional,3,, +CES1,clopidogrel,,C,Provisional,2B,, +HTR2C,clozapine,,C,Provisional,3,, +MC4R,clozapine,,C,Provisional,3,, +COMT,codeine,https://cpicpgx.org/guidelines/guideline-for-codeine-and-cyp2d6/,C,Final,,,33387367 +OPRM1,codeine,https://cpicpgx.org/guidelines/guideline-for-codeine-and-cyp2d6/,C,Final,3,,33387367 +CYP3A5,cyclosporine,,C,Provisional,3,, +HLA-B,dapsone,,C,Provisional,2A,, +CYP2D6,darifenacin,,C,Provisional,,Actionable PGx, +CYP2C8,diclofenac,https://cpicpgx.org/guidelines/cpic-guideline-for-nsaids-based-on-cyp2c9-genotype/,C,Final,3,,32189324 +CYP2C9,diclofenac,https://cpicpgx.org/guidelines/cpic-guideline-for-nsaids-based-on-cyp2c9-genotype/,C,Final,3,,32189324 +ABCB1,digoxin,,C,Provisional,3,, +CYP2D6,dolasetron,,C,Provisional,4,, +CYP2D6,duloxetine,,C,Provisional,,Actionable PGx, +F5,eltrombopag,,C,Provisional,,Actionable PGx, +COMT,escitalopram,,C,Provisional,,, +GRIK4,escitalopram,,C,Provisional,,, +CYP2C19,esomeprazole,https://cpicpgx.org/guidelines/cpic-guideline-for-proton-pump-inhibitors-and-cyp2c19/,C,Final,3,Actionable PGx,32770672 +TNF,etanercept,,C,Provisional,2B,, +ABCB1,fentanyl,,C,Provisional,3,, +COMT,fentanyl,https://cpicpgx.org/guidelines/guideline-for-codeine-and-cyp2d6/,C,Final,3,,33387367 +OPRM1,fentanyl,https://cpicpgx.org/guidelines/guideline-for-codeine-and-cyp2d6/,C,Final,3,,33387367 +CYP2D6,fesoterodine,,C,Provisional,,Actionable PGx, +CYP2C19,flibanserin,,C,Provisional,,Actionable PGx, +CYP2C9,flibanserin,,C,Provisional,,Actionable PGx, +CYP2D6,flibanserin,,C,Provisional,,Actionable PGx, +CYP2D6,fluoxetine,,C,Provisional,3,Informative PGx, +GRIK4,fluoxetine,,C,Provisional,,, +CYP2D6,galantamine,,C,Provisional,3,Informative PGx, +MC4R,haloperidol,,C,Provisional,3,, +F5,hormonal contraceptives for systemic use,,C,Provisional,1A,, +COMT,hydrocodone,https://cpicpgx.org/guidelines/guideline-for-codeine-and-cyp2d6/,C,Final,,,33387367 +OPRM1,hydrocodone,https://cpicpgx.org/guidelines/guideline-for-codeine-and-cyp2d6/,C,Final,3,,33387367 +COMT,hydromorphone,https://cpicpgx.org/guidelines/guideline-for-codeine-and-cyp2d6/,C,Final,,,33387367 +OPRM1,hydromorphone,https://cpicpgx.org/guidelines/guideline-for-codeine-and-cyp2d6/,C,Final,,,33387367 +CYP2C8,ibuprofen,https://cpicpgx.org/guidelines/cpic-guideline-for-nsaids-based-on-cyp2c9-genotype/,C,Final,3,,32189324 +CYP2C9,indomethacin,https://cpicpgx.org/guidelines/cpic-guideline-for-nsaids-based-on-cyp2c9-genotype/,C,Final,3,,32189324 +TNF,infliximab,,C,Provisional,3,, +NAT2,isoniazid,,C,Provisional,1B,, +HLA-DQA1,lapatinib,,C,Provisional,3,Actionable PGx, +COMT,levomethadone,https://cpicpgx.org/guidelines/guideline-for-codeine-and-cyp2d6/,C,Final,,,33387367 +OPRM1,levomethadone,https://cpicpgx.org/guidelines/guideline-for-codeine-and-cyp2d6/,C,Final,,,33387367 +CYP2C9,lumiracoxib,https://cpicpgx.org/guidelines/cpic-guideline-for-nsaids-based-on-cyp2c9-genotype/,C,Final,,,32189324 +COMT,methadone,https://cpicpgx.org/guidelines/guideline-for-codeine-and-cyp2d6/,C,Final,3,,33387367 +CYP2D6,methadone,https://cpicpgx.org/guidelines/guideline-for-codeine-and-cyp2d6/,C,Final,3,,33387367 +OPRM1,methadone,https://cpicpgx.org/guidelines/guideline-for-codeine-and-cyp2d6/,C,Final,3,,33387367 +HLA-B,methazolamide,,C,Provisional,2A,, +HLA-C,methazolamide,,C,Provisional,2B,, 
+HLA-B,methimazole,,C,Provisional,3,, +ABCB1,methotrexate,,C,Provisional,3,, +MTHFR,methotrexate,,C,Provisional,2A,, +SLCO1B1,methotrexate,,C,Provisional,3,, +CYP2D6,methylphenidate,,C,Provisional,3,, +CYP2D6,modafinil,,C,Provisional,,Actionable PGx, +COMT,morphine,https://cpicpgx.org/guidelines/guideline-for-codeine-and-cyp2d6/,C,Final,3,,33387367 +OPRM1,morphine,https://cpicpgx.org/guidelines/guideline-for-codeine-and-cyp2d6/,C,Final,3,,33387367 +CYP2C9,nabumetone,https://cpicpgx.org/guidelines/cpic-guideline-for-nsaids-based-on-cyp2c9-genotype/,C,Final,,,32189324 +OPRM1,naloxone,,C,Provisional,3,, +COMT,naltrexone,https://cpicpgx.org/guidelines/guideline-for-codeine-and-cyp2d6/,C,Final,,,33387367 +OPRM1,naltrexone,https://cpicpgx.org/guidelines/guideline-for-codeine-and-cyp2d6/,C,Final,4,,33387367 +CYP2C9,naproxen,https://cpicpgx.org/guidelines/cpic-guideline-for-nsaids-based-on-cyp2c9-genotype/,C,Final,3,,32189324 +ABCB1,nevirapine,,C,Provisional,3,, +HLA-B,nevirapine,,C,Provisional,3,, +HLA-DRB1,nevirapine,,C,Provisional,2B,, +HTR2C,olanzapine,,C,Provisional,3,, +MC4R,olanzapine,,C,Provisional,3,, +HLA-A,oxcarbazepine,https://cpicpgx.org/guidelines/guideline-for-carbamazepine-and-hla-b/,C,Final,3,,29392710 +COMT,oxycodone,https://cpicpgx.org/guidelines/guideline-for-codeine-and-cyp2d6/,C,Final,3,,33387367 +CYP2D6,oxycodone,https://cpicpgx.org/guidelines/guideline-for-codeine-and-cyp2d6/,C,Final,2A,,33387367 +OPRM1,oxycodone,https://cpicpgx.org/guidelines/guideline-for-codeine-and-cyp2d6/,C,Final,3,,33387367 +MC4R,paliperidone,,C,Provisional,3,, +CYP2D6,palonosetron,,C,Provisional,,Informative PGx, +GRIK4,paroxetine,,C,Provisional,,, +SLCO1B1,pravastatin,,C,Provisional,2A,, +HLA-B,propylthiouracil,,C,Provisional,3,, +MC4R,quetiapine,,C,Provisional,3,, +CYP2D6,quinidine,,C,Provisional,,Informative PGx, +CYP2D6,quinine,,C,Provisional,,Actionable PGx, +CYP2C19,rabeprazole,https://cpicpgx.org/guidelines/cpic-guideline-for-proton-pump-inhibitors-and-cyp2c19/,C,Final,2A,Actionable PGx,32770672 +COMT,remifentanil,https://cpicpgx.org/guidelines/guideline-for-codeine-and-cyp2d6/,C,Final,3,,33387367 +OPRM1,remifentanil,https://cpicpgx.org/guidelines/guideline-for-codeine-and-cyp2d6/,C,Final,3,,33387367 +DRD2,risperidone,,C,Provisional,3,, +HTR2C,risperidone,,C,Provisional,3,, +MC4R,risperidone,,C,Provisional,3,, +CYP2C8,rosiglitazone,,C,Provisional,3,, +SLCO1B1,rosuvastatin,,C,Provisional,2A,Actionable PGx, +CYP2D6,sertraline,,C,Provisional,3,, +GRIK4,sertraline,,C,Provisional,,, +ABCB1,simvastatin,,C,Provisional,3,, +CYP3A5,sirolimus,,C,Provisional,3,, +COMT,sufentanil,https://cpicpgx.org/guidelines/guideline-for-codeine-and-cyp2d6/,C,Final,3,,33387367 +OPRM1,sufentanil,https://cpicpgx.org/guidelines/guideline-for-codeine-and-cyp2d6/,C,Final,3,,33387367 +CYP3A4,tacrolimus,,C,Provisional,1B,, +DPYD,tegafur,https://cpicpgx.org/guidelines/guideline-for-fluoropyrimidines-and-dpyd/,C,Final,1A,,23988873;29152729 +CYP2D6,terbinafine,,C,Provisional,,Informative PGx, +CYP2D6,tolterodine,,C,Provisional,3,Actionable PGx, +COMT,tramadol,https://cpicpgx.org/guidelines/guideline-for-codeine-and-cyp2d6/,C,Final,3,,33387367 +OPRM1,tramadol,https://cpicpgx.org/guidelines/guideline-for-codeine-and-cyp2d6/,C,Final,3,,33387367 +GRIK4,venlafaxine,,C,Provisional,,, +G6PD,vitamin c,,C,Provisional,,, +MC4R,ziprasidone,,C,Provisional,,, +ITPA,"interferon alfa-2b, recombinant",,C/D,Provisional,3,, +CYP3A5,midazolam,,C/D,Provisional,3,, +COMT,nicotine,,C/D,Provisional,3,, +ABCB1,ondansetron,,C/D,Provisional,3,, 
+UGT2B15,oxazepam,,C/D,Provisional,3,, +FDPS,alendronate,,D,Provisional,,, +GP1BA,aspirin,,D,Provisional,3,, +LTC4S,aspirin,,D,Provisional,3,, +PTGS1,aspirin,,D,Provisional,3,, +APOE,atorvastatin,,D,Provisional,2B,, +CETP,atorvastatin,,D,Provisional,4,, +COQ2,atorvastatin,,D,Provisional,3,, +KIF6,atorvastatin,,D,Provisional,3,, +LDLR,atorvastatin,,D,Provisional,,Informative PGx, +LPA,atorvastatin,,D,Provisional,,, +CRHR1,budesonide,,D,Provisional,,, +ADORA2A,caffeine,,D,Provisional,3,, +TYMS,capecitabine,,D,Provisional,3,, +ACE,captopril,,D,Provisional,2A,, +EPHX1,carbamazepine,,D,Provisional,3,, +CETP,cerivastatin,,D,Provisional,,, +LPA,cerivastatin,,D,Provisional,,, +EGF,cetuximab,,D,Provisional,3,, +ACYP2,cisplatin,,D,Provisional,3,, +ERCC1,cisplatin,,D,Provisional,3,, +GSTM1,cisplatin,,D,Provisional,3,, +NQO1,cisplatin,,D,Provisional,,, +XPC,cisplatin,,D,Provisional,3,, +FKBP5,citalopram,,D,Provisional,3,, +ANKK1,clozapine,,D,Provisional,,, +GSTP1,cyclophosphamide,,D,Provisional,3,, +NQO1,cyclophosphamide,,D,Provisional,3,, +SOD2,cyclophosphamide,,D,Provisional,3,, +CBR3,daunorubicin,,D,Provisional,,, +HAS3,daunorubicin,,D,Provisional,,, +CBR3,doxorubicin,,D,Provisional,3,, +HAS3,doxorubicin,,D,Provisional,,, +NQO1,doxorubicin,,D,Provisional,,, +SERPINC1,eltrombopag,,D,Provisional,,Actionable PGx, +CBR3,epirubicin,,D,Provisional,3,, +GSTP1,epirubicin,,D,Provisional,3,, +HAS3,epirubicin,,D,Provisional,,, +NQO1,epirubicin,,D,Provisional,3,, +DYNC2H1,etoposide,,D,Provisional,3,, +GSTP1,fluorouracil,,D,Provisional,3,, +NQO1,fluorouracil,,D,Provisional,3,, +TYMS,fluorouracil,,D,Provisional,3,, +UMPS,fluorouracil,,D,Provisional,3,, +FKBP5,fluoxetine,,D,Provisional,3,, +CRHR1,fluticasone propionate,,D,Provisional,,, +CRHR1,fluticasone/salmeterol,,D,Provisional,,, +CETP,fluvastatin,,D,Provisional,3,, +LPA,fluvastatin,,D,Provisional,,, +ADD1,furosemide,,D,Provisional,3,, +NT5C2,gemcitabine,,D,Provisional,3,, +NEDD4L,hydrochlorothiazide,,D,Provisional,3,, +PRKCA,hydrochlorothiazide,,D,Provisional,3,, +YEATS4,hydrochlorothiazide,,D,Provisional,3,, +CBR3,idarubicin,,D,Provisional,,, +HAS3,idarubicin,,D,Provisional,,, +C8orf34,irinotecan,,D,Provisional,3,, +SEMA3C,irinotecan,,D,Provisional,3,, +UGT1A4,lamotrigine,,D,Provisional,3,, +PTGFR,latanoprost,,D,Provisional,3,, +C11orf65,metformin,,D,Provisional,4,, +ATIC,methotrexate,,D,Provisional,2B,, +MTRR,methotrexate,,D,Provisional,3,, +CYB5R1,metoclopramide,,D,Provisional,,Actionable PGx, +CYB5R2,metoclopramide,,D,Provisional,,Actionable PGx, +CYB5R3,metoclopramide,,D,Provisional,,Actionable PGx, +CYB5R4,metoclopramide,,D,Provisional,,Actionable PGx, +FKBP5,mirtazapine,,D,Provisional,3,, +CCHCR1,nevirapine,,D,Provisional,3,, +CHRNA3,nicotine,,D,Provisional,3,, +ANKK1,olanzapine,,D,Provisional,3,, +GSTM1,oxaliplatin,,D,Provisional,3,, +GSTP1,oxaliplatin,,D,Provisional,3,, +FKBP5,paroxetine,,D,Provisional,3,, +HTR1A,paroxetine,,D,Provisional,3,, +VDR,peginterferon alfa-2b,,D,Provisional,3,, +CETP,pravastatin,,D,Provisional,3,, +HMGCR,pravastatin,,D,Provisional,3,, +KIF6,pravastatin,,D,Provisional,2B,, +LPA,pravastatin,,D,Provisional,,, +FDPS,raloxifene,,D,Provisional,,, +VDR,ribavirin,,D,Provisional,3,, +FDPS,risedronate,,D,Provisional,,, +ANKK1,risperidone,,D,Provisional,3,, +FCGR3A,rituximab,,D,Provisional,2B,, +COQ2,rosuvastatin,,D,Provisional,3,, +LPA,rosuvastatin,,D,Provisional,3,, +ADRB2,salbutamol,,D,Provisional,3,, +COL22A1,salbutamol,,D,Provisional,3,, +CRHR2,salbutamol,,D,Provisional,3,, +ADRB2,salmeterol,,D,Provisional,2A,, 
+GNB3,sildenafil,,D,Provisional,3,, +CETP,simvastatin,,D,Provisional,3,, +HMGCR,simvastatin,,D,Provisional,3,, +LPA,simvastatin,,D,Provisional,,, +ADD1,spironolactone,,D,Provisional,3,, +ABCC4,tenofovir,,D,Provisional,3,, +CRHR1,triamcinolone,,D,Provisional,,, +FKBP5,venlafaxine,,D,Provisional,3,, +CALU,warfarin,,D,Provisional,3,, +GGCX,warfarin,,D,Provisional,3,, +PROC,warfarin,,D,Provisional,,Actionable PGx, +PROS1,warfarin,,D,Provisional,,Actionable PGx, \ No newline at end of file diff --git a/pypgx/api/genotype.py b/pypgx/api/genotype.py index 18412bc2..28b62815 100644 --- a/pypgx/api/genotype.py +++ b/pypgx/api/genotype.py @@ -82,6 +82,23 @@ def _call_multiplication(r): return result +def _call_linked_allele(r, linked, target): + """ + Call linked star allele. + """ + a1, a2 = r.Haplotype1[0], r.Haplotype2[0] + h1 = linked in r.Haplotype1 + h2 = linked in r.Haplotype2 + if h1 and h2: + result = [a1, target] + elif h1 and not h2: + result = [a2, target] + elif not h1 and h2: + result = [a1, target] + else: + result = ['Indeterminate'] + return result + ############################### # Public classes and methods # ############################### @@ -162,7 +179,7 @@ class CYP2D6Genotyper: def one_row(self, r): a1, a2 = r.Haplotype1[0], r.Haplotype2[0] s1, s2 = core.sort_alleles([a1, a2], by='priority', gene=self.gene, assembly=self.assembly) - if r.CNV in ['Normal', 'AssumeNormal']: + if r.CNV in ['Normal', 'AssumeNormal', 'PseudogeneDeletion']: result = [a1, a2] elif r.CNV == 'DeletionHom': result = ['*5', '*5'] @@ -273,9 +290,9 @@ def one_row(self, r): a1, a2 = r.Haplotype1[0], r.Haplotype2[0] if r.CNV in ['Normal', 'AssumeNormal']: result = [a1, a2] - elif r.CNV == 'PartialDuplication': - h1 = '*4'in r.Haplotype1 and '*7' in r.Haplotype1 - h2 = '*4'in r.Haplotype2 and '*7' in r.Haplotype2 + elif r.CNV == 'PartialDuplicationHet': + h1 = '*7' in r.Haplotype1 + h2 = '*7' in r.Haplotype2 if h1 and h2: result = [a1, '*S1'] elif h1 and not h2: @@ -284,6 +301,8 @@ def one_row(self, r): result = [a1, '*S1'] else: result = ['Indeterminate'] + elif r.CNV == 'PartialDuplicationHom': + result = ['*S1', '*S1'] elif r.CNV in ['Duplication1', 'Duplication2']: result = _call_duplication(r) elif r.CNV == 'Multiplication': @@ -417,6 +436,12 @@ def one_row(self, r): result = [a1, '*S2'] else: result = ['Indeterminate'] + elif r.CNV == 'Intron9Deletion,Exon11Deletion': + if (('*3' in r.Haplotype1 or '*3' in r.Haplotype2) and + ('*K432Q' in r.Haplotype1 or '*K432Q' in r.Haplotype2)): + result = ['*S1', '*S2'] + else: + result = ['Indeterminate'] else: result = ['Indeterminate'] return '/'.join(core.sort_alleles(result, by='name')) @@ -463,21 +488,11 @@ def one_row(self, r): if r.CNV in ['Normal', 'AssumeNormal']: result = [a1, a2] elif r.CNV == 'Intron1DeletionA': - if a1 == a2: - result = [a1, '*S1'] - else: - result = ['Indeterminate'] + result = _call_linked_allele(r, '*1', '*S1') elif r.CNV == 'Intron1DeletionB': - h1 = '*1'in r.Haplotype1 - h2 = '*1'in r.Haplotype2 - if h1 and h2: - result = [a1, '*S2'] - elif h1 and not h2: - result = [a2, '*S2'] - elif not h1 and h2: - result = [a1, '*S2'] - else: - result = ['Indeterminate'] + result = _call_linked_allele(r, '*1', '*S2') + elif r.CNV == 'Intron1PartialDup': + result = _call_linked_allele(r, '*1', '*S3') else: result = ['Indeterminate'] return '/'.join(core.sort_alleles(result, by='name')) @@ -501,6 +516,10 @@ def one_row(self, r): result = [a1, '*S1'] elif r.CNV == 'PartialDeletion2': result = [a1, '*S2'] + elif r.CNV == 'PartialDeletion3': + 
result = [a1, '*S3'] + elif r.CNV == 'Deletion': + result = [a1, '*S4'] else: result = ['Indeterminate'] return '/'.join(core.sort_alleles(result, by='name')) @@ -516,14 +535,16 @@ class UGT2B17Genotyper: """ def one_row(self, r): - if r.CNV == 'DeletionHet': + if r.CNV == 'Normal,Deletion': result = ['*1', '*2'] - elif r.CNV == 'DeletionHom': + elif r.CNV == 'Deletion,Deletion': result = ['*2', '*2'] - elif r.CNV in ['Normal', 'AssumeNormal']: + elif r.CNV in ['Normal,Normal', 'AssumeNormal']: result = ['*1', '*1'] - elif r.CNV == 'PartialDeletionHet': + elif r.CNV == 'Deletion,PartialDeletion1': result = ['*2', '*S1'] + elif r.CNV == 'Deletion,PartialDeletion2': + result = ['*2', '*S2'] else: result = ['Indeterminate'] return '/'.join(core.sort_alleles(result, by='name')) diff --git a/pypgx/api/pipeline.py b/pypgx/api/pipeline.py index 74492c5b..c1b3f858 100644 --- a/pypgx/api/pipeline.py +++ b/pypgx/api/pipeline.py @@ -12,11 +12,11 @@ from . import utils, plot, genotype, core def run_chip_pipeline( - gene, output, variants, panel=None, assembly='GRCh37', impute=False, + gene, output, variants, assembly='GRCh37', panel=None, impute=False, force=False, samples=None, exclude=False ): """ - Run PyPGx's genotyping pipeline for chip data. + Run genotyping pipeline for chip data. Parameters ---------- @@ -24,7 +24,7 @@ def run_chip_pipeline( Target gene. output : str Output directory. - variants : str, optional + variants : str Input VCF file must be already BGZF compressed (.gz) and indexed (.tbi) to allow random access. Statistical haplotype phasing will be skipped if input VCF is already fully phased. @@ -46,7 +46,7 @@ def run_chip_pipeline( If True, exclude specified samples. """ if not core.is_target_gene(gene): - raise core.NotTargetGeneError(gene) + raise sdk.utils.NotTargetGeneError(gene) if os.path.exists(output) and force: shutil.rmtree(output) @@ -80,6 +80,56 @@ def run_chip_pipeline( ) results.to_file(f'{output}/results.zip') +def run_long_read_pipeline( + gene, output, variants, assembly='GRCh37', force=False, samples=None, + exclude=False +): + """ + Run genotyping pipeline for long-read sequencing data. + + Parameters + ---------- + gene : str + Target gene. + output : str + Output directory. + variants : str + Input VCF file must be already BGZF compressed (.gz) and indexed + (.tbi) to allow random access. + assembly : {'GRCh37', 'GRCh38'}, default: 'GRCh37' + Reference genome assembly. + force : bool, default : False + Overwrite output directory if it already exists. + samples : str or list, optional + Subset the VCF for specified samples. This can be a text file (.txt, + .tsv, .csv, or .list) containing one sample per line. Alternatively, + you can provide a list of samples. + exclude : bool, default: False + If True, exclude specified samples. 
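+
+    Examples
+    --------
+    Below is a minimal sketch of how this pipeline might be invoked from
+    the Python API, assuming an indexed BGZF-compressed VCF; the input
+    file name and output directory here are purely hypothetical:
+
+    >>> from pypgx.api import pipeline
+    >>> pipeline.run_long_read_pipeline(
+    ...     'CYP2D6', 'CYP2D6-longread', 'longread-variants.vcf.gz',
+    ...     assembly='GRCh38'
+    ... )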
+ """ + if not core.is_target_gene(gene): + raise sdk.utils.NotTargetGeneError(gene) + + if os.path.exists(output) and force: + shutil.rmtree(output) + + os.mkdir(output) + + consolidated_variants = utils.import_variants(gene, variants, + assembly=assembly, platform='LongRead', samples=samples, + exclude=exclude) + consolidated_variants.to_file(f'{output}/consolidated-variants.zip') + alleles = utils.predict_alleles(consolidated_variants) + alleles.to_file(f'{output}/alleles.zip') + genotypes = genotype.call_genotypes(alleles=alleles) + genotypes.to_file(f'{output}/genotypes.zip') + phenotypes = utils.call_phenotypes(genotypes) + phenotypes.to_file(f'{output}/phenotypes.zip') + results = utils.combine_results( + genotypes=genotypes, phenotypes=phenotypes, alleles=alleles + ) + results.to_file(f'{output}/results.zip') + def run_ngs_pipeline( gene, output, variants=None, depth_of_coverage=None, control_statistics=None, platform='WGS', assembly='GRCh37', panel=None, @@ -88,7 +138,7 @@ def run_ngs_pipeline( cnv_caller=None ): """ - Run PyPGx's genotyping pipeline for NGS data. + Run genotyping pipeline for NGS data. During copy number analysis, if the input data is targeted sequencing, the method will apply inter-sample normalization using summary statistics @@ -138,7 +188,7 @@ def run_ngs_pipeline( used. """ if not core.is_target_gene(gene): - raise core.NotTargetGeneError(gene) + raise sdk.utils.NotTargetGeneError(gene) gene_table = core.load_gene_table() small_var = gene_table[gene_table.Gene == gene].Variants.values[0] diff --git a/pypgx/api/utils.py b/pypgx/api/utils.py index 770a6556..9c6fdabb 100644 --- a/pypgx/api/utils.py +++ b/pypgx/api/utils.py @@ -25,6 +25,116 @@ # Private methods # ################### +def _phase_extension(vf, gene, assembly): + """ + Apply the phase-extension algorithm. + + Anchor variants are those variants that have been haplotype phased + using a reliable method (e.g. statistical haplotype phasing and + read-backed phasing) and are later used by the phase-extension + algorithm (PE). Basically, PE determines the most likely haplotype + phase of the remaining unphased variants using anchor variants. + For each unphased variant, PE first finds all star alleles carrying the + variant and then counts how many anchor variants per haplotype are + overlapped to each of the star alleles. For example, if the second + haplotype's anchor variants (i.e. variants with '0|1') were found to + have the most overlapping with the *2 allele, then PE will assign the + phase of the variant of interest to '0|1'. 
+ """ + anchors = {} + + for i, r in vf.df.iterrows(): + for allele in r.ALT.split(','): + variant = f'{r.CHROM}-{r.POS}-{r.REF}-{allele}' + for sample in vf.samples: + if sample not in anchors: + anchors[sample] = [[], []] + gt = r[sample].split(':')[0] + if '|' not in gt: + continue + gt = gt.split('|') + if gt[0] != '0': + anchors[sample][0].append(variant) + if gt[1] != '0': + anchors[sample][1].append(variant) + + variant_synonyms = core.get_variant_synonyms(gene, assembly=assembly) + + def one_row(r): + if pyvcf.row_phased(r): + return r + + r.FORMAT += ':PE' + + for sample in vf.samples: + if not pyvcf.gt_het(r[sample]): + r[sample] = pyvcf.gt_pseudophase(r[sample]) + ':0,0,0,0' + continue + + scores = [[0, 0], [0, 0]] + + gt = r[sample].split(':')[0].split('/') + + for i in [0, 1]: + if gt[i] == '0': + continue + + alt_allele = r.ALT.split(',')[int(gt[i]) - 1] + + variant = f'{r.CHROM}-{r.POS}-{r.REF}-{alt_allele}' + + if variant in variant_synonyms: + variant = variant_synonyms[variant] + + star_alleles = core.list_alleles(gene, variants=variant, assembly=assembly) + + for j in [0, 1]: + for star_allele in star_alleles: + score = 0 + for x in anchors[sample][j]: + if x in core.list_variants(gene, alleles=star_allele, assembly=assembly, mode='all'): + score += 1 + if score > scores[i][j]: + scores[i][j] = score + + a = scores[0][0] + b = scores[0][1] + c = scores[1][0] + d = scores[1][1] + + if max([a, b]) == max([c, d]): + if a < b and c > d: + flip = True + elif a == b and c > d: + flip = True + elif a < b and c == d: + flip = True + else: + flip = False + else: + if max([a, b]) > max([c, d]): + if a > b: + flip = False + else: + flip = True + else: + if c > d: + flip = True + else: + flip = False + + if flip: + result = f'{gt[1]}|{gt[0]}' + else: + result = f'{gt[0]}|{gt[1]}' + + result = result + ':' + ':'.join(r[sample].split(':')[1:]) + r[sample] = result + ':' + ','.join([str(x) for x in scores[0] + scores[1]]) + + return r + + return pyvcf.VcfFrame([], vf.df.apply(one_row, axis=1)) + def _process_copy_number(copy_number): df = copy_number.data.copy_df() region = core.get_region(copy_number.metadata['Gene'], assembly=copy_number.metadata['Assembly']) @@ -51,7 +161,7 @@ def _process_copy_number(copy_number): def call_phenotypes(genotypes): """ - Call phenotypes for the target gene. + Call phenotypes for target gene. Parameters ---------- @@ -226,47 +336,43 @@ def show_comparison(col): show_comparison(col) def compute_control_statistics( - bam=None, fn=None, gene=None, region=None, assembly='GRCh37', bed=None + gene, bams, assembly='GRCh37', bed=None ): """ - Compute summary statistics for the control gene from BAM files. + Compute summary statistics for control gene from BAM files. + + Note that for the arguments ``gene`` and ``bed``, the 'chr' prefix in + contig names (e.g. 'chr1' vs. '1') will be automatically added or removed + as necessary to match the input BAM's contig names. Parameters ---------- - bam : list, optional - One or more BAM files. Cannot be used with ``fn``. - fn : str, optional - File containing one BAM file per line. Cannot be used with ``bam``. - gene : str, optional - Control gene (recommended choices: 'EGFR', 'RYR1', 'VDR'). Cannot be - used with ``region``. - region : str, optional - Custom region to use as control gene ('chrom:start-end'). Cannot be - used with ``gene``. + gene : str + Control gene (recommended choices: 'EGFR', 'RYR1', 'VDR'). + Alternatively, you can provide a custom region (format: + chrom:start-end). 
+ bams : str or list + One or more input BAM files. Alternatively, you can provide a text + file (.txt, .tsv, .csv, or .list) containing one BAM file per line. assembly : {'GRCh37', 'GRCh38'}, default: 'GRCh37' Reference genome assembly. bed : str, optional By default, the input data is assumed to be WGS. If it's targeted sequencing, you must provide a BED file to indicate probed regions. - Note that the 'chr' prefix in BED contig names (e.g. 'chr1' vs. '1') - will be automatically added or removed as necessary to match the BAM - contig names. Returns ------- pypgx.Archive Archive object with the semantic type SampleTable[Statistcs]. """ - bam_files, bam_prefix = sdk.parse_input_bams(bam=bam, fn=fn) - - df = core.load_gene_table() + gene_table = core.load_gene_table() - if gene is not None: - region = df[df.Gene == gene][f'{assembly}Region'].values[0] + if gene in core.list_genes(mode='all'): + region = gene_table[gene_table.Gene == gene][f'{assembly}Region'].values[0] + else: + region = gene - cf = pycov.CovFrame.from_bam( - bam=bam_files, region=f'{bam_prefix}{region}', zero=False - ) + cf = pycov.CovFrame.from_bam(bams, regions=region, zero=False) metadata = { 'Control': gene, @@ -302,7 +408,7 @@ def compute_copy_number( read_depth, control_statistics, samples_without_sv=None ): """ - Compute copy number from read depth for the target gene. + Compute copy number from read depth for target gene. The method will convert read depth from target gene to copy number by performing intra-sample normalization using summary statistics from @@ -366,13 +472,10 @@ def compute_copy_number( return sdk.Archive(metadata, cf) def compute_target_depth( - gene, bam=None, fn=None, assembly='GRCh37', bed=None + gene, bams, assembly='GRCh37', bed=None ): """ - Compute read depth for the target gene from BAM files. - - Input BAM files must be specified with either ``bam`` or ``fn``, but - it's an error to use both. + Compute read depth for target gene from BAM files. By default, the input data is assumed to be WGS. If it's targeted sequencing, you must provide a BED file with ``bed`` to indicate @@ -382,10 +485,9 @@ def compute_target_depth( ---------- gene : str Target gene. - bam : list, optional - One or more BAM files. - fn : str, optional - File containing one BAM file per line. + bams : str or list + One or more input BAM files. Alternatively, you can provide a text + file (.txt, .tsv, .csv, or .list) containing one BAM file per line. assembly : {'GRCh37', 'GRCh38'}, default: 'GRCh37' Reference genome assembly. bed : str, optional @@ -402,13 +504,9 @@ def compute_target_depth( 'SemanticType': 'CovFrame[ReadDepth]', } - bam_files, bam_prefix = sdk.parse_input_bams(bam=bam, fn=fn) - region = core.get_region(gene, assembly=assembly) - data = pycov.CovFrame.from_bam( - bam=bam_files, region=f'{bam_prefix}{region}', zero=True - ) + data = pycov.CovFrame.from_bam(bams, regions=region, zero=True) if bed: metadata['Platform'] = 'Targeted' @@ -504,125 +602,35 @@ def create_consolidated_vcf(imported_variants, phased_variants): vf1 = imported_variants.data.strip(format) vf2 = phased_variants.data.strip('GT') + # For every variant in VcfFrame[Phased] (e.g. '0|1'), find and append its + # accompanying data from VcfFrame[Imported] (e.g. '0|1:15,15:30:0.5'). 
def one_row(r): variant = f'{r.CHROM}-{r.POS}-{r.REF}-{r.ALT}' s = vf1.fetch(variant) - if s.empty: return r - def one_gt(g): return ':'.join(g.split(':')[1:]) - s[9:] = s[9:].apply(one_gt) r[9:] = r[9:].str.cat(s[9:], sep=':') - return r vf3 = pyvcf.VcfFrame([], vf2.df.apply(one_row, axis=1)) vf3.df.INFO = 'Phased' vf3.df.FORMAT = format + # Remove variants that are in both VcfFrame[Imported] and + # VcfFrame[Phased]. Append remaining unphased variants to + # VcfFrame[Phased]. vf4 = vf1.filter_vcf(vf2, opposite=True) vf5 = pyvcf.VcfFrame([], pd.concat([vf3.df, vf4.df])).sort() - anchors = {} - - for i, r in vf2.df.iterrows(): - for allele in r.ALT.split(','): - variant = f'{r.CHROM}-{r.POS}-{r.REF}-{allele}' - for sample in vf2.samples: - if sample not in anchors: - anchors[sample] = [[], []] - gt = r[sample].split(':')[0].split('|') - if gt[0] != '0': - anchors[sample][0].append(variant) - if gt[1] != '0': - anchors[sample][1].append(variant) - - variant_synonyms = core.get_variant_synonyms(gene, assembly=assembly) - - def one_row(r): - if 'Phased' in r.INFO: - return r - - r.FORMAT += ':PE' - - for sample in vf5.samples: - if not pyvcf.gt_het(r[sample]): - r[sample] = pyvcf.gt_pseudophase(r[sample]) + ':0,0,0,0' - continue - - scores = [[0, 0], [0, 0]] - - gt = r[sample].split(':')[0].split('/') - - for i in [0, 1]: - if gt[i] == '0': - continue - - alt_allele = r.ALT.split(',')[int(gt[i]) - 1] - - variant = f'{r.CHROM}-{r.POS}-{r.REF}-{alt_allele}' - - if variant in variant_synonyms: - variant = variant_synonyms[variant] - - star_alleles = core.list_alleles(gene, variants=variant, assembly=assembly) - - for j in [0, 1]: - for star_allele in star_alleles: - score = 0 - for x in anchors[sample][j]: - if x in core.list_variants(gene, alleles=star_allele, assembly=assembly, mode='all'): - score += 1 - if score > scores[i][j]: - scores[i][j] = score - - a = scores[0][0] - b = scores[0][1] - c = scores[1][0] - d = scores[1][1] - - if max([a, b]) == max([c, d]): - if a < b and c > d: - flip = True - elif a == b and c > d: - flip = True - elif a < b and c == d: - flip = True - else: - flip = False - else: - if max([a, b]) > max([c, d]): - if a > b: - flip = False - else: - flip = True - else: - if c > d: - flip = True - else: - flip = False - - if flip: - result = f'{gt[1]}|{gt[0]}' - else: - result = f'{gt[0]}|{gt[1]}' - - result = result + ':' + ':'.join(r[sample].split(':')[1:]) - r[sample] = result + ':' + ','.join([str(x) for x in scores[0] + scores[1]]) - - return r - - vf5.df = vf5.df.apply(one_row, axis=1) + vf6 = _phase_extension(vf5, gene, assembly) metadata = phased_variants.copy_metadata() metadata['SemanticType'] = 'VcfFrame[Consolidated]' - result = sdk.Archive(metadata, vf5) - - return result + return sdk.Archive(metadata, vf6) def create_regions_bed( assembly='GRCh37', add_chr_prefix=False, merge=False, sv_genes=False @@ -671,14 +679,13 @@ def estimate_phase_beagle( """ Estimate haplotype phase of observed variants with the Beagle program. - The 'chr' prefix in contig names (e.g. 'chr1' vs. '1') in the input VCF - will be automatically added or removed as necessary to match that of the - reference VCF. - Parameters ---------- imported_variants : str or pypgx.Archive - Archive file or object with the semantic type VcfFrame[Imported]. + Archive file or object with the semantic type VcfFrame[Imported]. The + 'chr' prefix in contig names (e.g. 'chr1' vs. '1') will be + automatically added or removed as necessary to match the reference + VCF's contig names. 
panel : str, optional VCF file corresponding to a reference haplotype panel (compressed or uncompressed). By default, the 1KGP panel in the ``~/pypgx-bundle`` @@ -767,6 +774,9 @@ def filter_samples(archive, samples, exclude=False): 'CovFrame' in archive.metadata['SemanticType']): data = archive.data.subset(samples, exclude=exclude) elif 'SampleTable' in archive.metadata['SemanticType']: + if exclude: + samples = [x for x in archive.data.index.to_list() + if x not in samples] data = archive.data.loc[samples] else: pass @@ -822,13 +832,11 @@ def import_variants( gene, vcf, assembly='GRCh37', platform='WGS', samples=None, exclude=False ): """ - Import variant (SNV/indel) data for the target gene. + Import SNV/indel data for target gene. - The method will first slice input VCF for the target gene and then assess - whether every genotype call in the sliced VCF is haplotype phased. It - will return an archive object with the semantic type - VcfFrame[Consolidated] if the VCF is fully phased or otherwise - VcfFrame[Imported]. + The method will slice the input VCF for the target gene to create an + archive object with the semantic type VcfFrame[Imported] or + VcfFrame[Consolidated]. Parameters ---------- @@ -840,8 +848,15 @@ def import_variants( VcfFrame object. assembly : {'GRCh37', 'GRCh38'}, default: 'GRCh37' Reference genome assembly. - platform : {'WGS', 'Targeted', 'Chrip'}, default: 'WGS' - Genotyping platform. + platform : {'WGS', 'Targeted', 'Chip', 'LongRead'}, default: 'WGS' + Genotyping platform used. When the platform is 'WGS', 'Targeted', or + 'Chip', the method will assess whether every genotype call in the + sliced VCF is haplotype phased (e.g. '0|1'). If the sliced VCF is + fully phased, the method will return VcfFrame[Consolidated] or + otherwise VcfFrame[Imported]. When the platform is 'LongRead', the + method will return VcfFrame[Consolidated] after applying the + phase-extension algorithm to estimate haplotype phase of any variants + that could not be resolved by read-backed phasing. samples : str or list, optional Specify which samples should be included for analysis by providing a text file (.txt, .tsv, .csv, or .list) containing one sample per @@ -870,11 +885,15 @@ def import_variants( samples = common.parse_list_or_file(samples) vf = vf.subset(samples, exclude=exclude) - if vf.phased: + if platform == 'LongRead': + vf = _phase_extension(vf, gene, assembly) semantic_type = 'VcfFrame[Consolidated]' else: - vf = vf.unphase() - semantic_type = 'VcfFrame[Imported]' + if vf.phased: + semantic_type = 'VcfFrame[Consolidated]' + else: + vf = vf.unphase() + semantic_type = 'VcfFrame[Imported]' metadata = { 'Platform': platform, @@ -915,7 +934,7 @@ def predict_alleles(consolidated_variants): reformatted_variants = {} - for x in consolidated_variants.data.variants(): + for x in consolidated_variants.data.to_variants(): if x in variant_synonyms: y = variant_synonyms[x] if y in reformatted_variants: @@ -1003,7 +1022,7 @@ def one_row(r, sample, i): def predict_cnv(copy_number, cnv_caller=None): """ - Predict CNV for the target gene based on copy number data. + Predict CNV from copy number data for target gene. Genomic positions that are missing copy number because, for example, the input data is targeted sequencing will be imputed with forward filling. 
@@ -1041,40 +1060,40 @@ def predict_cnv(copy_number, cnv_caller=None): cnv_caller.check_type('Model[CNV]') copy_number = _process_copy_number(copy_number) - df = copy_number.data.df.iloc[:, 2:] X = df.T.to_numpy() predictions = cnv_caller.data.predict(X) - df = core.load_cnv_table() - df = df[df.Gene == copy_number.metadata['Gene']] - cnvs = dict(zip(df.Code, df.Name)) - predictions = [cnvs[x] for x in predictions] + cnv_table = core.load_cnv_table() + cnv_table = cnv_table[cnv_table.Gene == copy_number.metadata['Gene']] + code2name = dict(zip(list(range(len(cnv_table.Name))), cnv_table.Name)) + predictions = [code2name[x] for x in predictions] metadata = copy_number.copy_metadata() metadata['SemanticType'] = 'SampleTable[CNVCalls]' data = pd.DataFrame({'CNV': predictions}) data.index = copy_number.data.samples + return sdk.Archive(metadata, data) def prepare_depth_of_coverage( - bam=None, fn=None, assembly='GRCh37', bed=None + bams, assembly='GRCh37', bed=None ): """ - Prepare a depth of coverage file for all target genes with SV. + Prepare a depth of coverage file for all target genes with SV from BAM + files. Parameters ---------- - bam : list, optional - One or more BAM files. - fn : str, optional - File containing one BAM file per line. + bams : str or list + One or more input BAM files. Alternatively, you can provide a text + file (.txt, .tsv, .csv, or .list) containing one BAM file per line. assembly : {'GRCh37', 'GRCh38'}, default: 'GRCh37' Reference genome assembly. bed : str, optional By default, the input data is assumed to be WGS. If it's targeted sequencing, you must provide a BED file to indicate probed regions. - Note that the 'chr' prefix in BED contig names (e.g. 'chr1' vs. '1') - will be automatically added or removed as necessary to match the BAM - contig names. + Note that the 'chr' prefix in contig names (e.g. 'chr1' vs. '1') will + be automatically added or removed as necessary to match the input + BAM's contig names. Returns ------- @@ -1086,23 +1105,11 @@ def prepare_depth_of_coverage( 'SemanticType': 'CovFrame[DepthOfCoverage]', } - bam_files, bam_prefix = sdk.parse_input_bams(bam=bam, fn=fn) - regions = create_regions_bed( merge=True, sv_genes=True, assembly=assembly, - ).gr.df.apply( - lambda r: f'{r.Chromosome}:{r.Start}-{r.End}', axis=1 - ).to_list() + ).to_regions() - cfs = [] - - for region in regions: - cf = pycov.CovFrame.from_bam( - bam=bam_files, region=f'{bam_prefix}{region}', zero=True - ) - cfs.append(cf) - - cf = pycov.concat(cfs) + cf = pycov.CovFrame.from_bam(bams, regions=regions, zero=True) if bed: metadata['Platform'] = 'Targeted' @@ -1154,7 +1161,8 @@ def test_cnv_caller( cnv_calls : str or pypgx.Archive Archive file or object with the semantic type SampleTable[CNVCalls]. confusion_matrix : str, optional - Write the confusion matrix as a CSV file. + Write the confusion matrix as a CSV file where rows indicate actual + class and columns indicate prediction class. 
""" if isinstance(cnv_caller, str): cnv_caller = sdk.Archive.from_file(cnv_caller) @@ -1178,9 +1186,9 @@ def test_cnv_caller( cnv_table = core.load_cnv_table() cnv_table = cnv_table[cnv_table.Gene == copy_number.metadata['Gene']] - name2code = dict(zip(cnv_table.Name, cnv_table.Code)) - code2name = dict(zip(cnv_table.Code, cnv_table.Name)) - + code = list(range(len(cnv_table.Name))) + code2name = dict(zip(code, cnv_table.Name)) + name2code = dict(zip(cnv_table.Name, code)) cnv_calls.data['Code'] = cnv_calls.data.apply(lambda r: name2code[r.CNV], axis=1) columns = ['Chromosome', 'Position'] + cnv_calls.data.Sample.to_list() copy_number.data.df = copy_number.data.df[columns] @@ -1213,7 +1221,8 @@ def train_cnv_caller(copy_number, cnv_calls, confusion_matrix=None): cnv_calls : str or pypgx.Archive Archive file or object with the semantic type SampleTable[CNVCalls]. confusion_matrix : str, optional - Write the confusion matrix as a CSV file. + Write the confusion matrix as a CSV file where rows indicate actual + class and columns indicate prediction class. Returns ------- @@ -1237,8 +1246,9 @@ def train_cnv_caller(copy_number, cnv_calls, confusion_matrix=None): cnv_table = core.load_cnv_table() cnv_table = cnv_table[cnv_table.Gene == copy_number.metadata['Gene']] - name2code = dict(zip(cnv_table.Name, cnv_table.Code)) - code2name = dict(zip(cnv_table.Code, cnv_table.Name)) + code = list(range(len(cnv_table.Name))) + code2name = dict(zip(code, cnv_table.Name)) + name2code = dict(zip(cnv_table.Name, code)) cnv_calls.data['Code'] = cnv_calls.data.apply(lambda r: name2code[r.CNV], axis=1) columns = ['Chromosome', 'Position'] + cnv_calls.data.Sample.to_list() copy_number.data.df = copy_number.data.df[columns] diff --git a/pypgx/cli/call_genotypes.py b/pypgx/cli/call_genotypes.py index 86998cc9..fc1fb4e2 100644 --- a/pypgx/cli/call_genotypes.py +++ b/pypgx/cli/call_genotypes.py @@ -5,30 +5,36 @@ import fuc description = f""" -Call genotypes for the target gene. +Call genotypes for target gene. """ def create_parser(subparsers): parser = fuc.api.common._add_parser( subparsers, fuc.api.common._script_name(), - help='Call genotypes for the target gene.', description=description, + help= +"""Call genotypes for target gene.""" ) parser.add_argument( 'genotypes', - help='Archive file with the semantic type \n' - 'SampleTable[Genotypes].' + help= +"""Output archive file with the semantic type +SampleTable[Genotypes].""" ) parser.add_argument( '--alleles', metavar='PATH', - help='Archive file with the semantic type SampleTable[Alleles].' + help= +"""Input archive file with the semantic type +SampleTable[Alleles].""" ) parser.add_argument( '--cnv-calls', metavar='PATH', - help='Archive file with the semantic type SampleTable[CNVCalls].' + help= +"""Input archive file with the semantic type +SampleTable[CNVCalls].""" ) def main(args): diff --git a/pypgx/cli/call_phenotypes.py b/pypgx/cli/call_phenotypes.py index b2271c20..51718388 100644 --- a/pypgx/cli/call_phenotypes.py +++ b/pypgx/cli/call_phenotypes.py @@ -5,23 +5,28 @@ import fuc description = f""" -Call phenotypes for the target gene. +Call phenotypes for target gene. """ def create_parser(subparsers): parser = fuc.api.common._add_parser( subparsers, fuc.api.common._script_name(), - help='Call phenotypes for the target gene.', description=description, + help= +"""Call phenotypes for target gene.""" ) parser.add_argument( 'genotypes', - help='Archive file with the semantic type SampleTable[Genotypes].' 
+ help= +"""Input archive file with the semantic type +SampleTable[Genotypes].""" ) parser.add_argument( 'phenotypes', - help='Archive file with the semantic type SampleTable[Phenotypes].' + help= +"""Output archive file with the semantic type +SampleTable[Phenotypes].""" ) def main(args): diff --git a/pypgx/cli/combine_results.py b/pypgx/cli/combine_results.py index 5d16437e..0b7e1121 100644 --- a/pypgx/cli/combine_results.py +++ b/pypgx/cli/combine_results.py @@ -5,43 +5,50 @@ import fuc description = f""" -Combine various results for the target gene. +Combine various results for target gene. """ def create_parser(subparsers): parser = fuc.api.common._add_parser( subparsers, fuc.api.common._script_name(), - help='Combine various results for the target gene.', description=description, + help= +"""Combine various results for target gene.""" ) parser.add_argument( 'results', - help='Archive file with the semantic type SampleTable[Results].' + help= +"""Output archive file with the semantic type +SampleTable[Results].""" ) parser.add_argument( '--genotypes', metavar='PATH', - help='Archive file with the semantic type \n' - 'SampleTable[Genotypes].' + help= +"""Input archive file with the semantic type +SampleTable[Genotypes].""" ) parser.add_argument( '--phenotypes', metavar='PATH', - help='Archive file with the semantic type \n' - 'SampleTable[Phenotypes].' + help= +"""Input archive file with the semantic type +SampleTable[Phenotypes].""" ) parser.add_argument( '--alleles', metavar='PATH', - help='Archive file with the semantic type \n' - 'SampleTable[Alleles].' + help= +"""Input archive file with the semantic type +SampleTable[Alleles].""" ) parser.add_argument( '--cnv-calls', metavar='PATH', - help='Archive file with the semantic type \n' - 'SampleTable[CNVCalls].' + help= +"""Input archive file with the semantic type +SampleTable[CNVCalls].""" ) def main(args): diff --git a/pypgx/cli/compare_genotypes.py b/pypgx/cli/compare_genotypes.py index 9d48e6f1..bbe0c845 100644 --- a/pypgx/cli/compare_genotypes.py +++ b/pypgx/cli/compare_genotypes.py @@ -15,25 +15,29 @@ def create_parser(subparsers): parser = fuc.api.common._add_parser( subparsers, fuc.api.common._script_name(), - help='Calculate concordance between two genotype results.', description=description, + help= +"""Calculate concordance between two genotype results.""" ) parser.add_argument( 'first', - help='First archive file with the semantic type \n' - 'SampleTable[Results].' + help= +"""First archive file with the semantic type +SampleTable[Results].""" ) parser.add_argument( 'second', - help='Second archive file with the semantic type \n' - 'SampleTable[Results].' + help= +"""Second archive file with the semantic type +SampleTable[Results].""" ) parser.add_argument( '--verbose', action='store_true', - help='Whether to print the verbose version of output, including \n' - 'discordant calls.' + help= +"""Whether to print the verbose version of output, including +discordant calls.""" ) def main(args): diff --git a/pypgx/cli/compute_control_statistics.py b/pypgx/cli/compute_control_statistics.py index 9a5bd31e..007499aa 100644 --- a/pypgx/cli/compute_control_statistics.py +++ b/pypgx/cli/compute_control_statistics.py @@ -6,21 +6,25 @@ import pysam description = """ -Compute summary statistics for the control gene from BAM files. +Compute summary statistics for control gene from BAM files. + +Note that for the arguments gene and --bed, the 'chr' prefix in contig names +(e.g. 'chr1' vs. 
'1') will be automatically added or removed as necessary to +match the input BAM's contig names. """ epilog = f""" -[Example] To compute summary statistics for the VDR gene from WGS data: +[Example] For the VDR gene from WGS data: $ pypgx {fuc.api.common._script_name()} \\ - control-statistcs-VDR.zip \\ - --gene VDR \\ - --bam A.bam B.bam + VDR \\ + control-statistcs.zip \\ + 1.bam 2.bam [Example] For a custom region from targeted sequencing data: $ pypgx {fuc.api.common._script_name()} \\ - control-statistcs-VDR.zip \\ - --gene chr1:100-200 \\ - --fn bam.list \\ + chr1:100-200 \\ + control-statistcs.zip \\ + bam.list \\ --bed probes.bed """ @@ -30,60 +34,51 @@ def create_parser(subparsers): fuc.api.common._script_name(), description=description, epilog=epilog, - help='Compute summary statistics for the control gene from \n' - 'BAM files.', + help= +"""Compute summary statistics for control gene from BAM +files.""" + ) + parser.add_argument( + 'gene', + help= +"""Control gene (recommended choices: 'EGFR', 'RYR1', +'VDR'). Alternatively, you can provide a custom region +(format: chrom:start-end).""" ) parser.add_argument( 'control_statistics', metavar='control-statistics', - help='Archive file with the semantic type \n' - 'SampleTable[Statistics].' + help= +"""Output archive file with the semantic type +SampleTable[Statistics].""" ) parser.add_argument( - '--bam', - metavar='PATH', + 'bams', nargs='+', - help='One or more BAM files. Cannot be used with --fn.' - ) - parser.add_argument( - '--fn', - metavar='PATH', - help='File containing one BAM file per line. Cannot be \n' - 'used with --bam.' - ) - parser.add_argument( - '--gene', - metavar='TEXT', - help="Control gene (recommended choices: 'EGFR', 'RYR1', \n" - "'VDR'). Cannot be used with --region." - ) - parser.add_argument( - '--region', - metavar='TEXT', - help="Custom region to use as control gene \n" - "('chrom:start-end'). Cannot be used with --gene." + help= +"""One or more input BAM files. Alternatively, you can +provide a text file (.txt, .tsv, .csv, or .list) +containing one BAM file per line.""" ) parser.add_argument( '--assembly', metavar='TEXT', default='GRCh37', - help="Reference genome assembly (default: 'GRCh37') \n" - "(choices: 'GRCh37', 'GRCh38')." + help= +"""Reference genome assembly (default: 'GRCh37') +(choices: 'GRCh37', 'GRCh38').""" ) parser.add_argument( '--bed', metavar='PATH', - help="By default, the input data is assumed to be WGS. If \n" - "it's targeted sequencing, you must provide a BED file \n" - "to indicate probed regions. Note that the 'chr' \n" - "prefix in BED contig names (e.g. 'chr1' vs. '1') will \n" - "be automatically added or removed as necessary to \n" - "match the BAM contig names." + help= +"""By default, the input data is assumed to be WGS. If +it's targeted sequencing, you must provide a BED file +to indicate probed regions.""" ) def main(args): result = utils.compute_control_statistics( - bam=args.bam, fn=args.fn, gene=args.gene, region=args.region, - assembly=args.assembly, bed=args.bed + args.gene, args.bams, assembly=args.assembly, bed=args.bed ) result.to_file(args.control_statistics) diff --git a/pypgx/cli/compute_copy_number.py b/pypgx/cli/compute_copy_number.py index 1ba07f02..9b084fab 100644 --- a/pypgx/cli/compute_copy_number.py +++ b/pypgx/cli/compute_copy_number.py @@ -6,7 +6,7 @@ import pysam description = f""" -Compute copy number from read depth for the target gene. +Compute copy number from read depth for target gene. 
The command will convert read depth to copy number by performing intra-sample normalization using summary statistics from the control gene. @@ -22,31 +22,36 @@ def create_parser(subparsers): subparsers, fuc.api.common._script_name(), description=description, - help='Compute copy number from read depth for the target \n' - 'gene.', + help= +"""Compute copy number from read depth for target gene.""" ) parser.add_argument( 'read_depth', metavar='read-depth', - help='Archive file with the semantic type \n' - 'CovFrame[ReadDepth].' + help= +"""Input archive file with the semantic type +CovFrame[ReadDepth].""" ) parser.add_argument( 'control_statistcs', metavar='control-statistcs', - help='Archive file with the semantic type \n' - 'SampleTable[Statistics].' + help= +"""Input archive file with the semantic type +SampleTable[Statistics].""" ) parser.add_argument( - 'output', - help='Archive file with the semantic type \n' - 'CovFrame[CopyNumber].' + 'copy_number', + metavar='copy-number', + help= +"""Output archive file with the semantic type +CovFrame[CopyNumber].""" ) parser.add_argument( '--samples-without-sv', metavar='TEXT', nargs='+', - help='List of known samples with no SV.' + help= +"""List of known samples with no SV.""" ) def main(args): @@ -54,4 +59,4 @@ def main(args): args.read_depth, args.control_statistcs, samples_without_sv=args.samples_without_sv ) - result.to_file(args.output) + result.to_file(args.copy_number) diff --git a/pypgx/cli/compute_target_depth.py b/pypgx/cli/compute_target_depth.py index 88aa3851..6492df24 100644 --- a/pypgx/cli/compute_target_depth.py +++ b/pypgx/cli/compute_target_depth.py @@ -6,7 +6,7 @@ import pysam description = f""" -Compute read depth for the target gene from BAM files. +Compute read depth for target gene from BAM files. """ epilog = f""" @@ -14,13 +14,13 @@ $ pypgx {fuc.api.common._script_name()} \\ CYP2D6 \\ read-depth.zip \\ - --bam A.bam B.bam + 1.bam 2.bam [Example] For the CYP2D6 gene from targeted sequencing data: $ pypgx {fuc.api.common._script_name()} \\ CYP2D6 \\ read-depth.zip \\ - --fn bam.txt \\ + bam.list \\ --bed probes.bed """ @@ -30,47 +30,48 @@ def create_parser(subparsers): fuc.api.common._script_name(), description=description, epilog=epilog, - help='Compute read depth for the target gene from BAM files.', + help= +"""Compute read depth for target gene from BAM files.""" ) parser.add_argument( 'gene', - help='Target gene.' + help= +"""Target gene.""" ) parser.add_argument( - 'output', - help='Archive file with the semantic type \n' - 'CovFrame[ReadDepth].' + 'read_depth', + metavar='read-depth', + help= +"""Output archive file with the semantic type +CovFrame[ReadDepth].""" ) parser.add_argument( - '--bam', - metavar='PATH', + 'bams', nargs='+', - help='One or more BAM files. Cannot be used with --fn.' - ) - parser.add_argument( - '--fn', - metavar='PATH', - help='File containing one BAM file per line. Cannot be \n' - 'used with --bam.' + help= +"""One or more input BAM files. Alternatively, you can +provide a text file (.txt, .tsv, .csv, or .list) +containing one BAM file per line.""" ) parser.add_argument( '--assembly', metavar='TEXT', default='GRCh37', - help="Reference genome assembly (default: 'GRCh37') \n" - "(choices: 'GRCh37', 'GRCh38')." + help= +"""Reference genome assembly (default: 'GRCh37') +(choices: 'GRCh37', 'GRCh38').""" ) parser.add_argument( '--bed', metavar='PATH', - help="By default, the input data is assumed to be WGS. 
If it \n" - "is targeted sequencing, you must provide a BED file to \n" - "indicate probed regions." + help= +"""By default, the input data is assumed to be WGS. If it +is targeted sequencing, you must provide a BED file to +indicate probed regions.""" ) def main(args): archive = utils.compute_target_depth( - args.gene, bam=args.bam, fn=args.fn, assembly=args.assembly, - bed=args.bed + args.gene, args.bams, assembly=args.assembly, bed=args.bed ) - archive.to_file(args.output) + archive.to_file(args.read_depth) diff --git a/pypgx/cli/create_consolidated_vcf.py b/pypgx/cli/create_consolidated_vcf.py index 5307a9ae..5e2e7988 100644 --- a/pypgx/cli/create_consolidated_vcf.py +++ b/pypgx/cli/create_consolidated_vcf.py @@ -13,26 +13,30 @@ def create_parser(subparsers): parser = fuc.api.common._add_parser( subparsers, fuc.api.common._script_name(), - help='Create a consolidated VCF file.', description=description, + help= +"""Create a consolidated VCF file.""" ) parser.add_argument( 'imported_variants', metavar='imported-variants', - help='Archive file with the semantic type \n' - 'VcfFrame[Imported].' + help= +"""Input archive file with the semantic type +VcfFrame[Imported].""" ) parser.add_argument( 'phased_variants', metavar='phased-variants', - help='Archive file with the semantic type \n' - 'VcfFrame[Phased].' + help= +"""Input archive file with the semantic type +VcfFrame[Phased].""" ) parser.add_argument( 'consolidated_variants', metavar='consolidated-variants', - help='Archive file with the semantic type \n' - 'VcfFrame[Consolidated].' + help= +"""Output archive file with the semantic type +VcfFrame[Consolidated].""" ) def main(args): diff --git a/pypgx/cli/create_regions_bed.py b/pypgx/cli/create_regions_bed.py index 57a64644..bece78fb 100644 --- a/pypgx/cli/create_regions_bed.py +++ b/pypgx/cli/create_regions_bed.py @@ -14,31 +14,36 @@ def create_parser(subparsers): subparsers, fuc.api.common._script_name(), description=description, - help='Create a BED file which contains all regions used by \n' - 'PyPGx.', + help= +"""Create a BED file which contains all regions used by +PyPGx.""" ) parser.add_argument( '--assembly', metavar='TEXT', default='GRCh37', - help="Reference genome assembly (default: 'GRCh37') \n" - "(choices: 'GRCh37', 'GRCh38')." + help= +"""Reference genome assembly (default: 'GRCh37') +(choices: 'GRCh37', 'GRCh38').""" ) parser.add_argument( '--add-chr-prefix', action='store_true', - help="Whether to add the 'chr' string in contig names." + help= +"""Whether to add the 'chr' string in contig names.""" ) parser.add_argument( '--merge', action='store_true', - help='Whether to merge overlapping intervals (gene names will \n' - 'be removed too).' + help= +"""Whether to merge overlapping intervals (gene names will +be removed too).""" ) parser.add_argument( '--sv-genes', action='store_true', - help='Whether to only return genes with SV.' + help= +"""Whether to only return genes with SV.""" ) def main(args): diff --git a/pypgx/cli/estimate_phase_beagle.py b/pypgx/cli/estimate_phase_beagle.py index f9991fa5..7d418d95 100644 --- a/pypgx/cli/estimate_phase_beagle.py +++ b/pypgx/cli/estimate_phase_beagle.py @@ -7,10 +7,6 @@ description = f""" Estimate haplotype phase of observed variants with the Beagle program. - -The 'chr' prefix in contig names (e.g. 'chr1' vs. '1') in the input VCF will -be automatically added or removed as necessary to match that of the reference -VCF. 
""" def create_parser(subparsers): @@ -18,30 +14,40 @@ def create_parser(subparsers): subparsers, fuc.api.common._script_name(), description=description, - help='Estimate haplotype phase of observed variants with \n' - 'the Beagle program.', + help= +"""Estimate haplotype phase of observed variants with +the Beagle program.""" ) parser.add_argument( 'imported_variants', metavar='imported-variants', - help='Archive file with the semantic type VcfFrame[Imported].' + help= +"""Input archive file with the semantic type +VcfFrame[Imported]. The 'chr' prefix in contig names +(e.g. 'chr1' vs. '1') will be automatically added or +removed as necessary to match the reference VCF's contig +names.""" ) parser.add_argument( 'phased_variants', metavar='phased-variants', - help='Archive file with the semantic type VcfFrame[Phased].' + help= +"""Output archive file with the semantic type +VcfFrame[Phased].""" ) parser.add_argument( '--panel', metavar='PATH', - help='VCF file corresponding to a reference haplotype panel \n' - '(compressed or uncompressed). By default, the 1KGP panel \n' - 'in the ~/pypgx-bundle directory will be used.' + help= +"""VCF file (compressed or uncompressed) corresponding to a +reference haplotype panel. By default, the 1KGP panel in +the ~/pypgx-bundle directory will be used.""" ) parser.add_argument( '--impute', action='store_true', - help='Perform imputation of missing genotypes.' + help= +"""Perform imputation of missing genotypes.""" ) def main(args): diff --git a/pypgx/cli/filter_samples.py b/pypgx/cli/filter_samples.py index 19e70c46..4151678b 100644 --- a/pypgx/cli/filter_samples.py +++ b/pypgx/cli/filter_samples.py @@ -14,28 +14,33 @@ def create_parser(subparsers): subparsers, fuc.api.common._script_name(), description=description, - help='Filter Archive file for specified samples.', + help= +"""Filter Archive file for specified samples.""" ) parser.add_argument( 'input', - help='Input archive file.' + help= +"""Input archive file.""" ) parser.add_argument( 'output', - help='Output archive file.' + help= +"""Output archive file.""" ) parser.add_argument( 'samples', nargs='+', - help='Specify which samples should be included for analysis \n' - 'by providing a text file (.txt, .tsv, .csv, or .list) \n' - 'containing one sample per line. Alternatively, you can \n' - 'provide a list of samples.' + help= +"""Specify which samples should be included for analysis +by providing a text file (.txt, .tsv, .csv, or .list) +containing one sample per line. Alternatively, you can +provide a list of samples.""" ) parser.add_argument( '--exclude', action='store_true', - help='Exclude specified samples.' + help= +"""Exclude specified samples.""" ) def main(args): diff --git a/pypgx/cli/import_read_depth.py b/pypgx/cli/import_read_depth.py index b234d32f..c42cd7ad 100644 --- a/pypgx/cli/import_read_depth.py +++ b/pypgx/cli/import_read_depth.py @@ -6,7 +6,7 @@ import pysam description = f""" -Import read depth data for the target gene. +Import read depth data for target gene. """ def create_parser(subparsers): @@ -14,36 +14,43 @@ def create_parser(subparsers): subparsers, fuc.api.common._script_name(), description=description, - help='Import read depth data for the target gene.', + help= +"""Import read depth data for target gene.""" ) parser.add_argument( 'gene', - help='Target gene.' + help= +"""Target gene.""" ) parser.add_argument( 'depth_of_coverage', metavar='depth-of-coverage', - help='Archive file with the semantic type \n' - 'CovFrame[DepthOfCoverage].' 
+ help= +"""Input archive file with the semantic type +CovFrame[DepthOfCoverage].""" ) parser.add_argument( 'read_depth', metavar='read-depth', - help='Archive file with the semantic type CovFrame[ReadDepth].' + help= +"""Output archive file with the semantic type +CovFrame[ReadDepth].""" ) parser.add_argument( '--samples', metavar='TEXT', nargs='+', - help='Specify which samples should be included for analysis \n' - 'by providing a text file (.txt, .tsv, .csv, or .list) \n' - 'containing one sample per line. Alternatively, you can \n' - 'provide a list of samples.' + help= +"""Specify which samples should be included for analysis +by providing a text file (.txt, .tsv, .csv, or .list) +containing one sample per line. Alternatively, you can +provide a list of samples.""" ) parser.add_argument( '--exclude', action='store_true', - help='Exclude specified samples.' + help= +"""Exclude specified samples.""" ) def main(args): diff --git a/pypgx/cli/import_variants.py b/pypgx/cli/import_variants.py index 588bea90..3eeb7dd0 100644 --- a/pypgx/cli/import_variants.py +++ b/pypgx/cli/import_variants.py @@ -6,12 +6,10 @@ import pysam description = f""" -Import variant (SNV/indel) data for the target gene. +Import SNV/indel data for target gene. -The command will first slice input VCF for the target gene and then assess -whether every genotype call in the sliced VCF is haplotype phased. It will -return an archive file with the semantic type VcfFrame[Consolidated] if the -VCF is fully phased or otherwise VcfFrame[Imported]. +The command will slice the input VCF for the target gene to create an archive +file with the semantic type VcfFrame[Imported] or VcfFrame[Consolidated]. """ def create_parser(subparsers): @@ -19,51 +17,69 @@ def create_parser(subparsers): subparsers, fuc.api.common._script_name(), description=description, - help='Import variant (SNV/indel) data for the target gene', + help= +"""Import SNV/indel data for target gene.""" ) parser.add_argument( 'gene', - help='Target gene.' + help= +"""Target gene.""" ) parser.add_argument( 'vcf', - help='Input VCF file must be already BGZF compressed (.gz) and \n' - 'indexed (.tbi) to allow random access.' + help= +"""Input VCF file must be already BGZF compressed (.gz) +and indexed (.tbi) to allow random access.""" ) parser.add_argument( 'imported_variants', metavar='imported-variants', - help='Archive file with the semantic type VcfFrame[Imported] \n' - 'or VcfFrame[Consolidated].' + help= +"""Output archive file with the semantic type +VcfFrame[Imported] or VcfFrame[Consolidated].""" ) parser.add_argument( '--assembly', metavar='TEXT', default='GRCh37', - help="Reference genome assembly (default: 'GRCh37') (choices: \n" - "'GRCh37', 'GRCh38')." + help= +"""Reference genome assembly (default: 'GRCh37') +(choices: 'GRCh37', 'GRCh38').""" ) parser.add_argument( '--platform', metavar='TEXT', default='WGS', - choices=['WGS', 'Targeted', 'Chip'], - help="Genotyping platform (default: 'WGS') (choices: 'WGS', \n" - "'Targeted', 'Chip')." + choices=['WGS', 'Targeted', 'Chip', 'LongRead'], + help= +"""Genotyping platform used (default: 'WGS') (choices: +'WGS', 'Targeted', 'Chip', 'LongRead'). When the +platform is 'WGS', 'Targeted', or 'Chip', the command +will assess whether every genotype call in the sliced +VCF is haplotype phased (e.g. '0|1'). If the sliced +VCF is fully phased, the command will return +VcfFrame[Consolidated] or otherwise +VcfFrame[Imported]. 
When the platform is 'LongRead', +the command will return VcfFrame[Consolidated] after +applying the phase-extension algorithm to estimate +haplotype phase of any variants that could not be +resolved by read-backed phasing.""" ) parser.add_argument( '--samples', metavar='TEXT', nargs='+', - help='Specify which samples should be included for analysis \n' - 'by providing a text file (.txt, .tsv, .csv, or .list) \n' - 'containing one sample per line. Alternatively, you can \n' - 'provide a list of samples.' + help= +"""Specify which samples should be included for analysis +by providing a text file (.txt, .tsv, .csv, or .list) +containing one sample per line. Alternatively, you +can provide a list of samples.""" ) parser.add_argument( '--exclude', action='store_true', - help='Exclude specified samples.' + help= +"""Exclude specified samples.""" ) def main(args): diff --git a/pypgx/cli/plot_bam_copy_number.py b/pypgx/cli/plot_bam_copy_number.py index f70f0554..b98329f0 100644 --- a/pypgx/cli/plot_bam_copy_number.py +++ b/pypgx/cli/plot_bam_copy_number.py @@ -14,53 +14,61 @@ def create_parser(subparsers): subparsers, fuc.api.common._script_name(), description=description, - help='Plot copy number profile from CovFrame[CopyNumber].', + help= +"""Plot copy number profile from CovFrame[CopyNumber].""" ) parser.add_argument( 'copy_number', metavar='copy-number', - help='Archive file with the semantic type \n' - 'CovFrame[CopyNumber].' + help= +"""Input archive file with the semantic type +CovFrame[CopyNumber].""" ) parser.add_argument( '--fitted', action='store_true', - help='Show the fitted line as well.' + help= +"""Show the fitted line as well.""" ) parser.add_argument( '--path', metavar='PATH', - help='Create plots in this directory.' + help= +"""Create plots in this directory.""" ) parser.add_argument( '--samples', metavar='TEXT', nargs='+', - help='Specify which samples should be included for analysis \n' - 'by providing a text file (.txt, .tsv, .csv, or .list) \n' - 'containing one sample per line. Alternatively, you can \n' - 'provide a list of samples.' + help= +"""Specify which samples should be included for analysis +by providing a text file (.txt, .tsv, .csv, or .list) +containing one sample per line. Alternatively, you can +provide a list of samples.""" ) parser.add_argument( '--ymin', metavar='FLOAT', type=float, default=-0.3, - help='Y-axis bottom (default: -0.3).' + help= +"""Y-axis bottom (default: -0.3).""" ) parser.add_argument( '--ymax', metavar='FLOAT', type=float, default=6.3, - help='Y-axis top (default: 6.3).' + help= +"""Y-axis top (default: 6.3).""" ) parser.add_argument( '--fontsize', metavar='FLOAT', type=float, default=25, - help='Text fontsize (default: 25).' + help= +"""Text fontsize (default: 25).""" ) def main(args): diff --git a/pypgx/cli/plot_bam_read_depth.py b/pypgx/cli/plot_bam_read_depth.py index 4af412f0..e119c22f 100644 --- a/pypgx/cli/plot_bam_read_depth.py +++ b/pypgx/cli/plot_bam_read_depth.py @@ -14,45 +14,53 @@ def create_parser(subparsers): subparsers, fuc.api.common._script_name(), description=description, - help='Plot read depth profile with BAM data.', + help= +"""Plot read depth profile with BAM data.""" ) parser.add_argument( 'read_depth', - help='Archive file with the semantic type \n' - 'CovFrame[ReadDepth].' + metavar='read-depth', + help= +"""Input archive file with the semantic type +CovFrame[ReadDepth].""" ) parser.add_argument( '--path', metavar='PATH', - help='Create plots in this directory.' 
+ help= +"""Create plots in this directory.""" ) parser.add_argument( '--samples', metavar='TEXT', nargs='+', - help='Specify which samples should be included for analysis \n' - 'by providing a text file (.txt, .tsv, .csv, or .list) \n' - 'containing one sample per line. Alternatively, you can \n' - 'provide a list of samples.' + help= +"""Specify which samples should be included for analysis +by providing a text file (.txt, .tsv, .csv, or .list) +containing one sample per line. Alternatively, you can +provide a list of samples.""" ) parser.add_argument( '--ymin', metavar='FLOAT', type=float, - help='Y-axis bottom.' + help= +"""Y-axis bottom.""" ) parser.add_argument( '--ymax', metavar='FLOAT', type=float, - help='Y-axis top.' + help= +"""Y-axis top.""" ) parser.add_argument( '--fontsize', metavar='FLOAT', type=float, default=25, - help='Text fontsize (default: 25).' + help= +"""Text fontsize (default: 25).""" ) def main(args): diff --git a/pypgx/cli/plot_cn_af.py b/pypgx/cli/plot_cn_af.py index 549ebe7b..c7aa0bab 100644 --- a/pypgx/cli/plot_cn_af.py +++ b/pypgx/cli/plot_cn_af.py @@ -14,55 +14,63 @@ def create_parser(subparsers): subparsers, fuc.api.common._script_name(), description=description, - help='Plot both copy number profile and allele fraction \n' - 'profile in one figure.', + help= +"""Plot both copy number profile and allele fraction +profile in one figure.""" ) parser.add_argument( 'copy_number', metavar='copy-number', - help='Archive file with the semantic type \n' - 'CovFrame[CopyNumber].' + help= +"""Input archive file with the semantic type +CovFrame[CopyNumber].""" ) parser.add_argument( 'imported_variants', metavar='imported-variants', - help='Archive file with the semantic type \n' - 'VcfFrame[Imported].' + help= +"""Input archive file with the semantic type +VcfFrame[Imported].""" ) parser.add_argument( '--path', metavar='PATH', - help='Create plots in this directory.' + help= +"""Create plots in this directory.""" ) parser.add_argument( '--samples', metavar='TEXT', nargs='+', - help='Specify which samples should be included for analysis \n' - 'by providing a text file (.txt, .tsv, .csv, or .list) \n' - 'containing one sample per line. Alternatively, you can \n' - 'provide a list of samples.' + help= +"""Specify which samples should be included for analysis +by providing a text file (.txt, .tsv, .csv, or .list) +containing one sample per line. Alternatively, you can +provide a list of samples.""" ) parser.add_argument( '--ymin', metavar='FLOAT', type=float, default=-0.3, - help='Y-axis bottom (default: -0.3).' + help= +"""Y-axis bottom (default: -0.3).""" ) parser.add_argument( '--ymax', metavar='FLOAT', type=float, default=6.3, - help='Y-axis top (default: 6.3).' + help= +"""Y-axis top (default: 6.3).""" ) parser.add_argument( '--fontsize', metavar='FLOAT', type=float, default=25, - help='Text fontsize (default: 25).' + help= +"""Text fontsize (default: 25).""" ) def main(args): diff --git a/pypgx/cli/plot_vcf_allele_fraction.py b/pypgx/cli/plot_vcf_allele_fraction.py index ecb49f9e..a34f7a68 100644 --- a/pypgx/cli/plot_vcf_allele_fraction.py +++ b/pypgx/cli/plot_vcf_allele_fraction.py @@ -14,34 +14,39 @@ def create_parser(subparsers): subparsers, fuc.api.common._script_name(), description=description, - help='Plot allele fraction profile with VCF data.', + help= +"""Plot allele fraction profile with VCF data.""" ) parser.add_argument( 'imported_variants', metavar='imported-variants', - help='Archive file with the semantic type \n' - 'VcfFrame[Imported].' 
+ help= +"""Input archive file with the semantic type +VcfFrame[Imported].""" ) parser.add_argument( '--path', metavar='PATH', - help='Create plots in this directory.' + help= +"""Create plots in this directory.""" ) parser.add_argument( '--samples', metavar='TEXT', nargs='+', - help='Specify which samples should be included for analysis \n' - 'by providing a text file (.txt, .tsv, .csv, or .list) \n' - 'containing one sample per line. Alternatively, you can \n' - 'provide a list of samples.' + help= +"""Specify which samples should be included for analysis +by providing a text file (.txt, .tsv, .csv, or .list) +containing one sample per line. Alternatively, you can +provide a list of samples.""" ) parser.add_argument( '--fontsize', metavar='FLOAT', type=float, default=25, - help='Text fontsize (default: 25).' + help= +"""Text fontsize (default: 25).""" ) def main(args): diff --git a/pypgx/cli/plot_vcf_read_depth.py b/pypgx/cli/plot_vcf_read_depth.py index 88efb03e..917b945c 100644 --- a/pypgx/cli/plot_vcf_read_depth.py +++ b/pypgx/cli/plot_vcf_read_depth.py @@ -14,48 +14,56 @@ def create_parser(subparsers): subparsers, fuc.api.common._script_name(), description=description, - help='Plot read depth profile with VCF data.', + help= +"""Plot read depth profile with VCF data.""" ) parser.add_argument( 'gene', - help='Target gene.' + help= +"""Target gene.""" ) parser.add_argument( 'vcf', - help='VCF file.' + help= +"""Input VCF file.""" ) parser.add_argument( '--assembly', metavar='TEXT', default='GRCh37', - help="Reference genome assembly (default: 'GRCh37') \n" - "(choices: 'GRCh37', 'GRCh38')." + help= +"""Reference genome assembly (default: 'GRCh37') +(choices: 'GRCh37', 'GRCh38').""" ) parser.add_argument( '--path', metavar='PATH', - help='Create plots in this directory.' + help= +"""Create plots in this directory.""" ) parser.add_argument( '--samples', metavar='TEXT', nargs='+', - help='Specify which samples should be included for analysis \n' - 'by providing a text file (.txt, .tsv, .csv, or .list) \n' - 'containing one sample per line. Alternatively, you can \n' - 'provide a list of samples.' + help= +"""Specify which samples should be included for analysis +by providing a text file (.txt, .tsv, .csv, or .list) +containing one sample per line. Alternatively, you can +provide a list of samples.""" ) parser.add_argument( '--ymin', metavar='FLOAT', type=float, - help='Y-axis bottom.' + help= +"""Y-axis bottom.""" ) parser.add_argument( '--ymax', metavar='FLOAT', type=float, - help='Y-axis top.' + help= +"""Y-axis top.""" ) def main(args): diff --git a/pypgx/cli/predict_alleles.py b/pypgx/cli/predict_alleles.py index fec396ac..b241eedd 100644 --- a/pypgx/cli/predict_alleles.py +++ b/pypgx/cli/predict_alleles.py @@ -14,19 +14,22 @@ def create_parser(subparsers): subparsers, fuc.api.common._script_name(), description=description, - help='Predict candidate star alleles based on observed \n' - 'variants.', + help= +"""Predict candidate star alleles based on observed +variants.""" ) parser.add_argument( 'consolidated_variants', metavar='consolidated-variants', - help='Archive file with the semantic type \n' - 'VcfFrame[Consolidated].' + help= +"""Input archive file with the semantic type +VcfFrame[Consolidated].""" ) parser.add_argument( 'alleles', - help='Archive file with the semantic type \n' - 'SampleTable[Alleles].' 
+ help= +"""Output archive file with the semantic type +SampleTable[Alleles].""" ) def main(args): diff --git a/pypgx/cli/predict_cnv.py b/pypgx/cli/predict_cnv.py index 3842f51d..d78787e1 100644 --- a/pypgx/cli/predict_cnv.py +++ b/pypgx/cli/predict_cnv.py @@ -6,7 +6,7 @@ import pysam description = f""" -Predict CNV for the target gene based on copy number data. +Predict CNV from copy number data for target gene. Genomic positions that are missing copy number because, for example, the input data is targeted sequencing will be imputed with forward filling. @@ -17,26 +17,30 @@ def create_parser(subparsers): subparsers, fuc.api.common._script_name(), description=description, - help='Predict CNV for the target gene based on copy number \n' - 'data.', + help= +"""Predict CNV from copy number data for target gene.""" ) parser.add_argument( 'copy_number', metavar='copy-number', - help='Archive file with the semantic type CovFrame[CopyNumber].' + help= +"""Input archive file with the semantic type +CovFrame[CopyNumber].""" ) parser.add_argument( 'cnv_calls', metavar='cnv-calls', - help='Archive file with the semantic type \n' - 'SampleTable[CNVCalls].' + help= +"""Output archive file with the semantic type +SampleTable[CNVCalls].""" ) parser.add_argument( '--cnv-caller', metavar='PATH', - help='Archive file with the semantic type Model[CNV]. By \n' - 'default, a pre-trained CNV caller in the ~/pypgx-bundle \n' - 'directory will be used.' + help= +"""Archive file with the semantic type Model[CNV]. By +default, a pre-trained CNV caller in the ~/pypgx-bundle +directory will be used.""" ) def main(args): diff --git a/pypgx/cli/prepare_depth_of_coverage.py b/pypgx/cli/prepare_depth_of_coverage.py index 06e6ffff..26bf752c 100644 --- a/pypgx/cli/prepare_depth_of_coverage.py +++ b/pypgx/cli/prepare_depth_of_coverage.py @@ -6,19 +6,19 @@ import pysam description = """ -Prepare a depth of coverage file for all target genes with SV. +Prepare a depth of coverage file for all target genes with SV from BAM files. """ epilog = f""" -[Example] When the input data is WGS: +[Example] From WGS data: $ pypgx {fuc.api.common._script_name()} \\ depth-of-coverage.zip \\ - --bam A.bam B.bam + 1.bam 2.bam -[Example] When the input data is targeted sequencing: +[Example] From targeted sequencing data: $ pypgx {fuc.api.common._script_name()} \\ depth-of-coverage.zip \\ - --fn bam.txt \\ + bam.list \\ --bed probes.bed """ @@ -28,47 +28,47 @@ def create_parser(subparsers): fuc.api.common._script_name(), description=description, epilog=epilog, - help='Prepare a depth of coverage file for all target \n' - 'genes with SV.', + help= +"""Prepare a depth of coverage file for all target +genes with SV from BAM files.""" ) parser.add_argument( 'depth_of_coverage', metavar='depth-of-coverage', - help='Archive file with the semantic type \n' - 'CovFrame[DepthOfCoverage].' + help= +"""Output archive file with the semantic type +CovFrame[DepthOfCoverage].""" ) parser.add_argument( - '--bam', - metavar='PATH', + 'bams', nargs='+', - help='One or more BAM files. Cannot be used with --fn.' - ) - parser.add_argument( - '--fn', - metavar='PATH', - help='File containing one BAM file per line. Cannot be used \n' - 'with --bam.' + help= +"""One or more input BAM files. Alternatively, you can +provide a text file (.txt, .tsv, .csv, or .list) +containing one BAM file per line.""" ) parser.add_argument( '--assembly', metavar='TEXT', default='GRCh37', - help="Reference genome assembly (default: 'GRCh37') \n" - "(choices: 'GRCh37', 'GRCh38')." 
+ help= +"""Reference genome assembly (default: 'GRCh37') +(choices: 'GRCh37', 'GRCh38').""" ) parser.add_argument( '--bed', metavar='PATH', - help="By default, the input data is assumed to be WGS. If \n" - "it's targeted sequencing, you must provide a BED file \n" - "to indicate probed regions. Note that the 'chr' \n" - "prefix in BED contig names (e.g. 'chr1' vs. '1') will \n" - "be automatically added or removed as necessary to \n" - "match the BAM contig names." + help= +"""By default, the input data is assumed to be WGS. If +it's targeted sequencing, you must provide a BED file +to indicate probed regions. Note that the 'chr' prefix +in contig names (e.g. 'chr1' vs. '1') will be +automatically added or removed as necessary to match +the input BAM's contig names.""" ) def main(args): archive = utils.prepare_depth_of_coverage( - bam=args.bam, fn=args.fn, assembly=args.assembly, bed=args.bed + args.bams, assembly=args.assembly, bed=args.bed ) archive.to_file(args.depth_of_coverage) diff --git a/pypgx/cli/print_metadata.py b/pypgx/cli/print_metadata.py index e55a1d70..b05cff71 100644 --- a/pypgx/cli/print_metadata.py +++ b/pypgx/cli/print_metadata.py @@ -13,12 +13,14 @@ def create_parser(subparsers): parser = fuc.api.common._add_parser( subparsers, fuc.api.common._script_name(), - help='Print the metadata of specified archive.', description=description, + help= +"""Print the metadata of specified archive.""" ) parser.add_argument( 'input', - help='Archive file.' + help= +"""Input archive file.""" ) def main(args): diff --git a/pypgx/cli/run_chip_pipeline.py b/pypgx/cli/run_chip_pipeline.py index 89a5b786..1dde9fdd 100644 --- a/pypgx/cli/run_chip_pipeline.py +++ b/pypgx/cli/run_chip_pipeline.py @@ -5,7 +5,7 @@ import fuc description = f""" -Run PyPGx's genotyping pipeline for chip data. +Run genotyping pipeline for chip data. """ epilog = f""" @@ -22,60 +22,70 @@ def create_parser(subparsers): fuc.api.common._script_name(), description=description, epilog=epilog, - help="Run PyPGx's genotyping pipeline for chip data.", + help= +"""Run genotyping pipeline for chip data.""" ) parser.add_argument( 'gene', - help='Target gene.' + help= +"""Target gene.""" ) parser.add_argument( 'output', - help='Output directory.' + help= +"""Output directory.""" ) parser.add_argument( 'variants', - help='Input VCF file must be already BGZF compressed (.gz) \n' - 'and indexed (.tbi) to allow random access. Statistical \n' - 'haplotype phasing will be skipped if input VCF is \n' - 'already fully phased.' + help= +"""Input VCF file must be already BGZF compressed (.gz) +and indexed (.tbi) to allow random access. +Statistical haplotype phasing will be skipped if +input VCF is already fully phased.""" ) parser.add_argument( '--assembly', metavar='TEXT', default='GRCh37', - help="Reference genome assembly (default: 'GRCh37') (choices: \n" - "'GRCh37', 'GRCh38')." + help=""" +Reference genome assembly (default: 'GRCh37') +(choices: 'GRCh37', 'GRCh38').""" ) parser.add_argument( '--panel', metavar='PATH', - help='VCF file corresponding to a reference haplotype panel \n' - '(compressed or uncompressed). By default, the 1KGP panel \n' - 'in the ~/pypgx-bundle directory will be used.' + help= +"""VCF file corresponding to a reference haplotype panel +(compressed or uncompressed). By default, the 1KGP +panel in the ~/pypgx-bundle directory will be used.""" ) parser.add_argument( '--impute', action='store_true', - help='Perform imputation of missing genotypes.' 
+ help= +"""Perform imputation of missing genotypes.""" ) parser.add_argument( '--force', action='store_true', - help='Overwrite output directory if it already exists.' + help= +"""Overwrite output directory if it already exists.""" ) parser.add_argument( '--samples', metavar='TEXT', nargs='+', - help='Specify which samples should be included for analysis \n' - 'by providing a text file (.txt, .tsv, .csv, or .list) \n' - 'containing one sample per line. Alternatively, you can \n' - 'provide a list of samples.' + help= +"""Specify which samples should be included for analysis +by providing a text file (.txt, .tsv, .csv, or .list) +containing one sample per line. Alternatively, you +can provide a list of samples.""" ) parser.add_argument( '--exclude', action='store_true', - help='Exclude specified samples.' + help= +"""Exclude specified samples.""" ) def main(args): diff --git a/pypgx/cli/run_long_read_pipeline.py b/pypgx/cli/run_long_read_pipeline.py new file mode 100644 index 00000000..863dff2c --- /dev/null +++ b/pypgx/cli/run_long_read_pipeline.py @@ -0,0 +1,79 @@ +import sys + +from ..api import pipeline + +import fuc + +description = f""" +Run genotyping pipeline for long-read sequencing data. +""" + +epilog = f""" +[Example] To genotype the CYP3A5 gene from long-read sequencing data: + $ pypgx {fuc.api.common._script_name()} \\ + CYP3A5 \\ + CYP3A5-pipeline \\ + variants.vcf.gz +""" + +def create_parser(subparsers): + parser = fuc.api.common._add_parser( + subparsers, + fuc.api.common._script_name(), + description=description, + epilog=epilog, + help= +"""Run genotyping pipeline for long-read sequencing data.""" + ) + parser.add_argument( + 'gene', + help= +"""Target gene.""" + ) + parser.add_argument( + 'output', + help= +"""Output directory.""" + ) + parser.add_argument( + 'variants', + help= +"""Input VCF file must be already BGZF compressed (.gz) +and indexed (.tbi) to allow random access.""" + ) + parser.add_argument( + '--assembly', + metavar='TEXT', + default='GRCh37', + help= +"""Reference genome assembly (default: 'GRCh37') +(choices: 'GRCh37', 'GRCh38').""" + ) + parser.add_argument( + '--force', + action='store_true', + help= +"""Overwrite output directory if it already exists.""" + ) + parser.add_argument( + '--samples', + metavar='TEXT', + nargs='+', + help= +"""Specify which samples should be included for analysis +by providing a text file (.txt, .tsv, .csv, or .list) +containing one sample per line. Alternatively, you +can provide a list of samples.""" + ) + parser.add_argument( + '--exclude', + action='store_true', + help= +"""Exclude specified samples.""" + ) + +def main(args): + pipeline.run_long_read_pipeline( + args.gene, args.output, args.variants, assembly=args.assembly, + force=args.force, samples=args.samples, exclude=args.exclude + ) diff --git a/pypgx/cli/run_ngs_pipeline.py b/pypgx/cli/run_ngs_pipeline.py index 4ff354ad..03273333 100644 --- a/pypgx/cli/run_ngs_pipeline.py +++ b/pypgx/cli/run_ngs_pipeline.py @@ -5,7 +5,7 @@ import fuc description = """ -Run PyPGx's genotyping pipeline for NGS data. +Run genotyping pipeline for NGS data. 
During copy number analysis, if the input data is targeted sequencing, the command will apply inter-sample normalization using summary statistics across @@ -44,99 +44,115 @@ def create_parser(subparsers): fuc.api.common._script_name(), description=description, epilog=epilog, - help="Run PyPGx's genotyping pipeline for NGS data.", + help= +"""Run genotyping pipeline for NGS data.""" ) parser.add_argument( 'gene', - help='Target gene.' + help= +"""Target gene.""" ) parser.add_argument( 'output', - help='Output directory.' + help= +"""Output directory.""" ) parser.add_argument( '--variants', metavar='PATH', - help='Input VCF file must be already BGZF compressed (.gz) \n' - 'and indexed (.tbi) to allow random access. \n' - 'Statistical haplotype phasing will be skipped if \n' - 'input VCF is already fully phased.' + help= +"""Input VCF file must be already BGZF compressed (.gz) +and indexed (.tbi) to allow random access. +Statistical haplotype phasing will be skipped if +input VCF is already fully phased.""" ) parser.add_argument( '--depth-of-coverage', metavar='PATH', - help='Archive file with the semantic type \n' - 'CovFrame[DepthOfCoverage].' + help= +"""Archive file with the semantic type +CovFrame[DepthOfCoverage].""" ) parser.add_argument( '--control-statistics', metavar='PATH', - help='Archive file with the semantic type \n' - 'SampleTable[Statistcs].' + help= +"""Archive file with the semantic type +SampleTable[Statistics].""" ) parser.add_argument( '--platform', metavar='TEXT', default='WGS', choices=['WGS', 'Targeted'], - help="Genotyping platform (default: 'WGS') (choices: 'WGS', \n" - "'Targeted')" + help= +"""Genotyping platform (default: 'WGS') (choices: 'WGS', +'Targeted').""" ) parser.add_argument( '--assembly', metavar='TEXT', default='GRCh37', - help="Reference genome assembly (default: 'GRCh37') \n" - "(choices: 'GRCh37', 'GRCh38')." + help= +"""Reference genome assembly (default: 'GRCh37') +(choices: 'GRCh37', 'GRCh38').""" ) parser.add_argument( '--panel', metavar='PATH', - help='VCF file corresponding to a reference haplotype panel \n' - '(compressed or uncompressed). By default, the 1KGP panel \n' - 'in the ~/pypgx-bundle directory will be used.' + help= +"""VCF file corresponding to a reference haplotype panel +(compressed or uncompressed). By default, the 1KGP panel +in the ~/pypgx-bundle directory will be used.""" ) parser.add_argument( '--force', action='store_true', - help='Overwrite output directory if it already exists.' + help= +"""Overwrite output directory if it already exists.""" ) parser.add_argument( '--samples', metavar='TEXT', nargs='+', - help='Specify which samples should be included for analysis \n' - 'by providing a text file (.txt, .tsv, .csv, or .list) \n' - 'containing one sample per line. Alternatively, you \n' - 'can provide a list of samples.' + help= +"""Specify which samples should be included for analysis +by providing a text file (.txt, .tsv, .csv, or .list) +containing one sample per line. Alternatively, you +can provide a list of samples.""" ) parser.add_argument( '--exclude', action='store_true', - help='Exclude specified samples.' + help= +"""Exclude specified samples.""" ) parser.add_argument( '--samples-without-sv', metavar='TEXT', nargs='+', - help="List of known samples without SV." + help= +"""List of known samples without SV.""" ) parser.add_argument( '--do-not-plot-copy-number', action='store_true', - help='Do not plot copy number profile.'
+ help= +"""Do not plot copy number profile.""" ) parser.add_argument( '--do-not-plot-allele-fraction', action='store_true', - help='Do not plot allele fraction profile.' + help= +"""Do not plot allele fraction profile.""" ) parser.add_argument( '--cnv-caller', metavar='PATH', - help='Archive file with the semantic type Model[CNV]. By \n' - 'default, a pre-trained CNV caller in the ~/pypgx-bundle \n' - 'directory will be used.' + help= +"""Archive file with the semantic type Model[CNV]. By +default, a pre-trained CNV caller in the ~/pypgx-bundle +directory will be used.""" ) def main(args): diff --git a/pypgx/cli/test_cnv_caller.py b/pypgx/cli/test_cnv_caller.py index f665ea30..5aa4cca8 100644 --- a/pypgx/cli/test_cnv_caller.py +++ b/pypgx/cli/test_cnv_caller.py @@ -6,7 +6,7 @@ import pysam description = f""" -Test a CNV caller for the target gene. +Test CNV caller for target gene. """ def create_parser(subparsers): @@ -14,29 +14,36 @@ def create_parser(subparsers): subparsers, fuc.api.common._script_name(), description=description, - help='Test a CNV caller for the target gene.', + help= +"""Test CNV caller for target gene.""" ) parser.add_argument( 'cnv_caller', metavar='cnv-caller', - help='Archive file with the semantic type Model[CNV].' + help= +"""Input archive file with the semantic type Model[CNV].""" ) parser.add_argument( 'copy_number', metavar='copy-number', - help='Archive file with the semantic type \n' - 'CovFrame[CopyNumber].' + help= +"""Input archive file with the semantic type +CovFrame[CopyNumber].""" ) parser.add_argument( 'cnv_calls', metavar='cnv-calls', - help='Archive file with the semantic type \n' - 'SampleTable[CNVCalls].' + help= +"""Input archive file with the semantic type +SampleTable[CNVCalls].""" ) parser.add_argument( '--confusion-matrix', metavar='PATH', - help='Write the confusion matrix as a CSV file.' + help= +"""Write the confusion matrix as a CSV file where rows +indicate actual class and columns indicate prediction +class.""" ) def main(args): diff --git a/pypgx/cli/train_cnv_caller.py b/pypgx/cli/train_cnv_caller.py index cb4f47df..5d9f9a5c 100644 --- a/pypgx/cli/train_cnv_caller.py +++ b/pypgx/cli/train_cnv_caller.py @@ -6,7 +6,7 @@ import pysam description = f""" -Train a CNV caller for the target gene. +Train CNV caller for target gene. This command will return a SVM-based multiclass classifier that makes CNV calls using the one-vs-rest strategy. @@ -17,29 +17,36 @@ def create_parser(subparsers): subparsers, fuc.api.common._script_name(), description=description, - help='Train a CNV caller for the target gene.', + help= +"""Train CNV caller for target gene.""" ) parser.add_argument( 'copy_number', metavar='copy-number', - help='Archive file with the semantic type \n' - 'CovFrame[CopyNumber].' + help= +"""Input archive file with the semantic type +CovFrame[CopyNumber].""" ) parser.add_argument( 'cnv_calls', metavar='cnv-calls', - help='Archive file with the semantic type \n' - 'SampleTable[CNVCalls].' + help= +"""Input archive file with the semantic type +SampleTable[CNVCalls].""" ) parser.add_argument( 'cnv_caller', metavar='cnv-caller', - help='Archive file with the semantic type Model[CNV].' + help= +"""Output archive file with the semantic type Model[CNV].""" ) parser.add_argument( '--confusion-matrix', metavar='PATH', - help='Write the confusion matrix as a CSV file.' 
+ help= +"""Write the confusion matrix as a CSV file where rows +indicate actual class and columns indicate prediction +class.""" ) def main(args): diff --git a/pypgx/sdk/__init__.py b/pypgx/sdk/__init__.py index e0933b35..ea421123 100644 --- a/pypgx/sdk/__init__.py +++ b/pypgx/sdk/__init__.py @@ -1,3 +1,3 @@ -from .utils import (Archive, add_cn_samples, parse_input_bams, compare_metadata, simulate_copy_number) +from .utils import (Archive, add_cn_samples, compare_metadata, simulate_copy_number) -__all__ = ['Archive', 'add_cn_samples', 'parse_input_bams', 'compare_metadata', 'simulate_copy_number'] +__all__ = ['Archive', 'add_cn_samples', 'compare_metadata', 'simulate_copy_number'] diff --git a/pypgx/sdk/utils.py b/pypgx/sdk/utils.py index 1eb3f7e6..b02a93bc 100644 --- a/pypgx/sdk/utils.py +++ b/pypgx/sdk/utils.py @@ -9,15 +9,30 @@ import numpy as np from fuc import pyvcf, pycov, common, pybam +class AlleleNotFoundError(Exception): + """Raise if specified allele is not present in the allele table.""" + +class GeneNotFoundError(Exception): + """Raise if specified gene is not present in the gene table.""" + class IncorrectMetadataError(Exception): """Raised when specified metadata is incorrect.""" class IncorrectSemanticTypeError(Exception): """Raised when specified semantic type is incorrect.""" +class NotTargetGeneError(Exception): + """Raise if specified gene is not one of the target genes.""" + +class PhenotypeNotFoundError(Exception): + """Raise if specified phenotype is not present in the phenotype table.""" + class SemanticTypeNotFoundError(Exception): """Raised when specified semantic type is not supported.""" +class VariantNotFoundError(Exception): + """Raise if specified variant is not present in the variant table.""" + class Archive: """ Class for storing various data. @@ -254,51 +269,6 @@ def func(r): df2['rsID'] = df2.apply(lambda r: rs_dict[r.Name], axis=1) df2.to_csv(f'{gene}-{assembly}.csv') -def parse_input_bams(bam=None, fn=None): - """ - Parse input BAM files for downstream analyses. - - Many of the PyPGx actions accept BAM files as input and users have a - choice between manually specifying individual BAM files (``bam``) and - simply provding a BAM list (``fn``). This method will parse a user's - choice and then return a list of input BAM files. As a bonus, it will - also determine whether the 'chr' string is found in the contig names. - - Parameters - ---------- - bam : list, optional - One or more BAM files. - fn : str, optional - File containing one BAM file per line. - - Returns - ------- - list - List of BAM files. - str - Either '' or 'chr' depending on the contig names. - """ - bam_files = [] - - if bam is None and fn is None: - raise ValueError("Must provide either 'bam' or 'fn'") - elif bam is not None and fn is not None: - raise ValueError("Cannot use 'bam' and 'fn' together") - elif bam is not None and fn is None: - if isinstance(bam, str): - bam_files.append(bam) - else: - bam_files += bam - else: - bam_files += common.convert_file2list(fn) - - if all([pybam.has_chr_prefix(x) for x in bam_files]): - chr_prefix = 'chr' - else: - chr_prefix = '' - - return bam_files, chr_prefix - def simulate_copy_number( target, source, sample, sv, n=3, mu=0, sigma=0.05 ): diff --git a/pypgx/version.py b/pypgx/version.py index 2c7bffbf..2d7893e3 100644 --- a/pypgx/version.py +++ b/pypgx/version.py @@ -1 +1 @@ -__version__ = '0.12.0' +__version__ = '0.13.0'