Update CLI

sbslee · Oct 23, 2021 · 68e59af · 68e59af
1 parent 52ed0b0
commit 68e59af
Show file tree

Hide file tree

Showing 29 changed files with 499 additions and 525 deletions.
diff --git a/README.rst b/README.rst
@@ -169,18 +169,18 @@ For getting help on the CLI:
    
    positional arguments:
      COMMAND
-       call-genotypes      Call genotypes for target gene.
+       call-genotypes      Call genotypes for the target gene.
        call-phenotypes     Call phenotypes for the target gene.
        combine-results     Combine various results for the target gene.
        compare-genotypes   Calculate concordance rate between two genotype results.
        compute-control-statistics
-                           Compute various statistics for control gene with BAM data.
+                           Compute summary statistics for the control gene from BAM files.
        compute-copy-number
                            Compute copy number from read depth for the target gene.
        compute-target-depth
-                           Compute read depth for target gene with BAM data.
+                           Compute read depth for the target gene from BAM files.
        create-consolidated-vcf
-                           Create consolidated VCF.
+                           Create a consolidated VCF file.
        create-regions-bed  Create a BED file which contains all regions used by PyPGx.
        estimate-phase-beagle
                            Estimate haplotype phase of observed variants with the Beagle program.
@@ -197,12 +197,12 @@ For getting help on the CLI:
        plot-vcf-read-depth
                            Plot read depth profile with VCF data.
        predict-alleles     Predict candidate star alleles based on observed variants.
-       predict-cnv         Predict CNV for target gene based on copy number data.
+       predict-cnv         Predict CNV for the target gene based on copy number data.
        prepare-depth-of-coverage
                            Prepare a depth of coverage file for all target genes with SV.
        print-metadata      Print the metadata of specified archive.
-       run-chip-pipeline   Run genotyping pipeline for chip data.
-       run-ngs-pipeline    Run genotyping pipeline for NGS data.
+       run-chip-pipeline   Run PyPGx's genotyping pipeline for chip data.
+       run-ngs-pipeline    Run PyPGx's genotyping pipeline for NGS data.
        test-cnv-caller     Test a CNV caller for the target gene.
        train-cnv-caller    Train a CNV caller for the target gene.
    

diff --git a/docs/cli.rst b/docs/cli.rst
diff --git a/pypgx/api/pipeline.py b/pypgx/api/pipeline.py
@@ -15,18 +15,18 @@ def run_chip_pipeline(
     gene, output, variants, panel=None, impute=False, force=False
 ):
     """
-    Run genotyping pipeline for chip data.
+    Run PyPGx's genotyping pipeline for chip data.
 
     Parameters
     ----------
     gene : str
         Target gene.
     output : str
         Output directory.
-    variants : str, optional
+    variants : str
         VCF file (zipped or unzipped).
     impute : bool, default: False
-        Whether to perform imputation of missing genotypes.
+        If True, perform imputation of missing genotypes.
     force : bool, default : False
         Overwrite output directory if it already exists.
     """
@@ -62,7 +62,12 @@ def run_ngs_pipeline(
     do_not_plot_allele_fraction=False
 ):
     """
-    Run genotyping pipeline for NGS data (WGS and targeted sequencing).
+    Run PyPGx's genotyping pipeline for NGS data.
+
+    During copy number analysis, if the input data is targeted sequencing,
+    the method will apply inter-sample normalization using summary statistics
+    across all samples. For best results, it is recommended to specify known
+    samples without SV using ``samples``.
 
     Parameters
     ----------
@@ -80,7 +85,8 @@ def run_ngs_pipeline(
     platform : {'WGS', 'Targeted'}, default: 'WGS'
         Genotyping platform.
     panel : str, optional
-        Reference haplotype panel.
+        VCF file corresponding to a reference haplotype panel (zipped or
+        unzipped). By default, the 1KGP panel is used.
     force : bool, default : False
         Overwrite output directory if it already exists.
     samples : list, optional

diff --git a/pypgx/api/utils.py b/pypgx/api/utils.py
@@ -218,30 +218,25 @@ def compute_control_statistics(
     bam=None, fn=None, gene=None, region=None, assembly='GRCh37', bed=None
 ):
     """
-    Compute copy number from read depth for target gene.
-
-    Input BAM files must be specified with either ``bam`` or ``fn``, but
-    it's an error to use both. Similarly, control gene must be specified with
-    either ``gene`` or ``region``, but it's an error to use both.
-
-    By default, the input data is assumed to be WGS. If it's targeted
-    sequencing, you must provide a BED file with ``bed`` to indicate
-    probed regions.
+    Compute summary statistics for the control gene from BAM files.
 
     Parameters
     ----------
     bam : list, optional
-        One or more BAM files.
+        One or more BAM files. Cannot be used with ``fn``.
     fn : str, optional
-        File containing one BAM file per line.
+        File containing one BAM file per line. Cannot be used with ``bam``.
     gene : str, optional
-        Control gene (recommended choices: 'EGFR', 'RYR1', 'VDR').
+        Control gene (recommended choices: 'EGFR', 'RYR1', 'VDR'). Cannot be
+        used with ``region``.
     region : str, optional
-        Custom region to use as control gene ('chrom:start-end').
+        Custom region to use as control gene ('chrom:start-end'). Cannot be
+        used with ``gene``.
     assembly : {'GRCh37', 'GRCh38'}, default: 'GRCh37'
         Reference genome assembly.
     bed : str, optional
-        BED file.
+        By default, the input data is assumed to be WGS. If it is targeted
+        sequencing, you must provide a BED file to indicate probed regions.
 
     Returns
     -------
@@ -357,7 +352,7 @@ def compute_target_depth(
     gene, bam=None, fn=None, assembly='GRCh37', bed=None
 ):
     """
-    Compute read depth for target gene with BAM data.
+    Compute read depth for the target gene from BAM files.
 
     Input BAM files must be specified with either ``bam`` or ``fn``, but
     it's an error to use both.
@@ -445,7 +440,7 @@ def one_row(r):
 
 def create_consolidated_vcf(imported_variants, phased_variants):
     """
-    Create consolidated VCF.
+    Create a consolidated VCF file.
 
     Parameters
     ----------
@@ -664,9 +659,10 @@ def estimate_phase_beagle(
     imported_variants : str or pypgx.Archive
         Archive file or object with the semantic type VcfFrame[Imported].
     panel : str, optional
-        Reference haplotype panel. By default, the 1KGP panel is used.
+        VCF file corresponding to a reference haplotype panel (zipped or
+        unzipped). By default, the 1KGP panel is used.
     impute : bool, default: False
-        Whether to perform imputation of missing genotypes.
+        If True, perform imputation of missing genotypes.
 
     Returns
     -------
@@ -705,23 +701,22 @@ def estimate_phase_beagle(
         data = pyvcf.VcfFrame.from_file(f'{t}/output.vcf.gz')
     return sdk.Archive(metadata, data)
 
-def filter_samples(archive, samples=None, exclude=False, fn=None):
+def filter_samples(archive, samples=None, fn=None, exclude=False):
     """
     Filter Archive for specified samples.
 
-    Samples can be specified with either ``samples`` or ``fn``, but it's an
-    error to use both.
-
     Parameters
     ----------
     archive : str or pypgx.archive
         Archive file or object.
     samples : str or list
-        Sample name or list of names (the order matters).
+        Sample name or list of names (the order matters). Cannot be used with
+        ``fn``.
+    fn : str
+        File containing one filename per line. Cannot be used with
+        ``samples``.
     exclude : bool, default: False
         If True, exclude specified samples.
-    fn : str
-        File containing one filename per line.
 
     Returns
     -------
@@ -941,11 +936,10 @@ def one_row(r, sample, i):
 
 def predict_cnv(copy_number, cnv_caller=None):
     """
-    Predict CNV for target gene based on copy number data.
+    Predict CNV for the target gene based on copy number data.
 
-    If there are missing values because, for example, the input data was
-    generated with targeted sequencing, they will be imputed with forward
-    filling.
+    Genomic positions with missing copy number (for example, because the
+    input data is targeted sequencing) will be imputed with forward filling.
 
     Parameters
     ----------

diff --git a/pypgx/cli/call_genotypes.py b/pypgx/cli/call_genotypes.py
@@ -5,24 +5,20 @@
 import fuc
 
 description = f"""
-###################################
-# Call genotypes for target gene. #
-###################################
-
-Usage examples:
-  $ pypgx {fuc.api.common._script_name()} CYP2D6-genotypes.zip --alleles CYP2D6-alleles.zip --cnv-calls CYP2D6-cnv-calls.zip
+Call genotypes for the target gene.
 """
 
 def create_parser(subparsers):
     parser = fuc.api.common._add_parser(
         subparsers,
         fuc.api.common._script_name(),
-        help='Call genotypes for target gene.',
+        help='Call genotypes for the target gene.',
         description=description,
     )
     parser.add_argument(
         'genotypes',
-        help='Archive file with the semantic type SampleTable[Genotypes].'
+        help='Archive file with the semantic type \n'
+             'SampleTable[Genotypes].'
     )
     parser.add_argument(
         '--alleles',

diff --git a/pypgx/cli/call_phenotypes.py b/pypgx/cli/call_phenotypes.py
@@ -5,12 +5,7 @@
 import fuc
 
 description = f"""
-########################################
-# Call phenotypes for the target gene. #
-########################################
-
-Usage examples:
-  $ pypgx {fuc.api.common._script_name()} CYP2D6-genotypes.zip CYP2D6-phenotypes.zip
+Call phenotypes for the target gene.
 """
 
 def create_parser(subparsers):

diff --git a/pypgx/cli/combine_results.py b/pypgx/cli/combine_results.py
@@ -5,12 +5,7 @@
 import fuc
 
 description = f"""
-################################################
-# Combine various results for the target gene. #
-################################################
-
-Usage examples:
-  $ pypgx {fuc.api.common._script_name()} CYP2D6-results.zip --genotypes CYP2D6-genotypes.zip --phenotypes CYP2D6-phenotypes.zip --alleles CYP2D6-alleles.zip --cnv-calls CYP2D6-cnv-calls.zip
+Combine various results for the target gene.
 """
 
 def create_parser(subparsers):
@@ -27,22 +22,26 @@ def create_parser(subparsers):
     parser.add_argument(
         '--genotypes',
         metavar='PATH',
-        help='Archive file with the semantic type SampleTable[Genotypes].'
+        help='Archive file with the semantic type \n'
+             'SampleTable[Genotypes].'
     )
     parser.add_argument(
         '--phenotypes',
         metavar='PATH',
-        help='Archive file with the semantic type SampleTable[Phenotypes].'
+        help='Archive file with the semantic type \n'
+             'SampleTable[Phenotypes].'
     )
     parser.add_argument(
         '--alleles',
         metavar='PATH',
-        help='Archive file with the semantic type SampleTable[Alleles].'
+        help='Archive file with the semantic type \n'
+             'SampleTable[Alleles].'
     )
     parser.add_argument(
         '--cnv-calls',
         metavar='PATH',
-        help='Archive file with the semantic type SampleTable[CNVCalls].'
+        help='Archive file with the semantic type \n'
+             'SampleTable[CNVCalls].'
     )
 
 def main(args):

diff --git a/pypgx/cli/compute_control_statistics.py b/pypgx/cli/compute_control_statistics.py
@@ -5,65 +5,77 @@
 import fuc
 import pysam
 
-description = f"""
-##############################################################
-# Compute various statistics for control gene with BAM data. #
-##############################################################
-
-Input BAM files must be specified with either '--bam' or '--fn', but it's an error to use both. Similarly, control gene must be specified with either '--gene' or '--region', but it's an error to use both.
+description = """
+Compute summary statistics for the control gene from BAM files.
+"""
 
-By default, the input data is assumed to be WGS. If it's targeted sequencing, you must provide a BED file with '--bed' to indicate probed regions.
+epilog = f"""
+[Example] To compute summary statistics for the VDR gene from WGS data:
+  $ pypgx {fuc.api.common._script_name()} \\
+  control-statistics-VDR.zip \\
+  --gene VDR \\
+  --bam A.bam B.bam
 
-Usage examples:
-  $ pypgx {fuc.api.common._script_name()} control-statistcs-VDR.zip --gene VDR --bam A.bam B.bam
-  $ pypgx {fuc.api.common._script_name()} control-statistcs-VDR.zip --gene VDR --fn bam.list
-  $ pypgx {fuc.api.common._script_name()} control-statistcs-VDR.zip --gene VDR --fn bam.list --bed probes.bed
-  $ pypgx {fuc.api.common._script_name()} control-statistcs-custom.zip --region chr1:100-200 --fn bam.list
+[Example] For a custom region from targeted sequencing data:
+  $ pypgx {fuc.api.common._script_name()} \\
+  control-statistics-custom.zip \\
+  --region chr1:100-200 \\
+  --fn bam.list \\
+  --bed probes.bed
 """
 
 def create_parser(subparsers):
     parser = fuc.api.common._add_parser(
         subparsers,
         fuc.api.common._script_name(),
-        help='Compute various statistics for control gene with BAM data.',
         description=description,
+        epilog=epilog,
+        help='Compute summary statistics for the control gene from '
+             'BAM files.',
     )
     parser.add_argument(
         'control_statistics',
         metavar='control-statistics',
-        help='Archive file with the semantic type SampleTable[Statistics].'
+        help='Archive file with the semantic type \n'
+             'SampleTable[Statistics].'
     )
     parser.add_argument(
         '--bam',
         metavar='PATH',
         nargs='+',
-        help='One or more BAM files.'
+        help='One or more BAM files. Cannot be used with --fn.'
     )
     parser.add_argument(
         '--fn',
         metavar='PATH',
-        help='File containing one BAM file per line.'
+        help='File containing one BAM file per line. Cannot be \n'
+             'used with --bam.'
     )
     parser.add_argument(
         '--gene',
         metavar='TEXT',
-        help="Control gene (recommended choices: 'EGFR', 'RYR1', 'VDR')."
+        help="Control gene (recommended choices: 'EGFR', 'RYR1', \n"
+             "'VDR'). Cannot be used with --region."
     )
     parser.add_argument(
         '--region',
         metavar='TEXT',
-        help="Custom region to use as control gene ('chrom:start-end')."
+        help="Custom region to use as control gene \n"
+             "('chrom:start-end'). Cannot be used with --gene."
     )
     parser.add_argument(
         '--assembly',
         metavar='TEXT',
         default='GRCh37',
-        help="Reference genome assembly (default: 'GRCh37') (choices: 'GRCh37', 'GRCh38')."
+        help="Reference genome assembly (default: 'GRCh37') \n"
+             "(choices: 'GRCh37', 'GRCh38')."
     )
     parser.add_argument(
         '--bed',
         metavar='PATH',
-        help='BED file.'
+        help="By default, the input data is assumed to be WGS. If it \n"
+             "is targeted sequencing, you must provide a BED file to \n"
+             "indicate probed regions."
     )
 
 def main(args):