From 3461bb3e7b1648cf5cb9e60d9c0d62fbb13ff3c2 Mon Sep 17 00:00:00 2001
From: Seung-been Lee <sbstevenlee@gmail.com>
Date: Mon, 4 Apr 2022 07:21:35 +0900
Subject: [PATCH 01/32] Bump up version number

---
 CHANGELOG.rst    | 3 +++
 pypgx/version.py | 2 +-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index f5959baf..0224f5ad 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -1,6 +1,9 @@
 Changelog
 *********
 
+0.15.0 (in development)
+-----------------------
+
 0.14.0 (2022-04-03)
 -------------------
 
diff --git a/pypgx/version.py b/pypgx/version.py
index ef919940..a842d05a 100644
--- a/pypgx/version.py
+++ b/pypgx/version.py
@@ -1 +1 @@
-__version__ = '0.14.0'
+__version__ = '0.15.0'

From 809f4352b3a2b22277497f3ae354178268c38cb7 Mon Sep 17 00:00:00 2001
From: "Seung-been \"Steven\" Lee" <sbstevenlee@gmail.com>
Date: Mon, 4 Apr 2022 16:20:12 +0900
Subject: [PATCH 02/32] Update `prepare-depth-of-coverage`:

* Add new optional arguments ``--genes`` and ``--exclude`` to
:command:`prepare-depth-of-coverage` command.
---
 CHANGELOG.rst                          |  2 ++
 docs/cli.rst                           | 33 +++++++++++++++-----------
 pypgx/api/utils.py                     |  9 +++++--
 pypgx/cli/prepare_depth_of_coverage.py | 17 ++++++++++++-
 4 files changed, 44 insertions(+), 17 deletions(-)

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index 0224f5ad..1665930d 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -4,6 +4,8 @@ Changelog
 0.15.0 (in development)
 -----------------------
 
+* Add new optional arguments ``--genes`` and ``--exclude`` to :command:`prepare-depth-of-coverage` command.
+
 0.14.0 (2022-04-03)
 -------------------
 
diff --git a/docs/cli.rst b/docs/cli.rst
index 1b22d6c0..a83208de 100644
--- a/docs/cli.rst
+++ b/docs/cli.rst
@@ -703,6 +703,7 @@ prepare-depth-of-coverage
 
    $ pypgx prepare-depth-of-coverage -h
    usage: pypgx prepare-depth-of-coverage [-h] [--assembly TEXT] [--bed PATH]
+                                          [--genes TEXT [TEXT ...]] [--exclude]
                                           depth-of-coverage bams [bams ...]
    
    Prepare a depth of coverage file for all target genes with SV from BAM files.
@@ -713,22 +714,26 @@ prepare-depth-of-coverage
    have star alleles defined only by SNVs/indels (e.g. CYP3A5).
    
    Positional arguments:
-     depth-of-coverage  Output archive file with the semantic type
-                        CovFrame[DepthOfCoverage].
-     bams               One or more input BAM files. Alternatively, you can
-                        provide a text file (.txt, .tsv, .csv, or .list)
-                        containing one BAM file per line.
+     depth-of-coverage     Output archive file with the semantic type
+                           CovFrame[DepthOfCoverage].
+     bams                  One or more input BAM files. Alternatively, you can
+                           provide a text file (.txt, .tsv, .csv, or .list)
+                           containing one BAM file per line.
    
    Optional arguments:
-     -h, --help         Show this help message and exit.
-     --assembly TEXT    Reference genome assembly (default: 'GRCh37')
-                        (choices: 'GRCh37', 'GRCh38').
-     --bed PATH         By default, the input data is assumed to be WGS. If
-                        it's targeted sequencing, you must provide a BED file
-                        to indicate probed regions. Note that the 'chr' prefix
-                        in contig names (e.g. 'chr1' vs. '1') will be
-                        automatically added or removed as necessary to match
-                        the input BAM's contig names.
+     -h, --help            Show this help message and exit.
+     --assembly TEXT       Reference genome assembly (default: 'GRCh37')
+                           (choices: 'GRCh37', 'GRCh38').
+     --bed PATH            By default, the input data is assumed to be WGS. If
+                           it's targeted sequencing, you must provide a BED file
+                           to indicate probed regions. Note that the 'chr' prefix
+                           in contig names (e.g. 'chr1' vs. '1') will be
+                           automatically added or removed as necessary to match
+                           the input BAM's contig names.
+     --genes TEXT [TEXT ...]
+                           List of genes to include.
+     --exclude             Exclude specified genes. Ignored when --genes is not
+                           used.
    
    [Example] From WGS data:
      $ pypgx prepare-depth-of-coverage \
diff --git a/pypgx/api/utils.py b/pypgx/api/utils.py
index ae3007c8..099d49b9 100644
--- a/pypgx/api/utils.py
+++ b/pypgx/api/utils.py
@@ -1184,7 +1184,7 @@ def predict_cnv(copy_number, cnv_caller=None):
     return sdk.Archive(metadata, data)
 
 def prepare_depth_of_coverage(
-    bams, assembly='GRCh37', bed=None
+    bams, assembly='GRCh37', bed=None, genes=None, exclude=False
 ):
     """
     Prepare a depth of coverage file for all target genes with SV from BAM
@@ -1208,6 +1208,10 @@ def prepare_depth_of_coverage(
         Note that the 'chr' prefix in contig names (e.g. 'chr1' vs. '1') will
         be automatically added or removed as necessary to match the input
         BAM's contig names.
+    genes : list, optional
+        List of genes to include.
+    exclude : bool, default: False
+        Exclude specified genes. Ignored when ``genes=None``.
 
     Returns
     -------
@@ -1220,7 +1224,8 @@ def prepare_depth_of_coverage(
     }
 
     regions = create_regions_bed(
-        merge=True, sv_genes=True, assembly=assembly,
+        merge=True, sv_genes=True, assembly=assembly, genes=genes,
+        exclude=exclude
     ).to_regions()
 
     cf = pycov.CovFrame.from_bam(bams, regions=regions, zero=True)
diff --git a/pypgx/cli/prepare_depth_of_coverage.py b/pypgx/cli/prepare_depth_of_coverage.py
index bf066162..987be753 100644
--- a/pypgx/cli/prepare_depth_of_coverage.py
+++ b/pypgx/cli/prepare_depth_of_coverage.py
@@ -71,9 +71,24 @@ def create_parser(subparsers):
 automatically added or removed as necessary to match
 the input BAM's contig names."""
     )
+    parser.add_argument(
+        '--genes',
+        metavar='TEXT',
+        nargs='+',
+        help=
+"""List of genes to include."""
+    )
+    parser.add_argument(
+        '--exclude',
+        action='store_true',
+        help=
+"""Exclude specified genes. Ignored when --genes is not
+used."""
+    )
 
 def main(args):
     archive = utils.prepare_depth_of_coverage(
-        args.bams, assembly=args.assembly, bed=args.bed
+        args.bams, assembly=args.assembly, bed=args.bed, genes=args.genes,
+        exclude=args.exclude
     )
     archive.to_file(args.depth_of_coverage)

From 231a7e229b3661e075744b4bdabe977b510b87c3 Mon Sep 17 00:00:00 2001
From: "Seung-been \"Steven\" Lee" <sbstevenlee@gmail.com>
Date: Wed, 6 Apr 2022 14:33:46 +0900
Subject: [PATCH 03/32] Add new command `slice-bam`

---
 CHANGELOG.rst          |  1 +
 README.rst             |  1 +
 docs/cli.rst           | 27 ++++++++++++++++++++
 pypgx/__init__.py      |  1 +
 pypgx/api/utils.py     | 23 +++++++++++++++++
 pypgx/cli/slice_bam.py | 58 ++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 111 insertions(+)
 create mode 100644 pypgx/cli/slice_bam.py

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index 1665930d..f9517e4c 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -5,6 +5,7 @@ Changelog
 -----------------------
 
 * Add new optional arguments ``--genes`` and ``--exclude`` to :command:`prepare-depth-of-coverage` command.
+* Add new command :command:`slice-bam`.
 
 0.14.0 (2022-04-03)
 -------------------
diff --git a/README.rst b/README.rst
index a3fd0334..2994d70b 100644
--- a/README.rst
+++ b/README.rst
@@ -669,6 +669,7 @@ For getting help on the CLI:
        run-long-read-pipeline
                            Run genotyping pipeline for long-read sequencing data.
        run-ngs-pipeline    Run genotyping pipeline for NGS data.
+       slice-bam           Slice BAM file for all genes used by PyPGx.
        test-cnv-caller     Test CNV caller for target gene.
        train-cnv-caller    Train CNV caller for target gene.
    
diff --git a/docs/cli.rst b/docs/cli.rst
index a83208de..91ecee79 100644
--- a/docs/cli.rst
+++ b/docs/cli.rst
@@ -60,6 +60,7 @@ For getting help on the CLI:
        run-long-read-pipeline
                            Run genotyping pipeline for long-read sequencing data.
        run-ngs-pipeline    Run genotyping pipeline for NGS data.
+       slice-bam           Slice BAM file for all genes used by PyPGx.
        test-cnv-caller     Test CNV caller for target gene.
        train-cnv-caller    Train CNV caller for target gene.
    
@@ -929,6 +930,32 @@ run-ngs-pipeline
      --control-statistcs control-statistics-VDR.zip \
      --platform Targeted
 
+slice-bam
+=========
+
+.. code-block:: text
+
+   $ pypgx slice-bam -h
+   usage: pypgx slice-bam [-h] [--assembly TEXT] [--genes TEXT [TEXT ...]]
+                          [--exclude]
+                          input output
+   
+   Slice BAM file for all genes used by PyPGx.
+   
+   Positional arguments:
+     input                 Input BAM file. It must be already indexed to allow
+                           random access.
+     output                Output BAM file.
+   
+   Optional arguments:
+     -h, --help            Show this help message and exit.
+     --assembly TEXT       Reference genome assembly (default: 'GRCh37')
+                           (choices: 'GRCh37', 'GRCh38').
+     --genes TEXT [TEXT ...]
+                           List of genes to include.
+     --exclude             Exclude specified genes. Ignored when --genes is not
+                           used.
+
 test-cnv-caller
 ===============
 
diff --git a/pypgx/__init__.py b/pypgx/__init__.py
index 9214d35f..f9690bca 100644
--- a/pypgx/__init__.py
+++ b/pypgx/__init__.py
@@ -54,6 +54,7 @@
     predict_cnv,
     prepare_depth_of_coverage,
     print_metadata,
+    slice_bam,
     test_cnv_caller,
     train_cnv_caller,
 )
diff --git a/pypgx/api/utils.py b/pypgx/api/utils.py
index 099d49b9..ce71efc5 100644
--- a/pypgx/api/utils.py
+++ b/pypgx/api/utils.py
@@ -1265,6 +1265,29 @@ def print_metadata(input):
     with zf.open(f'{parent}/metadata.txt') as f:
         print(f.read().decode('utf-8').strip())
 
+def slice_bam(
+    input, output, assembly='GRCh37', genes=None, exclude=False
+):
+    """
+    Slice BAM file for all genes used by PyPGx.
+
+    Parameters
+    ----------
+    input : str
+        Input BAM file. It must be already indexed to allow random access.
+    output : str
+        Output BAM file.
+    assembly : {'GRCh37', 'GRCh38'}, default: 'GRCh37'
+        Reference genome assembly.
+    genes : list, optional
+        List of genes to include.
+    exclude : bool, default: False
+        Exclude specified genes. Ignored when ``genes=None``.
+    """
+    bf = create_regions_bed(merge=True, assembly=assembly,
+        genes=genes, exclude=exclude)
+    pybam.slice(input, bf, path=output)
+
 def test_cnv_caller(
     cnv_caller, copy_number, cnv_calls, confusion_matrix=None
 ):
diff --git a/pypgx/cli/slice_bam.py b/pypgx/cli/slice_bam.py
new file mode 100644
index 00000000..37278cb8
--- /dev/null
+++ b/pypgx/cli/slice_bam.py
@@ -0,0 +1,58 @@
+import sys
+
+from ..api import utils
+
+import fuc
+import pysam
+
+description = f"""
+Slice BAM file for all genes used by PyPGx.
+"""
+
+def create_parser(subparsers):
+    parser = fuc.api.common._add_parser(
+        subparsers,
+        fuc.api.common._script_name(),
+        description=description,
+        help=
+"""Slice BAM file for all genes used by PyPGx."""
+    )
+    parser.add_argument(
+        'input',
+        help=
+"""Input BAM file. It must be already indexed to allow
+random access."""
+    )
+    parser.add_argument(
+        'output',
+        help=
+"""Output BAM file."""
+    )
+    parser.add_argument(
+        '--assembly',
+        metavar='TEXT',
+        default='GRCh37',
+        help=
+"""Reference genome assembly (default: 'GRCh37')
+(choices: 'GRCh37', 'GRCh38')."""
+    )
+    parser.add_argument(
+        '--genes',
+        metavar='TEXT',
+        nargs='+',
+        help=
+"""List of genes to include."""
+    )
+    parser.add_argument(
+        '--exclude',
+        action='store_true',
+        help=
+"""Exclude specified genes. Ignored when --genes is not
+used."""
+    )
+
+def main(args):
+    utils.slice_bam(
+        args.input, args.output, assembly=args.assembly, genes=args.genes,
+        exclude=args.exclude
+    )

From b8c02d2c6424985112529c742d9de2899902e280 Mon Sep 17 00:00:00 2001
From: "Seung-been \"Steven\" Lee" <sbstevenlee@gmail.com>
Date: Thu, 7 Apr 2022 11:00:01 +0900
Subject: [PATCH 04/32] Add new command `print-data`

---
 CHANGELOG.rst           |  1 +
 README.rst              |  1 +
 docs/cli.rst            | 17 +++++++++++++++++
 pypgx/__init__.py       |  1 +
 pypgx/api/utils.py      | 20 ++++++++++++++++++++
 pypgx/cli/print_data.py | 27 +++++++++++++++++++++++++++
 6 files changed, 67 insertions(+)
 create mode 100644 pypgx/cli/print_data.py

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index f9517e4c..0c6cc7cc 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -6,6 +6,7 @@ Changelog
 
 * Add new optional arguments ``--genes`` and ``--exclude`` to :command:`prepare-depth-of-coverage` command.
 * Add new command :command:`slice-bam`.
+* Add new command :command:`print-data`.
 
 0.14.0 (2022-04-03)
 -------------------
diff --git a/README.rst b/README.rst
index 2994d70b..6bf5778c 100644
--- a/README.rst
+++ b/README.rst
@@ -664,6 +664,7 @@ For getting help on the CLI:
        prepare-depth-of-coverage
                            Prepare a depth of coverage file for all target
                            genes with SV from BAM files.
+       print-data          Print the main data of specified archive.
        print-metadata      Print the metadata of specified archive.
        run-chip-pipeline   Run genotyping pipeline for chip data.
        run-long-read-pipeline
diff --git a/docs/cli.rst b/docs/cli.rst
index 91ecee79..f35edd08 100644
--- a/docs/cli.rst
+++ b/docs/cli.rst
@@ -55,6 +55,7 @@ For getting help on the CLI:
        prepare-depth-of-coverage
                            Prepare a depth of coverage file for all target
                            genes with SV from BAM files.
+       print-data          Print the main data of specified archive.
        print-metadata      Print the metadata of specified archive.
        run-chip-pipeline   Run genotyping pipeline for chip data.
        run-long-read-pipeline
@@ -747,6 +748,22 @@ prepare-depth-of-coverage
      bam.list \
      --bed probes.bed
 
+print-data
+==========
+
+.. code-block:: text
+
+   $ pypgx print-data -h
+   usage: pypgx print-data [-h] input
+   
+   Print the main data of specified archive.
+   
+   Positional arguments:
+     input       Input archive file.
+   
+   Optional arguments:
+     -h, --help  Show this help message and exit.
+
 print-metadata
 ==============
 
diff --git a/pypgx/__init__.py b/pypgx/__init__.py
index f9690bca..5b78bf50 100644
--- a/pypgx/__init__.py
+++ b/pypgx/__init__.py
@@ -53,6 +53,7 @@
     predict_alleles,
     predict_cnv,
     prepare_depth_of_coverage,
+    print_data,
     print_metadata,
     slice_bam,
     test_cnv_caller,
diff --git a/pypgx/api/utils.py b/pypgx/api/utils.py
index ce71efc5..a3c15a3d 100644
--- a/pypgx/api/utils.py
+++ b/pypgx/api/utils.py
@@ -1251,6 +1251,26 @@ def prepare_depth_of_coverage(
 
     return sdk.Archive(metadata, cf)
 
+def print_data(input):
+    """
+    Print the main data of specified archive.
+
+    Parameters
+    ----------
+    input : str
+        Path to the input archive file.
+    """
+    archive = sdk.Archive.from_file(input)
+    if 'SampleTable' in archive.type:
+        data = archive.data.to_csv(sep='\t')
+    elif 'CovFrame' in archive.type:
+        data = archive.data.to_string()
+    elif 'VcfFrame' in archive.type:
+        data = archive.data.to_string()
+    else:
+        raise ValueError(f"Data cannot be printed for {archive.type}")
+    print(data, end='')
+
 def print_metadata(input):
     """
     Print the metadata of specified archive.
diff --git a/pypgx/cli/print_data.py b/pypgx/cli/print_data.py
new file mode 100644
index 00000000..70ba9478
--- /dev/null
+++ b/pypgx/cli/print_data.py
@@ -0,0 +1,27 @@
+import sys
+
+from ..api import utils
+
+import fuc
+import pysam
+
+description = f"""
+Print the main data of specified archive.
+"""
+
+def create_parser(subparsers):
+    parser = fuc.api.common._add_parser(
+        subparsers,
+        fuc.api.common._script_name(),
+        description=description,
+        help=
+"""Print the main data of specified archive."""
+    )
+    parser.add_argument(
+        'input',
+        help=
+"""Input archive file."""
+    )
+
+def main(args):
+    utils.print_data(args.input)

From 25e081d1ebb8dde4b9df1fb0ca4cd4f5b5c6150b Mon Sep 17 00:00:00 2001
From: "Seung-been \"Steven\" Lee" <sbstevenlee@gmail.com>
Date: Thu, 7 Apr 2022 11:17:19 +0900
Subject: [PATCH 05/32] Update docs

---
 README.rst     | 11 ++++++++++-
 docs/create.py | 11 ++++++++++-
 2 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/README.rst b/README.rst
index 6bf5778c..2b4e41f1 100644
--- a/README.rst
+++ b/README.rst
@@ -374,7 +374,8 @@ Wroking with archive files
 --------------------------
 
 To demonstrate how easy it is to work with PyPGx archive files, below we will
-show some examples. First, download an archive:
+show some examples. First, download an archive to play with, which has
+``SampleTable[Results]`` as semantic type:
 
 .. code-block:: text
 
@@ -389,6 +390,14 @@ Let's print its metadata:
     Assembly=GRCh37
     SemanticType=SampleTable[Results]
 
+Now print its main data (but display first sample only):
+
+.. code-block:: text
+
+    $ pypgx print-data grch37-CYP2D6-results.zip | head -n 2
+    	Genotype	Phenotype	Haplotype1	Haplotype2	AlternativePhase	VariantData	CNV
+    HG00276_PyPGx	*4/*5	Poor Metabolizer	*4;*10;*74;*2;	*10;*74;*2;	;	*4:22-42524947-C-T:0.913;*10:22-42526694-G-A,22-42523943-A-G:1.0,1.0;*74:22-42525821-G-T:1.0;*2:default;	DeletionHet
+
 We can unzip it to extract files inside (note that ``tmpcty4c_cr`` is the
 original folder name):
 
diff --git a/docs/create.py b/docs/create.py
index 2fbeefe9..dfdecfc0 100644
--- a/docs/create.py
+++ b/docs/create.py
@@ -401,7 +401,8 @@
 --------------------------
 
 To demonstrate how easy it is to work with PyPGx archive files, below we will
-show some examples. First, download an archive:
+show some examples. First, download an archive to play with, which has
+``SampleTable[Results]`` as semantic type:
 
 .. code-block:: text
 
@@ -416,6 +417,14 @@
     Assembly=GRCh37
     SemanticType=SampleTable[Results]
 
+Now print its main data (but display first sample only):
+
+.. code-block:: text
+
+    $ pypgx print-data grch37-CYP2D6-results.zip | head -n 2
+    	Genotype	Phenotype	Haplotype1	Haplotype2	AlternativePhase	VariantData	CNV
+    HG00276_PyPGx	*4/*5	Poor Metabolizer	*4;*10;*74;*2;	*10;*74;*2;	;	*4:22-42524947-C-T:0.913;*10:22-42526694-G-A,22-42523943-A-G:1.0,1.0;*74:22-42525821-G-T:1.0;*2:default;	DeletionHet
+
 We can unzip it to extract files inside (note that ``tmpcty4c_cr`` is the
 original folder name):
 

From 6a05b85e3ee0ef1d6f29f7432826a6c0277cd286 Mon Sep 17 00:00:00 2001
From: "Seung-been \"Steven\" Lee" <sbstevenlee@gmail.com>
Date: Fri, 8 Apr 2022 12:37:45 +0900
Subject: [PATCH 06/32] Update docs

---
 docs/tutorials.rst | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/docs/tutorials.rst b/docs/tutorials.rst
index 307ebd5e..7b692828 100644
--- a/docs/tutorials.rst
+++ b/docs/tutorials.rst
@@ -54,6 +54,11 @@ those from:
 Please visit the :ref:`readme:Pipelines` page for details on how to generate
 the input files.
 
+In case you are interested in creating above input files on your own, I have
+also prepared "mini" BAM files where the original BAM files from GeT-RM have
+been sliced to only contain genes used by PyPGx. You can download them `here
+<https://1drv.ms/u/s!Apgoq3uQ2gCqgrovIFKJSi-ECXY9pw?e=uP5EeU>`__.
+
 Let's look at the metadata for some of these files:
 
 .. code-block:: text

From a1a11c3c036b8df6645f7b30102f7948483c299f Mon Sep 17 00:00:00 2001
From: Seung-been Lee <sbstevenlee@gmail.com>
Date: Sat, 9 Apr 2022 11:05:45 +0900
Subject: [PATCH 07/32] Update docs

---
 docs/tutorials.rst | 77 ++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 64 insertions(+), 13 deletions(-)

diff --git a/docs/tutorials.rst b/docs/tutorials.rst
index 7b692828..54862990 100644
--- a/docs/tutorials.rst
+++ b/docs/tutorials.rst
@@ -35,10 +35,13 @@ available for download and use from the `European Nucleotide Archive
 <https://www.ebi.ac.uk/ena/browser/view/PRJEB19931>`__. We will be using this
 WGS dataset throughout the tutorial.
 
-Because downloading the entire WGS dataset is not feasible for most users due
-to its file size (i.e. a 30x WGS sample ≈ 90 GB), I have prepared input files
-ranging from 2 KB to 17.6 MB, for both GRCh37 and GRCh38. You can download
-those from:
+Obtaining input files
+---------------------
+
+Because downloading the entire WGS dataset is probably not feasible for most
+users due to large file size (i.e. a 30x WGS sample ≈ 90 GB), I have prepared
+input files ranging from 2 KB to 25.5 MB, for both GRCh37 and GRCh38. You can
+easily download these with:
 
 .. code-block:: text
 
@@ -51,14 +54,6 @@ those from:
     $ wget https://raw.githubusercontent.com/sbslee/pypgx-data/main/getrm-wgs-tutorial/grch38-depth-of-coverage.zip
     $ wget https://raw.githubusercontent.com/sbslee/pypgx-data/main/getrm-wgs-tutorial/grch38-control-statistics-VDR.zip
 
-Please visit the :ref:`readme:Pipelines` page for details on how to generate
-the input files.
-
-In case you are interested in creating above input files on your own, I have
-also prepared "mini" BAM files where the original BAM files from GeT-RM have
-been sliced to only contain genes used by PyPGx. You can download them `here
-<https://1drv.ms/u/s!Apgoq3uQ2gCqgrovIFKJSi-ECXY9pw?e=uP5EeU>`__.
-
 Let's look at the metadata for some of these files:
 
 .. code-block:: text
@@ -67,19 +62,75 @@ Let's look at the metadata for some of these files:
     Assembly=GRCh37
     SemanticType=CovFrame[DepthOfCoverage]
     Platform=WGS
+
+.. code-block:: text
+
     $ pypgx print-metadata grch38-control-statistics-VDR.zip
     Control=VDR
     Assembly=GRCh38
     SemanticType=SampleTable[Statistics]
     Platform=WGS
 
+At this point, you are now ready to move on to the next step.
+
+Optionally, if you are interested in creating the above input files on your
+own, I have also prepared "mini" BAM files for GRCh37 in which the original
+sequencing data from GeT-RM have been sliced to contain only the genes used
+by PyPGx. You can download them `here <https://1drv.ms/u/
+s!Apgoq3uQ2gCqgrovIFKJSi-ECXY9pw?e=uP5EeU>`__. You will also need a reference
+FASTA file when creating the input VCF; it can be downloaded from `here
+<https://1drv.ms/u/s!Apgoq3uQ2gCqgt4qGq9YsumpVk9xJQ?e=ZewLHu>`__.
+
+Once you are finished downloading the mini BAM files and the reference FASTA
+file, let's first create input VCF:
+
+.. code-block:: text
+
+    $ pypgx create-input-vcf \
+    grch37-variants.vcf.gz \
+    /path/to/genome.fa \
+    grch37-bam.list
+
+Note that this step can take some time to run. For example, it takes about 1
+hour to finish using my personal MacBook Air (M1, 2020) with 8 GB of memory.
+
+Next, we will compute depth of coverage for genes that are known to have SV:
+
+.. code-block:: text
+
+    $ pypgx prepare-depth-of-coverage \
+    grch37-depth-of-coverage.zip \
+    grch37-bam.list
+
+This step should be quick. It finishes in less than 30 seconds with my laptop.
+
+.. code-block:: text
+
+    $ pypgx compute-control-statistics \
+    VDR \
+    grch37-control-statistics-VDR.zip \
+    grch37-bam.list
+
+Finally, we can compute control statistics using the VDR gene as control
+locus, which will be used when converting read depth to copy number:
+
+.. code-block:: text
+
+    $ pypgx compute-control-statistics \
+    VDR \
+    grch37-control-statistics-VDR.zip \
+    grch37-bam.list
+
+This step should be quick as well. It finishes in less than 5 seconds with my
+laptop.
+
 Genotyping genes with SV
 ------------------------
 
 The first gene we are going to genotype is CYP2D6, which has almost 150
 star alleles including those with SV (e.g. gene deletions, duplications, and
 hybrids). To this end, we will run PyPGx's next-generation sequencing (NGS)
-pipeline:
+pipeline (see :ref:`readme:NGS pipeline` for more details):
 
 .. code-block:: text
 

From 5392a5c21747aca4dd2c171affae3486749e6f56 Mon Sep 17 00:00:00 2001
From: Seung-been Lee <sbstevenlee@gmail.com>
Date: Sat, 9 Apr 2022 11:27:08 +0900
Subject: [PATCH 08/32] Update docs

---
 README.rst         |  6 ++++++
 docs/create.py     |  6 ++++++
 docs/tutorials.rst | 11 ++---------
 pypgx/api/core.py  |  4 ++--
 4 files changed, 16 insertions(+), 11 deletions(-)

diff --git a/README.rst b/README.rst
index 2b4e41f1..70ed10ee 100644
--- a/README.rst
+++ b/README.rst
@@ -524,6 +524,9 @@ HaplotypeCaller). See the `Variant caller choice <https://pypgx.readthedocs.
 io/en/latest/faq.html#variant-caller-choice>`__ section for detailed
 discussion on when to use either option.
 
+Check out the `GeT-RM WGS tutorial <https://pypgx.readthedocs.io/en/latest/
+tutorials.html#get-rm-wgs-tutorial>`__ to see this pipeline in action.
+
 Chip pipeline
 -------------
 
@@ -543,6 +546,9 @@ The pipeline currently does not support SV detection. Please post a GitHub
 issue if you want to contribute your development skills and/or data for
 devising an SV detection algorithm.
 
+Check out the `Coriell Affy tutorial <https://pypgx.readthedocs.io/en/latest/
+tutorials.html#coriell-affy-tutorial>`__ to see this pipeline in action.
+
 Long-read pipeline
 ------------------
 
diff --git a/docs/create.py b/docs/create.py
index dfdecfc0..71a8bd80 100644
--- a/docs/create.py
+++ b/docs/create.py
@@ -551,6 +551,9 @@
 io/en/latest/faq.html#variant-caller-choice>`__ section for detailed
 discussion on when to use either option.
 
+Check out the `GeT-RM WGS tutorial <https://pypgx.readthedocs.io/en/latest/
+tutorials.html#get-rm-wgs-tutorial>`__ to see this pipeline in action.
+
 Chip pipeline
 -------------
 
@@ -570,6 +573,9 @@
 issue if you want to contribute your development skills and/or data for
 devising an SV detection algorithm.
 
+Check out the `Coriell Affy tutorial <https://pypgx.readthedocs.io/en/latest/
+tutorials.html#coriell-affy-tutorial>`__ to see this pipeline in action.
+
 Long-read pipeline
 ------------------
 
diff --git a/docs/tutorials.rst b/docs/tutorials.rst
index 54862990..17952bce 100644
--- a/docs/tutorials.rst
+++ b/docs/tutorials.rst
@@ -104,15 +104,8 @@ Next, we will compute depth of coverage for genes that are known to have SV:
 
 This step should be quick. It finishes in less than 30 seconds with my laptop.
 
-.. code-block:: text
-
-    $ pypgx compute-control-statistics \
-    VDR \
-    grch37-control-statistics-VDR.zip \
-    grch37-bam.list
-
-Finally, we can compute control statistics using the VDR gene as control
-locus, which will be used when converting read depth to copy number:
+Finally, we will compute control statistics using the VDR gene as control
+locus, which is required when converting read depth to copy number:
 
 .. code-block:: text
 
diff --git a/pypgx/api/core.py b/pypgx/api/core.py
index 9e1916b8..90cdc201 100644
--- a/pypgx/api/core.py
+++ b/pypgx/api/core.py
@@ -1123,7 +1123,7 @@ def predict_phenotype(gene, a, b):
     gene deletion, duplication, and tandem arrangement.
 
     For detailed implementation, please see the `Phenotype prediction
-    <file:///Users/sbslee/Desktop/pypgx/docs/_build/html/
+    <https://pypgx.readthedocs.io/en/latest/
     readme.html#phenotype-prediction>`__ section.
 
     Parameters
@@ -1199,7 +1199,7 @@ def predict_score(gene, allele):
     activity score system.
 
     For detailed implementation, please see the `Phenotype prediction
-    <file:///Users/sbslee/Desktop/pypgx/docs/_build/html/
+    <https://pypgx.readthedocs.io/en/latest/
     readme.html#phenotype-prediction>`__ section.
 
     Parameters

From 3119a069a719417da75fd03fbd997e0e6ab6d682 Mon Sep 17 00:00:00 2001
From: Seung-been Lee <sbstevenlee@gmail.com>
Date: Sun, 10 Apr 2022 09:53:46 +0900
Subject: [PATCH 09/32] Update `print-data` command to avoid BrokenPipeError

---
 pypgx/api/utils.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/pypgx/api/utils.py b/pypgx/api/utils.py
index a3c15a3d..9cccaa1f 100644
--- a/pypgx/api/utils.py
+++ b/pypgx/api/utils.py
@@ -8,6 +8,7 @@
 import zipfile
 import subprocess
 import os
+import sys
 import pickle
 import warnings
 
@@ -1269,7 +1270,14 @@ def print_data(input):
         data = archive.data.to_string()
     else:
         raise ValueError(f"Data cannot be printed for {archive.type}")
-    print(data, end='')
+
+    # https://docs.python.org/3/library/signal.html#note-on-sigpipe
+    try:
+        print(data, end='')
+    except BrokenPipeError:
+        devnull = os.open(os.devnull, os.O_WRONLY)
+        os.dup2(devnull, sys.stdout.fileno())
+        sys.exit(1)
 
 def print_metadata(input):
     """

From d8f17fc3212b2adcb2095eebc634f97b7e65e754 Mon Sep 17 00:00:00 2001
From: Seung-been Lee <sbstevenlee@gmail.com>
Date: Sun, 10 Apr 2022 10:30:23 +0900
Subject: [PATCH 10/32] Update docs

---
 docs/tutorials.rst | 81 ++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 78 insertions(+), 3 deletions(-)

diff --git a/docs/tutorials.rst b/docs/tutorials.rst
index 17952bce..79f0aea1 100644
--- a/docs/tutorials.rst
+++ b/docs/tutorials.rst
@@ -54,7 +54,12 @@ easily download these with:
     $ wget https://raw.githubusercontent.com/sbslee/pypgx-data/main/getrm-wgs-tutorial/grch38-depth-of-coverage.zip
     $ wget https://raw.githubusercontent.com/sbslee/pypgx-data/main/getrm-wgs-tutorial/grch38-control-statistics-VDR.zip
 
-Let's look at the metadata for some of these files:
+Let's take a look at the metadata for some of these files. If you're not
+familiar with what metadata is, please visit `Archive file, semantic type,
+and metadata <https://pypgx.readthedocs.io/en/latest/
+readme.html#archive-file-semantic-type-and-metadata>`__. The first one we'll
+look at is an archive file with the semantic type
+``CovFrame[DepthOfCoverage]``:
 
 .. code-block:: text
 
@@ -63,6 +68,26 @@ Let's look at the metadata for some of these files:
     SemanticType=CovFrame[DepthOfCoverage]
     Platform=WGS
 
+We can see that the above archive was created using WGS data aligned to
+GRCh37. It has the following data structure:
+
+.. code-block:: text
+
+    $ pypgx print-data grch37-depth-of-coverage.zip | head
+    Chromosome	Position	NA18519_PyPGx	HG01190_PyPGx	NA12006_PyPGx	NA18484_PyPGx	NA07055_PyPGx	NA18980_PyPGx	NA19213_PyPGx	NA12813_PyPGx	NA19003_PyPGx	NA10831_PyPGx	NA18524_PyPGx	NA10851_PyPGx	NA18966_PyPGx	HG00589_PyPGx	NA18855_PyPGx	NA18544_PyPGx	NA18518_PyPGx	NA18973_PyPGx	NA19143_PyPGx	NA18992_PyPGx	NA12873_PyPGx	NA19207_PyPGx	NA18942_PyPGx	NA19178_PyPGx	NA19789_PyPGx	NA19122_PyPGx	NA19174_PyPGx	NA18868_PyPGx	HG00436_PyPGx	HG00276_PyPGx	NA19239_PyPGx	NA19109_PyPGx	NA20509_PyPGx	NA10854_PyPGx	NA19226_PyPGx	NA10847_PyPGx	NA18552_PyPGx	NA18526_PyPGx	NA07029_PyPGx	NA06991_PyPGx	NA11832_PyPGx	NA21781_PyPGx	NA12145_PyPGx	NA19007_PyPGx	NA18861_PyPGx	NA12156_PyPGx	NA18952_PyPGx	NA18565_PyPGx	NA19920_PyPGx	NA12003_PyPGx	NA20296_PyPGx	NA07019_PyPGx	NA07056_PyPGx	NA11993_PyPGx	NA19147_PyPGx	NA19819_PyPGx	NA07000_PyPGx	NA18540_PyPGx	NA19095_PyPGx	NA18509_PyPGx	NA19917_PyPGx	NA18617_PyPGx	NA07357_PyPGx	NA19176_PyPGx	NA18959_PyPGx	NA07348_PyPGx	NA18564_PyPGx	NA19908_PyPGx	NA11839_PyPGx	NA12717_PyPGx
+    chr1	110227417	17	0	9	12	12	13	10	0	0	0	0	1	14	10	4	26	7	6	0	0	4	19	8	6	0	15	0	17	20	0	0	15	10	11	0	7	18	0	0	0	0	22	11	0	6	0	0	0	24	17	17	12	19	0	14	0	0	13	15	8	0	24	0	10
+    chr1	110227418	17	0	9	12	12	13	10	0	0	0	0	1	14	10	4	26	8	8	0	0	4	19	9	6	0	15	0	18	20	0	0	16	10	11	0	8	18	0	0	0	0	22	11	0	6	0	0	0	24	17	17	12	20	0	14	0	0	13	15	8	0	24	0	10
+    chr1	110227419	17	0	10	12	12	13	10	0	0	0	0	1	14	10	4	27	8	8	0	0	5	19	9	6	0	16	0	18	20	0	0	16	11	11	0	8	18	0	0	0	0	22	12	0	6	0	0	0	24	17	17	12	20	0	14	0	0	14	15	8	0	24	0	10
+    chr1	110227420	17	0	10	13	13	12	10	0	0	0	0	1	14	10	3	27	8	8	0	0	5	18	9	6	0	15	0	18	19	0	0	16	11	11	0	8	16	0	0	0	0	22	12	0	6	0	0	0	24	19	17	11	19	0	13	0	0	14	15	8	0	23	0	10
+    chr1	110227421	17	0	10	13	13	12	10	0	0	0	0	1	13	10	3	27	8	8	0	0	5	18	8	7	0	15	0	19	19	0	0	16	11	11	0	8	15	0	0	0	0	22	12	0	6	0	0	0	25	20	17	11	19	0	13	0	0	15	15	8	0	23	0	10
+    chr1	110227422	18	0	10	13	13	12	10	0	0	0	0	1	13	10	3	27	8	8	0	0	5	18	9	7	0	15	0	19	19	0	0	17	11	11	0	8	15	0	0	0	0	21	12	0	6	0	0	0	25	20	18	11	19	0	13	0	0	16	15	9	0	23	0	10
+    chr1	110227423	18	0	10	13	13	12	10	0	0	0	0	1	13	10	3	25	8	8	0	0	5	18	9	7	0	15	0	19	18	0	0	17	11	11	0	9	15	0	0	0	0	21	13	0	6	0	0	0	25	20	18	11	19	0	13	0	0	17	15	9	0	23	0	10
+    chr1	110227424	18	0	10	13	13	12	10	0	0	0	0	1	13	10	3	25	8	8	0	0	5	18	9	7	0	15	0	19	18	0	0	17	11	11	0	9	15	0	0	0	0	21	13	0	6	0	0	0	26	20	18	11	19	0	14	0	0	16	15	9	0	23	0	10
+    chr1	110227425	19	0	11	13	13	12	10	0	0	0	0	1	13	10	3	25	8	8	0	0	5	18	9	8	0	15	0	20	18	0	0	17	11	11	0	9	15	0	0	0	0	21	13	0	6	0	0	0	26	20	18	13	19	0	15	0	0	16	15	9	0	23	0	10
+
+The second one is an archive file with the semantic type
+``SampleTable[Statistics]``:
+
 .. code-block:: text
 
     $ pypgx print-metadata grch38-control-statistics-VDR.zip
@@ -71,8 +96,41 @@ Let's look at the metadata for some of these files:
     SemanticType=SampleTable[Statistics]
     Platform=WGS
 
+Note that this archive was created using WGS data aligned to GRCh38 with the
+VDR gene as the control locus, and has the following data structure:
+
+.. code-block:: text
+
+    $ pypgx print-data grch38-control-statistics-VDR.zip | head
+    	count	mean	std	min	25%	50%	75%	max
+    NA19213_PyPGx	69459.0	40.464317079140216	7.416070659882781	5.0	35.0	40.0	45.0	67.0
+    HG00436_PyPGx	69459.0	39.05070617198635	7.041075412533929	3.0	34.0	39.0	44.0	66.0
+    NA12006_PyPGx	69459.0	44.49780446018514	7.565078889270334	6.0	39.0	44.0	50.0	73.0
+    NA12156_PyPGx	69459.0	39.53788565916584	7.463158820634827	3.0	34.0	39.0	44.0	66.0
+    NA12813_PyPGx	69459.0	37.33543529276264	6.920597209929764	7.0	33.0	37.0	42.0	67.0
+    NA19207_PyPGx	69459.0	40.59959112570005	7.042408883522744	4.0	36.0	41.0	45.0	63.0
+    NA07029_PyPGx	69459.0	38.69389136037086	7.075488283784741	2.0	34.0	39.0	44.0	67.0
+    NA18980_PyPGx	69459.0	34.79616752328712	6.685174389736681	1.0	30.0	35.0	39.0	59.0
+    NA18973_PyPGx	69459.0	36.43840251083373	7.0885860461926296	3.0	32.0	37.0	41.0	66.0
+
+Finally, we'll look at the input VCF. Note that it's not an archive file per
+se, but we can still peek at its data:
+
+.. code-block:: text
+
+    $ zcat grch37-variants.vcf.gz | grep "#CHROM" -A 5
+    #CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	NA18519_PyPGx	HG01190_PyPGx	NA12006_PyPGx	NA18484_PyPGx	NA07055_PyPGx	NA18980_PyPGx	NA19213_PyPGx	NA12813_PyPGx	NA19003_PyPGx	NA10831_PyPGx	NA18524_PyPGx	NA10851_PyPGx	NA18966_PyPGx	HG00589_PyPGx	NA18855_PyPGx	NA18544_PyPGx	NA18518_PyPGx	NA18973_PyPGx	NA19143_PyPGx	NA18992_PyPGx	NA12873_PyPGx	NA19207_PyPGx	NA18942_PyPGx	NA19178_PyPGx	NA19789_PyPGx	NA19122_PyPGx	NA19174_PyPGx	NA18868_PyPGx	HG00436_PyPGx	HG00276_PyPGx	NA19239_PyPGx	NA19109_PyPGx	NA20509_PyPGx	NA10854_PyPGx	NA19226_PyPGx	NA10847_PyPGx	NA18552_PyPGx	NA18526_PyPGx	NA07029_PyPGx	NA06991_PyPGx	NA11832_PyPGx	NA21781_PyPGx	NA12145_PyPGx	NA19007_PyPGx	NA18861_PyPGx	NA12156_PyPGx	NA18952_PyPGx	NA18565_PyPGx	NA19920_PyPGx	NA12003_PyPGx	NA20296_PyPGx	NA07019_PyPGx	NA07056_PyPGx	NA11993_PyPGx	NA19147_PyPGx	NA19819_PyPGx	NA07000_PyPGx	NA18540_PyPGx	NA19095_PyPGx	NA18509_PyPGx	NA19917_PyPGx	NA18617_PyPGx	NA07357_PyPGx	NA19176_PyPGx	NA18959_PyPGx	NA07348_PyPGx	NA18564_PyPGx	NA19908_PyPGx	NA11839_PyPGx	NA12717_PyPGx
+    chr1	47261780	.	T	C	235.707	PASS	DP=1519;VDB=0.326231;SGB=-40.8249;RPBZ=0.398415;MQBZ=-15.2308;MQSBZ=0.889911;BQBZ=-10.8447;SCBZ=0.105486;FS=0;MQ0F=0;AC=120;AN=140;DP4=205,13,1153,122;MQ=49	GT:PL:AD	0/0:0,57,255:19,0	0/1:204,0,172:10,11	1/1:240,45,0:0,15	0/1:147,0,165:11,10	1/1:246,54,0:0,18	1/1:255,66,0:0,22	0/1:134,0,182:15,9	1/1:255,87,0:0,29	1/1:231,54,0:0,18	1/1:224,57,0:0,19	1/1:248,36,0:0,12	0/1:120,0,176:9,7	1/1:255,54,0:0,18	1/1:198,75,0:0,25	0/1:168,0,127:7,12	1/1:255,57,0:0,19	0/1:105,0,183:9,5	1/1:223,51,0:0,17	1/1:255,63,0:0,21	1/1:255,80,0:1,31	1/1:189,60,0:0,20	0/1:148,0,214:10,12	1/1:191,45,0:0,15	0/1:98,0,175:15,6	1/1:255,69,0:0,23	0/1:158,0,100:7,16	0/1:161,0,114:5,12	0/1:255,0,138:9,14	1/1:247,81,0:0,27	1/1:227,57,0:0,19	1/1:255,63,0:0,21	1/1:255,69,0:0,23	1/1:255,75,0:0,25	1/1:255,84,0:0,28	0/1:202,0,190:14,15	1/1:224,69,0:0,23	1/1:255,66,0:0,22	1/1:255,63,0:0,21	1/1:255,39,0:0,13	1/1:255,51,0:0,17	1/1:255,72,0:0,24	1/1:231,63,0:0,21	1/1:255,78,0:0,26	1/1:255,75,0:0,25	0/1:145,0,227:16,10	1/1:200,72,0:0,24	1/1:205,72,0:0,24	1/1:207,66,0:0,22	0/1:109,0,172:12,8	0/1:174,0,135:9,14	1/1:255,66,0:0,22	1/1:255,45,0:0,15	1/1:249,54,0:0,18	1/1:255,54,0:0,18	1/1:230,72,0:0,24	1/1:247,63,0:0,21	1/1:211,81,0:0,27	1/1:255,54,0:0,18	0/1:167,0,193:13,13	1/1:255,72,0:0,24	0/1:76,0,159:11,4	1/1:236,66,0:0,22	1/1:255,78,0:0,26	1/1:218,45,0:0,15	1/1:255,60,0:0,20	1/1:255,66,0:0,22	1/1:202,78,0:0,26	1/1:255,81,0:0,27	0/1:181,0,176:16,11	1/1:231,33,0:0,11
+    chr1	47261821	.	G	A	174.846	PASS	DP=1722;VDB=0.413935;SGB=-18.2343;RPBZ=0.238211;MQBZ=-1.89867;MQSBZ=6.49061;BQBZ=1.3413;SCBZ=0.173613;FS=0;MQ0F=0;AC=1;AN=140;DP4=1407,277,14,2;MQ=52	GT:PL:AD	0/0:0,81,255:27,0	0/0:0,84,255:28,0	0/0:0,60,255:20,0	0/0:0,90,239:30,0	0/0:0,60,221:20,0	0/0:0,84,255:28,0	0/0:0,84,241:28,0	0/0:0,81,255:27,0	0/0:0,63,190:21,0	0/1:200,0,127:11,110/0:0,63,255:21,0	0/0:0,75,255:25,0	0/0:0,63,255:21,0	0/0:0,63,215:21,0	0/0:0,69,216:23,0	0/0:0,75,255:25,0	0/0:0,54,244:18,0	0/0:0,57,212:19,0	0/0:0,90,255:30,0	0/0:0,96,255:32,0	0/0:0,72,241:24,0	0/0:0,72,223:24,0	0/0:0,54,191:18,0	0/0:0,75,223:25,0	0/0:0,75,255:25,0	0/0:0,90,222:30,0	0/0:0,54,180:18,0	0/0:0,99,255:33,0	0/0:0,93,255:31,0	0/0:0,66,212:22,0	0/0:0,72,255:24,0	0/0:0,75,243:25,0	0/0:0,72,255:24,0	0/0:0,69,255:27,1	0/0:0,102,250:34,0	0/0:0,81,186:27,0	0/0:0,66,255:22,0	0/0:0,72,255:24,0	0/0:0,50,236:21,1	0/0:0,60,255:20,0	0/0:0,75,255:25,0	0/0:0,54,182:18,0	0/0:0,75,255:25,0	0/0:0,78,255:26,0	0/0:0,81,233:27,0	0/0:0,78,153:26,0	0/0:0,75,180:25,0	0/0:0,60,174:20,0	0/0:0,51,189:17,0	0/0:0,84,234:28,0	0/0:0,63,255:21,0	0/0:0,48,210:16,0	0/0:0,63,231:21,0	0/0:0,69,255:23,0	0/0:0,81,252:27,0	0/0:0,69,178:23,0	0/0:0,69,221:23,0	0/0:0,57,255:19,0	0/0:0,75,217:25,0	0/0:0,93,255:31,0	0/0:0,54,231:18,0	0/0:0,96,211:32,0	0/0:0,93,255:31,0	0/0:0,54,211:18,0	0/0:0,66,243:22,0	0/0:0,72,222:24,0	0/0:0,90,236:30,0	0/0:0,78,242:26,0	0/0:0,87,255:29,0	0/0:0,45,255:15,0
+    chr1	47261822	.	A	T	232.856	PASS	DP=1729;VDB=0.568499;SGB=-11.6626;RPBZ=-0.581723;MQBZ=-14.8734;MQSBZ=6.53808;BQBZ=1.09344;SCBZ=1.03879;FS=0;MQ0F=0;AC=88;AN=140;DP4=544,110,864,174;MQ=52	GT:PL:AD	0/0:0,81,255:27,0	0/1:255,0,226:12,17	1/1:255,60,0:0,20	0/0:0,87,255:29,0	0/0:0,63,255:21,0	0/1:152,0,255:15,11	0/1:182,0,223:17,11	1/1:255,81,0:0,27	0/1:128,0,189:13,8	1/1:255,69,0:0,23	1/1:255,66,0:0,22	0/1:246,0,193:11,14	1/1:255,60,0:0,20	1/1:255,60,0:0,20	0/0:0,66,255:22,0	1/1:255,75,0:0,25	0/0:0,54,255:18,0	1/1:255,54,0:0,18	0/1:209,0,255:19,10	0/1:255,0,255:16,161/1:255,72,0:0,24	0/1:145,0,248:15,10	0/1:113,0,170:9,6	0/1:153,0,206:16,8	1/1:255,69,0:0,23	0/0:0,87,255:29,0	0/1:149,0,187:9,10	0/1:255,0,171:12,20	0/1:176,0,255:16,13	0/1:218,0,145:11,130/1:221,0,218:14,10	0/1:237,0,184:11,15	1/1:255,72,0:0,24	1/1:255,84,0:0,28	0/1:254,0,194:16,181/1:255,75,0:0,25	1/1:255,60,0:0,20	1/1:255,69,0:0,23	0/0:0,69,255:23,0	0/0:0,60,255:20,0	1/1:255,72,0:0,24	1/1:236,54,0:0,18	1/1:255,75,0:0,25	0/1:155,0,255:18,10	0/0:0,81,255:27,0	1/1:212,75,0:0,25	0/1:196,0,133:10,15	0/1:171,0,155:9,11	0/1:105,0,188:10,7	0/1:182,0,219:14,131/1:255,63,0:0,21	1/1:255,48,0:0,16	1/1:255,63,0:0,21	1/1:255,72,0:0,24	1/1:255,78,0:0,26	1/1:232,63,0:0,21	0/0:0,66,255:22,0	0/1:150,0,215:10,7	0/1:180,0,178:13,12	0/1:243,0,190:12,180/1:106,0,222:11,6	0/1:212,0,193:13,19	1/1:255,87,0:0,29	1/1:255,57,0:0,19	0/1:203,0,189:9,13	1/1:255,69,0:0,23	0/1:233,0,146:9,20	0/0:0,81,255:27,0	0/1:180,0,249:19,9	1/1:255,45,0:0,15
+    chr1	47261869	.	C	T	235.707	PASS	DP=1863;VDB=0.677143;SGB=5.02317;RPBZ=-2.55997;MQBZ=-8.87433;MQSBZ=3.1481;BQBZ=26.6865;SCBZ=0.647961;FS=0;MQ0F=0;AC=88;AN=140;DP4=522,174,834,311;MQ=56	GT:PL:AD	0/0:0,84,255:28,0	0/1:255,0,194:12,20	1/1:255,69,0:0,23	0/0:0,93,255:31,0	0/0:0,69,255:23,0	0/1:216,0,255:17,11	0/1:218,0,238:14,14	1/1:255,90,0:0,30	0/1:190,0,148:10,9	1/1:255,60,0:0,20	1/1:255,81,0:0,27	0/1:255,0,192:13,13	1/1:255,81,0:0,27	1/1:255,81,0:1,31	0/0:0,65,255:26,1	1/1:255,78,0:0,26	0/0:0,51,255:17,0	1/1:255,63,0:0,21	0/1:240,0,244:17,12	0/1:255,0,255:17,171/1:255,69,0:0,23	0/1:186,0,239:16,11	0/1:247,0,255:15,12	0/1:231,0,221:17,11	1/1:255,69,0:0,23	0/0:0,87,255:29,0	0/1:196,0,198:11,11	0/1:255,0,213:16,20	0/1:232,0,238:16,13	0/1:255,0,175:13,150/1:223,0,245:22,11	0/1:255,0,255:15,16	1/1:255,81,0:0,27	1/1:255,99,0:0,33	0/1:255,0,209:16,161/1:255,87,0:0,29	1/1:255,75,0:0,25	1/1:255,75,0:0,25	0/0:0,66,255:22,0	0/0:0,63,255:21,0	1/1:255,78,0:0,26	1/1:218,54,0:0,18	1/1:255,78,0:0,26	0/1:173,0,255:23,12	0/0:0,72,255:24,0	1/1:255,75,0:0,25	0/1:213,0,168:11,13	0/1:247,0,188:11,12	0/1:195,0,124:6,9	0/1:173,0,205:16,121/1:255,66,0:0,22	1/1:255,72,0:0,24	1/1:255,54,0:0,18	1/1:255,93,0:0,31	1/1:255,84,0:0,28	1/1:255,66,0:0,22	0/0:0,48,255:21,1	0/1:190,0,255:13,8	0/1:255,0,173:9,13	0/1:255,0,214:16,180/1:202,0,179:12,11	0/1:255,0,218:16,17	1/1:255,84,0:0,28	1/1:255,81,0:0,27	0/1:255,0,111:7,18	1/1:255,69,0:0,23	0/1:255,0,213:13,19	0/0:0,66,255:22,0	0/1:253,0,247:21,13	1/1:255,75,0:0,25
+    chr1	47261936	.	C	T	232.857	PASS	DP=2179;VDB=0.991573;SGB=71.95;RPBZ=0.621331;MQBZ=0.919674;MQSBZ=-0.0215108;BQBZ=10.1541;SCBZ=0.212854;FS=0;MQ0F=0;AC=17;AN=140;DP4=1145,745,173,83;MQ=59	GT:PL:AD	0/0:0,87,255:29,0	0/0:0,117,255:39,0	0/0:0,72,255:24,0	0/0:0,105,255:35,0	0/1:205,0,189:10,160/1:255,0,230:10,15	0/0:0,96,255:32,0	0/0:0,96,255:32,0	0/1:225,0,222:13,12	0/0:0,69,255:23,0	0/0:0,105,255:35,0	0/0:0,78,255:26,0	0/0:0,114,255:38,0	0/0:0,123,255:41,0	0/1:210,0,255:18,100/0:0,105,255:35,0	0/0:0,78,255:26,0	0/0:0,90,255:30,0	0/0:0,96,255:32,0	0/0:0,108,255:36,0	0/0:0,84,255:28,0	0/0:0,75,255:25,0	0/1:255,0,255:15,13	0/0:0,93,255:31,0	0/0:0,84,255:28,0	0/0:0,87,255:29,0	0/0:0,81,255:27,0	0/0:0,111,255:37,0	0/1:255,0,183:10,16	0/1:255,0,251:15,170/0:0,108,255:36,0	0/0:0,99,255:33,0	0/0:0,102,255:34,0	0/0:0,99,255:33,0	0/0:0,105,255:35,0	0/0:0,117,255:39,0	0/0:0,78,255:26,0	0/0:0,102,255:34,0	1/1:255,75,0:0,25	1/1:255,99,0:0,33	0/0:0,78,255:26,0	0/0:0,66,255:22,0	0/0:0,96,255:32,0	0/0:0,87,255:29,0	0/0:0,81,255:27,0	0/0:0,93,255:31,0	0/1:224,0,252:15,13	0/0:0,96,255:32,0	0/0:0,81,255:27,0	0/0:0,102,255:34,0	0/0:0,87,255:29,0	0/0:0,108,255:36,0	0/0:0,69,255:23,0	0/0:0,96,255:32,0	0/0:0,96,255:32,0	0/0:0,93,255:31,0	1/1:255,99,0:0,33	0/0:0,81,255:27,0	0/0:0,87,255:29,0	0/0:0,102,255:34,0	0/0:0,81,255:27,0	0/1:255,0,255:20,17	0/0:0,93,255:31,0	0/0:0,84,255:28,0	0/1:100,0,255:22,6	0/0:0,87,255:29,0	0/1:255,0,255:24,19	0/0:0,78,255:26,0	0/0:0,102,255:34,0	0/0:0,66,255:22,0
+
 At this point, you are now ready to move on to the next step.
 
+(Optional) Creating input files
+-------------------------------
+
 Optionally, in case you are interested in creating above input files on your
 own, I have also prepared "mini" BAM files for GRCh37 where the original
 sequencing data from GeT-RM have been sliced to contain genes used by PyPGx
@@ -82,13 +140,30 @@ FASTA when creating input VCF, which can be downloaded from `here
 <https://1drv.ms/u/s!Apgoq3uQ2gCqgt4qGq9YsumpVk9xJQ?e=ZewLHu>`__.
 
 Once you are finished downloading the mini BAM files and the reference FASTA
-file, let's first create input VCF:
+file, first create a text file (.txt, .tsv, .csv, or .list) containing one
+BAM file per line such that:
+
+.. code-block:: text
+
+    $ cat grch37-bam.list | head
+    /path/to/grch37-bam/NA18519_PyPGx.sorted.markdup.recal.bam
+    /path/to/grch37-bam/HG01190_PyPGx.sorted.markdup.recal.bam
+    /path/to/grch37-bam/NA12006_PyPGx.sorted.markdup.recal.bam
+    /path/to/grch37-bam/NA18484_PyPGx.sorted.markdup.recal.bam
+    /path/to/grch37-bam/NA07055_PyPGx.sorted.markdup.recal.bam
+    /path/to/grch37-bam/NA18980_PyPGx.sorted.markdup.recal.bam
+    /path/to/grch37-bam/NA19213_PyPGx.sorted.markdup.recal.bam
+    /path/to/grch37-bam/NA12813_PyPGx.sorted.markdup.recal.bam
+    /path/to/grch37-bam/NA19003_PyPGx.sorted.markdup.recal.bam
+    /path/to/grch37-bam/NA10831_PyPGx.sorted.markdup.recal.bam
+
+Now we can create input VCF:
 
 .. code-block:: text
 
     $ pypgx create-input-vcf \
     grch37-variants.vcf.gz \
-    /path/to/genome.fa \
+    /path/to/GRCh37/genome.fa \
     grch37-bam.list
 
 Note that this step can take some time to run. For example, it takes about 1

From 8e2e48862578e4aa7839c5b22cbaf5b65a0aab69 Mon Sep 17 00:00:00 2001
From: "Seung-been \"Steven\" Lee" <sbstevenlee@gmail.com>
Date: Mon, 11 Apr 2022 09:24:04 +0900
Subject: [PATCH 11/32] Update docs

---
 docs/tutorials.rst | 66 +++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 65 insertions(+), 1 deletion(-)

diff --git a/docs/tutorials.rst b/docs/tutorials.rst
index 79f0aea1..9ac2d75e 100644
--- a/docs/tutorials.rst
+++ b/docs/tutorials.rst
@@ -227,7 +227,71 @@ Above will create a number of archive files:
 In addition to these files, PyPGx will have also created two directories
 called ``copy-number-profile`` and ``allele-fraction-profile``.
 
-Now let's make sure the genotype results are correct by comparing them with the validation data:
+Let's take a look at the results:
+
+.. code-block:: text
+
+    $ pypgx print-data grch37-CYP2D6-pipeline/results.zip | head
+    	Genotype	Phenotype	Haplotype1	Haplotype2	AlternativePhase	VariantData	CNV
+    HG00589_PyPGx	*1/*21	Intermediate Metabolizer	*21;*2;	*1;	;	*21:22-42524213-C-CG:0.378;*1:22-42522613-G-C,22-42523943-A-G:0.645,0.625;*2:default;	Normal
+    NA07019_PyPGx	*1/*4	Intermediate Metabolizer	*1;	*4;*10;*74;*2;	;	*4:22-42524947-C-T:0.452;*10:22-42523943-A-G,22-42526694-G-A:1.0,0.448;*74:22-42525821-G-T:0.424;*1:22-42522613-G-C,22-42523943-A-G:0.361,1.0;*2:default;	Normal
+    NA10851_PyPGx	*1/*4	Intermediate Metabolizer	*1;	*4;*10;*74;*2;	;	*4:22-42524947-C-T:0.467;*10:22-42523943-A-G,22-42526694-G-A:0.95,0.421;*74:22-42525821-G-T:0.447;*1:22-42522613-G-C,22-42523943-A-G:0.486,0.95;*2:default;	Normal
+    NA18484_PyPGx	*1/*17	Normal Metabolizer	*1;	*17;*2;	;	*17:22-42525772-G-A:0.6;*1:22-42522613-G-C,22-42523943-A-G:0.625,0.391;*2:default;	Normal
+    NA12006_PyPGx	*4/*41	Intermediate Metabolizer	*41;*2;	*4;*10;*2;	*69;	*69:22-42526694-G-A,22-42523805-C-T:0.473,0.528;*4:22-42524947-C-T:0.448;*10:22-42523943-A-G,22-42526694-G-A:0.545,0.473;*41:22-42523805-C-T:0.528;*2:default;	Normal
+    HG00436_PyPGx	*2x2/*71	Indeterminate	*71;*1;	*2;	;	*71:22-42526669-C-T:0.433;*1:22-42522613-G-C,22-42523943-A-G:0.462,0.353;*2:default;	Duplication
+    NA19213_PyPGx	*1/*1	Normal Metabolizer	*1;	*1;	;	*1:22-42522613-G-C,22-42523943-A-G:1.0,1.0;	Normal
+    NA19207_PyPGx	*2x2/*10	Normal Metabolizer	*10;*2;	*2;	;	*10:22-42523943-A-G,22-42526694-G-A:0.366,0.25;*2:default;	Duplication
+    NA07029_PyPGx	*1/*35	Normal Metabolizer	*35;*2;	*1;	;	*1:22-42522613-G-C,22-42523943-A-G:0.596,0.476;*35:22-42526763-C-T:0.405;*2:default;	Normal
+
+You can read :ref:`readme:Results interpretation` for details on how to
+interpret the PyPGx results.
+
+Next, we can manually inspect SV calls by visualizing copy number and allele
+fraction for the CYP2D6 locus (read :ref:`readme:Structural variation
+detection` for details). For example, the above results indicate that the
+samples ``HG00589_PyPGx`` and ``HG00436_PyPGx`` have ``Normal`` and
+``Duplication`` as CNV calls, respectively:
+
+.. list-table::
+   :header-rows: 1
+   :widths: 10 45 45
+
+   * - Sample
+     - Copy Number
+     - Allele Fraction
+   * - HG00589_PyPGx
+     - .. image:: https://raw.githubusercontent.com/sbslee/pypgx-data/main/getrm-wgs-tutorial/HG00589-copy-number.png
+     - .. image:: https://raw.githubusercontent.com/sbslee/pypgx-data/main/getrm-wgs-tutorial/HG00589-allele-fraction.png
+   * - HG00436_PyPGx
+     - .. image:: https://raw.githubusercontent.com/sbslee/pypgx-data/main/getrm-wgs-tutorial/HG00436-copy-number.png
+     - .. image:: https://raw.githubusercontent.com/sbslee/pypgx-data/main/getrm-wgs-tutorial/HG00436-allele-fraction.png
+
+If you want to prepare publication-quality figures, it's strongly recommended
+to combine copy number and allele fraction profiles together:
+
+.. code-block:: text
+
+    $ pypgx plot-cn-af \
+    grch37-CYP2D6-pipeline/copy-number.zip \
+    grch37-CYP2D6-pipeline/imported-variants.zip \
+    --samples HG00589_PyPGx HG00436_PyPGx
+
+.. list-table::
+   :header-rows: 1
+   :widths: 10 90
+
+   * - Sample
+     - Profile
+   * - HG00589_PyPGx
+     - .. image:: https://raw.githubusercontent.com/sbslee/pypgx-data/main/getrm-wgs-tutorial/HG00589-combined.png
+   * - HG00436_PyPGx
+     - .. image:: https://raw.githubusercontent.com/sbslee/pypgx-data/main/getrm-wgs-tutorial/HG00436-combined.png
+
+Note that the above command also adds a fitted line on top of each copy
+number profile to display what the SV classifier actually "sees".
+
+Now let's make sure the genotype results are correct by comparing them with
+the validation data:
 
 .. code-block:: text
 

From 5dab4161442e826c418e4183f2ae012cf4c9e161 Mon Sep 17 00:00:00 2001
From: "Seung-been \"Steven\" Lee" <sbstevenlee@gmail.com>
Date: Tue, 12 Apr 2022 14:46:16 +0900
Subject: [PATCH 12/32] Update docs

---
 docs/tutorials.rst | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/docs/tutorials.rst b/docs/tutorials.rst
index 9ac2d75e..1c656f92 100644
--- a/docs/tutorials.rst
+++ b/docs/tutorials.rst
@@ -134,10 +134,11 @@ At this point, you are now ready to move on to the next step.
 Optionally, in case you are interested in creating above input files on your
 own, I have also prepared "mini" BAM files for GRCh37 where the original
 sequencing data from GeT-RM have been sliced to contain genes used by PyPGx
-only. You can download them `here <https://1drv.ms/u/
-s!Apgoq3uQ2gCqgrovIFKJSi-ECXY9pw?e=uP5EeU>`__. You will also need reference
-FASTA when creating input VCF, which can be downloaded from `here
-<https://1drv.ms/u/s!Apgoq3uQ2gCqgt4qGq9YsumpVk9xJQ?e=ZewLHu>`__.
+only. You can download them from the shared OneDrive folder `sbslee-bucket
+<https://1drv.ms/u/s!Apgoq3uQ2gCqgt4s_Ucn8a_gk7kmOg?e=5BfM4K>`__. There, you
+can navigate to ``sbslee-bucket`` > ``pypgx`` > ``getrm-wgs-tutorial`` >
+``grch37-bam``. You will also need reference FASTA when creating input VCF, 
+which can be downloaded from ``sbslee-bucket`` > ``ref`` > ``grch37``.
 
 Once you are finished downloading the mini BAM files and the reference FASTA
 file, first create a text file (.txt, .tsv, .csv, or .list) containing one

From 805c92a81bb0db5bda4762eb1be743e1f163118a Mon Sep 17 00:00:00 2001
From: "Seung-been \"Steven\" Lee" <sbstevenlee@gmail.com>
Date: Wed, 20 Apr 2022 17:20:19 +0900
Subject: [PATCH 13/32] Update docs

---
 docs/tutorials.rst | 50 +++++++++++++++++++++++-----------------------
 1 file changed, 25 insertions(+), 25 deletions(-)

diff --git a/docs/tutorials.rst b/docs/tutorials.rst
index 1c656f92..fe296e23 100644
--- a/docs/tutorials.rst
+++ b/docs/tutorials.rst
@@ -126,7 +126,8 @@ se, but we can still peek at its data:
     chr1	47261869	.	C	T	235.707	PASS	DP=1863;VDB=0.677143;SGB=5.02317;RPBZ=-2.55997;MQBZ=-8.87433;MQSBZ=3.1481;BQBZ=26.6865;SCBZ=0.647961;FS=0;MQ0F=0;AC=88;AN=140;DP4=522,174,834,311;MQ=56	GT:PL:AD	0/0:0,84,255:28,0	0/1:255,0,194:12,20	1/1:255,69,0:0,23	0/0:0,93,255:31,0	0/0:0,69,255:23,0	0/1:216,0,255:17,11	0/1:218,0,238:14,14	1/1:255,90,0:0,30	0/1:190,0,148:10,9	1/1:255,60,0:0,20	1/1:255,81,0:0,27	0/1:255,0,192:13,13	1/1:255,81,0:0,27	1/1:255,81,0:1,31	0/0:0,65,255:26,1	1/1:255,78,0:0,26	0/0:0,51,255:17,0	1/1:255,63,0:0,21	0/1:240,0,244:17,12	0/1:255,0,255:17,171/1:255,69,0:0,23	0/1:186,0,239:16,11	0/1:247,0,255:15,12	0/1:231,0,221:17,11	1/1:255,69,0:0,23	0/0:0,87,255:29,0	0/1:196,0,198:11,11	0/1:255,0,213:16,20	0/1:232,0,238:16,13	0/1:255,0,175:13,150/1:223,0,245:22,11	0/1:255,0,255:15,16	1/1:255,81,0:0,27	1/1:255,99,0:0,33	0/1:255,0,209:16,161/1:255,87,0:0,29	1/1:255,75,0:0,25	1/1:255,75,0:0,25	0/0:0,66,255:22,0	0/0:0,63,255:21,0	1/1:255,78,0:0,26	1/1:218,54,0:0,18	1/1:255,78,0:0,26	0/1:173,0,255:23,12	0/0:0,72,255:24,0	1/1:255,75,0:0,25	0/1:213,0,168:11,13	0/1:247,0,188:11,12	0/1:195,0,124:6,9	0/1:173,0,205:16,121/1:255,66,0:0,22	1/1:255,72,0:0,24	1/1:255,54,0:0,18	1/1:255,93,0:0,31	1/1:255,84,0:0,28	1/1:255,66,0:0,22	0/0:0,48,255:21,1	0/1:190,0,255:13,8	0/1:255,0,173:9,13	0/1:255,0,214:16,180/1:202,0,179:12,11	0/1:255,0,218:16,17	1/1:255,84,0:0,28	1/1:255,81,0:0,27	0/1:255,0,111:7,18	1/1:255,69,0:0,23	0/1:255,0,213:13,19	0/0:0,66,255:22,0	0/1:253,0,247:21,13	1/1:255,75,0:0,25
     chr1	47261936	.	C	T	232.857	PASS	DP=2179;VDB=0.991573;SGB=71.95;RPBZ=0.621331;MQBZ=0.919674;MQSBZ=-0.0215108;BQBZ=10.1541;SCBZ=0.212854;FS=0;MQ0F=0;AC=17;AN=140;DP4=1145,745,173,83;MQ=59	GT:PL:AD	0/0:0,87,255:29,0	0/0:0,117,255:39,0	0/0:0,72,255:24,0	0/0:0,105,255:35,0	0/1:205,0,189:10,160/1:255,0,230:10,15	0/0:0,96,255:32,0	0/0:0,96,255:32,0	0/1:225,0,222:13,12	0/0:0,69,255:23,0	0/0:0,105,255:35,0	0/0:0,78,255:26,0	0/0:0,114,255:38,0	0/0:0,123,255:41,0	0/1:210,0,255:18,100/0:0,105,255:35,0	0/0:0,78,255:26,0	0/0:0,90,255:30,0	0/0:0,96,255:32,0	0/0:0,108,255:36,0	0/0:0,84,255:28,0	0/0:0,75,255:25,0	0/1:255,0,255:15,13	0/0:0,93,255:31,0	0/0:0,84,255:28,0	0/0:0,87,255:29,0	0/0:0,81,255:27,0	0/0:0,111,255:37,0	0/1:255,0,183:10,16	0/1:255,0,251:15,170/0:0,108,255:36,0	0/0:0,99,255:33,0	0/0:0,102,255:34,0	0/0:0,99,255:33,0	0/0:0,105,255:35,0	0/0:0,117,255:39,0	0/0:0,78,255:26,0	0/0:0,102,255:34,0	1/1:255,75,0:0,25	1/1:255,99,0:0,33	0/0:0,78,255:26,0	0/0:0,66,255:22,0	0/0:0,96,255:32,0	0/0:0,87,255:29,0	0/0:0,81,255:27,0	0/0:0,93,255:31,0	0/1:224,0,252:15,13	0/0:0,96,255:32,0	0/0:0,81,255:27,0	0/0:0,102,255:34,0	0/0:0,87,255:29,0	0/0:0,108,255:36,0	0/0:0,69,255:23,0	0/0:0,96,255:32,0	0/0:0,96,255:32,0	0/0:0,93,255:31,0	1/1:255,99,0:0,33	0/0:0,81,255:27,0	0/0:0,87,255:29,0	0/0:0,102,255:34,0	0/0:0,81,255:27,0	0/1:255,0,255:20,17	0/0:0,93,255:31,0	0/0:0,84,255:28,0	0/1:100,0,255:22,6	0/0:0,87,255:29,0	0/1:255,0,255:24,19	0/0:0,78,255:26,0	0/0:0,102,255:34,0	0/0:0,66,255:22,0
 
-At this point, you are now ready to move on to the next step.
+At this point, you are now ready to move on to the next step:
+:ref:`tutorials:Genotyping genes with SV`.
 
 (Optional) Creating input files
 -------------------------------
@@ -134,38 +135,37 @@ At this point, you are now ready to move on to the next step.
 Optionally, in case you are interested in creating above input files on your
 own, I have also prepared "mini" BAM files for GRCh37 where the original
 sequencing data from GeT-RM have been sliced to contain genes used by PyPGx
-only. You can download them from the shared OneDrive folder `sbslee-bucket
-<https://1drv.ms/u/s!Apgoq3uQ2gCqgt4s_Ucn8a_gk7kmOg?e=5BfM4K>`__. There, you
-can navigate to ``sbslee-bucket`` > ``pypgx`` > ``getrm-wgs-tutorial`` >
-``grch37-bam``. You will also need reference FASTA when creating input VCF, 
-which can be downloaded from ``sbslee-bucket`` > ``ref`` > ``grch37``.
+only:
 
-Once you are finished downloading the mini BAM files and the reference FASTA
-file, first create a text file (.txt, .tsv, .csv, or .list) containing one
-BAM file per line such that:
+.. code-block:: text
+
+    $ mkdir grch37-bam
+    $ wget https://storage.googleapis.com/sbslee-bucket/pypgx/getrm-wgs-tutorial/grch37-bam.list
+    $ head -n 6 grch37-bam.list
+    https://storage.googleapis.com/sbslee-bucket/pypgx/getrm-wgs-tutorial/grch37-bam/HG00276_PyPGx.sorted.markdup.recal.bai
+    https://storage.googleapis.com/sbslee-bucket/pypgx/getrm-wgs-tutorial/grch37-bam/HG00276_PyPGx.sorted.markdup.recal.bam
+    https://storage.googleapis.com/sbslee-bucket/pypgx/getrm-wgs-tutorial/grch37-bam/HG00436_PyPGx.sorted.markdup.recal.bai
+    https://storage.googleapis.com/sbslee-bucket/pypgx/getrm-wgs-tutorial/grch37-bam/HG00436_PyPGx.sorted.markdup.recal.bam
+    https://storage.googleapis.com/sbslee-bucket/pypgx/getrm-wgs-tutorial/grch37-bam/HG00589_PyPGx.sorted.markdup.recal.bai
+    https://storage.googleapis.com/sbslee-bucket/pypgx/getrm-wgs-tutorial/grch37-bam/HG00589_PyPGx.sorted.markdup.recal.bam
+    $ wget -i grch37-bam.list -P grch37-bam
+
+You will also need the reference FASTA when creating the input VCF:
 
 .. code-block:: text
 
-    $ cat grch37-bam.list | head
-    /path/to/grch37-bam/NA18519_PyPGx.sorted.markdup.recal.bam
-    /path/to/grch37-bam/HG01190_PyPGx.sorted.markdup.recal.bam
-    /path/to/grch37-bam/NA12006_PyPGx.sorted.markdup.recal.bam
-    /path/to/grch37-bam/NA18484_PyPGx.sorted.markdup.recal.bam
-    /path/to/grch37-bam/NA07055_PyPGx.sorted.markdup.recal.bam
-    /path/to/grch37-bam/NA18980_PyPGx.sorted.markdup.recal.bam
-    /path/to/grch37-bam/NA19213_PyPGx.sorted.markdup.recal.bam
-    /path/to/grch37-bam/NA12813_PyPGx.sorted.markdup.recal.bam
-    /path/to/grch37-bam/NA19003_PyPGx.sorted.markdup.recal.bam
-    /path/to/grch37-bam/NA10831_PyPGx.sorted.markdup.recal.bam
+    $ wget https://storage.googleapis.com/sbslee-bucket/ref/grch37/genome.fa
+    $ wget https://storage.googleapis.com/sbslee-bucket/ref/grch37/genome.fa.fai
 
-Now we can create input VCF:
+Once you are finished downloading the mini BAM files and the reference FASTA
+file, we can create input VCF:
 
 .. code-block:: text
 
     $ pypgx create-input-vcf \
     grch37-variants.vcf.gz \
-    /path/to/GRCh37/genome.fa \
-    grch37-bam.list
+    genome.fa \
+    grch37-bam/*.bam
 
 Note that this step can take some time to run. For example, it takes about 1
 hour to finish using my personal MacBook Air (M1, 2020) with 8 GB of memory.
@@ -176,7 +176,7 @@ Next, we will compute depth of coverage for genes that are known to have SV:
 
     $ pypgx prepare-depth-of-coverage \
     grch37-depth-of-coverage.zip \
-    grch37-bam.list
+    grch37-bam/*.bam
 
 This step should be quick. It finishes in less than 30 seconds with my laptop.
 
@@ -188,7 +188,7 @@ locus, which is required when converting read depth to copy number:
     $ pypgx compute-control-statistics \
     VDR \
     grch37-control-statistics-VDR.zip \
-    grch37-bam.list
+    grch37-bam/*.bam
 
 This step should be quick as well. It finishes in less than 5 seconds with my
 laptop.

From 21cf90aa6624d98bc76de86877ded6cda598483e Mon Sep 17 00:00:00 2001
From: "Seung-been \"Steven\" Lee" <sbstevenlee@gmail.com>
Date: Mon, 25 Apr 2022 17:08:01 +0900
Subject: [PATCH 14/32] Update CNV data for CYP2A6

---
 CHANGELOG.rst                |  2 ++
 docs/genes.rst               | 24 +++++++++++++++++++++++-
 pypgx/api/data/cnv-table.csv |  2 ++
 3 files changed, 27 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index 0c6cc7cc..0721e15d 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -7,6 +7,8 @@ Changelog
 * Add new optional arguments ``--genes`` and ``--exclude`` to :command:`prepare-depth-of-coverage` command.
 * Add new command :command:`slice-bam`.
 * Add new command :command:`print-data`.
+* Improve CNV caller for CYP2A6.
+* Add new CNV calls for CYP2A6: ``Deletion2Hom`` and ``Hybrid5``.
 
 0.14.0 (2022-04-03)
 -------------------
diff --git a/docs/genes.rst b/docs/genes.rst
index 741871ea..e3946c2f 100644
--- a/docs/genes.rst
+++ b/docs/genes.rst
@@ -533,7 +533,7 @@ Below is a summary table:
      - `chr4:68640596-68676652 <https://genome.ucsc.edu/cgi-bin/hgTracks?db=hg38&lastVirtModeType=default&lastVirtModeExtraState=&virtModeType=default&virtMode=0&nonVirtPosition=&position=chr4%3A68640596%2D68676652&hgsid=1251392659_FCwuNEZja7PPePnsIvfT1wF8Ke9Y>`__
      -
    * - :ref:`genes:UGT2B17`
-     - 
+     -
      - ✅
      -
      -
@@ -725,6 +725,17 @@ Below is comprehensive summary of SV described from real NGS studies:
      -
      -
      -
+   * - \*4
+     - Deletion2Hom
+     - \*4/\*4
+     -
+     - :download:`Model <https://raw.githubusercontent.com/sbslee/pypgx-data/main/dpsv/gene-model-CYP2A6-3.png>`
+     - :download:`Profile <https://raw.githubusercontent.com/sbslee/pypgx-data/main/dpsv/GRCh37-CYP2A6-16.png>`
+     - :download:`Profile <https://raw.githubusercontent.com/sbslee/pypgx-data/main/dpsv/GRCh38-CYP2A6-16.png>`
+     - WGS
+     - `1KGP <https://www.biorxiv.org/content/10.1101/2021.02.06.430068v2>`__
+     - NA21093
+     -
    * - \*4
      - Deletion3Het
      - \*4/\*9
@@ -824,6 +835,17 @@ Below is comprehensive summary of SV described from real NGS studies:
      - `1KGP <https://www.biorxiv.org/content/10.1101/2021.02.06.430068v2>`__
      - NA20515
      -
+   * -
+     - Hybrid5
+     - Indeterminate
+     -
+     - :download:`Model <https://raw.githubusercontent.com/sbslee/pypgx-data/main/dpsv/gene-model-CYP2A6-11.png>`
+     - :download:`Profile <https://raw.githubusercontent.com/sbslee/pypgx-data/main/dpsv/GRCh37-CYP2A6-17.png>`
+     - :download:`Profile <https://raw.githubusercontent.com/sbslee/pypgx-data/main/dpsv/GRCh38-CYP2A6-17.png>`
+     - WGS
+     - `1KGP <https://www.biorxiv.org/content/10.1101/2021.02.06.430068v2>`__
+     - HG00155
+     -
    * -
      - PseudogeneDuplication
      - \*1/\*18
diff --git a/pypgx/api/data/cnv-table.csv b/pypgx/api/data/cnv-table.csv
index be638710..bba5785b 100644
--- a/pypgx/api/data/cnv-table.csv
+++ b/pypgx/api/data/cnv-table.csv
@@ -14,6 +14,8 @@ CYP2A6,Duplication1
 CYP2A6,Duplication2
 CYP2A6,Duplication3
 CYP2A6,Tandem
+CYP2A6,Deletion2Hom
+CYP2A6,Hybrid5
 CYP2B6,Normal
 CYP2B6,Hybrid
 CYP2B6,Duplication

From 9c1fc08cd2a67f8400f90d0094851f0d3f164aae Mon Sep 17 00:00:00 2001
From: "Seung-been \"Steven\" Lee" <sbstevenlee@gmail.com>
Date: Tue, 26 Apr 2022 16:16:26 +0900
Subject: [PATCH 15/32] Update CNV data for CYP2A6:

* Add new CNV calls for CYP2A6: ``Deletion2Hom``, ``Hybrid5``,
``Hybrid6``.
---
 CHANGELOG.rst                |  2 +-
 docs/genes.rst               | 11 +++++++++++
 pypgx/api/data/cnv-table.csv |  1 +
 3 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index 0721e15d..ba7dcc6e 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -8,7 +8,7 @@ Changelog
 * Add new command :command:`slice-bam`.
 * Add new command :command:`print-data`.
 * Improve CNV caller for CYP2A6.
-* Add new CNV calls for CYP2A6: ``Deletion2Hom`` and ``Hybrid5``.
+* Add new CNV calls for CYP2A6: ``Deletion2Hom``, ``Hybrid5``, ``Hybrid6``.
 
 0.14.0 (2022-04-03)
 -------------------
diff --git a/docs/genes.rst b/docs/genes.rst
index e3946c2f..7371ac89 100644
--- a/docs/genes.rst
+++ b/docs/genes.rst
@@ -846,6 +846,17 @@ Below is comprehensive summary of SV described from real NGS studies:
      - `1KGP <https://www.biorxiv.org/content/10.1101/2021.02.06.430068v2>`__
      - HG00155
      -
+   * -
+     - Hybrid6
+     - Indeterminate
+     -
+     - :download:`Model <https://raw.githubusercontent.com/sbslee/pypgx-data/main/dpsv/gene-model-CYP2A6-12.png>`
+     - :download:`Profile <https://raw.githubusercontent.com/sbslee/pypgx-data/main/dpsv/GRCh37-CYP2A6-18.png>`
+     - :download:`Profile <https://raw.githubusercontent.com/sbslee/pypgx-data/main/dpsv/GRCh38-CYP2A6-18.png>`
+     - WGS
+     - `1KGP <https://www.biorxiv.org/content/10.1101/2021.02.06.430068v2>`__
+     - HG00141
+     -
    * -
      - PseudogeneDuplication
      - \*1/\*18
diff --git a/pypgx/api/data/cnv-table.csv b/pypgx/api/data/cnv-table.csv
index bba5785b..eba47c8e 100644
--- a/pypgx/api/data/cnv-table.csv
+++ b/pypgx/api/data/cnv-table.csv
@@ -16,6 +16,7 @@ CYP2A6,Duplication3
 CYP2A6,Tandem
 CYP2A6,Deletion2Hom
 CYP2A6,Hybrid5
+CYP2A6,Hybrid6
 CYP2B6,Normal
 CYP2B6,Hybrid
 CYP2B6,Duplication

From 8915da1c50b2b371556579eab47e0dd386ef1bfe Mon Sep 17 00:00:00 2001
From: "Seung-been \"Steven\" Lee" <sbstevenlee@gmail.com>
Date: Wed, 27 Apr 2022 16:34:34 +0900
Subject: [PATCH 16/32] Update CNV data for CYP2E1

---
 CHANGELOG.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index ba7dcc6e..cf898813 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -7,7 +7,7 @@ Changelog
 * Add new optional arguments ``--genes`` and ``--exclude`` to :command:`prepare-depth-of-coverage` command.
 * Add new command :command:`slice-bam`.
 * Add new command :command:`print-data`.
-* Improve CNV caller for CYP2A6.
+* Improve CNV caller for CYP2A6 and CYP2E1.
 * Add new CNV calls for CYP2A6: ``Deletion2Hom``, ``Hybrid5``, ``Hybrid6``.
 
 0.14.0 (2022-04-03)

From cbdbf72a7f38b7893e0e97c10d608c6e968a59c5 Mon Sep 17 00:00:00 2001
From: "Seung-been \"Steven\" Lee" <sbstevenlee@gmail.com>
Date: Wed, 27 Apr 2022 16:34:46 +0900
Subject: [PATCH 17/32] Fix typo in docs

---
 docs/genes.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/genes.rst b/docs/genes.rst
index 7371ac89..92f4aeba 100644
--- a/docs/genes.rst
+++ b/docs/genes.rst
@@ -823,7 +823,7 @@ Below is comprehensive summary of SV described from real NGS studies:
      - WGS
      - `1KGP <https://www.biorxiv.org/content/10.1101/2021.02.06.430068v2>`__
      - NA18516
-     - \*34 has axons 1-4 of CYP2A7 origin and axons 5-9 of CYP2A6 origin (breakpoint in intron 4).
+     - \*34 has exons 1-4 of CYP2A7 origin and exons 5-9 of CYP2A6 origin (breakpoint in intron 4).
    * -
      - Hybrid4
      - Indeterminate

From 7c5577913581688f532568b0e9c8be50b68093f1 Mon Sep 17 00:00:00 2001
From: "Seung-been \"Steven\" Lee" <sbstevenlee@gmail.com>
Date: Wed, 27 Apr 2022 16:37:49 +0900
Subject: [PATCH 18/32] Update docs

---
 docs/genes.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/genes.rst b/docs/genes.rst
index 92f4aeba..b156a188 100644
--- a/docs/genes.rst
+++ b/docs/genes.rst
@@ -839,7 +839,7 @@ Below is comprehensive summary of SV described from real NGS studies:
      - Hybrid5
      - Indeterminate
      -
-     - :download:`Model <https://raw.githubusercontent.com/sbslee/pypgx-data/main/dpsv/gene-model-CYP2A6-11.png>`
+     - :download:`Model <https://raw.githubusercontent.com/sbslee/pypgx-data/main/dpsv/gene-model-CYP2A6-13.png>`
      - :download:`Profile <https://raw.githubusercontent.com/sbslee/pypgx-data/main/dpsv/GRCh37-CYP2A6-17.png>`
      - :download:`Profile <https://raw.githubusercontent.com/sbslee/pypgx-data/main/dpsv/GRCh38-CYP2A6-17.png>`
      - WGS

From c1bca76e57026d8e01a6bf9656d641846baea75b Mon Sep 17 00:00:00 2001
From: Seung-been Lee <sbstevenlee@gmail.com>
Date: Wed, 27 Apr 2022 21:55:27 +0900
Subject: [PATCH 19/32] Update CNV data for SULT1A1

---
 CHANGELOG.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index cf898813..7f18401d 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -7,7 +7,7 @@ Changelog
 * Add new optional arguments ``--genes`` and ``--exclude`` to :command:`prepare-depth-of-coverage` command.
 * Add new command :command:`slice-bam`.
 * Add new command :command:`print-data`.
-* Improve CNV caller for CYP2A6 and CYP2E1.
+* Improve CNV caller for CYP2A6, CYP2E1, SULT1A1.
 * Add new CNV calls for CYP2A6: ``Deletion2Hom``, ``Hybrid5``, ``Hybrid6``.
 
 0.14.0 (2022-04-03)

From 1904ad98dd5dd2105695fe0ffb344859635afb39 Mon Sep 17 00:00:00 2001
From: "Seung-been \"Steven\" Lee" <sbstevenlee@gmail.com>
Date: Thu, 28 Apr 2022 10:02:14 +0900
Subject: [PATCH 20/32] Update CNV data for GSTM1

---
 CHANGELOG.rst                |  3 ++-
 docs/genes.rst               | 11 +++++++++++
 pypgx/api/data/cnv-table.csv |  1 +
 pypgx/api/genotype.py        |  2 +-
 4 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index 7f18401d..f4290b4e 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -7,8 +7,9 @@ Changelog
 * Add new optional arguments ``--genes`` and ``--exclude`` to :command:`prepare-depth-of-coverage` command.
 * Add new command :command:`slice-bam`.
 * Add new command :command:`print-data`.
-* Improve CNV caller for CYP2A6, CYP2E1, SULT1A1.
+* Improve CNV caller for CYP2A6, CYP2E1, GSTM1, SULT1A1.
 * Add new CNV calls for CYP2A6: ``Deletion2Hom``, ``Hybrid5``, ``Hybrid6``.
+* Add new CNV call for GSTM1: ``Normal,Deletion2``.
 
 0.14.0 (2022-04-03)
 -------------------
diff --git a/docs/genes.rst b/docs/genes.rst
index b156a188..086a9da8 100644
--- a/docs/genes.rst
+++ b/docs/genes.rst
@@ -1813,6 +1813,17 @@ Below is comprehensive summary of SV described from real NGS studies:
     - `GeT-RM <https://pubmed.ncbi.nlm.nih.gov/26621101/>`__
     - NA18855
     -
+  * - \*0
+    - Normal,Deletion2
+    - \*0/\*A
+    -
+    - :download:`Model <https://raw.githubusercontent.com/sbslee/pypgx-data/main/dpsv/gene-model-GSTM1-2.png>`
+    - :download:`Profile <https://raw.githubusercontent.com/sbslee/pypgx-data/main/dpsv/GRCh37-GSTM1-10.png>`
+    - :download:`Profile <https://raw.githubusercontent.com/sbslee/pypgx-data/main/dpsv/GRCh38-GSTM1-10.png>`
+    - WGS
+    - `GeT-RM <https://pubmed.ncbi.nlm.nih.gov/26621101/>`__
+    - NA21097
+    -
   * - \*0
     - DeletionHom
     - \*0/\*0
diff --git a/pypgx/api/data/cnv-table.csv b/pypgx/api/data/cnv-table.csv
index eba47c8e..a2e95aee 100644
--- a/pypgx/api/data/cnv-table.csv
+++ b/pypgx/api/data/cnv-table.csv
@@ -57,6 +57,7 @@ GSTM1,UpstreamDeletionHet
 GSTM1,"DeletionHet,UpstreamDeletionHet"
 GSTM1,PartialDuplication
 GSTM1,"DeletionHet,Deletion2"
+GSTM1,"Normal,Deletion2"
 GSTT1,Normal
 GSTT1,DeletionHet
 GSTT1,DeletionHom
diff --git a/pypgx/api/genotype.py b/pypgx/api/genotype.py
index 2e5a2f4c..33ea825b 100644
--- a/pypgx/api/genotype.py
+++ b/pypgx/api/genotype.py
@@ -374,7 +374,7 @@ def one_row(self, r):
         s1, s2 = core.sort_alleles([a1, a2], by='priority', gene=self.gene, assembly=self.assembly)
         if r.CNV in ['Normal', 'AssumeNormal', 'UpstreamDeletionHet']:
             result = [a1, a2]
-        elif r.CNV in ['DeletionHet', 'DeletionHet,UpstreamDeletionHet']:
+        elif r.CNV in ['DeletionHet', 'DeletionHet,UpstreamDeletionHet', 'Normal,Deletion2']:
             result = [s1, '*0']
         elif r.CNV in ['DeletionHom', 'DeletionHet,Deletion2']:
             result = ['*0', '*0']

From f5fa3fe88f59a09ff54a48054175dc3811aaa152 Mon Sep 17 00:00:00 2001
From: "Seung-been \"Steven\" Lee" <sbstevenlee@gmail.com>
Date: Thu, 28 Apr 2022 11:00:22 +0900
Subject: [PATCH 21/32] Update CNV data for UGT1A4

---
 CHANGELOG.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index f4290b4e..70dffab3 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -7,7 +7,7 @@ Changelog
 * Add new optional arguments ``--genes`` and ``--exclude`` to :command:`prepare-depth-of-coverage` command.
 * Add new command :command:`slice-bam`.
 * Add new command :command:`print-data`.
-* Improve CNV caller for CYP2A6, CYP2E1, GSTM1, SULT1A1.
+* Improve CNV caller for CYP2A6, CYP2E1, GSTM1, SULT1A1, UGT1A4.
 * Add new CNV calls for CYP2A6: ``Deletion2Hom``, ``Hybrid5``, ``Hybrid6``.
 * Add new CNV call for GSTM1: ``Normal,Deletion2``.
 

From a74da8089b20fb9e5ee5a0de8b4b8592d6f6ec90 Mon Sep 17 00:00:00 2001
From: "Seung-been \"Steven\" Lee" <sbstevenlee@gmail.com>
Date: Thu, 28 Apr 2022 14:24:59 +0900
Subject: [PATCH 22/32] Update CNV data for UGT2B15

---
 CHANGELOG.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index 70dffab3..3990dcaa 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -7,7 +7,7 @@ Changelog
 * Add new optional arguments ``--genes`` and ``--exclude`` to :command:`prepare-depth-of-coverage` command.
 * Add new command :command:`slice-bam`.
 * Add new command :command:`print-data`.
-* Improve CNV caller for CYP2A6, CYP2E1, GSTM1, SULT1A1, UGT1A4.
+* Improve CNV caller for CYP2A6, CYP2E1, GSTM1, SULT1A1, UGT1A4, UGT2B15.
 * Add new CNV calls for CYP2A6: ``Deletion2Hom``, ``Hybrid5``, ``Hybrid6``.
 * Add new CNV call for GSTM1: ``Normal,Deletion2``.
 

From fb7a74dd0cf1cb96b8143f42f6721fc6c68388f8 Mon Sep 17 00:00:00 2001
From: "Seung-been \"Steven\" Lee" <sbstevenlee@gmail.com>
Date: Thu, 28 Apr 2022 15:35:44 +0900
Subject: [PATCH 23/32] Update CNV data for UGT2B17

---
 CHANGELOG.rst                |  3 ++-
 docs/genes.rst               | 11 +++++++++++
 pypgx/api/data/cnv-table.csv |  1 +
 3 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index 3990dcaa..96f66d8e 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -7,9 +7,10 @@ Changelog
 * Add new optional arguments ``--genes`` and ``--exclude`` to :command:`prepare-depth-of-coverage` command.
 * Add new command :command:`slice-bam`.
 * Add new command :command:`print-data`.
-* Improve CNV caller for CYP2A6, CYP2E1, GSTM1, SULT1A1, UGT1A4, UGT2B15.
+* Improve CNV caller for CYP2A6, CYP2E1, GSTM1, SULT1A1, UGT1A4, UGT2B15, UGT2B17.
 * Add new CNV calls for CYP2A6: ``Deletion2Hom``, ``Hybrid5``, ``Hybrid6``.
 * Add new CNV call for GSTM1: ``Normal,Deletion2``.
+* Add new CNV call for UGT2B17: ``Deletion,PartialDeletion3``.
 
 0.14.0 (2022-04-03)
 -------------------
diff --git a/docs/genes.rst b/docs/genes.rst
index 086a9da8..c5b358fa 100644
--- a/docs/genes.rst
+++ b/docs/genes.rst
@@ -2598,6 +2598,17 @@ Below is comprehensive summary of SV described from real NGS studies:
     - `1KGP <https://www.biorxiv.org/content/10.1101/2021.02.06.430068v2>`__
     - NA19189
     -
+  * - \*2
+    - Deletion,PartialDeletion3
+    - Indeterminate
+    -
+    - :download:`Model <https://raw.githubusercontent.com/sbslee/pypgx-data/main/dpsv/gene-model-UGT2B17-6.png>`
+    - :download:`Profile <https://raw.githubusercontent.com/sbslee/pypgx-data/main/dpsv/GRCh37-UGT2B17-7.png>`
+    - :download:`Profile <https://raw.githubusercontent.com/sbslee/pypgx-data/main/dpsv/GRCh38-UGT2B17-7.png>`
+    - WGS
+    - `1KGP <https://www.biorxiv.org/content/10.1101/2021.02.06.430068v2>`__
+    - NA21090
+    -
   * -
     - Normal,PartialDeletion3
     - Indeterminate
diff --git a/pypgx/api/data/cnv-table.csv b/pypgx/api/data/cnv-table.csv
index a2e95aee..8108adbb 100644
--- a/pypgx/api/data/cnv-table.csv
+++ b/pypgx/api/data/cnv-table.csv
@@ -90,3 +90,4 @@ UGT2B17,"Deletion,Deletion"
 UGT2B17,"Deletion,PartialDeletion1"
 UGT2B17,"Deletion,PartialDeletion2"
 UGT2B17,"Normal,PartialDeletion3"
+UGT2B17,"Deletion,PartialDeletion3"

From c29c893fab27bd34c2a09150c7d70c8b57192e83 Mon Sep 17 00:00:00 2001
From: "Seung-been \"Steven\" Lee" <sbstevenlee@gmail.com>
Date: Thu, 28 Apr 2022 17:05:21 +0900
Subject: [PATCH 24/32] Update CNV data for CYP2D6

---
 CHANGELOG.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index 96f66d8e..0a670503 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -7,7 +7,7 @@ Changelog
 * Add new optional arguments ``--genes`` and ``--exclude`` to :command:`prepare-depth-of-coverage` command.
 * Add new command :command:`slice-bam`.
 * Add new command :command:`print-data`.
-* Improve CNV caller for CYP2A6, CYP2E1, GSTM1, SULT1A1, UGT1A4, UGT2B15, UGT2B17.
+* Improve CNV caller for CYP2A6, CYP2D6, CYP2E1, GSTM1, SULT1A1, UGT1A4, UGT2B15, UGT2B17.
 * Add new CNV calls for CYP2A6: ``Deletion2Hom``, ``Hybrid5``, ``Hybrid6``.
 * Add new CNV call for GSTM1: ``Normal,Deletion2``.
 * Add new CNV call for UGT2B17: ``Deletion,PartialDeletion3``.

From c1a146ab47cfcd0429c9dd2acc3c18ce28e7808a Mon Sep 17 00:00:00 2001
From: Seung-been Lee <sbstevenlee@gmail.com>
Date: Thu, 28 Apr 2022 22:47:26 +0900
Subject: [PATCH 25/32] Update CNV data for CYP2B6

---
 CHANGELOG.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index 0a670503..07964b25 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -7,7 +7,7 @@ Changelog
 * Add new optional arguments ``--genes`` and ``--exclude`` to :command:`prepare-depth-of-coverage` command.
 * Add new command :command:`slice-bam`.
 * Add new command :command:`print-data`.
-* Improve CNV caller for CYP2A6, CYP2D6, CYP2E1, GSTM1, SULT1A1, UGT1A4, UGT2B15, UGT2B17.
+* Improve CNV caller for CYP2A6, CYP2B6, CYP2D6, CYP2E1, GSTM1, SULT1A1, UGT1A4, UGT2B15, UGT2B17.
 * Add new CNV calls for CYP2A6: ``Deletion2Hom``, ``Hybrid5``, ``Hybrid6``.
 * Add new CNV call for GSTM1: ``Normal,Deletion2``.
 * Add new CNV call for UGT2B17: ``Deletion,PartialDeletion3``.

From 1f0d98f3155c7f06f093c2216e23283955c48aa2 Mon Sep 17 00:00:00 2001
From: "Seung-been \"Steven\" Lee" <sbstevenlee@gmail.com>
Date: Fri, 29 Apr 2022 11:50:10 +0900
Subject: [PATCH 26/32] Update CNV data for CYP2A6

---
 CHANGELOG.rst                |  2 +-
 docs/genes.rst               | 11 +++++++++++
 pypgx/api/data/cnv-table.csv |  1 +
 3 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index 07964b25..28e431f2 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -8,7 +8,7 @@ Changelog
 * Add new command :command:`slice-bam`.
 * Add new command :command:`print-data`.
 * Improve CNV caller for CYP2A6, CYP2B6, CYP2D6, CYP2E1, GSTM1, SULT1A1, UGT1A4, UGT2B15, UGT2B17.
-* Add new CNV calls for CYP2A6: ``Deletion2Hom``, ``Hybrid5``, ``Hybrid6``.
+* Add new CNV calls for CYP2A6: ``Deletion2Hom``, ``Hybrid5``, ``Hybrid6``, ``PseudogeneDeletion``.
 * Add new CNV call for GSTM1: ``Normal,Deletion2``.
 * Add new CNV call for UGT2B17: ``Deletion,PartialDeletion3``.
 
diff --git a/docs/genes.rst b/docs/genes.rst
index c5b358fa..835a5140 100644
--- a/docs/genes.rst
+++ b/docs/genes.rst
@@ -879,6 +879,17 @@ Below is comprehensive summary of SV described from real NGS studies:
      - `1KGP <https://www.biorxiv.org/content/10.1101/2021.02.06.430068v2>`__
      - NA20828
      -
+   * -
+     - PseudogeneDeletion
+     - Indeterminate
+     -
+     - :download:`Model <https://raw.githubusercontent.com/sbslee/pypgx-data/main/dpsv/gene-model-CYP2A6-14.png>`
+     - :download:`Profile <https://raw.githubusercontent.com/sbslee/pypgx-data/main/dpsv/GRCh37-CYP2A6-19.png>`
+     - :download:`Profile <https://raw.githubusercontent.com/sbslee/pypgx-data/main/dpsv/GRCh38-CYP2A6-19.png>`
+     - WGS
+     - `1KGP <https://www.biorxiv.org/content/10.1101/2021.02.06.430068v2>`__
+     - HG00625
+     -
 
 Filtered alleles for CYP2A6
 ---------------------------
diff --git a/pypgx/api/data/cnv-table.csv b/pypgx/api/data/cnv-table.csv
index 8108adbb..64347d83 100644
--- a/pypgx/api/data/cnv-table.csv
+++ b/pypgx/api/data/cnv-table.csv
@@ -17,6 +17,7 @@ CYP2A6,Tandem
 CYP2A6,Deletion2Hom
 CYP2A6,Hybrid5
 CYP2A6,Hybrid6
+CYP2A6,PseudogeneDeletion
 CYP2B6,Normal
 CYP2B6,Hybrid
 CYP2B6,Duplication

From 0692b55fe401272d7e66a6907bcee08d1be58e0d Mon Sep 17 00:00:00 2001
From: "Seung-been \"Steven\" Lee" <sbstevenlee@gmail.com>
Date: Fri, 29 Apr 2022 13:46:46 +0900
Subject: [PATCH 27/32] Update CNV data for SLC22A2

---
 CHANGELOG.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index 28e431f2..e4cb4d7c 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -7,7 +7,7 @@ Changelog
 * Add new optional arguments ``--genes`` and ``--exclude`` to :command:`prepare-depth-of-coverage` command.
 * Add new command :command:`slice-bam`.
 * Add new command :command:`print-data`.
-* Improve CNV caller for CYP2A6, CYP2B6, CYP2D6, CYP2E1, GSTM1, SULT1A1, UGT1A4, UGT2B15, UGT2B17.
+* Improve CNV caller for CYP2A6, CYP2B6, CYP2D6, CYP2E1, GSTM1, SLC22A2, SULT1A1, UGT1A4, UGT2B15, UGT2B17.
 * Add new CNV calls for CYP2A6: ``Deletion2Hom``, ``Hybrid5``, ``Hybrid6``, ``PseudogeneDeletion``.
 * Add new CNV call for GSTM1: ``Normal,Deletion2``.
 * Add new CNV call for UGT2B17: ``Deletion,PartialDeletion3``.

From 3faa553669feb9275154f34995c6d4a3f3b4c710 Mon Sep 17 00:00:00 2001
From: "Seung-been \"Steven\" Lee" <sbstevenlee@gmail.com>
Date: Fri, 29 Apr 2022 15:58:01 +0900
Subject: [PATCH 28/32] Fix typo "statistcs"

---
 CHANGELOG.rst                           |  1 +
 README.rst                              |  4 ++--
 docs/cli.rst                            | 16 ++++++++--------
 docs/create.py                          |  4 ++--
 pypgx/api/pipeline.py                   |  2 +-
 pypgx/api/utils.py                      |  4 ++--
 pypgx/cli/compute_control_statistics.py |  4 ++--
 pypgx/cli/compute_copy_number.py        |  6 +++---
 pypgx/cli/run_ngs_pipeline.py           |  8 ++++----
 9 files changed, 25 insertions(+), 24 deletions(-)

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index e4cb4d7c..700f0eef 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -7,6 +7,7 @@ Changelog
 * Add new optional arguments ``--genes`` and ``--exclude`` to :command:`prepare-depth-of-coverage` command.
 * Add new command :command:`slice-bam`.
 * Add new command :command:`print-data`.
+* Fix the typo "statistcs" to "statistics" throughout the package.
 * Improve CNV caller for CYP2A6, CYP2B6, CYP2D6, CYP2E1, GSTM1, SLC22A2, SULT1A1, UGT1A4, UGT2B15, UGT2B17.
 * Add new CNV calls for CYP2A6: ``Deletion2Hom``, ``Hybrid5``, ``Hybrid6``, ``PseudogeneDeletion``.
 * Add new CNV call for GSTM1: ``Normal,Deletion2``.
diff --git a/README.rst b/README.rst
index 70ed10ee..d5717011 100644
--- a/README.rst
+++ b/README.rst
@@ -357,7 +357,7 @@ currently defined semantic types:
 - ``SampleTable[Results]``
     * TSV file for storing various results for each sample.
     * Requires following metadata: ``Gene``, ``Assembly``, ``SemanticType``.
-- ``SampleTable[Statistcs]``
+- ``SampleTable[Statistics]``
     * TSV file for storing control gene's various statistics on read depth for each sample. Used for converting target gene's read depth to copy number.
     * Requires following metadata: ``Control``, ``Assembly``, ``SemanticType``, ``Platform``.
 - ``VcfFrame[Consolidated]``
@@ -509,7 +509,7 @@ input data is from whole genome sequencing (WGS) or targeted sequencing
 This pipeline supports SV detection based on copy number analysis for genes
 that are known to have SV. Therefore, if the target gene is associated with
 SV (e.g. CYP2D6) it's strongly recommended to provide a
-``CovFrame[DepthOfCoverage]`` file and a ``SampleTable[Statistcs]`` file in
+``CovFrame[DepthOfCoverage]`` file and a ``SampleTable[Statistics]`` file in
 addition to a VCF file containing SNVs/indels. If the target gene is not
 associated with SV (e.g. CYP3A5) providing a VCF file alone is enough. You can
 visit the `Genes <https://pypgx.readthedocs.io/en/latest/genes.html>`__ page
diff --git a/docs/cli.rst b/docs/cli.rst
index f35edd08..e09639ef 100644
--- a/docs/cli.rst
+++ b/docs/cli.rst
@@ -203,13 +203,13 @@ compute-control-statistics
    [Example] For the VDR gene from WGS data:
      $ pypgx compute-control-statistics \
      VDR \
-     control-statistcs.zip \
+     control-statistics.zip \
      1.bam 2.bam
    
    [Example] For a custom region from targeted sequencing data:
      $ pypgx compute-control-statistics \
      chr1:100-200 \
-     control-statistcs.zip \
+     control-statistics.zip \
      bam.list \
      --bed probes.bed
 
@@ -220,7 +220,7 @@ compute-copy-number
 
    $ pypgx compute-copy-number -h
    usage: pypgx compute-copy-number [-h] [--samples-without-sv TEXT [TEXT ...]]
-                                    read-depth control-statistcs copy-number
+                                    read-depth control-statistics copy-number
    
    Compute copy number from read depth for target gene.
    
@@ -235,7 +235,7 @@ compute-copy-number
    Positional arguments:
      read-depth            Input archive file with the semantic type
                            CovFrame[ReadDepth].
-     control-statistcs     Input archive file with the semantic type
+     control-statistics    Input archive file with the semantic type
                            SampleTable[Statistics].
      copy-number           Output archive file with the semantic type
                            CovFrame[CopyNumber].
@@ -899,7 +899,7 @@ run-ngs-pipeline
                            CovFrame[DepthOfCoverage].
      --control-statistics PATH
                            Archive file with the semantic type
-                           SampleTable[Statistcs].
+                           SampleTable[Statistics].
      --platform TEXT       Genotyping platform (default: 'WGS') (choices: 'WGS',
                            'Targeted')
      --assembly TEXT       Reference genome assembly (default: 'GRCh37')
@@ -920,7 +920,7 @@ run-ngs-pipeline
                            Do not plot copy number profile.
      --do-not-plot-allele-fraction
                            Do not plot allele fraction profile.
-     --cnv-caller PATH     Archive file with the semantic type Model[CNV]. By 
+     --cnv-caller PATH     Archive file with the semantic type Model[CNV]. By
                            default, a pre-trained CNV caller in the ~/pypgx-bundle
                            directory will be used.
    
@@ -936,7 +936,7 @@ run-ngs-pipeline
      CYP2D6-pipeline \
      --variants variants.vcf.gz \
      --depth-of-coverage depth-of-coverage.tsv \
-     --control-statistcs control-statistics-VDR.zip
+     --control-statistics control-statistics-VDR.zip
    
    [Example] To genotype the CYP2D6 gene from targeted sequencing data:
      $ pypgx run-ngs-pipeline \
@@ -944,7 +944,7 @@ run-ngs-pipeline
      CYP2D6-pipeline \
      --variants variants.vcf.gz \
      --depth-of-coverage depth-of-coverage.tsv \
-     --control-statistcs control-statistics-VDR.zip \
+     --control-statistics control-statistics-VDR.zip \
      --platform Targeted
 
 slice-bam
diff --git a/docs/create.py b/docs/create.py
index 71a8bd80..c33b0b98 100644
--- a/docs/create.py
+++ b/docs/create.py
@@ -384,7 +384,7 @@
 - ``SampleTable[Results]``
     * TSV file for storing various results for each sample.
     * Requires following metadata: ``Gene``, ``Assembly``, ``SemanticType``.
-- ``SampleTable[Statistcs]``
+- ``SampleTable[Statistics]``
     * TSV file for storing control gene's various statistics on read depth for each sample. Used for converting target gene's read depth to copy number.
     * Requires following metadata: ``Control``, ``Assembly``, ``SemanticType``, ``Platform``.
 - ``VcfFrame[Consolidated]``
@@ -536,7 +536,7 @@
 This pipeline supports SV detection based on copy number analysis for genes
 that are known to have SV. Therefore, if the target gene is associated with
 SV (e.g. CYP2D6) it's strongly recommended to provide a
-``CovFrame[DepthOfCoverage]`` file and a ``SampleTable[Statistcs]`` file in
+``CovFrame[DepthOfCoverage]`` file and a ``SampleTable[Statistics]`` file in
 addition to a VCF file containing SNVs/indels. If the target gene is not
 associated with SV (e.g. CYP3A5) providing a VCF file alone is enough. You can
 visit the `Genes <https://pypgx.readthedocs.io/en/latest/genes.html>`__ page
diff --git a/pypgx/api/pipeline.py b/pypgx/api/pipeline.py
index c1b3f858..548a4466 100644
--- a/pypgx/api/pipeline.py
+++ b/pypgx/api/pipeline.py
@@ -262,7 +262,7 @@ def run_ngs_pipeline(
         depth_of_coverage.check_metadata('Assembly', assembly)
 
         if control_statistics is None:
-            raise ValueError('SV detection requires SampleTable[Statistcs]')
+            raise ValueError('SV detection requires SampleTable[Statistics]')
 
         if isinstance(control_statistics, str):
             control_statistics = sdk.Archive.from_file(control_statistics)
diff --git a/pypgx/api/utils.py b/pypgx/api/utils.py
index 9cccaa1f..8f6b1a23 100644
--- a/pypgx/api/utils.py
+++ b/pypgx/api/utils.py
@@ -367,7 +367,7 @@ def compute_control_statistics(
     Returns
     -------
     pypgx.Archive
-        Archive object with the semantic type SampleTable[Statistcs].
+        Archive object with the semantic type SampleTable[Statistics].
     """
     gene_table = core.load_gene_table()
 
@@ -427,7 +427,7 @@ def compute_copy_number(
     ----------
     read_depth : str or pypgx.Archive
         Archive file or object with the semantic type CovFrame[ReadDepth].
-    control_statistcs : str or pypgx.Archive
+    control_statistics : str or pypgx.Archive
         Archive file or object with the semantic type
         SampleTable[Statistics].
     samples_without_sv : list, optional
diff --git a/pypgx/cli/compute_control_statistics.py b/pypgx/cli/compute_control_statistics.py
index 007499aa..b0257094 100644
--- a/pypgx/cli/compute_control_statistics.py
+++ b/pypgx/cli/compute_control_statistics.py
@@ -17,13 +17,13 @@
 [Example] For the VDR gene from WGS data:
   $ pypgx {fuc.api.common._script_name()} \\
   VDR \\
-  control-statistcs.zip \\
+  control-statistics.zip \\
   1.bam 2.bam
 
 [Example] For a custom region from targeted sequencing data:
   $ pypgx {fuc.api.common._script_name()} \\
   chr1:100-200 \\
-  control-statistcs.zip \\
+  control-statistics.zip \\
   bam.list \\
   --bed probes.bed
 """
diff --git a/pypgx/cli/compute_copy_number.py b/pypgx/cli/compute_copy_number.py
index 9b084fab..892412ff 100644
--- a/pypgx/cli/compute_copy_number.py
+++ b/pypgx/cli/compute_copy_number.py
@@ -33,8 +33,8 @@ def create_parser(subparsers):
 CovFrame[ReadDepth]."""
     )
     parser.add_argument(
-        'control_statistcs',
-        metavar='control-statistcs',
+        'control_statistics',
+        metavar='control-statistics',
         help=
 """Input archive file with the semantic type
 SampleTable[Statistics]."""
@@ -56,7 +56,7 @@ def create_parser(subparsers):
 
 def main(args):
     result = utils.compute_copy_number(
-        args.read_depth, args.control_statistcs,
+        args.read_depth, args.control_statistics,
         samples_without_sv=args.samples_without_sv
     )
     result.to_file(args.copy_number)
diff --git a/pypgx/cli/run_ngs_pipeline.py b/pypgx/cli/run_ngs_pipeline.py
index 03273333..2adf18df 100644
--- a/pypgx/cli/run_ngs_pipeline.py
+++ b/pypgx/cli/run_ngs_pipeline.py
@@ -26,7 +26,7 @@
   CYP2D6-pipeline \\
   --variants variants.vcf.gz \\
   --depth-of-coverage depth-of-coverage.tsv \\
-  --control-statistcs control-statistics-VDR.zip
+  --control-statistics control-statistics-VDR.zip
 
 [Example] To genotype the CYP2D6 gene from targeted sequencing data:
   $ pypgx {fuc.api.common._script_name()} \\
@@ -34,7 +34,7 @@
   CYP2D6-pipeline \\
   --variants variants.vcf.gz \\
   --depth-of-coverage depth-of-coverage.tsv \\
-  --control-statistcs control-statistics-VDR.zip \\
+  --control-statistics control-statistics-VDR.zip \\
   --platform Targeted
 """
 
@@ -78,7 +78,7 @@ def create_parser(subparsers):
         metavar='PATH',
         help=
 """Archive file with the semantic type
-SampleTable[Statistcs]."""
+SampleTable[Statistics]."""
     )
     parser.add_argument(
         '--platform',
@@ -150,7 +150,7 @@ def create_parser(subparsers):
         '--cnv-caller',
         metavar='PATH',
         help=
-"""Archive file with the semantic type Model[CNV]. By 
+"""Archive file with the semantic type Model[CNV]. By
 default, a pre-trained CNV caller in the ~/pypgx-bundle
 directory will be used."""
     )

From f63dfa7dd0e7b5d85c4d107dac3060f31d2b3b86 Mon Sep 17 00:00:00 2001
From: Seung-been Lee <sbstevenlee@gmail.com>
Date: Sat, 30 Apr 2022 21:30:43 +0900
Subject: [PATCH 29/32] Update CNV data for SULT1A1

---
 CHANGELOG.rst                |  1 +
 docs/genes.rst               | 11 +++++++++++
 pypgx/api/data/cnv-table.csv |  1 +
 3 files changed, 13 insertions(+)

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index 700f0eef..4d967a62 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -11,6 +11,7 @@ Changelog
 * Improve CNV caller for CYP2A6, CYP2B6, CYP2D6, CYP2E1, GSTM1, SLC22A2, SULT1A1, UGT1A4, UGT2B15, UGT2B17.
 * Add new CNV calls for CYP2A6: ``Deletion2Hom``, ``Hybrid5``, ``Hybrid6``, ``PseudogeneDeletion``.
 * Add new CNV call for GSTM1: ``Normal,Deletion2``.
+* Add new CNV call for SULT1A1: ``Unknown1``.
 * Add new CNV call for UGT2B17: ``Deletion,PartialDeletion3``.
 
 0.14.0 (2022-04-03)
diff --git a/docs/genes.rst b/docs/genes.rst
index 835a5140..a8145bf1 100644
--- a/docs/genes.rst
+++ b/docs/genes.rst
@@ -2286,6 +2286,17 @@ Below is comprehensive summary of SV described from real NGS studies:
     - `GeT-RM <https://pubmed.ncbi.nlm.nih.gov/26621101/>`__
     - NA19143
     -
+  * -
+    - Unknown1
+    - Indeterminate
+    -
+    - :download:`Model <https://raw.githubusercontent.com/sbslee/pypgx-data/main/dpsv/gene-model-SULT1A1-3.png>`
+    - :download:`Profile <https://raw.githubusercontent.com/sbslee/pypgx-data/main/dpsv/GRCh37-SULT1A1-8.png>`
+    - :download:`Profile <https://raw.githubusercontent.com/sbslee/pypgx-data/main/dpsv/GRCh38-SULT1A1-8.png>`
+    - WGS
+    - `GeT-RM <https://pubmed.ncbi.nlm.nih.gov/26621101/>`__
+    - HG01085
+    -
 
 TBXAS1
 ======
diff --git a/pypgx/api/data/cnv-table.csv b/pypgx/api/data/cnv-table.csv
index 64347d83..66df1aae 100644
--- a/pypgx/api/data/cnv-table.csv
+++ b/pypgx/api/data/cnv-table.csv
@@ -73,6 +73,7 @@ SULT1A1,DeletionHom
 SULT1A1,Duplication
 SULT1A1,Multiplication1
 SULT1A1,Multiplication2
+SULT1A1,Unknown1
 UGT1A4,Normal
 UGT1A4,Intron1DeletionA
 UGT1A4,Intron1DeletionB

From 734fadbf38780fd235ec1190d22e1626c61110bb Mon Sep 17 00:00:00 2001
From: Seung-been Lee <sbstevenlee@gmail.com>
Date: Mon, 2 May 2022 17:10:26 +0900
Subject: [PATCH 30/32] Update CNV data for CYP2D6; update
 `sdk.utils.simulate_copy_number`

---
 CHANGELOG.rst                |  2 ++
 docs/genes.rst               | 11 +++++++++++
 pypgx/api/data/cnv-table.csv |  1 +
 pypgx/sdk/utils.py           |  9 ++++++++-
 4 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index 4d967a62..46bdf74d 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -8,8 +8,10 @@ Changelog
 * Add new command :command:`slice-bam`.
 * Add new command :command:`print-data`.
 * Fix the typo "statistcs" to "statistics" throughout the package.
+* Update :meth:`sdk.utils.simulate_copy_number` method to automatically handle duplicate sample names.
 * Improve CNV caller for CYP2A6, CYP2B6, CYP2D6, CYP2E1, GSTM1, SLC22A2, SULT1A1, UGT1A4, UGT2B15, UGT2B17.
 * Add new CNV calls for CYP2A6: ``Deletion2Hom``, ``Hybrid5``, ``Hybrid6``, ``PseudogeneDeletion``.
+* Add new CNV call for CYP2D6: ``Tandem2F``.
 * Add new CNV call for GSTM1: ``Normal,Deletion2``.
 * Add new CNV call for SULT1A1: ``Unknown1``.
 * Add new CNV call for UGT2B17: ``Deletion,PartialDeletion3``.
diff --git a/docs/genes.rst b/docs/genes.rst
index a8145bf1..c883c20d 100644
--- a/docs/genes.rst
+++ b/docs/genes.rst
@@ -1213,6 +1213,17 @@ Below is comprehensive summary of SV described from real NGS studies:
      -
      -
      -
+   * -
+     - Tandem2F
+     - Indeterminate
+     -
+     - :download:`Model <https://raw.githubusercontent.com/sbslee/pypgx-data/main/dpsv/gene-model-CYP2D6-16.png>`
+     - :download:`Profile <https://raw.githubusercontent.com/sbslee/pypgx-data/main/dpsv/GRCh37-CYP2D6-19.png>`
+     - :download:`Profile <https://raw.githubusercontent.com/sbslee/pypgx-data/main/dpsv/GRCh38-CYP2D6-19.png>`
+     - WGS
+     - `1KGP <https://www.biorxiv.org/content/10.1101/2021.02.06.430068v2>`__
+     - HG00458
+     -
    * - \*13+\*1
      - Tandem3
      - \*1/\*13+\*1
diff --git a/pypgx/api/data/cnv-table.csv b/pypgx/api/data/cnv-table.csv
index 66df1aae..06e409a5 100644
--- a/pypgx/api/data/cnv-table.csv
+++ b/pypgx/api/data/cnv-table.csv
@@ -39,6 +39,7 @@ CYP2D6,Unknown1
 CYP2D6,Unknown2
 CYP2D6,PseudogeneDeletion
 CYP2D6,PseudogeneDownstreamDel
+CYP2D6,Tandem2F
 CYP2E1,Normal
 CYP2E1,Duplication1
 CYP2E1,Duplication2
diff --git a/pypgx/sdk/utils.py b/pypgx/sdk/utils.py
index b02a93bc..54ac7c82 100644
--- a/pypgx/sdk/utils.py
+++ b/pypgx/sdk/utils.py
@@ -314,7 +314,14 @@ def simulate_copy_number(
         s = data - noise
         s[data == 0] = 0
         s[s < 0] = 0
-        target.data.df[f'{sv}_{i+1}'] = s
+
+        j = 1
+        name = f'{sv}_{i+j}'
+        while name in target.data.samples:
+            j += 1
+            name = f'{sv}_{i+j}'
+
+        target.data.df[name] = s
 
     return target
 

From abef16a149e85cb3360a88a1dc66fbb9179db3f4 Mon Sep 17 00:00:00 2001
From: Seung-been Lee <sbstevenlee@gmail.com>
Date: Tue, 3 May 2022 14:48:21 +0900
Subject: [PATCH 31/32] Update docs: set 0.15.0 release date in changelog

---
 CHANGELOG.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index 46bdf74d..99ed501d 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -1,8 +1,8 @@
 Changelog
 *********
 
-0.15.0 (in development)
------------------------
+0.15.0 (2022-05-03)
+-------------------
 
 * Add new optional arguments ``--genes`` and ``--exclude`` to :command:`prepare-depth-of-coverage` command.
 * Add new command :command:`slice-bam`.

From 83ef1faf789cf120c56308c8997b50d202b44f79 Mon Sep 17 00:00:00 2001
From: Seung-been Lee <sbstevenlee@gmail.com>
Date: Tue, 3 May 2022 16:54:37 +0900
Subject: [PATCH 32/32] Update docs: fix "Wroking" typo and changelog wording

---
 CHANGELOG.rst  | 2 +-
 README.rst     | 2 +-
 docs/create.py | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index 99ed501d..318dbb75 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -7,7 +7,7 @@ Changelog
 * Add new optional arguments ``--genes`` and ``--exclude`` to :command:`prepare-depth-of-coverage` command.
 * Add new command :command:`slice-bam`.
 * Add new command :command:`print-data`.
-* Fix the typo "statistcs" to "statistics" throughout the package.
+* Fix typo "statistcs" to "statistics" throughout the package.
 * Update :meth:`sdk.utils.simulate_copy_number` method to automatically handle duplicate sample names.
 * Improve CNV caller for CYP2A6, CYP2B6, CYP2D6, CYP2E1, GSTM1, SLC22A2, SULT1A1, UGT1A4, UGT2B15, UGT2B17.
 * Add new CNV calls for CYP2A6: ``Deletion2Hom``, ``Hybrid5``, ``Hybrid6``, ``PseudogeneDeletion``.
diff --git a/README.rst b/README.rst
index d5717011..26b0be65 100644
--- a/README.rst
+++ b/README.rst
@@ -370,7 +370,7 @@ currently defined semantic types:
     * VcfFrame for storing target gene's phased variant data.
     * Requires following metadata: ``Platform``, ``Gene``, ``Assembly``, ``SemanticType``, ``Program``.
 
-Wroking with archive files
+Working with archive files
 --------------------------
 
 To demonstrate how easy it is to work with PyPGx archive files, below we will
diff --git a/docs/create.py b/docs/create.py
index c33b0b98..e16dc502 100644
--- a/docs/create.py
+++ b/docs/create.py
@@ -397,7 +397,7 @@
     * VcfFrame for storing target gene's phased variant data.
     * Requires following metadata: ``Platform``, ``Gene``, ``Assembly``, ``SemanticType``, ``Program``.
 
-Wroking with archive files
+Working with archive files
 --------------------------
 
 To demonstrate how easy it is to work with PyPGx archive files, below we will