From 4536543c041f219eb7ad6d76d9a3215442730a00 Mon Sep 17 00:00:00 2001 From: gymreklab Date: Wed, 3 Jan 2024 13:01:37 -0800 Subject: [PATCH 1/3] adding new option to mergeSTR to give file with list of VCFs --- test/cmdline_tests.sh | 9 +++++++++ trtools/mergeSTR/README.rst | 3 ++- trtools/mergeSTR/mergeSTR.py | 15 +++++++++++++-- trtools/mergeSTR/tests/test_mergeSTR.py | 15 +++++++++++++++ 4 files changed, 39 insertions(+), 3 deletions(-) diff --git a/test/cmdline_tests.sh b/test/cmdline_tests.sh index 77762115..b0b751cd 100755 --- a/test/cmdline_tests.sh +++ b/test/cmdline_tests.sh @@ -171,6 +171,15 @@ FILE2=${EXDATADIR}/NA12891_chr21_popstr.sorted.vcf.gz FILE3=${EXDATADIR}/NA12892_chr21_popstr.sorted.vcf.gz runcmd_pass "mergeSTR --vcfs ${FILE1},${FILE2},${FILE3} --out ${TMPDIR}/test_merge_popstr --vcftype popstr" +# Test mergeSTR on a file with list of VCFs +FILE1=${EXDATADIR}/NA12878_chr21_hipstr.sorted.vcf.gz +FILE2=${EXDATADIR}/NA12891_chr21_hipstr.sorted.vcf.gz +FILE3=${EXDATADIR}/NA12892_chr21_hipstr.sorted.vcf.gz +echo ${FILE1} > ${TMPDIR}/vcf.list +echo ${FILE2} >> ${TMPDIR}/vcf.list +echo ${FILE3} >> ${TMPDIR}/vcf.list +runcmd_pass "mergeSTR --vcfs-list ${TMPDIR}/vcf.list --out ${TMPDIR}/test_merge_hipstr_list --vcftype hipstr" + runcmd_pass "statSTR --vcf ${EXDATADIR}/NA12878_chr21_advntr.sorted.vcf.gz --out stdout --afreq" runcmd_pass "statSTR --vcf ${EXDATADIR}/NA12891_chr21_eh.sorted.vcf.gz --out ${TMPDIR}/stats_eh --numcalled" runcmd_pass "statSTR --vcf ${EXDATADIR}/trio_chr21_gangstr.sorted.vcf.gz --out ${TMPDIR}/stats_gangstr --numcalled --mean" diff --git a/trtools/mergeSTR/README.rst b/trtools/mergeSTR/README.rst index b648a07f..8c00b677 100644 --- a/trtools/mergeSTR/README.rst +++ b/trtools/mergeSTR/README.rst @@ -26,7 +26,8 @@ To run mergeSTR use the following command:: Required Parameters: -* :code:`--vcf `: Comma-separated list of VCF files to merge. All must have been created by the same TR genotyper. Must be bgzipped, sorted, and indexed. (See `Instructions on Compressing and Indexing VCF files`_ below) +* :code:`--vcfs `: Comma-separated list of VCF files to merge. All must have been created by the same TR genotyper. Must be bgzipped, sorted, and indexed. (See `Instructions on Compressing and Indexing VCF files`_ below) +* :code:`--vcfs-list `: As an alternative to :code:`--vcfs`, you can provide a file with a list of bgzipped/sorted/indexed VCF files (one filename per line) to merge. * :code:`--vcftype `: Type of VCF files being merged. Default = :code:`auto`. Must be one of: :code:`gangstr`, :code:`advntr`, :code:`hipstr`, :code:`eh`, :code:`popstr`. * :code:`--out `: prefix to name output files diff --git a/trtools/mergeSTR/mergeSTR.py b/trtools/mergeSTR/mergeSTR.py index 56b2d319..83224470 100644 --- a/trtools/mergeSTR/mergeSTR.py +++ b/trtools/mergeSTR/mergeSTR.py @@ -537,7 +537,10 @@ def getargs() -> Any: # pragma: no cover req_group = parser.add_argument_group("Required arguments") req_group.add_argument("--vcfs", help="Comma-separated list of VCF files to merge (must be sorted, bgzipped and indexed)", - type=str, required=True) + type=str, required=False) + req_group.add_argument("--vcfs-list", + help="File containing list of VCF files to merge. Must specify either --vcfs or --vcfs-list", + type=str, required=False) req_group.add_argument("--out", help="Prefix to name output files", type=str, required=True) req_group.add_argument("--vcftype", help="Options=%s" % [str(item) for item in trh.VcfTypes.__members__], type=str, default="auto") @@ -579,7 +582,15 @@ def main(args: Any) -> int: "directory".format(args.out)) return 1 - filenames = args.vcfs.split(",") + if args.vcfs is None and args.vcfs_list is None: + common.WARNING("Error: you must specify either --vcfs or --vcfs-list") + return 1 + + if args.vcfs is not None: + filenames = args.vcfs.split(",") + else: + filenames = [item.strip() for item in open(args.vcfs_list, "r").readlines()] + ### Check and Load VCF files ### vcfreaders = utils.LoadReaders(filenames, checkgz=True) if vcfreaders is None: diff --git a/trtools/mergeSTR/tests/test_mergeSTR.py b/trtools/mergeSTR/tests/test_mergeSTR.py index 82d813d8..f9c729d3 100644 --- a/trtools/mergeSTR/tests/test_mergeSTR.py +++ b/trtools/mergeSTR/tests/test_mergeSTR.py @@ -14,6 +14,7 @@ def args(tmpdir): args = argparse.ArgumentParser() args.vcfs = None + args.vcfs_list = None args.out = str(tmpdir / "test") args.update_sample_from_file = False args.quiet = False @@ -46,6 +47,20 @@ def __init__(self, chrom, pos, ref, alts=None, info=None): self.info = info if info is not None else {} self.vcfrecord = DummyRecord(chrom, pos, ref, self.alt_alleles, self.info) +# Test file with list of VCFs +def test_FileList(args, mrgvcfdir, tmpdir): + fname1 = os.path.join(mrgvcfdir, "test_file_gangstr1.vcf.gz") + fname2 = os.path.join(mrgvcfdir, "test_file_gangstr2.vcf.gz") + args.vcftype = "gangstr" + listfile = str(tmpdir / "test.list") + f = open(listfile, "w") + f.write(fname1+"\n") + f.write(fname2+"\n") + f.close() + args.vcfs_list = listfile + args.vcfs = None + assert main(args)==0 + # Test right files or directory - GangSTR def test_GangSTRRightFile(args, mrgvcfdir): fname1 = os.path.join(mrgvcfdir, "test_file_gangstr1.vcf.gz") From f07a99dc51eb3811cd702137fe0c8635b7188ba2 Mon Sep 17 00:00:00 2001 From: gymreklab Date: Wed, 3 Jan 2024 13:15:11 -0800 Subject: [PATCH 2/3] adding extra tests to make sure vcfs and vcfs-list not both specified --- test/cmdline_tests.sh | 1 + trtools/mergeSTR/mergeSTR.py | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/test/cmdline_tests.sh b/test/cmdline_tests.sh index b0b751cd..a4d58038 100755 --- a/test/cmdline_tests.sh +++ b/test/cmdline_tests.sh @@ -179,6 +179,7 @@ echo ${FILE1} > ${TMPDIR}/vcf.list echo ${FILE2} >> ${TMPDIR}/vcf.list echo ${FILE3} >> ${TMPDIR}/vcf.list runcmd_pass "mergeSTR --vcfs-list ${TMPDIR}/vcf.list --out ${TMPDIR}/test_merge_hipstr_list --vcftype hipstr" +runcmd_fail "mergeSTR --vcfs ${FILE1},${FILE2},${FILE3} --vcfs-list ${TMPDIR}/vcf.list --out ${TMPDIR}/test_merge_hipstr_list --vcftype hipstr" runcmd_pass "statSTR --vcf ${EXDATADIR}/NA12878_chr21_advntr.sorted.vcf.gz --out stdout --afreq" runcmd_pass "statSTR --vcf ${EXDATADIR}/NA12891_chr21_eh.sorted.vcf.gz --out ${TMPDIR}/stats_eh --numcalled" diff --git a/trtools/mergeSTR/mergeSTR.py b/trtools/mergeSTR/mergeSTR.py index 83224470..30335d63 100644 --- a/trtools/mergeSTR/mergeSTR.py +++ b/trtools/mergeSTR/mergeSTR.py @@ -586,6 +586,10 @@ def main(args: Any) -> int: common.WARNING("Error: you must specify either --vcfs or --vcfs-list") return 1 + if args.vcfs is not None and args.vcfs_list is not None: + common.WARNING("Error: you cannot specify both --vcfs and --vcfs-list") + return 1 + if args.vcfs is not None: filenames = args.vcfs.split(",") else: From 77cba2304a13609b032c40505135e207f68a775d Mon Sep 17 00:00:00 2001 From: gymreklab Date: Wed, 3 Jan 2024 16:11:00 -0800 Subject: [PATCH 3/3] added test to assert VCF same when reading from filelist --- trtools/mergeSTR/tests/test_mergeSTR.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/trtools/mergeSTR/tests/test_mergeSTR.py b/trtools/mergeSTR/tests/test_mergeSTR.py index f9c729d3..0662339b 100644 --- a/trtools/mergeSTR/tests/test_mergeSTR.py +++ b/trtools/mergeSTR/tests/test_mergeSTR.py @@ -52,6 +52,17 @@ def test_FileList(args, mrgvcfdir, tmpdir): fname1 = os.path.join(mrgvcfdir, "test_file_gangstr1.vcf.gz") fname2 = os.path.join(mrgvcfdir, "test_file_gangstr2.vcf.gz") args.vcftype = "gangstr" + + # Run with files input to vcfs + nolist_outfile = str(tmpdir / "test-gangstr") + args.out = nolist_outfile + args.vcfs = fname1 + "," + fname2 + args.vcfs_list = None + assert main(args)==0 + + # Run with files input as list + list_outfile = str(tmpdir / "test-gangstr-list") + args.out = list_outfile listfile = str(tmpdir / "test.list") f = open(listfile, "w") f.write(fname1+"\n") @@ -60,6 +71,7 @@ def test_FileList(args, mrgvcfdir, tmpdir): args.vcfs_list = listfile args.vcfs = None assert main(args)==0 + assert_same_vcf(nolist_outfile + ".vcf", list_outfile + ".vcf") # Test right files or directory - GangSTR def test_GangSTRRightFile(args, mrgvcfdir):