Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: beagle AP field #224

Open
wants to merge 12 commits into
base: master
Choose a base branch
from
48 changes: 42 additions & 6 deletions trtools/mergeSTR/mergeSTR.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,9 +50,10 @@
trh.VcfTypes.popstr: ["AD", "DP", "PL"],
trh.VcfTypes.advntr: ["DP", "SR", "FR", "ML"]
}
# Extra FORMAT fields to attempt to merge, on top of the per-tool
# FORMATFIELDS, when the input VCFs were generated by Beagle
BEAGLEFORMATFIELDS = ["AP1", "AP2", "DS"]


def WriteMergedHeader(vcfw: TextIO, args: Any, readers: List[cyvcf2.VCF], cmd: str, vcftype: Union[str, trh.VcfTypes]) \
def WriteMergedHeader(vcfw: TextIO, args: Any, readers: List[cyvcf2.VCF], cmd: str, \
vcftype: Union[str, trh.VcfTypes], isbeagle: bool) \
-> Union[Tuple[List[Tuple[str, bool]], List[str]], Tuple[None, None]]:
r"""Write merged header for VCFs in args.vcfs

Expand All @@ -72,6 +73,8 @@ def WriteMergedHeader(vcfw: TextIO, args: Any, readers: List[cyvcf2.VCF], cmd: s
Command used to call this program
vcftype : str
Type of VCF files being merged
isbeagle: bool
Files being merged were generated by Beagle

Returns
-------
Expand Down Expand Up @@ -136,7 +139,10 @@ def get_header_lines(field: str, reader: cyvcf2.VCF) -> List[str]:
# Write FORMAT fields, different for each tool
useformat: List[str] = []
formats = get_header_lines('format', readers[0])
for field in FORMATFIELDS[vcftype]:
attempt_format_fields = FORMATFIELDS[vcftype].copy() # copy since we will modify
if isbeagle:
attempt_format_fields.extend(BEAGLEFORMATFIELDS)
for field in attempt_format_fields:
this_format = [line for line in formats if 'ID=' + field + ',' in line]
if len(this_format) == 0:
common.WARNING("Expected format field %s not found. Skipping" % field)
Expand Down Expand Up @@ -292,6 +298,26 @@ def HipstrKey(record: trh.TRRecord):
)
return out_alts, mappings

def CheckIdenticalAlleleOrder(mappings: List[np.ndarray]) -> bool:
    r"""Determine whether all records being merged share the same alleles in the same order

    Parameters
    ----------
    mappings : list of np.ndarray
       One mapping per record being merged. See GetAltAlleles

    Returns
    -------
    same_alleles : bool
       True if every mapping has the same length and contents as the
       first one (trivially True when there is nothing to compare),
       False otherwise
    """
    if len(mappings) <= 1:
        # Zero or one mapping: nothing to compare against
        return True

    first = mappings[0]
    for other in mappings[1:]:
        # Lengths are compared first so that arrays of different sizes
        # are never compared elementwise
        same_length = len(other) == len(first)
        if not (same_length and np.all(other == first)):
            return False
    return True

def GetID(idval: str) -> str:
r"""Get the ID for a record
Expand Down Expand Up @@ -479,6 +505,15 @@ def MergeRecords(readers: cyvcf2.VCF, vcftype: Union[str, trh.VcfTypes], num_sam

alt_alleles, mappings = GetAltAlleles(ref_allele, current_records, mergelist, vcftype)

# Check beagle-specific fields, and only merge if
# ref/alt alleles are identical and in same order
mergeformats = useformat.copy()
if not CheckIdenticalAlleleOrder(mappings):
for field in BEAGLEFORMATFIELDS:
if field in mergeformats:
common.WARNING("Conflicting alt alleles found at {}:{}. Skipping merging of Beagle field {}.".format(chrom, pos, field))
mergeformats.remove(field)

# Set common fields
vcfw.write(chrom) # CHROM
vcfw.write('\t')
Expand Down Expand Up @@ -511,14 +546,14 @@ def MergeRecords(readers: cyvcf2.VCF, vcftype: Union[str, trh.VcfTypes], num_sam
vcfw.write('\t')

# FORMAT - add GT to front
vcfw.write(":".join(["GT"] + useformat))
vcfw.write(":".join(["GT"] + mergeformats))

# Samples
alleles = [ref_allele] + alt_alleles
map_iter = iter(mappings)
for i in range(len(mergelist)):
if mergelist[i]:
WriteSampleData(vcfw, current_records[i].vcfrecord, alleles, useformat,
WriteSampleData(vcfw, current_records[i].vcfrecord, alleles, mergeformats,
format_type, next(map_iter))
else: # NOCALL
if num_samples[i] > 0:
Expand Down Expand Up @@ -610,14 +645,15 @@ def main(args: Any) -> int:
### Check inferred type of each is the same
try:
vcftype = mergeutils.GetAndCheckVCFType(vcfreaders, args.vcftype)
isbeagle = mergeutils.GetAndCheckBeagle(vcfreaders)
except ValueError as ve:
common.WARNING('Error: ' + str(ve))
return 1

### Set up VCF writer ###
vcfw = open(args.out + ".vcf", "w")

useinfo, useformat = WriteMergedHeader(vcfw, args, vcfreaders, " ".join(sys.argv), vcftype)
useinfo, useformat = WriteMergedHeader(vcfw, args, vcfreaders, " ".join(sys.argv), vcftype, isbeagle)

if useinfo is None or useformat is None:
common.WARNING("Error writing merged header. Quitting")
Expand Down
55 changes: 55 additions & 0 deletions trtools/mergeSTR/tests/test_mergeSTR.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@

from ..mergeSTR import *
from trtools.testsupport.utils import assert_same_vcf
import gzip, shutil



# Set up base argparser
Expand Down Expand Up @@ -115,6 +117,21 @@ def test_hipSTRRightFile(args, mrgvcfdir):
args.verbose = True
assert main(args)==0

# Test right files or directory - hipstr with FORMAT AP field and IMP
def test_hipSTRRightFile_Beagle(args, mrgvcfdir):
    """Merging two imputed HipSTR VCFs succeeds under several option settings."""
    inputs = [
        os.path.join(mrgvcfdir, "hipstr_imputed_merge1.vcf.gz"),
        os.path.join(mrgvcfdir, "hipstr_imputed_merge2.vcf.gz"),
    ]
    args.vcfs = ",".join(inputs)

    # Explicitly declared VCF type
    args.vcftype = "hipstr"
    status = main(args)
    assert status == 0

    # Auto-detected VCF type
    args.vcftype = "auto"
    status = main(args)
    assert status == 0

    # Options accumulate: each run keeps the settings of the previous one
    args.update_sample_from_file = False
    status = main(args)
    assert status == 0

    args.verbose = True
    status = main(args)
    assert status == 0


# Test right files or directory - ExpansionHunter
def test_ExpansionHunterRightFile(args, mrgvcfdir):
fname1 = os.path.join(mrgvcfdir, "test_file_eh1.vcf.gz")
Expand Down Expand Up @@ -156,6 +173,16 @@ def test_multiple_vcf_types(args, mrgvcfdir, capsys):
assert main(args) == 1
assert 'mixed types' in capsys.readouterr().err



def test_mixed_beagle_types(args, mrgvcfdir, capsys):
    """Merging a non-imputed VCF with an imputed one must fail with an error message."""
    plain_vcf = os.path.join(mrgvcfdir, "hipstr_merge1.vcf.gz")
    imputed_vcf = os.path.join(mrgvcfdir, "hipstr_imputed_merge2.vcf.gz")
    args.vcfs = ",".join((plain_vcf, imputed_vcf))
    args.vcftype = "auto"

    status = main(args)
    assert status == 1

    captured = capsys.readouterr()
    assert 'Mix of Beagle/non-Beagle VCFs identified.' in captured.err

def test_duplicate_ids(args, mrgvcfdir, capsys):
fname1 = os.path.join(mrgvcfdir, "test_file_gangstr1.vcf.gz")
fname2 = os.path.join(mrgvcfdir, "test_file_gangstr_dupID.vcf.gz")
Expand Down Expand Up @@ -252,6 +279,22 @@ def test_ConflictingRefs():
retval = GetRefAllele(dummy_records, [True, True, False], None)
assert retval == "CAGCAG"

# Check Beagle VCFs with a different number of ALT alleles
def test_DifferentAltAllele(args, mrgvcfdir, capsys):
    """Merge still succeeds, with a warning, when the ALT alleles differ between files."""
    inputs = [
        os.path.join(mrgvcfdir, "hipstr_imputed_merge2.vcf.gz"),
        os.path.join(mrgvcfdir, "hipstr_imputed_diffALT.vcf.gz"),
    ]
    args.vcfs = ",".join(inputs)

    status = main(args)
    assert status == 0

    captured = capsys.readouterr()
    assert "Conflicting alt alleles found at" in captured.err

# Check whether allele order is identical between files
def test_CheckIdenticalAlleleOrder(args, mrgvcfdir, capsys):
    """Exercise CheckIdenticalAlleleOrder directly, then through a full merge.

    Review feedback on this test: it should call the function
    CheckIdenticalAlleleOrder directly.
    """
    # Local import so this test does not depend on numpy being re-exported
    # by the star import at the top of the test module
    import numpy as np

    # Direct unit checks of the helper
    reference = np.array([0, 1, 2])
    assert CheckIdenticalAlleleOrder([reference])
    assert CheckIdenticalAlleleOrder([reference, np.array([0, 1, 2])])
    # Same alleles in a different order
    assert not CheckIdenticalAlleleOrder([reference, np.array([0, 2, 1])])
    # Different number of alleles
    assert not CheckIdenticalAlleleOrder([reference, np.array([0, 1])])

    # End-to-end: files whose alleles are ordered differently still merge,
    # but a warning about the conflicting alleles is emitted
    fname1 = os.path.join(mrgvcfdir, "hipstr_imputed_merge1.vcf.gz")
    fname2 = os.path.join(mrgvcfdir, "hipstr_imputed_difforder.vcf.gz")
    args.vcfs = fname1 + "," + fname2
    assert main(args) == 0
    assert "Conflicting alt alleles found at" in capsys.readouterr().err

def test_GetInfoItem(capsys):
# Set up dummy records
dummy_records = []
Expand Down Expand Up @@ -351,6 +394,18 @@ def test_hipstr_output(args, mrgvcfdir):
assert main(args) == 0
assert_same_vcf(args.out + '.vcf', mrgvcfdir + "/hipstr_merged.vcf")

# Test that the AP field exists in the merged output
def test_hipstr_output_Beagle(args, mrgvcfdir):
    """Merged output of two imputed HipSTR VCFs matches the stored expected VCF.

    The expected VCF is stored gzip-compressed, so it is decompressed into a
    temporary directory before comparison. The temporary directory is removed
    even when an assertion fails, so no stray file is left behind.
    """
    import tempfile

    fname1 = os.path.join(mrgvcfdir, "hipstr_imputed_merge1.vcf.gz")
    fname2 = os.path.join(mrgvcfdir, "hipstr_imputed_merge2.vcf.gz")
    args.vcftype = "hipstr"
    args.vcfs = fname1 + "," + fname2

    expected_gz = os.path.join(mrgvcfdir, "hipstr_imputed_merged.vcf.gz")
    with tempfile.TemporaryDirectory() as scratch_dir:
        expected_vcf = os.path.join(scratch_dir, "hipstr_imputed_merged.vcf")
        with gzip.open(expected_gz, 'rb') as f_in, open(expected_vcf, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
        assert main(args) == 0
        assert_same_vcf(args.out + '.vcf', expected_vcf)

def test_hipstr_output_flanking_pb_harmonization(args, mrgvcfdir):
fname1 = os.path.join(mrgvcfdir, "hipstr-harmonized-merge-contains-flanking.vcf.gz")
fname2 = os.path.join(mrgvcfdir, "hipstr-harmonized-merge-no-flanking.vcf.gz")
Expand Down
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
29 changes: 29 additions & 0 deletions trtools/utils/mergeutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,35 @@ def GetAndCheckVCFType(vcfs: List[CYVCF_READER], vcftype: str) -> str:
else:
raise ValueError("VCF files are of mixed types.")

def GetAndCheckBeagle(vcfs: List[CYVCF_READER]) -> bool:
    """Check if the files being merged are Beagle generated

    If all are from Beagle, return True
    If all are not from Beagle, return False
    If mixed, raise ValueError since in that case we will not merge

    Parameters
    ----------
    vcfs: list of cyvcf2.VCF
        Multiple VCFs

    Returns
    -------
    isbeagle : bool
        Indicates if all files are/are not Beagle generated

    Raises
    ------
    ValueError
        If input files are a mix of Beagle/non-Beagle.
        Also raised (with the same message) if ``vcfs`` is empty,
        since no single answer can be determined.
    """
    # Collapse the per-file answers; exactly one distinct value means
    # every file agrees
    beagle_flags = {trh.IsBeagleVCF(vcf) for vcf in vcfs}
    if len(beagle_flags) == 1:
        return beagle_flags.pop()
    raise ValueError("Mix of Beagle/non-Beagle VCFs identified.")

def GetChromOrder(r: CYVCF_RECORD, chroms: List[str]) -> Union[int, float]:
r"""Get the chromosome order of a record
Expand Down
Loading