Skip to content

Commit

Permalink
added vcf parsing for freebayes and sanity check for gff parsing (#17)
Browse files Browse the repository at this point in the history
* added vcf parsing for freebayes and vcf formats that have multiple variants in one line associated to one position

* updated version

* updated info for usage

* fixed non-visited keys of the info field

* lazy fix for parsing gffs

* changed comment

* rm redundant code
  • Loading branch information
jonas-fuchs authored Nov 13, 2024
1 parent d5763bd commit 1ca41d6
Show file tree
Hide file tree
Showing 3 changed files with 34 additions and 17 deletions.
2 changes: 1 addition & 1 deletion virheat/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
"""plot vcf data as a heatmap mapped to a virus genome"""
_program = "virheat"
__version__ = "0.7.1"
__version__ = "0.7.2"
2 changes: 1 addition & 1 deletion virheat/command.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ def get_args(sysargs):
"""
parser = argparse.ArgumentParser(
prog=_program,
usage='''\tvirheat <folder containing input files (vcf/tsv)> <output dir> -l or -g [additional arguments]''')
usage='''\tvirheat <folder containing input files (vcf/tsv)> <output dir> -r -l/-g [additional arguments]''')
parser.add_argument(
"input",
nargs=2,
Expand Down
47 changes: 32 additions & 15 deletions virheat/scripts/data_prep.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,14 +55,14 @@ def read_vcf(vcf_file, reference):

# get header and values
with open(vcf_file, "r") as f:
header_lines = [l.split("\t") for l in f if l.startswith('#CHROM')]
header_lines = [l.strip().split("\t") for l in f if l.startswith('#CHROM')]
if not header_lines:
print(f"\033[31m\033[1mWARNING:\033[0m {vcf_file} does not contain a '#CHROM' header!")
return {}
header = header_lines[0]
# get each line as frequency_lists
with open(vcf_file, "r") as f:
lines = [l.split("\t") for l in f if l.startswith(reference)]
lines = [l.strip().split("\t") for l in f if l.startswith(reference)]
# check if vcf is empty
if not lines:
print(f"\033[31m\033[1mWARNING:\033[0m {vcf_file} has no variants to {reference}!")
Expand All @@ -80,26 +80,38 @@ def read_vcf(vcf_file, reference):
for line in lines:
# remember keys that have an entry already
visited_keys = []
# check if there are multiple called variants at a single position
# separated by a comma
length_variants = len(line[4].split(','))
for idx, key in enumerate(header[0:6]):
vcf_dict[key].append(convert_string(line[idx]))
sublines = line[idx].split(',')
for i in range(length_variants):
try:
vcf_dict[key].append(convert_string(sublines[i]))
except IndexError:
vcf_dict[key].append(convert_string(sublines[0]))
# get mutation type
if len(line[3]) == len(line[4]):
vcf_dict["MUT_TYPE_"].append("SNV")
elif len(line[3]) < len(line[4]):
vcf_dict["MUT_TYPE_"].append("INS")
elif len(line[3]) > len(line[4]):
vcf_dict["MUT_TYPE_"].append("DEL")
mutations = line[4].split(',')
for mutation in mutations:
if len(line[3]) == len(mutation):
vcf_dict["MUT_TYPE_"].append("SNV")
elif len(line[3]) < len(mutation):
vcf_dict["MUT_TYPE_"].append("INS")
elif len(line[3]) > len(mutation):
vcf_dict["MUT_TYPE_"].append("DEL")
visited_keys.extend(header[0:6])
visited_keys.append("MUT_TYPE_")
# get data from info field
for info in line[7].split(";"):
if "=" in info:
key, val = info.split("=")
vcf_dict[key].append(convert_string(val))
val_list = val.split(',')
for value in val_list:
vcf_dict[key].append(convert_string(value))
visited_keys.append(key)
# append none for ech none visited key
# append none for each none visited key in the INFO field
for key in [k for k in vcf_dict.keys() if k not in visited_keys]:
vcf_dict[key].append(None)
vcf_dict[key].extend([None]*length_variants)

return vcf_dict

Expand Down Expand Up @@ -282,7 +294,12 @@ def parse_gff3(file, reference):
# ignore comments and last line
if not line.startswith(reference):
continue
gff_values = line.split("\t")
gff_values = line.strip().split("\t")
# sanity check that the line has a unique ID for the dict key
# this is a lazy fix as it will exclude e.g. exons without ID and
# only a parent
if not gff_values[8].startswith("ID="):
continue
# create keys
if gff_values[2] not in gff3_dict:
gff3_dict[gff_values[2]] = {}
Expand All @@ -292,10 +309,10 @@ def parse_gff3(file, reference):
# create a new dict for each ID
if identifier == "ID" and identifier not in gff3_dict:
attribute_id = val
gff3_dict[gff_values[2]][attribute_id] = {}
gff3_dict[gff_values[2]][val] = {}
# add attributes
if identifier != "ID":
gff3_dict[gff_values[2]][attribute_id][identifier] = val.replace("\n", "")
gff3_dict[gff_values[2]][attribute_id][identifier] = val
# add start, stop and strand
gff3_dict[gff_values[2]][attribute_id]["start"] = int(gff_values[3])
gff3_dict[gff_values[2]][attribute_id]["stop"] = int(gff_values[4])
Expand Down

0 comments on commit 1ca41d6

Please sign in to comment.