From 1ca41d6d727b54a960d492f322eebfd0510e096e Mon Sep 17 00:00:00 2001 From: Jonas Fuchs <78491186+jonas-fuchs@users.noreply.github.com> Date: Wed, 13 Nov 2024 09:43:22 +0100 Subject: [PATCH] added vcf parsing for freebayes and sanity check for gff parsing (#17) * added vcf parsing for freebayes and vcf formats that have multiple variants in one line associated to one position * updated version * updated info for usage * fixed non-visited keys of the info field * lazy fix for parsing gffs * changed comment * rm redundant code --- virheat/__init__.py | 2 +- virheat/command.py | 2 +- virheat/scripts/data_prep.py | 47 ++++++++++++++++++++++++------------ 3 files changed, 34 insertions(+), 17 deletions(-) diff --git a/virheat/__init__.py b/virheat/__init__.py index 02e2c78..249c695 100644 --- a/virheat/__init__.py +++ b/virheat/__init__.py @@ -1,3 +1,3 @@ """plot vcf data as a heatmap mapped to a virus genome""" _program = "virheat" -__version__ = "0.7.1" +__version__ = "0.7.2" diff --git a/virheat/command.py b/virheat/command.py index 87778db..dcb484a 100644 --- a/virheat/command.py +++ b/virheat/command.py @@ -26,7 +26,7 @@ def get_args(sysargs): """ parser = argparse.ArgumentParser( prog=_program, - usage='''\tvirheat -l or -g [additional arguments]''') + usage='''\tvirheat -r -l/-g [additional arguments]''') parser.add_argument( "input", nargs=2, diff --git a/virheat/scripts/data_prep.py b/virheat/scripts/data_prep.py index 687b63a..65cfab8 100644 --- a/virheat/scripts/data_prep.py +++ b/virheat/scripts/data_prep.py @@ -55,14 +55,14 @@ def read_vcf(vcf_file, reference): # get header and values with open(vcf_file, "r") as f: - header_lines = [l.split("\t") for l in f if l.startswith('#CHROM')] + header_lines = [l.strip().split("\t") for l in f if l.startswith('#CHROM')] if not header_lines: print(f"\033[31m\033[1mWARNING:\033[0m {vcf_file} does not contain a '#CHROM' header!") return {} header = header_lines[0] # get each line as frequency_lists with open(vcf_file, "r") as f: - lines = [l.split("\t") for l in f if l.startswith(reference)] + lines = [l.strip().split("\t") for l in f if l.startswith(reference)] # check if vcf is empty if not lines: print(f"\033[31m\033[1mWARNING:\033[0m {vcf_file} has no variants to {reference}!") @@ -80,26 +80,38 @@ def read_vcf(vcf_file, reference): for line in lines: # remember keys that have an entry already visited_keys = [] + # check if there are multiple called variants at a single position + # separated by a comma + length_variants = len(line[4].split(',')) for idx, key in enumerate(header[0:6]): - vcf_dict[key].append(convert_string(line[idx])) + sublines = line[idx].split(',') + for i in range(length_variants): + try: + vcf_dict[key].append(convert_string(sublines[i])) + except IndexError: + vcf_dict[key].append(convert_string(sublines[0])) # get mutation type - if len(line[3]) == len(line[4]): - vcf_dict["MUT_TYPE_"].append("SNV") - elif len(line[3]) < len(line[4]): - vcf_dict["MUT_TYPE_"].append("INS") - elif len(line[3]) > len(line[4]): - vcf_dict["MUT_TYPE_"].append("DEL") + mutations = line[4].split(',') + for mutation in mutations: + if len(line[3]) == len(mutation): + vcf_dict["MUT_TYPE_"].append("SNV") + elif len(line[3]) < len(mutation): + vcf_dict["MUT_TYPE_"].append("INS") + elif len(line[3]) > len(mutation): + vcf_dict["MUT_TYPE_"].append("DEL") visited_keys.extend(header[0:6]) visited_keys.append("MUT_TYPE_") # get data from info field for info in line[7].split(";"): if "=" in info: key, val = info.split("=") - vcf_dict[key].append(convert_string(val)) + val_list = val.split(',') + for value in val_list: + vcf_dict[key].append(convert_string(value)) visited_keys.append(key) - # append none for ech none visited key + # append none for each none visited key in the INFO field for key in [k for k in vcf_dict.keys() if k not in visited_keys]: - vcf_dict[key].append(None) + vcf_dict[key].extend([None]*length_variants) return vcf_dict @@ -282,7 +294,12 @@ def parse_gff3(file, reference): # ignore comments and last line if not line.startswith(reference): continue - gff_values = line.split("\t") + gff_values = line.strip().split("\t") + # sanity check that the line has a unique ID for the dict key + # this is a lazy fix as it will exclude e.g. exons without ID and + # only a parent + if not gff_values[8].startswith("ID="): + continue # create keys if gff_values[2] not in gff3_dict: gff3_dict[gff_values[2]] = {} @@ -292,10 +309,10 @@ def parse_gff3(file, reference): # create a new dict for each ID if identifier == "ID" and identifier not in gff3_dict: attribute_id = val - gff3_dict[gff_values[2]][attribute_id] = {} + gff3_dict[gff_values[2]][val] = {} # add attributes if identifier != "ID": - gff3_dict[gff_values[2]][attribute_id][identifier] = val.replace("\n", "") + gff3_dict[gff_values[2]][attribute_id][identifier] = val # add start, stop and strand gff3_dict[gff_values[2]][attribute_id]["start"] = int(gff_values[3]) gff3_dict[gff_values[2]][attribute_id]["stop"] = int(gff_values[4])