From 1ca41d6d727b54a960d492f322eebfd0510e096e Mon Sep 17 00:00:00 2001
From: Jonas Fuchs <78491186+jonas-fuchs@users.noreply.github.com>
Date: Wed, 13 Nov 2024 09:43:22 +0100
Subject: [PATCH] added vcf parsing for freebayes and sanity check for gff
 parsing (#17)

* added VCF parsing for freebayes and other VCF formats that list multiple variants for a single position on one line

* updated version

* updated info for usage

* fixed handling of non-visited keys of the INFO field

* lazy fix for parsing gffs

* changed comment

* rm redundant code
---
 virheat/__init__.py          |  2 +-
 virheat/command.py           |  2 +-
 virheat/scripts/data_prep.py | 47 ++++++++++++++++++++++++------------
 3 files changed, 34 insertions(+), 17 deletions(-)
diff --git a/virheat/__init__.py b/virheat/__init__.py
index 02e2c78..249c695 100644
--- a/virheat/__init__.py
+++ b/virheat/__init__.py
@@ -1,3 +1,3 @@
 """plot vcf data as a heatmap mapped to a virus genome"""
 _program = "virheat"
-__version__ = "0.7.1"
+__version__ = "0.7.2"
diff --git a/virheat/command.py b/virheat/command.py
index 87778db..dcb484a 100644
--- a/virheat/command.py
+++ b/virheat/command.py
@@ -26,7 +26,7 @@ def get_args(sysargs):
     """
     parser = argparse.ArgumentParser(
         prog=_program,
-        usage='''\tvirheat <folder containing input files (vcf/tsv)> <output dir> -l or -g [additional arguments]''')
+        usage='''\tvirheat <folder containing input files (vcf/tsv)> <output dir> -r -l/-g [additional arguments]''')
     parser.add_argument(
         "input",
         nargs=2,
diff --git a/virheat/scripts/data_prep.py b/virheat/scripts/data_prep.py
index 687b63a..65cfab8 100644
--- a/virheat/scripts/data_prep.py
+++ b/virheat/scripts/data_prep.py
@@ -55,14 +55,14 @@ def read_vcf(vcf_file, reference):
 
     # get header and values
     with open(vcf_file, "r") as f:
-        header_lines = [l.split("\t") for l in f if l.startswith('#CHROM')]
+        header_lines = [l.strip().split("\t") for l in f if l.startswith('#CHROM')]
         if not header_lines:
             print(f"\033[31m\033[1mWARNING:\033[0m {vcf_file} does not contain a '#CHROM' header!")
             return {}
         header = header_lines[0]
     # get each line as frequency_lists
     with open(vcf_file, "r") as f:
-        lines = [l.split("\t") for l in f if l.startswith(reference)]
+        lines = [l.strip().split("\t") for l in f if l.startswith(reference)]
     # check if vcf is empty
     if not lines:
         print(f"\033[31m\033[1mWARNING:\033[0m {vcf_file} has no variants to {reference}!")
@@ -80,26 +80,38 @@ def read_vcf(vcf_file, reference):
     for line in lines:
         # remember keys that have an entry already
         visited_keys = []
+        # count the variants called at this position; multiple variants
+        # are given as a comma-separated list in line[4]
+        length_variants = len(line[4].split(','))
         for idx, key in enumerate(header[0:6]):
-            vcf_dict[key].append(convert_string(line[idx]))
+            sublines = line[idx].split(',')
+            for i in range(length_variants):
+                try:
+                    vcf_dict[key].append(convert_string(sublines[i]))
+                except IndexError:
+                    vcf_dict[key].append(convert_string(sublines[0]))
         # get mutation type
-        if len(line[3]) == len(line[4]):
-            vcf_dict["MUT_TYPE_"].append("SNV")
-        elif len(line[3]) < len(line[4]):
-            vcf_dict["MUT_TYPE_"].append("INS")
-        elif len(line[3]) > len(line[4]):
-            vcf_dict["MUT_TYPE_"].append("DEL")
+        mutations = line[4].split(',')
+        for mutation in mutations:
+            if len(line[3]) == len(mutation):
+                vcf_dict["MUT_TYPE_"].append("SNV")
+            elif len(line[3]) < len(mutation):
+                vcf_dict["MUT_TYPE_"].append("INS")
+            elif len(line[3]) > len(mutation):
+                vcf_dict["MUT_TYPE_"].append("DEL")
         visited_keys.extend(header[0:6])
         visited_keys.append("MUT_TYPE_")
         # get data from info field
         for info in line[7].split(";"):
             if "=" in info:
                 key, val = info.split("=")
-                vcf_dict[key].append(convert_string(val))
+                val_list = val.split(',')
+                for value in val_list:
+                    vcf_dict[key].append(convert_string(value))
                 visited_keys.append(key)
-        # append none for ech none visited key
+        # append one None per variant for each key not visited on this line
         for key in [k for k in vcf_dict.keys() if k not in visited_keys]:
-            vcf_dict[key].append(None)
+            vcf_dict[key].extend([None]*length_variants)
 
     return vcf_dict
 
@@ -282,7 +294,12 @@ def parse_gff3(file, reference):
             # ignore comments and last line
             if not line.startswith(reference):
                 continue
-            gff_values = line.split("\t")
+            gff_values = line.strip().split("\t")
+            # skip lines whose attribute column does not start with "ID=",
+            # as the ID is used as the dict key. This is a lazy fix: it also
+            # excludes e.g. exons that have no ID and only a parent
+            if not gff_values[8].startswith("ID="):
+                continue
             # create keys
             if gff_values[2] not in gff3_dict:
                 gff3_dict[gff_values[2]] = {}
@@ -292,10 +309,10 @@ def parse_gff3(file, reference):
                 # create a new dict for each ID
                 if identifier == "ID" and identifier not in gff3_dict:
                     attribute_id = val
-                    gff3_dict[gff_values[2]][attribute_id] = {}
+                    gff3_dict[gff_values[2]][val] = {}
                 # add attributes
                 if identifier != "ID":
-                    gff3_dict[gff_values[2]][attribute_id][identifier] = val.replace("\n", "")
+                    gff3_dict[gff_values[2]][attribute_id][identifier] = val
             # add start, stop and strand
             gff3_dict[gff_values[2]][attribute_id]["start"] = int(gff_values[3])
             gff3_dict[gff_values[2]][attribute_id]["stop"] = int(gff_values[4])