From 257d2309c8eb85cd63e8d8ab3830c027b7925180 Mon Sep 17 00:00:00 2001
From: miseminger <madeline.iseminger@gmail.com>
Date: Mon, 29 Jul 2024 17:54:16 -0700
Subject: [PATCH] change 'gene' attribute to 'gene_name' in vcf2gvf GVF output

---
 bin/functions.py | 6 +++---
 bin/vcf2gvf.py   | 4 ++--
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/bin/functions.py b/bin/functions.py
index 53550dc9..5051c39e 100755
--- a/bin/functions.py
+++ b/bin/functions.py
@@ -3,7 +3,7 @@
 import logging
 
 # standard variables used by all scripts
-empty_attributes = 'ID=;Name=;alias=;gene=;gene_symbol=;protein_name=;protein_symbol=;\
+empty_attributes = 'ID=;Name=;alias=;gene_name=;gene_symbol=;protein_name=;protein_symbol=;\
     protein_id=;alias_protein_id=;transcript_id=;ps_filter=;ps_exc=; \
     mat_pep=;mat_pep_desc=;mat_pep_acc=;ro=;ao=;dp=;sample_size=; \
     Reference_seq=;Variant_seq=;nt_name=;aa_name=;hgvs_nt=;hgvs_aa=;hgvs_alias=; \
@@ -506,7 +506,7 @@ def map_pos_to_gene_protein(pos, GENE_PROTEIN_POSITIONS_DICT):
             protein_symbol = GENE_PROTEIN_POSITIONS_DICT[entry]["protein_alias"]
             protein_id = GENE_PROTEIN_POSITIONS_DICT[entry]["protein_id"]
             transcript_id = GENE_PROTEIN_POSITIONS_DICT[entry]["locus_tag"]
-
+    
             # fill in attributes for mutations in this CDS region
             cds_mask = df[pos_column].astype(int).between(start, end, inclusive="both")
             df.loc[cds_mask, "gene"] = gene
@@ -542,7 +542,7 @@ def add_alias_names(df, GENE_PROTEIN_POSITIONS_DICT):
     df.loc[:, 'alias'] = 'n/a'
 
     # get list of all NSP, 3CL, and PlPro proteins in the file:
-    alias_mask = (df['gene'].str.contains("ORF1ab")) & (df['mat_pep']!='n/a')
+    alias_mask = (df['gene_symbol'].str.contains("ORF1ab")) & (df['mat_pep']!='n/a')
     nsps_list = sorted(list(set(df[alias_mask]['mat_pep'].tolist())))
     if len(nsps_list) > 0:
         
diff --git a/bin/vcf2gvf.py b/bin/vcf2gvf.py
index f0c72068..2a65f795 100755
--- a/bin/vcf2gvf.py
+++ b/bin/vcf2gvf.py
@@ -8,7 +8,7 @@
 files. Required user input is a VCF file.
     
 The attributes completed by this script are: 
-['ID', 'Name', 'gene', 'protein_name', 'protein_symbol', 'protein_id', 'ps_filter', 'ps_exc', 'mat_pep',
+['ID', 'Name', 'gene_name', 'gene_symbol', 'protein_name', 'protein_symbol', 'protein_id', 'ps_filter', 'ps_exc', 'mat_pep',
 'mat_pep_desc','mat_pep_acc', 'ro', 'ao', 'dp', 'sample_size', 'Reference_seq',
 'Variant_seq', 'nt_name', 'aa_name', 'hgvs_nt', 'hgvs_aa', 'hgvs_alias', 'vcf_gene', 'mutation_type',
 'viral_lineage', 'alternate_frequency', 'transcript_id']
@@ -83,7 +83,7 @@ def vcftogvf(vcf, strain, GENE_PROTEIN_POSITIONS_DICT, sample_size):
     # add gene and protein attributes from JSON
     json_df = map_pos_to_gene_protein(
         vcf_df['POS'].astype(int), GENE_PROTEIN_POSITIONS_DICT)
-    new_gvf["gene"] = json_df["gene"]
+    new_gvf["gene_name"] = json_df["gene"]
     new_gvf["gene_symbol"] = json_df["gene"]
     new_gvf["protein_name"] = json_df["protein_name"]
     new_gvf["protein_symbol"] = json_df["protein_symbol"]