From 257d2309c8eb85cd63e8d8ab3830c027b7925180 Mon Sep 17 00:00:00 2001 From: miseminger Date: Mon, 29 Jul 2024 17:54:16 -0700 Subject: [PATCH] change 'gene' attribute to 'gene_name' in VCF --- bin/functions.py | 6 +++--- bin/vcf2gvf.py | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/bin/functions.py b/bin/functions.py index 53550dc9..5051c39e 100755 --- a/bin/functions.py +++ b/bin/functions.py @@ -3,7 +3,7 @@ import logging # standard variables used by all scripts -empty_attributes = 'ID=;Name=;alias=;gene=;gene_symbol=;protein_name=;protein_symbol=;\ +empty_attributes = 'ID=;Name=;alias=;gene_name=;gene_symbol=;protein_name=;protein_symbol=;\ protein_id=;alias_protein_id=;transcript_id=;ps_filter=;ps_exc=; \ mat_pep=;mat_pep_desc=;mat_pep_acc=;ro=;ao=;dp=;sample_size=; \ Reference_seq=;Variant_seq=;nt_name=;aa_name=;hgvs_nt=;hgvs_aa=;hgvs_alias=; \ @@ -506,7 +506,7 @@ def map_pos_to_gene_protein(pos, GENE_PROTEIN_POSITIONS_DICT): protein_symbol = GENE_PROTEIN_POSITIONS_DICT[entry]["protein_alias"] protein_id = GENE_PROTEIN_POSITIONS_DICT[entry]["protein_id"] transcript_id = GENE_PROTEIN_POSITIONS_DICT[entry]["locus_tag"] - + # fill in attributes for mutations in this CDS region cds_mask = df[pos_column].astype(int).between(start, end, inclusive="both") df.loc[cds_mask, "gene"] = gene @@ -542,7 +542,7 @@ def add_alias_names(df, GENE_PROTEIN_POSITIONS_DICT): df.loc[:, 'alias'] = 'n/a' # get list of all NSP, 3CL, and PlPro proteins in the file: - alias_mask = (df['gene'].str.contains("ORF1ab")) & (df['mat_pep']!='n/a') + alias_mask = (df['gene_symbol'].str.contains("ORF1ab")) & (df['mat_pep']!='n/a') nsps_list = sorted(list(set(df[alias_mask]['mat_pep'].tolist()))) if len(nsps_list) > 0: diff --git a/bin/vcf2gvf.py b/bin/vcf2gvf.py index f0c72068..2a65f795 100755 --- a/bin/vcf2gvf.py +++ b/bin/vcf2gvf.py @@ -8,7 +8,7 @@ files. Required user input is a VCF file. The attributes completed by this script are: -['ID', 'Name', 'gene', 'protein_name', 'protein_symbol', 'protein_id', 'ps_filter', 'ps_exc', 'mat_pep', +['ID', 'Name', 'gene_name', 'gene_symbol', 'protein_name', 'protein_symbol', 'protein_id', 'ps_filter', 'ps_exc', 'mat_pep', 'mat_pep_desc','mat_pep_acc', 'ro', 'ao', 'dp', 'sample_size', 'Reference_seq', 'Variant_seq', 'nt_name', 'aa_name', 'hgvs_nt', 'hgvs_aa', 'hgvs_alias', 'vcf_gene', 'mutation_type', 'viral_lineage', 'alternate_frequency', 'transcript_id'] @@ -83,7 +83,7 @@ def vcftogvf(vcf, strain, GENE_PROTEIN_POSITIONS_DICT, sample_size): # add gene and protein attributes from JSON json_df = map_pos_to_gene_protein( vcf_df['POS'].astype(int), GENE_PROTEIN_POSITIONS_DICT) - new_gvf["gene"] = json_df["gene"] + new_gvf["gene_name"] = json_df["gene"] new_gvf["gene_symbol"] = json_df["gene"] new_gvf["protein_name"] = json_df["protein_name"] new_gvf["protein_symbol"] = json_df["protein_symbol"]