From bfd06d3550b29aa8eb03db5ec15866ab72a6fb7f Mon Sep 17 00:00:00 2001 From: miseminger Date: Tue, 30 Jul 2024 12:20:59 -0700 Subject: [PATCH] rename 'alias_protein' to 'mat_pep' --- bin/gvf2indexandlog.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/bin/gvf2indexandlog.py b/bin/gvf2indexandlog.py index fd2e6781..9b33d7d7 100755 --- a/bin/gvf2indexandlog.py +++ b/bin/gvf2indexandlog.py @@ -49,7 +49,7 @@ def parse_args(): # create index from GVF # make empty index df - index_cols=['pos', 'mutation', 'hgvs_aa_mutation', 'hgvs_nt_mutation', 'gene_name', 'gene_symbol', 'protein_name', 'protein_symbol', 'alias', 'hgvs_alias', 'alias_protein', 'Pokay_annotation', 'lineages'] + index_cols=['pos', 'mutation', 'hgvs_aa_mutation', 'hgvs_nt_mutation', 'gene_name', 'gene_symbol', 'protein_name', 'protein_symbol', 'alias', 'hgvs_alias', 'mat_pep', 'Pokay_annotation', 'lineages'] index = pd.DataFrame(np.empty((gvf.shape[0], len(index_cols))), columns=index_cols) # populate index df with gvf info index['pos'] = gvf['#start'] @@ -58,8 +58,7 @@ def parse_args(): index['hgvs_nt_mutation'] = gvf['hgvs_nt'] index['alias'] = gvf['alias'] index['hgvs_alias'] = gvf['hgvs_alias'] - index['alias_protein'] = 'n/a' - index.loc[index['alias']!='n/a', 'alias_protein'] = gvf['mat_pep'] + index['mat_pep'] = gvf['mat_pep'] index['gene_name'] = gvf['gene_name'] index['gene_symbol'] = gvf['gene_symbol'] index['protein_name'] = gvf['protein_name'] @@ -77,7 +76,7 @@ def parse_args(): # fill in 'new_mutations' column like: "gene:mutation" log['new_mutations'] = log["gene_symbol"] + ":" + log["mutation"] # for orf1ab mutations, fill in 'new_mutations' column like: "gene:mutation / nsp:alias" - log.loc[log['alias']!='n/a', 'new_mutations'] = log['new_mutations'] + " / " + log["alias_protein"] + ":" + log["alias"] + log.loc[log['alias']!='n/a', 'new_mutations'] = log['new_mutations'] + " / " + log["mat_pep"] + ":" + log["alias"] # drop duplicates (there shouldn't be any) log = log[['pos', 'new_mutations', 'lineages']].drop_duplicates() # drop any NaN rows