From ec976f8413c1ac56da11d44dd5bc258fed439729 Mon Sep 17 00:00:00 2001 From: miseminger Date: Tue, 30 Jul 2024 11:55:32 -0700 Subject: [PATCH] update to match new functional annotation format --- bin/addfunctions2gvf.py | 7 +++++-- bin/splitmutationnames_gvf.py | 6 ++++-- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/bin/addfunctions2gvf.py b/bin/addfunctions2gvf.py index bed503a2..7cf4d2a9 100755 --- a/bin/addfunctions2gvf.py +++ b/bin/addfunctions2gvf.py @@ -50,14 +50,18 @@ def add_pokay_annotations(gvf, annotation_file): # load functional annotations spreadsheet df = pd.read_csv(annotation_file, sep='\t', header=0) + df['author'] = df['author'].fillna('UNKNOWN') # remove any leading/trailing spaces for column in df.columns: + df[column] = df[column].fillna('') df[column] = df[column].astype(str).str.strip() # merge annotated vcf and functional annotation files by 'Name' and 'protein_symbol' - df = df.rename(columns={"original mutation description": "Name", "amino acid mutation alias":"Pokay_alias", 'variant functional effect':"function_category", \ + df = df.rename(columns={"original mutation description": "Name", 'variant functional effect':"function_category", \ 'variant functional effect description':"function_description", 'URL':"source", 'protein symbol':'protein_symbol'}) df['citation'] = df['author'] + ' et al. (' + df['publication year'].str.replace(".0", "", regex=False) + ')' + df_columns = functional_attributes + ["Name", "protein_symbol"] + df = df[df_columns] merged_df = pd.merge(gvf, df, on=['Name', 'protein_symbol'], how='left') #, 'alias' @@ -117,7 +121,6 @@ def add_pokay_annotations(gvf, annotation_file): # replace NaNs in df with empty string merged_df = merged_df.fillna('') - # merge attributes back into a single column merged_df = rejoin_attributes(merged_df, empty_attributes) diff --git a/bin/splitmutationnames_gvf.py b/bin/splitmutationnames_gvf.py index 4a9fb078..7f14f0fd 100755 --- a/bin/splitmutationnames_gvf.py +++ b/bin/splitmutationnames_gvf.py @@ -51,8 +51,10 @@ def parse_args(): # expand #attributes into columns to edit separately gvf = separate_attributes(gvf) - # split names in "Names" attribute into separate rows - gvf = split_names(args.names_to_split, gvf, col_to_split='Name') + # if names_to_split tsv is given, use it to split up the multi-amino acid names + if args.names_to_split != None: + # split names in "Names" attribute into separate rows + gvf = split_names(args.names_to_split, gvf, col_to_split='Name') # rename IDs: rows with the same entry in 'Name' # get the same ID