update to match new functional annotation format

cidgoh · Jul 30, 2024 · ec976f8 · ec976f8
1 parent baf3cb0
commit ec976f8
Show file tree

Hide file tree

Showing 2 changed files with 9 additions and 4 deletions.
diff --git a/bin/addfunctions2gvf.py b/bin/addfunctions2gvf.py
@@ -50,14 +50,18 @@ def add_pokay_annotations(gvf, annotation_file):
 
     # load functional annotations spreadsheet
     df = pd.read_csv(annotation_file, sep='\t', header=0)
+    df['author'] = df['author'].fillna('UNKNOWN')
     # remove any leading/trailing spaces
     for column in df.columns:
+        df[column] = df[column].fillna('')
         df[column] = df[column].astype(str).str.strip()
 
     # merge annotated vcf and functional annotation files by 'Name' and 'protein_symbol'
-    df = df.rename(columns={"original mutation description": "Name", "amino acid mutation alias":"Pokay_alias", 'variant functional effect':"function_category", \
+    df = df.rename(columns={"original mutation description": "Name", 'variant functional effect':"function_category", \
                             'variant functional effect description':"function_description", 'URL':"source", 'protein symbol':'protein_symbol'})
     df['citation'] = df['author'] + ' et al. (' + df['publication year'].str.replace(".0", "", regex=False) + ')'
+    df_columns = functional_attributes + ["Name", "protein_symbol"]
+    df = df[df_columns]
 
     merged_df = pd.merge(gvf, df, on=['Name', 'protein_symbol'], how='left') #, 'alias'
 
@@ -117,7 +121,6 @@ def add_pokay_annotations(gvf, annotation_file):
 
     # replace NaNs in df with empty string
     merged_df = merged_df.fillna('')
-
     # merge attributes back into a single column
     merged_df = rejoin_attributes(merged_df, empty_attributes)
 

diff --git a/bin/splitmutationnames_gvf.py b/bin/splitmutationnames_gvf.py
@@ -51,8 +51,10 @@ def parse_args():
     # expand #attributes into columns to edit separately
     gvf = separate_attributes(gvf)
 
-    # split names in "Names" attribute into separate rows
-    gvf = split_names(args.names_to_split, gvf, col_to_split='Name')
+    # if names_to_split tsv is given, use it to split up the multi-amino acid names
+    if args.names_to_split != None:
+        # split names in the "Name" attribute into separate rows
+        gvf = split_names(args.names_to_split, gvf, col_to_split='Name')
 
     # rename IDs: rows with the same entry in 'Name'
     # get the same ID