Skip to content

Commit

Permalink
update to match new functional annotation format
Browse files Browse the repository at this point in the history
  • Loading branch information
miseminger committed Jul 30, 2024
1 parent baf3cb0 commit ec976f8
Show file tree
Hide file tree
Showing 2 changed files with 9 additions and 4 deletions.
7 changes: 5 additions & 2 deletions bin/addfunctions2gvf.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,14 +50,18 @@ def add_pokay_annotations(gvf, annotation_file):

# load functional annotations spreadsheet
df = pd.read_csv(annotation_file, sep='\t', header=0)
df['author'] = df['author'].fillna('UNKNOWN')
# fill missing values with '' and strip leading/trailing whitespace in every column (all values become strings)
for column in df.columns:
df[column] = df[column].fillna('')
df[column] = df[column].astype(str).str.strip()

# merge annotated vcf and functional annotation files by 'Name' and 'protein_symbol'
df = df.rename(columns={"original mutation description": "Name", "amino acid mutation alias":"Pokay_alias", 'variant functional effect':"function_category", \
df = df.rename(columns={"original mutation description": "Name", 'variant functional effect':"function_category", \
'variant functional effect description':"function_description", 'URL':"source", 'protein symbol':'protein_symbol'})
df['citation'] = df['author'] + ' et al. (' + df['publication year'].str.replace(".0", "", regex=False) + ')'
df_columns = functional_attributes + ["Name", "protein_symbol"]
df = df[df_columns]

merged_df = pd.merge(gvf, df, on=['Name', 'protein_symbol'], how='left') #, 'alias'

Expand Down Expand Up @@ -117,7 +121,6 @@ def add_pokay_annotations(gvf, annotation_file):

# replace NaNs in df with empty string
merged_df = merged_df.fillna('')

# merge attributes back into a single column
merged_df = rejoin_attributes(merged_df, empty_attributes)

Expand Down
6 changes: 4 additions & 2 deletions bin/splitmutationnames_gvf.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,8 +51,10 @@ def parse_args():
# expand #attributes into columns to edit separately
gvf = separate_attributes(gvf)

# split names in "Names" attribute into separate rows
gvf = split_names(args.names_to_split, gvf, col_to_split='Name')
# if names_to_split tsv is given, use it to split up the multi-amino acid names
if args.names_to_split != None:
# split names in the 'Name' attribute into separate rows
gvf = split_names(args.names_to_split, gvf, col_to_split='Name')

# rename IDs: rows with the same entry in 'Name'
# get the same ID
Expand Down

0 comments on commit ec976f8

Please sign in to comment.