Skip to content

Commit

Permalink
adding new strains to include list and updating clades. fixed an erro…
Browse files Browse the repository at this point in the history
…r in clade assignments to assign only new strains that are in the metadata
  • Loading branch information
lmoncla committed Feb 7, 2025
1 parent ab637d1 commit 94d2a92
Show file tree
Hide file tree
Showing 7 changed files with 2,131 additions and 21 deletions.
1,014 changes: 1,014 additions & 0 deletions clade-labeling/h5n1-clades.tsv

Large diffs are not rendered by default.

1,076 changes: 1,076 additions & 0 deletions clade-labeling/h5nx-clades.tsv

Large diffs are not rendered by default.

46 changes: 27 additions & 19 deletions clade-labeling/update-clades.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,34 +27,42 @@
sequences = args.sequences
subtype = args.subtype

def find_new_strains(clades_file, new_strains):
old_strains = pd.read_csv(clades_file, sep="\t")['name']
new_strains = pd.read_csv(new_strains, sep="\t")['strain']

def find_new_strains(clades_file, metadata):
    """Return the metadata strains that do not yet appear in the clades file.

    Only strains present in the metadata are ever returned. Strains that are
    listed in the clades file but absent from the metadata are ignored (a
    symmetric difference of the two lists would wrongly report them as new).

    Args:
        clades_file: Path to a tab-separated file with a ``name`` column
            holding the strains that already have an entry.
        metadata: Path to a tab-separated file with a ``strain`` column.

    Returns:
        A list of strain names, in metadata order, that are missing from
        ``clades_file``. Duplicate metadata rows are kept as duplicates.
    """
    # A set gives O(1) membership tests; the clades table has thousands of rows.
    old_strains = set(pd.read_csv(clades_file, sep="\t")["name"].tolist())
    all_strains = pd.read_csv(metadata, sep="\t")["strain"].tolist()

    new_strains = [strain for strain in all_strains if strain not in old_strains]

    # Report how many strains were found to be new.
    print(len(new_strains))
    return new_strains



def separate_new_strains(new_strains_list, input_fasta, output_fasta):
    """Write the FASTA records whose strain is in ``new_strains_list``.

    The strain name is taken as the text before the first ``|`` in the record
    description, with surrounding whitespace stripped. Matching records are
    written with their full original header and the sequence on one line.

    Args:
        new_strains_list: Iterable of strain names to keep.
        input_fasta: Path to the FASTA file to filter.
        output_fasta: Path of the FASTA file to write. It is always created
            (or truncated), even when no record matches.
    """
    # Build the lookup once so each record costs a single hash lookup.
    wanted = set(new_strains_list)

    # Open the output once instead of reopening it in append mode per record.
    with open(output_fasta, "w") as outfile:
        for seq in SeqIO.parse(input_fasta, "fasta"):
            strain = seq.description.split("|")[0].strip()
            if strain in wanted:
                outfile.write(">" + seq.description + "\n" + str(seq.seq) + "\n")

Expand Down Expand Up @@ -82,8 +90,8 @@ def append_new_clades(new_clades, old_clades):
separate_new_strains(new_strains, sequences, new_strains_fasta)


"""run label and reformat the output"""
print("\nrunning LABEL to assign clades to", len(new_strains), "new strains of", subtype)
# """run label and reformat the output"""
# print("\nrunning LABEL to assign clades to", len(new_strains), "new strains of", subtype)
# Run the LABEL executable on the new-strain FASTA. The command passes the
# FASTA path, the label_output value, and the literal "H5v2023" as arguments.
# NOTE(review): the paths are interpolated into a shell string unquoted, so a
# path containing spaces or shell metacharacters would break the command.
os.system('flu-amd-label-2023-05-05/./LABEL -D {new_strains_fasta} {label_output} H5v2023'.format(new_strains_fasta=new_strains_fasta, label_output=label_output))
# Run check-LABEL-annotations.py on "<label_output>_final.txt" and write its
# result to new_clades_file.
# NOTE(review): presumably the LABEL run above produces that _final.txt file --
# confirm. The os.system return codes are not checked, so a failure in either
# command goes unnoticed here.
os.system('python clade-labeling/check-LABEL-annotations.py --label_output {label_output}_final.txt --output {new_clades_file}'.format(new_clades_file=new_clades_file, label_output=label_output))

Expand Down
1 change: 1 addition & 0 deletions config/h5n1/include_strains_h5n1_2y.txt
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ A/Chicken/BC/FAV-0266-1/2024
A/Turkey/BC/FAV-0267-1/2024
A/Wisconsin/179/2024
A/Iowa/124/2024
A/England/480160/2025

# cat outbreak
A/tiger/USA/038158001/2024
Expand Down
4 changes: 3 additions & 1 deletion config/h5n1/include_strains_h5n1_all-time.txt
Original file line number Diff line number Diff line change
Expand Up @@ -58,4 +58,6 @@ A/British_Columbia/PHL-2032/2024
A/Chicken/BC/FAV-0285-1/2024
A/Chicken/BC/FAV-0268-1/2024
A/Chicken/BC/FAV-0266-1/2024
A/Turkey/BC/FAV-0267-1/2024
A/Turkey/BC/FAV-0267-1/2024
A/England/480160/2025

9 changes: 8 additions & 1 deletion config/h5nx/include_strains_h5nx_2y.txt
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ A/Chicken/BC/FAV-0268-1/2024
A/Chicken/BC/FAV-0266-1/2024
A/Louisiana/12/2024
A/Turkey/BC/FAV-0267-1/2024
A/England/480160/2025



# US dairy cattle outbreak
Expand Down Expand Up @@ -383,4 +385,9 @@ A/cat/USA/037985004/2024
A/cat/USA/037985008/2024
A/cat/USA/037986004/2024
A/cat/USA/037986006/2024
A/cat/California/038279001/2024
A/cat/California/038279001/2024

# important low paths
A/Ruddy Turnstone/Delaware/549/2024
A/Ruddy Turnstone/Delaware/550/2024
A/Laughing Gull/Delaware/554/2024
2 changes: 2 additions & 0 deletions config/h5nx/include_strains_h5nx_all-time.txt
Original file line number Diff line number Diff line change
Expand Up @@ -49,4 +49,6 @@ A/Chicken/BC/FAV-0285-1/2024
A/Chicken/BC/FAV-0268-1/2024
A/Chicken/BC/FAV-0266-1/2024
A/Turkey/BC/FAV-0267-1/2024
A/England/480160/2025


0 comments on commit 94d2a92

Please sign in to comment.