change +'/'+ to os.path.join, change system table creation and fix CasFinder as a type of system
FlorianTesson committed Apr 9, 2024
1 parent 0681c86 commit 6576132
Showing 4 changed files with 50 additions and 50 deletions.
26 changes: 14 additions & 12 deletions defense_finder_posttreat/best_solution.py
@@ -3,20 +3,20 @@
 
 from macsypy.serialization import TsvSystemSerializer
 
 
 def get(tmp_dir):
     results = os.listdir(tmp_dir)
     acc = pd.DataFrame()
     for family_dir in results:
         family_path = os.path.join(tmp_dir, family_dir)
 
-        if is_file_empty(family_path+'/best_solution.tsv') == False :
-            print(parse_best_solution(family_path).dtypes)
-            acc = pd.concat([acc , parse_best_solution(family_path)])
-    if acc.empty == True :
+        if is_file_empty(os.path.join(family_path, 'best_solution.tsv')) is False:
+            acc = pd.concat([acc, parse_best_solution(family_path)])
+    if acc.empty is True:
         acc = pd.DataFrame(columns=get_best_solution_keys('\t'))
-    print(acc.dtypes)
     return format_best_solution(acc)
 
 
 def is_file_empty(path):
     prev_line = ''
     with open(path, 'r') as f:
@@ -29,14 +29,15 @@ def is_file_empty(path):
             return True
     return False
 
 
 def parse_best_solution(dir):
     """
     :param dir: the macsyfinder result directory path
     :type dir: str
     """
     delimiter = '\t'
-    data=pd.read_table(dir+'/best_solution.tsv',sep=delimiter,comment='#')
+    data = pd.read_table(os.path.join(dir, 'best_solution.tsv'), sep=delimiter, comment='#')
+
     return data
 
 
@@ -45,8 +46,9 @@ def get_best_solution_keys(delimiter='\t'):
 
 
 def format_best_solution(p):
-    p['type'] = p.model_fqn.map(lambda x : x.split('/')[-2])
-    p['subtype'] = p.model_fqn.map(lambda x : x.split('/')[-1])
-    p=p.sort_values('hit_pos')
+    p['type'] = p.model_fqn.map(lambda x: x.split('/')[-2])
+    p['subtype'] = p.model_fqn.map(lambda x: x.split('/')[-1])
+    p.loc[p['type'] == 'CasFinder', 'type'] = 'Cas'
+    p = p.sort_values('hit_pos')
 
-    return(p)
+    return p
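
For reference, the CasFinder fix above amounts to relabelling the derived type column after type/subtype are split out of model_fqn. A minimal sketch of that logic, using invented model_fqn values and hit positions (the real ones come from macsyfinder's best_solution.tsv):

import pandas as pd

# Invented rows standing in for a parsed best_solution.tsv; only the last two
# path components of model_fqn matter for the type/subtype columns.
p = pd.DataFrame({'model_fqn': ['defense-finder-models/CasFinder/CAS_Class1',
                                'defense-finder-models/RM/RM_Type_II'],
                  'hit_pos': [12, 3]})

p['type'] = p.model_fqn.map(lambda x: x.split('/')[-2])
p['subtype'] = p.model_fqn.map(lambda x: x.split('/')[-1])
p.loc[p['type'] == 'CasFinder', 'type'] = 'Cas'   # CasFinder hits are reported as Cas systems
p = p.sort_values('hit_pos')
print(p[['type', 'subtype']])
# RM / RM_Type_II comes first (hit_pos 3), then Cas / CAS_Class1 (hit_pos 12)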
10 changes: 2 additions & 8 deletions defense_finder_posttreat/df_genes.py
@@ -1,11 +1,5 @@
 import os
 from defense_finder_posttreat import best_solution
 
-def export_defense_finder_genes(defense_finder_genes, outdir, filename):
-    defense_finder_genes.to_csv(outdir+'/'+filename+'_defense_finder_genes.tsv',sep='\t',index=False)
-
-
-def write_defense_finder_genes(defense_finder_genes_list, outdir, filename):
-    filepath = os.path.join(outdir, f'{filename}_defense_finder_genes.tsv')
-    defense_finder_genes_list.to_csv('filepath',sep='\t',index=False)
-
+def export_defense_finder_genes(defense_finder_genes, outdir, filename):
+    defense_finder_genes.to_csv(os.path.join(outdir, filename+'_defense_finder_genes.tsv'), sep='\t', index=False)
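
The recurring change in this commit, building output paths with os.path.join instead of '+'-concatenation, produces the same path here but delegates separator handling to the standard library. A small illustration with made-up outdir and filename values:

import os

outdir, filename = '/tmp/results', 'sample1'   # hypothetical values for illustration

by_concatenation = outdir + '/' + filename + '_defense_finder_genes.tsv'
by_join = os.path.join(outdir, filename + '_defense_finder_genes.tsv')

print(by_concatenation)   # /tmp/results/sample1_defense_finder_genes.tsv
print(by_join)            # same string, without hard-coding the '/' separator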
33 changes: 19 additions & 14 deletions defense_finder_posttreat/df_hmmer.py
@@ -3,37 +3,43 @@
 
 
 def remove_duplicates(hmmer_hits):
-
-    hmmer_hits=hmmer_hits.sort_values('hit_score',ascending=False).drop_duplicates(['hit_id','gene_name'])
+    # Remove duplicates of hit_id if they have the same gene name
+    # For R-M and Retron (multi HMM) only one hit is conserved
+    hmmer_hits = hmmer_hits.sort_values('hit_score', ascending=False).drop_duplicates(['hit_id', 'gene_name'])
 
     return hmmer_hits
 
 
 def export_defense_finder_hmmer_hits(tmp_dir, outdir, filename):
     paths = get_hmmer_paths(tmp_dir)
     hmmer_hits = pd.DataFrame()
 
     for path in paths:
         d = parse_hmmer_results_file(path)
-        if d.empty == False:
-            hmmer_hits = pd.concat([hmmer_hits , remove_duplicates(d)])
-    if hmmer_hits.empty == True:
-        hmmer_hits=pd.DataFrame(columns=get_hmmer_keys())
-    hmmer_hits=remove_duplicates(hmmer_hits)
-    hmmer_hits=hmmer_hits.sort_values('hit_score')
+        if d.empty is False:
+            hmmer_hits = pd.concat([hmmer_hits, remove_duplicates(d)])
+    if hmmer_hits.empty is True:
+        hmmer_hits = pd.DataFrame(columns=get_hmmer_keys())
+
+    hmmer_hits = remove_duplicates(hmmer_hits)
+    hmmer_hits = hmmer_hits.sort_values('hit_score')
 
     write_defense_finder_hmmer(hmmer_hits, outdir, filename)
 
 
 def write_defense_finder_hmmer(hmmer_hits, outdir, filename):
-    hmmer_hits.to_csv(outdir+"/"+filename+"_defense_finder_hmmer.tsv",sep='\t',index=False)
+    hmmer_hits.to_csv(os.path.join(outdir, filename+"_defense_finder_hmmer.tsv"), sep='\t', index=False)
 
 
 def get_hmmer_keys():
-    return ['hit_id', 'replicon', 'hit_pos', 'hit_sequence_length', 'gene_name','i_eval','hit_score','hit_profile_cov','hit_seq_cov','hit_begin_match','hit_end_match']
+    return ['hit_id', 'replicon', 'hit_pos', 'hit_sequence_length', 'gene_name', 'i_eval', 'hit_score', 'hit_profile_cov', 'hit_seq_cov', 'hit_begin_match', 'hit_end_match']
 
 
 def parse_hmmer_results_file(path):
-    data = pd.read_table(path, sep='\t',comment='#',names=get_hmmer_keys())
+    data = pd.read_table(path, sep='\t', comment='#', names=get_hmmer_keys())
     return data
 
 
 def get_hmmer_paths(results_dir):
     family_dirs = os.listdir(results_dir)
     files = []
@@ -44,4 +50,3 @@ def get_hmmer_paths(results_dir):
             if entry.name.endswith('extract') and entry.is_file():
                 files.append(entry)
     return list(map(lambda i: i.path, files))
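
The new comments in remove_duplicates describe the intent; as a quick illustration of the sort_values + drop_duplicates pattern, here is the same call on toy data (hit ids, gene names and scores below are invented):

import pandas as pd

# Two HMM hits on the same protein for the same gene name: only the
# highest-scoring one is kept; the unrelated hit is untouched.
hits = pd.DataFrame({
    'hit_id':    ['prot_1', 'prot_1', 'prot_2'],
    'gene_name': ['RM_Type_I__REase', 'RM_Type_I__REase', 'Retron__RT'],
    'hit_score': [12.5, 40.1, 22.0],
})

deduplicated = hits.sort_values('hit_score', ascending=False).drop_duplicates(['hit_id', 'gene_name'])
print(deduplicated)   # prot_1 appears once, with hit_score 40.1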

31 changes: 15 additions & 16 deletions defense_finder_posttreat/df_systems.py
@@ -1,27 +1,26 @@
 import os
 import pandas as pd
 
 
 def export_defense_finder_systems(defense_finder_genes, outdir, filename):
     systems = build_defense_finder_systems(defense_finder_genes)
-    systems.to_csv(outdir+'/'+filename+'_defense_finder_systems.tsv',sep='\t',index=False)
+    systems.to_csv(os.path.join(outdir, filename + '_defense_finder_systems.tsv'), sep='\t', index=False)
 
 
 def build_defense_finder_systems(defense_finder_genes):
-    sys=defense_finder_genes.drop_duplicates('sys_id')[['sys_id' , 'type' , 'subtype']]
-
-    sys_beg=defense_finder_genes.sort_values('hit_pos').drop_duplicates('sys_id').rename({'hit_id' : 'sys_beg'},axis=1)[['sys_id','sys_beg']]
-    sys_end=defense_finder_genes.sort_values('hit_pos' , ascending=False).drop_duplicates('sys_id').rename({'hit_id' : 'sys_end'},axis=1)[['sys_id','sys_end']]
-    protein_in_syst=defense_finder_genes.groupby('sys_id').hit_id.apply(lambda x: ",".join(x.sort_values())).reset_index().rename({'hit_id':'protein_in_syst'},axis=1)
-    name_of_profiles_in_sys=defense_finder_genes.groupby('sys_id').gene_name.apply(lambda x : ",".join(x.sort_values())).reset_index().rename({'hit_id' : 'protein_in_syst'},axis = 1)
-    genes_count=defense_finder_genes.sys_id.value_counts().reset_index()
-    genes_count.columns=['sys_id','genes_count']
+    if defense_finder_genes.empty is False:
+        out = defense_finder_genes.groupby(['sys_id', 'type', 'subtype'])[['hit_id', 'hit_pos']].apply(lambda x: ",".join(x.sort_values('hit_pos').hit_id.to_list())).reset_index()
+        out.columns = ['sys_id', 'type', 'subtype', 'protein_in_syst']
+        out['sys_beg'] = out.protein_in_syst.map(lambda x: x.split()[0])
+        out['sys_end'] = out.protein_in_syst.map(lambda x: x.split()[-1])
+
+        genes_count = defense_finder_genes.sys_id.value_counts().reset_index()
+        genes_count.columns = ['sys_id', 'genes_count']
+        out = out.merge(genes_count, on='sys_id')
 
-    out=sys.merge(sys_beg,on = 'sys_id')
-    out=out.merge(sys_end,on = 'sys_id')
-    out=out.merge(protein_in_syst,on = 'sys_id')
-    out=out.merge(genes_count,on = 'sys_id')
-    out=out.merge(name_of_profiles_in_sys,on = 'sys_id')
+        name_of_profiles_in_sys = defense_finder_genes.groupby('sys_id').gene_name.apply(lambda x: ",".join(x.sort_values())).reset_index().rename({'gene_name': 'name_of_profiles_in_sys'}, axis=1)
+        out = out.merge(name_of_profiles_in_sys, on='sys_id')
+    else:
+        out = pd.DataFrame(columns=['sys_id', 'type', 'subtype', 'sys_beg', 'sys_end', 'protein_in_syst', 'genes_count', 'name_of_profiles_in_sys'])
 
-    return(out)
-
+    return out[['sys_id', 'type', 'subtype', 'sys_beg', 'sys_end', 'protein_in_syst', 'genes_count', 'name_of_profiles_in_sys']]
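
The rewritten build_defense_finder_systems collapses the per-system aggregation into one groupby over (sys_id, type, subtype). A minimal sketch of that step on an invented two-hit system (all column values are hypothetical):

import pandas as pd

genes = pd.DataFrame({
    'sys_id':    ['sample_RM_1', 'sample_RM_1'],
    'type':      ['RM', 'RM'],
    'subtype':   ['RM_Type_I', 'RM_Type_I'],
    'hit_id':    ['prot_7', 'prot_5'],
    'hit_pos':   [7, 5],
    'gene_name': ['Type_I_REases', 'Type_I_MTases'],
})

# One row per system, with the member proteins joined in order of hit_pos,
# mirroring the aggregation in the new code.
out = genes.groupby(['sys_id', 'type', 'subtype'])[['hit_id', 'hit_pos']].apply(
    lambda x: ",".join(x.sort_values('hit_pos').hit_id.to_list())).reset_index()
out.columns = ['sys_id', 'type', 'subtype', 'protein_in_syst']
print(out.protein_in_syst.iloc[0])   # prot_5,prot_7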
