-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathalign_translate_phylo.py
161 lines (122 loc) · 5.66 KB
/
align_translate_phylo.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
from collections import Counter
import subprocess
import os
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
from collections import Counter
def write_consensus_to_fasta(consensus, output_fasta, output_protein_fasta):
    """Write each consensus sequence and its translation to FASTA files.

    Args:
        consensus: Mapping of sequence name to nucleotide string.
        output_fasta: Destination for the nucleotide records; each record id
            is ``"<name>_nucleotide"``.
        output_protein_fasta: Destination for the translated records; each
            record id is ``"<name>_protein"``.
    """
    nucleotide_records = []
    protein_records = []
    for name, seq in consensus.items():
        nucleotide = Seq(seq)
        # An explicit empty description keeps SeqRecord's default
        # "<unknown description>" placeholder out of the FASTA header.
        nucleotide_records.append(
            SeqRecord(nucleotide, id=f"{name}_nucleotide", description="")
        )
        # Translate whole codons only: trailing bases that do not form a
        # complete codon are dropped explicitly instead of relying on
        # translate() to warn about (and ignore) a partial codon.
        whole_codon_length = len(nucleotide) - len(nucleotide) % 3
        protein_seq = nucleotide[:whole_codon_length].translate()
        protein_records.append(
            SeqRecord(protein_seq, id=f"{name}_protein", description="")
        )
    # Write nucleotide sequences, then protein sequences, to their files.
    SeqIO.write(nucleotide_records, output_fasta, "fasta")
    SeqIO.write(protein_records, output_protein_fasta, "fasta")
def create_consensus(aligned_sequences, threshold=0.0):
    """Build a per-name majority consensus from lists of sequences.

    For every name, position ``i`` of the consensus is the most frequent
    character found at index ``i`` of that name's sequences, provided its
    fraction is at least ``threshold``; otherwise ``'N'`` is emitted. Ties
    go to the character seen first. The consensus is as long as the
    shortest non-empty sequence of the name.

    NOTE(review): positions are plain string indices into each sequence, not
    reference coordinates -- confirm this is intended for the caller's data.

    Args:
        aligned_sequences: Mapping of name to a list of sequence strings.
        threshold: Minimum fraction (0.0-1.0) the winning character needs.

    Returns:
        Dict mapping name to consensus string. Names that have no non-empty
        sequence are skipped with a printed warning.
    """
    consensus = {}
    for name, seqs in aligned_sequences.items():
        # default=0 covers names whose sequences are all empty (or missing);
        # a bare min() would raise ValueError there and the skip below would
        # never be reached.
        min_length = min((len(seq) for seq in seqs if len(seq) > 0), default=0)
        if min_length == 0:
            print(f"Warning: Skipped {name} due to zero-length sequence")
            continue

        consensus_seq = []
        for i in range(min_length):
            base_count = Counter(seq[i] for seq in seqs if len(seq) > i)
            total_count = sum(base_count.values())
            # max() returns the first maximal entry, so ties resolve to the
            # base encountered first (Counter keeps insertion order).
            top_base, top_count = max(base_count.items(), key=lambda item: item[1])
            # If the most frequent base misses the threshold, none meets it.
            if top_count / total_count >= threshold:
                consensus_seq.append(top_base)
            else:
                consensus_seq.append('N')
        consensus[name] = "".join(consensus_seq)
    return consensus
def create_combined_fasta(reference_fasta, consensus_fasta, output_fasta):
    """Create a combined FASTA holding the reference and consensus records.

    Both inputs are parsed as FASTA (consensus file first, then reference
    file) and written to ``output_fasta`` with the reference records placed
    ahead of the consensus records.
    """
    from_consensus = list(SeqIO.parse(consensus_fasta, "fasta"))
    from_reference = list(SeqIO.parse(reference_fasta, "fasta"))
    SeqIO.write([*from_reference, *from_consensus], output_fasta, "fasta")
def parse_sam(sam_file):
    """Collect read sequences from a SAM file, grouped by reference name.

    Header lines (starting with ``@``), blank or truncated lines, records
    without sequence data (SEQ ``*``) and unmapped records (RNAME ``*`` or
    FLAG bit 0x4) are skipped.

    Args:
        sam_file: Path to a tab-separated SAM text file.

    Returns:
        Dict mapping each RNAME to the list of SEQ strings of its records,
        in file order.
    """
    aligned_sequences = {}
    with open(sam_file, "r") as f:
        for line in f:
            if line.startswith("@"):
                continue
            fields = line.strip().split("\t")
            # A SAM alignment record has 11 mandatory fields; anything
            # shorter (e.g. a blank line) cannot be indexed safely.
            if len(fields) < 11:
                continue
            flag, rname, seq = fields[1], fields[2], fields[9]
            if seq == '*':  # Skip lines without sequence data
                continue
            # Unmapped reads belong to no reference; keeping them would
            # create a bogus "*" group.
            if rname == '*' or (flag.isdigit() and int(flag) & 0x4):
                continue
            aligned_sequences.setdefault(rname, []).append(seq)
    return aligned_sequences
def create_fasta(aligned_sequences, output_fasta):
    """Write one FASTA record per name, concatenating that name's sequences.

    Args:
        aligned_sequences: Mapping of name to a list of sequence strings.
        output_fasta: Destination FASTA path or handle.
    """
    # All sequences of a name are joined end to end into a single record.
    # The empty description keeps SeqRecord's "<unknown description>"
    # placeholder out of the FASTA header.
    records = [
        SeqRecord(Seq("".join(seqs)), id=name, description="")
        for name, seqs in aligned_sequences.items()
    ]
    SeqIO.write(records, output_fasta, "fasta")
def run_mafft(input_fasta, output_fasta):
    """Run MAFFT on ``input_fasta`` and store its stdout in ``output_fasta``.

    Args:
        input_fasta: Path of the FASTA file passed to the ``mafft`` command.
        output_fasta: Path that receives MAFFT's standard output.

    Raises:
        SystemExit: With status 1 when ``mafft`` returns a non-zero code.
    """
    print("Running MAFFT alignment...")
    # The context manager flushes and closes the output file once MAFFT has
    # finished instead of leaving the handle open.
    with open(output_fasta, "w") as aligned_out:
        result = subprocess.run(["mafft", input_fasta], stdout=aligned_out)
    if result.returncode != 0:
        print("Error running MAFFT.")
        # Same effect as exit(1) without depending on the site module.
        raise SystemExit(1)
#def create_tree(aligned_fasta, output_tree):
# root_name = "Root sequence Name"
# print("Creating phylogenetic tree...")
# subprocess.run(["FastTree", "-outgroup", root_name, aligned_fasta], stdout=open(output_tree, "w"))
def create_tree_with_iqtree(aligned_fasta, output_tree, outgroup):
    """Run ``iqtree2`` on an alignment with the given outgroup.

    Args:
        aligned_fasta: Alignment file passed to ``-s``.
        output_tree: Output prefix passed to ``-pre``.
        outgroup: Taxon name passed to ``-o``.

    Raises:
        SystemExit: With status 1 when ``iqtree2`` returns a non-zero code.
    """
    print("Creating phylogenetic tree with IQ-TREE...")
    # Use the output_tree parameter; the previous code read a module-level
    # variable and ignored the argument.
    result = subprocess.run(
        [
            "iqtree2",
            "-s", aligned_fasta,
            "-o", outgroup,
            "-nt", "AUTO",
            "-pre", output_tree,
        ]
    )
    if result.returncode != 0:
        print("Error running IQ-TREE.")
        raise SystemExit(1)
# Pipeline: minimap2 alignment -> SAM parsing -> consensus -> MAFFT -> IQ-TREE.
# Input and output file names are fixed and relative to the working directory.

# Check if files exist
if not (os.path.exists("reference.fasta") and os.path.exists("input.fastq.gz")):
    print("Reference or input files not found.")
    raise SystemExit(1)

# Align sequences with minimap2
print("Running Minimap2 alignment...")
minimap2_result = subprocess.run(
    ["minimap2", "-ax", "map-ont", "-o", "output.sam", "reference.fasta", "input.fastq.gz"]
)
# Stop here on failure rather than parsing a missing or partial SAM file.
if minimap2_result.returncode != 0:
    print("Error running Minimap2.")
    raise SystemExit(1)

# Parse SAM file
aligned_sequences = parse_sam("output.sam")

# Check if we have aligned sequences
if not aligned_sequences:
    print("No aligned sequences found.")
    raise SystemExit(1)

# Create consensus sequences
consensus = create_consensus(aligned_sequences)

# Write the consensus sequences and their translations to fasta files.
# The nucleotide path is named once so the combine step below reads the
# file that was just written.
consensus_fasta = "consensus_nucleotide.fasta"
write_consensus_to_fasta(consensus, consensus_fasta, "consensus_protein.fasta")

# Create initial FASTA with reference and consensus sequences
initial_fasta = "initial_combined.fasta"
create_combined_fasta("reference.fasta", consensus_fasta, initial_fasta)

# Run MAFFT for alignment
aligned_fasta = "aligned.fasta"
run_mafft(initial_fasta, aligned_fasta)

# Create tree
#output_tree = "tree.newick"
#create_tree(aligned_fasta, output_tree)

# Create tree with IQ-TREE and specify outgroup
# NOTE(review): "Outgroup name" looks like a placeholder -- presumably it must
# be replaced with the id of a sequence present in aligned.fasta.
outgroup = "Outgroup name"
output_treeIQ = "tree2"
create_tree_with_iqtree(aligned_fasta, output_treeIQ, outgroup)

print("All steps completed successfully!")