-
Notifications
You must be signed in to change notification settings - Fork 3
/
msa_hmm.py
200 lines (156 loc) · 7.38 KB
/
msa_hmm.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
import logging
import tempfile
import argparse
import sqlite3
import os
import util
from Bio import SeqIO
from Bio.AlignIO import read as read_alignment
from subprocess import run
def correct_revcom(hmmfile, inseqs):
"""
Read a list of SeqIO records using hmmer with and without reverse complementing the sequence,
check which one to keep(highest "*" count returned from with align_score()),
and return all in the right orientation
:param hmmfile: HMM model file
:param inseqs: list of sequences
:return:
"""
# Read each sequence record from fasta file
sequences = []
log_reverse = 0
for record in inseqs:
# Run align_score with original sequence
count_1, alignment_original = align_score(record, hmmfile)
logger.debug(f'Forward alignment score {count_1} for {record.id}')
# Reverse complement the sequence and align
record.seq = record.seq.reverse_complement()
count_2, alignment_reverse = align_score(record, hmmfile)
logger.debug(f'Reverse complemented alignment score {count_2} for {record.id}')
# Check if the first count is higher
if count_1 >= count_2:
# Reverse the reverse complement
record.seq = record.seq.reverse_complement()
sequences.append(record)
else:
# Add alignment to list
sequences.append(record)
# Log counter
log_reverse += 1
logger.info(f'Corrected {log_reverse} reverse complemented sequences out of {len(sequences)}')
return sequences
def align_write(sequences, outfile, conn):
"""
Aligns the reverse complement-corrected sequences. Remaps barcode_id to process_id using a database
lookup by way of the provided conn. Writes output to outfile.
:param sequences:
:param outfile:
:param conn:
:return:
"""
# Save the sequence to the temporary file
with open(f'{outfile}.tmp1', mode='w+') as temp_fasta:
for seq in sequences:
temp_fasta.write(f'>{seq.id}\n')
temp_fasta.write(f'{seq.seq}\n')
# Align with hmmalign, capture and parse output as phylip
run(['hmmalign', '--trim', '-o', f'{outfile}.tmp2', '--outformat', 'phylip', hmmfile, f'{outfile}.tmp1'])
aligned = read_alignment(f'{outfile}.tmp2', 'phylip')
os.remove(f'{outfile}.tmp1')
os.remove(f'{outfile}.tmp2')
# Map barcode_id to process_id and write to outfile
with open(f'{outfile}', mode='a') as output:
for seq in aligned:
# Need to lookup the process IDs in the database
bid = seq.id
process_id = conn.execute(f'SELECT processid FROM barcode WHERE barcode_id={bid}').fetchone()
output.write(f'>{process_id[0]}\n')
output.write(f'{seq.seq}\n')
logger.info(f'Wrote/appended aligned sequences to {outfile}')
def align_score(record, hmmfile):
"""
Uses a Hidden Markov Model to align a sequence using hmmalign. This gives an stockholm file as output.
Returns the number of "*" in the stockholm file. Reads the output file and saves its contents in the
variable 'alignment'.
:param record: a sequence from the fasta input
:param hmmfile: HMM model file
:return: int number of "*" in a stockholm file, content of the hmmalignment output file
"""
# Open two temporary files to store a fasta sequence and a stockholm sequence
with tempfile.NamedTemporaryFile(mode='w+') as temp_fasta, tempfile.NamedTemporaryFile(mode='w+') as temp_stockholm:
# Save the sequence to the temporary file
SeqIO.write(record, temp_fasta.name, 'fasta')
# Run hmm align, read the aligned sequence
run(['hmmalign', '--trim', '-o', temp_stockholm.name, hmmfile, temp_fasta.name])
alignment = read_alignment(temp_stockholm.name, "stockholm")
seq = alignment[0]
# Return probability colum of the stockholm file
quality_string = alignment.column_annotations['posterior_probability']
count = 0
# Count . and * characters
dot_count = quality_string.count('.')
star_count = quality_string.count('*')
# Give value 0 to . and value 10 to *
count += dot_count * 0
count += star_count * 10
# Add all numbers
digit_sum = sum(int(char) for char in quality_string if char.isdigit())
# Add numbers to the values calculated from . and *
count += digit_sum
# Calculate average count to account for gaps (0's)
average_count = count / len(quality_string)
return average_count, seq
def merge_files(ingroup, outgroup, conn):
"""
Merges the provided input files and maps the process IDs to barcode IDs, which are shorter.
This is useful because the alignment steps truncate IDs. At a later stage we map them back
to process IDs. In both cases this is done via the database, hence the connection object needs
to be provided here. In the case of the ingroup file, the process ID is assumed to be the 3rd
item in a pipe-separated list on the definition line. In the outgroup file, it is the entire
first word of the definition line, i.e. the record ID according to SeqIO.
:param ingroup: FASTA file
:param outgroup: FASTA file
:param conn: Database connection
"""
merged = []
# ingroup has defline where process ID is 3rd element in pipe-separated list
for record in SeqIO.parse(ingroup, 'fasta'):
procid = record.id.split('|')[2]
record.id = procid
merged.append(record)
# outgroup already has this, just needs appending
for record in SeqIO.parse(outgroup, 'fasta'):
merged.append(record)
# map all to barcode ID
for record in merged:
procid = record.id
query = f'select barcode_id from barcode where processid="{procid}";'
record.id = str(conn.execute(query).fetchone()[0])
# return result
return merged
if __name__ == '__main__':
# Define command line arguments
parser = argparse.ArgumentParser(description='Required command line arguments.')
parser.add_argument('-m', '--model', required=True, help='Location of HMM file, should match the marker')
parser.add_argument('-v', '--verbosity', required=True, help='Log level (e.g. DEBUG)')
parser.add_argument('-i', '--ingroup', required=True, help='Input unaligned ingroup FASTA file')
parser.add_argument('-g', '--outgroup', required=True, help='Unaligned outgroup FASTA file')
parser.add_argument('-o', '--output', required=True, help='Output aligned FASTA file')
parser.add_argument('-d', '--db', required=True, help="SQLite database")
args = parser.parse_args()
# Configure logger
logger = util.get_formatted_logger('msa_hmm', args.verbosity)
# Connect to the database (creates a new file if it doesn't exist)
logger.info(f"Going to connect to database {args.db}")
connection = sqlite3.connect(args.db)
# Merge input files with deflines mapped to barcode_id
logger.info(f"Going to merge input files {args.ingroup} and {args.outgroup}")
uncorrected = merge_files(args.ingroup, args.outgroup, connection)
# Report the location of the HMM file
hmmfile = args.model
logger.info(f"Going to use Hidden Markov Model from {hmmfile}")
# Do the reverse complement correction
logger.info('Going to correct any reverse-complemented sequences')
corrected = correct_revcom(hmmfile, uncorrected)
# Write alignment
align_write(corrected, args.output, connection)