Skip to content

Commit

Permalink
Fix bugs related to lowercase bases
Browse files Browse the repository at this point in the history
  • Loading branch information
rrwick committed Jan 10, 2021
1 parent 26fb425 commit 25b9768
Show file tree
Hide file tree
Showing 8 changed files with 46 additions and 8 deletions.
22 changes: 21 additions & 1 deletion test/test_misc.py
Original file line number Diff line number Diff line change
Expand Up @@ -199,11 +199,20 @@ def test_load_fasta_4():
assert seqs[1][1].startswith('ATTCTCAGAATGGCGTAG')


def test_load_fasta_5():
    # The fixture holds two records written entirely in lowercase; the loaded sequences
    # are expected to come back in uppercase, with the first header word as the name.
    records = trycycler.misc.load_fasta('test/test_misc/lowercase.fasta')
    assert len(records) == 2
    (first_name, first_seq), (second_name, second_seq) = records
    assert first_name == 'A'
    assert first_seq.startswith('TTGCCTGTAGTCGGGACC')
    assert second_name == 'B'
    assert second_seq.startswith('ATTCTCAGAATGGCGTAG')


def test_get_default_thread_count():
    # The default thread count is expected to lie between 1 and 16, inclusive.
    thread_count = trycycler.misc.get_default_thread_count()
    assert 1 <= thread_count <= 16


def test_write_seq_to_fasta():
def test_write_seq_to_fasta_1():
with tempfile.TemporaryDirectory() as temp_dir:
filename = pathlib.Path(temp_dir) / 'temp.fasta'
trycycler.misc.write_seq_to_fasta('CAGAATGGCGT', 'name', filename)
Expand All @@ -213,6 +222,17 @@ def test_write_seq_to_fasta():
assert seqs[0][1] == 'CAGAATGGCGT'


def test_write_seq_to_fasta_2():
    # Round-trips a mixed-case sequence through write_seq_to_fasta and load_fasta; the
    # sequence read back is expected to be entirely uppercase.
    with tempfile.TemporaryDirectory() as temp_dir:
        fasta_path = pathlib.Path(temp_dir) / 'temp.fasta'
        trycycler.misc.write_seq_to_fasta('CAgaaTgGcgt', 'name', fasta_path)
        records = trycycler.misc.load_fasta(fasta_path)
        assert len(records) == 1
        record_name, record_seq = records[0]
        assert record_name == 'name'
        assert record_seq == 'CAGAATGGCGT'


def test_reverse_complement_1():
    # Mixed-case input: each base is expected to be complemented in its own case.
    forward = 'GGGGaaaaaaaatttatatat'
    expected = 'atatataaattttttttCCCC'
    assert trycycler.misc.reverse_complement(forward) == expected

Expand Down
4 changes: 4 additions & 0 deletions test/test_misc/lowercase.fasta
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
>A info
ttgcctgtagtcgggaccccgtgactaggaaagcaatcagcgactaacaggcggagaccgtctatagcgcacggggtgtagttggctattactgatctct
>B stuff
attctcagaatggcgtagtattcatatttgttcgtagcccgcctccgtacatgttattgtgctcatcggtggcctgcgccgtggggagtgcaaaacgtgg
14 changes: 14 additions & 0 deletions test/test_msa.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,3 +126,17 @@ def test_msa_3():

assert aligned_seqs == [('A', 'ATGTAAAGGTTCCGGGGCACTTAGCAGCTCCACAAATCCATT-CCAACCTATA'),
('B', 'ATGTAAAGGTT--GGGGCACTTAGCA-CTCCACACATCCATTGCCAACCTATA')]


def test_msa_4():
    # Mixed-case versions of the input sequences; the alignment saved to 3_msa.fasta is
    # expected to contain only uppercase bases.
    mixed_case_seqs = [('A', 'ATGtAAAGGTTcCGGGGCACttAGCaGCTCCACAaAtCcATTCCAACcTaTA'),
                       ('B', 'ATGTaAaGGtTgGgGCAcTTAGCACTCCaCACATcCAttGCCaACCTATA')]
    expected_alignment = [('A', 'ATGTAAAGGTTCCGGGGCACTTAGCAGCTCCACAAATCCATT-CCAACCTATA'),
                          ('B', 'ATGTAAAGGTT--GGGGCACTTAGCA-CTCCACACATCCATTGCCAACCTATA')]

    with tempfile.TemporaryDirectory() as temp_dir:
        work_dir = pathlib.Path(temp_dir)
        save_input_sequences(work_dir, mixed_case_seqs)
        msa_args = create_args(work_dir, kmer=32, step=1000, lookahead=10000, threads=8)
        trycycler.msa.msa(msa_args)
        # The result must be read before the temporary directory is removed.
        aligned_seqs = load_fasta(work_dir / '3_msa.fasta')

    assert aligned_seqs == expected_alignment
2 changes: 1 addition & 1 deletion trycycler/cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -324,7 +324,7 @@ def complete_linkage(seqs, seq_names, depths, distances, threshold, out_dir):
seq_depth = depths[name]
with open(seq_fasta, 'wt') as f:
f.write(f'>{name}\n')
f.write(f'{seq}\n')
f.write(f'{seq.upper()}\n')
log_lines.append((seq_fasta, seq_length, seq_depth))

for seq_fasta, seq_length, seq_depth in log_lines:
Expand Down
2 changes: 1 addition & 1 deletion trycycler/consensus.py
Original file line number Diff line number Diff line change
Expand Up @@ -681,7 +681,7 @@ def save_seqs_to_fasta(seqs, filename, extra_newline=True):
with open(filename, 'wt') as fasta:
for name, seq in seqs.items():
fasta.write(f'>{name}\n')
fasta.write(f'{seq}\n')
fasta.write(f'{seq.upper()}\n')
if extra_newline:
log()

Expand Down
4 changes: 2 additions & 2 deletions trycycler/misc.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ def load_fasta(fasta_filename, include_full_header=False):
sequence = []
name = line[1:]
else:
sequence.append(line)
sequence.append(line.upper())
if name:
if include_full_header:
fasta_seqs.append((name.split()[0], name, ''.join(sequence)))
Expand All @@ -154,7 +154,7 @@ def get_default_thread_count():
def write_seq_to_fasta(seq, name, filename):
    """
    Write a single sequence to a FASTA file, replacing any existing file contents.

    The file receives a '>' header line containing name, followed by one line holding the
    whole sequence converted to uppercase.

    seq:      the sequence string (any mix of upper and lower case)
    name:     the text placed after '>' on the header line
    filename: the path of the file to write (anything accepted by open())
    """
    with open(filename, 'wt') as f:
        f.write(f'>{name}\n')
        # Write the sequence exactly once, normalised to uppercase, so that saved files
        # never contain lowercase bases.
        f.write(f'{seq.upper()}\n')


REV_COMP_DICT = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G', 'a': 't', 't': 'a', 'g': 'c', 'c': 'g',
Expand Down
4 changes: 2 additions & 2 deletions trycycler/msa.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,7 +166,7 @@ def merge_pieces(temp_dir: pathlib.Path, cluster_dir, seqs):
for f in msa_fasta_files:
parts = dict(load_fasta(f))
for n in seq_names:
aligned_seq_parts[n].append(parts[n])
aligned_seq_parts[n].append(parts[n].upper())
aligned_seqs = {}
for n in seq_names:
aligned_seqs[n] = ''.join(aligned_seq_parts[n])
Expand All @@ -184,7 +184,7 @@ def merge_pieces(temp_dir: pathlib.Path, cluster_dir, seqs):
# Sanity check: the MSA sequences should match the original sequences.
for n in seq_names:
msa_minus_dashes = final_seqs[n].replace('-', '')
assert seqs[n] == msa_minus_dashes
assert seqs[n].upper() == msa_minus_dashes

# Save the full MSA to file.
final_msa_fasta_filename = cluster_dir / '3_msa.fasta'
Expand Down
2 changes: 1 addition & 1 deletion trycycler/reconcile.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ def save_seqs_to_fasta(seqs, filename):
with open(filename, 'wt') as fasta:
for name, seq in seqs.items():
fasta.write(f'>{name}\n')
fasta.write(f'{seq}\n')
fasta.write(f'{seq.upper()}\n')
log()


Expand Down

0 comments on commit 25b9768

Please sign in to comment.