Skip to content

Commit

Permalink
Added tests: finding telomeres on genomes
Browse files Browse the repository at this point in the history
  • Loading branch information
bricoletc committed Jul 10, 2024
1 parent de3c12b commit 426dcf8
Show file tree
Hide file tree
Showing 2 changed files with 132 additions and 6 deletions.
25 changes: 20 additions & 5 deletions delfies/seq_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@


class Orientation(Enum):
    """Orientation of a sequence match.

    Members carry the strand symbols "+" (forward) and "-" (reverse) as their
    values, so a member can be looked up from its symbol, e.g.
    `Orientation("+")`.
    """

    # Each member name is defined exactly once: Enum raises a TypeError when a
    # member name is reused inside the class body.
    forward = "+"
    reverse = "-"


@dataclass
Expand Down Expand Up @@ -49,6 +49,20 @@ def find_all_occurrences_in_genome(
seq_regions: Intervals,
interval_window_size: int,
) -> Intervals:
"""
Developer note:
- The point of `interval_window_size` is to produce an interval
inside which softclip starts in aligned reads will be recorded.
This is used in `G2S` breakpoint detection mode. Larger values allow for
more breakpoint 'fuzziness'.
- The goal of this function is to return the 0-based position on chromosomes of the start of
telomere arrays. This enables `G2S` breakpoints to be detected, by looking
for softclips in aligned reads starting at/near the beginning of telomere arrays.
This function currently assumes that, at assembled telomere arrays,
the forward-oriented telomere starts at the 5' end of such arrays, while the
reverse-oriented telomere sequence ends at the 3' end of such arrays.
"""
result = list()
patterns = {
Orientation.forward: re.compile(query_sequence),
Expand All @@ -63,17 +77,18 @@ def find_all_occurrences_in_genome(
else:
relative_to_absolute = 0
target_seq = genome_fasta[seq_region.name]
chrom_length = len(genome_fasta[seq_region.name])
for orientation, pattern in patterns.items():
for match in pattern.finditer(str(target_seq)):
if orientation is Orientation.forward:
interval_midpoint = match.start()
else:
interval_midpoint = match.end()
interval_midpoint = match.end() - 1
interval_midpoint += relative_to_absolute
new_interval = Interval(
name=seq_region.name,
start=interval_midpoint - interval_window_size,
end=interval_midpoint + interval_window_size,
start=max(0, interval_midpoint - interval_window_size),
end=min(chrom_length - 1, interval_midpoint + interval_window_size),
)
result.append(new_interval)
return result
113 changes: 112 additions & 1 deletion tests/test_seq_utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,10 @@
from delfies.seq_utils import cyclic_shifts, rev_comp
from tempfile import TemporaryDirectory
from pathlib import Path

from pyfastx import Fasta

from delfies.seq_utils import cyclic_shifts, rev_comp, find_all_occurrences_in_genome
from delfies.interval_utils import Interval


def test_rev_comp():
Expand All @@ -19,3 +25,108 @@ def test_cyclic_shifts():
]
result = cyclic_shifts(str_to_shift)
assert result == expected_shifts


class TestFindOccurrencesInGenome:
    """Tests for `find_all_occurrences_in_genome` on a small synthetic genome.

    The default reference holds a single chromosome made of 9 copies of the
    reverse-complemented query, immediately followed by 10 copies of the
    query itself.
    """

    default_chrom_name = "chr1"
    default_query = "TTAGGC"
    default_query_revcomp_array = rev_comp(default_query) * 9
    len_default_query_revcomp_array = len(default_query_revcomp_array)
    default_reference = (
        f">{default_chrom_name}\n{default_query_revcomp_array}{default_query * 10}\n"
    )
    default_search_regions = [Interval(default_chrom_name)]

    @classmethod
    def setup_class(cls):
        """Create one temporary directory shared by every test of the class."""
        cls.temp_dir = TemporaryDirectory()
        cls.temp_fasta = Path(cls.temp_dir.name) / "temp.fasta"

    @classmethod
    def teardown_class(cls):
        """Remove the shared temporary directory and everything inside it."""
        cls.temp_dir.cleanup()

    def make_fasta(self, input_string=default_reference):
        """Write `input_string` to the shared FASTA path and open it.

        The same path is rewritten on every call, so any index left behind by
        a previous call is deleted first; otherwise a call with different
        content could be served from an out-of-date index.
        """
        # NOTE(review): assumes pyfastx stores its index next to the FASTA as
        # "<fasta>.fxi" and reuses it when present - confirm against the
        # pyfastx documentation.
        stale_index = Path(f"{self.temp_fasta}.fxi")
        if stale_index.exists():
            stale_index.unlink()
        with self.temp_fasta.open("w") as ofstream:
            ofstream.write(input_string)
        return Fasta(str(self.temp_fasta), build_index=True)

    def _first_forward_repeat_region(self):
        """Return search regions covering only the first forward query copy."""
        region_start = self.len_default_query_revcomp_array
        return [
            Interval(
                self.default_chrom_name,
                start=region_start,
                end=region_start + len(self.default_query),
            )
        ]

    def test_find_all_occs_no_hits(self):
        fasta = self.make_fasta()
        result = find_all_occurrences_in_genome(
            "AATTTTTTAAA", fasta, self.default_search_regions, interval_window_size=0
        )
        assert result == []

    def test_find_all_occs_hits_whole_chrom(self):
        fasta = self.make_fasta()

        # Forward only hits: the reference has only 9 reverse-complemented
        # copies, so a 10-copy query cannot match in reverse orientation.
        result = find_all_occurrences_in_genome(
            self.default_query * 10,
            fasta,
            self.default_search_regions,
            interval_window_size=0,
        )
        expected_forward_start = self.len_default_query_revcomp_array
        expected = [
            Interval(
                self.default_chrom_name,
                expected_forward_start,
                expected_forward_start,
            )
        ]
        assert result == expected

        # Forward and reverse hits: a 9-copy query fits both arrays. The
        # reverse hit is reported at the last position of the reverse array.
        result = find_all_occurrences_in_genome(
            self.default_query * 9,
            fasta,
            self.default_search_regions,
            interval_window_size=0,
        )
        expected_reverse_start = self.len_default_query_revcomp_array - 1
        expected = [
            Interval(
                self.default_chrom_name,
                expected_forward_start,
                expected_forward_start,
            ),
            Interval(
                self.default_chrom_name,
                expected_reverse_start,
                expected_reverse_start,
            ),
        ]
        assert result == expected

    def test_find_all_occs_hits_in_region(self):
        region_start = self.len_default_query_revcomp_array
        search_regions = self._first_forward_repeat_region()
        fasta = self.make_fasta()
        result = find_all_occurrences_in_genome(
            self.default_query, fasta, search_regions, interval_window_size=0
        )
        expected = [Interval(self.default_chrom_name, region_start, region_start)]
        assert result == expected

    def test_find_all_occs_hits_with_window_inside_genome(self):
        region_start = self.len_default_query_revcomp_array
        search_regions = self._first_forward_repeat_region()
        fasta = self.make_fasta()
        contained_window_size = 2
        result = find_all_occurrences_in_genome(
            self.default_query,
            fasta,
            search_regions,
            interval_window_size=contained_window_size,
        )
        expected = [
            Interval(
                self.default_chrom_name,
                region_start - contained_window_size,
                region_start + contained_window_size,
            )
        ]
        assert result == expected

    def test_find_all_occs_hits_with_window_outside_genome(self):
        search_regions = self._first_forward_repeat_region()
        fasta = self.make_fasta()
        overflowing_window_size = 400
        result = find_all_occurrences_in_genome(
            self.default_query,
            fasta,
            search_regions,
            interval_window_size=overflowing_window_size,
        )
        # The window overflows both chromosome ends, so the expected interval
        # runs from position 0 to the last 0-based position of the
        # 19-copy chromosome.
        expected_end = len(self.default_query) * 19 - 1
        expected = [Interval(self.default_chrom_name, 0, expected_end)]
        assert result == expected

0 comments on commit 426dcf8

Please sign in to comment.