From 426dcf874c1b0deb22e36f2a7cc72c4caebd07b0 Mon Sep 17 00:00:00 2001
From: Brice Letcher
Date: Wed, 10 Jul 2024 18:44:56 +0200
Subject: [PATCH] Added tests: finding telomeres on genomes

---
Reviewer notes (free text after the three-dash line; not part of the commit):
* Besides adding tests, this patch changes behaviour in delfies/seq_utils.py:
  `Orientation` values go from 1/2 to "+"/"-", the reverse-orientation
  midpoint becomes `match.end() - 1`, and interval bounds are clamped to
  [0, chrom_length - 1].
* Line breaks and indentation were restored from a whitespace-collapsed copy.
  Line counts agree with every hunk header and with the diffstat; the exact
  indentation (notably inside the new docstring) is inferred - please confirm
  against the original commit.

 delfies/seq_utils.py    |  25 +++++++--
 tests/test_seq_utils.py | 113 +++++++++++++++++++++++++++++++++++++++-
 2 files changed, 132 insertions(+), 6 deletions(-)

diff --git a/delfies/seq_utils.py b/delfies/seq_utils.py
index 955ab12..41bb5b1 100644
--- a/delfies/seq_utils.py
+++ b/delfies/seq_utils.py
@@ -8,8 +8,8 @@
 
 
 class Orientation(Enum):
-    forward = 1
-    reverse = 2
+    forward = "+"
+    reverse = "-"
 
 
 @dataclass
@@ -49,6 +49,20 @@ def find_all_occurrences_in_genome(
     seq_regions: Intervals,
     interval_window_size: int,
 ) -> Intervals:
+    """
+    Developer note:
+        - The point of `interval_window_size` is to produce an interval
+          inside which softclip starts in aligned reads will be recorded.
+          This is used in `G2S` breakpoint detection mode. Larger values allow for
+          more breakpoint 'fuzziness'.
+        - The goal of this function is to return the 0-based position on chromosomes of the start of
+          telomere arrays. This enables `G2S` breakpoints to be detected, by looking
+          for softclips in aligned reads starting at/near the beginning of telomere arrays.
+
+    This function currently assumes that, at assembled telomere arrays,
+    the forward-oriented telomere starts at the 5' end of such arrays, while the
+    reverse-oriented telomere sequence ends at the 3' end of such arrays.
+    """
     result = list()
     patterns = {
         Orientation.forward: re.compile(query_sequence),
@@ -63,17 +77,18 @@ def find_all_occurrences_in_genome(
         else:
             relative_to_absolute = 0
             target_seq = genome_fasta[seq_region.name]
+        chrom_length = len(genome_fasta[seq_region.name])
         for orientation, pattern in patterns.items():
             for match in pattern.finditer(str(target_seq)):
                 if orientation is Orientation.forward:
                     interval_midpoint = match.start()
                 else:
-                    interval_midpoint = match.end()
+                    interval_midpoint = match.end() - 1
                 interval_midpoint += relative_to_absolute
                 new_interval = Interval(
                     name=seq_region.name,
-                    start=interval_midpoint - interval_window_size,
-                    end=interval_midpoint + interval_window_size,
+                    start=max(0, interval_midpoint - interval_window_size),
+                    end=min(chrom_length - 1, interval_midpoint + interval_window_size),
                 )
                 result.append(new_interval)
     return result
diff --git a/tests/test_seq_utils.py b/tests/test_seq_utils.py
index 0cae514..d5e9dfc 100644
--- a/tests/test_seq_utils.py
+++ b/tests/test_seq_utils.py
@@ -1,4 +1,10 @@
-from delfies.seq_utils import cyclic_shifts, rev_comp
+from tempfile import TemporaryDirectory
+from pathlib import Path
+
+from pyfastx import Fasta
+
+from delfies.seq_utils import cyclic_shifts, rev_comp, find_all_occurrences_in_genome
+from delfies.interval_utils import Interval
 
 
 def test_rev_comp():
@@ -19,3 +25,108 @@ def test_cyclic_shifts():
     ]
     result = cyclic_shifts(str_to_shift)
     assert result == expected_shifts
+
+
+class TestFindOccurrencesInGenome:
+    default_chrom_name = "chr1"
+    default_query = "TTAGGC"
+    default_query_revcomp_array = rev_comp(default_query) * 9
+    len_default_query_revcomp_array = len(default_query_revcomp_array)
+    default_reference = (
+        f">{default_chrom_name}\n{default_query_revcomp_array}{default_query * 10}\n"
+    )
+    default_search_regions = [Interval(default_chrom_name)]
+
+    @classmethod
+    def setup_class(cls):
+        cls.temp_dir = TemporaryDirectory()
+        cls.temp_fasta = Path(cls.temp_dir.name) / "temp.fasta"
+
+    @classmethod
+    def teardown_class(cls):
+        cls.temp_dir.cleanup()
+
+    def make_fasta(self, input_string=default_reference):
+        with self.temp_fasta.open("w") as ofstream:
+            ofstream.write(input_string)
+        return Fasta(str(self.temp_fasta), build_index=True)
+
+    def test_find_all_occs_no_hits(self):
+        fasta = self.make_fasta()
+        result = find_all_occurrences_in_genome(
+            "AATTTTTTAAA", fasta, self.default_search_regions, interval_window_size=0
+        )
+        assert result == []
+
+    def test_find_all_occs_hits_whole_chrom(self):
+        fasta = self.make_fasta()
+
+        # Forward only hits
+        result = find_all_occurrences_in_genome(
+            self.default_query * 10,
+            fasta,
+            self.default_search_regions,
+            interval_window_size=0,
+        )
+        expected_forward_start = self.len_default_query_revcomp_array
+        expected = [Interval(self.default_chrom_name, expected_forward_start, expected_forward_start)]
+        assert result == expected
+
+        # Forward and reverse hits
+        result = find_all_occurrences_in_genome(
+            self.default_query * 9,
+            fasta,
+            self.default_search_regions,
+            interval_window_size=0,
+        )
+        expected_reverse_start = self.len_default_query_revcomp_array - 1
+        expected = [
+            Interval(self.default_chrom_name, expected_forward_start, expected_forward_start),
+            Interval(self.default_chrom_name, expected_reverse_start, expected_reverse_start),
+        ]
+        assert result == expected
+
+    def test_find_all_occs_hits_in_region(self):
+        region_start = self.len_default_query_revcomp_array
+        search_regions = [Interval(self.default_chrom_name, start=region_start, end=region_start + len(self.default_query))]
+        fasta = self.make_fasta()
+        result = find_all_occurrences_in_genome(
+            self.default_query, fasta, search_regions, interval_window_size=0
+        )
+        expected = [Interval(self.default_chrom_name, region_start, region_start)]
+        assert result == expected
+
+    def test_find_all_occs_hits_with_window_inside_genome(self):
+        region_start = self.len_default_query_revcomp_array
+        search_regions = [Interval(self.default_chrom_name, start=region_start, end=region_start + len(self.default_query))]
+        fasta = self.make_fasta()
+        contained_window_size = 2
+        result = find_all_occurrences_in_genome(
+            self.default_query,
+            fasta,
+            search_regions,
+            interval_window_size=contained_window_size,
+        )
+        expected = [
+            Interval(
+                self.default_chrom_name,
+                region_start - contained_window_size,
+                region_start + contained_window_size,
+            )
+        ]
+        assert result == expected
+
+    def test_find_all_occs_hits_with_window_outside_genome(self):
+        region_start = self.len_default_query_revcomp_array
+        search_regions = [Interval(self.default_chrom_name, start=region_start, end=region_start + len(self.default_query))]
+        fasta = self.make_fasta()
+        overflowing_window_size = 400
+        result = find_all_occurrences_in_genome(
+            self.default_query,
+            fasta,
+            search_regions,
+            interval_window_size=overflowing_window_size,
+        )
+        expected_end = len(self.default_query) * 19 - 1
+        expected = [Interval(self.default_chrom_name, 0, expected_end)]
+        assert result == expected