diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml new file mode 100644 index 0000000..b69bca1 --- /dev/null +++ b/.github/workflows/tests.yaml @@ -0,0 +1,176 @@ +name: Static analysis and tests + +on: + push: + branches: + - main + - dev + pull_request: + branches: + - main + - dev + +env: + VENV_PATH: ~/.venv + REPORT_PATH: tests/data/reports/ + + +jobs: + setup-env: + name: Setup and cache environment + permissions: + contents: read + runs-on: ubuntu-latest + + steps: + - name: Checkout repository + uses: actions/checkout@v2 + + - name: Set up Python 3.10 + uses: actions/setup-python@v2 + with: + python-version: 3.10 + + - name: Cache dependencies + uses: actions/cache@v2 + with: + path: ~/.venv + key: ${{ runner.os }}-pip-${{ hashFiles('**/test-requirements.txt') }} + + - name: Install dependencies + run: | + python -m venv ~/.venv + source ~/.venv/bin/activate + python -m pip install --upgrade pip + python -m pip install . + python -m pip install -r test-requirements.txt + + formatting: + name: Formatting + permissions: + contents: read + needs: setup-env + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Restore pip dependencies + uses: actions/cache@v2 + with: + path: ~/.venv + key: ${{ runner.os }}-pip-${{ hashFiles('**/test-requirements.txt') }} + + - name: Check imports and formatting + run: | + source ~/.venv/bin/activate + isort --check-only --diff --profile black . + black --check --diff . 
+ + Linting: + name: Linting + permissions: + contents: read + needs: setup-env + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Restore pip dependencies + uses: actions/cache@v2 + with: + path: ~/.venv + key: ${{ runner.os }}-pip-${{ hashFiles('**/test-requirements.txt') }} + + # C0301: Line too long + - name: Run pylint + run: | + source ~/.venv/bin/activate + pylint --fail-under=9 --disable=C0301 AmpliGone/ tests/ > ${{ env.REPORT_PATH }}pylint-report.txt + + # E501: Line too long, W503: Line break before binary operator, E203: Whitespace before ':' + # The last two make it non-PEP8 compliant, but are automatically done by black. + - name: Run flake8 + run: | + source ~/.venv/bin/activate + flake8 --ignore=E501,W503,E203 AmpliGone/ tests/ --output-file=${{ env.REPORT_PATH }}flake8-report.txt + + static-checking: + name: Static checking + permissions: + contents: read + needs: setup-env + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Restore pip dependencies + uses: actions/cache@v2 + with: + path: ~/.venv + key: ${{ runner.os }}-pip-${{ hashFiles('**/test-requirements.txt') }} + + - name: Run mypy + run: | + source ~/.venv/bin/activate + mypy --disallow-untyped-defs --disallow-incomplete-defs --ignore-missing-imports --disallow-untyped-decorators --strict-equality \ + --warn-redundant-casts --warn-unused-ignores --warn-return-any --warn-unreachable AmpliGone/ tests/ > ${{ env.REPORT_PATH }}mypy-report.txt + + - name: Run bandit + run: | + source ~/.venv/bin/activate + bandit -r AmpliGone/ -f json -o ${{ env.REPORT_PATH }}bandit-report.json + + run-tests: + name: Run Tests + permissions: + contents: read + needs: setup-env + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Restore pip dependencies + uses: actions/cache@v2 + with: + path: ~/.venv + key: ${{ runner.os }}-pip-${{ 
hashFiles('**/test-requirements.txt') }} + + - name: Run tests + run: | + source ~/.venv/bin/activate + pytest -sv --cov=AmpliGone/ --cov-report=xml:tests/data/reports/coverage.xml tests/ + + - name: Upload coverage report + if: always() + uses: actions/upload-artifact@v3 + with: + name: coverage-report + path: tests/data/reports/coverage.xml + + sonarcloud: + name: SonarCloud Scan + needs: run-tests + runs-on: ubuntu-latest + if: always() + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 0 # Shallow clones should be disabled for SonarCloud analysis + + - name: Download coverage report + uses: actions/download-artifact@v3 + with: + name: coverage-report + path: tests/data/coverage_reports/coverage.xml + + - name: SonarCloud Scan + if: always() + uses: sonarsource/sonarcloud-github-action@master + env: + SONAR_TOKEN: ${{ secrets.SONARCLOUD_TOKEN }} + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # needed to get PR info \ No newline at end of file diff --git a/.gitignore b/.gitignore index 2789de3..3002f83 100644 --- a/.gitignore +++ b/.gitignore @@ -45,7 +45,6 @@ htmlcov/ .coverage.* .cache nosetests.xml -coverage.xml *.cover *.py,cover .hypothesis/ @@ -132,4 +131,6 @@ dmypy.json # IDE .vscode/ test/ -notes.txt \ No newline at end of file +notes.txt + +*:Zone.Identifier \ No newline at end of file diff --git a/AmpliGone/__init__.py b/AmpliGone/__init__.py index 955210f..d35a1ee 100644 --- a/AmpliGone/__init__.py +++ b/AmpliGone/__init__.py @@ -1,2 +1,16 @@ +""" +AmpliGone: A tool which accurately finds and removes primer sequences from NGS reads in an amplicon experiment. + +Attributes +---------- +__version__ : str + The current version of the AmpliGone package. + +__prog__ : str + The name of the AmpliGone program. 
+""" + +# pylint: disable=C0103 +# AmpliGone package name is not following snake_case naming convention __version__ = "1.3.0" __prog__ = "AmpliGone" diff --git a/AmpliGone/__main__.py b/AmpliGone/__main__.py index 28678b5..fa421b8 100644 --- a/AmpliGone/__main__.py +++ b/AmpliGone/__main__.py @@ -9,11 +9,11 @@ from collections import defaultdict from concurrent.futures import ProcessPoolExecutor from itertools import chain -from typing import Callable, List, Set, Tuple +from typing import Callable import pandas as pd import parmap -from rich import print +from rich import print as pprint from rich.console import Console from rich.progress import Progress, SpinnerColumn @@ -21,8 +21,8 @@ import AmpliGone.alignmentpreset as AlignmentPreset from AmpliGone import __prog__, __version__ from AmpliGone.args import get_args -from AmpliGone.cut_reads import CutReads -from AmpliGone.fasta2bed import CoordinateListsToBed, find_or_read_primers +from AmpliGone.cut_reads import cut_reads +from AmpliGone.fasta2bed import coord_lists_to_bed, find_or_read_primers from AmpliGone.io_ops import SequenceReads, write_output from AmpliGone.log import log @@ -80,7 +80,7 @@ def check_loaded_index( def primer_df_to_primer_index( primer_df: pd.DataFrame, bind_virtual_primer: bool = True -) -> Tuple[defaultdict, defaultdict]: +) -> tuple[defaultdict, defaultdict]: """ Convert primer DataFrame to primer index dictionaries. @@ -93,7 +93,7 @@ def primer_df_to_primer_index( Returns ------- - Tuple[defaultdict, defaultdict] + tuple[defaultdict, defaultdict] A tuple of two defaultdicts representing the forward and reverse primer indices. 
Raises @@ -122,10 +122,10 @@ def primer_df_to_primer_index( ) sys.exit(1) - forward_dict = defaultdict(set) - reverse_dict = defaultdict(set) + forward_dict: defaultdict[str, set[None | int]] = defaultdict(set) + reverse_dict: defaultdict[str, set[None | int]] = defaultdict(set) - reference_set: Set[str] = set(primer_df["ref"].unique()) + reference_set: set[str] = set(primer_df["ref"].unique()) for refid in reference_set: forward_dict[refid] = set() reverse_dict[refid] = set() @@ -181,18 +181,12 @@ def coordinates_to_index( for _, refid, start, end in forward_primers_df[ ["ref", "start", "end"] ].itertuples(): - refid: str - start: int - end: int forward_dict[refid].update( coordinates_to_index(forward_primers_df, refid, start, end) ) # iterate over reverse_primers_df and add the coordinates between "start" and "end" to the reverse_dict for _, refid, start, end in reverse_primer_df[["ref", "start", "end"]].itertuples(): - refid: str - start: int - end: int reverse_dict[refid].update( coordinates_to_index(reverse_primer_df, refid, start, end) ) @@ -284,9 +278,9 @@ def correct_fragment_lookaround_size(args: argparse.Namespace) -> argparse.Names def parallel_dispatcher( indexed_reads: SequenceReads, args: argparse.Namespace, - primer_sets: Tuple[defaultdict, defaultdict], + primer_sets: tuple[defaultdict, defaultdict], preset: str, - matrix: List[int], + matrix: list[int], ) -> pd.DataFrame: """ Wrapping function that actually calls the parallelization function to process the primer removal process of the reads. @@ -297,11 +291,11 @@ def parallel_dispatcher( The indexed reads to be processed. args : argparse.Namespace The command-line arguments. - primer_sets : Tuple[defaultdict, defaultdict] + primer_sets : tuple[defaultdict, defaultdict] The primer sequences to be removed. preset : str The preset configuration for processing. - matrix : List[int] + matrix : list[int] The matrix for processing. 
Returns @@ -316,12 +310,12 @@ def parallel_dispatcher( record=False, ), transient=True, - disable=True if args.verbose is True or args.quiet is True else False, + disable=args.quiet or args.verbose, ) as progress: progress.add_task("[yellow]Removing primer sequences...", total=None) processed_reads = parallel( indexed_reads.frame, - CutReads, + cut_reads, args.threads, primer_sets, args.reference, @@ -340,10 +334,10 @@ def parallel( frame: pd.DataFrame, function: Callable[..., pd.DataFrame], workers: int, - primer_sets: Tuple[defaultdict, defaultdict], + primer_sets: tuple[defaultdict, defaultdict], reference: str, preset: str, - scoring: List[int], + scoring: list[int], fragment_lookaround_size: int, amplicon_type: str, ) -> pd.DataFrame: @@ -358,13 +352,13 @@ def parallel( The function to apply to the DataFrame. workers : int The number of workers to use for parallel processing. - primer_df : Tuple[defaultdict, defaultdict] + primer_df : tuple[defaultdict, defaultdict] A tuple containing the indexes of the primer coordinates to remove. reference : str The reference sequence to use for alignment. preset : str The preset to use for alignment. - scoring : List[int] + scoring : list[int] The scoring matrix to use for alignment. The size of the fragment lookaround. fragment_lookaround_size : int @@ -378,7 +372,7 @@ def parallel( """ frame_split = [frame.iloc[i::workers] for i in range(workers)] tr = [*range(workers)] - return pd.concat( + df = pd.concat( parmap.map( function, zip(frame_split, tr), @@ -388,19 +382,56 @@ def parallel( scoring, fragment_lookaround_size, amplicon_type, - workers, pm_processes=workers, ) ) + # parmap.map sometimes returns Any, but we know it's a DataFrame + if not isinstance(df, pd.DataFrame): + raise TypeError(f"{df} should be a DataFrame") + return df -def main(): +def main(provided_args: list[str] | None = None) -> None: + """ + Main function to process command-line arguments and execute the AmpliGone tool. 
+ + Parameters + ---------- + provided_args : list of str, optional + A list of command-line arguments to parse. If None, the arguments will be taken from sys.argv. + + Returns + ------- + None + + Notes + ----- + This function orchestrates the entire process of reading input files, processing reads, and writing output files. + It performs the following steps: + 1. Parses the command-line arguments using the `get_args` function. + 2. Validates the provided arguments and sets the logging level. + 3. Loads the input reads and primers using concurrent futures for parallel execution. + 4. Checks the loaded reads and primers, and adjusts thread count if necessary. + 5. Processes the reads to remove primer sequences using parallel processing. + 6. Logs the results and writes the output files. + + Examples + -------- + >>> import sys + >>> sys.argv = ['script.py', '--input', 'input.fasta', '--primers', 'primers.fasta', '--reference', 'reference.fasta', '--output', 'output.bed'] + >>> main() + """ + if provided_args: + args = get_args(provided_args) + else: + args = get_args(sys.argv[1:]) + if len(sys.argv[1:]) < 1: - print( + pprint( f"{__prog__} was called but no arguments were given, please try again.\nUse '{__prog__} -h' to see the help document" ) sys.exit(1) - args = get_args(sys.argv[1:]) + # check if verbose and quiet aren't both set if args.verbose is True and args.quiet is True: log.error( @@ -485,7 +516,7 @@ def main(): axis=1, ) ] - CoordinateListsToBed(filtered_primer_df, args.export_primers) + coord_lists_to_bed(filtered_primer_df, args.export_primers) processed_reads = processed_reads.drop(columns=["Removed_coordinates"]) diff --git a/AmpliGone/alignmentmatrix.py b/AmpliGone/alignmentmatrix.py index 45d9806..412864d 100644 --- a/AmpliGone/alignmentmatrix.py +++ b/AmpliGone/alignmentmatrix.py @@ -1,11 +1,63 @@ +""" +This module provides functions to calculate and validate scoring matrices for sequence alignment. 
+ +Functions +--------- +get_scoring_matrix(input_matrix: Optional[List[str]]) -> List[int] + Calculate the scoring matrix based on the input matrix. + +_input_to_dict(input_matrix: Optional[List[str]]) -> Optional[Dict[str, int]] + Convert the input matrix to a dictionary. + +_valid_scoring_list_length(input_list: List[str]) -> bool + Check if the length of the input list is either 4, 6, or 7. + +_scoring_has_negative_values(input_list: List[int]) -> bool + Check if the input list contains any negative values. + +_sort_matrix_dict(matrix_dict: Dict[str, int], required_4: List[str], required_6: List[str], required_7: List[str]) -> Dict[str, int] + Sort the given matrix dictionary based on the provided order of keys. + +_get_ordered_values(matrix_dict: Dict[str, int], required_4: List[str], required_6: List[str], required_7: List[str]) -> List[int] + Get the ordered values from the matrix dictionary based on the number of keys present. + +_log_invalid_combination_error(matrix_keys: List[str], required_keys: List[str]) -> None + Log an error message and exit the program when an invalid combination of scoring matrix keys is encountered. + +_validate_matrix_combinations(matrix_dict: Dict[str, int]) -> List[int] + Validate the combinations of matrix values in the given matrix dictionary. + +Notes +----- +This module is designed to handle the calculation and validation of scoring matrices used in sequence alignment. It ensures that the input matrices are in the correct format, contain valid values, and have the appropriate length. The module also provides functions to sort and order the matrix values based on predefined requirements. 
+ +Examples +-------- +>>> input_matrix = ['match=1', 'mismatch=2', 'gap_o1=3', 'gap_e1=4'] +>>> get_scoring_matrix(input_matrix) +[1, 2, 3, 4] + +>>> input_matrix = ['match=1', 'mismatch=2', 'gap_o1=3', 'gap_e1=4', 'gap_o2=5', 'gap_e2=6'] +>>> get_scoring_matrix(input_matrix) +[1, 2, 3, 4, 5, 6] + +>>> input_matrix = ['match=1', 'mismatch=2', 'gap_o1=3', 'gap_e1=4', 'gap_o2=5', 'gap_e2=6', 'mma=7'] +>>> get_scoring_matrix(input_matrix) +[1, 2, 3, 4, 5, 6, 7] + +>>> input_matrix = ['match=1', 'mismatch=2', 'gap_o1=3', 'gap_e1=4', 'gap_o2=5', 'gap_e2=6', 'mma=-7'] +>>> get_scoring_matrix(input_matrix) +SystemExit: Given scoring matrix contains a negative value. The scoring matrix may only contain non-negative integers. Please check your input and try again. +""" + import os import sys -from typing import Dict, List +from typing import Dict, List, Optional from AmpliGone.log import log -def get_scoring_matrix(input_matrix: List[str] | None) -> List[int]: +def get_scoring_matrix(input_matrix: Optional[List[str]]) -> List[int]: """ Calculate the scoring matrix based on the input matrix. @@ -74,7 +126,7 @@ def get_scoring_matrix(input_matrix: List[str] | None) -> List[int]: return _validate_matrix_combinations(matrix_dict) -def _input_to_dict(input_matrix: List[str] | None) -> Dict[str, int] | None: +def _input_to_dict(input_matrix: Optional[List[str]]) -> Optional[Dict[str, int]]: """ Convert the input matrix to a dictionary. diff --git a/AmpliGone/alignmentpreset.py b/AmpliGone/alignmentpreset.py index afb3ed8..c100969 100644 --- a/AmpliGone/alignmentpreset.py +++ b/AmpliGone/alignmentpreset.py @@ -1,3 +1,50 @@ +""" +This module provides functions and classes for determining the optimal alignment preset for sequencing reads. + +Functions +--------- +get_alignment_preset(input_args: argparse.Namespace, indexed_reads: SequenceReads) -> str + Get the alignment preset for the given reads. 
+ +find_preset(threads: int, data: pd.DataFrame) -> str + Find the preset based on the statistics calculated from the input data. + +_qual_to_ord_dispatcher(qdata: str, threads: int) -> List[int] + Convert a string of characters to a list of ASCII values minus 33 using multiple threads. + +_process_chunk(chunk: str) -> List[int] + Converts each character in the given chunk to its corresponding ASCII value minus 33. + +_determine_preset(avg_len: float, avg_qual: float, quality_range: int, length_range: int) -> str + Determine the preset based on the calculated statistics. + +_calc_avg_read_length(sequence_list: List[str]) -> float + Calculate the average length of a list of sequences. + +_calc_avg_read_qual(quality_list: List[int]) -> float + Calculate the average quality score of a list of quality scores. + +_get_unique_quality_scores(quality_list: List[int]) -> int + Return the number of unique quality scores in a list. + +_get_unique_read_lengths(sequence_list: List[str]) -> int + Return the number of unique lengths of strings in a list. + +_sequence_statistics_dispatcher(reads_list: List[str], ordinal_qualities_list: List[int], threads: int) -> Tuple[float, float, int, int] + Calculate sequence statistics using multiple threads. + +Notes +----- +This module is designed to handle sequencing reads and determine the optimal alignment preset based on the quality and length of the reads. It uses parallel processing to efficiently handle large datasets and calculate necessary statistics. 
+ +Examples +-------- +>>> import pandas as pd +>>> data = pd.DataFrame({'Sequence': ['ATCG', 'GCTA'], 'Qualities': ['!@#$%', '&*()']}) +>>> find_preset(4, data) +'sr' +""" + import argparse from concurrent.futures import ProcessPoolExecutor from typing import Generator, List, Tuple @@ -42,6 +89,9 @@ def get_alignment_preset( """ if input_args.alignment_preset is not None: + # this check is mostly for mypy to understand that alignment_preset is not Any + if not isinstance(input_args.alignment_preset, str): + raise TypeError("alignment_preset should be a string") return input_args.alignment_preset log.info("Finding optimal alignment-preset for the given reads") sample_size = min(len(indexed_reads.tuples), 15000) @@ -120,14 +170,14 @@ def _extract_read_data(data: pd.DataFrame) -> Tuple[List[str], str]: return list_of_reads, qualities_str reads_list, quality_data = _extract_read_data(data) - ord_quality_list: List[int] = _qual_to_ord_dispatcher(quality_data, threads) + ord_quality_list: list[int] = _qual_to_ord_dispatcher(quality_data, threads) avg_len, avg_qual, quality_range, length_range = _sequence_statistics_dispatcher( reads_list, ord_quality_list, threads ) return _determine_preset(avg_len, avg_qual, quality_range, length_range) -def _qual_to_ord_dispatcher(qdata: str, threads) -> List[int]: +def _qual_to_ord_dispatcher(qdata: str, threads: int) -> list[int]: """ Convert a string of characters to a list of ASCII values minus 33 using multiple threads. 
@@ -183,8 +233,8 @@ def _create_chunks(lst: str, n: int) -> Generator: yield lst[start:end] start = end - qdata_chunks: List[str] = list(_create_chunks(qdata, threads)) - ordinal_quality_list: List[int] = [] + qdata_chunks: list[str] = list(_create_chunks(qdata, threads)) + ordinal_quality_list: list[int] = [] with ProcessPoolExecutor(max_workers=threads) as pool: results = pool.map(_process_chunk, qdata_chunks) for result in results: @@ -192,7 +242,7 @@ def _create_chunks(lst: str, n: int) -> Generator: return ordinal_quality_list -def _process_chunk(chunk: str): +def _process_chunk(chunk: str) -> list[int]: """ Converts each character in the given chunk to its corresponding ASCII value minus 33. @@ -332,7 +382,7 @@ def _is_long_read(avg_len: float) -> bool: # this is probably 'short read' illumina NextSeq data # --> set the 'SR' preset return "sr" - ##! previous if-statement is not False. + # ! previous if-statement is not False. # this is probably 'long read' illumina MiSeq data # --> the 'SR' preset still applies but we keep it split # in case a custom set of parameters is necessary in the future @@ -341,7 +391,7 @@ def _is_long_read(avg_len: float) -> bool: # this is probably oxford nanopore data # --> set the preset to 'map-ont' return "map-ont" - ##! previous if-statement is not True. + # ! previous if-statement is not True. # this might be very 'unstable' nextseq data, # or from a platform we currently dont really support officially. # fallback to 'sr' preset diff --git a/AmpliGone/args.py b/AmpliGone/args.py index b2b0657..3b6984e 100644 --- a/AmpliGone/args.py +++ b/AmpliGone/args.py @@ -1,3 +1,23 @@ +""" +This module provides functionality for parsing command-line arguments for the AmpliGone tool using the argparse library. +It includes custom argument validation functions, a flexible argument formatter, and a rich argument parser for enhanced +command-line interface (CLI) experience. 
+ +Functions +--------- +get_args(givenargs: List[str]) -> argparse.Namespace + Parses the given command-line arguments and returns them as an argparse namespace. + +Classes +------- +FlexibleArgFormatter(argparse.HelpFormatter) + A subclass of argparse.HelpFormatter that improves the formatting of help text. + +RichParser(argparse.ArgumentParser) + A subclass of argparse.ArgumentParser that uses rich.print for displaying messages. + +""" + import argparse import multiprocessing import os @@ -52,7 +72,7 @@ def check_file_extensions(allowed_extensions: Iterable[str], fname: str) -> str: parser.error(f"File {fname} doesn't end with one of {allowed_extensions}") return os.path.abspath(fname) - def check_file_exists(fname: str) -> str | None: + def check_file_exists(fname: str) -> str: """Check if the given file `fname` exists and return the absolute path. Parameters @@ -74,10 +94,11 @@ def check_file_exists(fname: str) -> str | None: if os.path.isfile(fname): return fname parser.error(f'Error: File "{fname}" does not exist.') + raise argparse.ArgumentTypeError(f'Error: File "{fname}" does not exist.') parser = RichParser( prog=f"[bold]{__prog__}[/bold]", - usage=f"[bold]{__prog__}[/bold] \[required options] \[optional arguments]", + usage=f"[bold]{__prog__}[/bold] \\[required options] \\[optional arguments]", description=f"[bold underline]{__prog__}[/bold underline]: An accurate and efficient tool to remove primers from NGS reads in reference-based experiments", formatter_class=FlexibleArgFormatter, add_help=False, @@ -271,12 +292,15 @@ class FlexibleArgFormatter(argparse.HelpFormatter): * Changes the behaviour of the metavar to be only printed once per long AND shorthand argument, instead of printing the metavar multiple times for every possible flag. 
""" - def __init__(self, prog): + def __init__(self, prog: str) -> None: term_width = shutil.get_terminal_size().columns max_help_position = min(max(24, term_width // 2), 80) super().__init__(prog, max_help_position=max_help_position) - def _get_help_string(self, action): + # action is actually an argparse._StoreAction object, which is a subclass of argparse.Action + # _StoreAction violates the Liskov Substitution Principle + # see: https://mypy.readthedocs.io/en/stable/common_issues.html#incompatible-overrides + def _get_help_string(self, action: argparse.Action) -> str: """ """ help_text = action.help if ( @@ -286,9 +310,12 @@ def _get_help_string(self, action): and action.default is not None ): help_text += f"\n ([underline]default: {str(action.default)}[/underline])" + if not help_text: + raise AssertionError("Help text should always be present") return help_text - def _format_action_invocation(self, action): + # see comment above + def _format_action_invocation(self, action: argparse.Action) -> str: """ """ if not action.option_strings or action.nargs == 0: return super()._format_action_invocation(action) @@ -296,17 +323,19 @@ def _format_action_invocation(self, action): args_string = self._format_args(action, default) return ", ".join(action.option_strings) + " " + args_string - def _split_lines(self, text, width): + def _split_lines(self, text: str, width: int) -> list[str]: return self._para_reformat(text, width) - def _fill_text(self, text, width, indent): + def _fill_text(self, text: str, width: int, _: str) -> str: lines = self._para_reformat(text, width) return "\n".join(lines) - def _indents(self, line): + def _indents(self, line: str) -> tuple[int, int]: """Return line indent level and "sub_indent" for bullet list text.""" - - indent = len(re.match(r"( *)", line).group(1)) + matched_line = re.match(r"( *)", line) + if not matched_line: + raise AssertionError("Line should always match this regex pattern: ( *)") + indent = len(matched_line.group(1)) if 
list_match := re.match(r"( *)(([*\-+>]+|\w+\)|\w+\.) +)", line): sub_indent = indent + len(list_match.group(2)) else: @@ -314,14 +343,14 @@ def _indents(self, line): return (indent, sub_indent) - def _split_paragraphs(self, text): + def _split_paragraphs(self, text: str) -> list[str]: """Split text in to paragraphs of like-indented lines.""" text = textwrap.dedent(text).strip() text = re.sub("\n\n[\n]+", "\n\n", text) last_sub_indent = None - paragraphs = [] + paragraphs: list[str] = [] for line in text.splitlines(): (indent, sub_indent) = self._indents(line) is_text = re.search(r"[^\s]", line) is not None @@ -334,10 +363,11 @@ def _split_paragraphs(self, text): last_sub_indent = sub_indent if is_text else None return paragraphs - def _para_reformat(self, text, width): + def _para_reformat(self, text: str, width: int) -> list[str]: """Reformat text, by paragraph.""" - paragraphs = [] + paragraphs: list[str] = [] + for paragraph in self._split_paragraphs(text): (indent, sub_indent) = self._indents(paragraph) diff --git a/AmpliGone/cut_reads.py b/AmpliGone/cut_reads.py index 8ca0682..6230f4d 100644 --- a/AmpliGone/cut_reads.py +++ b/AmpliGone/cut_reads.py @@ -1,27 +1,105 @@ +""" +This module provides functions to cut read sequences based on primer locations and reference mapping. It includes +functions to handle sequence reads, quality strings, and CIGAR information to accurately cut and process reads. + +Functions +--------- +cut_read(seq: str, qual: str, position_needs_cutting: Callable[..., bool], primer_list: Tuple[int, ...], position_on_reference: int, cut_direction: int, read_direction: int, cigar: List[List[int]], query_start: int, query_end: int, fragment_lookaround_size: int) -> Tuple[str, str, List[int], int, int] + Cut a read sequence and quality string based on read-direction, CIGAR information, orientation, and fragment position. 
+ +cut_reads(data: Tuple[pd.DataFrame, int], primer_sets: Tuple[defaultdict, defaultdict], reference: str, preset: str, scoring: List[int], fragment_lookaround_size: int, amplicon_type: str) -> pd.DataFrame + Cut reads based on primer locations and reference mapping. + +Notes +----- +- The `cut_read` function processes individual read sequences and quality strings, cutting them based on the provided + parameters. +- The `cut_reads` function processes a DataFrame of reads, cutting them based on primer locations and reference mapping. +- The module uses the `mappy` library for sequence alignment and the `pandas` library for data manipulation. +- The `position_in_or_before_primer` and `position_in_or_after_primer` functions are used to determine if a position + needs cutting based on primer locations. +""" + +# pylint: disable=E1120 +# pylint doesn't understand that the position_in_or_after_primer function is not being called on the lines around 174 import os from collections import defaultdict +from dataclasses import dataclass from typing import Callable, List, Tuple +# mappy is a C extension, so it is added to the pylint extension allow list import mappy as mp import pandas as pd from AmpliGone.log import log -from .cutlery import PositionInOrAfterPrimer, PositionInOrBeforePrimer +from .cutlery import position_in_or_after_primer, position_in_or_before_primer + + +@dataclass +class Read: + """ + A class to represent a sequencing read. + + Attributes + ---------- + name : str + The name or identifier of the read. + seq : str + The nucleotide sequence of the read. + qual : str + The quality scores of the read, encoded as a string. + """ + + name: str + seq: str + qual: str + + +@dataclass +class CuttingParameters: + """ + A class to represent the parameters required for cutting reads. + + Attributes + ---------- + position_needs_cutting : Callable[..., bool] + A function that determines if a position needs cutting based on various criteria. 
+ + primer_list : Tuple[int, ...] + A tuple containing the list of primer positions. + + position_on_reference : int + The position on the reference sequence. + + cut_direction : int + The direction of the cut (e.g., 1 for forward, -1 for reverse). + + read_direction : int + The direction of the read (e.g., 1 for forward, -1 for reverse). + + cigar : List[List[int]] + The CIGAR string representing the alignment, as a list of operations. + + query_range : dict[str, int] + A dictionary containing the start and end positions of the query sequence. + + fragment_lookaround_size : int + The size of the fragment lookaround region. + """ + + position_needs_cutting: Callable[..., bool] + primer_list: Tuple[int, ...] + position_on_reference: int + cut_direction: int + read_direction: int + cigar: List[List[int]] + query_range: dict[str, int] + fragment_lookaround_size: int def cut_read( - seq: str, - qual: str, - PositionNeedsCutting: Callable[..., bool], - primer_list: Tuple[int, ...], - position_on_reference: int, - cut_direction: int, - read_direction: int, - cigar: List[List[int]], - query_start: int, - query_end: int, - fragment_lookaround_size: int, + read: Read, params: CuttingParameters ) -> Tuple[str, str, List[int], int, int]: """ Cut a read sequence and quality string based read-direction, cigar-information, orientation, and fragment position. @@ -30,24 +108,34 @@ def cut_read( ---------- seq : str The read sequence. + qual : str The quality string. + PositionNeedsCutting : Callable[..., bool] A function that returns True if the position on the reference needs to be cut. + primer_list : Tuple[int, ...] A tuple of integers representing the positions of primers on the reference. + position_on_reference : int The position on the reference where the read sequence starts. + cut_direction : int The direction in which the read sequence needs to be cut. + read_direction : int The direction in which the read sequence is read. 
+ cigar : List[List[int]] A list of lists representing the CIGAR string. + query_start : int The start position of the read sequence on the query. + query_end : int The end position of the read sequence on the query. + fragment_lookaround_size : int The size of the fragment lookaround. @@ -60,46 +148,126 @@ def cut_read( removed_coords = [] # Whether to start at the end or at the start of the read sequence - if read_direction == cut_direction: + if params.read_direction == params.cut_direction: # Start at the position that first matches the reference (skip soft clipped regions) - position_on_sequence = query_start + position_on_sequence = params.query_range["start"] else: # End at the position that last matches the reference (skip soft clipped regions) - position_on_sequence = query_end + position_on_sequence = params.query_range["end"] - for cigar_len, cigar_type in cigar: + for cigar_len, cigar_type in params.cigar: while cigar_len > 0 and ( - PositionNeedsCutting( - position_on_reference, primer_list, fragment_lookaround_size + params.position_needs_cutting( + params.position_on_reference, + params.primer_list, + params.fragment_lookaround_size, ) or cigar_type not in (0, 7) # always end with a match ): cigar_len -= 1 - removed_coords.append(position_on_reference) + removed_coords.append(params.position_on_reference) # Increment position on sequence if match/insert (in seq)/match(seq)/mismatch(seq) if cigar_type in (0, 1, 7, 8): - position_on_sequence += read_direction * cut_direction + position_on_sequence += params.read_direction * params.cut_direction # Increment position on reference if match/deletion (in seq)/match(seq)/mismatch(seq) if cigar_type in (0, 2, 7, 8): - position_on_reference += cut_direction - if not PositionNeedsCutting( - position_on_reference, primer_list, fragment_lookaround_size + params.position_on_reference += params.cut_direction + if not params.position_needs_cutting( + params.position_on_reference, + params.primer_list, + 
params.fragment_lookaround_size, ) and cigar_type in (0, 7): break - if read_direction == cut_direction: - seq = seq[position_on_sequence:] - qual = qual[position_on_sequence:] - query_end -= position_on_sequence - return seq, qual, removed_coords, query_start, query_end - seq = seq[:position_on_sequence] - qual = qual[:position_on_sequence] - return seq, qual, removed_coords, query_start, query_end + if params.read_direction == params.cut_direction: + read.seq = read.seq[position_on_sequence:] + read.qual = read.qual[position_on_sequence:] + params.query_range["end"] -= position_on_sequence + return ( + read.seq, + read.qual, + removed_coords, + params.query_range["start"], + params.query_range["end"], + ) + read.seq = read.seq[:position_on_sequence] + read.qual = read.qual[:position_on_sequence] + return ( + read.seq, + read.qual, + removed_coords, + params.query_range["start"], + params.query_range["end"], + ) + + +def log_cache_info(index: int, total_reads: int, _threadnumber: int) -> None: + """ + Logs cache information for the primer position functions. + + Parameters + ---------- + index : int + The current index of the read being processed. + + total_reads : int + The total number of reads to be processed. + + _threadnumber : int + The thread number of the current process. + + Returns + ------- + None + This function does not return any value. It logs the cache information. + + Notes + ----- + This function logs the completion percentage of read processing and the cache usage and hit ratio + for the `position_in_or_before_primer` and `position_in_or_after_primer` functions. It also handles + potential division by zero errors when calculating cache hit ratios. 
+ """ + completion_percentage = round(index / total_reads * 100) + maxsize = position_in_or_before_primer.cache_info().maxsize + currsize = position_in_or_before_primer.cache_info().currsize + cache_usage_before = ( + currsize / maxsize * 100 if maxsize is not None and currsize is not None else 0 + ) + maxsize = position_in_or_after_primer.cache_info().maxsize + currsize = position_in_or_after_primer.cache_info().currsize + cache_usage_after = ( + currsize / maxsize * 100 if maxsize is not None and currsize is not None else 0 + ) + # TODO: clean up this section of safely dividing by zero + cache_misses = position_in_or_before_primer.cache_info().misses + cache_hit_ratio_before = ( + (position_in_or_before_primer.cache_info().hits / cache_misses) + if cache_misses != 0 + else 0 + ) + cache_misses = position_in_or_after_primer.cache_info().misses + cache_hit_ratio_after = ( + (position_in_or_after_primer.cache_info().hits / cache_misses) + if cache_misses != 0 + else 0 + ) + log.debug( + # mypy doesnt understand that the position_in_or_before_primer has a __qualname__ attribute, + # because it thinks its the wrapper (lru_cache) function, which does not have a __qualname__ attribute + f"Thread {_threadnumber} @ processID {os.getpid()}\t::\t" + f"Reads processing {completion_percentage}% complete.\n\t" + f"MODULE {position_in_or_before_primer.__module__}.{position_in_or_before_primer.__qualname__} " # type: ignore[attr-defined] + f"CACHE INFORMATION\n\t\tCache size usage = {cache_usage_before:.2f}%\n\t\t" + f"Cache hit ratio = {cache_hit_ratio_before:.2f}%\n\t" + f"MODULE {position_in_or_after_primer.__module__}.{position_in_or_after_primer.__qualname__} " + f"CACHE INFORMATION\n\t\tCache size usage = {cache_usage_after:.2f}%\n\t\t" + f"Cache hit ratio = {cache_hit_ratio_after:.2f}%" + ) -def CutReads( +def cut_reads( data: Tuple[pd.DataFrame, int], primer_sets: Tuple[defaultdict, defaultdict], reference: str, @@ -107,7 +275,6 @@ def CutReads( scoring: List[int], 
fragment_lookaround_size: int, amplicon_type: str, - workers: int, ) -> pd.DataFrame: """ Cut reads based on primer locations and reference mapping. @@ -117,18 +284,25 @@ def CutReads( data : Tuple[pd.DataFrame, int] A tuple containing a pandas DataFrame with columns "Readname", "Sequence", and "Qualities", and an integer representing the thread number. + primer_sets : Tuple[defaultdict, defaultdict] A tuple containing two defaultdicts, one for forward primers and one for reverse primers. These defaultdicts contain the primer coordinates to remove. + reference : str The reference genome sequence. + preset : str The preset used for minimap2 alignment. + scoring : List[int] The scoring matrix used for minimap2 alignment. + fragment_lookaround_size : int The number of bases to look around a fragment when cutting reads. + amplicon_type : str The type of amplicon, either "end-to-end", "end-to-mid", or "fragmented". + workers : int The number of workers to use for parallel processing. @@ -138,21 +312,20 @@ def CutReads( A pandas DataFrame with columns "Readname", "Sequence", "Qualities", and "Removed_coordinates", representing the processed reads and the coordinates that were removed. """ - Frame, _threadnumber = data + frame, _threadnumber = data log.debug( - f"Initiated thread {_threadnumber} @ process ID {os.getpid()} :: Processing {len(Frame)} reads." + f"Initiated thread {_threadnumber} @ process ID {os.getpid()} :: Processing {len(frame)} reads." 
) - FWDict, RVDict = primer_sets + fw_dict, rv_dict = primer_sets - Aln = mp.Aligner( + aligner = mp.Aligner( reference, preset=preset, best_n=1, scoring=scoring, extra_flags=0x4000000, # Distinguish between match and mismatch: MM_F_EQX flag in minimap2 ) - processed_readnames = [] processed_sequences = [] processed_qualities = [] @@ -161,48 +334,24 @@ def CutReads( max_iter = ( 10 # If more iterations are needed, the sequence is discarded (not recorded) ) - total_reads = len(Frame) + total_reads = len(frame) for index, (_, name, seq, qual) in enumerate( - Frame[["Readname", "Sequence", "Qualities"]].itertuples(), 1 + frame[["Readname", "Sequence", "Qualities"]].itertuples(), 1 ): - name: str - seq: str - qual: str - if total_reads >= 10 and index % (total_reads // 10) == 0 and log.level == 10: - completion_percentage = round(index / total_reads * 100) - maxsize = PositionInOrBeforePrimer.cache_info().maxsize - currsize = PositionInOrBeforePrimer.cache_info().currsize - cache_usage_before = ( - currsize / maxsize * 100 - if maxsize is not None and currsize is not None - else 0 - ) - maxsize = PositionInOrAfterPrimer.cache_info().maxsize - currsize = PositionInOrAfterPrimer.cache_info().currsize - cache_usage_after = ( - currsize / maxsize * 100 - if maxsize is not None and currsize is not None - else 0 - ) - # todo: clean up this section of safely dividing by zero - cache_misses = PositionInOrBeforePrimer.cache_info().misses - cache_hit_ratio_before = ( - (PositionInOrBeforePrimer.cache_info().hits / cache_misses) - if cache_misses != 0 - else 0 - ) - cache_misses = PositionInOrAfterPrimer.cache_info().misses - cache_hit_ratio_after = ( - (PositionInOrAfterPrimer.cache_info().hits / cache_misses) - if cache_misses != 0 - else 0 - ) - log.debug( - f"Thread {_threadnumber} @ processID {os.getpid()}\t::\tReads processing {completion_percentage}% complete.\n\tMODULE {PositionInOrBeforePrimer.__module__}.{PositionInOrBeforePrimer.__qualname__} CACHE 
INFORMATION\n\t\tCache size usage = {cache_usage_before:.2f}%\n\t\tCache hit ratio = {cache_hit_ratio_before:.2f}%\n\tMODULE {PositionInOrAfterPrimer.__module__}.{PositionInOrAfterPrimer.__qualname__} CACHE INFORMATION\n\t\tCache size usage = {cache_usage_after:.2f}%\n\t\tCache hit ratio = {cache_hit_ratio_after:.2f}%" - ) + if ( + total_reads >= 10 and index % (total_reads // 10) == 0 and log.level == 10 + ): # TODO: explain this + log_cache_info(index, total_reads, _threadnumber) + + if len(seq) < 42: + # Length of the read has to be at least ~42bp because the default k-mer size for the short reads preset (sr) is 21. + + log.debug(f"Read with name '{name}' is too short to be processed.") + continue - removed_coords_fw = [] - removed_coords_rv = [] + read = Read(name, seq, qual) + removed_coords_fw: list[int | None] = [] + removed_coords_rv: list[int | None] = [] previous_seq: str = "impossible" cutting_is_done = False @@ -210,32 +359,32 @@ def CutReads( if cutting_is_done: break - for hit in Aln.map( - seq + for hit in aligner.map( + read.seq ): # Yields only one (or no) hit, as the aligner object was initiated with best_n=1 - if len(seq) < 5 and len(qual) < 5: + if len(read.seq) < 5 and len(read.qual) < 5: cutting_is_done = True break - if seq == previous_seq: - processed_readnames.append(name) - processed_sequences.append(seq) - processed_qualities.append(qual) + if read.seq == previous_seq: + processed_readnames.append(read.name) + processed_sequences.append(read.seq) + processed_qualities.append(read.qual) removed_coords_per_read.append( removed_coords_fw + removed_coords_rv ) cutting_is_done = True break - previous_seq = seq + previous_seq = read.seq # Fetch the primer coordinates that correspond to the reference that the read maps to # we're using tuples here because they are hashable - FWTuple: Tuple[int, ...] = tuple(FWDict[hit.ctg]) - RVTuple: Tuple[int, ...] = tuple(RVDict[hit.ctg]) + fw_tuple: Tuple[int, ...] 
= tuple(fw_dict[hit.ctg]) + rv_tuple: Tuple[int, ...] = tuple(rv_dict[hit.ctg]) - if not FWTuple or not RVTuple: + if not fw_tuple or not rv_tuple: log.debug( f"Thread {_threadnumber} @ processID {os.getpid()}\t::\tRead with name '{name}' aligns to '{hit.ctg}', but there are no primers affiliated with '{hit.ctg}'." ) @@ -249,19 +398,19 @@ def CutReads( or (amplicon_type == "end-to-mid" and hit.strand == 1) or amplicon_type == "fragmented" ): - seq, qual, removed_fw, qstart, qend = cut_read( - seq, - qual, - PositionNeedsCutting=PositionInOrBeforePrimer, - primer_list=FWTuple, + params = CuttingParameters( + position_needs_cutting=position_in_or_before_primer, + primer_list=fw_tuple, position_on_reference=hit.r_st, cut_direction=1, read_direction=hit.strand, cigar=hit.cigar, - query_start=qstart, - query_end=qend, + query_range={"start": qstart, "end": qend}, fragment_lookaround_size=fragment_lookaround_size, ) + read.seq, read.qual, removed_fw, qstart, qend = cut_read( + read, params + ) removed_coords_fw.extend(removed_fw) if ( @@ -269,19 +418,19 @@ def CutReads( or (amplicon_type == "end-to-mid" and hit.strand == -1) or amplicon_type == "fragmented" ): - seq, qual, removed_rv, qstart, qend = cut_read( - seq, - qual, - PositionNeedsCutting=PositionInOrAfterPrimer, - primer_list=RVTuple, + params = CuttingParameters( + position_needs_cutting=position_in_or_after_primer, + primer_list=rv_tuple, position_on_reference=hit.r_en, cut_direction=-1, read_direction=hit.strand, cigar=list(reversed(hit.cigar)), - query_start=qstart, - query_end=qend, + query_range={"start": qstart, "end": qend}, fragment_lookaround_size=fragment_lookaround_size, ) + read.seq, read.qual, removed_rv, qstart, qend = cut_read( + read, params + ) removed_coords_rv.extend(removed_rv) return pd.DataFrame( diff --git a/AmpliGone/cutlery.py b/AmpliGone/cutlery.py index 53d430d..85f674e 100644 --- a/AmpliGone/cutlery.py +++ b/AmpliGone/cutlery.py @@ -1,9 +1,38 @@ +""" +This module provides functions 
to determine if a position is within a specified distance of primer positions. + +Functions +--------- +position_in_or_before_primer(pos: int, clist: List[int], max_lookaround: int) -> bool + Determine if a position is within the maximum distance of the closest position in the list of primer positions + and the position is less than or equal to the closest position in the list. + +position_in_or_after_primer(pos: int, clist: List[int], max_lookaround: int) -> bool + Determine if a position is within the maximum distance of the closest position in the list of primer positions + and the position is greater than or equal to the closest position in the list. + +Notes +----- +These functions use caching to improve performance for repeated calls with the same arguments. The cache size is set to a maximum of 2,000,000 entries. + +Examples +-------- +>>> from cutlery import position_in_or_before_primer, position_in_or_after_primer +>>> primer_positions = [100, 200, 300] +>>> position_in_or_before_primer(150, primer_positions, 50) +True +>>> position_in_or_after_primer(250, primer_positions, 50) +True +""" + from functools import lru_cache from typing import List @lru_cache(maxsize=2000000) -def PositionInOrBeforePrimer(pos: int, clist: List[int], max_lookaround: int) -> bool: +def position_in_or_before_primer( + pos: int, clist: List[int], max_lookaround: int +) -> bool: """ Determine if a position is within the maximum distance of the closest position in the list of primer positions and the position is less than or equal to the closest position in the list. @@ -24,13 +53,18 @@ def PositionInOrBeforePrimer(pos: int, clist: List[int], max_lookaround: int) -> less than or equal to the closest position in the list, False otherwise. 
""" - d = lambda x: abs(x - pos) - near = min(clist, key=d, default=0) + + def _default(x: int) -> int: + return abs(x - pos) + + near = min(clist, key=_default, default=0) return abs(pos - near) < max_lookaround and pos <= near @lru_cache(maxsize=2000000) -def PositionInOrAfterPrimer(pos: int, clist: List[int], max_lookaround: int) -> bool: +def position_in_or_after_primer( + pos: int, clist: List[int], max_lookaround: int +) -> bool: """ Determine if a position is within the maximum distance of the closest position in the list of primer positions and the position is greater than or equal to the closest position in the list. @@ -51,6 +85,9 @@ def PositionInOrAfterPrimer(pos: int, clist: List[int], max_lookaround: int) -> greater than or equal to the closest position in the list, False otherwise. """ - d = lambda x: abs(x - pos) - near = min(clist, key=d, default=0) + + def _default(x: int) -> int: + return abs(x - pos) + + near = min(clist, key=_default, default=0) return abs(pos - near) < max_lookaround and pos >= near diff --git a/AmpliGone/fasta2bed.py b/AmpliGone/fasta2bed.py index 917b274..e147598 100644 --- a/AmpliGone/fasta2bed.py +++ b/AmpliGone/fasta2bed.py @@ -1,3 +1,68 @@ +""" +This module provides functions and classes for processing sequencing reads and primers, including finding ambiguous options, parsing CIGAR strings, counting CIGAR errors, and generating coordinates for primers. + +Functions +--------- +find_ambiguous_options(seq: str) -> List[str] + Find all possible unambiguous sequences from a sequence containing ambiguous nucleotides. + +parse_cigar_obj(cig_obj: Cigar) -> Tuple[str, str] + Parse a Cigar object and return the original cigar string and a cleaned cigar string. + +count_cigar_errors(cigar: str) -> int + Count the number of errors (insertions, deletions, and mismatches) in a CIGAR string. 
+ +get_coords(seq: str, ref_seq: str, err_rate: float = 0.1) -> Tuple[str, int, int, int] + Get the coordinates of the best primer option for a given sequence. + +find_or_read_primers(primerfile: str, referencefile: str, err_rate: float) -> pd.DataFrame + Find or read primers from a given file. + +choose_best_fitting_coordinates(fw_coords: Tuple[str, int, int, int], rv_coords: Tuple[str, int, int, int]) -> Tuple[str, int, int, int] | None + Compares the forward and reverse coordinates and returns the best fitting coordinates based on their scores. + +coord_list_gen(primerfile: str, referencefile: str, err_rate: float = 0.1) -> Generator[Dict[str, Union[str, int]], None, None] + Generate a list of coordinates for primers found in a reference sequence. + +coord_lists_to_bed(df: pd.DataFrame, outfile: str) -> None + Write the coordinates in BED format to a file. + +parse_args(args: list[str] | None = None) -> argparse.Namespace + Parse command-line arguments. + +main(args: list[str] | None = None) -> None + Main function to process the command-line arguments and generate the BED file with primer coordinates. + +Notes +----- +This module is designed to handle the processing of sequencing reads and primers, including finding ambiguous options, parsing CIGAR strings, counting CIGAR errors, and generating coordinates for primers. It includes functions to read primers from files, generate coordinates, and write the results to a BED file. The main function orchestrates the entire process based on command-line arguments. 
+ +Examples +-------- +>>> from fasta2bed import find_ambiguous_options, parse_cigar_obj, count_cigar_errors, get_coords, find_or_read_primers, choose_best_fitting_coordinates, coord_list_gen, coord_lists_to_bed +>>> seq = "ATGCR" +>>> find_ambiguous_options(seq) +['ATGCA', 'ATGCG'] + +>>> cig_obj = Cigar("10M2D5M") +>>> parse_cigar_obj(cig_obj) +('10M2D5M', '10M2D5M') + +>>> count_cigar_errors("10=2I1=5D3=") +7 + +>>> seq = "ATCG" +>>> ref_seq = "ATCGATCG" +>>> get_coords(seq, ref_seq, err_rate=0.1) +('ATCG', 0, 4, 100) + +>>> primerfile = "primers.fasta" +>>> referencefile = "reference.fasta" +>>> err_rate = 0.1 +>>> df = find_or_read_primers(primerfile, referencefile, err_rate) +>>> coord_lists_to_bed(df, "output.bed") +""" + import argparse import os import re @@ -302,7 +367,7 @@ def find_or_read_primers( log.info("Primer coordinates are given in BED format, skipping primer search") return read_bed(primerfile) return pd.DataFrame( - CoordListGen( + coord_list_gen( primerfile=primerfile, referencefile=referencefile, err_rate=err_rate, @@ -372,7 +437,7 @@ def choose_best_fitting_coordinates( return best_fitting or None -def CoordListGen( +def coord_list_gen( primerfile: str, referencefile: str, err_rate: float = 0.1, @@ -434,15 +499,15 @@ def CoordListGen( primers = list(SeqIO.parse(primerfile, "fasta")) - ref_file = list(SeqIO.parse(referencefile, "fasta")) - ref_seq = [str(ref.seq) for ref in ref_file] - ref_id = [ref.id for ref in ref_file] + ref_files = list(SeqIO.parse(referencefile, "fasta")) + ref_seqs = [str(ref.seq) for ref in ref_files] + ref_ids = [ref.id for ref in ref_files] # The loop in a loop here is not a particularly efficient way of doing this. # But this is the easiest implementation for now, and it's not like this is a particularly # cpu or time intensive process anyway. # Might come back to this when there's more time to create a better solution. 
- for ref_seq, ref_id in zip(ref_seq, ref_id): + for ref_seq, ref_id in zip(ref_seqs, ref_ids): log.info(f"Searching for primers in reference-id: [yellow]{ref_id}[/yellow]") for primer in primers: seq = str(primer.seq) @@ -477,19 +542,19 @@ def CoordListGen( if not represent_as_score: score = percentage - yield dict( - ref=ref_id, - start=start, - end=end, - name=primer.id, - score=score, - strand=strand, - seq=seq, - revcomp=revcomp, - ) + yield { + "ref": ref_id, + "start": start, + "end": end, + "name": primer.id, + "score": score, + "strand": strand, + "seq": seq, + "revcomp": revcomp, + } -def CoordinateListsToBed(df: pd.DataFrame, outfile: str) -> None: +def coord_lists_to_bed(df: pd.DataFrame, outfile: str) -> None: """ Write the coordinates in BED format to a file. @@ -517,17 +582,53 @@ def CoordinateListsToBed(df: pd.DataFrame, outfile: str) -> None: >>> CoordinateListsToBed(df, 'regions.bed') """ - return df[["ref", "start", "end", "name", "score", "strand"]].to_csv( + df[["ref", "start", "end", "name", "score", "strand"]].to_csv( outfile, sep="\t", na_rep=".", header=False, index=False ) -if __name__ == "__main__": - import argparse +def parse_args(args: list[str] | None = None) -> argparse.Namespace: + """ + Parse command-line arguments. - args = argparse.ArgumentParser() + Parameters + ---------- + args : list of str, optional + A list of command-line arguments to parse. If None, the arguments will be taken from sys.argv. - args.add_argument( + Returns + ------- + argparse.Namespace + An argparse.Namespace object containing the parsed command-line arguments. + + Notes + ----- + This function defines and parses the command-line arguments for the script. The following arguments are supported: + - --primers: The path to the FASTA file containing primers. This argument is required. + - --reference: The path to the FASTA file with the reference sequence. This argument is required. 
+ - --output: The path to the output BED file with coordinates of the primers. This argument is required. + - --primer-mismatch-rate: The fraction of mismatches a primer can have with respect to the reference. Defaults to 0.1. + - --verbose: A flag to enable verbose output for debugging purposes. + + Examples + -------- + >>> import sys + >>> sys.argv = ['script.py', '--primers', 'primers.fasta', '--reference', 'reference.fasta', '--output', 'output.bed'] + >>> args = parse_args() + >>> args.primers + 'primers.fasta' + >>> args.reference + 'reference.fasta' + >>> args.output + 'output.bed' + >>> args.primer_mismatch_rate + 0.1 + >>> args.verbose + False + """ + parser = argparse.ArgumentParser() + + parser.add_argument( "--primers", metavar="File", type=str, @@ -535,7 +636,7 @@ def CoordinateListsToBed(df: pd.DataFrame, outfile: str) -> None: required=True, ) - args.add_argument( + parser.add_argument( "--reference", metavar="File", type=str, @@ -543,14 +644,14 @@ def CoordinateListsToBed(df: pd.DataFrame, outfile: str) -> None: required=True, ) - args.add_argument( + parser.add_argument( "--output", metavar="File", type=str, help="The output BED file with coordinates of the primers.", required=True, ) - args.add_argument( + parser.add_argument( "--primer-mismatch-rate", metavar="File", type=float, @@ -558,20 +659,50 @@ def CoordinateListsToBed(df: pd.DataFrame, outfile: str) -> None: default=0.1, ) - args.add_argument( + parser.add_argument( "--score-representation", action="store_true", help="Present the alignment score in the bed file instead of the match-percentage for each primer option (default).", ) - args.add_argument( + parser.add_argument( "--verbose", action="store_true", help="Print debug information", ) - flags = args.parse_args() + return parser.parse_args(args) + +def main(args: list[str] | None = None) -> None: + """ + Main function to process the command-line arguments and generate the BED file with primer coordinates. 
+ + Parameters + ---------- + args : list of str, optional + A list of command-line arguments to parse. If None, the arguments will be taken from sys.argv. + + Returns + ------- + None + + Notes + ----- + This function orchestrates the entire process of reading primers, aligning them to the reference sequence, + and writing the coordinates to a BED file. It performs the following steps: + 1. Parses the command-line arguments using the `parse_args` function. + 2. Sets the logging level to DEBUG if the verbose flag is set. + 3. Reads or finds the primers using the `find_or_read_primers` function. + 4. Writes the coordinates of the primers to the output BED file using the `coord_lists_to_bed` function. + + Examples + -------- + >>> import sys + >>> sys.argv = ['script.py', '--primers', 'primers.fasta', '--reference', 'reference.fasta', '--output', 'output.bed'] + >>> main() + """ + flags = parse_args(args) if flags.verbose: log.setLevel("DEBUG") @@ -582,4 +713,8 @@ def CoordinateListsToBed(df: pd.DataFrame, outfile: str) -> None: represent_as_score=flags.score_representation, ) - CoordinateListsToBed(df, flags.output) + coord_lists_to_bed(df, flags.output) + + +if __name__ == "__main__": + main() diff --git a/AmpliGone/io_ops.py b/AmpliGone/io_ops.py index eb2f83e..6b36e3f 100644 --- a/AmpliGone/io_ops.py +++ b/AmpliGone/io_ops.py @@ -1,3 +1,58 @@ +""" +This module provides various input/output operations for the AmpliGone package. + +Functions +--------- +read_bed(filename: str) -> pd.DataFrame + Reads a BED file and returns a pandas DataFrame. + +output_file_opener(output_file: str, threads: int) -> TextIO | PgzipFile + Opens an output file for writing, with optional gzip compression. + +write_output(output: str, read_records: List[Dict[Hashable, Any]], threads: int) -> None + Writes the reads to the output file. + +Classes +------- +SequenceReads + A class for reading and indexing sequence reads from FASTQ or BAM files. 
+ + Methods + ------- + __init__(self, inputfile: str) + Initializes the SequenceReads object and reads the input file. + _read_fastq(self, inputfile: str) -> None + Reads a FASTQ file and stores the reads. + _read_bam(self, inputfile: str) -> None + Reads a BAM file and stores the reads. + _is_fastq(self, filename: str) -> bool + Checks if the given file is a FASTQ file. + _is_zipped(self, filename: str) -> bool + Checks if the given file is a gzipped file. + _is_bam(self, filename: str) -> bool + Checks if the given file is a BAM file. + _load_bam(self, inputfile: str) -> AlignmentFile + Loads a BAM file and returns a AlignmentFile object. + _open_gzip_fastq_file(self, filename: str) -> TextIO + Opens a gzip file for reading and returns an opened file object. + _open_fastq_file(self, filename: str) -> TextIO + Opens a FASTQ file for reading and returns an opened file object. + _fastq_opener(self, inputfile: str) -> TextIO + Opens a FASTQ file for reading, with optional gzip decompression. + _flip_strand(self, seq: str, qual: str) -> Tuple[str, str] + Returns the reverse complement of a DNA sequence and its quality score. 
+ +Examples +-------- +>>> bed_df = read_bed('path/to/file.bed') +>>> print(bed_df.head()) + +>>> seq_reads = SequenceReads('path/to/file.fastq') +>>> print(seq_reads.frame.head()) + +>>> write_output("output.txt", [{"Readname": "read1", "Sequence": "ATCG", "Qualities": "20"}], 4) +""" + import gzip import os import pathlib @@ -6,8 +61,8 @@ import pandas as pd import pgzip -import pysam from pgzip import PgzipFile +from pysam.libcalignmentfile import AlignmentFile from AmpliGone.log import log @@ -47,14 +102,14 @@ def read_bed(filename: str) -> pd.DataFrame: usecols=range(6), header=None, names=["ref", "start", "end", "name", "score", "strand"], - dtype=dict( - ref=str, - start="Int64", - end="Int64", - name=str, - score=str, - strand=str, - ), + dtype={ + "ref": str, + "start": "Int64", + "end": "Int64", + "name": str, + "score": str, + "strand": str, + }, ) primer_df = primer_df[ ~( @@ -67,9 +122,51 @@ def read_bed(filename: str) -> pd.DataFrame: class SequenceReads: + """ + A class for reading and indexing sequence reads from FASTQ or BAM files. + + Attributes + ---------- + tuples : list[tuple[str, str, str] | None] + A list to store the read name, sequence, and quality score tuples. + frame : pd.DataFrame + A DataFrame to store the indexed reads. + + Methods + ------- + __init__(self, inputfile: str) + Initializes the SequenceReads object and reads the input file. + _read_fastq(self, inputfile: str) -> None + Reads a FASTQ file and stores the reads. + _read_bam(self, inputfile: str) -> None + Reads a BAM file and stores the reads. + _is_fastq(self, filename: str) -> bool + Checks if the given file is a FASTQ file. + _is_zipped(self, filename: str) -> bool + Checks if the given file is a gzipped file. + _is_bam(self, filename: str) -> bool + Checks if the given file is a BAM file. + _load_bam(self, inputfile: str) -> AlignmentFile + Loads a BAM file and returns a AlignmentFile object. 
+ _open_gzip_fastq_file(self, filename: str) -> TextIO + Opens a gzip file for reading and returns an opened file object. + _open_fastq_file(self, filename: str) -> TextIO + Opens a FASTQ file for reading and returns an opened file object. + _fastq_opener(self, inputfile: str) -> TextIO + Opens a FASTQ file for reading, with optional gzip decompression. + _flip_strand(self, seq: str, qual: str) -> Tuple[str, str] + Returns the reverse complement of a DNA sequence and its quality score. + + Examples + -------- + >>> seq_reads = SequenceReads('path/to/file.fastq') + >>> print(seq_reads.frame.head()) + + """ + def __init__(self, inputfile: str): log.debug(f"Starting INDEXREADS process\t@ ProcessID {os.getpid()}") - self.tuples = [] + self.tuples: list[tuple[str, str, str] | None] = [] if self._is_fastq(inputfile): log.debug("INDEXREADS :: Parsing reads from FASTQ file") self._read_fastq(inputfile) @@ -202,9 +299,9 @@ def _is_bam(self, filename: str) -> bool: """ return ".bam" in pathlib.Path(filename).suffixes - def _load_bam(self, inputfile: str) -> pysam.AlignmentFile: + def _load_bam(self, inputfile: str) -> AlignmentFile: """ - Load a BAM file and return a pysam.AlignmentFile object. + Load a BAM file and return a AlignmentFile object. Parameters ---------- @@ -213,7 +310,7 @@ def _load_bam(self, inputfile: str) -> pysam.AlignmentFile: Returns ------- - pysam.AlignmentFile + AlignmentFile A file object for reading the BAM file. Examples @@ -223,7 +320,7 @@ def _load_bam(self, inputfile: str) -> pysam.AlignmentFile: ... 
print(read) """ - return pysam.AlignmentFile(inputfile, "rb") + return AlignmentFile(inputfile, "rb") def _open_gzip_fastq_file(self, filename: str) -> TextIO: """ @@ -274,7 +371,7 @@ def _open_fastq_file(self, filename: str) -> TextIO: !''*((((***+))%%%++)(%%%%).1***-+*''))**55CCF>>>>>>CCCCCCC65 """ - return open(filename, "rt") + return open(filename, "rt", encoding="utf-8") def _fastq_opener(self, inputfile: str) -> TextIO: """ @@ -348,9 +445,32 @@ def _flip_strand(self, seq: str, qual: str) -> Tuple[str, str]: def output_file_opener(output_file: str, threads: int) -> TextIO | PgzipFile: + """ + Open an output file for writing, with optional gzip compression. + + Parameters + ---------- + output_file : str + The path to the output file. If the file extension is '.gz', the file will be opened with gzip compression. + threads : int + The number of threads to use for writing the output file when using gzip compression. + + Returns + ------- + TextIO | PgzipFile + An opened file object for writing. If the file is gzipped, a PgzipFile object is returned; otherwise, a standard TextIO object is returned. + + Examples + -------- + >>> with output_file_opener("output.txt", 4) as f: + ... f.write("This is a test.") + ... + >>> with output_file_opener("output.txt.gz", 4) as f: + ... 
f.write("This is a gzipped test.") + """ if ".gz" in output_file: return pgzip.open(output_file, "wt", compresslevel=6, thread=threads) - return open(output_file, "w") + return open(output_file, "w", encoding="utf-8") def write_output( @@ -382,11 +502,11 @@ def write_output( >>> write_output("output.txt", [{"Readname": "read1", "Sequence": "ATCG", "Qualities": "20"}], 4) """ with output_file_opener(output, threads) as fileout: - for index, k in enumerate(read_records): - for key in read_records[index]: + for read_record in read_records: + for key in read_record: if key == "Readname": - fileout.write("@" + read_records[index][key] + "\n") + fileout.write("@" + read_record[key] + "\n") elif key == "Sequence": - fileout.write(read_records[index][key] + "\n" + "+" + "\n") + fileout.write(read_record[key] + "\n" + "+" + "\n") elif key == "Qualities": - fileout.write(read_records[index][key] + "\n") + fileout.write(read_record[key] + "\n") diff --git a/AmpliGone/log.py b/AmpliGone/log.py index 3684487..e1becbb 100644 --- a/AmpliGone/log.py +++ b/AmpliGone/log.py @@ -1,3 +1,36 @@ +""" +This module sets up a central logging object using the Rich library for enhanced logging output. + +Imports +-------- +import logging + Standard Python logging module. +from rich.highlighter import NullHighlighter + Import NullHighlighter from the Rich library to disable highlighting. +from rich.logging import RichHandler + Import RichHandler from the Rich library to handle logging with Rich's features. + +Variables +--------- +FORMAT : str + The format string for log messages. +log : logging.Logger + The central logging object configured to use RichHandler. + +Notes +----- +This module configures the logging system to use RichHandler from the Rich library, which provides enhanced logging output with features like rich text formatting and better readability. The logging level is set to DEBUG, and the log messages are formatted to include the date and time. 
+ +Examples +-------- +>>> from log import log +>>> log.debug("This is a debug message.") +>>> log.info("This is an info message.") +>>> log.warning("This is a warning message.") +>>> log.error("This is an error message.") +>>> log.critical("This is a critical message.") +""" + import logging from rich.highlighter import NullHighlighter @@ -6,7 +39,7 @@ # Central logging object using Rich's logging library FORMAT = "%(message)s" logging.basicConfig( - level="INFO", + level="DEBUG", format=FORMAT, datefmt="[%d/%m/%y %H:%M:%S]", handlers=[ diff --git a/CHANGELOG.md b/CHANGELOG.md index cad335d..4ecb81c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,12 @@ # Changelog +## [1.3.1](https://github.com/RIVM-bioinformatics/AmpliGone/compare/v1.3.0...v1.3.1) (2024-04-03) + + +### Bug Fixes + +* don't replace NA name values when reading BED file ([6d0c192](https://github.com/RIVM-bioinformatics/AmpliGone/commit/6d0c192a0b997d98778c2a2f0281e19208aedcac)) + ## [1.3.0](https://github.com/RIVM-bioinformatics/AmpliGone/compare/v1.2.1...v1.3.0) (2023-08-08) diff --git a/CITATION.cff b/CITATION.cff index 93df80c..a509d1c 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -15,7 +15,7 @@ authors: Environment (RIVM) orcid: 'https://orcid.org/0009-0002-6384-3446' - name: "The RIVM-IDS Bioinformatics team" -version: 1.3.0 #x-release-please-version +version: 1.3.1 #x-release-please-version doi: 10.5281/zenodo.7684307 identifiers: - type: doi diff --git a/sonar-project.properties b/sonar-project.properties new file mode 100644 index 0000000..7c311ec --- /dev/null +++ b/sonar-project.properties @@ -0,0 +1,15 @@ +sonar.sources=AmpliGone +sonar.tests=tests + +sonar.projectKey=RIVM-bioinformatics_AmpliGone +sonar.organization=rivm-bioinformatics +sonar.host.url=https://sonarcloud.io + +sonar.language=python +sonar.sourceEncoding=UTF-8 +sonar.python.version=3.10 +sonar.python.coverage.reportPaths=tests/data/reports/coverage.xml 
+sonar.python.pylint.reportPaths=tests/data/reports/pylint-report.txt +sonar.python.bandit.reportPaths=tests/data/reports/bandit-report.json +sonar.python.flake8.reportPaths=tests/data/reports/flake8-report.txt +sonar.python.mypy.reportPaths=tests/data/reports/mypy-report.txt \ No newline at end of file diff --git a/test-requirements.txt b/test-requirements.txt new file mode 100644 index 0000000..c98654f --- /dev/null +++ b/test-requirements.txt @@ -0,0 +1,8 @@ +pytest==8.3.2 +pytest-cov==6.0.0 +pylint==3.3.1 +black==24.8.0 +isort==5.13.2 +mypy==1.13.0 +flake8==7.1.1 +bandit==1.7.10 \ No newline at end of file diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/config.yaml b/tests/config.yaml new file mode 100644 index 0000000..6c14bcb --- /dev/null +++ b/tests/config.yaml @@ -0,0 +1,115 @@ +# Synthetic data: +# reference is 200 nucleotides long +# The primers are 10 and match the first and last 10 nucleotides of the reads. +# reads are 2 sequences, raw reads 120 nucleotides, without primers 100. +# processed reads have a 20 nucleotide overlap with each other. 
+ + +happy_sars_cov_2: + pipeline_args: + --input: "tests/data/inputs_outputs/sars-cov-2.fastq" + --output: "tests/data/inputs_outputs/sars-cov-2-output.fastq" + --reference: "tests/data/references/SARS-CoV-2-reference.fasta" + --primers: "tests/data/primers/ARTIC-V5.3.2.fasta" + --amplicon-type: "end-to-end" + test_args: + comparison_file: "tests/data/expected_outputs/sars-cov-2-eo.fastq" + fails: False + expected_log_message: "writing output files" + +happy_synthetic: + pipeline_args: + --input: "tests/data/inputs_outputs/synthetic.fastq" + --output: "tests/data/inputs_outputs/synthetic-output.fastq" + --reference: "tests/data/references/synthetic.fasta" + --primers: "tests/data/primers/synthetic.fasta" + --amplicon-type: "end-to-end" + test_args: + comparison_file: "tests/data/expected_outputs/synthetic-eo.fastq" + fails: False + expected_log_message: "removed a total of [bold cyan]40[/bold cyan] nucleotides" + +happy_synthetic_bed: + pipeline_args: + --input: "tests/data/inputs_outputs/synthetic.fastq" + --output: "tests/data/inputs_outputs/synthetic-bed-output.fastq" + --reference: "tests/data/references/synthetic.fasta" + --primers: "tests/data/primers/synthetic.bed" + --amplicon-type: "end-to-end" + test_args: + comparison_file: "tests/data/expected_outputs/synthetic-eo.fastq" + fails: False + expected_log_message: "removed a total of [bold cyan]40[/bold cyan] nucleotides" + +synthetic_mismatched_primers: + pipeline_args: + --input: "tests/data/inputs_outputs/synthetic.fastq" + --output: "tests/data/inputs_outputs/synthetic-mismatched-output.fastq" + --reference: "tests/data/references/synthetic.fasta" + --primers: "tests/data/primers/synthetic-no-match.fasta" + --amplicon-type: "end-to-end" + test_args: + fails: True + expected_log_message: "ampligone was unable to match any primers to the reference" + +empty_input: + pipeline_args: + --input: "tests/data/inputs_outputs/empty.fastq" + --output: "tests/data/inputs_outputs/empty-output.fastq" + --reference: 
"tests/data/references/synthetic.fasta" + --primers: "tests/data/primers/synthetic.fasta" + --amplicon-type: "end-to-end" + test_args: + fails: True + expected_log_message: "Empty input file" + +empty_reference: + pipeline_args: + --input: "tests/data/inputs_outputs/synthetic.fastq" + --output: "tests/data/inputs_outputs/synthetic-output.fastq" + --reference: "tests/data/references/empty.fasta" + --primers: "tests/data/primers/synthetic.fasta" + --amplicon-type: "end-to-end" + test_args: + fails: True + +empty_primers: + pipeline_args: + --input: "tests/data/inputs_outputs/synthetic.fastq" + --output: "tests/data/inputs_outputs/synthetic-output.fastq" + --reference: "tests/data/references/synthetic.fasta" + --primers: "tests/data/primers/empty.fasta" + --amplicon-type: "end-to-end" + test_args: + fails: True + +corrupted_input: + pipeline_args: + --input: "tests/data/inputs_outputs/corrupted.fastq" + --output: "tests/data/inputs_outputs/corrupted-output.fastq" + --reference: "tests/data/references/synthetic.fasta" + --primers: "tests/data/primers/synthetic.fasta" + --amplicon-type: "end-to-end" + test_args: + fails: True + +wrong_format_input: + pipeline_args: + --input: "tests/data/inputs_outputs/wrong-format.fastq" + --output: "tests/data/inputs_outputs/wrong-format-output.fastq" + --reference: "tests/data/references/synthetic.fasta" + --primers: "tests/data/primers/synthetic.fasta" + --amplicon-type: "end-to-end" + test_args: + fails: True + +too_short_sequences: + pipeline_args: + --input: "tests/data/inputs_outputs/too-short.fastq" + --output: "tests/data/inputs_outputs/too-short-output.fastq" + --reference: "tests/data/references/synthetic.fasta" + --primers: "tests/data/primers/synthetic.fasta" + --amplicon-type: "end-to-end" + test_args: + fails: True + diff --git a/tests/data/expected_outputs/sars-cov-2-eo.fastq b/tests/data/expected_outputs/sars-cov-2-eo.fastq new file mode 100644 index 0000000..8ca2e48 --- /dev/null +++ 
b/tests/data/expected_outputs/sars-cov-2-eo.fastq @@ -0,0 +1,20 @@ +@SRR30635841.4 +ATATTCTGAGCCCTGTGATGAATCAACAGTTTGAGTTGGTAGTCCCAAAATCTTTGAGGCTACAGCATTCTGTGAATTATAAGGTGAAATAAAGACAGCTTTTCTCCAAGCAGGGTTACGTGTAAGGAATTCTCTTACCACGCCTATTTGTGGCCTGTTAATTGCAGATGAAACATCATGCGTGATAACACCCTTATAAAACATTTTAAAGCATTGAGCTGATTTGTCTTTATGTGCTTTAAGCTTATTATCATAAACCAAAGCACTCACAGTG ++ +CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC5CCCCCCCCCCCCCCCCCC*CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC5CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC5CCCCCCCCCCCCCCCCCCC*CCCCCC*CCCCCCC5CCCCCCCCC +@SRR30635841.5 +TACTCTGCAAGAAGTAGACTAAAGCATAAAGATAGAGAAAAGGGGCTTCAAGGCCAGCAGCAACGAGCAAAAGGTGTGAGTAAACTGTTACAAACAACAACAGCAAGTTGCAAACAAAGTGAACACCCTTGGAGAGTGCTAGTTGCCATCTCTTTTTGAGAGTTATGATTTTGGAAGCGCTCTGAAAAACAGCAAGAAGTGCAACGCCAACAATAAGCCATCCGAAAGGGAGTGAGGCTTGTATCGGTATCGTTGCAGTAGCGCGAAC ++ +CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC5CCCCC5CCCCCCCCCCCCCCCCCCC*CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC*5CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC5 +@SRR30635841.2 +TTGGCATGTTAACAATGCAACTAATAAAGCCACGTATAAACCAAATACCTGGTGTATACGTTGTCTTTGGAGCACAAAACCAGTTGAAACATCAAATTCGTTTGATGTACTGAAGTCAGAGGACGCGCAGGGAATGGGTAATCTTGCCTGCGAAGATCTAAAACCAGTCTCTGAAGAAGTAGTGGAAAATCCTACCATACAGAAAGACGTTCTTGAGTGTAATGTGAAAACTACCGAAGTTGTAGGAGACATTATACTTAAACCAGCA ++ +CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC5CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC5CCCCCCCCCCCC5CCCCCCCCCCCCCCCCCC5CCCCC*C5CCCCC* +@SRR30635841.3 
+GAGTGTTGGGTATAAGCCAGTAATTCTAACATAGTGCTCTTGTGGCACTAGTGTAGGTGCACTTAATGGCATTACTGTATGTGATGTCAGCACAAAATAATCACCAACATTTAATTTGTAAGTTGTTGTACCTCGGTAAACAACAGCATCACCATAGTCACCTTTTTCAAAGGTGTACTCTCCTATTTGTACTTTACTGTTTTTAGTTACACGATAACCAGTAAAGACATAATTTCGGTTAAGTGGTGGTCTAGGTTTACCAACTTCCCATG ++ +CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC5CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC*CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC*CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC5CCCCCCCCCCCC**CCCCCC5C +@SRR30635841.1 +TTATTAACAATAAGTAGGGACTGGGTCTTCGAATCTAAAGTAGTACCAAAAATCCAGCCTCTTATTATGTTAGACTTCTCAATGGAAGCAAAATAAACACCATCATTAAATGGTAGGACAGGGTTATCAAACCTCTTAGTACCATTGGTCCCAGATATAACATGGAACCAAGTAACATTGGAAAAGAAAGGTAAGAACAAGTCCTGAGTTGAATGTAAAACTGAGGATCTGAAAACTTTGTCAGGGTAATAAACACCACGTGTGAAAG ++ +CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC5CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC5CCCCCCCCCCCCCCCC5CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC5CCCCCCCCCCCCCCCC*C5C*CCCCCCCCCC5CC diff --git a/tests/data/expected_outputs/synthetic-eo.fastq b/tests/data/expected_outputs/synthetic-eo.fastq new file mode 100644 index 0000000..105fae3 --- /dev/null +++ b/tests/data/expected_outputs/synthetic-eo.fastq @@ -0,0 +1,8 @@ +@read_number_2_last_120_of_ref +CCTCGTGGGGCCTACACCTGACCAGGAGCCGCACTGACAGGACCACGCTTCATCATAACTTTGGCGGCTGGGCAACGGATTTAATGGTACATAACTCATC ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@read_number_1_first_120_of_ref +TCTAGGGAGTGACGTGGACCCCGGATTGATACAGGATCACATGTAGAAAAGGTAGTCGGACAAGTTACCGCTACCCTCGACCTCGTGGGGCCTACACCTG ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII diff --git a/tests/data/inputs_outputs/corrupted.fastq b/tests/data/inputs_outputs/corrupted.fastq new file mode 100644 index 0000000..d3dc897 Binary files /dev/null and 
b/tests/data/inputs_outputs/corrupted.fastq differ diff --git a/tests/data/inputs_outputs/empty.fastq b/tests/data/inputs_outputs/empty.fastq new file mode 100644 index 0000000..e69de29 diff --git a/tests/data/inputs_outputs/influenza.fastq b/tests/data/inputs_outputs/influenza.fastq new file mode 100644 index 0000000..618ed73 --- /dev/null +++ b/tests/data/inputs_outputs/influenza.fastq @@ -0,0 +1,20 @@ +@b253b01d-3c76-4251-bbd1-9652881ceb2f +ACGCGTGATCAGCAAAAGCGGGGGATAATTCTATTAACCATGAAGACTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCCAAAGAAATACCTGGAAATGACAATAGCACGGCAACGCTGTGCCTTGGGCTATACGTACCAAACGGAACGATAGTGAAAACAATCACAAATGACCGAATTGAAGTTACTAATGCTACTGAGTTGGTTCAGAATTCATCCCAATAGGTAAAATATGCGACGGTCCCCATCAGATCCTTGATGGAGGGAACTGCCACACTAATAGATGCTCTATTGGGGGACCCTCAGTGTGACGGCGTTCAAAATAAGGAATGGGACCTTTTTGTTGAACGAAGCAGAGCCAACAGCAACTGTTACCCTTATGATGTGCCGGATTATGCCTCCCTTAGAATTCACTAGTTGCACTCATCCGGCACACTGGAGTTTTAAAAATGAAAGCTTCAATTGGACTGGAGTCAAGCAAAACGGAACAAGTTCTGCGTGCAAAAGGGGATCTAGTAGTAGTTTTTTTAGTAGATTAAATTGGTTGACCCACTTAAGCAACATATATCCAGCACAGAACGTGACTATGCCAAACAAGGAACAATTTGACAAATTGTACATTTTGGGGGGTTCACCACCCGGATACGGACAAGAACCAAATCTCCTGTTCGCCCAATCATCAGGAAGAATCACAGTATCTACCAAAAGAAGCCAACAAGCTGTAATCCCAAATATCGGATCTAGACCCAGAATAAAGGACATTCCTAGCAGAATAAGCATCTATTGGACAATAGTAAAACCGGGAGACATACACTTTTGATTAACAGCACAGGGAATCTAATTGCTCTAAGAGGGTTACTTCAAAATACGAAATGGGAAAAGCTCAATAATGAGATCAGATGCACCCATTGGCAAATGCAAGTCTGGATGCATCACTCCAAATGGAAGCATTCCCAATGACAAACCGTTCCAAAATGTAAACAGGATCACATACAGGGCCTGTCCCAGATATGTTAAGCAAAGCACCCTGAAATTGGCAACAGGAATGC ++ 
+=<;999::<=;:;::744*)(),4<899AA??>==>@>??>>??<<<<>>=?====<>>>@=;:;+++,(,,,2=>>>>;<?@@?BFC3222267?@B98832*))*+1000176..**)*-//2;<;:;<??@ABADB@B<9:::9:;=>@??@B@A?==<<=<<;;;==??@5400078888,**+/1.-.45;??=;99210&&%&),7566.---88<=;;77669<9300/..((()0/77>?>===<=<;;=>5422566=;;;;;<=>>?<=99999>555631232889@<>=<99::::999;:=<776::;.....<<<<<<<===<<==A78:@?;::::>??A@><;<;;::<84(%%&&%'()*++,-*+++/344...-,,,,43/.../*199>AA>=;;;;=?<<<;.*,,.534<>><<<;<>=:;<>9988::;:;9644447<<<;:::;<<<=;:;;:<>?AC@?@@@ABDA@::9=?@-12.++++-0))))('*)*.68:4...-+++)()+*,,.21)((/;633335:;8=;;87773007:8<>=;;<<;=<;::::6>?=:8998:;;:;::9::;<===>???>833335;?<=<99:556<<:+*4,,9;<<87556600099=:9:99<<=>=<;;;@=:98,,&%%&&&(+.07898:7;:9610013=;99:<>B444355AAABA:999444464)((((.0123==>=<;;967752*****,77----/3579><;:66667<:889;=?><==:643446?;C=:888832///788<998))&&'*+69:;;:::;>;::;9;;98999:: +@dbdcfc8a-6609-4268-9867-fc4bec4bbaac +ACGCGTGATCAGCAACCAGAGACTAATTCTATTAACCATGAAGACTATCATTTAGCACGAGCAACCATTCTATGTATCTTGTTTTCGCTCAAAAAATACCTGGAAATGACAATAACACGGCAACGCTGTGCCTTGGGCACCATGCAGTACCAAACGGAACGATAGTGAAAACAATCACAAATGACCGAATTGAAGTTACTAATGCTACTGAGTTGGTTCAGAATTCATCAATAGGTAAAATATGCGACAGTCCCCATCAGATCCTTGATGGAGGGAACTGCACACTAATAGATGCTCTATTGGGGGACCCTCAGTGTGACGGCGTTCAAAATAAGGAATGGGACCTTTTTGTTGAACGAAGCGAGCCAACAACAACTGCCACCCTTATGATGTACGGATTAACTTCCCTTAGGTCACTAGTTATATTAACTGCTGGAGTTTAAAAATGAAAGCTCAATTGGACTGGAGTCCGTAAAACGGAACAAGTTCTGCGTGCAAAAGGATCTAGTAGTAGTTTTTTTAGTAGATTAAATTGGTTGACCCACTTAAACAACATATCAGCACAGAACGTGACTATGCCAAACAAGGAACAATTTGACAAATTGTACATTTGGGGGTTCACCACCGGATACGGACAGAGCCCGATCTCCCTGTTCGCCCAATCATCAGGAAGAATCACATTATCTACCAAAAGAAGCCAAGCAAGCTGTAATCCCAAATATCGGATCTAGACCCAGAATAAGGGACATTCCTAGCAGAATAAGCATCTATTGGACAATAGTAAAACCGGGAGACATACTTTTGATTAACAGCACAGGAATCTAATTGCTCCTAGGGGTTACTTCAAAATACAGAATGGGAAAAGCTCAATAATGAGATCAGATGCACCCATTGGCAAATGCAAGAATCTGAATGCATCACTCCAAATGGAAGCATTCCCAATGACAAACCGTTCCAAAATGTAAACAGGATCACATACGGGGCCTGTCCCAGATATGTTAAGCAAAGCACCCTGAAATTGGCAACAGGAATGCGAAATGTACCAGAGAAACAAACCAGAGGCATATTTGGCGCAATAGCGGGTTTCATAGAAAATGGATGGGAATGGTGG
ATGGTTGGTACGGTTTCAGGCATCAAAATTCTGAGGGAAGAGGACAATCAGCAGATCTCAAAAGCACTCAAGCAGCAATCGATCAAATCAATGGAAGCTGAATCGATTGATCGGAAAAACCAACGAGAAATTCCATCAGATTGAAAAAGAATTCTCAGAGGTAGAAGGAAGAGTTCAAGACCTTGAGAAATATGTTGAGGACACTAAAATAGATCTCTGGTCATACAACGCGGAGCTTCTTGTTGCCCTGGAGAACCCAACATACGTTGACCCTAACTGACTCAGAAATGAACAAACTGTTTGAAAAAACAAAAGAAGCAACTGGGGGAAAATGCTGAGGATACTAGGAAATGGTTGTTTCAAAATATACCACAAATGTGACAATGCCTGCATAGGATCAATAAGAAATGAAACTTATGACCACAATGTGTACAGGGATGAAGCATTAAACAACCGGTTCCAGATCAAGGGAGTTGAGCTGAAGTCGGGGTACAAGATTGGATCCTATGGATTTCCTTTGCCATGTCATGTTTTTTTGCTTTGTATTGCTTTGTTGGGGTTTCATCATGTGGGCCTGCCAAAGGGCAACATTAGATTTTAACATTTGCATTTGAGTGCATTAATTAAACACCCTTGTTTCTACTGATCACGCGT ++ +++,--..003.****''''&&%%%(()09977888==<;5511111126666-++++++++,6..***+8643341111178<<796;:;:<>CF=??>===?A>DDE@@><77)((()5567888730..033/..--01;<;;22223;==@AB===;9;;99:9:==>>DDB?@???A@?>=<;;;<<765566535434677999988656889910044;@@??55576677:==>87)))))43234;9:;?0//000::;>?@;;;:51786454337;>55444;96660)))))((,-/<999...8::;:66666<=@BGBI>88777=>A>=<::;<@>BBA<787-,,,./22254,+)''(()0/02..1114567::><))'&&+-)(****/,+-+*****)(%&&''((&&&&&&''(*+22222275397.+*(')+'''*,0001971//23***+345=:8889<;87:>;77778?>>>04=859<9888778;;??=970.011244>>@>=?=:77778:;0,,,,,)))))11133+**77/+*).)''(&&&&(*2458999:>???@BBDDFJC97,++++3433400-(''''/888,+*&&+-('&&&''&&&%%$$&'(199*)+9=<;;:1019::<=,++++:876635,(((()@887775.520--21,)(*,,15::<==??==>@AB@=76679;=;;<;:9::;;>:&&'&(89:;0///998677<(''+::666?>;;::;=>=<<<=BC=(((():8;:<;<==A@@??@B@>;:99/..+*25///<=>;;:5+((((+(-/00013349<;7752+''''**.//>?>=>@?>>>?>><<;<=?ABA@?877<<10,+,,'&&(())69:=;;;:;<<;<;:;==>=@A??>>>>>=566/8:;;=999//////,,,-.;7666466666655552.---.0**+;<>>99<;63354,.49:<=@>>>>==>-,++-965)))*,:<331-...,,.46<=9(&&&(///557779:<44445<9978899@;@@AAADC=86556211))+:??@.-.-245@999:;A++++,0489=@AACCBD?>1-3::9989:?AB9+****;<::89*'(.578<::9;<;;@AA?=<<>>?=<<<<9210/1349@?1+'&(&&%%*,,176;<<5224890//014.*,//0...0852*)))35546778:;@>=;;7----*)).78;<>?>::966888>ACCDB@??>????@?>>?@@AA>;:989:<=??@FDEE>==426773,,,,)(*134
,,+''''**))176778<<>973349;<<<<<><<<==>>A@?CCA=354*5**(*....8.--.0:=C?@>==876--3,,''(((/7;?AAAACB:>96/1*)))(((((?>@@?>>??@<<<7677757777??AA98888AAA223344478<>??<<<<=A=76777755679::9:=?:<**01,---9-,*,,,+.3)'''',-01026;99;==<<;;<<<<==97779+;<==:<<;;>>>>>7924:9;<>A@BA?<<==?BB;9989<535@;788***+;:<<<<32.+./.57777721111112557>@A==><=@=>:99978788<7442'000118;>A::666:9;=@=:8777 +@70880d96-c688-4852-afd4-07330abdc8ff +ACGCGTGATCAGCGAAACCAGGGGATAATTCTATTAACCATGAAGACTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAATGCCTGGAAATGACAATAGCGGCGACGCTGTACGCTTAGGCACCATGCAGTACCAAACGGAACGATAAGGCAAAAACAATCACAAATGACCGAATTGGAGAGTTACTTAATGCTACTGAGTTGGTTCAGAATTCATCAATAGGTAAAATATGCGACGGTCCCCATCAGATCCTTGATGGAGGGAACTGCACACTAATAGATGCTCTATTGGGGGACCCTCAGGTGTGACGGCATTCAAAATAAGGAATGGGACCTTTTGTGAACGAAGCAAGAACCAACAGCAACTGTTACCCTTATGATGTGCCGGATTATGCCTCCCTTGGGTCACTGGATGCCTCATCCGGCACACGCGAGTTTTAAAAATGAAAACTTCAATTGGGCTGGAGTCAAGCAAAACGGAACAAATTCTGCGTGCAAAAGGGGATCTAGTAGTAGTTTTTTAGTAGATTAAATTGGTTGACTTACTTAAACAACATATATCCAGCACAAGACGTGACTATGCCAAACAAGGAACAATTTGACAAATTGTACATTTGGGGGGTTCACCACCCGGATACGGACAAGAACCAAATCTCTGTTCGCCCAATCATCAGGAAGAATCACAGTATCTACCAAAAGAAGCCAACAAGCTGTAATCCCAAATATCGGATCTAGACCCAGAATACCAAGGACATTCCTAGCAGAATAGCGCATCTATTGGACAATAGTAAAGCCGGGAGACATACTTTTGATTAACAGCACAGGGAATCTAATTGCTCCTAGGGGTTACTTCAAAATACGAAATGGGAAAAGCTCAATAATGAGATCAGATGCACCCATTGGCAAATGCAAGTCTGAATGCATCACTCCAAATGGAAGCATTCCCAATGACAAACGTTCCAAAATGTAAACAGGATCACATACGGGGCCTGTCCCAGATATGTTAAGCAAAGCACCCTGAAATTGGCAACAGGAATGCAAAATGTACCAGAGAAACAAACCAGAGGCATATTTGGCGCAATAGCGGGTTTCATAGAAAATGGATGGGAGGGAATGGTGGATGGTTGGTACGGTTTCAGGCATCCAAAATTCTGAGGAAGAGGACAATCAGCAGATCTCAAAAGCACTCAAGCAGCAATCGATCAAATCAATGGGGAAGCTGAATCGATTGATCGGAAAAACCAACGAGAAATTCCATCAGATTGAAAAGAATTCTCTTGAGGTAGAAGGAAGAGTTCAAGACCTTGAGAAATATGTTGAGGACACTAAAATAGATCTCTGGTCATACAACGCGGAGCTTCTTGTTGCCCTGGAGAACCAACATACGATTGACCTAACTGACTCAGAAATGAACAAACTGTTTGAAAAAACAAAGAAGCAACTGAGGGAAAATGCTGAGGATATGGGAAATGGTTGTTTCAAAATATACCACAAATGTGACAATGCCTGCGGATCAATAAGAAATGAAACTTATGACCACAATGTGTACAGGGATGAAGCATTCAACAACCGGTTC
CAGATCAAGGGAGTTGAGCTGAAGTCAGGGTACAAAGATTGGATCCTATGGATTTCCTTTGCCATGTGTTTTTTTTGCTTTTGCTGTTGCTTTGTTGGGGTTCATCATGTGGGCCTGCCAAAGGAGCAGCATTAGATGCAACATTTGCATTTGAGTGCATTAATTAAAAACACCCTTGTTTCTACTGATCACG ++ +:9:;::<<=>=77688=>=322+++29<>BA::9=>?===>>@?<<;;<@@@A@><=>=?@;;<::;;;=;;;;<=<<>===AE::::66666.....67665343.,,+**&&&+,//.../*)&&%%%%&&&/3>>99(((((;;<=>=<<<<:::8)(((((((+2334A@@@@BEA@@?=====>3221000223++*,48>333326667:?@?983236;=>>>??44444;@CECA>=<<:)))))0,+24433599<=>><=9887,+/4322*)*59=>?888887<<3/-.--,,99:>876(&%$%&&&&&&&&&*,/;;AAEB977799<<<<434490+&&&+,.2264----))))*,++++,=;;=<;<<<=?A@AB@B87778=<==<)(''')003++++,66::90///-//07>=77744('&&&'((*-)0.2106<:;5(((()><<98555570.19698--/025;=====@BE=;9:::<=,(,*+./3:;;<=;33344789;008::;>@@CBCBCB?>>=21'''''):<<=?>>>===>{>2112788((((((./28887:::<;:9878888?699:=CAA@A?A88442125,..-34458:@?>>?<;=>>><;::9:;:;=<<><;1<+))))699>>>>AABCA@87778;????A@?<==><=>===?@?BDHA=?>>?B=+++'&&&'+B@?AB3>==767;=?@>>>>><;8+').//4//248<=;1002<=99871))(((+,-':<<.--(&&31*)21+((()979=;>>2222311:9??AB;9978111,*,---1:;;?==;9844444354666410021/42224:;?:8=?DB961114:::D@<<<<<@??=<<;9989:77=>>;88999:==>>======<<<9211:999<<;9;::;=44444=<::;:8++++,89:;;;;:9;:;<,.55222570+4767:5<==>?><;:<<>892...*('&&'(47662226?<8732232,,,,,--->;=?>>>AACB?>?>>>A?>><;;::==9875003=999<=A?=<999:''',7988,,(((()35:88889:?A;;999;::;;;==A9>:;;;:88755112=>>?@?>==>??C;:5,4558:<<=?>===<=DC?<;99;=:;9;;<:99::;:=?>===?AB@?><:;80(((/;><.(''),./178:;;;21777==+++-8888811249::=@?>><<<=>?>??@BD8=<<=>>BB=<<<;<<=@>;88732221/03322,+,,,==<;22-+++++..27:;<<<==>>>=??989437=??>>>@;>??>@CFDDKFM@?>;-:752:9862222++/=87,,,,<=ABBAA;;32213720-+%%(++++-/,,../=>=...0>?>?))/0299==>??>>=0////2562+*(((()*<=32--..,,,,/488<;<<9977+..41/00.2;899--)'&%$$$%&''''%%%$&((100121.--.:<;;;;=?//3,,,,****+-*)&&&&)*047224>=@BA@<<;;:<<>>@?@ABCDBA>=<,;<@BC??=<;;<<>==; +@405024e8-3a9e-48a9-82d4-37d40bf98f8c 
+ACGCGTGATCAGCGAAAGCAGGGGATAATTCTATTAACCATGAAGACTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCCAAAAAATACCTGGAAATGACAATAGCACGGCAACGCTGTGCCTTGGGCACCATGCAGTACCAAACGGAACGATAGTGAAAACAATCACAAATGACCGAATTGAAGTTACTAATGCTACTGAGTTGGTTCAGAATTCATCAATAGGTAAAATATGCGACAGTCCCCATCAGATCCTTGATGGAGGGAACTGCACGCACTAATAGATGCTCTATTGGGGGACCCTCAGTGTGACGGCGTTCAAAATAAGGAATGGGACCTTTTTGTTGGGCAAAACGAAACCAACAGCAACTGTTACCCTTATGATGTGCCGGATTATGCCTCCCTTAGGTCACTAGTTGCCTCATCCGGCACTGGAGTTTAAAAATGAAAGCTTCAATTGGACTGGAGTCAAGCAAAACGGAACAAGTTCTGCGTGCAAGGATCTAGTAGTAGTTTTTTAGTAGATTAAATTGGTTGACCCGCCAAACAACATATATCACTTTAGAACGTGACTATGCCAAACAAGGAACAATTTGACAAATTGTACATTTGGGGGGTTCACCACCCGGATACGGACAAGAATAAATCTCCTGTTCGCCCAATCATCAGGGAGAATCACAGTATCTACCAAAAGAAGCCAACAAGCTGTAATCCCAAATATCGGATCTAGACCCAGAATAAGGGACATTCCTAGCAGAATAGCATCTATTGGACAATAGTAAAACCGGGAGACATACTTTTGATTAACAGCACAGAATCTAATTGCCCCTAGGGGTTACTTCAAAATACGAAATGGGAAAAGCTCAATAATGAGATCAGATGCACCCATTGGCAAATGCAAGTCTGAATGCATCACTCCAAATGGAAGCATTCCCAATGACAAACCGTTCCAAAATGTAAACAGGATCACATACGGGGCCTGTCCCAGATATGTTAAGCAAAGCACCCTGAAATTGGCAACAGGAATGCGAAATGTACCAGAGAAACAAACCAGAGGCATATTTGGCGCAATAGCGGGTTTCATAGAAAATGGATGGGAGGGAATGGTGGATGGTTGGTACGGTTTCAGGCATCAAAATTCTGAGGAAGAGGACAATCAGCAGATCTCAAAAGCACTCAAACAGCAATCGATCAAATCAATGGGAAGCTGAATCGATTGATCGGAAAAACCAACGAGAAATTCCATCAGATTGAAAAAGAATTCTCAGAGGTAGAAGGAAGAGTTCAAGACCTTGAGAAATATGTTGAGGACACTAAAATAGATCTCTGGTCATACAACGCGGAGCTTCTTGTTGCCCTGGAGAACCAACATACGATTGACCTAACTGACTCAGAAATGAACAAACTGTTTGAAAAACAAAGCCAACTGAGGGAAAATGCTGAGGATATGGGAAATGGTTGTTTCAAAATATACCACAAATGTGACAATGCCTGCATAGGATCAATAAGAAATGAAACTTATGACCACAATGTGTACAGGGATGAAGCATTAAACAACCGGTTCCAGATCAAAGGAGTTGAGCTGAAGTCAGGGTACAAAGATTGGATCCTATGGATTTCCTTTGCCATGTCATGTTTTTTGCTTTGTATTGCTTTGTTGGGGTTCATCATGTGGGCCTGCCAAAAGGGCAACATTAGATGCAACATTTGCATTTGAGTGCATTAATTAAAAACACCCTTGTTTCTACTGATCACGCGT ++ 
+,,,,112::;633233472/0+*(*-6;@A@?65555;?,+++*+4488<+'&&&&**('&&'''534444;;;==>>==?@@C8777-..3<;6745422888:;5@?AA>=<<99:::<=>><<:;:=333311126<<;<<<;;;:9::;?@<;;;;??;99889;=====A@?@@@BB@>=<<=>>AB><<;::88;;::9;;<8,++//:=>>6612221+******,,,-<>@CC><;;<<:9989:<;>>>==<;:<<<>A436881039400-,)++,-10564461111178;9333599:8994,+)()++99987:?>@BDCB?>=<=>??<<;99;=B7777(''&()'&&('&&&'''0(((((,)&&&&'*,2:;<<@AAA985448@@A@765448878788;:;:<:::;:;;;;5564344***,,0011259964/,--,-.0118:<>><<;9999::99;:=>>@DD?=<<;;543459;;;====>))('*>?9997799;??>903322447?@<555544444;93'''''&'1.,+++.11/&&&)*((((-../11===<<;;;<6/+(((()+,:@C>><;<>AB@?0///07///.1+))+5::;=<;889:;::99:<<:10''&&'-19<('3334=<<<=>?@?@.,,&&(*,00---56::;A>====<-3>;9<<<<<=><;<::;==><<<<=AA>;8889;??===<<><=>==>43:56<<@5444242--.3.('',134=??>===>?=668=@B@=882+6888899=>>AA>?A@BC9550///**+.;<<==;:::8<;45475:<>>?>=>@CFB:7777;A?>>>?@@<;>=?>?>?>A?>?@??@==;:9:78:>:5543+****)),--+---..///<:;;;=???>===;;;;;===>=A@@>=@@@<;:;;<=>@DCCAAA@??>=?>?@?>><:8864556:===><<<>>@@?@>>>?BCB776646-7:<=?@==<>==1111;=>?//..02610''')***04679>>=<<71255=???>>:9899<+++*+../;==@>.----(''(,99:5=>;79;@@@?>???=>===>?><;<<<5<<<<==?@?@AAB?=9993((34789;;=?;323:;=@>>><>==;;;76)((((449<<>>?@@@?@@A@@<;7<;<<==?AA;;;;;AA=;:::9996666888779;;BC????@AABAABABFFCBBBABBA?@A?>==<=<<9899;:::>9:56>>==>?A@BA@AA@?>?>====>?BBCCDDC??>>>???>=<;<=>>?922229;:<=988-+000:;:<<>=<=<;==>@??9889:>==;8798888688,,,46=??98=@C@@===??A@>BD@89,(((&&(/000:-,2:=@C<98111((::8889:@;==889:AC?@=?=?A?3300023=5<=643222380--+++,.488<=ABC7666<;;;;:;<=ABDB??>>>@@@@><===<:999=BA====>>?@>?==<77888==<<>?<=,+1'')*+===><84223-,,,,479@;,000017=?>????<;;;::<<<>:;?==?@?<<<<>BB@8:>>>?>DCBEC@?@?>@@=99889==BA=<<<:=<=<;;;;<>?/9;6567:;;<>>>=>>@@@BABB=?@AACC@=<;;;>?AAABBEDFKCC@>224?ADD@>=;;;<=?544/// +@63c385b5-aad4-4178-a973-b5dd8a909558 
+ACGCGTGATCAGCGAAAGCAGGGGATAATTCTATTAACCATGAAGACTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAAAATACCTGGAAATGACAATAGCACGGCAACGCTGTGCCTTGGGCACCATGCAGTACCAAACGGAACGATAGTGAAAACAATCACAAATGACCGAATTGAAGTTACTAATGCTACTGAGTTGGTTCAGAATTCATCAATAGGTAAAATATGCGACAGTCCCCATCAGATCCTTGATGGAGGAACTGCCACTAATAGATGCTCTATTGGGGGACCCTCAGTGCATGACGGCGTTCAAAATAAGGAATGGGACCTTTTTGTGAACGAAGCAGAGCCAACAGCAACTGTTACCCTTATGATGTGCCGGATTATGCCTCCCTTAGGTCACTGGTTGCCTCATCCGGCACACTGGAGTTTAAAAATGAAAGCTTCAATTGGACTAGGTCAAAACAAAACGGAACAAGTTCTGCGTGCAAGGGGATCTGGTAATAGTTTTTTAGTAGATTAAATTGGTTGACCCACTTAAACAACATATATCCAGCACAAGAACGTGACTATGCCAAACAAGGAACAATTTGACAAATTGTACATTTGGGAGTTCACCACCCGGATACGGACAAGAACCAAATCTCCCTGTTCGCCCAATCATCAGGAAGAATCACAGTATCTACCAAAAGAAGCCAACAAGCTGTAATCCCAAATATCGGATCTAGACCCAGAATAAGGGACATTTCTAGCAGAATAAGCATCTATT ++ +<<:::::<==;00001:542++++,8:;A@==<;:;;<<===?><==<:;558976:;===?<====ABCC28;<889DFCD6=9666667:==<<<=>===<889:=?><;;;><<<<?==<>>?::9735---56566=844525>???777?>=<<=<<;<@=<:::;;=<<;;;<=222222:::;<>><;<.-2911.--/8899::<;;<;:6666677=<:99::0000)(((()10001;;559==A?=99988=<<;99:891,+'&'/0001<=999==>;;;;:=<<;:::::<<<<=>>@@??<;:99:;;<322::980+0..../666)((((****.5;:;:::>?>=>=777791635CDA>A>=;<===<77;::91)(())+-*''''(?@>=<<<>@=<>==?<::::98((**,365/...**(((+,56019<<;<=@@=<<;=;;<;<>?@5;=9?;69E,(()***('(.=<@?????8864,,,--/=<==>@DA>@---777::>99:634/()((,565ACB>>CBB76588=>D;:><;:;;7=998335;;::?>>;?B@DCC54445@DEC>22*))*64('''(48=>AAA>>==>??ABC;=?=:77;<987533888ABC33334B@?>===GAAABA;9:: \ No newline at end of file diff --git a/tests/data/inputs_outputs/sars-cov-2.fastq b/tests/data/inputs_outputs/sars-cov-2.fastq new file mode 100644 index 0000000..896087c --- /dev/null +++ b/tests/data/inputs_outputs/sars-cov-2.fastq @@ -0,0 +1,20 @@ +@SRR30635841.1 VL00553:3:AACM5FTM5:1:1101:26525:1000 length=301 
+AATTCACAGACTTTAATAACAACATTAGTAGCGTTATTAACAATAAGTAGGGACTGGGTCTTCGAATCTAAAGTAGTACCAAAAATCCAGCCTCTTATTATGTTAGACTTCTCAATGGAAGCAAAATAAACACCATCATTAAATGGTAGGACAGGGTTATCAAACCTCTTAGTACCATTGGTCCCAGATATAACATGGAACCAAGTAACATTGGAAAAGAAAGGTAAGAACAAGTCCTGAGTTGAATGTAAAACTGAGGATCTGAAAACTTTGTCAGGGTAATAAACACCACGTGTGAAAG ++SRR30635841.1 VL00553:3:AACM5FTM5:1:1101:26525:1000 length=301 +CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC5CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC5CCCCCCCCCCCCCCCC5CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC5CCCCCCCCCCCCCCCC*C5C*CCCCCCCCCC5CC +@SRR30635841.2 VL00553:3:AACM5FTM5:1:1101:34648:1133 length=301 +GAAAGGAGCTAAATTGTTACATAAACCTATTGTTTGGCATGTTAACAATGCAACTAATAAAGCCACGTATAAACCAAATACCTGGTGTATACGTTGTCTTTGGAGCACAAAACCAGTTGAAACATCAAATTCGTTTGATGTACTGAAGTCAGAGGACGCGCAGGGAATGGGTAATCTTGCCTGCGAAGATCTAAAACCAGTCTCTGAAGAAGTAGTGGAAAATCCTACCATACAGAAAGACGTTCTTGAGTGTAATGTGAAAACTACCGAAGTTGTAGGAGACATTATACTTAAACCAGCA ++SRR30635841.2 VL00553:3:AACM5FTM5:1:1101:34648:1133 length=301 +CCCCCCCCC5CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC5CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC5CCCCCCCCCCCC5CCCCCCCCCCCCCCCCCC5CCCCC*C5CCCCC* +@SRR30635841.3 VL00553:3:AACM5FTM5:1:1101:54190:1133 length=301 +TCATTGCTAGAAAACTCATCTGAGATATTGAGTGTTGGGTATAAGCCAGTAATTCTAACATAGTGCTCTTGTGGCACTAGTGTAGGTGCACTTAATGGCATTACTGTATGTGATGTCAGCACAAAATAATCACCAACATTTAATTTGTAAGTTGTTGTACCTCGGTAAACAACAGCATCACCATAGTCACCTTTTTCAAAGGTGTACTCTCCTATTTGTACTTTACTGTTTTTAGTTACACGATAACCAGTAAAGACATAATTTCGGTTAAGTGGTGGTCTAGGTTTACCAACTTCCCATG ++SRR30635841.3 VL00553:3:AACM5FTM5:1:1101:54190:1133 length=301 
+5CCCCCCCCC5CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC5CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC*CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC*CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC5CCCCCCCCCCCC**CCCCCC5C +@SRR30635841.4 VL00553:3:AACM5FTM5:1:1101:54606:1133 length=301 +TGTGGTTTGAGTGAATATGACATAGTCATATTCTGAGCCCTGTGATGAATCAACAGTTTGAGTTGGTAGTCCCAAAATCTTTGAGGCTACAGCATTCTGTGAATTATAAGGTGAAATAAAGACAGCTTTTCTCCAAGCAGGGTTACGTGTAAGGAATTCTCTTACCACGCCTATTTGTGGCCTGTTAATTGCAGATGAAACATCATGCGTGATAACACCCTTATAAAACATTTTAAAGCATTGAGCTGATTTGTCTTTATGTGCTTTAAGCTTATTATCATAAACCAAAGCACTCACAGTG ++SRR30635841.4 VL00553:3:AACM5FTM5:1:1101:54606:1133 length=301 +5C5CCCCC5CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC5CCCCCCCCCCCCCCCCCC*CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC5CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC5CCCCCCCCCCCCCCCCCCC*CCCCCC*CCCCCCC5CCCCCCCCC +@SRR30635841.5 VL00553:3:AACM5FTM5:1:1101:51596:1151 length=301 +GCCAAAGCCTCATTATTATTCTTACAAAGTTTATACTCTGCAAGAAGTAGACTAAAGCATAAAGATAGAGAAAAGGGGCTTCAAGGCCAGCAGCAACGAGCAAAAGGTGTGAGTAAACTGTTACAAACAACAACAGCAAGTTGCAAACAAAGTGAACACCCTTGGAGAGTGCTAGTTGCCATCTCTTTTTGAGAGTTATGATTTTGGAAGCGCTCTGAAAAACAGCAAGAAGTGCAACGCCAACAATAAGCCATCCGAAAGGGAGTGAGGCTTGTATCGGTATCGTTGCAGTAGCGCGAAC ++SRR30635841.5 VL00553:3:AACM5FTM5:1:1101:51596:1151 length=301 +CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC5CCCCC5CCCCCCCCCCCCCCCCCCC*CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC*5CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC5 \ No newline at end of file diff --git a/tests/data/inputs_outputs/synthetic.fastq b/tests/data/inputs_outputs/synthetic.fastq new file mode 100644 index 0000000..6e72dfb --- /dev/null +++ 
b/tests/data/inputs_outputs/synthetic.fastq @@ -0,0 +1,8 @@ +@read_number_1_first_120_of_ref +GGAAATTCATTCTAGGGAGTGACGTGGACCCCGGATTGATACAGGATCACATGTAGAAAAGGTAGTCGGACAAGTTACCGCTACCCTCGACCTCGTGGGGCCTACACCTGACCAGGAGCC ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@read_number_2_last_120_of_ref +CTACCCTCGACCTCGTGGGGCCTACACCTGACCAGGAGCCGCACTGACAGGACCACGCTTCATCATAACTTTGGCGGCTGGGCAACGGATTTAATGGTACATAACTCATCATTCTACGTA ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII \ No newline at end of file diff --git a/tests/data/inputs_outputs/too-short.fastq b/tests/data/inputs_outputs/too-short.fastq new file mode 100644 index 0000000..3ddd8fa --- /dev/null +++ b/tests/data/inputs_outputs/too-short.fastq @@ -0,0 +1,8 @@ +@read_number_1_first_120_of_ref +GGAAATTCATTCTAGGGAGTGACGTGGACCCCGGATTGAT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@read_number_2_last_120_of_ref +CTACCCTCGACCTCGTGGGGCCTACACCTGACCAGGAGCC ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII \ No newline at end of file diff --git a/tests/data/inputs_outputs/wrong_format.fastq b/tests/data/inputs_outputs/wrong_format.fastq new file mode 100644 index 0000000..98588fe --- /dev/null +++ b/tests/data/inputs_outputs/wrong_format.fastq @@ -0,0 +1,6 @@ ++read_number_1_first_120_of_ref +GGAAATTCATTCTAGGGAGTGACGTGGACCCCGGATTGATACAGGATCACATGTAGAAAAGGTAGTCGGACAAGTTACCGCTACCCTCGACCTCGTGGGGCCTACACCTGACCAGGAGCC +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII ++read_number_2_last_120_of_ref +CTACCCTCGACCTCGTGGGGCCTACACCTGACCAGGAGCCGCACTGACAGGACCACGCTTCATCATAACTTTGGCGGCTGGGCAACGGATTTAATGGTACATAACTCATCATTCTACGTA 
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII \ No newline at end of file diff --git a/tests/data/primers/ARTIC-V5.3.2.fasta b/tests/data/primers/ARTIC-V5.3.2.fasta new file mode 100644 index 0000000..4153564 --- /dev/null +++ b/tests/data/primers/ARTIC-V5.3.2.fasta @@ -0,0 +1,384 @@ +>MN908947.3:47-78_LEFT +CTCTTGTAGATCTGTTCTCTAAACGAACTTT +>MN908947.3:419-447_RIGHT +GCTTAGTAGAAGTTGAAAAAGGCGTTTT +>MN908947.3:344-366_LEFT +TCGTACGTGGCTTTGGAGACTC +>MN908947.3:707-732_RIGHT +AGCTTGGCACTGATCCTTATGAAGA +>MN908947.3:638-661_LEFT +AGAACGGTAATAAAGGAGCTGGT +>MN908947.3:1018-1047_RIGHT +CAGACACCTTTTGAAATTAAATTGGCAAA +>MN908947.3:970-995_LEFT +CATGAAATTGCTTGGTACACGGAAC +>MN908947.3:1340-1370_RIGHT +ACTTACCCCAAAATGCTGTTGTTAAAATTT +>MN908947.3:1292-1320_LEFT +TTTGTGGCACTGAGAATTTGACTAAAGA +>MN908947.3:1660-1692_RIGHT +GACTTTAAACTTAATGAAGAGATCGCCATTAT +>MN908947.3:1574-1596_LEFT +GTGTTGTTGGAGAAGGTTCCGA +>MN908947.3:1945-1972_RIGHT +GTTTTACAGAAGGCCGCTATAACAATA +>MN908947.3:1882-1905_LEFT +GCTGCTCGTGTTGTACGATCAAT +>MN908947.3:2259-2284_RIGHT +CACCTGTGCAAAGGAAATTAAGGAG +>MN908947.3:2229-2252_LEFT +TGCTTGTGAAATTGTCGGTGGAC +>MN908947.3:2603-2629_RIGHT +GTACACCAGTTTGTATTAACGGGCTT +>MN908947.3:2533-2563_LEFT +GTCTTGAAAACTGGTGATTTACAACCATTA +>MN908947.3:2900-2933_RIGHT +TCATAAAAACTTTGCAACCAGTATCTGAATTAC +>MN908947.3:2854-2880_LEFT +CTCGGTACAGAAGTAAATGAGTTCGC +>MN908947.3:3233-3254_RIGHT +AACAAGACGGCAGTGAGGACA +>MN908947.3:3184-3213_LEFT +GAGCAAGAAGAAGATTGGTTAGATGATGA +>MN908947.3:3560-3584_RIGHT +TGGGTGGTAGTTGTGTTTTAAGCG +>MN908947.3:3510-3540_LEFT +CATGCAAGTTGAATCTGATGATTACATAGC +>MN908947.3:3883-3913_RIGHT +CCTAAAGAGGAAGTTAAGCCATTTATAACT +>MN908947.3:3791-3824_LEFT +CTGTCTTTGATAAAAATCTCTATGACAAACTTG +>MN908947.3:4147-4180_RIGHT +GTTTTAACTGCTGTGGTTATACCTACTAAAAAG +>MN908947.3:4079-4108_LEFT +GTGACATTGACATCACTTTCTTAAAGAAA +>MN908947.3:4457-4488_RIGHT +TAGTTTCAACTATACAGCGTAAATATAAGGG +>MN908947.3:4403-4425_LEFT 
+CACATGCAGAAGAAACACGCAA +>MN908947.3:4776-4803_RIGHT +CATCTCACTTGCTGGTTCCTATAAAGA +>MN908947.3:4723-4756_LEFT +AATGGTTATCTTACTTCTTCTTCTAAAACACCT +>MN908947.3:5089-5119_RIGHT +CCTCATAATTCACATGAAGGTAAAACATTT +>MN908947.3:5036-5063_LEFT +GACAACAGTTTGGTCCAACTTATTTGG +>MN908947.3:5398-5429_RIGHT +GCACTTATCTTAGCCTACTGTAATAAGACAG +>MN908947.3:5344-5370_LEFT +GCTCTACAAGATGCTTATTACAGAGC +>MN908947.3:5716-5744_RIGHT +CATGGTACATTTACTTGTGCTAGTGAGT +>MN908947.3:5671-5696_LEFT +CCTTTTGTTATGATGTCAGCACCAC +>MN908947.3:6031-6062_RIGHT +GCAAGCTTCGATAATTTTAAGTTTGTATGTG +>MN908947.3:5891-5923_LEFT +CCATAAAACCAGTTACTTATAAATTGGATGGT +>MN908947.3:6257-6288_RIGHT +CTAATAAAGCCACGTATAAACCAAATACCTG +>MN908947.3:6204-6237_LEFT +GAAAGGAGCTAAATTGTTACATAAACCTATTGT +>MN908947.3:6562-6595_RIGHT +GTAGACAATTCTAGTCTTACTATTAAGAAACCT +>MN908947.3:6515-6542_LEFT +TAAAAATTACAGAAGAGGTTGGCCACA +>MN908947.3:6882-6915_RIGHT +CGGTAAATTTTGTCTAGAGGCTTCATTTAATTA +>MN908947.3:6823-6854_LEFT +AATTCTAGAATTAAAGCATCTATGCCGACTA +>MN908947.3:7199-7229_RIGHT +CATCTTTTAAATGGGATTTAACTGCTTTTG +>MN908947.3:7145-7179_LEFT +GTTTAGATTCTTTAGACACCTATCCTTCTTTAGA +>MN908947.3:7518-7545_RIGHT +AACAAGAGTCGAATGTACAACTATTGT +>MN908947.3:7456-7482_LEFT +GTGCATGTTGTAGACGGTTGTAATTC +>MN908947.3:7819-7850_RIGHT +TCTCATTTTGTTAACTTAGACAACCTGAGAG +>MN908947.3:7768-7797_LEFT +CATCTTTACTTTGATAAAGCTGGTCAAAA +>MN908947.3:8136-8169_RIGHT +GTCCTTAGACAATGTCTTATCTACTTTTATTTC +>MN908947.3:8085-8112_LEFT +AAAACTCAAAACACTAGTTGCAACTGC +>MN908947.3:8468-8498_RIGHT +AGAATAACTTACCTTTTAAGTTGACATGTG +>MN908947.3:8406-8436_LEFT +CGTTAAAGATTTCATGTCATTGTCTGAACA +>MN908947.3:8781-8806_RIGHT +CCAGCGTGGTGGTAGTTATACTAAT +>MN908947.3:8732-8761_LEFT +CAGATACTTGTTTTGCTAACAAACATGCT +>MN908947.3:9107-9129_RIGHT +TACGCCCTGACACACGTTATGT +>MN908947.3:9023-9052_LEFT +CAATTTTTAAAGATGCTTCTGGTAAGCCA +>MN908947.3:9397-9423_RIGHT +GCATCTATAGTAGCTGGTGGTATTGT +>MN908947.3:9299-9324_LEFT +GATCTTTACCAGGAGTTTTCTGTGG +>MN908947.3:9673-9706_RIGHT +CCTTTCTGGATAACAATTGCTTATATCATTTGT 
+>MN908947.3:9571-9604_LEFT +GGTGTTTATTCTGTTATTTACTTGTACTTGACA +>MN908947.3:9949-9971_RIGHT +GCTGCTTGTTGTCATCTCGCAA +>MN908947.3:9896-9929_LEFT +ATAATAAGTACAAGTATTTTAGTGGAGCAATGG +>MN908947.3:10266-10295_RIGHT +TAATGTTCAACTCAGGGTTATTGGACATT +>MN908947.3:10215-10245_LEFT +TGAAGATTTACTCATTCGTAAGTCTAATCA +>MN908947.3:10587-10615_RIGHT +AGGTAACTTTTATGGACCTTTTGTTGAC +>MN908947.3:10527-10557_LEFT +TTTTTGTTACATGCACCATATGGAATTACC +>MN908947.3:10897-10927_RIGHT +TTGGGTAGTGCTTTATTAGAAGATGAATTT +>MN908947.3:10832-10865_LEFT +CCGTTTTAGATATGTGTGCTTCATTAAAAGAAT +>MN908947.3:11201-11232_RIGHT +CTGTAGCTTATTTTAATATGGTCTATATGCC +>MN908947.3:11152-11181_LEFT +AAACATAAGCATGCATTTCTCTGTTTGTT +>MN908947.3:11514-11536_RIGHT +TGTCATGTTTTTGGCCAGAGGT +>MN908947.3:11463-11494_LEFT +GTGGGCTCTTATAATCTCTGTTACTTCTAAC +>MN908947.3:11832-11863_RIGHT +CACTGTACAGTCTAAAATGTCAGATGTAAAG +>MN908947.3:11785-11811_LEFT +AACATTAAATTGTTGGGTGTTGGTGG +>MN908947.3:12161-12185_RIGHT +AGGCTGTTGCTAATGGTGATTCTG +>MN908947.3:12112-12137_LEFT +TCCCTTCCATCATATGCAGCTTTTG +>MN908947.3:12477-12510_RIGHT +GGTTGTCATACCAGACTATAACACATATAAAAA +>MN908947.3:12419-12444_LEFT +CAAGAGATGGTTGTGTTCCCTTGAA +>MN908947.3:12794-12819_RIGHT +GAGGTAGGTTTGTACTTGCACTGTT +>MN908947.3:12752-12774_LEFT +GCACTGATGACAATGCGTTAGC +>MN908947.3:13121-13146_RIGHT +GTGGGGGACAACCAATCACTAATTG +>MN908947.3:13075-13099_LEFT +GCTTTTGCTGTAGATGCTGCTAAA +>MN908947.3:13458-13480_RIGHT +GTTTTTAAACGGGTTTGCGGTG +>MN908947.3:13415-13435_LEFT +ATCAACTCCGCGAACCCATG +>MN908947.3:13787-13815_RIGHT +TCAACGTCTTACTAAATACACAATGGCA +>MN908947.3:13738-13767_LEFT +ACTTCTTTAAGTTTAGAATAGACGGTGAC +>MN908947.3:14120-14144_RIGHT +AGGTAGTGGAGTTCCTGTTGTAGA +>MN908947.3:14073-14100_LEFT +CTCAATGGTAACTGGTATGATTTCGGT +>MN908947.3:14427-14457_RIGHT +GTGAGAAAAATATTTGTTGATGGTGTTCCA +>MN908947.3:14375-14407_LEFT +CTTTAATGTTTTATTCTCTACAGTGTTCCCAC +>MN908947.3:14745-14775_RIGHT +GAATTAAAACACTTCTTCTTTGCTCAGGAT +>MN908947.3:14700-14725_LEFT +GACTTTGCTGTGTCTAAGGGTTTCT 
+>MN908947.3:15065-15095_RIGHT +GAATCTTAAGTATGCCATTAGTGCAAAGAA +>MN908947.3:15016-15045_LEFT +CACTTTTCGCATATACAAAACGTAATGTC +>MN908947.3:15386-15416_RIGHT +ACACCGTTTCTATAGATTAGCTAATGAGTG +>MN908947.3:15342-15366_LEFT +TCACTTGTTCTTGCTCGCAAACAT +>MN908947.3:15716-15742_RIGHT +TGACGATGCTGTTGTGTGTTTCAATA +>MN908947.3:15659-15688_LEFT +CTTTGTGAATGAGTTTTACGCATATTTGC +>MN908947.3:16028-16059_RIGHT +TATAGATGCTTACCCACTTACTAAACATCCT +>MN908947.3:15992-16018_LEFT +TGGTACACTTATGATTGAACGGTTCG +>MN908947.3:16386-16409_RIGHT +AATGCTCCAGGTTGTGATGTCAC +>MN908947.3:16285-16311_LEFT +GTGCTTGCATACGTAGACCATTCTTA +>MN908947.3:16650-16679_RIGHT +AAAGCTACTGAGGAGACATTTAAACTGTC +>MN908947.3:16624-16647_LEFT +TCAAGCTTTTTGCAGCAGAAACG +>MN908947.3:17004-17033_RIGHT +AATATCTCAGATGAGTTTTCTAGCAATGT +>MN908947.3:16962-16994_LEFT +CAAGAGCACTATGTTAGAATTACTGGCTTATA +>MN908947.3:17333-17362_RIGHT +GACAGCAGATATAGTTGTCTTTGATGAAA +>MN908947.3:17182-17212_LEFT +CACTATGTGAGAAGGCATTAAAATATTTGC +>MN908947.3:17560-17582_RIGHT +GGCGTTGTCCTGCTGAAATTGT +>MN908947.3:17478-17507_LEFT +GGCACACTAGAACCAGAATATTTCAATTC +>MN908947.3:17859-17886_RIGHT +GACTATGTCATATTCACTCAAACCACT +>MN908947.3:17813-17839_LEFT +GGGACTACCAACTCAAACTGTTGATT +>MN908947.3:18181-18212_RIGHT +ACATGACCTATAGAAGACTCATCTCTATGAT +>MN908947.3:18121-18153_LEFT +GTGTTGACACTAAATTCAAAACTGAAGGTTTA +>MN908947.3:18504-18527_RIGHT +GGACTTCCTTGGAATGTAGTGCG +>MN908947.3:18460-18484_LEFT +CGCCTGGAGATCAATTTAAACACC +>MN908947.3:18835-18860_RIGHT +ATGCACATGTAGCTAGTTGTGATGC +>MN908947.3:18789-18815_LEFT +GGTAACCTACAAAGCAACCATGATCT +>MN908947.3:19170-19195_RIGHT +ACAGATGGTGTATGCCTATTTTGGA +>MN908947.3:19087-19112_LEFT +TCTATGATGCACAGCCTTGTAGTGA +>MN908947.3:19469-19495_RIGHT +CAATTTAGGTGGTGCTGTCTGTAGAC +>MN908947.3:19415-19449_LEFT +AGTGTCAGATATAGATTATGTACCACTAAAGTCT +>MN908947.3:19770-19796_RIGHT +GTTAATGTAGCATTTGAGCTTTGGGC +>MN908947.3:19721-19750_LEFT +AGTTGATGGTGTTGATGTAGAATTGTTTG +>MN908947.3:20091-20121_RIGHT +AAACAAGCTAGTCTTAATGGAGTCACATTA 
+>MN908947.3:20028-20054_LEFT +GCCCGTAATGGTGTTCTTATTACAGA +>MN908947.3:20408-20441_RIGHT +TGAATTAGAAGATTTTATTCCTATGGACAGTAC +>MN908947.3:20358-20388_LEFT +GGTTTACATCTACTGATTGGACTAGCTAAA +>MN908947.3:20729-20758_RIGHT +GTGTGACCTTCAAAATTATGGTGATAGTG +>MN908947.3:20650-20676_LEFT +AATTACAATCTAGTCAAGCGTGGCAA +>MN908947.3:21018-21051_RIGHT +GCTAATAAATGGGATCTCATTATTAGTGATATG +>MN908947.3:20991-21018_LEFT +ATTGGTGATTGTGCAACTGTACATACA +>MN908947.3:21372-21402_RIGHT +TTGTCTTCCTATTCTTTATTTGACATGAGT +>MN908947.3:21322-21352_LEFT +ATGTCATGCATGCAAATTACATATTTTGGA +>MN908947.3:21696-21722_RIGHT +CTCAGTTTTACATTCAACTCAGGACT +>MN908947.3:21579-21607_LEFT +TTTATTGCCACTAGTCTCTAGTCAGTGT +>MN908947.3:21927-21960_RIGHT +CGCTACTAATGTTGTTATTAAAGTCTGTGAATT +>MN908947.3:21866-21894_LEFT +GAGGCTGGATTTTTGGTACTACTTTAGA +>MN908947.3:22238-22266_RIGHT +TGGTAGATTTGCCAATAGGTATTAACAT +>MN908947.3:22156-22189_LEFT +GGTTATTTTAAAATATATTCTAAGCACACGCCT +>MN908947.3:22517-22547_RIGHT +GAGTCCAACCAACAGAATCTATTGTTAGAT +>MN908947.3:22466-22494_LEFT +CGTTGAAATCCTTCACTGTAGAAAAAGG +>MN908947.3:22839-22866_RIGHT +AGATGATTTTACAGGCTGCGTTATAGC +>MN908947.3:22742-22774_LEFT +ATGTCTATGCAGATTCATTTGTAATTAGAGGT +>MN908947.3:23119-23140_RIGHT +GCACCAGCAACTGTTTGTGGA +>MN908947.3:23078-23109_LEFT +AACCATACAGAGTAGTAGTACTTTCTTTTGA +>MN908947.3:23452-23478_RIGHT +CCTACTTGGCGTGTTTATTCTACAGG +>MN908947.3:23229-23258_LEFT +CAAAAAGTTTCTGCCTTTCCAACAATTTG +>MN908947.3:23609-23631_RIGHT +GGGCACGTAGTGTAGCTAGTCA +>MN908947.3:23563-23589_LEFT +GCAGGTATATGCGCTAGTTATCAGAC +>MN908947.3:23914-23944_RIGHT +GTCAAACAAATTTACAAAACACCACCAATT +>MN908947.3:23823-23853_LEFT +GCAATATGGCAGTTTTTGTACACAATTAAA +>MN908947.3:24209-24231_RIGHT +CTTCTGGTTGGACCTTTGGTGC +>MN908947.3:24160-24189_LEFT +GATGAAATGATTGCTCAATACACTTCTGC +>MN908947.3:24535-24560_RIGHT +CAAATTGATAGGTTGATCACAGGCA +>MN908947.3:24442-24468_LEFT +ACGCTTGTTAAACAACTTAGCTCCAA +>MN908947.3:24815-24839_RIGHT +GAAAAGCACACTTTCCTCGTGAAG +>MN908947.3:24751-24774_LEFT +CATGTGACTTATGTCCCTGCACA 
+>MN908947.3:25120-25151_RIGHT +AATGAGGTTGCCAAGAATTTAAATGAATCTC +>MN908947.3:25053-25082_LEFT +TGATTTAGGTGACATCTCTGGCATTAATG +>MN908947.3:25423-25452_RIGHT +GAACTGTAACTTTGAAGCAAGGTGAAATC +>MN908947.3:25372-25402_LEFT +CATTACACATAAACGAACTTATGGATTTGT +>MN908947.3:25744-25777_RIGHT +TAAACTTTGTAAGAATAATAATGAGGCTTTGGC +>MN908947.3:25653-25680_LEFT +GTAACAGTTTACTCACACCTTTTGCTC +>MN908947.3:26048-26072_RIGHT +GAGTACAGACACTGGTGTTGAACA +>MN908947.3:26011-26039_LEFT +TCACTTCAGACTATTACCAGCTGTACTC +>MN908947.3:26382-26411_RIGHT +GTTAACGTGAGTCTTGTAAAACCTTCTTT +>MN908947.3:26339-26362_LEFT +CATCCTTACTGCGCTTCGATTGT +>MN908947.3:26730-26756_RIGHT +TTTACAGAATAAATTGGATCACCGGT +>MN908947.3:26593-26621_LEFT +AGGTTTCCTATTCCTTACATGGATTTGT +>MN908947.3:26989-27009_RIGHT +AGGACGCTGTGACATCAAGG +>MN908947.3:26958-26981_LEFT +GTGGACATCTTCGTATTGCTGGA +>MN908947.3:27349-27376_RIGHT +CTCAATTAGATGAAGAGCAACCAATGG +>MN908947.3:27200-27226_LEFT +GATGTTTCATCTCGTTGACTTTCAGG +>MN908947.3:27583-27603_RIGHT +CTTTTGCTTGTCCTGACGGC +>MN908947.3:27530-27558_LEFT +TCATCCTCTAGCTGATAACAAATTTGCA +>MN908947.3:27927-27950_RIGHT +CTGTAGCTGCATTTCACCAAGAA +>MN908947.3:27832-27860_LEFT +TATCTTTTGGTTCTCACTTGAACTGCAA +>MN908947.3:28209-28237_RIGHT +AAGACTTTTTAGAGTATCATGACGTTCG +>MN908947.3:28135-28166_LEFT +TTCCTGTTTACCTTTTACAATTAATTGCCAG +>MN908947.3:28513-28539_RIGHT +GATGACCAAATTGGCTACTACCGAAG +>MN908947.3:28473-28493_LEFT +TCGAGGACAAGGCGTTCCAA +>MN908947.3:28849-28873_RIGHT +AGTTCAAGAAATTCAACTCCAGGC +>MN908947.3:28808-28829_LEFT +GCAGTCAAGCCTCTTCTCGTT +>MN908947.3:29203-29224_RIGHT +GCTTCAGCGTTCTTCGGAATG +>MN908947.3:29159-29183_LEFT +CTGATTACAAACATTGGCCGCAAA +>MN908947.3:29538-29559_RIGHT +TGCAGACCACACAAGGCAGAT +>MN908947.3:29462-29486_LEFT +CTGCAGATTTGGATGATTTCTCCA +>MN908947.3:29840-29873_RIGHT +GTGATTTTAATAGCTTCTTAGGAGAATGACAAA diff --git a/tests/data/primers/SARS-CoV-2-ARTIC-V5.3.2.scheme.bed b/tests/data/primers/SARS-CoV-2-ARTIC-V5.3.2.scheme.bed new file mode 100644 index 0000000..2da0b06 --- /dev/null +++ 
b/tests/data/primers/SARS-CoV-2-ARTIC-V5.3.2.scheme.bed @@ -0,0 +1,192 @@ +MN908947.3 47 78 SARS-CoV-2_400_1_LEFT_1 1 + +MN908947.3 419 447 SARS-CoV-2_400_1_RIGHT_1 1 - +MN908947.3 344 366 SARS-CoV-2_400_2_LEFT_0 2 + +MN908947.3 707 732 SARS-CoV-2_400_2_RIGHT_0 2 - +MN908947.3 638 661 SARS-CoV-2_400_3_LEFT_1 1 + +MN908947.3 1018 1047 SARS-CoV-2_400_3_RIGHT_0 1 - +MN908947.3 970 995 SARS-CoV-2_400_4_LEFT_0 2 + +MN908947.3 1340 1370 SARS-CoV-2_400_4_RIGHT_0 2 - +MN908947.3 1292 1320 SARS-CoV-2_400_5_LEFT_0 1 + +MN908947.3 1660 1692 SARS-CoV-2_400_5_RIGHT_0 1 - +MN908947.3 1574 1596 SARS-CoV-2_400_6_LEFT_1 2 + +MN908947.3 1945 1972 SARS-CoV-2_400_6_RIGHT_1 2 - +MN908947.3 1882 1905 SARS-CoV-2_400_7_LEFT_2 1 + +MN908947.3 2259 2284 SARS-CoV-2_400_7_RIGHT_2 1 - +MN908947.3 2229 2252 SARS-CoV-2_400_8_LEFT_0 2 + +MN908947.3 2603 2629 SARS-CoV-2_400_8_RIGHT_0 2 - +MN908947.3 2533 2563 SARS-CoV-2_400_9_LEFT_0 1 + +MN908947.3 2900 2933 SARS-CoV-2_400_9_RIGHT_0 1 - +MN908947.3 2854 2880 SARS-CoV-2_400_10_LEFT_0 2 + +MN908947.3 3233 3254 SARS-CoV-2_400_10_RIGHT_0 2 - +MN908947.3 3184 3213 SARS-CoV-2_400_11_LEFT_0 1 + +MN908947.3 3560 3584 SARS-CoV-2_400_11_RIGHT_0 1 - +MN908947.3 3510 3540 SARS-CoV-2_400_12_LEFT_0 2 + +MN908947.3 3883 3913 SARS-CoV-2_400_12_RIGHT_0 2 - +MN908947.3 3791 3824 SARS-CoV-2_400_13_LEFT_0 1 + +MN908947.3 4147 4180 SARS-CoV-2_400_13_RIGHT_0 1 - +MN908947.3 4079 4108 SARS-CoV-2_400_14_LEFT_0 2 + +MN908947.3 4457 4488 SARS-CoV-2_400_14_RIGHT_0 2 - +MN908947.3 4403 4425 SARS-CoV-2_400_15_LEFT_0 1 + +MN908947.3 4776 4803 SARS-CoV-2_400_15_RIGHT_0 1 - +MN908947.3 4723 4756 SARS-CoV-2_400_16_LEFT_0 2 + +MN908947.3 5089 5119 SARS-CoV-2_400_16_RIGHT_0 2 - +MN908947.3 5036 5063 SARS-CoV-2_400_17_LEFT_0 1 + +MN908947.3 5398 5429 SARS-CoV-2_400_17_RIGHT_0 1 - +MN908947.3 5344 5370 SARS-CoV-2_400_18_LEFT_0 2 + +MN908947.3 5716 5744 SARS-CoV-2_400_18_RIGHT_0 2 - +MN908947.3 5671 5696 SARS-CoV-2_400_19_LEFT_0 1 + +MN908947.3 6031 6062 SARS-CoV-2_400_19_RIGHT_0 1 - 
+MN908947.3 5891 5923 SARS-CoV-2_400_20_LEFT_0 2 + +MN908947.3 6257 6288 SARS-CoV-2_400_20_RIGHT_0 2 - +MN908947.3 6204 6237 SARS-CoV-2_400_21_LEFT_0 1 + +MN908947.3 6562 6595 SARS-CoV-2_400_21_RIGHT_0 1 - +MN908947.3 6515 6542 SARS-CoV-2_400_22_LEFT_0 2 + +MN908947.3 6882 6915 SARS-CoV-2_400_22_RIGHT_0 2 - +MN908947.3 6823 6854 SARS-CoV-2_400_23_LEFT_0 1 + +MN908947.3 7199 7229 SARS-CoV-2_400_23_RIGHT_0 1 - +MN908947.3 7145 7179 SARS-CoV-2_400_24_LEFT_0 2 + +MN908947.3 7518 7545 SARS-CoV-2_400_24_RIGHT_0 2 - +MN908947.3 7456 7482 SARS-CoV-2_400_25_LEFT_0 1 + +MN908947.3 7819 7850 SARS-CoV-2_400_25_RIGHT_0 1 - +MN908947.3 7768 7797 SARS-CoV-2_400_26_LEFT_0 2 + +MN908947.3 8136 8169 SARS-CoV-2_400_26_RIGHT_0 2 - +MN908947.3 8085 8112 SARS-CoV-2_400_27_LEFT_0 1 + +MN908947.3 8468 8498 SARS-CoV-2_400_27_RIGHT_0 1 - +MN908947.3 8406 8436 SARS-CoV-2_400_28_LEFT_0 2 + +MN908947.3 8781 8806 SARS-CoV-2_400_28_RIGHT_0 2 - +MN908947.3 8732 8761 SARS-CoV-2_400_29_LEFT_0 1 + +MN908947.3 9107 9129 SARS-CoV-2_400_29_RIGHT_0 1 - +MN908947.3 9023 9052 SARS-CoV-2_400_30_LEFT_0 2 + +MN908947.3 9397 9423 SARS-CoV-2_400_30_RIGHT_0 2 - +MN908947.3 9299 9324 SARS-CoV-2_400_31_LEFT_1 1 + +MN908947.3 9673 9706 SARS-CoV-2_400_31_RIGHT_0 1 - +MN908947.3 9571 9604 SARS-CoV-2_400_32_LEFT_0 2 + +MN908947.3 9949 9971 SARS-CoV-2_400_32_RIGHT_0 2 - +MN908947.3 9896 9929 SARS-CoV-2_400_33_LEFT_0 1 + +MN908947.3 10266 10295 SARS-CoV-2_400_33_RIGHT_0 1 - +MN908947.3 10215 10245 SARS-CoV-2_400_34_LEFT_0 2 + +MN908947.3 10587 10615 SARS-CoV-2_400_34_RIGHT_0 2 - +MN908947.3 10527 10557 SARS-CoV-2_400_35_LEFT_0 1 + +MN908947.3 10897 10927 SARS-CoV-2_400_35_RIGHT_0 1 - +MN908947.3 10832 10865 SARS-CoV-2_400_36_LEFT_0 2 + +MN908947.3 11201 11232 SARS-CoV-2_400_36_RIGHT_0 2 - +MN908947.3 11152 11181 SARS-CoV-2_400_37_LEFT_0 1 + +MN908947.3 11514 11536 SARS-CoV-2_400_37_RIGHT_0 1 - +MN908947.3 11463 11494 SARS-CoV-2_400_38_LEFT_0 2 + +MN908947.3 11832 11863 SARS-CoV-2_400_38_RIGHT_0 2 - +MN908947.3 11785 
11811 SARS-CoV-2_400_39_LEFT_0 1 + +MN908947.3 12161 12185 SARS-CoV-2_400_39_RIGHT_0 1 - +MN908947.3 12112 12137 SARS-CoV-2_400_40_LEFT_0 2 + +MN908947.3 12477 12510 SARS-CoV-2_400_40_RIGHT_0 2 - +MN908947.3 12419 12444 SARS-CoV-2_400_41_LEFT_0 1 + +MN908947.3 12794 12819 SARS-CoV-2_400_41_RIGHT_0 1 - +MN908947.3 12752 12774 SARS-CoV-2_400_42_LEFT_0 2 + +MN908947.3 13121 13146 SARS-CoV-2_400_42_RIGHT_0 2 - +MN908947.3 13075 13099 SARS-CoV-2_400_43_LEFT_0 1 + +MN908947.3 13458 13480 SARS-CoV-2_400_43_RIGHT_0 1 - +MN908947.3 13415 13435 SARS-CoV-2_400_44_LEFT_0 2 + +MN908947.3 13787 13815 SARS-CoV-2_400_44_RIGHT_0 2 - +MN908947.3 13738 13767 SARS-CoV-2_400_45_LEFT_0 1 + +MN908947.3 14120 14144 SARS-CoV-2_400_45_RIGHT_0 1 - +MN908947.3 14073 14100 SARS-CoV-2_400_46_LEFT_0 2 + +MN908947.3 14427 14457 SARS-CoV-2_400_46_RIGHT_0 2 - +MN908947.3 14375 14407 SARS-CoV-2_400_47_LEFT_0 1 + +MN908947.3 14745 14775 SARS-CoV-2_400_47_RIGHT_0 1 - +MN908947.3 14700 14725 SARS-CoV-2_400_48_LEFT_0 2 + +MN908947.3 15065 15095 SARS-CoV-2_400_48_RIGHT_0 2 - +MN908947.3 15016 15045 SARS-CoV-2_400_49_LEFT_0 1 + +MN908947.3 15386 15416 SARS-CoV-2_400_49_RIGHT_0 1 - +MN908947.3 15342 15366 SARS-CoV-2_400_50_LEFT_0 2 + +MN908947.3 15716 15742 SARS-CoV-2_400_50_RIGHT_0 2 - +MN908947.3 15659 15688 SARS-CoV-2_400_51_LEFT_0 1 + +MN908947.3 16028 16059 SARS-CoV-2_400_51_RIGHT_0 1 - +MN908947.3 15992 16018 SARS-CoV-2_400_52_LEFT_2 2 + +MN908947.3 16386 16409 SARS-CoV-2_400_52_RIGHT_2 2 - +MN908947.3 16285 16311 SARS-CoV-2_400_53_LEFT_0 1 + +MN908947.3 16650 16679 SARS-CoV-2_400_53_RIGHT_0 1 - +MN908947.3 16624 16647 SARS-CoV-2_400_54_LEFT_1 2 + +MN908947.3 17004 17033 SARS-CoV-2_400_54_RIGHT_1 2 - +MN908947.3 16962 16994 SARS-CoV-2_400_55_LEFT_1 1 + +MN908947.3 17333 17362 SARS-CoV-2_400_55_RIGHT_1 1 - +MN908947.3 17182 17212 SARS-CoV-2_400_56_LEFT_0 2 + +MN908947.3 17560 17582 SARS-CoV-2_400_56_RIGHT_0 2 - +MN908947.3 17478 17507 SARS-CoV-2_400_57_LEFT_0 1 + +MN908947.3 17859 17886 
SARS-CoV-2_400_57_RIGHT_0 1 - +MN908947.3 17813 17839 SARS-CoV-2_400_58_LEFT_0 2 + +MN908947.3 18181 18212 SARS-CoV-2_400_58_RIGHT_0 2 - +MN908947.3 18121 18153 SARS-CoV-2_400_59_LEFT_0 1 + +MN908947.3 18504 18527 SARS-CoV-2_400_59_RIGHT_0 1 - +MN908947.3 18460 18484 SARS-CoV-2_400_60_LEFT_0 2 + +MN908947.3 18835 18860 SARS-CoV-2_400_60_RIGHT_0 2 - +MN908947.3 18789 18815 SARS-CoV-2_400_61_LEFT_0 1 + +MN908947.3 19170 19195 SARS-CoV-2_400_61_RIGHT_0 1 - +MN908947.3 19087 19112 SARS-CoV-2_400_62_LEFT_2 2 + +MN908947.3 19469 19495 SARS-CoV-2_400_62_RIGHT_0 2 - +MN908947.3 19415 19449 SARS-CoV-2_400_63_LEFT_0 1 + +MN908947.3 19770 19796 SARS-CoV-2_400_63_RIGHT_0 1 - +MN908947.3 19721 19750 SARS-CoV-2_400_64_LEFT_0 2 + +MN908947.3 20091 20121 SARS-CoV-2_400_64_RIGHT_0 2 - +MN908947.3 20028 20054 SARS-CoV-2_400_65_LEFT_0 1 + +MN908947.3 20408 20441 SARS-CoV-2_400_65_RIGHT_0 1 - +MN908947.3 20358 20388 SARS-CoV-2_400_66_LEFT_0 2 + +MN908947.3 20729 20758 SARS-CoV-2_400_66_RIGHT_0 2 - +MN908947.3 20650 20676 SARS-CoV-2_400_67_LEFT_1 1 + +MN908947.3 21018 21051 SARS-CoV-2_400_67_RIGHT_1 1 - +MN908947.3 20991 21018 SARS-CoV-2_400_68_LEFT_0 2 + +MN908947.3 21372 21402 SARS-CoV-2_400_68_RIGHT_0 2 - +MN908947.3 21322 21352 SARS-CoV-2_400_69_LEFT_0 1 + +MN908947.3 21696 21722 SARS-CoV-2_400_69_RIGHT_0 1 - +MN908947.3 21579 21607 SARS-CoV-2_400_70_LEFT_0 2 + +MN908947.3 21927 21960 SARS-CoV-2_400_70_RIGHT_0 2 - +MN908947.3 21866 21894 SARS-CoV-2_400_71_LEFT_0 1 + +MN908947.3 22238 22266 SARS-CoV-2_400_71_RIGHT_0 1 - +MN908947.3 22156 22189 SARS-CoV-2_400_72_LEFT_0 2 + +MN908947.3 22517 22547 SARS-CoV-2_400_72_RIGHT_0 2 - +MN908947.3 22466 22494 SARS-CoV-2_400_73_LEFT_0 1 + +MN908947.3 22839 22866 SARS-CoV-2_400_73_RIGHT_0 1 - +MN908947.3 22742 22774 SARS-CoV-2_400_74_LEFT_0 2 + +MN908947.3 23119 23140 SARS-CoV-2_400_74_RIGHT_0 2 - +MN908947.3 23078 23109 SARS-CoV-2_400_75_LEFT_1 1 + +MN908947.3 23452 23478 SARS-CoV-2_400_75_RIGHT_1 1 - +MN908947.3 23229 23258 
SARS-CoV-2_400_76_LEFT_0 2 + +MN908947.3 23609 23631 SARS-CoV-2_400_76_RIGHT_0 2 - +MN908947.3 23563 23589 SARS-CoV-2_400_77_LEFT_0 1 + +MN908947.3 23914 23944 SARS-CoV-2_400_77_RIGHT_0 1 - +MN908947.3 23823 23853 SARS-CoV-2_400_78_LEFT_0 2 + +MN908947.3 24209 24231 SARS-CoV-2_400_78_RIGHT_0 2 - +MN908947.3 24160 24189 SARS-CoV-2_400_79_LEFT_0 1 + +MN908947.3 24535 24560 SARS-CoV-2_400_79_RIGHT_0 1 - +MN908947.3 24442 24468 SARS-CoV-2_400_80_LEFT_0 2 + +MN908947.3 24815 24839 SARS-CoV-2_400_80_RIGHT_0 2 - +MN908947.3 24751 24774 SARS-CoV-2_400_81_LEFT_0 1 + +MN908947.3 25120 25151 SARS-CoV-2_400_81_RIGHT_0 1 - +MN908947.3 25053 25082 SARS-CoV-2_400_82_LEFT_0 2 + +MN908947.3 25423 25452 SARS-CoV-2_400_82_RIGHT_0 2 - +MN908947.3 25372 25402 SARS-CoV-2_400_83_LEFT_0 1 + +MN908947.3 25744 25777 SARS-CoV-2_400_83_RIGHT_0 1 - +MN908947.3 25653 25680 SARS-CoV-2_400_84_LEFT_2 2 + +MN908947.3 26048 26072 SARS-CoV-2_400_84_RIGHT_2 2 - +MN908947.3 26011 26039 SARS-CoV-2_400_85_LEFT_0 1 + +MN908947.3 26382 26411 SARS-CoV-2_400_85_RIGHT_0 1 - +MN908947.3 26339 26362 SARS-CoV-2_400_86_LEFT_0 2 + +MN908947.3 26730 26756 SARS-CoV-2_400_86_RIGHT_0 2 - +MN908947.3 26593 26621 SARS-CoV-2_400_87_LEFT_1 1 + +MN908947.3 26989 27009 SARS-CoV-2_400_87_RIGHT_1 1 - +MN908947.3 26958 26981 SARS-CoV-2_400_88_LEFT_2 2 + +MN908947.3 27349 27376 SARS-CoV-2_400_88_RIGHT_2 2 - +MN908947.3 27200 27226 SARS-CoV-2_400_89_LEFT_2 1 + +MN908947.3 27583 27603 SARS-CoV-2_400_89_RIGHT_0 1 - +MN908947.3 27530 27558 SARS-CoV-2_400_90_LEFT_0 2 + +MN908947.3 27927 27950 SARS-CoV-2_400_90_RIGHT_0 2 - +MN908947.3 27832 27860 SARS-CoV-2_400_91_LEFT_0 1 + +MN908947.3 28209 28237 SARS-CoV-2_400_91_RIGHT_0 1 - +MN908947.3 28135 28166 SARS-CoV-2_400_92_LEFT_0 2 + +MN908947.3 28513 28539 SARS-CoV-2_400_92_RIGHT_0 2 - +MN908947.3 28473 28493 SARS-CoV-2_400_93_LEFT_0 1 + +MN908947.3 28849 28873 SARS-CoV-2_400_93_RIGHT_0 1 - +MN908947.3 28808 28829 SARS-CoV-2_400_94_LEFT_0 2 + +MN908947.3 29203 29224 
SARS-CoV-2_400_94_RIGHT_0 2 - +MN908947.3 29159 29183 SARS-CoV-2_400_95_LEFT_0 1 + +MN908947.3 29538 29559 SARS-CoV-2_400_95_RIGHT_0 1 - +MN908947.3 29462 29486 SARS-CoV-2_400_96_LEFT_1 2 + +MN908947.3 29840 29873 SARS-CoV-2_400_96_RIGHT_0 2 - diff --git a/tests/data/primers/empty.fasta b/tests/data/primers/empty.fasta new file mode 100644 index 0000000..e69de29 diff --git a/tests/data/primers/synthetic-no-match.fasta b/tests/data/primers/synthetic-no-match.fasta new file mode 100644 index 0000000..1dece2a --- /dev/null +++ b/tests/data/primers/synthetic-no-match.fasta @@ -0,0 +1,8 @@ +>read_number_1_LEFT +AGCTTAGCTA +>read_number_1_RIGHT +TGCATGCAAT +>read_number_2_LEFT +CGTACGTAGC +>read_number_2_RIGHT +TACGATCGTA \ No newline at end of file diff --git a/tests/data/primers/synthetic.bed b/tests/data/primers/synthetic.bed new file mode 100644 index 0000000..5066d6c --- /dev/null +++ b/tests/data/primers/synthetic.bed @@ -0,0 +1,4 @@ +MN908947.3 0 10 SARS-CoV-2_400_1_LEFT_1 1 + +MN908947.3 110 120 SARS-CoV-2_400_1_RIGHT_1 1 - +MN908947.3 80 90 SARS-CoV-2_400_2_LEFT_1 1 + +MN908947.3 190 200 SARS-CoV-2_400_2_RIGHT_1 1 - \ No newline at end of file diff --git a/tests/data/primers/synthetic.fasta b/tests/data/primers/synthetic.fasta new file mode 100644 index 0000000..54bd943 --- /dev/null +++ b/tests/data/primers/synthetic.fasta @@ -0,0 +1,8 @@ +>read_number_1_LEFT +GGAAATTCAT +>read_number_1_RIGHT +ACCAGGAGCC +>read_number_2_LEFT +CTACCCTCGA +>read_number_2_RIGHT +ATTCTACGTA \ No newline at end of file diff --git a/tests/data/references/SARS-CoV-2-reference.fasta b/tests/data/references/SARS-CoV-2-reference.fasta new file mode 100644 index 0000000..a1da7b7 --- /dev/null +++ b/tests/data/references/SARS-CoV-2-reference.fasta @@ -0,0 +1,429 @@ +>MN908947.3 Severe acute respiratory syndrome coronavirus 2 isolate Wuhan-Hu-1, complete genome +ATTAAAGGTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCTGTTCTCTAAA 
+CGAACTTTAAAATCTGTGTGGCTGTCACTCGGCTGCATGCTTAGTGCACTCACGCAGTATAATTAATAAC +TAATTACTGTCGTTGACAGGACACGAGTAACTCGTCTATCTTCTGCAGGCTGCTTACGGTTTCGTCCGTG +TTGCAGCCGATCATCAGCACATCTAGGTTTCGTCCGGGTGTGACCGAAAGGTAAGATGGAGAGCCTTGTC +CCTGGTTTCAACGAGAAAACACACGTCCAACTCAGTTTGCCTGTTTTACAGGTTCGCGACGTGCTCGTAC +GTGGCTTTGGAGACTCCGTGGAGGAGGTCTTATCAGAGGCACGTCAACATCTTAAAGATGGCACTTGTGG +CTTAGTAGAAGTTGAAAAAGGCGTTTTGCCTCAACTTGAACAGCCCTATGTGTTCATCAAACGTTCGGAT +GCTCGAACTGCACCTCATGGTCATGTTATGGTTGAGCTGGTAGCAGAACTCGAAGGCATTCAGTACGGTC +GTAGTGGTGAGACACTTGGTGTCCTTGTCCCTCATGTGGGCGAAATACCAGTGGCTTACCGCAAGGTTCT +TCTTCGTAAGAACGGTAATAAAGGAGCTGGTGGCCATAGTTACGGCGCCGATCTAAAGTCATTTGACTTA +GGCGACGAGCTTGGCACTGATCCTTATGAAGATTTTCAAGAAAACTGGAACACTAAACATAGCAGTGGTG +TTACCCGTGAACTCATGCGTGAGCTTAACGGAGGGGCATACACTCGCTATGTCGATAACAACTTCTGTGG +CCCTGATGGCTACCCTCTTGAGTGCATTAAAGACCTTCTAGCACGTGCTGGTAAAGCTTCATGCACTTTG +TCCGAACAACTGGACTTTATTGACACTAAGAGGGGTGTATACTGCTGCCGTGAACATGAGCATGAAATTG +CTTGGTACACGGAACGTTCTGAAAAGAGCTATGAATTGCAGACACCTTTTGAAATTAAATTGGCAAAGAA +ATTTGACACCTTCAATGGGGAATGTCCAAATTTTGTATTTCCCTTAAATTCCATAATCAAGACTATTCAA +CCAAGGGTTGAAAAGAAAAAGCTTGATGGCTTTATGGGTAGAATTCGATCTGTCTATCCAGTTGCGTCAC +CAAATGAATGCAACCAAATGTGCCTTTCAACTCTCATGAAGTGTGATCATTGTGGTGAAACTTCATGGCA +GACGGGCGATTTTGTTAAAGCCACTTGCGAATTTTGTGGCACTGAGAATTTGACTAAAGAAGGTGCCACT +ACTTGTGGTTACTTACCCCAAAATGCTGTTGTTAAAATTTATTGTCCAGCATGTCACAATTCAGAAGTAG +GACCTGAGCATAGTCTTGCCGAATACCATAATGAATCTGGCTTGAAAACCATTCTTCGTAAGGGTGGTCG +CACTATTGCCTTTGGAGGCTGTGTGTTCTCTTATGTTGGTTGCCATAACAAGTGTGCCTATTGGGTTCCA +CGTGCTAGCGCTAACATAGGTTGTAACCATACAGGTGTTGTTGGAGAAGGTTCCGAAGGTCTTAATGACA +ACCTTCTTGAAATACTCCAAAAAGAGAAAGTCAACATCAATATTGTTGGTGACTTTAAACTTAATGAAGA +GATCGCCATTATTTTGGCATCTTTTTCTGCTTCCACAAGTGCTTTTGTGGAAACTGTGAAAGGTTTGGAT +TATAAAGCATTCAAACAAATTGTTGAATCCTGTGGTAATTTTAAAGTTACAAAAGGAAAAGCTAAAAAAG +GTGCCTGGAATATTGGTGAACAGAAATCAATACTGAGTCCTCTTTATGCATTTGCATCAGAGGCTGCTCG +TGTTGTACGATCAATTTTCTCCCGCACTCTTGAAACTGCTCAAAATTCTGTGCGTGTTTTACAGAAGGCC 
+GCTATAACAATACTAGATGGAATTTCACAGTATTCACTGAGACTCATTGATGCTATGATGTTCACATCTG +ATTTGGCTACTAACAATCTAGTTGTAATGGCCTACATTACAGGTGGTGTTGTTCAGTTGACTTCGCAGTG +GCTAACTAACATCTTTGGCACTGTTTATGAAAAACTCAAACCCGTCCTTGATTGGCTTGAAGAGAAGTTT +AAGGAAGGTGTAGAGTTTCTTAGAGACGGTTGGGAAATTGTTAAATTTATCTCAACCTGTGCTTGTGAAA +TTGTCGGTGGACAAATTGTCACCTGTGCAAAGGAAATTAAGGAGAGTGTTCAGACATTCTTTAAGCTTGT +AAATAAATTTTTGGCTTTGTGTGCTGACTCTATCATTATTGGTGGAGCTAAACTTAAAGCCTTGAATTTA +GGTGAAACATTTGTCACGCACTCAAAGGGATTGTACAGAAAGTGTGTTAAATCCAGAGAAGAAACTGGCC +TACTCATGCCTCTAAAAGCCCCAAAAGAAATTATCTTCTTAGAGGGAGAAACACTTCCCACAGAAGTGTT +AACAGAGGAAGTTGTCTTGAAAACTGGTGATTTACAACCATTAGAACAACCTACTAGTGAAGCTGTTGAA +GCTCCATTGGTTGGTACACCAGTTTGTATTAACGGGCTTATGTTGCTCGAAATCAAAGACACAGAAAAGT +ACTGTGCCCTTGCACCTAATATGATGGTAACAAACAATACCTTCACACTCAAAGGCGGTGCACCAACAAA +GGTTACTTTTGGTGATGACACTGTGATAGAAGTGCAAGGTTACAAGAGTGTGAATATCACTTTTGAACTT +GATGAAAGGATTGATAAAGTACTTAATGAGAAGTGCTCTGCCTATACAGTTGAACTCGGTACAGAAGTAA +ATGAGTTCGCCTGTGTTGTGGCAGATGCTGTCATAAAAACTTTGCAACCAGTATCTGAATTACTTACACC +ACTGGGCATTGATTTAGATGAGTGGAGTATGGCTACATACTACTTATTTGATGAGTCTGGTGAGTTTAAA +TTGGCTTCACATATGTATTGTTCTTTCTACCCTCCAGATGAGGATGAAGAAGAAGGTGATTGTGAAGAAG +AAGAGTTTGAGCCATCAACTCAATATGAGTATGGTACTGAAGATGATTACCAAGGTAAACCTTTGGAATT +TGGTGCCACTTCTGCTGCTCTTCAACCTGAAGAAGAGCAAGAAGAAGATTGGTTAGATGATGATAGTCAA +CAAACTGTTGGTCAACAAGACGGCAGTGAGGACAATCAGACAACTACTATTCAAACAATTGTTGAGGTTC +AACCTCAATTAGAGATGGAACTTACACCAGTTGTTCAGACTATTGAAGTGAATAGTTTTAGTGGTTATTT +AAAACTTACTGACAATGTATACATTAAAAATGCAGACATTGTGGAAGAAGCTAAAAAGGTAAAACCAACA +GTGGTTGTTAATGCAGCCAATGTTTACCTTAAACATGGAGGAGGTGTTGCAGGAGCCTTAAATAAGGCTA +CTAACAATGCCATGCAAGTTGAATCTGATGATTACATAGCTACTAATGGACCACTTAAAGTGGGTGGTAG +TTGTGTTTTAAGCGGACACAATCTTGCTAAACACTGTCTTCATGTTGTCGGCCCAAATGTTAACAAAGGT +GAAGACATTCAACTTCTTAAGAGTGCTTATGAAAATTTTAATCAGCACGAAGTTCTACTTGCACCATTAT +TATCAGCTGGTATTTTTGGTGCTGACCCTATACATTCTTTAAGAGTTTGTGTAGATACTGTTCGCACAAA +TGTCTACTTAGCTGTCTTTGATAAAAATCTCTATGACAAACTTGTTTCAAGCTTTTTGGAAATGAAGAGT 
+GAAAAGCAAGTTGAACAAAAGATCGCTGAGATTCCTAAAGAGGAAGTTAAGCCATTTATAACTGAAAGTA +AACCTTCAGTTGAACAGAGAAAACAAGATGATAAGAAAATCAAAGCTTGTGTTGAAGAAGTTACAACAAC +TCTGGAAGAAACTAAGTTCCTCACAGAAAACTTGTTACTTTATATTGACATTAATGGCAATCTTCATCCA +GATTCTGCCACTCTTGTTAGTGACATTGACATCACTTTCTTAAAGAAAGATGCTCCATATATAGTGGGTG +ATGTTGTTCAAGAGGGTGTTTTAACTGCTGTGGTTATACCTACTAAAAAGGCTGGTGGCACTACTGAAAT +GCTAGCGAAAGCTTTGAGAAAAGTGCCAACAGACAATTATATAACCACTTACCCGGGTCAGGGTTTAAAT +GGTTACACTGTAGAGGAGGCAAAGACAGTGCTTAAAAAGTGTAAAAGTGCCTTTTACATTCTACCATCTA +TTATCTCTAATGAGAAGCAAGAAATTCTTGGAACTGTTTCTTGGAATTTGCGAGAAATGCTTGCACATGC +AGAAGAAACACGCAAATTAATGCCTGTCTGTGTGGAAACTAAAGCCATAGTTTCAACTATACAGCGTAAA +TATAAGGGTATTAAAATACAAGAGGGTGTGGTTGATTATGGTGCTAGATTTTACTTTTACACCAGTAAAA +CAACTGTAGCGTCACTTATCAACACACTTAACGATCTAAATGAAACTCTTGTTACAATGCCACTTGGCTA +TGTAACACATGGCTTAAATTTGGAAGAAGCTGCTCGGTATATGAGATCTCTCAAAGTGCCAGCTACAGTT +TCTGTTTCTTCACCTGATGCTGTTACAGCGTATAATGGTTATCTTACTTCTTCTTCTAAAACACCTGAAG +AACATTTTATTGAAACCATCTCACTTGCTGGTTCCTATAAAGATTGGTCCTATTCTGGACAATCTACACA +ACTAGGTATAGAATTTCTTAAGAGAGGTGATAAAAGTGTATATTACACTAGTAATCCTACCACATTCCAC +CTAGATGGTGAAGTTATCACCTTTGACAATCTTAAGACACTTCTTTCTTTGAGAGAAGTGAGGACTATTA +AGGTGTTTACAACAGTAGACAACATTAACCTCCACACGCAAGTTGTGGACATGTCAATGACATATGGACA +ACAGTTTGGTCCAACTTATTTGGATGGAGCTGATGTTACTAAAATAAAACCTCATAATTCACATGAAGGT +AAAACATTTTATGTTTTACCTAATGATGACACTCTACGTGTTGAGGCTTTTGAGTACTACCACACAACTG +ATCCTAGTTTTCTGGGTAGGTACATGTCAGCATTAAATCACACTAAAAAGTGGAAATACCCACAAGTTAA +TGGTTTAACTTCTATTAAATGGGCAGATAACAACTGTTATCTTGCCACTGCATTGTTAACACTCCAACAA +ATAGAGTTGAAGTTTAATCCACCTGCTCTACAAGATGCTTATTACAGAGCAAGGGCTGGTGAAGCTGCTA +ACTTTTGTGCACTTATCTTAGCCTACTGTAATAAGACAGTAGGTGAGTTAGGTGATGTTAGAGAAACAAT +GAGTTACTTGTTTCAACATGCCAATTTAGATTCTTGCAAAAGAGTCTTGAACGTGGTGTGTAAAACTTGT +GGACAACAGCAGACAACCCTTAAGGGTGTAGAAGCTGTTATGTACATGGGCACACTTTCTTATGAACAAT +TTAAGAAAGGTGTTCAGATACCTTGTACGTGTGGTAAACAAGCTACAAAATATCTAGTACAACAGGAGTC +ACCTTTTGTTATGATGTCAGCACCACCTGCTCAGTATGAACTTAAGCATGGTACATTTACTTGTGCTAGT 
+GAGTACACTGGTAATTACCAGTGTGGTCACTATAAACATATAACTTCTAAAGAAACTTTGTATTGCATAG +ACGGTGCTTTACTTACAAAGTCCTCAGAATACAAAGGTCCTATTACGGATGTTTTCTACAAAGAAAACAG +TTACACAACAACCATAAAACCAGTTACTTATAAATTGGATGGTGTTGTTTGTACAGAAATTGACCCTAAG +TTGGACAATTATTATAAGAAAGACAATTCTTATTTCACAGAGCAACCAATTGATCTTGTACCAAACCAAC +CATATCCAAACGCAAGCTTCGATAATTTTAAGTTTGTATGTGATAATATCAAATTTGCTGATGATTTAAA +CCAGTTAACTGGTTATAAGAAACCTGCTTCAAGAGAGCTTAAAGTTACATTTTTCCCTGACTTAAATGGT +GATGTGGTGGCTATTGATTATAAACACTACACACCCTCTTTTAAGAAAGGAGCTAAATTGTTACATAAAC +CTATTGTTTGGCATGTTAACAATGCAACTAATAAAGCCACGTATAAACCAAATACCTGGTGTATACGTTG +TCTTTGGAGCACAAAACCAGTTGAAACATCAAATTCGTTTGATGTACTGAAGTCAGAGGACGCGCAGGGA +ATGGATAATCTTGCCTGCGAAGATCTAAAACCAGTCTCTGAAGAAGTAGTGGAAAATCCTACCATACAGA +AAGACGTTCTTGAGTGTAATGTGAAAACTACCGAAGTTGTAGGAGACATTATACTTAAACCAGCAAATAA +TAGTTTAAAAATTACAGAAGAGGTTGGCCACACAGATCTAATGGCTGCTTATGTAGACAATTCTAGTCTT +ACTATTAAGAAACCTAATGAATTATCTAGAGTATTAGGTTTGAAAACCCTTGCTACTCATGGTTTAGCTG +CTGTTAATAGTGTCCCTTGGGATACTATAGCTAATTATGCTAAGCCTTTTCTTAACAAAGTTGTTAGTAC +AACTACTAACATAGTTACACGGTGTTTAAACCGTGTTTGTACTAATTATATGCCTTATTTCTTTACTTTA +TTGCTACAATTGTGTACTTTTACTAGAAGTACAAATTCTAGAATTAAAGCATCTATGCCGACTACTATAG +CAAAGAATACTGTTAAGAGTGTCGGTAAATTTTGTCTAGAGGCTTCATTTAATTATTTGAAGTCACCTAA +TTTTTCTAAACTGATAAATATTATAATTTGGTTTTTACTATTAAGTGTTTGCCTAGGTTCTTTAATCTAC +TCAACCGCTGCTTTAGGTGTTTTAATGTCTAATTTAGGCATGCCTTCTTACTGTACTGGTTACAGAGAAG +GCTATTTGAACTCTACTAATGTCACTATTGCAACCTACTGTACTGGTTCTATACCTTGTAGTGTTTGTCT +TAGTGGTTTAGATTCTTTAGACACCTATCCTTCTTTAGAAACTATACAAATTACCATTTCATCTTTTAAA +TGGGATTTAACTGCTTTTGGCTTAGTTGCAGAGTGGTTTTTGGCATATATTCTTTTCACTAGGTTTTTCT +ATGTACTTGGATTGGCTGCAATCATGCAATTGTTTTTCAGCTATTTTGCAGTACATTTTATTAGTAATTC +TTGGCTTATGTGGTTAATAATTAATCTTGTACAAATGGCCCCGATTTCAGCTATGGTTAGAATGTACATC +TTCTTTGCATCATTTTATTATGTATGGAAAAGTTATGTGCATGTTGTAGACGGTTGTAATTCATCAACTT +GTATGATGTGTTACAAACGTAATAGAGCAACAAGAGTCGAATGTACAACTATTGTTAATGGTGTTAGAAG +GTCCTTTTATGTCTATGCTAATGGAGGTAAAGGCTTTTGCAAACTACACAATTGGAATTGTGTTAATTGT 
+GATACATTCTGTGCTGGTAGTACATTTATTAGTGATGAAGTTGCGAGAGACTTGTCACTACAGTTTAAAA +GACCAATAAATCCTACTGACCAGTCTTCTTACATCGTTGATAGTGTTACAGTGAAGAATGGTTCCATCCA +TCTTTACTTTGATAAAGCTGGTCAAAAGACTTATGAAAGACATTCTCTCTCTCATTTTGTTAACTTAGAC +AACCTGAGAGCTAATAACACTAAAGGTTCATTGCCTATTAATGTTATAGTTTTTGATGGTAAATCAAAAT +GTGAAGAATCATCTGCAAAATCAGCGTCTGTTTACTACAGTCAGCTTATGTGTCAACCTATACTGTTACT +AGATCAGGCATTAGTGTCTGATGTTGGTGATAGTGCGGAAGTTGCAGTTAAAATGTTTGATGCTTACGTT +AATACGTTTTCATCAACTTTTAACGTACCAATGGAAAAACTCAAAACACTAGTTGCAACTGCAGAAGCTG +AACTTGCAAAGAATGTGTCCTTAGACAATGTCTTATCTACTTTTATTTCAGCAGCTCGGCAAGGGTTTGT +TGATTCAGATGTAGAAACTAAAGATGTTGTTGAATGTCTTAAATTGTCACATCAATCTGACATAGAAGTT +ACTGGCGATAGTTGTAATAACTATATGCTCACCTATAACAAAGTTGAAAACATGACACCCCGTGACCTTG +GTGCTTGTATTGACTGTAGTGCGCGTCATATTAATGCGCAGGTAGCAAAAAGTCACAACATTGCTTTGAT +ATGGAACGTTAAAGATTTCATGTCATTGTCTGAACAACTACGAAAACAAATACGTAGTGCTGCTAAAAAG +AATAACTTACCTTTTAAGTTGACATGTGCAACTACTAGACAAGTTGTTAATGTTGTAACAACAAAGATAG +CACTTAAGGGTGGTAAAATTGTTAATAATTGGTTGAAGCAGTTAATTAAAGTTACACTTGTGTTCCTTTT +TGTTGCTGCTATTTTCTATTTAATAACACCTGTTCATGTCATGTCTAAACATACTGACTTTTCAAGTGAA +ATCATAGGATACAAGGCTATTGATGGTGGTGTCACTCGTGACATAGCATCTACAGATACTTGTTTTGCTA +ACAAACATGCTGATTTTGACACATGGTTTAGCCAGCGTGGTGGTAGTTATACTAATGACAAAGCTTGCCC +ATTGATTGCTGCAGTCATAACAAGAGAAGTGGGTTTTGTCGTGCCTGGTTTGCCTGGCACGATATTACGC +ACAACTAATGGTGACTTTTTGCATTTCTTACCTAGAGTTTTTAGTGCAGTTGGTAACATCTGTTACACAC +CATCAAAACTTATAGAGTACACTGACTTTGCAACATCAGCTTGTGTTTTGGCTGCTGAATGTACAATTTT +TAAAGATGCTTCTGGTAAGCCAGTACCATATTGTTATGATACCAATGTACTAGAAGGTTCTGTTGCTTAT +GAAAGTTTACGCCCTGACACACGTTATGTGCTCATGGATGGCTCTATTATTCAATTTCCTAACACCTACC +TTGAAGGTTCTGTTAGAGTGGTAACAACTTTTGATTCTGAGTACTGTAGGCACGGCACTTGTGAAAGATC +AGAAGCTGGTGTTTGTGTATCTACTAGTGGTAGATGGGTACTTAACAATGATTATTACAGATCTTTACCA +GGAGTTTTCTGTGGTGTAGATGCTGTAAATTTACTTACTAATATGTTTACACCACTAATTCAACCTATTG +GTGCTTTGGACATATCAGCATCTATAGTAGCTGGTGGTATTGTAGCTATCGTAGTAACATGCCTTGCCTA +CTATTTTATGAGGTTTAGAAGAGCTTTTGGTGAATACAGTCATGTAGTTGCCTTTAATACTTTACTATTC 
+CTTATGTCATTCACTGTACTCTGTTTAACACCAGTTTACTCATTCTTACCTGGTGTTTATTCTGTTATTT +ACTTGTACTTGACATTTTATCTTACTAATGATGTTTCTTTTTTAGCACATATTCAGTGGATGGTTATGTT +CACACCTTTAGTACCTTTCTGGATAACAATTGCTTATATCATTTGTATTTCCACAAAGCATTTCTATTGG +TTCTTTAGTAATTACCTAAAGAGACGTGTAGTCTTTAATGGTGTTTCCTTTAGTACTTTTGAAGAAGCTG +CGCTGTGCACCTTTTTGTTAAATAAAGAAATGTATCTAAAGTTGCGTAGTGATGTGCTATTACCTCTTAC +GCAATATAATAGATACTTAGCTCTTTATAATAAGTACAAGTATTTTAGTGGAGCAATGGATACAACTAGC +TACAGAGAAGCTGCTTGTTGTCATCTCGCAAAGGCTCTCAATGACTTCAGTAACTCAGGTTCTGATGTTC +TTTACCAACCACCACAAACCTCTATCACCTCAGCTGTTTTGCAGAGTGGTTTTAGAAAAATGGCATTCCC +ATCTGGTAAAGTTGAGGGTTGTATGGTACAAGTAACTTGTGGTACAACTACACTTAACGGTCTTTGGCTT +GATGACGTAGTTTACTGTCCAAGACATGTGATCTGCACCTCTGAAGACATGCTTAACCCTAATTATGAAG +ATTTACTCATTCGTAAGTCTAATCATAATTTCTTGGTACAGGCTGGTAATGTTCAACTCAGGGTTATTGG +ACATTCTATGCAAAATTGTGTACTTAAGCTTAAGGTTGATACAGCCAATCCTAAGACACCTAAGTATAAG +TTTGTTCGCATTCAACCAGGACAGACTTTTTCAGTGTTAGCTTGTTACAATGGTTCACCATCTGGTGTTT +ACCAATGTGCTATGAGGCCCAATTTCACTATTAAGGGTTCATTCCTTAATGGTTCATGTGGTAGTGTTGG +TTTTAACATAGATTATGACTGTGTCTCTTTTTGTTACATGCACCATATGGAATTACCAACTGGAGTTCAT +GCTGGCACAGACTTAGAAGGTAACTTTTATGGACCTTTTGTTGACAGGCAAACAGCACAAGCAGCTGGTA +CGGACACAACTATTACAGTTAATGTTTTAGCTTGGTTGTACGCTGCTGTTATAAATGGAGACAGGTGGTT +TCTCAATCGATTTACCACAACTCTTAATGACTTTAACCTTGTGGCTATGAAGTACAATTATGAACCTCTA +ACACAAGACCATGTTGACATACTAGGACCTCTTTCTGCTCAAACTGGAATTGCCGTTTTAGATATGTGTG +CTTCATTAAAAGAATTACTGCAAAATGGTATGAATGGACGTACCATATTGGGTAGTGCTTTATTAGAAGA +TGAATTTACACCTTTTGATGTTGTTAGACAATGCTCAGGTGTTACTTTCCAAAGTGCAGTGAAAAGAACA +ATCAAGGGTACACACCACTGGTTGTTACTCACAATTTTGACTTCACTTTTAGTTTTAGTCCAGAGTACTC +AATGGTCTTTGTTCTTTTTTTTGTATGAAAATGCCTTTTTACCTTTTGCTATGGGTATTATTGCTATGTC +TGCTTTTGCAATGATGTTTGTCAAACATAAGCATGCATTTCTCTGTTTGTTTTTGTTACCTTCTCTTGCC +ACTGTAGCTTATTTTAATATGGTCTATATGCCTGCTAGTTGGGTGATGCGTATTATGACATGGTTGGATA +TGGTTGATACTAGTTTGTCTGGTTTTAAGCTAAAAGACTGTGTTATGTATGCATCAGCTGTAGTGTTACT +AATCCTTATGACAGCAAGAACTGTGTATGATGATGGTGCTAGGAGAGTGTGGACACTTATGAATGTCTTG 
+ACACTCGTTTATAAAGTTTATTATGGTAATGCTTTAGATCAAGCCATTTCCATGTGGGCTCTTATAATCT +CTGTTACTTCTAACTACTCAGGTGTAGTTACAACTGTCATGTTTTTGGCCAGAGGTATTGTTTTTATGTG +TGTTGAGTATTGCCCTATTTTCTTCATAACTGGTAATACACTTCAGTGTATAATGCTAGTTTATTGTTTC +TTAGGCTATTTTTGTACTTGTTACTTTGGCCTCTTTTGTTTACTCAACCGCTACTTTAGACTGACTCTTG +GTGTTTATGATTACTTAGTTTCTACACAGGAGTTTAGATATATGAATTCACAGGGACTACTCCCACCCAA +GAATAGCATAGATGCCTTCAAACTCAACATTAAATTGTTGGGTGTTGGTGGCAAACCTTGTATCAAAGTA +GCCACTGTACAGTCTAAAATGTCAGATGTAAAGTGCACATCAGTAGTCTTACTCTCAGTTTTGCAACAAC +TCAGAGTAGAATCATCATCTAAATTGTGGGCTCAATGTGTCCAGTTACACAATGACATTCTCTTAGCTAA +AGATACTACTGAAGCCTTTGAAAAAATGGTTTCACTACTTTCTGTTTTGCTTTCCATGCAGGGTGCTGTA +GACATAAACAAGCTTTGTGAAGAAATGCTGGACAACAGGGCAACCTTACAAGCTATAGCCTCAGAGTTTA +GTTCCCTTCCATCATATGCAGCTTTTGCTACTGCTCAAGAAGCTTATGAGCAGGCTGTTGCTAATGGTGA +TTCTGAAGTTGTTCTTAAAAAGTTGAAGAAGTCTTTGAATGTGGCTAAATCTGAATTTGACCGTGATGCA +GCCATGCAACGTAAGTTGGAAAAGATGGCTGATCAAGCTATGACCCAAATGTATAAACAGGCTAGATCTG +AGGACAAGAGGGCAAAAGTTACTAGTGCTATGCAGACAATGCTTTTCACTATGCTTAGAAAGTTGGATAA +TGATGCACTCAACAACATTATCAACAATGCAAGAGATGGTTGTGTTCCCTTGAACATAATACCTCTTACA +ACAGCAGCCAAACTAATGGTTGTCATACCAGACTATAACACATATAAAAATACGTGTGATGGTACAACAT +TTACTTATGCATCAGCATTGTGGGAAATCCAACAGGTTGTAGATGCAGATAGTAAAATTGTTCAACTTAG +TGAAATTAGTATGGACAATTCACCTAATTTAGCATGGCCTCTTATTGTAACAGCTTTAAGGGCCAATTCT +GCTGTCAAATTACAGAATAATGAGCTTAGTCCTGTTGCACTACGACAGATGTCTTGTGCTGCCGGTACTA +CACAAACTGCTTGCACTGATGACAATGCGTTAGCTTACTACAACACAACAAAGGGAGGTAGGTTTGTACT +TGCACTGTTATCCGATTTACAGGATTTGAAATGGGCTAGATTCCCTAAGAGTGATGGAACTGGTACTATC +TATACAGAACTGGAACCACCTTGTAGGTTTGTTACAGACACACCTAAAGGTCCTAAAGTGAAGTATTTAT +ACTTTATTAAAGGATTAAACAACCTAAATAGAGGTATGGTACTTGGTAGTTTAGCTGCCACAGTACGTCT +ACAAGCTGGTAATGCAACAGAAGTGCCTGCCAATTCAACTGTATTATCTTTCTGTGCTTTTGCTGTAGAT +GCTGCTAAAGCTTACAAAGATTATCTAGCTAGTGGGGGACAACCAATCACTAATTGTGTTAAGATGTTGT +GTACACACACTGGTACTGGTCAGGCAATAACAGTTACACCGGAAGCCAATATGGATCAAGAATCCTTTGG +TGGTGCATCGTGTTGTCTGTACTGCCGTTGCCACATAGATCATCCAAATCCTAAAGGATTTTGTGACTTA 
+AAAGGTAAGTATGTACAAATACCTACAACTTGTGCTAATGACCCTGTGGGTTTTACACTTAAAAACACAG +TCTGTACCGTCTGCGGTATGTGGAAAGGTTATGGCTGTAGTTGTGATCAACTCCGCGAACCCATGCTTCA +GTCAGCTGATGCACAATCGTTTTTAAACGGGTTTGCGGTGTAAGTGCAGCCCGTCTTACACCGTGCGGCA +CAGGCACTAGTACTGATGTCGTATACAGGGCTTTTGACATCTACAATGATAAAGTAGCTGGTTTTGCTAA +ATTCCTAAAAACTAATTGTTGTCGCTTCCAAGAAAAGGACGAAGATGACAATTTAATTGATTCTTACTTT +GTAGTTAAGAGACACACTTTCTCTAACTACCAACATGAAGAAACAATTTATAATTTACTTAAGGATTGTC +CAGCTGTTGCTAAACATGACTTCTTTAAGTTTAGAATAGACGGTGACATGGTACCACATATATCACGTCA +ACGTCTTACTAAATACACAATGGCAGACCTCGTCTATGCTTTAAGGCATTTTGATGAAGGTAATTGTGAC +ACATTAAAAGAAATACTTGTCACATACAATTGTTGTGATGATGATTATTTCAATAAAAAGGACTGGTATG +ATTTTGTAGAAAACCCAGATATATTACGCGTATACGCCAACTTAGGTGAACGTGTACGCCAAGCTTTGTT +AAAAACAGTACAATTCTGTGATGCCATGCGAAATGCTGGTATTGTTGGTGTACTGACATTAGATAATCAA +GATCTCAATGGTAACTGGTATGATTTCGGTGATTTCATACAAACCACGCCAGGTAGTGGAGTTCCTGTTG +TAGATTCTTATTATTCATTGTTAATGCCTATATTAACCTTGACCAGGGCTTTAACTGCAGAGTCACATGT +TGACACTGACTTAACAAAGCCTTACATTAAGTGGGATTTGTTAAAATATGACTTCACGGAAGAGAGGTTA +AAACTCTTTGACCGTTATTTTAAATATTGGGATCAGACATACCACCCAAATTGTGTTAACTGTTTGGATG +ACAGATGCATTCTGCATTGTGCAAACTTTAATGTTTTATTCTCTACAGTGTTCCCACCTACAAGTTTTGG +ACCACTAGTGAGAAAAATATTTGTTGATGGTGTTCCATTTGTAGTTTCAACTGGATACCACTTCAGAGAG +CTAGGTGTTGTACATAATCAGGATGTAAACTTACATAGCTCTAGACTTAGTTTTAAGGAATTACTTGTGT +ATGCTGCTGACCCTGCTATGCACGCTGCTTCTGGTAATCTATTACTAGATAAACGCACTACGTGCTTTTC +AGTAGCTGCACTTACTAACAATGTTGCTTTTCAAACTGTCAAACCCGGTAATTTTAACAAAGACTTCTAT +GACTTTGCTGTGTCTAAGGGTTTCTTTAAGGAAGGAAGTTCTGTTGAATTAAAACACTTCTTCTTTGCTC +AGGATGGTAATGCTGCTATCAGCGATTATGACTACTATCGTTATAATCTACCAACAATGTGTGATATCAG +ACAACTACTATTTGTAGTTGAAGTTGTTGATAAGTACTTTGATTGTTACGATGGTGGCTGTATTAATGCT +AACCAAGTCATCGTCAACAACCTAGACAAATCAGCTGGTTTTCCATTTAATAAATGGGGTAAGGCTAGAC +TTTATTATGATTCAATGAGTTATGAGGATCAAGATGCACTTTTCGCATATACAAAACGTAATGTCATCCC +TACTATAACTCAAATGAATCTTAAGTATGCCATTAGTGCAAAGAATAGAGCTCGCACCGTAGCTGGTGTC +TCTATCTGTAGTACTATGACCAATAGACAGTTTCATCAAAAATTATTGAAATCAATAGCCGCCACTAGAG 
+GAGCTACTGTAGTAATTGGAACAAGCAAATTCTATGGTGGTTGGCACAACATGTTAAAAACTGTTTATAG +TGATGTAGAAAACCCTCACCTTATGGGTTGGGATTATCCTAAATGTGATAGAGCCATGCCTAACATGCTT +AGAATTATGGCCTCACTTGTTCTTGCTCGCAAACATACAACGTGTTGTAGCTTGTCACACCGTTTCTATA +GATTAGCTAATGAGTGTGCTCAAGTATTGAGTGAAATGGTCATGTGTGGCGGTTCACTATATGTTAAACC +AGGTGGAACCTCATCAGGAGATGCCACAACTGCTTATGCTAATAGTGTTTTTAACATTTGTCAAGCTGTC +ACGGCCAATGTTAATGCACTTTTATCTACTGATGGTAACAAAATTGCCGATAAGTATGTCCGCAATTTAC +AACACAGACTTTATGAGTGTCTCTATAGAAATAGAGATGTTGACACAGACTTTGTGAATGAGTTTTACGC +ATATTTGCGTAAACATTTCTCAATGATGATACTCTCTGACGATGCTGTTGTGTGTTTCAATAGCACTTAT +GCATCTCAAGGTCTAGTGGCTAGCATAAAGAACTTTAAGTCAGTTCTTTATTATCAAAACAATGTTTTTA +TGTCTGAAGCAAAATGTTGGACTGAGACTGACCTTACTAAAGGACCTCATGAATTTTGCTCTCAACATAC +AATGCTAGTTAAACAGGGTGATGATTATGTGTACCTTCCTTACCCAGATCCATCAAGAATCCTAGGGGCC +GGCTGTTTTGTAGATGATATCGTAAAAACAGATGGTACACTTATGATTGAACGGTTCGTGTCTTTAGCTA +TAGATGCTTACCCACTTACTAAACATCCTAATCAGGAGTATGCTGATGTCTTTCATTTGTACTTACAATA +CATAAGAAAGCTACATGATGAGTTAACAGGACACATGTTAGACATGTATTCTGTTATGCTTACTAATGAT +AACACTTCAAGGTATTGGGAACCTGAGTTTTATGAGGCTATGTACACACCGCATACAGTCTTACAGGCTG +TTGGGGCTTGTGTTCTTTGCAATTCACAGACTTCATTAAGATGTGGTGCTTGCATACGTAGACCATTCTT +ATGTTGTAAATGCTGTTACGACCATGTCATATCAACATCACATAAATTAGTCTTGTCTGTTAATCCGTAT +GTTTGCAATGCTCCAGGTTGTGATGTCACAGATGTGACTCAACTTTACTTAGGAGGTATGAGCTATTATT +GTAAATCACATAAACCACCCATTAGTTTTCCATTGTGTGCTAATGGACAAGTTTTTGGTTTATATAAAAA +TACATGTGTTGGTAGCGATAATGTTACTGACTTTAATGCAATTGCAACATGTGACTGGACAAATGCTGGT +GATTACATTTTAGCTAACACCTGTACTGAAAGACTCAAGCTTTTTGCAGCAGAAACGCTCAAAGCTACTG +AGGAGACATTTAAACTGTCTTATGGTATTGCTACTGTACGTGAAGTGCTGTCTGACAGAGAATTACATCT +TTCATGGGAAGTTGGTAAACCTAGACCACCACTTAACCGAAATTATGTCTTTACTGGTTATCGTGTAACT +AAAAACAGTAAAGTACAAATAGGAGAGTACACCTTTGAAAAAGGTGACTATGGTGATGCTGTTGTTTACC +GAGGTACAACAACTTACAAATTAAATGTTGGTGATTATTTTGTGCTGACATCACATACAGTAATGCCATT +AAGTGCACCTACACTAGTGCCACAAGAGCACTATGTTAGAATTACTGGCTTATACCCAACACTCAATATC +TCAGATGAGTTTTCTAGCAATGTTGCAAATTATCAAAAGGTTGGTATGCAAAAGTATTCTACACTCCAGG 
+GACCACCTGGTACTGGTAAGAGTCATTTTGCTATTGGCCTAGCTCTCTACTACCCTTCTGCTCGCATAGT +GTATACAGCTTGCTCTCATGCCGCTGTTGATGCACTATGTGAGAAGGCATTAAAATATTTGCCTATAGAT +AAATGTAGTAGAATTATACCTGCACGTGCTCGTGTAGAGTGTTTTGATAAATTCAAAGTGAATTCAACAT +TAGAACAGTATGTCTTTTGTACTGTAAATGCATTGCCTGAGACGACAGCAGATATAGTTGTCTTTGATGA +AATTTCAATGGCCACAAATTATGATTTGAGTGTTGTCAATGCCAGATTACGTGCTAAGCACTATGTGTAC +ATTGGCGACCCTGCTCAATTACCTGCACCACGCACATTGCTAACTAAGGGCACACTAGAACCAGAATATT +TCAATTCAGTGTGTAGACTTATGAAAACTATAGGTCCAGACATGTTCCTCGGAACTTGTCGGCGTTGTCC +TGCTGAAATTGTTGACACTGTGAGTGCTTTGGTTTATGATAATAAGCTTAAAGCACATAAAGACAAATCA +GCTCAATGCTTTAAAATGTTTTATAAGGGTGTTATCACGCATGATGTTTCATCTGCAATTAACAGGCCAC +AAATAGGCGTGGTAAGAGAATTCCTTACACGTAACCCTGCTTGGAGAAAAGCTGTCTTTATTTCACCTTA +TAATTCACAGAATGCTGTAGCCTCAAAGATTTTGGGACTACCAACTCAAACTGTTGATTCATCACAGGGC +TCAGAATATGACTATGTCATATTCACTCAAACCACTGAAACAGCTCACTCTTGTAATGTAAACAGATTTA +ATGTTGCTATTACCAGAGCAAAAGTAGGCATACTTTGCATAATGTCTGATAGAGACCTTTATGACAAGTT +GCAATTTACAAGTCTTGAAATTCCACGTAGGAATGTGGCAACTTTACAAGCTGAAAATGTAACAGGACTC +TTTAAAGATTGTAGTAAGGTAATCACTGGGTTACATCCTACACAGGCACCTACACACCTCAGTGTTGACA +CTAAATTCAAAACTGAAGGTTTATGTGTTGACATACCTGGCATACCTAAGGACATGACCTATAGAAGACT +CATCTCTATGATGGGTTTTAAAATGAATTATCAAGTTAATGGTTACCCTAACATGTTTATCACCCGCGAA +GAAGCTATAAGACATGTACGTGCATGGATTGGCTTCGATGTCGAGGGGTGTCATGCTACTAGAGAAGCTG +TTGGTACCAATTTACCTTTACAGCTAGGTTTTTCTACAGGTGTTAACCTAGTTGCTGTACCTACAGGTTA +TGTTGATACACCTAATAATACAGATTTTTCCAGAGTTAGTGCTAAACCACCGCCTGGAGATCAATTTAAA +CACCTCATACCACTTATGTACAAAGGACTTCCTTGGAATGTAGTGCGTATAAAGATTGTACAAATGTTAA +GTGACACACTTAAAAATCTCTCTGACAGAGTCGTATTTGTCTTATGGGCACATGGCTTTGAGTTGACATC +TATGAAGTATTTTGTGAAAATAGGACCTGAGCGCACCTGTTGTCTATGTGATAGACGTGCCACATGCTTT +TCCACTGCTTCAGACACTTATGCCTGTTGGCATCATTCTATTGGATTTGATTACGTCTATAATCCGTTTA +TGATTGATGTTCAACAATGGGGTTTTACAGGTAACCTACAAAGCAACCATGATCTGTATTGTCAAGTCCA +TGGTAATGCACATGTAGCTAGTTGTGATGCAATCATGACTAGGTGTCTAGCTGTCCACGAGTGCTTTGTT +AAGCGTGTTGACTGGACTATTGAATATCCTATAATTGGTGATGAACTGAAGATTAATGCGGCTTGTAGAA 
+AGGTTCAACACATGGTTGTTAAAGCTGCATTATTAGCAGACAAATTCCCAGTTCTTCACGACATTGGTAA +CCCTAAAGCTATTAAGTGTGTACCTCAAGCTGATGTAGAATGGAAGTTCTATGATGCACAGCCTTGTAGT +GACAAAGCTTATAAAATAGAAGAATTATTCTATTCTTATGCCACACATTCTGACAAATTCACAGATGGTG +TATGCCTATTTTGGAATTGCAATGTCGATAGATATCCTGCTAATTCCATTGTTTGTAGATTTGACACTAG +AGTGCTATCTAACCTTAACTTGCCTGGTTGTGATGGTGGCAGTTTGTATGTAAATAAACATGCATTCCAC +ACACCAGCTTTTGATAAAAGTGCTTTTGTTAATTTAAAACAATTACCATTTTTCTATTACTCTGACAGTC +CATGTGAGTCTCATGGAAAACAAGTAGTGTCAGATATAGATTATGTACCACTAAAGTCTGCTACGTGTAT +AACACGTTGCAATTTAGGTGGTGCTGTCTGTAGACATCATGCTAATGAGTACAGATTGTATCTCGATGCT +TATAACATGATGATCTCAGCTGGCTTTAGCTTGTGGGTTTACAAACAATTTGATACTTATAACCTCTGGA +ACACTTTTACAAGACTTCAGAGTTTAGAAAATGTGGCTTTTAATGTTGTAAATAAGGGACACTTTGATGG +ACAACAGGGTGAAGTACCAGTTTCTATCATTAATAACACTGTTTACACAAAAGTTGATGGTGTTGATGTA +GAATTGTTTGAAAATAAAACAACATTACCTGTTAATGTAGCATTTGAGCTTTGGGCTAAGCGCAACATTA +AACCAGTACCAGAGGTGAAAATACTCAATAATTTGGGTGTGGACATTGCTGCTAATACTGTGATCTGGGA +CTACAAAAGAGATGCTCCAGCACATATATCTACTATTGGTGTTTGTTCTATGACTGACATAGCCAAGAAA +CCAACTGAAACGATTTGTGCACCACTCACTGTCTTTTTTGATGGTAGAGTTGATGGTCAAGTAGACTTAT +TTAGAAATGCCCGTAATGGTGTTCTTATTACAGAAGGTAGTGTTAAAGGTTTACAACCATCTGTAGGTCC +CAAACAAGCTAGTCTTAATGGAGTCACATTAATTGGAGAAGCCGTAAAAACACAGTTCAATTATTATAAG +AAAGTTGATGGTGTTGTCCAACAATTACCTGAAACTTACTTTACTCAGAGTAGAAATTTACAAGAATTTA +AACCCAGGAGTCAAATGGAAATTGATTTCTTAGAATTAGCTATGGATGAATTCATTGAACGGTATAAATT +AGAAGGCTATGCCTTCGAACATATCGTTTATGGAGATTTTAGTCATAGTCAGTTAGGTGGTTTACATCTA +CTGATTGGACTAGCTAAACGTTTTAAGGAATCACCTTTTGAATTAGAAGATTTTATTCCTATGGACAGTA +CAGTTAAAAACTATTTCATAACAGATGCGCAAACAGGTTCATCTAAGTGTGTGTGTTCTGTTATTGATTT +ATTACTTGATGATTTTGTTGAAATAATAAAATCCCAAGATTTATCTGTAGTTTCTAAGGTTGTCAAAGTG +ACTATTGACTATACAGAAATTTCATTTATGCTTTGGTGTAAAGATGGCCATGTAGAAACATTTTACCCAA +AATTACAATCTAGTCAAGCGTGGCAACCGGGTGTTGCTATGCCTAATCTTTACAAAATGCAAAGAATGCT +ATTAGAAAAGTGTGACCTTCAAAATTATGGTGATAGTGCAACATTACCTAAAGGCATAATGATGAATGTC +GCAAAATATACTCAACTGTGTCAATATTTAAACACATTAACATTAGCTGTACCCTATAATATGAGAGTTA 
+TACATTTTGGTGCTGGTTCTGATAAAGGAGTTGCACCAGGTACAGCTGTTTTAAGACAGTGGTTGCCTAC +GGGTACGCTGCTTGTCGATTCAGATCTTAATGACTTTGTCTCTGATGCAGATTCAACTTTGATTGGTGAT +TGTGCAACTGTACATACAGCTAATAAATGGGATCTCATTATTAGTGATATGTACGACCCTAAGACTAAAA +ATGTTACAAAAGAAAATGACTCTAAAGAGGGTTTTTTCACTTACATTTGTGGGTTTATACAACAAAAGCT +AGCTCTTGGAGGTTCCGTGGCTATAAAGATAACAGAACATTCTTGGAATGCTGATCTTTATAAGCTCATG +GGACACTTCGCATGGTGGACAGCCTTTGTTACTAATGTGAATGCGTCATCATCTGAAGCATTTTTAATTG +GATGTAATTATCTTGGCAAACCACGCGAACAAATAGATGGTTATGTCATGCATGCAAATTACATATTTTG +GAGGAATACAAATCCAATTCAGTTGTCTTCCTATTCTTTATTTGACATGAGTAAATTTCCCCTTAAATTA +AGGGGTACTGCTGTTATGTCTTTAAAAGAAGGTCAAATCAATGATATGATTTTATCTCTTCTTAGTAAAG +GTAGACTTATAATTAGAGAAAACAACAGAGTTGTTATTTCTAGTGATGTTCTTGTTAACAACTAAACGAA +CAATGTTTGTTTTTCTTGTTTTATTGCCACTAGTCTCTAGTCAGTGTGTTAATCTTACAACCAGAACTCA +ATTACCCCCTGCATACACTAATTCTTTCACACGTGGTGTTTATTACCCTGACAAAGTTTTCAGATCCTCA +GTTTTACATTCAACTCAGGACTTGTTCTTACCTTTCTTTTCCAATGTTACTTGGTTCCATGCTATACATG +TCTCTGGGACCAATGGTACTAAGAGGTTTGATAACCCTGTCCTACCATTTAATGATGGTGTTTATTTTGC +TTCCACTGAGAAGTCTAACATAATAAGAGGCTGGATTTTTGGTACTACTTTAGATTCGAAGACCCAGTCC +CTACTTATTGTTAATAACGCTACTAATGTTGTTATTAAAGTCTGTGAATTTCAATTTTGTAATGATCCAT +TTTTGGGTGTTTATTACCACAAAAACAACAAAAGTTGGATGGAAAGTGAGTTCAGAGTTTATTCTAGTGC +GAATAATTGCACTTTTGAATATGTCTCTCAGCCTTTTCTTATGGACCTTGAAGGAAAACAGGGTAATTTC +AAAAATCTTAGGGAATTTGTGTTTAAGAATATTGATGGTTATTTTAAAATATATTCTAAGCACACGCCTA +TTAATTTAGTGCGTGATCTCCCTCAGGGTTTTTCGGCTTTAGAACCATTGGTAGATTTGCCAATAGGTAT +TAACATCACTAGGTTTCAAACTTTACTTGCTTTACATAGAAGTTATTTGACTCCTGGTGATTCTTCTTCA +GGTTGGACAGCTGGTGCTGCAGCTTATTATGTGGGTTATCTTCAACCTAGGACTTTTCTATTAAAATATA +ATGAAAATGGAACCATTACAGATGCTGTAGACTGTGCACTTGACCCTCTCTCAGAAACAAAGTGTACGTT +GAAATCCTTCACTGTAGAAAAAGGAATCTATCAAACTTCTAACTTTAGAGTCCAACCAACAGAATCTATT +GTTAGATTTCCTAATATTACAAACTTGTGCCCTTTTGGTGAAGTTTTTAACGCCACCAGATTTGCATCTG +TTTATGCTTGGAACAGGAAGAGAATCAGCAACTGTGTTGCTGATTATTCTGTCCTATATAATTCCGCATC +ATTTTCCACTTTTAAGTGTTATGGAGTGTCTCCTACTAAATTAAATGATCTCTGCTTTACTAATGTCTAT 
+GCAGATTCATTTGTAATTAGAGGTGATGAAGTCAGACAAATCGCTCCAGGGCAAACTGGAAAGATTGCTG +ATTATAATTATAAATTACCAGATGATTTTACAGGCTGCGTTATAGCTTGGAATTCTAACAATCTTGATTC +TAAGGTTGGTGGTAATTATAATTACCTGTATAGATTGTTTAGGAAGTCTAATCTCAAACCTTTTGAGAGA +GATATTTCAACTGAAATCTATCAGGCCGGTAGCACACCTTGTAATGGTGTTGAAGGTTTTAATTGTTACT +TTCCTTTACAATCATATGGTTTCCAACCCACTAATGGTGTTGGTTACCAACCATACAGAGTAGTAGTACT +TTCTTTTGAACTTCTACATGCACCAGCAACTGTTTGTGGACCTAAAAAGTCTACTAATTTGGTTAAAAAC +AAATGTGTCAATTTCAACTTCAATGGTTTAACAGGCACAGGTGTTCTTACTGAGTCTAACAAAAAGTTTC +TGCCTTTCCAACAATTTGGCAGAGACATTGCTGACACTACTGATGCTGTCCGTGATCCACAGACACTTGA +GATTCTTGACATTACACCATGTTCTTTTGGTGGTGTCAGTGTTATAACACCAGGAACAAATACTTCTAAC +CAGGTTGCTGTTCTTTATCAGGATGTTAACTGCACAGAAGTCCCTGTTGCTATTCATGCAGATCAACTTA +CTCCTACTTGGCGTGTTTATTCTACAGGTTCTAATGTTTTTCAAACACGTGCAGGCTGTTTAATAGGGGC +TGAACATGTCAACAACTCATATGAGTGTGACATACCCATTGGTGCAGGTATATGCGCTAGTTATCAGACT +CAGACTAATTCTCCTCGGCGGGCACGTAGTGTAGCTAGTCAATCCATCATTGCCTACACTATGTCACTTG +GTGCAGAAAATTCAGTTGCTTACTCTAATAACTCTATTGCCATACCCACAAATTTTACTATTAGTGTTAC +CACAGAAATTCTACCAGTGTCTATGACCAAGACATCAGTAGATTGTACAATGTACATTTGTGGTGATTCA +ACTGAATGCAGCAATCTTTTGTTGCAATATGGCAGTTTTTGTACACAATTAAACCGTGCTTTAACTGGAA +TAGCTGTTGAACAAGACAAAAACACCCAAGAAGTTTTTGCACAAGTCAAACAAATTTACAAAACACCACC +AATTAAAGATTTTGGTGGTTTTAATTTTTCACAAATATTACCAGATCCATCAAAACCAAGCAAGAGGTCA +TTTATTGAAGATCTACTTTTCAACAAAGTGACACTTGCAGATGCTGGCTTCATCAAACAATATGGTGATT +GCCTTGGTGATATTGCTGCTAGAGACCTCATTTGTGCACAAAAGTTTAACGGCCTTACTGTTTTGCCACC +TTTGCTCACAGATGAAATGATTGCTCAATACACTTCTGCACTGTTAGCGGGTACAATCACTTCTGGTTGG +ACCTTTGGTGCAGGTGCTGCATTACAAATACCATTTGCTATGCAAATGGCTTATAGGTTTAATGGTATTG +GAGTTACACAGAATGTTCTCTATGAGAACCAAAAATTGATTGCCAACCAATTTAATAGTGCTATTGGCAA +AATTCAAGACTCACTTTCTTCCACAGCAAGTGCACTTGGAAAACTTCAAGATGTGGTCAACCAAAATGCA +CAAGCTTTAAACACGCTTGTTAAACAACTTAGCTCCAATTTTGGTGCAATTTCAAGTGTTTTAAATGATA +TCCTTTCACGTCTTGACAAAGTTGAGGCTGAAGTGCAAATTGATAGGTTGATCACAGGCAGACTTCAAAG +TTTGCAGACATATGTGACTCAACAATTAATTAGAGCTGCAGAAATCAGAGCTTCTGCTAATCTTGCTGCT 
+ACTAAAATGTCAGAGTGTGTACTTGGACAATCAAAAAGAGTTGATTTTTGTGGAAAGGGCTATCATCTTA +TGTCCTTCCCTCAGTCAGCACCTCATGGTGTAGTCTTCTTGCATGTGACTTATGTCCCTGCACAAGAAAA +GAACTTCACAACTGCTCCTGCCATTTGTCATGATGGAAAAGCACACTTTCCTCGTGAAGGTGTCTTTGTT +TCAAATGGCACACACTGGTTTGTAACACAAAGGAATTTTTATGAACCACAAATCATTACTACAGACAACA +CATTTGTGTCTGGTAACTGTGATGTTGTAATAGGAATTGTCAACAACACAGTTTATGATCCTTTGCAACC +TGAATTAGACTCATTCAAGGAGGAGTTAGATAAATATTTTAAGAATCATACATCACCAGATGTTGATTTA +GGTGACATCTCTGGCATTAATGCTTCAGTTGTAAACATTCAAAAAGAAATTGACCGCCTCAATGAGGTTG +CCAAGAATTTAAATGAATCTCTCATCGATCTCCAAGAACTTGGAAAGTATGAGCAGTATATAAAATGGCC +ATGGTACATTTGGCTAGGTTTTATAGCTGGCTTGATTGCCATAGTAATGGTGACAATTATGCTTTGCTGT +ATGACCAGTTGCTGTAGTTGTCTCAAGGGCTGTTGTTCTTGTGGATCCTGCTGCAAATTTGATGAAGACG +ACTCTGAGCCAGTGCTCAAAGGAGTCAAATTACATTACACATAAACGAACTTATGGATTTGTTTATGAGA +ATCTTCACAATTGGAACTGTAACTTTGAAGCAAGGTGAAATCAAGGATGCTACTCCTTCAGATTTTGTTC +GCGCTACTGCAACGATACCGATACAAGCCTCACTCCCTTTCGGATGGCTTATTGTTGGCGTTGCACTTCT +TGCTGTTTTTCAGAGCGCTTCCAAAATCATAACCCTCAAAAAGAGATGGCAACTAGCACTCTCCAAGGGT +GTTCACTTTGTTTGCAACTTGCTGTTGTTGTTTGTAACAGTTTACTCACACCTTTTGCTCGTTGCTGCTG +GCCTTGAAGCCCCTTTTCTCTATCTTTATGCTTTAGTCTACTTCTTGCAGAGTATAAACTTTGTAAGAAT +AATAATGAGGCTTTGGCTTTGCTGGAAATGCCGTTCCAAAAACCCATTACTTTATGATGCCAACTATTTT +CTTTGCTGGCATACTAATTGTTACGACTATTGTATACCTTACAATAGTGTAACTTCTTCAATTGTCATTA +CTTCAGGTGATGGCACAACAAGTCCTATTTCTGAACATGACTACCAGATTGGTGGTTATACTGAAAAATG +GGAATCTGGAGTAAAAGACTGTGTTGTATTACACAGTTACTTCACTTCAGACTATTACCAGCTGTACTCA +ACTCAATTGAGTACAGACACTGGTGTTGAACATGTTACCTTCTTCATCTACAATAAAATTGTTGATGAGC +CTGAAGAACATGTCCAAATTCACACAATCGACGGTTCATCCGGAGTTGTTAATCCAGTAATGGAACCAAT +TTATGATGAACCGACGACGACTACTAGCGTGCCTTTGTAAGCACAAGCTGATGAGTACGAACTTATGTAC +TCATTCGTTTCGGAAGAGACAGGTACGTTAATAGTTAATAGCGTACTTCTTTTTCTTGCTTTCGTGGTAT +TCTTGCTAGTTACACTAGCCATCCTTACTGCGCTTCGATTGTGTGCGTACTGCTGCAATATTGTTAACGT +GAGTCTTGTAAAACCTTCTTTTTACGTTTACTCTCGTGTTAAAAATCTGAATTCTTCTAGAGTTCCTGAT +CTTCTGGTCTAAACGAACTAAATATTATATTAGTTTTTCTGTTTGGAACTTTAATTTTAGCCATGGCAGA 
+TTCCAACGGTACTATTACCGTTGAAGAGCTTAAAAAGCTCCTTGAACAATGGAACCTAGTAATAGGTTTC +CTATTCCTTACATGGATTTGTCTTCTACAATTTGCCTATGCCAACAGGAATAGGTTTTTGTATATAATTA +AGTTAATTTTCCTCTGGCTGTTATGGCCAGTAACTTTAGCTTGTTTTGTGCTTGCTGCTGTTTACAGAAT +AAATTGGATCACCGGTGGAATTGCTATCGCAATGGCTTGTCTTGTAGGCTTGATGTGGCTCAGCTACTTC +ATTGCTTCTTTCAGACTGTTTGCGCGTACGCGTTCCATGTGGTCATTCAATCCAGAAACTAACATTCTTC +TCAACGTGCCACTCCATGGCACTATTCTGACCAGACCGCTTCTAGAAAGTGAACTCGTAATCGGAGCTGT +GATCCTTCGTGGACATCTTCGTATTGCTGGACACCATCTAGGACGCTGTGACATCAAGGACCTGCCTAAA +GAAATCACTGTTGCTACATCACGAACGCTTTCTTATTACAAATTGGGAGCTTCGCAGCGTGTAGCAGGTG +ACTCAGGTTTTGCTGCATACAGTCGCTACAGGATTGGCAACTATAAATTAAACACAGACCATTCCAGTAG +CAGTGACAATATTGCTTTGCTTGTACAGTAAGTGACAACAGATGTTTCATCTCGTTGACTTTCAGGTTAC +TATAGCAGAGATATTACTAATTATTATGAGGACTTTTAAAGTTTCCATTTGGAATCTTGATTACATCATA +AACCTCATAATTAAAAATTTATCTAAGTCACTAACTGAGAATAAATATTCTCAATTAGATGAAGAGCAAC +CAATGGAGATTGATTAAACGAACATGAAAATTATTCTTTTCTTGGCACTGATAACACTCGCTACTTGTGA +GCTTTATCACTACCAAGAGTGTGTTAGAGGTACAACAGTACTTTTAAAAGAACCTTGCTCTTCTGGAACA +TACGAGGGCAATTCACCATTTCATCCTCTAGCTGATAACAAATTTGCACTGACTTGCTTTAGCACTCAAT +TTGCTTTTGCTTGTCCTGACGGCGTAAAACACGTCTATCAGTTACGTGCCAGATCAGTTTCACCTAAACT +GTTCATCAGACAAGAGGAAGTTCAAGAACTTTACTCTCCAATTTTTCTTATTGTTGCGGCAATAGTGTTT +ATAACACTTTGCTTCACACTCAAAAGAAAGACAGAATGATTGAACTTTCATTAATTGACTTCTATTTGTG +CTTTTTAGCCTTTCTGCTATTCCTTGTTTTAATTATGCTTATTATCTTTTGGTTCTCACTTGAACTGCAA +GATCATAATGAAACTTGTCACGCCTAAACGAACATGAAATTTCTTGTTTTCTTAGGAATCATCACAACTG +TAGCTGCATTTCACCAAGAATGTAGTTTACAGTCATGTACTCAACATCAACCATATGTAGTTGATGACCC +GTGTCCTATTCACTTCTATTCTAAATGGTATATTAGAGTAGGAGCTAGAAAATCAGCACCTTTAATTGAA +TTGTGCGTGGATGAGGCTGGTTCTAAATCACCCATTCAGTACATCGATATCGGTAATTATACAGTTTCCT +GTTTACCTTTTACAATTAATTGCCAGGAACCTAAATTGGGTAGTCTTGTAGTGCGTTGTTCGTTCTATGA +AGACTTTTTAGAGTATCATGACGTTCGTGTTGTTTTAGATTTCATCTAAACGAACAAACTAAAATGTCTG +ATAATGGACCCCAAAATCAGCGAAATGCACCCCGCATTACGTTTGGTGGACCCTCAGATTCAACTGGCAG +TAACCAGAATGGAGAACGCAGTGGGGCGCGATCAAAACAACGTCGGCCCCAAGGTTTACCCAATAATACT 
+GCGTCTTGGTTCACCGCTCTCACTCAACATGGCAAGGAAGACCTTAAATTCCCTCGAGGACAAGGCGTTC +CAATTAACACCAATAGCAGTCCAGATGACCAAATTGGCTACTACCGAAGAGCTACCAGACGAATTCGTGG +TGGTGACGGTAAAATGAAAGATCTCAGTCCAAGATGGTATTTCTACTACCTAGGAACTGGGCCAGAAGCT +GGACTTCCCTATGGTGCTAACAAAGACGGCATCATATGGGTTGCAACTGAGGGAGCCTTGAATACACCAA +AAGATCACATTGGCACCCGCAATCCTGCTAACAATGCTGCAATCGTGCTACAACTTCCTCAAGGAACAAC +ATTGCCAAAAGGCTTCTACGCAGAAGGGAGCAGAGGCGGCAGTCAAGCCTCTTCTCGTTCCTCATCACGT +AGTCGCAACAGTTCAAGAAATTCAACTCCAGGCAGCAGTAGGGGAACTTCTCCTGCTAGAATGGCTGGCA +ATGGCGGTGATGCTGCTCTTGCTTTGCTGCTGCTTGACAGATTGAACCAGCTTGAGAGCAAAATGTCTGG +TAAAGGCCAACAACAACAAGGCCAAACTGTCACTAAGAAATCTGCTGCTGAGGCTTCTAAGAAGCCTCGG +CAAAAACGTACTGCCACTAAAGCATACAATGTAACACAAGCTTTCGGCAGACGTGGTCCAGAACAAACCC +AAGGAAATTTTGGGGACCAGGAACTAATCAGACAAGGAACTGATTACAAACATTGGCCGCAAATTGCACA +ATTTGCCCCCAGCGCTTCAGCGTTCTTCGGAATGTCGCGCATTGGCATGGAAGTCACACCTTCGGGAACG +TGGTTGACCTACACAGGTGCCATCAAATTGGATGACAAAGATCCAAATTTCAAAGATCAAGTCATTTTGC +TGAATAAGCATATTGACGCATACAAAACATTCCCACCAACAGAGCCTAAAAAGGACAAAAAGAAGAAGGC +TGATGAAACTCAAGCCTTACCGCAGAGACAGAAGAAACAGCAAACTGTGACTCTTCTTCCTGCTGCAGAT +TTGGATGATTTCTCCAAACAATTGCAACAATCCATGAGCAGTGCTGACTCAACTCAGGCCTAAACTCATG +CAGACCACACAAGGCAGATGGGCTATATAAACGTTTTCGCTTTTCCGTTTACGATATATAGTCTACTCTT +GTGCAGAATGAATTCTCGTAACTACATAGCACAAGTAGATGTAGTTAACTTTAATCTCACATAGCAATCT +TTAATCAGTGTGTAACATTAGGGAGGACTTGAAAGAGCCACCACATTTTCACCGAGGCCACGCGGAGTAC +GATCGAGTGTACAGTGAACAATGCTAGGGAGAGCTGCCTATATGGAAGAGCCCTAATGTGTAAAATTAAT +TTTAGTAGTGCTATCCCCATGTGATTTTAATAGCTTCTTAGGAGAATGACAAAAAAAAAAAAAAAAAAAA +AAAAAAAAAAAAA diff --git a/tests/data/references/empty.fasta b/tests/data/references/empty.fasta new file mode 100644 index 0000000..e69de29 diff --git a/tests/data/references/indexed_ref.mmi b/tests/data/references/indexed_ref.mmi new file mode 100644 index 0000000..58bdbaa Binary files /dev/null and b/tests/data/references/indexed_ref.mmi differ diff --git a/tests/data/references/synthetic.fasta b/tests/data/references/synthetic.fasta new file mode 100644 index 0000000..05cae80 
--- /dev/null +++ b/tests/data/references/synthetic.fasta @@ -0,0 +1,2 @@ +>homemade_reference +GGAAATTCATTCTAGGGAGTGACGTGGACCCCGGATTGATACAGGATCACATGTAGAAAAGGTAGTCGGACAAGTTACCGCTACCCTCGACCTCGTGGGGCCTACACCTGACCAGGAGCCGCACTGACAGGACCACGCTTCATCATAACTTTGGCGGCTGGGCAACGGATTTAATGGTACATAACTCATCATTCTACGTA \ No newline at end of file diff --git a/tests/e2e/__init__.py b/tests/e2e/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/e2e/config_parser.py b/tests/e2e/config_parser.py new file mode 100644 index 0000000..0bd1921 --- /dev/null +++ b/tests/e2e/config_parser.py @@ -0,0 +1,58 @@ +""" +This module provides a configuration parser for reading and parsing YAML configuration files. + +Classes +------- +ConfigParser + A class to parse YAML configuration files and return the configuration as a dictionary. +""" + +import yaml + +from AmpliGone.log import log as logger + + +class ConfigParser: + """ + A class to parse YAML configuration files. + + Attributes + ---------- + logger : logging.Logger + The logger instance for logging messages. + + Methods + ------- + parse_config(config_file: str) -> dict[str, dict[str, dict[str, str]]] + Parses the given YAML configuration file and returns the configuration as a dictionary. + """ + + def __init__(self) -> None: + """ + Initializes the ConfigParser with a logger instance. + """ + self.logger = logger + + def parse_config(self, config_file: str) -> dict[str, dict[str, dict[str, str]]]: + """ + Parses the given YAML configuration file and returns the configuration as a dictionary. + + Parameters + ---------- + config_file : str + The path to the YAML configuration file. + + Returns + ------- + dict[str, dict[str, dict[str, str]]] + The parsed configuration as a nested dictionary. 
+ """ + with open(config_file, "r", encoding="utf-8") as file: + config: dict[str, dict[str, dict[str, str]]] = yaml.safe_load(file) + return config + + def edit_config(self, config_file: str) -> None: + """ + Placeholder for editing configuration files + """ + raise NotImplementedError("Editing configuration files is not yet supported.") diff --git a/tests/e2e/test_e2e.py b/tests/e2e/test_e2e.py new file mode 100644 index 0000000..14d1116 --- /dev/null +++ b/tests/e2e/test_e2e.py @@ -0,0 +1,181 @@ +""" +Module for end-to-end testing of the AmpliGone CLI program. + +This module contains the TestE2e class which provides methods to run end-to-end tests +on the AmpliGone CLI program. The tests are configured using a YAML file and include +various scenarios such as real-world data, synthetic data, and edge cases. +""" + +import logging +import os +from typing import Generator + +import pytest + +from AmpliGone.__main__ import main +from tests.e2e.config_parser import ConfigParser + + +class TestE2e: # pylint: disable=too-few-public-methods + """ + Class for end-to-end testing of the AmpliGone CLI program. + + This class provides methods to run end-to-end tests on the AmpliGone CLI program. + The tests are configured using a YAML file and include various scenarios such as + real-world data, synthetic data, and edge cases. + + Attributes + ---------- + config_parser : ConfigParser + An instance of the ConfigParser class to parse the configuration file. + config : dict + A dictionary containing the parsed configuration data. + + Methods + ------- + _cleanup() + Fixture to clean up output files after tests are run. + _order_fastq_by_name(fastq_lines) + Orders the lines of a FASTQ file by read name. + _compare_outputs(output_file, expected_output_file) + Compares the output file with the expected output file. + _args_to_list(args) + Converts a dictionary of arguments to a list. 
+ test_ampligone(test_case, caplog) + Runs the AmpliGone CLI program with the specified test case and captures logs. + """ + + config_parser = ConfigParser() + config: dict[str, dict[str, dict[str, str]]] = config_parser.parse_config( + "tests/config.yaml" + ) + + @pytest.fixture( + scope="session", autouse=True + ) # session scope is used to run the fixture only once + def _cleanup(self) -> Generator[None, None, None]: + """ + Fixture to clean up output files after tests are run. + + This fixture runs after all tests in the session have completed and removes + any output files generated during the tests. + + Yields + ------ + None + """ + yield # this is to wait for the test to finish before cleaning up + for case in self.config.values(): + output_path = case["pipeline_args"]["--output"] + if os.path.exists(output_path): + os.remove(output_path) + + def _order_fastq_by_name(self, fastq_lines: list[str]) -> list[str]: + """ + Orders the lines of a FASTQ file by read name. + Used because the order of reads in a FASTQ file is not guaranteed. + + Parameters + ---------- + fastq_lines : list of str + The lines of the FASTQ file. + + Returns + ------- + list of str + The ordered lines of the FASTQ file. + """ + chunks = [fastq_lines[i : i + 4] for i in range(0, len(fastq_lines), 4)] + sorted_chunks = sorted(chunks, key=lambda x: x[0]) + sorted_fastq = [line for chunk in sorted_chunks for line in chunk] + return sorted_fastq + + def _compare_outputs(self, output_file: str, expected_output_file: str) -> None: + """ + Compares the output file with the expected output file. + Specifically, it compares the FASTQ files generated by the AmpliGone CLI program. + + Parameters + ---------- + output_file : str + The path to the output file generated by the AmpliGone CLI program. + expected_output_file : str + The path to the expected output file. + + Raises + ------ + AssertionError + If the output file does not match the expected output file. 
+ """ + with open(output_file, "r", encoding="utf-8") as output, open( + expected_output_file, "r", encoding="utf-8" + ) as expected_output: + output_lines = output.readlines() + expected_output_lines = expected_output.readlines() + # the fastq file is unordered, so we need to sort it by the read name + output_lines = self._order_fastq_by_name(output_lines) + expected_output_lines = self._order_fastq_by_name(expected_output_lines) + for line1, line2 in zip(output_lines, expected_output_lines): + if line1 != line2: + raise AssertionError(f"Output: {line1}, Expected Output: {line2}") + + def _args_to_list(self, args: dict[str, str]) -> list[str]: + """ + Converts a dictionary of arguments to a list. + + Parameters + ---------- + args : dict of str + The dictionary of arguments. + + Returns + ------- + list of str + The list of arguments. + """ + return [item for pain in args.items() for item in pain] + + @pytest.mark.parametrize( + "test_case", list(config.values()), ids=list(config.keys()) + ) + def test_ampligone( + self, test_case: dict[str, dict[str, str]], caplog: pytest.LogCaptureFixture + ) -> None: + """ + Runs the AmpliGone CLI program with the specified test cases from the config.yml file. + + + Parameters + ---------- + test_case : dict + The test case configuration. + caplog : pytest.LogCaptureFixture + The log capture fixture. + + Raises + ------ + SystemExit + If the test case is expected to fail. + AssertionError + If the output file does not match the expected output file or if the + expected log message is not found in the logs. 
+ """ + args = self._args_to_list(test_case["pipeline_args"]) + + with caplog.at_level(logging.DEBUG, logger="rich"): + if test_case["test_args"]["fails"]: + with pytest.raises(SystemExit): + main(args) + else: + main(args) + self._compare_outputs( + test_case["pipeline_args"]["--output"], + test_case["test_args"]["comparison_file"], + ) + if test_case["test_args"].get( + "expected_log_message", None + ): # I dont think its necessary to check the logs in every case + assert ( + test_case["test_args"]["expected_log_message"].lower() + in caplog.text.lower() + ) diff --git a/tests/unit/test_args.py b/tests/unit/test_args.py new file mode 100644 index 0000000..aecd5ff --- /dev/null +++ b/tests/unit/test_args.py @@ -0,0 +1,143 @@ +""" +Unit tests for the `get_args` function in the AmpliGone module. + +This module contains unit tests for the `get_args` function, which parses command-line arguments for the AmpliGone pipeline. + +Test Scenarios +-------------- +- Test with the `--help` flag to ensure it raises `SystemExit`. +- Test with no arguments to ensure it raises `SystemExit`. +- Test with invalid arguments to ensure it raises `SystemExit`. +- Test with necessary arguments to ensure they are parsed correctly. + +Classes +------- +TestArgs + A class containing unit tests for the `get_args` function. +""" + +import pytest + +from AmpliGone.args import get_args +from tests.e2e.config_parser import ConfigParser + + +class TestArgs: + """ + Unit tests for the `get_args` function in the AmpliGone module. + + This class contains unit tests for the `get_args` function, + which parses command-line arguments for the AmpliGone pipeline. + + Attributes + ---------- + config_parser : ConfigParser + An instance of the ConfigParser class to parse the configuration file. + config : dict + The parsed configuration dictionary. + pipeline_args : dict + The dictionary of pipeline arguments from the configuration. 
+ happy_arg_list : list + The list of arguments for the happy flow scenario. + + Methods + ------- + test_help() + Test that the `get_args` function raises `SystemExit` when the `--help` flag is provided. + test_no_args() + Test that the `get_args` function raises `SystemExit` when no arguments are provided. + test_invalid_args() + Test that the `get_args` function raises `SystemExit` when invalid arguments are provided. + test_necessary_args() + Test that the `get_args` function correctly parses the necessary arguments. + """ + + config_parser = ConfigParser() + config = config_parser.parse_config("tests/config.yaml") + pipeline_args = config["happy_sars_cov_2"]["pipeline_args"] + happy_arg_list = [item for pain in pipeline_args.items() for item in pain] + + def test_help(self) -> None: + """ + Test that the `get_args` function raises `SystemExit` when the `--help` flag is provided. + + This test ensures that the `get_args` function correctly handles the `--help` flag by raising `SystemExit`. + + Parameters + ---------- + self : TestArgs + The instance of the test class. + + Returns + ------- + None + This function does not return any value. It asserts that `SystemExit` is raised. + """ + with pytest.raises(SystemExit): + get_args(["--help"]) + + def test_no_args(self) -> None: + """ + Test that the `get_args` function raises `SystemExit` when no arguments are provided. + + This test ensures that the `get_args` function correctly handles the case where no arguments are provided by raising `SystemExit`. + + Parameters + ---------- + self : TestArgs + The instance of the test class. + + Returns + ------- + None + This function does not return any value. It asserts that `SystemExit` is raised. + """ + with pytest.raises(SystemExit): + get_args([]) + + def test_invalid_args(self) -> None: + """ + Test that the `get_args` function raises `SystemExit` when invalid arguments are provided. 
+ + This test ensures that the `get_args` function correctly handles invalid arguments by raising `SystemExit`. + + Parameters + ---------- + self : TestArgs + The instance of the test class. + + Returns + ------- + None + This function does not return any value. It asserts that `SystemExit` is raised. + """ + with pytest.raises(SystemExit): + get_args(["--invalid"]) + + def test_necessary_args(self) -> None: + """ + Test that the `get_args` function correctly parses the necessary arguments. + + This test ensures that the `get_args` function correctly parses the necessary arguments and matches them with the expected values. + + Parameters + ---------- + self : TestArgs + The instance of the test class. + + Returns + ------- + None + This function does not return any value. It asserts that the parsed arguments match the expected values. + """ + args = vars( + get_args(self.happy_arg_list) + ) # get_args returns a Namespace object, so we need to convert it to a dictionary + + for args_key, args_value in args.items(): + for expected_key, expected_value in self.pipeline_args.items(): + if args_key == expected_key[2:]: # remove the '--' from the key + if args_key == "amplicon_type": + assert args_value == expected_value + else: # args_value has absolute path, expected_value has relative path + assert expected_value in args_value diff --git a/tests/unit/test_cut_reads.py b/tests/unit/test_cut_reads.py new file mode 100644 index 0000000..7511c42 --- /dev/null +++ b/tests/unit/test_cut_reads.py @@ -0,0 +1,342 @@ +""" +Unit tests for the `cut_reads` function in the AmpliGone module. + +This module contains unit tests for the `cut_reads` function, +which processes sequencing reads by cutting them based on primer locations and reference mapping. + +Extra tests that could be added if deemed necessary: +- Tests with faulty inputs, like an empty DataFrame, invalid sequences or qualities. +- Tests with different parameters, like different scoring thresholds or presets. 
+ + +Test Scenarios +-------------- +- Test with reads that are too short to be processed. +- Test with no primers available for the reference. +- Test with valid data and different amplicon types. +- Test with primers partially on the read. +- Test with primers not on the read. + +Classes +------- +TestCutReads + A class containing unit tests for the `cut_reads` function. +""" + +from collections import defaultdict + +import pandas as pd +import pytest + +from AmpliGone.cut_reads import cut_reads + +# Length of the read should theoretically be ~42bp because the default k-mer size for the short reads preset (-sr) is 21. +# However, in practice this does not work because the -sr preset includes a `-s 40` parameter which means that - +# the scoring threshold is set to 40. This score is very hard to achieve with only 42bp reads. +# This means that for testing purposes reads should be around 50-100bp long. +# To make sure everything works as expected, the read length that will be used for testing is 100bp. + +AMPLICON_TYPES = ( + "end-to-end", + "end-to-mid", + "fragmented", +) # cannot be in the class because the pytest.mark.parametrize decorator needs to access it before the class is created + + +class TestCutReads: + """ + Unit tests for the `cut_reads` function in the AmpliGone module. + + This class contains unit tests for the `cut_reads` function, + which processes sequencing reads by cutting them based on primer locations and reference mapping. + + Attributes + ---------- + HAPPY_SEQ : str + A sample sequence used for testing. + HAPPY_QUAL : str + A sample quality string corresponding to the HAPPY_SEQ. + reference : str + The path to the reference genome sequence used for testing. + preset : str + The preset used for minimap2 alignment. + scoring : list of int + The scoring matrix used for minimap2 alignment. + fragment_lookaround_size : int + The number of bases to look around a fragment when cutting reads. 
+ + Methods + ------- + test_cut_reads_too_short() + Test that the `cut_reads` function skips reads that are too short to be processed. + test_cut_reads_no_primers() + Test that the `cut_reads` function handles cases where no primers are available for the reference. + test_cut_reads_happy(amplicon_type) + Test a happy path scenario for the `cut_reads` function with different amplicon types. + test_cut_reads_primer_half_on_read(amplicon_type) + Test a scenario where the FW and RV primers are only partially on the read. + test_cut_reads_wrong_primers(amplicon_type) + Test a scenario where the primers are not on the read. + """ + + HAPPY_SEQ = "GGAAATTCATTCTAGGGAGTGACGTGGACCCCGGATTGATACAGGATCACATGTAGAAAAGGTAGTCGGACAAGTTACCGCTACCCTCGACCTCGTGGGG" + HAPPY_QUAL = "EE10%-1#-@7F&@?7(13;-$)A7.7/3(I(.9)&//,$G9?HA'DG=/;3C)2:@C!2/#;8.#7'98AC;FG>E>;E'>'$100G&44763?0,@7I" + + reference = "tests/data/references/synthetic.fasta" + preset = "sr" + scoring: list[int] = [] + fragment_lookaround_size = 10000 + + def test_cut_reads_too_short(self) -> None: + """ + Test that the `cut_reads` function skips reads that are too short to be processed. + + This test ensures that reads shorter than the minimum required length (42bp) are not processed + and are skipped by the `cut_reads` function. It might be useful to add a warning message in the future, + for reads under 100bp. + + This does not raise an error, but returns an empty DataFrame. + + Parameters + ---------- + self : TestCutReads + The instance of the test class containing the test data and parameters. + + Returns + ------- + None + This function does not return any value. It asserts that the result DataFrame is empty. 
+ """ + + short_seq = self.HAPPY_SEQ[:20] + self.HAPPY_SEQ[-20:] + short_qual = self.HAPPY_QUAL[:20] + self.HAPPY_QUAL[-20:] + + data: tuple[pd.DataFrame, int] = ( + pd.DataFrame( + { + "Readname": ["read_number_1"], + "Sequence": [short_seq], + "Qualities": [short_qual], + } + ), + 0, + ) + primer_1 = set(range(0, 5)) + primer_2 = set(range(15, 20)) + primer_sets = ( + defaultdict(set, {"synthetic_reference": primer_1}), + defaultdict(set, {"synthetic_reference": primer_2}), + ) + result = cut_reads( + data, + primer_sets, + self.reference, + self.preset, + self.scoring, + self.fragment_lookaround_size, + "end-to-end", + ) + assert result.empty + + def test_cut_reads_no_primers(self) -> None: + """ + Test that the `cut_reads` function handles cases where no primers are available for the reference. + + This test ensures that the `cut_reads` function can process reads even when no primers are available for the reference. + It verifies that the function does not remove any coordinates in such cases. + + Parameters + ---------- + self : TestCutReads + The instance of the test class containing the test data and parameters. + + Returns + ------- + None + This function does not return any value. It asserts that the "Removed_coordinates" column in the result DataFrame is empty. 
+ """ + data: tuple[pd.DataFrame, int] = ( + pd.DataFrame( + { + "Readname": ["read_number_1"], + "Sequence": [self.HAPPY_SEQ], + "Qualities": [self.HAPPY_QUAL], + } + ), + 0, + ) + empty_primer_1: set = set() + empty_primer_2: set = set() + primer_sets = ( + defaultdict(set, {"synthetic_reference": empty_primer_1}), + defaultdict(set, {"synthetic_reference": empty_primer_2}), + ) + result = cut_reads( + data, + primer_sets, + self.reference, + self.preset, + self.scoring, + self.fragment_lookaround_size, + "end-to-end", + ) + assert not result["Removed_coordinates"].iloc[0] # empty list + + @pytest.mark.parametrize("amplicon_type", AMPLICON_TYPES) + def test_cut_reads_happy(self, amplicon_type: str) -> None: + """ + Test a happy path scenario for the `cut_reads` function with different amplicon types. + + This test ensures that the `cut_reads` function processes reads correctly for different amplicon types, + including "end-to-end", "end-to-mid", and "fragmented". + + Parameters + ---------- + self : TestCutReads + The instance of the test class containing the test data and parameters. + amplicon_type : str + The type of amplicon, either "end-to-end", "end-to-mid", or "fragmented". + + Returns + ------- + None + This function does not return any value. It asserts that the removed coordinates match the expected coordinates. 
+ """ + data: tuple[pd.DataFrame, int] = ( + pd.DataFrame( + { + "Readname": ["read_number_1"], + "Sequence": [self.HAPPY_SEQ], + "Qualities": [self.HAPPY_QUAL], + } + ), + 0, + ) + primer_1 = set(range(0, 10)) + primer_2 = set(range(91, 101)) + primer_sets = ( + defaultdict(set, {"synthetic_reference": primer_1}), + defaultdict(set, {"synthetic_reference": primer_2}), + ) + result = cut_reads( + data, + primer_sets, + self.reference, + self.preset, + self.scoring, + self.fragment_lookaround_size, + amplicon_type, + ) + if amplicon_type == "end-to-end" or amplicon_type == "fragmented": + ete_coords: list[int] = result["Removed_coordinates"].iloc[0] + ete_expected_coords = list(primer_1) + list(primer_2) + assert sorted(ete_coords) == sorted(ete_expected_coords) + else: + assert amplicon_type == "end-to-mid" + etm_coords: list[int] = result["Removed_coordinates"].iloc[0] + etm_expected_coords = list(primer_1) + assert sorted(etm_coords) == sorted(etm_expected_coords) + + @pytest.mark.parametrize("amplicon_type", AMPLICON_TYPES) + def test_cut_reads_primer_half_on_read(self, amplicon_type: str) -> None: + """ + Test a scenario where the FW and RV primers are only partially on the read. + + This test ensures that the `cut_reads` function can handle cases where the forward (FW) and reverse (RV) primers + are only partially present on the read sequence. + + Parameters + ---------- + self : TestCutReads + The instance of the test class containing the test data and parameters. + amplicon_type : str + The type of amplicon, either "end-to-end", "end-to-mid", or "fragmented". + + Returns + ------- + None + This function does not return any value. It asserts that the removed coordinates match the expected coordinates. 
+ """ + seq = "ATCGC" + self.HAPPY_SEQ[5:-5] + "ATCGC" + data: tuple[pd.DataFrame, int] = ( + pd.DataFrame( + { + "Readname": ["read_number_1"], + "Sequence": [seq], + "Qualities": [self.HAPPY_QUAL], + } + ), + 0, + ) + primer_1 = set(range(0, 10)) + primer_2 = set(range(91, 101)) + primer_sets = ( + defaultdict(set, {"synthetic_reference": primer_1}), + defaultdict(set, {"synthetic_reference": primer_2}), + ) + result = cut_reads( + data, + primer_sets, + self.reference, + self.preset, + self.scoring, + self.fragment_lookaround_size, + amplicon_type, + ) + if amplicon_type == "end-to-end" or amplicon_type == "fragmented": + ete_coords: list[int] = result["Removed_coordinates"].iloc[0] + ete_expected_coords = list(primer_1) + list(primer_2) + assert sorted(ete_coords) == sorted(ete_expected_coords) + else: + assert amplicon_type == "end-to-mid" + etm_coords: list[int] = result["Removed_coordinates"].iloc[0] + etm_expected_coords = list(primer_1) + assert sorted(etm_coords) == sorted(etm_expected_coords) + + @pytest.mark.parametrize("amplicon_type", AMPLICON_TYPES) + def test_cut_reads_wrong_primers(self, amplicon_type: str) -> None: + """ + Test a scenario where the primers are not on the read. + + This test ensures that the `cut_reads` function can handle cases where the forward (FW) and reverse (RV) primers + are not present on the read sequence. + + Parameters + ---------- + self : TestCutReads + The instance of the test class containing the test data and parameters. + amplicon_type : str + The type of amplicon, either "end-to-end", "end-to-mid", or "fragmented". + + Returns + ------- + None + This function does not return any value. It asserts that no coordinates are removed. 
+ """ + seq = "ATCGCATCGC" + self.HAPPY_SEQ[10:-10] + "ATCGCATCGC" + data: tuple[pd.DataFrame, int] = ( + pd.DataFrame( + { + "Readname": ["read_number_1"], + "Sequence": [seq], + "Qualities": [self.HAPPY_QUAL], + } + ), + 0, + ) + primer_1 = set(range(0, 10)) + primer_2 = set(range(91, 101)) + primer_sets = ( + defaultdict(set, {"synthetic_reference": primer_1}), + defaultdict(set, {"synthetic_reference": primer_2}), + ) + result = cut_reads( + data, + primer_sets, + self.reference, + self.preset, + self.scoring, + self.fragment_lookaround_size, + amplicon_type, + ) + assert not result["Removed_coordinates"].iloc[0] diff --git a/tests/unit/test_cutlery.py b/tests/unit/test_cutlery.py new file mode 100644 index 0000000..fdd3d94 --- /dev/null +++ b/tests/unit/test_cutlery.py @@ -0,0 +1,105 @@ +""" +Unit tests for the `cutlery` module in the AmpliGone package. + +This module contains unit tests for the functions in the `cutlery` module, which provide utility functions for processing sequencing reads. + +Test Scenarios +-------------- +- Test the `position_in_or_before_primer` function with various read positions. +- Test the `position_in_or_after_primer` function with various read positions. + +Functions +--------- +test_position_in_or_before_primer(read, result) + Tests the `position_in_or_before_primer` function with various read positions and expected results. +test_position_in_or_after_primer(read, result) + Tests the `position_in_or_after_primer` function with various read positions and expected results. 
+""" + +import pytest + +from AmpliGone import cutlery + + +@pytest.mark.parametrize( + "read, result", + [ + (22, True), + (25, True), + (10, False), + (26, False), + ], + ids=[ + "before_within_lookaround", + "same_as_primerstart", + "before_outside_lookaround", + "higher_than_primerstart", + ], +) +def test_position_in_or_before_primer(read: int, result: bool) -> None: + """ + Tests the `position_in_or_before_primer` function with various read positions and expected results. + + Parameters + ---------- + read : int + The read position to test. + result : bool + The expected result indicating whether the read position is in or before the primer. + + Returns + ------- + None + This function does not return any value. It asserts that the function's output matches the expected result. + """ + primer_positions = (25, 35) + max_lookaround = 10 + outcome = cutlery.position_in_or_before_primer( + read, primer_positions, max_lookaround + ) + if outcome != result: + raise AssertionError( + f"Expected {result} but got {outcome} while running cutlery.position_in_or_before_primer({read}, {primer_positions}, {max_lookaround})" + ) + + +@pytest.mark.parametrize( + "read, result", + [ + (22, False), + (10, False), + (25, True), + (26, True), + ], + ids=[ + "before_within_lookaround", + "before_outside_lookaround", + "same_as_primerstart", + "higher_than_primerstart", + ], +) +def test_postition_in_or_after_primer(read: int, result: bool) -> None: + """ + Tests the `position_in_or_after_primer` function with various read positions and expected results. + + Parameters + ---------- + read : int + The read position to test. + result : bool + The expected result indicating whether the read position is in or after the primer. + + Returns + ------- + None + This function does not return any value. It asserts that the function's output matches the expected result. 
+ """ + primer_positions = (25, 35) + max_lookaround = 10 + outcome = cutlery.position_in_or_after_primer( + read, primer_positions, max_lookaround + ) + if outcome != result: + raise AssertionError( + f"Expected {result} but got {outcome} while running cutlery.position_in_or_after_primer({read}, {primer_positions}, {max_lookaround})" + ) diff --git a/tests/unit/test_fasta2bed.py b/tests/unit/test_fasta2bed.py new file mode 100644 index 0000000..88aa01d --- /dev/null +++ b/tests/unit/test_fasta2bed.py @@ -0,0 +1,53 @@ +import os +from typing import Generator + +import pandas as pd +import pytest + +from AmpliGone.fasta2bed import main + + +@pytest.fixture() +def setup() -> Generator[tuple[str, str, str, str], None, None]: + path_to_fasta = "tests/data/primers/ARTIC-V5.3.2.fasta" + path_to_reference = "tests/data/references/SARS-CoV-2-reference.fasta" + path_to_output = "tests/data/primers/new.bed" + path_to_example = "tests/data/primers/SARS-CoV-2-ARTIC-V5.3.2.scheme.bed" + + yield path_to_fasta, path_to_reference, path_to_output, path_to_example + + if os.path.exists(path_to_output): + os.remove(path_to_output) + + +class TestFasta2Bed: + def compare_bed_files(self, result: str, example: str) -> None: + res_df = pd.read_csv(result, sep="\t", header=None) + example_df = pd.read_csv(example, sep="\t", header=None) + + # drop the names [3], they are not important + # and + # drop the score column [4], as AmpliGone uses it to store the alignment score + # while ARTIC (files used for testing) uses it to store the primer pool + res_df = res_df.drop(columns=[3, 4]) + example_df = example_df.drop(columns=[3, 4]) + + if not res_df.equals(example_df): + raise AssertionError(f"{result} and {example} are not equal") + + def test_fasta2bed(self, setup: tuple[str, str, str, str]) -> None: + path_to_fasta, path_to_reference, path_to_output, path_to_example = setup + args = [ + "--primers", + path_to_fasta, + "--reference", + path_to_reference, + "--output", + path_to_output, + 
] + main(args) + if not os.path.exists(path_to_output): + raise AssertionError(f"{path_to_output} was not created") + if os.path.getsize(path_to_output) == 0: + raise AssertionError(f"{path_to_output} is empty") + self.compare_bed_files(path_to_output, path_to_example)