fmfi-compbio · M-Fedor · Apr 22, 2021 · ppershing · Jun 27, 2021 · M-Fedor
diff --git a/interface/__init__.py b/interface/__init__.py
@@ -0,0 +1,5 @@
+from .data_types import Arguments, LibraryComponents, OutputData, FilePath, Path
+from .argument_parser import ArgumentParserInterface
+from .input_reader import InputReaderInterface
+from .output_formatter import OutputFormatterInterface
+from .task_executor import BasecallerInterface, TaskExecutorInterface
diff --git a/interface/argument_parser.py b/interface/argument_parser.py
@@ -0,0 +1,23 @@
+"""
+This module defines public interface of ArgumentParser class.
+See test/sample_app.py for example usage.
+"""
+
+from .data_types import Arguments, LibraryComponents
+
+
+class ArgumentParserInterface:
+
+    def parse_arguments(self) -> Arguments:
+        """
+        Method parses user arguments and returns filled Arguments container
+        necessary for proper library integration.
+        """
+        raise NotImplementedError
+
+    def get_library_components(self) -> LibraryComponents:
+        """
+        Method returns configured and initialized library components
+        ready for use by integrator.
+        """
+        raise NotImplementedError
diff --git a/interface/data_types.py b/interface/data_types.py
@@ -0,0 +1,57 @@
+"""
+This module defines public library data containers and type aliases.
+See test/sample_app.py for example usage.
+"""
+
+from __future__ import annotations
+from typing import NamedTuple
+
+
+class OutputData(NamedTuple):
+    """
+    Class is a container for basecaller output.
+    All fields are mandatory and filled by this library,
+    so MinKnow-like output can be provided.
+
+    Being a NamedTuple, instances are immutable and the order of the
+    fields below is also the order of positional construction.
+    """
+
+    read_id: str
+    run_id: str
+    read_number: int
+    # NOTE(review): annotated as text although named a number, unlike
+    # read_number above - confirm this is intended.
+    channel_number: str
+    start_time: str
+    # Presumably the two strings returned by the injected basecaller
+    # (sequence first, quality scores second) - verify against the
+    # task executor implementation.
+    basecalled_sequence: str
+    quality_scores: str
+
+
+class LibraryComponents(NamedTuple):
+    """
+    Class is a container for class objects executing
+    library functionality.
+    Library components can be obtained from ArgumentParser object.
+    Components are always returned ready for use.
+    NOTE(review): only the input reader interface declares an
+    initialize() method, so "initialized" presumably refers to that
+    component - confirm for the others.
+    """
+
+    # The interface names used below are not imported into this module.
+    # The annotations are accepted only because
+    # `from __future__ import annotations` keeps them unevaluated.
+    input_reader: InputReaderInterface
+    task_executor: TaskExecutorInterface
+    output_formatter: OutputFormatterInterface
+
+
+class Arguments(NamedTuple):
+    """
+    Class is a container for parsed user arguments unrelated
+    to configuration of library components.
+    Those arguments are rather relevant for configuration
+    on integrator side (i.e. basecaller object configuration).
+    Arguments can be obtained from ArgumentParser object.
+    """
+
+    # True when the --watch-directory flag was given.
+    watch_directory: bool
+    # Network size as text; the command line parser restricts it to
+    # "48", "56", "64", "80", "96" or "256".
+    network_type: str
+    # The three fields below are None when the corresponding option
+    # (--weights, --beam-size, --beam-cut-threshold) is not given on
+    # the command line, so consumers must handle None.
+    weights_path: FilePath
+    beam_size: int
+    beam_cut_threshold: float
+
+
+# Plain `str` aliases used only to make annotations more descriptive.
+# InputReaderInterface uses FilePath for input files and Path for
+# input directories.
+# They are defined after the classes above that mention them, which
+# works because annotations in this module are not evaluated.
+FilePath = str
+Path = str
diff --git a/interface/input_reader.py b/interface/input_reader.py
@@ -0,0 +1,35 @@
+"""
+This module defines public interface of InputReader class family.
+See test/sample_app.py for example usage.
+"""
+
+from typing import List
+
+from .data_types import FilePath, Path
+
+
+class InputReaderInterface:
+
+    def __init__(self, input_directories: List[Path], input_files: List[FilePath]) -> None:
+
+        self.input_directories = input_directories
+        self.input_files = input_files
+        self.task_batch = None
+
+    def initialize(self) -> None:
+        """
+        Method initializes class object.
+        This method is always called by ArgumentParser and should NOT be
+        called by integrator.
+        """
+        raise NotImplementedError
+
+    def get_next_batch(self) -> List[FilePath]:
+        """
+        Method returns next batch of files ready to be processed
+        by task executor.
+        This method can be called as many times as necessary on
+        DirectoryWatcher family, but should be called only ONCE
+        on basic DirectoryReaders.
+        """
+        raise NotImplementedError
diff --git a/interface/output_formatter.py b/interface/output_formatter.py
@@ -0,0 +1,32 @@
+"""
+This module defines public interface for OutputFormatter class family.
+See test/sample_app.py for example usage.
+"""
+
+import gzip
+from typing import List
+
+from .data_types import OutputData
+
+
+class OutputFormatterInterface:
+
+    def __init__(self, output_name: str, compressed_output: bool) -> None:
+
+        self.compressed_output = compressed_output
+
+        if self.compressed_output:
+            self.output_stream = gzip.open(output_name, "wt")
+        else:
+            self.output_stream = open(output_name, "w")
+
+    def __del__(self) -> None:
+        self.output_stream.close()
+
+    def write_output(self, output_data: List[OutputData]) -> None:
+        """
+        Method implements printing of output batch in various formats.
+        Output is flushed immediately and fsync is called after
+        every batch is printed.
+        """
+        raise NotImplementedError
diff --git a/interface/task_executor.py b/interface/task_executor.py
@@ -0,0 +1,58 @@
+"""
+This module defines public interface of the TaskExecutor class family.
+Also defines interface that basecaller dependency injection MUST implement.
+See test/sample_app.py for example usage.
+"""
+
+from dataclasses import dataclass
+from typing import List, Tuple
+
+import numpy as np
+
+from .data_types import FilePath, OutputData
+
+@dataclass
+class BasecallerInterface:
+    """
+    Task executors in this library can use any injected
+    basecaller object that implements following interface.
+    No default basecaller is available.
+    """
+
+    def call_raw_signal(self, signal: np.ndarray) -> Tuple[str, str]:
+        """
+        Method implements signal basecalling. Takes one argument
+        of numpy.ndarray of numpy.float32 values.
+        Please note that input signal is already normalized by library.
+        This can be discussed in the future (see source/task_executor.py).
+
+        Method returns a tuple of strings. Those are basecalled sequence
+        and quality scores (as defined in fastq file format) respectively.
+        """
+        raise NotImplementedError
+
+
+class TaskExecutorInterface:
+
+    def __init__(self) -> None:
+        self.caller = None
+
+    def set_caller(self, caller: BasecallerInterface) -> None:
+        """
+        Method lets integrator to inject a custom basecaller object and
+        verifies at least partially whether it implements required interface.
+        """
+
+        obj_type = type(caller)
+
+        if hasattr(obj_type, 'call_raw_signal') and callable(obj_type.call_raw_signal):
+            self.caller = caller
+        else:
+            raise ValueError('Caller object does NOT implement BasecallerInterface!')
+
+    def execute_task_batch(self, tasks: List[FilePath]) -> List[OutputData]:
+        """
+        Method performs basecalling using custom basecaller object on all files in task batch.
+        Returns list of filled OutputData objects ready for output formatting.
+        """
+        raise NotImplementedError
diff --git a/source/__init__.py b/source/__init__.py
@@ -0,0 +1,14 @@
+from sys import platform
+
+from .input_reader import DirectoryReader
+
+if platform == 'linux':
+    from .input_reader import DirectoryWatcherLinux as DirectoryWatcher
+elif platform == 'win32':
+    from .input_reader import DirectoryWatcherWindows as DirectoryWatcher
+elif platform == 'darwin':
+    from .input_reader import DirectoryWatcherDarwin as DirectoryWatcher
+
+from .output_formatter import OutputFormatterFasta, OutputFormatterFastq
+from .task_executor import SequentialTaskExecutor, ParallelTaskExecutor
+from .argument_parser import ArgumentParser
diff --git a/source/argument_parser.py b/source/argument_parser.py
@@ -0,0 +1,86 @@
+import argparse
+
+from interface import (ArgumentParserInterface, InputReaderInterface, OutputFormatterInterface,
+                       TaskExecutorInterface, Arguments, LibraryComponents)
+
+from source import (DirectoryReader, DirectoryWatcher, OutputFormatterFasta,
+                    OutputFormatterFastq, SequentialTaskExecutor, ParallelTaskExecutor)
+
+
+class ArgumentParser(ArgumentParserInterface):
+
+    def __init__(self) -> None:
+
+        self.parser = argparse.ArgumentParser(description='Fast caller for ONT reads')
+        self.library_components = None
+
+    def parse_arguments(self) -> Arguments:
+
+        self.parser.add_argument('--directory', type=str, nargs='*',
+                                 help='One or more directories with reads')
+        self.parser.add_argument('--watch-directory', action='store_true', default=False,
+                                 help='Watch directories for new reads')
+        self.parser.add_argument('--reads', type=str, nargs='*',
+                                 help='One or more read files')
+
+        self.parser.add_argument("--cores", type=int, default=1,
+                                 help="Number of cores available for basecalling, defaults to 1")
+
+        self.parser.add_argument("--output", type=str, required=True,
+                                 help="Output FASTA/FASTQ file name")
+        self.parser.add_argument("--output-format", choices=["fasta", "fastq"], default="fasta")
+        self.parser.add_argument("--gzip-output", action="store_true",
+                                 help="Compress output with gzip")
+
+        self.parser.add_argument("--weights", type=str, default=None,
+                                 help="Path to network weights; only used for custom weights")
+        self.parser.add_argument("--network-type", choices=["48", "56", "64", "80", "96", "256"], default="48",
+                                 help="Size of network. Default 48")
+        self.parser.add_argument("--beam-size", type=int, default=None,
+                                 help="Beam size (defaults 5 for 48,56,64,80,96 and 20 for 256). Use 1 for greedy decoding.")
+        self.parser.add_argument("--beam-cut-threshold", type=float, default=None,
+                                 help="Threshold for creating beams (higher means faster beam search, but smaller accuracy). \
+                                 Values higher than 0.2 might lead to weird errors. Default 0.1 for 48,...,96 and 0.0001 for 256")
+
+        arguments = self.parser.parse_args()
+
+        input_reader = _initialize_input_reader(arguments)
+        task_executor = _initialize_task_executor(arguments)
+        output_formatter = _initialize_output_formatter(arguments)
+
+        self.library_components = LibraryComponents(input_reader, task_executor, output_formatter)
+
+        return Arguments(arguments.watch_directory, arguments.network_type, arguments.weights,
+                         arguments.beam_size, arguments.beam_cut_threshold)
+
+    def get_library_components(self) -> LibraryComponents:
+        return self.library_components
+
+
+def _initialize_input_reader(arguments: argparse.Namespace) -> InputReaderInterface:
+
+    if arguments.watch_directory:
+        watcher = DirectoryWatcher(arguments.directory, arguments.reads)
+    else:
+        watcher = DirectoryReader(arguments.directory, arguments.reads)
+
+    watcher.initialize()
+    return watcher
+
+
+def _initialize_task_executor(arguments: argparse.Namespace) -> TaskExecutorInterface:
+
+    if arguments.cores <= 1:
+        return SequentialTaskExecutor()
+
+    return ParallelTaskExecutor(arguments.cores)
+
+
+def _initialize_output_formatter(arguments: argparse.Namespace) -> OutputFormatterInterface:
+
+    if arguments.output_format == 'fasta':
+        return OutputFormatterFasta(arguments.output, arguments.gzip_output)
+    if arguments.output_format == 'fastq':
+        return OutputFormatterFastq(arguments.output, arguments.gzip_output)
+
+    return None