From 6e603b1749546260ba620edc0460228cdd4c0b4a Mon Sep 17 00:00:00 2001 From: "Magnus G. Jespersen" <44769523+milnus@users.noreply.github.com> Date: Wed, 29 Dec 2021 10:00:13 +1100 Subject: [PATCH 001/135] Add in action for tests on main branch --- .github/workflows/Test.yml | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100644 .github/workflows/Test.yml diff --git a/.github/workflows/Test.yml b/.github/workflows/Test.yml new file mode 100644 index 0000000..01963ec --- /dev/null +++ b/.github/workflows/Test.yml @@ -0,0 +1,17 @@ +name: 'Test' +on: + push: + pull_request: + branches: + - main +jobs: + test: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - name: 'Build Docker image' + run: docker build -t corekaburra . + - name: 'Run unit tests' + run: docker run --entrypoint /Corekaburra/.travis/unit-test.sh corekaburra + - name: 'Run functional tests' + run: docker run --entrypoint /Corekaburra/functional_tests/Corekaburra-test.sh corekaburra -p Corekaburra -d /Corekaburra/functional_tests/test_data -v From 7f146daed9da3fe2c64e54faa2b9f34a30b9bf25 Mon Sep 17 00:00:00 2001 From: "Magnus G. Jespersen" <44769523+milnus@users.noreply.github.com> Date: Wed, 29 Dec 2021 10:14:20 +1100 Subject: [PATCH 002/135] Add in Github action to test dev branch --- .github/workflows/test_dev.yml | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 .github/workflows/test_dev.yml diff --git a/.github/workflows/test_dev.yml b/.github/workflows/test_dev.yml new file mode 100644 index 0000000..cf9f235 --- /dev/null +++ b/.github/workflows/test_dev.yml @@ -0,0 +1,16 @@ +name: 'Corekaburra_test_suite' +on: + push: + branches: + - dev +jobs: + build: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - name: 'Build Docker image' + run: docker build -t corekaburra . 
+ - name: 'Run unit tests' + run: docker run --entrypoint /Corekaburra/.travis/unit-test.sh corekaburra + - name: 'Run functional tests' + run: docker run --entrypoint /Corekaburra/functional_tests/Corekaburra-test.sh corekaburra -p Corekaburra -d /Corekaburra/functional_tests/test_data -v From 8fb187444345c564669719de7d82d4460fe7a466 Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Wed, 29 Dec 2021 11:26:09 +1100 Subject: [PATCH 003/135] delete sumamrt_table --- summary_table.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 summary_table.py diff --git a/summary_table.py b/summary_table.py deleted file mode 100644 index e69de29..0000000 From dea2fa8f443a4c6591c95e7905f17ed04d4d850c Mon Sep 17 00:00:00 2001 From: "Magnus G. Jespersen" <44769523+milnus@users.noreply.github.com> Date: Wed, 29 Dec 2021 11:28:53 +1100 Subject: [PATCH 004/135] Add that test should only occur for main when push --- .github/workflows/Test.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/Test.yml b/.github/workflows/Test.yml index 01963ec..4097794 100644 --- a/.github/workflows/Test.yml +++ b/.github/workflows/Test.yml @@ -1,6 +1,8 @@ name: 'Test' on: push: + branches: + - main pull_request: branches: - main From 64c3c2d4798400a0a23190a2c196b812c28ca10a Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Wed, 29 Dec 2021 12:21:07 +1100 Subject: [PATCH 005/135] Add in the commandline interface and nice exit function with unit and functional tests --- Code_to_transfer/commandline_interface.py | 71 ------ Corekaburra/Corekaburra.py | 236 ------------------ Corekaburra/Corekaburra_test.py | 102 -------- Corekaburra/__main__.py | 106 ++++++++ Corekaburra/commandline_interface.py | 109 ++++++++ Corekaburra/exit_with_error.py | 30 +++ Dockerfile | 2 +- functional_tests/Corekaburra-test.sh | 21 +- functional_tests/test_data/no_input.expected | 22 ++ 
unit_tests/Corekaburra_test.py | 47 ++++ unit_tests/unit_test_data/.DS_Store | Bin 0 -> 6148 bytes .../TestExitWithError/.DS_Store | Bin 0 -> 6148 bytes .../TestExitWithError/tmp_folder/test_file | 0 13 files changed, 327 insertions(+), 419 deletions(-) delete mode 100644 Code_to_transfer/commandline_interface.py delete mode 100644 Corekaburra/Corekaburra.py delete mode 100644 Corekaburra/Corekaburra_test.py create mode 100644 Corekaburra/__main__.py create mode 100644 Corekaburra/commandline_interface.py create mode 100644 Corekaburra/exit_with_error.py create mode 100644 functional_tests/test_data/no_input.expected create mode 100644 unit_tests/Corekaburra_test.py create mode 100644 unit_tests/unit_test_data/.DS_Store create mode 100644 unit_tests/unit_test_data/TestExitWithError/.DS_Store create mode 100644 unit_tests/unit_test_data/TestExitWithError/tmp_folder/test_file diff --git a/Code_to_transfer/commandline_interface.py b/Code_to_transfer/commandline_interface.py deleted file mode 100644 index 7c7a2b0..0000000 --- a/Code_to_transfer/commandline_interface.py +++ /dev/null @@ -1,71 +0,0 @@ -import argparse - - -def get_commandline_arguments(args): - # Set up parser - parser = argparse.ArgumentParser(description='Welcome to Coredial!\n ' - 'Program to determine consensus core sequence from multiple genomes.\n' - 'Outputs consensus core gene alignment, distance between core genes, ' - 'number of accessory genes between core genes and low frequency genes ' - 'between core genes') - - parser.add_argument('-i_gffs', - '--input_gffs', - help='Path to gff files used for pan-genome', - required=True, - dest='input_gffs', - nargs='+') - - parser.add_argument('-i_pan', - '--input_pangenome_folder', - help='Path to the folder produced by Panaroo or Roary', - required=True, - dest='input_pan') - - parser.add_argument('-o', - help='Path to where output files will be placed', - required=True, - type=str, - dest='output_path') - - parser.add_argument('-p', - '--prefix', - 
help='Prefix for output files, if any is desired', - required=False, - default=None, - - dest='output_prefix') - - parser.add_argument('-a', - '--annotate_refound', - help='Flag to toggle off creation of new gff files, with annotation of refound genes.\n' - 'Only done if input pangenome is detected as comming from Panaroo', - required=False, - default=True, - action='store_false', - dest='annotate') - - parser.add_argument('-c', - '--complete_genomes', - help='text file containing names of genomes that are to be handled as complete genomes', - required=False, - default=None, - dest='comp_genomes') - - parser.add_argument('-q', - '--quiet', - help='Flag to toggle off printed info about the run', - required=False, - default=False, - action='store_true', - dest='quiet') - - # Check if any thing is given as input otherwise warn and print help - if len(args) == 0: - parser.print_usage() - print("Or run with -h or --help for more information") - exit(code=1) - else: - args = parser.parse_args(args) - - return args diff --git a/Corekaburra/Corekaburra.py b/Corekaburra/Corekaburra.py deleted file mode 100644 index fa5bb15..0000000 --- a/Corekaburra/Corekaburra.py +++ /dev/null @@ -1,236 +0,0 @@ -''' -Module : Main -Description : The main entry point for the program. -Copyright : (c) Magnus Ganer Jespersen, 11 Oct 2021 -License : MIT -Maintainer : magnus.ganer.j@gmail.com -Portability : POSIX - -The program reads one or more input FASTA files. For each file it computes a -variety of statistics, and then prints a summary of the statistics as output. 
-''' - -from argparse import ArgumentParser -from math import floor -import sys -import logging -import pkg_resources -from Bio import SeqIO - - -EXIT_FILE_IO_ERROR = 1 -EXIT_COMMAND_LINE_ERROR = 2 -EXIT_FASTA_FILE_ERROR = 3 -DEFAULT_MIN_LEN = 0 -DEFAULT_VERBOSE = False -HEADER = 'FILENAME\tNUMSEQ\tTOTAL\tMIN\tAVG\tMAX' -PROGRAM_NAME = "Corekaburra" - - -try: - PROGRAM_VERSION = pkg_resources.require(PROGRAM_NAME)[0].version -except pkg_resources.DistributionNotFound: - PROGRAM_VERSION = "undefined_version" - - -def exit_with_error(message, exit_status): - '''Print an error message to stderr, prefixed by the program name and 'ERROR'. - Then exit program with supplied exit status. - - Arguments: - message: an error message as a string. - exit_status: a positive integer representing the exit status of the - program. - ''' - logging.error(message) - print("{} ERROR: {}, exiting".format(PROGRAM_NAME, message), file=sys.stderr) - sys.exit(exit_status) - - -def parse_args(): - '''Parse command line arguments. - Returns Options object with command line argument values as attributes. - Will exit the program on a command line error. 
- ''' - description = 'Read one or more FASTA files, compute simple stats for each file' - parser = ArgumentParser(description=description) - parser.add_argument( - '--minlen', - metavar='N', - type=int, - default=DEFAULT_MIN_LEN, - help='Minimum length sequence to include in stats (default {})'.format( - DEFAULT_MIN_LEN)) - parser.add_argument('--version', - action='version', - version='%(prog)s ' + PROGRAM_VERSION) - parser.add_argument('--log', - metavar='LOG_FILE', - type=str, - help='record program progress in LOG_FILE') - parser.add_argument('fasta_files', - nargs='*', - metavar='FASTA_FILE', - type=str, - help='Input FASTA files') - return parser.parse_args() - - -class FastaStats(object): - '''Compute various statistics for a FASTA file: - - num_seqs: the number of sequences in the file satisfying the minimum - length requirement (minlen_threshold). - num_bases: the total length of all the counted sequences. - min_len: the minimum length of the counted sequences. - max_len: the maximum length of the counted sequences. - average: the average length of the counted sequences rounded down - to an integer. 
- ''' - #pylint: disable=too-many-arguments - def __init__(self, - num_seqs=None, - num_bases=None, - min_len=None, - max_len=None, - average=None): - "Build an empty FastaStats object" - self.num_seqs = num_seqs - self.num_bases = num_bases - self.min_len = min_len - self.max_len = max_len - self.average = average - - def __eq__(self, other): - "Two FastaStats objects are equal iff their attributes are equal" - if type(other) is type(self): - return self.__dict__ == other.__dict__ - return False - - def __repr__(self): - "Generate a printable representation of a FastaStats object" - return "FastaStats(num_seqs={}, num_bases={}, min_len={}, max_len={}, " \ - "average={})".format( - self.num_seqs, self.num_bases, self.min_len, self.max_len, - self.average) - - def from_file(self, fasta_file, minlen_threshold=DEFAULT_MIN_LEN): - '''Compute a FastaStats object from an input FASTA file. - - Arguments: - fasta_file: an open file object for the FASTA file - minlen_threshold: the minimum length sequence to consider in - computing the statistics. Sequences in the input FASTA file - which have a length less than this value are ignored and not - considered in the resulting statistics. - Result: - A FastaStats object - ''' - num_seqs = num_bases = 0 - min_len = max_len = None - for seq in SeqIO.parse(fasta_file, "fasta"): - this_len = len(seq) - if this_len >= minlen_threshold: - if num_seqs == 0: - min_len = max_len = this_len - else: - min_len = min(this_len, min_len) - max_len = max(this_len, max_len) - num_seqs += 1 - num_bases += this_len - if num_seqs > 0: - self.average = int(floor(float(num_bases) / num_seqs)) - else: - self.average = None - self.num_seqs = num_seqs - self.num_bases = num_bases - self.min_len = min_len - self.max_len = max_len - return self - - def pretty(self, filename): - '''Generate a pretty printable representation of a FastaStats object - suitable for output of the program. 
The output is a tab-delimited - string containing the filename of the input FASTA file followed by - the attributes of the object. If 0 sequences were read from the FASTA - file then num_seqs and num_bases are output as 0, and min_len, average - and max_len are output as a dash "-". - - Arguments: - filename: the name of the input FASTA file - Result: - A string suitable for pretty printed output - ''' - if self.num_seqs > 0: - num_seqs = str(self.num_seqs) - num_bases = str(self.num_bases) - min_len = str(self.min_len) - average = str(self.average) - max_len = str(self.max_len) - else: - num_seqs = num_bases = "0" - min_len = average = max_len = "-" - return "\t".join([filename, num_seqs, num_bases, min_len, average, - max_len]) - - -def process_files(options): - '''Compute and print FastaStats for each input FASTA file specified on the - command line. If no FASTA files are specified on the command line then - read from the standard input (stdin). - - Arguments: - options: the command line options of the program - Result: - None - ''' - if options.fasta_files: - for fasta_filename in options.fasta_files: - logging.info("Processing FASTA file from %s", fasta_filename) - try: - fasta_file = open(fasta_filename) - except IOError as exception: - exit_with_error(str(exception), EXIT_FILE_IO_ERROR) - else: - with fasta_file: - stats = FastaStats().from_file(fasta_file, options.minlen) - print(stats.pretty(fasta_filename)) - else: - logging.info("Processing FASTA file from stdin") - stats = FastaStats().from_file(sys.stdin, options.minlen) - print(stats.pretty("stdin")) - - -def init_logging(log_filename): - '''If the log_filename is defined, then - initialise the logging facility, and write log statement - indicating the program has started, and also write out the - command line from sys.argv - - Arguments: - log_filename: either None, if logging is not required, or the - string name of the log file to write to - Result: - None - ''' - if log_filename is not None: - 
logging.basicConfig(filename=log_filename, - level=logging.DEBUG, - filemode='w', - format='%(asctime)s %(levelname)s - %(message)s', - datefmt="%Y-%m-%dT%H:%M:%S%z") - logging.info('program started') - logging.info('command line: %s', ' '.join(sys.argv)) - - -def main(): - "Orchestrate the execution of the program" - options = parse_args() - init_logging(options.log) - print(HEADER) - process_files(options) - - -# If this script is run from the command line then call the main function. -if __name__ == '__main__': - main() diff --git a/Corekaburra/Corekaburra_test.py b/Corekaburra/Corekaburra_test.py deleted file mode 100644 index 7d17b22..0000000 --- a/Corekaburra/Corekaburra_test.py +++ /dev/null @@ -1,102 +0,0 @@ -''' -Unit tests for Corekaburra. - -Usage: python -m unittest -v Corekaburra_test -''' - -import unittest -from io import StringIO -#pylint: disable=no-name-in-module -from Corekaburra import FastaStats - -class TestFastaStats(unittest.TestCase): - '''Unit tests for FastaStats''' - def do_test(self, input_str, minlen, expected): - "Wrapper function for testing FastaStats" - result = FastaStats().from_file(StringIO(input_str), minlen) - self.assertEqual(expected, result) - - def test_zero_byte_input(self): - "Test input containing zero bytes" - expected = FastaStats(num_seqs=0, - num_bases=0, - min_len=None, - max_len=None, - average=None) - self.do_test('', 0, expected) - - def test_single_newline_input(self): - "Test input containing a newline (\n) character" - expected = FastaStats(num_seqs=0, - num_bases=0, - min_len=None, - max_len=None, - average=None) - self.do_test('\n', 0, expected) - - def test_single_greater_than_input(self): - "Test input containing a single greater-than (>) character" - expected = FastaStats(num_seqs=1, - num_bases=0, - min_len=0, - max_len=0, - average=0) - self.do_test('>', 0, expected) - - def test_one_sequence(self): - "Test input containing one sequence" - expected = FastaStats(num_seqs=1, - num_bases=5, - min_len=5, - 
max_len=5, - average=5) - self.do_test(">header\nATGC\nA", 0, expected) - - def test_two_sequences(self): - "Test input containing two sequences" - expected = FastaStats(num_seqs=2, - num_bases=9, - min_len=2, - max_len=7, - average=4) - self.do_test(">header1\nATGC\nAGG\n>header2\nTT\n", 0, expected) - - def test_no_header(self): - "Test input containing sequence without preceding header" - expected = FastaStats(num_seqs=0, - num_bases=0, - min_len=None, - max_len=None, - average=None) - self.do_test("no header\n", 0, expected) - - def test_minlen_less_than_all(self): - "Test input when --minlen is less than 2 out of 2 sequences" - expected = FastaStats(num_seqs=2, - num_bases=9, - min_len=2, - max_len=7, - average=4) - self.do_test(">header1\nATGC\nAGG\n>header2\nTT\n", 2, expected) - - def test_minlen_greater_than_one(self): - "Test input when --minlen is less than 1 out of 2 sequences" - expected = FastaStats(num_seqs=1, - num_bases=7, - min_len=7, - max_len=7, - average=7) - self.do_test(">header1\nATGC\nAGG\n>header2\nTT\n", 3, expected) - - def test_minlen_greater_than_all(self): - "Test input when --minlen is greater than 2 out of 2 sequences" - expected = FastaStats(num_seqs=0, - num_bases=0, - min_len=None, - max_len=None, - average=None) - self.do_test(">header1\nATGC\nAGG\n>header2\nTT\n", 8, expected) - - -if __name__ == '__main__': - unittest.main() diff --git a/Corekaburra/__main__.py b/Corekaburra/__main__.py new file mode 100644 index 0000000..89d39a9 --- /dev/null +++ b/Corekaburra/__main__.py @@ -0,0 +1,106 @@ +''' +Module : Main +Description : The main entry point for the Corekaburra. +Copyright : (c) Magnus Ganer Jespersen, 11 Oct 2021 +License : MIT +Maintainer : magnus.ganer.j@gmail.com +Portability : POSIX + +The program reads one or more input FASTA files. For each file it computes a +variety of statistics, and then prints a summary of the statistics as output. 
# TODO - Change description +''' + +import os +import logging +import time + +try: + from Corekaburra.commandline_interface import get_commandline_arguments +except ModuleNotFoundError: + from commandline_interface import get_commandline_arguments + +from argparse import ArgumentParser +from math import floor +import sys +import pkg_resources + +EXIT_FILE_IO_ERROR = 1 +EXIT_COMMAND_LINE_ERROR = 2 +EXIT_FASTA_FILE_ERROR = 3 +DEFAULT_MIN_LEN = 0 +DEFAULT_VERBOSE = False +PROGRAM_NAME = "Corekaburra" + + +try: + PROGRAM_VERSION = pkg_resources.require(PROGRAM_NAME)[0].version +except pkg_resources.DistributionNotFound: + PROGRAM_VERSION = "undefined_version" + + +def init_logging(debug_log, quiet, out_path): + """ + initialise the logging file, and write log statement + indicating the program has started, and also write out the + command line from sys.argv + :param debug_log: Bool indicating if log is a debug log + :param quiet: Bool indicating if logging should be kept minimal + :param out_path: Output path for the program, and where log will be placed + :return: Logger object + """ + if debug_log: + level = logging.DEBUG + elif quiet: + level = logging.WARNING + else: + level = logging.INFO + + # Construct logger logging to file + file_logger = logging.getLogger(__name__) + file_logger.setLevel(level) + + formatter = logging.Formatter('[%(asctime)s] %(levelname)s - %(module)s - %(message)s', + datefmt="%Y-%m-%dT%H:%M:%S%z") + + file_handler = logging.FileHandler(os.path.join(out_path, 'Corekaburra.log')) + file_handler.setLevel(level) + file_handler.setFormatter(formatter) + file_logger.addHandler(file_handler) + + # Log command-line argument and debug line for Corekaburra start + file_logger.info(f"command line: {' '.join(argv)}") + + return file_logger + + +def stream_logging(file_logger): + """ + Function adding in stream logging following initial logging + :param file_logger: Logger object + :return: Logger object with added stream logging + """ + stream_handler 
= logging.StreamHandler() + stream_handler.setLevel(logging.INFO) + + file_logger.addHandler(stream_handler) + + file_logger.info('Processing started') + + return file_logger + + +def main(): + """ + This is the main function for running Corekaburra. + Requires a pan-genome folder from an allowed program, along with GFF3 files used for producing the pan-genome + :return: + """ + total_time_start = time.time() + + # get arguments from the commandline + args = get_commandline_arguments(sys.argv[1:]) + + +# If this script is run from the command line then call the main function. +if __name__ == '__main__': + main() diff --git a/Corekaburra/commandline_interface.py b/Corekaburra/commandline_interface.py new file mode 100644 index 0000000..7ce4075 --- /dev/null +++ b/Corekaburra/commandline_interface.py @@ -0,0 +1,109 @@ +import argparse +import sys +try: + from Corekaburra.exit_with_error import exit_with_error +except ModuleNotFoundError: + from exit_with_error import exit_with_error + +EXIT_COMMAND_LINE_ERROR = 2 + + +def get_commandline_arguments(args): + """ + Function that takes the input given to the commandline and passes it. + will check for no input and '-help' + :param args: List of input arguments given to the commandline + :return: matched argument object for passing in main function. + """ + # Set up parser + parser = argparse.ArgumentParser(description='Welcome to Corekaburra!\n ' + 'Program to determine consensus core sequence from multiple genomes.\n' + 'Outputs consensus core gene alignment, distance between core genes, ' + 'number of accessory genes between core genes and low frequency genes ' + 'between core genes') #TODO - Change + + parser.add_argument('-ig', + '--input_gffs', + help='Path to gff files used for pan-genome', + required=True, + metavar='file_1.gff ... 
file_n.gff', + dest='input_gffs', + nargs='+') + + parser.add_argument('-ip', + '--input_pangenome', + help='Path to the folder produced by Panaroo or Roary', + metavar='path/to/pan_genome', + required=True, + dest='input_pan') + + parser.add_argument('-cg', + '--complete_genomes', + help='text file containing names of genomes that are to be handled as complete genomes', + required=False, + metavar='complete_genomes.txt', + default=None, + dest='comp_genomes') + + parser.add_argument('-o', + '--output', + help='Path to where output files will be placed [default: current folder]', + required=False, + type=str, + metavar='path/to/output', + default='.', + dest='output_path') + + parser.add_argument('-p', + '--prefix', + help='Prefix for output files, if any is desired', + required=False, + default=None, + dest='output_prefix') + + parser.add_argument('-a', + '--no_annotate_refound', + help='Flag to toggle off the creation of new gff files, with annotation of refound genes.\n' + 'Only done if input pangenome is detected as comming from Panaroo', + required=False, + default=True, + action='store_false', + dest='annotate') + + parser.add_argument('-c', + '--cpu', + help='Give max number of CPUs [default: 1]', + required=False, + metavar='int', + default=1, + type=int, + dest='cpu') + + logger_level = parser.add_mutually_exclusive_group() + logger_level.add_argument('-l', + '--log', + help='Record program progress in for debugging purpose', + action='store_true', + default=False, + required=False) + + logger_level.add_argument('-q', + '--quiet', + help='Only print warnings', + action='store_true', + default=False, + required=False) + + # Check if any thing is given as input otherwise warn and print help + if len(args) < 1: + parser.print_help() + sys.exit(EXIT_COMMAND_LINE_ERROR) + elif '-help' in args: + parser.print_help() + sys.exit(0) + if '--check' in args: + sys.exit(1) #TODO write script that checks for dependencies! 
+ + args = parser.parse_args(args) + + return args diff --git a/Corekaburra/exit_with_error.py b/Corekaburra/exit_with_error.py new file mode 100644 index 0000000..fe1c46a --- /dev/null +++ b/Corekaburra/exit_with_error.py @@ -0,0 +1,30 @@ +import sys +import logging +import os + + +def exit_with_error(message, exit_status, tmp_folder=None): + """ + Print an error message to stderr, prefixed by the program name and 'ERROR'. + Then exit program with supplied exit status. + :param message: Message to give the user upon exit + :param exit_status: Status returned as exit status + :param tmp_folder: Temporary folder for Corekaburra to be deleted under some circumstances. + :return: None + """ + + # Delete tmp files and folder + try: + if tmp_folder is not None: + tmp_files = os.listdir(tmp_folder) + for file in tmp_files: + os.remove(os.path.join(tmp_folder, file)) + os.rmdir(tmp_folder) + else: + pass + except FileNotFoundError: + pass + + logging.error(message) + print(f"Corekaburra ERROR: {message}, exiting", file=sys.stderr) + sys.exit(exit_status) diff --git a/Dockerfile b/Dockerfile index 62c0789..7bb631e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM python:3.7.3-stretch +FROM python:3.9.7-buster WORKDIR /Corekaburra COPY . . diff --git a/functional_tests/Corekaburra-test.sh b/functional_tests/Corekaburra-test.sh index 1a89f38..234d1e8 100755 --- a/functional_tests/Corekaburra-test.sh +++ b/functional_tests/Corekaburra-test.sh @@ -137,23 +137,26 @@ function test_exit_status { fi } +function call_new_test { + echo '' + echo $1 +} + # 1. Parse command line arguments. parse_args $@ # 2. Change to test directory cd $test_data_dir # 2. 
Run tests -test_stdout_exit "$test_program one_sequence.fasta" one_sequence.fasta.expected 0 -test_stdout_exit "$test_program two_sequence.fasta" two_sequence.fasta.expected 0 -test_stdout_exit "$test_program --minlen 200 two_sequence.fasta" two_sequence.fasta.minlen_200.expected 0 -test_stdout_exit "$test_program --minlen 200 < two_sequence.fasta" two_sequence.fasta.minlen_200.stdin.expected 0 -test_stdout_exit "$test_program empty_file" empty_file.expected 0 -# Test when --minlen filters out ALL sequences (empty result) -test_stdout_exit "$test_program --minlen 1000 two_sequence.fasta" two_sequence.fasta.minlen_1000.expected 0 +# Test output for no arguments +call_new_test "Test output for no arguments" +test_stdout_exit "$test_program" no_input.expected 2 +# Test output for -help argument given +call_new_test "Test output for -help argument given" +test_stdout_exit "$test_program -help" no_input.expected 0 # Test exit status for a bad command line invocation +call_new_test "Test exit status for a bad command line invocation" test_exit_status "$test_program --this_is_not_a_valid_argument > /dev/null 2>&1" 2 -# Test exit status for a non existent input FASTA file -test_exit_status "$test_program this_file_does_not_exist.fasta > /dev/null 2>&1" 1 # 3. End of testing - check if any errors occurrred diff --git a/functional_tests/test_data/no_input.expected b/functional_tests/test_data/no_input.expected new file mode 100644 index 0000000..02b4b8c --- /dev/null +++ b/functional_tests/test_data/no_input.expected @@ -0,0 +1,22 @@ +usage: __main__.py [-h] -ig file_1.gff ... file_n.gff [file_1.gff ... file_n.gff ...] -ip path/to/pan_genome [-cg complete_genomes.txt] [-o path/to/output] [-p OUTPUT_PREFIX] [-a] [-c int] [-l | -q] + +Welcome to Corekaburra! Program to determine consensus core sequence from multiple genomes. 
Outputs consensus core gene alignment, distance between core genes, number of accessory genes between core genes and low frequency genes between core +genes + +optional arguments: + -h, --help show this help message and exit + -ig file_1.gff ... file_n.gff [file_1.gff ... file_n.gff ...], --input_gffs file_1.gff ... file_n.gff [file_1.gff ... file_n.gff ...] + Path to gff files used for pan-genome + -ip path/to/pan_genome, --input_pangenome path/to/pan_genome + Path to the folder produced by Panaroo or Roary + -cg complete_genomes.txt, --complete_genomes complete_genomes.txt + text file containing names of genomes that are to be handled as complete genomes + -o path/to/output, --output path/to/output + Path to where output files will be placed [default: current folder] + -p OUTPUT_PREFIX, --prefix OUTPUT_PREFIX + Prefix for output files, if any is desired + -a, --no_annotate_refound + Flag to toggle off the creation of new gff files, with annotation of refound genes. Only done if input pangenome is detected as comming from Panaroo + -c int, --cpu int Give max number of CPUs [default: 1] + -l, --log Record program progress in for debugging purpose + -q, --quiet Only print warnings diff --git a/unit_tests/Corekaburra_test.py b/unit_tests/Corekaburra_test.py new file mode 100644 index 0000000..aa52cd5 --- /dev/null +++ b/unit_tests/Corekaburra_test.py @@ -0,0 +1,47 @@ +''' +Unit tests for Corekaburra. + +Usage: python -m unittest -v Corekaburra_test +''' + +# import +import unittest +import os +import json +from shutil import copyfile +import logging +# pylint: disable=no-name-in-module + +# import Corekaburra functions +from Corekaburra import exit_with_error +from Corekaburra import commandline_interface + + +# move to folder with mock files. 
First try Github structure, then try pulled repository structure +try: + os.chdir('/Corekaburra/unit_tests/unit_test_data/') +except FileNotFoundError: + print(os.getcwd()) + os.chdir('unit_test_data/') + + +class TestExitWithError(unittest.TestCase): + def test_exit_w_tmp_folder_deletion(self): + ''' Test the exit function is able to remove the temporary folder ''' + + # copy the placeholder tmp folder to replace it afterwards + tmp_folder = 'TestExitWithError/tmp_folder' + tmp_folder_copy = 'TestExitWithError/tmp_folder_copy' + os.mkdir(tmp_folder_copy) + + tmp_files = os.listdir(tmp_folder) + for file in tmp_files: + copyfile(os.path.join(tmp_folder, file), os.path.join(tmp_folder_copy, file)) + + with self.assertRaises(SystemExit): + exit_with_error.exit_with_error(exit_status=2, message='test msg', tmp_folder=tmp_folder) + + os.rename(tmp_folder_copy, tmp_folder) + +if __name__ == '__main__': + unittest.main() diff --git a/unit_tests/unit_test_data/.DS_Store b/unit_tests/unit_test_data/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..d17d24c39e1918472d1b355041d48df7725ac1de GIT binary patch literal 6148 zcmeHK%}T>S5T30)M7&hIcoFs$`UbJ47q1mOR+|0^2`O5^W6nO1HxKIFr|=a#c=MZ` zp{684L=cggF!OD8_se#^4cQJ6xxvk}Pt+r#2+r8qMQAV{XRldDdw9@=XUr(2HO**B zq7)=w<%+_VCPkwqkzf z_&sJrew7WfrYT?wm;#Lo;LK)=_5>}O0;Yf|uu*`&4<61KE0%)c(}5650ALs0!O-Sf zf;nC>RxAZE0&_wI3e{> zvftSAvGb(Z9wOq+W49(+5mAFC$f8t4rmLnSb6x_u+&K(M6z3Eg><%FdAk>SRkyS zKn-OpF<8SfAIvWrW2L2fXyll7a1|Ma2>%r&AT^rD@&_u*9ivoc@djw!0=g4U>sy&Dfzi5~hC5z0L QaG*Z~3L)M(1HZt)JJ$I!9{>OV literal 0 HcmV?d00001 diff --git a/unit_tests/unit_test_data/TestExitWithError/tmp_folder/test_file b/unit_tests/unit_test_data/TestExitWithError/tmp_folder/test_file new file mode 100644 index 0000000..e69de29 From 4e67fdb6d8e774e80ed56f8303b809304f3162ca Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Wed, 29 Dec 2021 13:57:02 +1100 Subject: [PATCH 006/135] Remove nice exit status usage 
from commandline interface. Add in check for complete genomes provided and unit tests for these --- Corekaburra/__main__.py | 16 ++++++- Corekaburra/commandline_interface.py | 4 -- unit_tests/Corekaburra_test.py | 39 +++++++++++++++++- unit_tests/unit_test_data/.DS_Store | Bin 6148 -> 6148 bytes .../complete_genomes_file.txt | 4 ++ 5 files changed, 55 insertions(+), 8 deletions(-) create mode 100644 unit_tests/unit_test_data/TestParsingCompleteGenomes/complete_genomes_file.txt diff --git a/Corekaburra/__main__.py b/Corekaburra/__main__.py index 89d39a9..4fbe684 100644 --- a/Corekaburra/__main__.py +++ b/Corekaburra/__main__.py @@ -19,12 +19,17 @@ except ModuleNotFoundError: from commandline_interface import get_commandline_arguments +try: + from Corekaburra.read_complete_genome_file import parse_complete_genome_file +except ModuleNotFoundError: + from read_complete_genome_file import parse_complete_genome_file + from argparse import ArgumentParser from math import floor import sys import pkg_resources -EXIT_FILE_IO_ERROR = 1 +EXIT_INPUT_FILE_ERROR = 1 EXIT_COMMAND_LINE_ERROR = 2 EXIT_FASTA_FILE_ERROR = 3 DEFAULT_MIN_LEN = 0 @@ -68,7 +73,7 @@ def init_logging(debug_log, quiet, out_path): file_logger.addHandler(file_handler) # Log command-line argument and debug line for Corekaburra start - file_logger.info(f"command line: {' '.join(argv)}") + file_logger.info(f"command line: {' '.join(sys.argv)}") return file_logger @@ -100,6 +105,13 @@ def main(): # get arguments from the commandline args = get_commandline_arguments(sys.argv[1:]) + # TODO - Add in function(s) that will check all files to not be empty. - Andrew? + + if args.comp_genomes is not None: + comp_genomes = parse_complete_genome_file(args.comp_genomes, args.input_gffs) + else: + comp_genomes = None + # If this script is run from the command line then call the main function. 
if __name__ == '__main__': diff --git a/Corekaburra/commandline_interface.py b/Corekaburra/commandline_interface.py index 7ce4075..74f138d 100644 --- a/Corekaburra/commandline_interface.py +++ b/Corekaburra/commandline_interface.py @@ -1,9 +1,5 @@ import argparse import sys -try: - from Corekaburra.exit_with_error import exit_with_error -except ModuleNotFoundError: - from exit_with_error import exit_with_error EXIT_COMMAND_LINE_ERROR = 2 diff --git a/unit_tests/Corekaburra_test.py b/unit_tests/Corekaburra_test.py index aa52cd5..e9eae92 100644 --- a/unit_tests/Corekaburra_test.py +++ b/unit_tests/Corekaburra_test.py @@ -14,14 +14,14 @@ # import Corekaburra functions from Corekaburra import exit_with_error -from Corekaburra import commandline_interface +from Corekaburra import read_complete_genome_file + # move to folder with mock files. First try Github structure, then try pulled repository structure try: os.chdir('/Corekaburra/unit_tests/unit_test_data/') except FileNotFoundError: - print(os.getcwd()) os.chdir('unit_test_data/') @@ -43,5 +43,40 @@ def test_exit_w_tmp_folder_deletion(self): os.rename(tmp_folder_copy, tmp_folder) + +class TestParsingCompleteGenomes(unittest.TestCase): + def test_all_files_found(self): + gff_files = ['/path/to/complete_genome_1.gff', + '/path/complete_genome_2.gff.gz', + 'complete_genome_3.gff.gz', + 'complete_genome_4.gff', + 'dummy_index_1', + 'dummy_index_2'] + + complete_genome_file = 'TestParsingCompleteGenomes/complete_genomes_file.txt' + + expected_return = ['complete_genome_1', + 'complete_genome_2', + 'complete_genome_3', + 'complete_genome_4'] + + return_object = read_complete_genome_file.parse_complete_genome_file(complete_genome_file, gff_files) + + self.assertEqual(return_object, expected_return) + + def test_correct_one_files_not_found(self): + gff_files = ['/path/complete_genome_2.gff.gz', + 'complete_genome_3.gff.gz', + 'complete_genome_4.gff', + 'dummy_index_1', + 'dummy_index_2'] + + complete_genome_file = 
'TestParsingCompleteGenomes/complete_genomes_file.txt' + + with self.assertRaises(SystemExit): + read_complete_genome_file.parse_complete_genome_file(complete_genome_file, + gff_files) + + if __name__ == '__main__': unittest.main() diff --git a/unit_tests/unit_test_data/.DS_Store b/unit_tests/unit_test_data/.DS_Store index d17d24c39e1918472d1b355041d48df7725ac1de..879bb33168d979b7daa04763aeeac8c8ff81a80c 100644 GIT binary patch delta 348 zcmZoMXfc=|#>B!ku~2NHo+2a9#(>?7i$5?kG4gEYVYzA-7G~Vc&cV+C47<$_nZGkn<`;3~U}Rum0{MA! Igvc6Z09{#8A^-pY delta 67 zcmZoMXfc=|#>B)qu~2NHo+2a1#(>?7j2xSJSS~Yemf_%Lnb;t^nVo~51E^%PAjfy+ V$^0UY91K9f$iTp|IYML&GXOPK4-5bR diff --git a/unit_tests/unit_test_data/TestParsingCompleteGenomes/complete_genomes_file.txt b/unit_tests/unit_test_data/TestParsingCompleteGenomes/complete_genomes_file.txt new file mode 100644 index 0000000..13d5bce --- /dev/null +++ b/unit_tests/unit_test_data/TestParsingCompleteGenomes/complete_genomes_file.txt @@ -0,0 +1,4 @@ +complete_genome_1.gff +/test/path/complete_genome_2.gff +complete_genome_3.gff.gz +/test/path/complete_genome_4.gff.gz From d521b1701defd932cd1b14636050f3eff6822a5f Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Wed, 29 Dec 2021 13:58:19 +1100 Subject: [PATCH 007/135] Add in function to read and verify presence of genome genomes provided --- Corekaburra/read_complete_genome_file.py | 39 ++++++++++++++++++++++++ 1 file changed, 39 insertions(+) create mode 100644 Corekaburra/read_complete_genome_file.py diff --git a/Corekaburra/read_complete_genome_file.py b/Corekaburra/read_complete_genome_file.py new file mode 100644 index 0000000..08f0939 --- /dev/null +++ b/Corekaburra/read_complete_genome_file.py @@ -0,0 +1,39 @@ +import os + +try: + from Corekaburra.exit_with_error import exit_with_error +except ModuleNotFoundError: + from exit_with_error import exit_with_error +EXIT_INPUT_FILE_ERROR = 1 + + +def parse_complete_genome_file(complete_genome_file, gff_files): + 
""" + Function to check if all genomes given as complete genomes can be found in the pan genome. + :param complete_genome_file: + :param gff_files: + :return: a list of the base name of the complete genomes. + """ + + # Read the file and all lines (complete genomes given) + with open(complete_genome_file, 'r') as genome_file: + complete_genomes = genome_file.readlines() + complete_genomes = [name.strip().replace('.gz', '') for name in complete_genomes] + complete_genomes = [name.replace('.gff', '') for name in complete_genomes] + complete_genomes = [os.path.basename(name) for name in complete_genomes] + + # Take input gffs and remove path to the file + gffs = [os.path.basename(gff).replace('.gff', '').replace('.gz', '') for gff in gff_files] + + # check that all complete genomes are in the input gffs + complete_genome_status = all(complete_genome in gffs for complete_genome in complete_genomes) + + # If the complete genomes are found, return a list of complete genomes + if complete_genome_status: + return complete_genomes + else: + exit_with_error('Genome given in Complete genomes was not identified in pan-genome!', EXIT_INPUT_FILE_ERROR) + + +if __name__ == '__main__': + pass From fdf22d29caa582e70d7e384222533ab99aaa2fdb Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Wed, 29 Dec 2021 14:46:05 +1100 Subject: [PATCH 008/135] Add in check for the soruce program of the pangenoe provided --- Corekaburra/__main__.py | 23 +- .../check_inputs.py | 45 +- unit_tests/Corekaburra_test.py | 32 + .../TestPangenomeSourceProgram/.DS_Store | Bin 0 -> 8196 bytes .../Mock_panaroo/gene_presence_absence.csv | 1775 +++++++++++++++++ .../gene_presence_absence_roary.csv | 1775 +++++++++++++++++ .../Mock_roary/gene_presence_absence.csv | 1768 ++++++++++++++++ .../Mock_unknwon/place_holder_file | 0 8 files changed, 5388 insertions(+), 30 deletions(-) rename {Code_to_transfer => Corekaburra}/check_inputs.py (64%) create mode 100644 
unit_tests/unit_test_data/TestPangenomeSourceProgram/.DS_Store create mode 100644 unit_tests/unit_test_data/TestPangenomeSourceProgram/Mock_panaroo/gene_presence_absence.csv create mode 100644 unit_tests/unit_test_data/TestPangenomeSourceProgram/Mock_panaroo/gene_presence_absence_roary.csv create mode 100644 unit_tests/unit_test_data/TestPangenomeSourceProgram/Mock_roary/gene_presence_absence.csv create mode 100644 unit_tests/unit_test_data/TestPangenomeSourceProgram/Mock_unknwon/place_holder_file diff --git a/Corekaburra/__main__.py b/Corekaburra/__main__.py index 4fbe684..e30005d 100644 --- a/Corekaburra/__main__.py +++ b/Corekaburra/__main__.py @@ -24,6 +24,11 @@ except ModuleNotFoundError: from read_complete_genome_file import parse_complete_genome_file +try: + from Corekaburra.check_inputs import define_pangenome_program +except ModuleNotFoundError: + from check_inputs import define_pangenome_program + from argparse import ArgumentParser from math import floor import sys @@ -98,7 +103,6 @@ def main(): """ This is the main function for running Corekaburra. Requires a pan-genome folder from an allowed program, along with GFF3 files used for producing the pan-genome - :return: """ total_time_start = time.time() @@ -107,11 +111,28 @@ def main(): # TODO - Add in function(s) that will check all files to not be empty. - Andrew? 
+ # Check the presence of provided complete genomes among input GFFs if args.comp_genomes is not None: comp_genomes = parse_complete_genome_file(args.comp_genomes, args.input_gffs) else: comp_genomes = None + # Check source program from pan-genome and presence of nessecary files + if not args.quiet: + print("\n----Checking presence of input files in pan genome folder----\n") + + # Check if Panaroo or Roary input folder is given + source_program, input_pres_abs_file_path = define_pangenome_program(args.input_pan) + + # Check if gene_data file is present if Panaroo input is given an gffs should be annotated + if args.annotate and source_program is not 'Rorary': + gene_data_path = check_gene_data(args.input_pan) + if not args.quiet: + print(f"Pan genome determined to come from {source_program}") + print("All files found, let's move on!\n") + print("--------------------------------------------------------------\n") + + # If this script is run from the command line then call the main function. if __name__ == '__main__': diff --git a/Code_to_transfer/check_inputs.py b/Corekaburra/check_inputs.py similarity index 64% rename from Code_to_transfer/check_inputs.py rename to Corekaburra/check_inputs.py index 645000e..272edca 100644 --- a/Code_to_transfer/check_inputs.py +++ b/Corekaburra/check_inputs.py @@ -1,6 +1,12 @@ import os import warnings +try: + from Corekaburra.exit_with_error import exit_with_error +except ModuleNotFoundError: + from exit_with_error import exit_with_error +EXIT_INPUT_FILE_ERROR = 1 + def check_gff_files(file_list): for file in file_list: @@ -32,9 +38,13 @@ def check_gff_in_pan(file_list, gene_presence_absence_path): raise FileNotFoundError('Unexpected occurrence in the matching of input GFF files and the pan genome presence/absence file') -def define_input_source(folder): - """ Function to examine if input pan genome folder stems from Roary or Panaroo. 
- Returns the program that is the source and the path to the right gene presence/absence file """ +def define_pangenome_program(folder): + """ + Function to examine if input pan genome folder stems from Roary or Panaroo. + :param folder: Input folder provided as pan-genome folder. + :return: The name of the program from which the pangenome is suspected to come from + """ + try: if os.path.isfile(os.path.join(folder, 'gene_presence_absence.csv')): # See if input is from Roary @@ -47,9 +57,11 @@ def define_input_source(folder): gene_pres_abs_file_path = os.path.join(folder, 'gene_presence_absence_roary.csv') if os.path.isfile(gene_pres_abs_file_path): return "Panaroo", gene_pres_abs_file_path + else: + exit_with_error('No gene presence/absence file was found in given pan-genome folder', EXIT_INPUT_FILE_ERROR) except FileNotFoundError: - raise FileNotFoundError('No gene presence absence file was found in given pan genome folder') + exit_with_error('No gene presence/absence file was found in given pan-genome folder', EXIT_INPUT_FILE_ERROR) def check_gene_data(folder): @@ -59,28 +71,3 @@ def check_gene_data(folder): else: raise FileNotFoundError('gene_data.csv file could not be located in the given pan genome input folder.\n' 'Please give the -a flag to omit this step or locate the gene_data.csv file.') - - -def check_gene_alignments(folder, core_gene_dict): - """ Check if the folder containing alignments for genes is available in the Panaroo folder """ - alignment_folder = os.path.join(folder, 'aligned_gene_sequences') - if os.path.isdir(alignment_folder): - - # Get a list of unique core genes - genome_dicts = [genome for genome in core_gene_dict.values()] - core_genes = [genome.values() for genome in genome_dicts] - core_genes = set([core_gene for genome in core_genes for core_gene in genome]) - - # Check that all core genes has an alignment - alignments_missing = [core_gene for core_gene in core_genes - if not os.path.isfile(os.path.join(alignment_folder, 
f'{core_gene}.aln.fas'))] - - if len(alignments_missing) == 0: - return alignment_folder - else: - warnings.warn(f'Not all core genes have alignments. ' - f'Genes missing alignments: {alignments_missing}') - return False - - else: - return False diff --git a/unit_tests/Corekaburra_test.py b/unit_tests/Corekaburra_test.py index e9eae92..194e604 100644 --- a/unit_tests/Corekaburra_test.py +++ b/unit_tests/Corekaburra_test.py @@ -15,6 +15,7 @@ # import Corekaburra functions from Corekaburra import exit_with_error from Corekaburra import read_complete_genome_file +from Corekaburra import check_inputs @@ -78,5 +79,36 @@ def test_correct_one_files_not_found(self): gff_files) +class TestPangenomeSourceProgram(unittest.TestCase): + def test_roary_input(self): + input_folder_path = 'TestPangenomeSourceProgram/Mock_roary' + + return_program, return_path = check_inputs.define_pangenome_program(input_folder_path) + + self.assertEqual("Roary", return_program) + self.assertEqual(input_folder_path + '/gene_presence_absence.csv', return_path) + + def test_panaroo_input(self): + input_folder_path = 'TestPangenomeSourceProgram/Mock_panaroo' + + return_program, return_path = check_inputs.define_pangenome_program(input_folder_path) + + self.assertEqual("Panaroo", return_program) + self.assertEqual(input_folder_path + '/gene_presence_absence_roary.csv', return_path) + + # def test_pirate_input(self): TODO - Make Corekaburra take Pirate input! 
+ # pass + # input_folder_path = 'TestPangenomeSourceProgram/Mock_pirate' + # + # return_program, return_path = check_inputs.define_pangenome_program(input_folder_path) + # + # self.assertEqual("Pirate", return_program) + + def test_unknown_input(self): + input_folder_path = 'TestPangenomeSourceProgram/Mock_unknwon' + + with self.assertRaises(SystemExit): + check_inputs.define_pangenome_program(input_folder_path) + if __name__ == '__main__': unittest.main() diff --git a/unit_tests/unit_test_data/TestPangenomeSourceProgram/.DS_Store b/unit_tests/unit_test_data/TestPangenomeSourceProgram/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..69ff4c2450e0078397ed92420f5908fc7a8d045e GIT binary patch literal 8196 zcmeI1ze@u#6vtn*Xt5504h0czaC4A~lRBK`90jq9oAwt~Y`s!-=yr>%e~*H8@sD!# z`;xSG<#HBVDS|JMeD8AaC9j``OJ7Pvs{Gg}5fzCjkIK4MK;tOJ_bJDz)N4)v`@MvXfu<4(%XtZas&^z88S zv^go?sI)>r2&4&c?!HQ=6jGgT$MW}(!gi}vsfV3z!Y$Lb69z%89oAr*e+{Sz?Pcy@4 z$qnaw`hI4^Jb z(?RxseBDxnz#IuInz@Rc|DR-^|IaZZnMw! 
Date: Wed, 29 Dec 2021 15:03:52 +1100 Subject: [PATCH 009/135] Add in check for the presence of the gene_data.csv file, if pan-genome is from Panaroo --- Corekaburra/check_inputs.py | 67 +++++++++--------- unit_tests/Corekaburra_test.py | 15 ++++ .../TestPresenceOfGenedataFile/.DS_Store | Bin 0 -> 6148 bytes .../absent/place_holder_file | 0 .../present/gene_data.csv | 1 + 5 files changed, 48 insertions(+), 35 deletions(-) create mode 100644 unit_tests/unit_test_data/TestPresenceOfGenedataFile/.DS_Store create mode 100644 unit_tests/unit_test_data/TestPresenceOfGenedataFile/absent/place_holder_file create mode 100644 unit_tests/unit_test_data/TestPresenceOfGenedataFile/present/gene_data.csv diff --git a/Corekaburra/check_inputs.py b/Corekaburra/check_inputs.py index 272edca..f6ac739 100644 --- a/Corekaburra/check_inputs.py +++ b/Corekaburra/check_inputs.py @@ -7,41 +7,10 @@ from exit_with_error import exit_with_error EXIT_INPUT_FILE_ERROR = 1 - -def check_gff_files(file_list): - for file in file_list: - if not os.path.isfile(file): - raise FileNotFoundError(f'{file} can not be found!') - return True - - -def check_gff_in_pan(file_list, gene_presence_absence_path): - with open(gene_presence_absence_path, 'r') as pan_file: - # Read the first line of the gene_presence_absence and extract the genome names - pan_header_line = pan_file.readline() - pan_header_line = pan_header_line.strip().split(',') - genome_names = pan_header_line[14:] - - - file_list = [os.path.basename(file) for file in file_list] - file_list_no_suffix = [file.rstrip('.gff') for file in file_list] - - # Check if all or subset of GFFs from pan genome have been supplied, - # if only a subset then raise warning - if set(file_list).issubset(genome_names) or set(file_list_no_suffix).issubset(genome_names): - if len(file_list) < len(genome_names): - warnings.warn( - "Not all gff in pan genome given as input. 
I will run with it but are you sure this is deliberate?") - - return True # True used for unit testing - - raise FileNotFoundError('Unexpected occurrence in the matching of input GFF files and the pan genome presence/absence file') - - def define_pangenome_program(folder): """ Function to examine if input pan genome folder stems from Roary or Panaroo. - :param folder: Input folder provided as pan-genome folder. + :param folder: Input folder provided as pan-genome folder :return: The name of the program from which the pangenome is suspected to come from """ @@ -65,9 +34,37 @@ def define_pangenome_program(folder): def check_gene_data(folder): - """ Check if the gene_data.csv file is present in the folder from a Panaroo pan genome run. """ + """ + Check if the gene_data.csv file is present in the folder from a Panaroo pan-genome run + :param folder: Input folder provided as pan-genome folder + :return: Path to the identified gene_data.csv file + """ + if os.path.isfile(os.path.join(folder, 'gene_data.csv')): return os.path.join(folder, 'gene_data.csv') else: - raise FileNotFoundError('gene_data.csv file could not be located in the given pan genome input folder.\n' - 'Please give the -a flag to omit this step or locate the gene_data.csv file.') + exit_with_error('gene_data.csv file could not be located in the given pan genome input folder.\n' + 'Please give the -a flag to omit this step or locate the gene_data.csv file.', + EXIT_INPUT_FILE_ERROR) + + +def check_gff_in_pan(file_list, gene_presence_absence_path): + with open(gene_presence_absence_path, 'r') as pan_file: + # Read the first line of the gene_presence_absence and extract the genome names + pan_header_line = pan_file.readline() + pan_header_line = pan_header_line.strip().split(',') + genome_names = pan_header_line[14:] + + file_list = [os.path.basename(file) for file in file_list] + file_list_no_suffix = [file.rstrip('.gff') for file in file_list] + + # Check if all or subset of GFFs from pan genome have been 
supplied, + # if only a subset then raise warning + if set(file_list).issubset(genome_names) or set(file_list_no_suffix).issubset(genome_names): + if len(file_list) < len(genome_names): + warnings.warn( + "Not all gff in pan genome given as input. I will run with it but are you sure this is deliberate?") + + return True # True used for unit testing + + raise FileNotFoundError('Unexpected occurrence in the matching of input GFF files and the pan genome presence/absence file') diff --git a/unit_tests/Corekaburra_test.py b/unit_tests/Corekaburra_test.py index 194e604..7135f6c 100644 --- a/unit_tests/Corekaburra_test.py +++ b/unit_tests/Corekaburra_test.py @@ -110,5 +110,20 @@ def test_unknown_input(self): with self.assertRaises(SystemExit): check_inputs.define_pangenome_program(input_folder_path) + +class TestPresenceOfGenedataFile(unittest.TestCase): + def test_Genedata_File_present(self): + input_folder_path = 'TestPresenceOfGenedataFile/present' + return_path = check_inputs.check_gene_data(input_folder_path) + + self.assertEqual(return_path, input_folder_path +'/gene_data.csv') + + def test_Genedata_File_absent(self): + input_folder_path = 'TestPresenceOfGenedataFile/absent' + + with self.assertRaises(SystemExit): + check_inputs.check_gene_data(input_folder_path) + + if __name__ == '__main__': unittest.main() diff --git a/unit_tests/unit_test_data/TestPresenceOfGenedataFile/.DS_Store b/unit_tests/unit_test_data/TestPresenceOfGenedataFile/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..4d1d3738e86b331026c69b1a36816d3145aed360 GIT binary patch literal 6148 zcmeHKu};H447E!Ifi4{x?-%+9p$aoYXQUE}=+KY|%8K$U{0Iv_#`D=Kq?d>Vp{lYa z-@Ewii}NmuVuN?cC87)}I6BABBeE`<$jB_R$Z?M++HM|ai~G825^pygA_H=E zr*uapz0d>opI>Las*9rB)FtAwxAo2P;^Xsbe~+qO{p!=KllSC|8hS<+!60h%UhP`_7ImGkwoSXq?z!~@p22is_vWcRP&VV!E4D1+???ZqJ=7z0e{B&T5 zEdX!~a}>;_mynoXm>af=Sb?yH0yUJa#b6DGJ(yo^*eYr`u{9rTcV_EQINlxmhv-h6 zEBfdRI0IbSA_(%^$L@UIMf0g^#VSpWb4 literal 0 HcmV?d00001 diff --git 
a/unit_tests/unit_test_data/TestPresenceOfGenedataFile/absent/place_holder_file b/unit_tests/unit_test_data/TestPresenceOfGenedataFile/absent/place_holder_file new file mode 100644 index 0000000..e69de29 diff --git a/unit_tests/unit_test_data/TestPresenceOfGenedataFile/present/gene_data.csv b/unit_tests/unit_test_data/TestPresenceOfGenedataFile/present/gene_data.csv new file mode 100644 index 0000000..d5e0df0 --- /dev/null +++ b/unit_tests/unit_test_data/TestPresenceOfGenedataFile/present/gene_data.csv @@ -0,0 +1 @@ +test,file From 06ed59d8c8e5b9267dcbb6b67969a91d62071a25 Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Wed, 29 Dec 2021 15:25:51 +1100 Subject: [PATCH 010/135] Add in test for presence of gffs in pangenome with unit tests --- Corekaburra/__main__.py | 18 ++++++++-- Corekaburra/check_inputs.py | 5 ++- unit_tests/Corekaburra_test.py | 35 +++++++++++++++++++ .../gene_presence_absence_roary.csv | 1 + 4 files changed, 55 insertions(+), 4 deletions(-) create mode 100644 unit_tests/unit_test_data/TestPresenceOfGffsInPresAbsFile/gene_presence_absence_roary.csv diff --git a/Corekaburra/__main__.py b/Corekaburra/__main__.py index e30005d..3ad380b 100644 --- a/Corekaburra/__main__.py +++ b/Corekaburra/__main__.py @@ -25,9 +25,9 @@ from read_complete_genome_file import parse_complete_genome_file try: - from Corekaburra.check_inputs import define_pangenome_program + from Corekaburra.check_inputs import define_pangenome_program, check_gene_data except ModuleNotFoundError: - from check_inputs import define_pangenome_program + from check_inputs import define_pangenome_program, check_gene_data from argparse import ArgumentParser from math import floor @@ -125,13 +125,25 @@ def main(): source_program, input_pres_abs_file_path = define_pangenome_program(args.input_pan) # Check if gene_data file is present if Panaroo input is given an gffs should be annotated - if args.annotate and source_program is not 'Rorary': + if args.annotate 
and source_program is 'Panaroo': gene_data_path = check_gene_data(args.input_pan) if not args.quiet: print(f"Pan genome determined to come from {source_program}") print("All files found, let's move on!\n") print("--------------------------------------------------------------\n") + # TODO - Make the program work with less than all files in the pangenome. Just make sure that all gff files supplied can be found in the pan genome. This will make is possible to look at hotspots and segments in different lineages + check_gff_in_pan(args.input_gffs, input_pres_abs_file_path) + + + # Construct output folder + try: + mkdir(args.output_path) + if not args.quiet: + print("Output folder constructed") + except FileExistsError: + if not args.quiet: + print("Output folder exists") # If this script is run from the command line then call the main function. diff --git a/Corekaburra/check_inputs.py b/Corekaburra/check_inputs.py index f6ac739..198238e 100644 --- a/Corekaburra/check_inputs.py +++ b/Corekaburra/check_inputs.py @@ -64,7 +64,10 @@ def check_gff_in_pan(file_list, gene_presence_absence_path): if len(file_list) < len(genome_names): warnings.warn( "Not all gff in pan genome given as input. I will run with it but are you sure this is deliberate?") + # TODO - LOG above! 
return True # True used for unit testing - raise FileNotFoundError('Unexpected occurrence in the matching of input GFF files and the pan genome presence/absence file') + # Exit with error is not all inputs can be found in the pan-genome presence absence file + exit_with_error('Unexpected occurrence in the matching of input GFF files and the pan genome presence/absence file', + EXIT_INPUT_FILE_ERROR) diff --git a/unit_tests/Corekaburra_test.py b/unit_tests/Corekaburra_test.py index 7135f6c..50cdf6a 100644 --- a/unit_tests/Corekaburra_test.py +++ b/unit_tests/Corekaburra_test.py @@ -125,5 +125,40 @@ def test_Genedata_File_absent(self): check_inputs.check_gene_data(input_folder_path) +class TestPresenceOfGffsInPresAbsFile(unittest.TestCase): + # Test pairing of all files in pan genome + def test_input_gff_pres_abs_pairing_all_gffs(self): + input_pres_abs = 'TestPresenceOfGffsInPresAbsFile/gene_presence_absence_roary.csv' + input_file_list = ['Silas_the_Salmonella', 'Christina_the_Streptococcus', 'Ajwa_the_Shigella'] + + return_bool = check_inputs.check_gff_in_pan(input_file_list, input_pres_abs) + + self.assertEqual(return_bool, True) + + # Test pairing of some files in pan genome - Warning + def test_input_gff_pres_abs_pairing_some(self): + input_pres_abs = 'TestPresenceOfGffsInPresAbsFile/gene_presence_absence_roary.csv' + input_file_list = ['Silas_the_Salmonella.gff', 'Christina_the_Streptococcus.gff'] + + with self.assertWarns(Warning): + return_bool = check_inputs.check_gff_in_pan(input_file_list, input_pres_abs) + + self.assertEqual(return_bool, True) + + # Test when given a file not in pan genome among others that are in the pan genome + def test_input_gff_pres_abs_file_not_in_pan(self): + input_pres_abs = 'TestPresenceOfGffsInPresAbsFile/gene_presence_absence_roary.csv' + input_file_list = ['not_found.gff', 'Silas_the_Salmonella.gff', 'Christina_the_Streptococcus.gff'] + + with self.assertRaises(SystemExit): + check_inputs.check_gff_in_pan(input_file_list, 
input_pres_abs) + + def test_input_gff_pres_abs_some_file_not_in_pan(self): + input_pres_abs = 'TestPresenceOfGffsInPresAbsFile/gene_presence_absence_roary.csv' + input_file_list = ['not_found.gff', 'also_not_found.gff', 'definitely_not_found.gff'] + + with self.assertRaises(SystemExit): + check_inputs.check_gff_in_pan(input_file_list, input_pres_abs) + if __name__ == '__main__': unittest.main() diff --git a/unit_tests/unit_test_data/TestPresenceOfGffsInPresAbsFile/gene_presence_absence_roary.csv b/unit_tests/unit_test_data/TestPresenceOfGffsInPresAbsFile/gene_presence_absence_roary.csv new file mode 100644 index 0000000..b58a3ef --- /dev/null +++ b/unit_tests/unit_test_data/TestPresenceOfGffsInPresAbsFile/gene_presence_absence_roary.csv @@ -0,0 +1 @@ +Gene,Non.unique.Gene.name,Annotation,No..isolates,No..sequences,Avg.sequences.per.isolate,Genome.Fragment,Order.within.Fragment,Accessory.Fragment,Accessory.Order.with.Fragment,QC,Min.group.size.nuc,Max.group.size.nuc,Avg.group.size.nuc,Silas_the_Salmonella,Christina_the_Streptococcus,Ajwa_the_Shigella \ No newline at end of file From 9b5bed8122ddeda56609ce31631514b83fb7b8ec Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Thu, 30 Dec 2021 12:01:48 +1100 Subject: [PATCH 011/135] Add in the passing of the presence/absence matrix from a pan-genome. 
Add test for functions related to this procedure --- .../parse_gene_presence_absence.py | 218 -------- Corekaburra/__main__.py | 23 +- Corekaburra/parse_gene_presence_absence.py | 224 ++++++++ unit_tests/Corekaburra_test.py | 481 ++++++++++++++++++ .../Ajwa_the_Legionella.gff | 8 + .../Ajwa_the_Shigella.gff | 8 + .../Aman_the_Streptococcus.gff | 8 + .../Cari_the_Listeria.gff | 8 + .../Christina_the_Streptococcus.gff | 8 + .../Dina_the_Shigella.gff | 8 + .../Lilly_the_Shigella.gff | 7 + .../Silas_the_Legionella.gff | 8 + .../Silas_the_Salmonella.gff | 12 + .../Zion_the_Streptococcus.gff | 8 + .../gene_presence_absence.csv | 8 + .../gene_presence_absence_roary.csv | 8 + .../test_tmp_folder/tmp_file_in_tmp_folder | 0 17 files changed, 824 insertions(+), 221 deletions(-) delete mode 100644 Code_to_transfer/parse_gene_presence_absence.py create mode 100644 Corekaburra/parse_gene_presence_absence.py create mode 100644 unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/Ajwa_the_Legionella.gff create mode 100644 unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/Ajwa_the_Shigella.gff create mode 100644 unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/Aman_the_Streptococcus.gff create mode 100644 unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/Cari_the_Listeria.gff create mode 100644 unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/Christina_the_Streptococcus.gff create mode 100644 unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/Dina_the_Shigella.gff create mode 100644 unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/Lilly_the_Shigella.gff create mode 100644 unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/Silas_the_Legionella.gff create mode 100644 unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/Silas_the_Salmonella.gff create mode 100644 unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/Zion_the_Streptococcus.gff create mode 100644 
unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/gene_presence_absence.csv create mode 100644 unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/gene_presence_absence_roary.csv create mode 100644 unit_tests/unit_test_data/test_tmp_folder/tmp_file_in_tmp_folder diff --git a/Code_to_transfer/parse_gene_presence_absence.py b/Code_to_transfer/parse_gene_presence_absence.py deleted file mode 100644 index 332c971..0000000 --- a/Code_to_transfer/parse_gene_presence_absence.py +++ /dev/null @@ -1,218 +0,0 @@ -import os -import csv -from math import ceil, floor -import gffutils - - -def check_fragmented_gene(fragments_in_line, input_gffs, temp_folder_path): - return_list = [] - for fragment in fragments_in_line: - # TODO - write unit test for function. - # split the two fragments - fragments = fragment.split(';') - - # Get the name of the genome - genome = fragments[0].rsplit("_", 1)[0] - - # Get the gff and its path - try: - gff_file = [file for file in input_gffs if f'{genome}.gff' in file][0] - except IndexError: - raise NotImplementedError(f'No gff match was found when searching fragments for genome: {genome}') - - # Construct gff database to be searched - db_name = os.path.join(temp_folder_path, f'{genome}_db') - if not os.path.isfile(db_name): - gffutils.create_db(gff_file, db_name) - - # Attach database - gff_database = gffutils.FeatureDB(db_name) - - # Check that all fragments are on the same contig. 
- first_fragment_contig = gff_database[fragments[0]][0] - frag_same_contig = all([first_fragment_contig == gff_database[fragment][0] for fragment in fragments]) - if frag_same_contig: - # TODO - Get the coordinate of the fragments - # Get all coordinates - frag_coors = [] - for frag in fragments: - frag_coors.append(gff_database[frag][3]) - frag_coors.append(gff_database[frag][4]) - - # Construct region to be searched for annotations between fragments: - max_frag_coor = max(frag_coors) - min_frag_coor = min(frag_coors) - region = (first_fragment_contig, min_frag_coor, max_frag_coor) - - # Find all features that are completly within the region - region_features = gff_database.region(region=region, completely_within=True) - # print(list(region_features)) - - # find all genes that are not part of the fragmented gene - region_locus_tags = set([feature[8]['locus_tag'][0] for feature in region_features]) - # print(region_locus_tags) - # print(fragments) - excess_genes = region_locus_tags.difference(fragments) - - - # check the number of excess genes, if any then False to being core - if len(excess_genes) > 0: - return_list.append(False) - else: - return_list.append(True) - - return return_list - - # TODO - Find out how the gff parser handles this? Does there need to be a check if a gene cluster is being paired to it self and if then drop it and change the end coordinates. - - -def read_gene_presence_absence(file_name, core_gene_presence, low_freq_gene, source_program, input_gffs, temp_folder_path, verbose=True): - """Function that pass a Roary style gene presence/absence file. - Returns directories of core and low frequency genes, and a directory of pan genome clusters and their annotation""" - - file = os.path.join("", file_name) - - # Check if file exists, if the read otherwise raise error. - if os.path.isfile(file): # TODO - THIS CHECK IS IRRELEVANT WITH THE CHECK OF THE SOURCE PROGRAM! 
- with open(file, 'r', newline='', ) as gene_presence_absence: - # Read column header line - gff_file_names = gene_presence_absence.readline() - # Strip for whitespace - gff_file_names = gff_file_names.strip() - # split column names - gff_file_names = gff_file_names.split(',') - - # Remove the quotes from Rorary input - if source_program == 'Roary': - gff_file_names = [filename.replace('"', '') for filename in gff_file_names] - - # Index gff filenames and column position in dict for better search - gff_file_dict = {} - for i, gff_name in enumerate(gff_file_names[14:]): - gff_file_dict[gff_name] = i - - # Read remaining lines and construct a nested dicts one dict for each genome and its core genes, - # and a dict for low frequency genes found in less than set percent of isolates - - # Initialise reader object to read remaining lines - reader = csv.reader(gene_presence_absence, delimiter=',') - # Counters - core_gene_number = 0 - low_freq_gene_number = 0 - acc_gene_number = 0 - - # Determine number of isolates that represent core and low frequency genes - core_gene_isolate_presence = floor(len(gff_file_dict.keys()) * core_gene_presence) - low_freq_gene_isolate_presence = ceil(len(gff_file_dict.keys()) * low_freq_gene) - - if verbose: - print(f"\n------------Opening the gene presence/absence file------------\n") - print(f"Core genes must be found in {core_gene_isolate_presence} or more isolates") - print(f"Low frequency genes must be found in {low_freq_gene_isolate_presence} or fewer isolates\n") - - # initialise dict of dicts to hold genes from each gffs and to be returned - core_gene_dict = {item: {} for item in gff_file_names[14:]} - low_freq_gene_dict = {item: {} for item in gff_file_names[14:]} - acc_gene_dict = {item: {} for item in gff_file_names[14:]} - - # Initialise dict that contain annotations - annotation_dict = {} - - # Read lines from file and determine if core, low frequency or 'regular' accessory and record annotations - for line in reader: - # Remove 
quotes if Roary - if source_program == 'Roary': - line = [element.replace('"', '') for element in line] - - # Record annotations of refound genes - if any(['refound' in gene for gene in line[14:]]): - refound_genes = [gene for gene in line[14:] if 'refound' in gene] - for gene in refound_genes: - annotation_dict[gene] = line[2] - - # Get number of genes in line and average presence of genes in genomes - gene_isolate_presence = int(line[3]) - avg_gene_presence = int(line[4]) - - # Check if core gene, if then add annotations to genomes - # TODO - Handle genes that have a paralog and are concatenated by ';', and check if neighbours - # Check if gene is present in all genomes and no one gene is fragmented - if core_gene_isolate_presence <= gene_isolate_presence == avg_gene_presence: - # Add gene cluster to genomes - for genome in core_gene_dict.keys(): - # Check if there is an annotation for the given genome - if len(line[14 + gff_file_dict[genome]]) > 0: - core_gene_dict[genome][line[14+gff_file_dict[genome]]] = line[0] - core_gene_number += 1 - - # Check if gene is present in all genomes, but more than one copy is pressent - elif core_gene_isolate_presence <= gene_isolate_presence: - # Identify annotations for genomes that are fragmented genes - fragments_in_line = [genes for genes in line[14:] if ';' in genes] - - # Check that each annotation is neighboring the other annotation. 
- return_list = check_fragmented_gene(fragments_in_line, input_gffs, temp_folder_path) - - # Check if gene was found to be a core gene - if all(return_list): - # Add the gene to the annotation dict - for genome in core_gene_dict.keys(): - # Get the annoations for a specific genome - genes_in_genome = line[14 + gff_file_dict[genome]] - # If there is an annotation add id - if len(genes_in_genome) > 0: - # Check if genome has fragments of genes, - # if then add them all to the annotation dict, - # if not then just ad the single annotation - if ';' in genes_in_genome: - for gene in genes_in_genome.split(';'): - core_gene_dict[genome][gene] = line[0] - else: - core_gene_dict[genome][genes_in_genome] = line[0] - core_gene_number += 1 - - else: - # Check if low frequency, if then add else then add as normal accessory - if low_freq_gene_isolate_presence >= gene_isolate_presence == avg_gene_presence: # TODO - review this == statement, should it be there? - for genome in low_freq_gene_dict.keys(): - if len(line[14 + gff_file_dict[genome]]) > 0: - low_freq_gene_dict[genome][line[14 + gff_file_dict[genome]]] = line[0] - low_freq_gene_number += 1 - else: - for genome in acc_gene_dict.keys(): - if len(line[14 + gff_file_dict[genome]]) > 0: - acc_gene_dict[genome][line[14 + gff_file_dict[genome]]] = line[0] - acc_gene_number += 1 - - # Check if accessory if then add annotation to genomes - elif low_freq_gene_isolate_presence >= gene_isolate_presence == avg_gene_presence: # TODO - review this == statement, should it be there? 
- for genome in low_freq_gene_dict.keys(): - if len(line[14+gff_file_dict[genome]]) > 0: - low_freq_gene_dict[genome][line[14+gff_file_dict[genome]]] = line[0] - low_freq_gene_number += 1 - - # If not core or low frequency count as regular accessory - else: - for genome in acc_gene_dict.keys(): - if len(line[14+gff_file_dict[genome]]) > 0: - acc_gene_dict[genome][line[14+gff_file_dict[genome]]] = line[0] - acc_gene_number += 1 - - if verbose: - print("A total of:") - print(f"{core_gene_number} core gene clusters were identified") - print(f"{low_freq_gene_number} low frequency gene clusters were identified") - print(f"{acc_gene_number} intermediate accessory gene clusters were identified\n") - else: - raise FileNotFoundError('Given gene presence absence file not found. Please check and try again.') - - # Remove gff databases - files_in_tmp = os.listdir(temp_folder_path) - gff_dbs = [file for file in files_in_tmp if '_db' in file] - [os.remove(os.path.join(temp_folder_path, db)) for db in gff_dbs] - - return core_gene_dict, low_freq_gene_dict, acc_gene_dict, annotation_dict - - -if __name__ == '__main__': - core_dict, *_ = read_gene_presence_absence('/Users/mjespersen/Downloads/gene_presence_absence.csv', 1, 0.05, 'Roary') diff --git a/Corekaburra/__main__.py b/Corekaburra/__main__.py index 3ad380b..8df32b6 100644 --- a/Corekaburra/__main__.py +++ b/Corekaburra/__main__.py @@ -25,9 +25,14 @@ from read_complete_genome_file import parse_complete_genome_file try: - from Corekaburra.check_inputs import define_pangenome_program, check_gene_data + from Corekaburra.check_inputs import define_pangenome_program, check_gene_data, check_gff_in_pan except ModuleNotFoundError: - from check_inputs import define_pangenome_program, check_gene_data + from check_inputs import define_pangenome_program, check_gene_data, check_gff_in_pan + +try: + from Corekaburra.parse_gene_presence_absence import read_gene_presence_absence +except ModuleNotFoundError: + from parse_gene_presence_absence 
import read_gene_presence_absence from argparse import ArgumentParser from math import floor @@ -138,13 +143,25 @@ def main(): # Construct output folder try: - mkdir(args.output_path) + os.mkdir(args.output_path) if not args.quiet: print("Output folder constructed") except FileExistsError: if not args.quiet: print("Output folder exists") + # Construct temporary folder: + # TODO - check that the temporary folder does not exist and that the user does not have a folder with same name already. (Maybe use a time stamp for the start to make it unique.) + tmp_folder_path = os.path.join(args.output_path, 'Corekaburra_tmp') + os.mkdir(tmp_folder_path) + + ## Read in gene presence absence file + time_start = time.time() + # TODO - Add the user specified thresholds for core and low frequency genes. + core_dict, low_freq_dict, acc_gene_dict, attribute_dict = read_gene_presence_absence(input_pres_abs_file_path, + 1, 0.05, source_program, + args.input_gffs, + tmp_folder_path) # If this script is run from the command line then call the main function. if __name__ == '__main__': diff --git a/Corekaburra/parse_gene_presence_absence.py b/Corekaburra/parse_gene_presence_absence.py new file mode 100644 index 0000000..dab5ea0 --- /dev/null +++ b/Corekaburra/parse_gene_presence_absence.py @@ -0,0 +1,224 @@ +import os +import csv +from math import ceil, floor +import gffutils + + +def add_gene_to_dict(main_dict, gene, pan_gene_name, genome): + """ + Function to add a gene to a given dictionary + :param main_dict: Dict of genes from genomes. A dict of dicts, with first set of keys being genomes, second is locus_tags with pan-genome gene being the key. + :param gene: The gene in question from a specific genome (locus_tag) + :param pan_gene_name: The name of the pan-genome gene (cluster) to which the above gene belongs. 
+ :param genome: The name of the genome in question + :return: returns the dict to be used further + """ + + if ';' in gene: + for gene_part in gene.split(';'): # TODO - NOTE! HERE BOTH GENES IN A PAIR IS ADDED as separate key/value-pairs + main_dict[genome][gene_part] = pan_gene_name + else: + main_dict[genome][gene] = pan_gene_name + + return main_dict + + +def check_fragmented_gene(fragments_in_line, input_gffs, tmp_folder_path): + return_list = [] + for fragment in fragments_in_line: + # split the two fragments + fragments = fragment.split(';') + + # Get the name of the genome + genome = fragments[0].rsplit("_", 1)[0] + + # Get the gff and its path + try: + gff_file = [file for file in input_gffs if f'{genome}.gff' in file][0] + except IndexError: + raise NotImplementedError(f'No gff match was found when searching fragments for genome: {genome}') + + # Construct gff database to be searched + db_name = os.path.join(tmp_folder_path, f'{genome}_db') + if not os.path.isfile(db_name): + gffutils.create_db(gff_file, db_name, force_gff=True) + + # Attach database + gff_database = gffutils.FeatureDB(db_name) + + # Check that all fragments are on the same contig. 
+ first_fragment_contig = gff_database[fragments[0]][0] + frag_same_contig = all([first_fragment_contig == gff_database[fragment][0] for fragment in fragments]) + if frag_same_contig: + # Get all coordinates + frag_coors = [] + for frag in fragments: + frag_coors.append(gff_database[frag][3]) + frag_coors.append(gff_database[frag][4]) + + # Construct region to be searched for annotations between fragments: + max_frag_coor = max(frag_coors) + min_frag_coor = min(frag_coors) + region = (first_fragment_contig, min_frag_coor, max_frag_coor) + + # Find all features that are completly within the region + region_features = gff_database.region(region=region, completely_within=True) + + # find all genes that are not part of the fragmented gene + region_locus_tags = set([feature[8]['locus_tag'][0] for feature in region_features]) + excess_genes = region_locus_tags.difference(fragments) + + # check the number of excess genes, if any then False to being core + if len(excess_genes) > 0: + return_list.append(False) + else: + return_list.append(True) + + return return_list + # TODO - Find out how the gff parser handles this? Does there need to be a check if a gene cluster is being paired to it self and if then drop it and change the end coordinates. + + +def read_gene_presence_absence(pres_abs_file, core_gene_presence, low_freq_gene, source_program, input_gffs, tmp_folder_path, verbose=True): + """Function that pass a Roary style gene presence/absence file. 
+ Returns directories of core and low frequency genes, and a directory of pan genome clusters and their annotation""" + + # file = os.path.join("", pres_abs_file) + + # Open the presence/absense file to index gene into core, accessory, or low-frequency genes + with open(pres_abs_file, 'r', newline='', ) as gene_presence_absence: + # Read column header line + gff_file_names = gene_presence_absence.readline() + # Strip for whitespace + gff_file_names = gff_file_names.strip() + # split column names + gff_file_names = gff_file_names.split(',') + + # Remove the quotes from Rorary input + if source_program == 'Roary': + gff_file_names = [filename.replace('"', '') for filename in gff_file_names] + + # Index gff filenames and column position in dict for better search + gff_file_dict = {} + for i, gff_name in enumerate(gff_file_names[14:]): + gff_file_dict[gff_name] = i + + # Read remaining lines and construct a nested dicts one dict for each genome and its core genes, + # and a dict for low frequency genes found in less than set percent of isolates + + # Initialise reader object to read remaining lines + reader = csv.reader(gene_presence_absence, delimiter=',') + # Counters + core_gene_number = 0 + low_freq_gene_number = 0 + acc_gene_number = 0 + + # Determine number of isolates that represent core and low frequency genes + core_gene_isolate_presence = floor(len(gff_file_dict.keys()) * core_gene_presence) + low_freq_gene_isolate_presence = ceil(len(gff_file_dict.keys()) * low_freq_gene) + + if verbose: + print(f"\n------------Opening the gene presence/absence file------------\n") + print(f"Core genes must be found in {core_gene_isolate_presence} or more isolates") + print(f"Low frequency genes must be found in {low_freq_gene_isolate_presence} or fewer isolates\n") + + # initialise dict of dicts to hold genes from each gffs and to be returned + core_gene_dict = {item: {} for item in gff_file_names[14:]} + low_freq_gene_dict = {item: {} for item in gff_file_names[14:]} + 
acc_gene_dict = {item: {} for item in gff_file_names[14:]} + + # Initialise dict that contain annotations + annotation_dict = {} + + # Read lines from file and determine if core, low frequency or 'regular' accessory and record annotations + for line in reader: + # Remove quotes if Roary + if source_program == 'Roary': + line = [element.replace('"', '') for element in line] + + # Record annotations of refound genes + if any(['refound' in gene for gene in line[14:]]): + refound_genes = [gene for gene in line[14:] if 'refound' in gene] + for gene in refound_genes: + annotation_dict[gene] = line[2] + + # Get number of genes in line and average presence of genes in genomes + gene_isolate_presence = int(line[3]) + no_seq_presence = int(line[4]) + + # Check if core gene, if then add annotations to genomes + # TODO - Handle genes that have a paralog and are concatenated by ';', and check if neighbours + # Check if gene is present in all genomes and no one gene is fragmented + if core_gene_isolate_presence <= gene_isolate_presence == no_seq_presence: + # Add gene cluster to genomes + for genome in core_gene_dict.keys(): # TODO - Change this to go through genomes with something in them - so that core threshold can be lower + # Check if there is an annotation for the given genome + if len(line[14 + gff_file_dict[genome]]) > 0: + core_gene_dict[genome][line[14+gff_file_dict[genome]]] = line[0] + core_gene_number += 1 + + # Check if gene is present in all genomes, but more than one copy is pressent + elif core_gene_isolate_presence <= gene_isolate_presence: + # Identify annotations for genomes that are fragmented genes + fragments_in_line = [genes for genes in line[14:] if ';' in genes] + + # Check that each annotation is neighboring the other annotation. 
+ return_list = check_fragmented_gene(fragments_in_line, input_gffs, tmp_folder_path) + + # Check if gene was found to be a core gene + if all(return_list): + # Add the gene to the annotation dict + for genome in core_gene_dict.keys(): + # Get the annoations for a specific genome + genes_in_genome = line[14 + gff_file_dict[genome]] + # If there is an annotation add id + if len(genes_in_genome) > 0: + # Check if genome has fragments of genes, + # if then add them all to the annotation dict, + # if not then just ad the single annotation + add_gene_to_dict(core_gene_dict, genes_in_genome, line[0], genome) + core_gene_number += 1 + + else: + # Check if low frequency, if then add else then add as normal accessory + if low_freq_gene_isolate_presence >= gene_isolate_presence == no_seq_presence: # TODO - review this == statement, should it be there? + for genome in low_freq_gene_dict.keys(): + if len(line[14 + gff_file_dict[genome]]) > 0: + add_gene_to_dict(low_freq_gene_dict, line[14 + gff_file_dict[genome]], line[0], genome) + low_freq_gene_number += 1 + else: + for genome in acc_gene_dict.keys(): + if len(line[14 + gff_file_dict[genome]]) > 0: + add_gene_to_dict(acc_gene_dict, line[14 + gff_file_dict[genome]], line[0], genome) + acc_gene_number += 1 + + # Check if accessory if then add annotation to genomes + elif low_freq_gene_isolate_presence >= gene_isolate_presence == no_seq_presence: # TODO - review this == statement, should it be there? 
+ for genome in low_freq_gene_dict.keys(): + if len(line[14+gff_file_dict[genome]]) > 0: + add_gene_to_dict(low_freq_gene_dict, line[14 + gff_file_dict[genome]], line[0], genome) + low_freq_gene_number += 1 + + # If not core or low frequency count as regular accessory + else: + for genome in acc_gene_dict.keys(): + if len(line[14+gff_file_dict[genome]]) > 0: + add_gene_to_dict(acc_gene_dict, line[14 + gff_file_dict[genome]], line[0], genome) + acc_gene_number += 1 + + if verbose: + print("A total of:") + print(f"{core_gene_number} core gene clusters were identified") + print(f"{low_freq_gene_number} low frequency gene clusters were identified") + print(f"{acc_gene_number} intermediate accessory gene clusters were identified\n") + + + # Remove gff databases + files_in_tmp = os.listdir(tmp_folder_path) + gff_dbs = [file for file in files_in_tmp if '_db' in file] + [os.remove(os.path.join(tmp_folder_path, db)) for db in gff_dbs] + + return core_gene_dict, low_freq_gene_dict, acc_gene_dict, annotation_dict + + +if __name__ == '__main__': + pass diff --git a/unit_tests/Corekaburra_test.py b/unit_tests/Corekaburra_test.py index 50cdf6a..e944c5c 100644 --- a/unit_tests/Corekaburra_test.py +++ b/unit_tests/Corekaburra_test.py @@ -16,6 +16,7 @@ from Corekaburra import exit_with_error from Corekaburra import read_complete_genome_file from Corekaburra import check_inputs +from Corekaburra import parse_gene_presence_absence @@ -27,6 +28,7 @@ class TestExitWithError(unittest.TestCase): + """ Test for the function carrying out a nice exit """ def test_exit_w_tmp_folder_deletion(self): ''' Test the exit function is able to remove the temporary folder ''' @@ -46,6 +48,7 @@ def test_exit_w_tmp_folder_deletion(self): class TestParsingCompleteGenomes(unittest.TestCase): + """ Test for the passing of input file containing names of complete genome and checking their presence in the pan-genome """ def test_all_files_found(self): gff_files = ['/path/to/complete_genome_1.gff', 
'/path/complete_genome_2.gff.gz', @@ -80,6 +83,7 @@ def test_correct_one_files_not_found(self): class TestPangenomeSourceProgram(unittest.TestCase): + """ Test of the function that determines the program from which the pan-genome originated """ def test_roary_input(self): input_folder_path = 'TestPangenomeSourceProgram/Mock_roary' @@ -112,6 +116,7 @@ def test_unknown_input(self): class TestPresenceOfGenedataFile(unittest.TestCase): + """ Test the function that ensures the presence of the Gene_data.csv file produced by Panaroo """ def test_Genedata_File_present(self): input_folder_path = 'TestPresenceOfGenedataFile/present' return_path = check_inputs.check_gene_data(input_folder_path) @@ -126,6 +131,7 @@ def test_Genedata_File_absent(self): class TestPresenceOfGffsInPresAbsFile(unittest.TestCase): + """ Test the function that ensures all gffs given as input are included in the pan-genome provided """ # Test pairing of all files in pan genome def test_input_gff_pres_abs_pairing_all_gffs(self): input_pres_abs = 'TestPresenceOfGffsInPresAbsFile/gene_presence_absence_roary.csv' @@ -160,5 +166,480 @@ def test_input_gff_pres_abs_some_file_not_in_pan(self): with self.assertRaises(SystemExit): check_inputs.check_gff_in_pan(input_file_list, input_pres_abs) + +class TestAddingGeneToDict(unittest.TestCase): + """ + Tests of the function that adds a gene to the dict used to holds that class (core, accessory of low-frequency) + """ + def test_adding_gene(self): + main_dict = {'Test_genome': {}} + gene = 'Test_gene_from_genome' + pan_gene_name = 'Test_pan_gene' + genome = 'Test_genome' + + expected_return = {'Test_genome': {'Test_gene_from_genome': 'Test_pan_gene'}} + + return_dict = parse_gene_presence_absence.add_gene_to_dict(main_dict, gene, pan_gene_name, genome) + + self.assertEqual(expected_return, return_dict) + + def test_adding_additional_gene(self): + main_dict = {'Test_genome': {'Test_gene_from_genome': 'Test_pan_gene'}} + gene = 'Test_gene_from_genome_2' + 
pan_gene_name = 'Test_pan_gene_2' + genome = 'Test_genome' + + expected_return = {'Test_genome': {'Test_gene_from_genome': 'Test_pan_gene', + 'Test_gene_from_genome_2': 'Test_pan_gene_2'}} + + return_dict = parse_gene_presence_absence.add_gene_to_dict(main_dict, gene, pan_gene_name, genome) + + self.assertEqual(expected_return, return_dict) + + +class TestCheckingFragmentedGenes(unittest.TestCase): + """ + Test of the function that examines the placement of a potential core gene's placement, if it is fragmented in at least one genome. + """ + + def tearDown(self): + """ Class to remove created database files of gff files in tmp-folder""" + for file in os.listdir('test_tmp_folder'): + if "_db" in file: + db_path = os.path.join('test_tmp_folder', file) + os.remove(db_path) + + def test_fragmented_gene_true(self): + """ Gene is fragmented but found next to each other with nothing in between """ + fragments_in_line = ['Silas_the_Salmonella_tag-1-2.1;Silas_the_Salmonella_tag-1-2.2'] + input_gffs =['TestParsingGenePresenceAbsenceFile/Silas_the_Salmonella.gff', + 'TestParsingGenePresenceAbsenceFile/Christina_the_Streptococcus.gff', + 'TestParsingGenePresenceAbsenceFile/Ajwa_the_Shigella.gff', + 'TestParsingGenePresenceAbsenceFile/Ajwa_the_Legionella.gff', + 'TestParsingGenePresenceAbsenceFile/Cari_the_Listeria.gff', + 'TestParsingGenePresenceAbsenceFile/Aman_the_Streptococcus.gff', + 'TestParsingGenePresenceAbsenceFile/Zion_the_Streptococcus.gff', + 'TestParsingGenePresenceAbsenceFile/Dina_the_Shigella.gff', + 'TestParsingGenePresenceAbsenceFile/Silas_the_Legionella.gff', + 'TestParsingGenePresenceAbsenceFile/Lilly_the_Shigella.gff'] + tmp_folder_path = 'test_tmp_folder' + + expected_return = [True] + + return_bool = parse_gene_presence_absence.check_fragmented_gene(fragments_in_line, input_gffs, tmp_folder_path) + + self.assertEqual(expected_return, return_bool) + + def test_fragmented_gene_fasle(self): + """ Gene is fragmented but found next to each other with another 
gene in between """ + fragments_in_line = ['Silas_the_Salmonella_tag-1-5.1;Silas_the_Salmonella_tag-1-5.2'] + input_gffs = ['TestParsingGenePresenceAbsenceFile/Silas_the_Salmonella.gff', + 'TestParsingGenePresenceAbsenceFile/Christina_the_Streptococcus.gff', + 'TestParsingGenePresenceAbsenceFile/Ajwa_the_Shigella.gff', + 'TestParsingGenePresenceAbsenceFile/Ajwa_the_Legionella.gff', + 'TestParsingGenePresenceAbsenceFile/Cari_the_Listeria.gff', + 'TestParsingGenePresenceAbsenceFile/Aman_the_Streptococcus.gff', + 'TestParsingGenePresenceAbsenceFile/Zion_the_Streptococcus.gff', + 'TestParsingGenePresenceAbsenceFile/Dina_the_Shigella.gff', + 'TestParsingGenePresenceAbsenceFile/Silas_the_Legionella.gff', + 'TestParsingGenePresenceAbsenceFile/Lilly_the_Shigella.gff'] + tmp_folder_path = 'test_tmp_folder' + + expected_return = [False] + + return_bool = parse_gene_presence_absence.check_fragmented_gene(fragments_in_line, input_gffs, tmp_folder_path) + + self.assertEqual(expected_return, return_bool) + + def test_fragmented_gene_mutiple_genes_fasle(self): + """ Two genes fragmented with one having nothing and the other having something in between fragments """ + fragments_in_line = ['Silas_the_Salmonella_tag-1-2.1;Silas_the_Salmonella_tag-1-2.2', 'Silas_the_Salmonella_tag-1-5.1;Silas_the_Salmonella_tag-1-5.2'] + input_gffs = ['TestParsingGenePresenceAbsenceFile/Silas_the_Salmonella.gff', + 'TestParsingGenePresenceAbsenceFile/Christina_the_Streptococcus.gff', + 'TestParsingGenePresenceAbsenceFile/Ajwa_the_Shigella.gff', + 'TestParsingGenePresenceAbsenceFile/Ajwa_the_Legionella.gff', + 'TestParsingGenePresenceAbsenceFile/Cari_the_Listeria.gff', + 'TestParsingGenePresenceAbsenceFile/Aman_the_Streptococcus.gff', + 'TestParsingGenePresenceAbsenceFile/Zion_the_Streptococcus.gff', + 'TestParsingGenePresenceAbsenceFile/Dina_the_Shigella.gff', + 'TestParsingGenePresenceAbsenceFile/Silas_the_Legionella.gff', + 'TestParsingGenePresenceAbsenceFile/Lilly_the_Shigella.gff'] + 
tmp_folder_path = 'test_tmp_folder' + + expected_return = [True, False] + + return_bool = parse_gene_presence_absence.check_fragmented_gene(fragments_in_line, input_gffs, tmp_folder_path) + + self.assertEqual(expected_return, return_bool) + + +class TestParsingGenePresenceAbsenceFile(unittest.TestCase): + """ + Tests for the function that passes the gene presence absence table from pan-genome program + """ + def test_parsing_w_100_presence(self): + file_name = 'TestParsingGenePresenceAbsenceFile/gene_presence_absence_roary.csv' + core_gene_presence = 1 + low_freq_gene = 0.1 + source_program = 'Panaroo' + input_gffs = ['TestParsingGenePresenceAbsenceFile/Silas_the_Salmonella.gff', + 'TestParsingGenePresenceAbsenceFile/Christina_the_Streptococcus.gff', + 'TestParsingGenePresenceAbsenceFile/Ajwa_the_Shigella.gff', + 'TestParsingGenePresenceAbsenceFile/Ajwa_the_Legionella.gff', + 'TestParsingGenePresenceAbsenceFile/Cari_the_Listeria.gff', + 'TestParsingGenePresenceAbsenceFile/Aman_the_Streptococcus.gff', + 'TestParsingGenePresenceAbsenceFile/Zion_the_Streptococcus.gff', + 'TestParsingGenePresenceAbsenceFile/Dina_the_Shigella.gff', + 'TestParsingGenePresenceAbsenceFile/Silas_the_Legionella.gff', + 'TestParsingGenePresenceAbsenceFile/Lilly_the_Shigella.gff'] + tmp_folder_path = 'TestParsingGenePresenceAbsenceFile/' + + expected_core_gene_dict = {'Silas_the_Salmonella': {'Silas_the_Salmonella_tag-1-1': "A", + 'Silas_the_Salmonella_tag-1-2.1': "B", + 'Silas_the_Salmonella_tag-1-2.2': "B"}, + 'Christina_the_Streptococcus': {'Christina_the_Streptococcus_tag-2-1': "A", + 'Christina_the_Streptococcus_tag-2-2': "B"}, + 'Ajwa_the_Shigella': {'Ajwa_the_Shigella_tag-3-1': "A", + 'Ajwa_the_Shigella_tag-3-2': "B"}, + 'Ajwa_the_Legionella': {'Ajwa_the_Legionella_tag-4-1': "A", + 'Ajwa_the_Legionella_tag-4-2': "B"}, + 'Cari_the_Listeria': {'Cari_the_Listeria_tag-5-1': "A", + 'Cari_the_Listeria_tag-5-2': "B"}, + 'Aman_the_Streptococcus': {'Aman_the_Streptococcus_tag-6-1': "A", + 
'Aman_the_Streptococcus_tag-6-2': "B"}, + 'Zion_the_Streptococcus': {'Zion_the_Streptococcus_tag-7-1': "A", + 'Zion_the_Streptococcus_tag-7-2': "B"}, + 'Dina_the_Shigella': {'Dina_the_Shigella_tag-8-1': "A", + 'Dina_the_Shigella_tag-8-2': "B"}, + 'Silas_the_Legionella': {'Silas_the_Legionella_tag-9-1': "A", + 'Silas_the_Legionella_tag-9-2': "B"}, + 'Lilly_the_Shigella': {'Lilly_the_Shigella_tag-10-1': "A", + 'Lilly_the_Shigella_tag-10-2': "B"}} + expected_low_freq_gene_dict = {'Silas_the_Salmonella': {'Silas_the_Salmonella_tag-1-7': "G"}, + 'Christina_the_Streptococcus': {}, + 'Ajwa_the_Shigella': {}, + 'Ajwa_the_Legionella': {}, + 'Cari_the_Listeria': {}, + 'Aman_the_Streptococcus': {}, + 'Zion_the_Streptococcus': {}, + 'Dina_the_Shigella': {}, + 'Silas_the_Legionella': {}, + 'Lilly_the_Shigella': {'Lilly_the_Shigella_tag-10-6': "F"}} + expected_acc_gene_dict = {'Silas_the_Salmonella': {'Silas_the_Salmonella_tag-1-3': 'C', + 'Silas_the_Salmonella_tag-1-4.1': 'D', + 'Silas_the_Salmonella_tag-1-4.2': 'D', + 'Silas_the_Salmonella_tag-1-5.1': 'E', + 'Silas_the_Salmonella_tag-1-5.2': 'E'}, + 'Christina_the_Streptococcus': {'Christina_the_Streptococcus_tag-2-3': "C", + 'Christina_the_Streptococcus_tag-2-4': "D", + 'Christina_the_Streptococcus_tag-2-5': "E"}, + 'Ajwa_the_Shigella': {"Ajwa_the_Shigella_tag-3-3": "C", + "Ajwa_the_Shigella_tag-3-4": "D", + "Ajwa_the_Shigella_tag-3-5": "E"}, + 'Ajwa_the_Legionella': {'Ajwa_the_Legionella_tag-4-3': "C", + 'Ajwa_the_Legionella_tag-4-4': "D", + 'Ajwa_the_Legionella_tag-4-5': "E"}, + 'Cari_the_Listeria': {"Cari_the_Listeria_tag-5-3": "C", + "Cari_the_Listeria_tag-5-4": "D", + "Cari_the_Listeria_tag-5-5": "E"}, + 'Aman_the_Streptococcus': {"Aman_the_Streptococcus_tag-6-3": "C", + "Aman_the_Streptococcus_tag-6-4": "D", + "Aman_the_Streptococcus_tag-6-5": "E"}, + 'Zion_the_Streptococcus': {"Zion_the_Streptococcus_tag-7-3": "C", + "Zion_the_Streptococcus_tag-7-4": "D", + "Zion_the_Streptococcus_tag-7-5": "E"}, + 'Dina_the_Shigella': 
{"Dina_the_Shigella_tag-8-3": "C", + "Dina_the_Shigella_tag-8-4": "D", + "Dina_the_Shigella_tag-8-5": "E"}, + 'Silas_the_Legionella': {"Silas_the_Legionella_tag-9-3": "C", + "Silas_the_Legionella_tag-9-4": "D", + "Silas_the_Legionella_tag-9-5": "E"}, + 'Lilly_the_Shigella': {'Lilly_the_Shigella_tag-10-5': "E"}} + expected_annotation_dict = {} # None - Should be done and holds refounds! - TODO Make test for this + + core_gene_dict, low_freq_gene_dict, \ + acc_gene_dict, annotation_dict = \ + parse_gene_presence_absence.read_gene_presence_absence( + file_name, core_gene_presence, + low_freq_gene, source_program, + input_gffs, tmp_folder_path) + + self.assertEqual(expected_core_gene_dict, core_gene_dict) + self.assertEqual(expected_low_freq_gene_dict, low_freq_gene_dict) + self.assertEqual(expected_acc_gene_dict, acc_gene_dict) + + def test_parsing_w_100_presence_roary(self): + file_name = 'TestParsingGenePresenceAbsenceFile/gene_presence_absence.csv' + core_gene_presence = 1 + low_freq_gene = 0.1 + source_program = 'Roary' + input_gffs = ['TestParsingGenePresenceAbsenceFile/Silas_the_Salmonella.gff', + 'TestParsingGenePresenceAbsenceFile/Christina_the_Streptococcus.gff', + 'TestParsingGenePresenceAbsenceFile/Ajwa_the_Shigella.gff', + 'TestParsingGenePresenceAbsenceFile/Ajwa_the_Legionella.gff', + 'TestParsingGenePresenceAbsenceFile/Cari_the_Listeria.gff', + 'TestParsingGenePresenceAbsenceFile/Aman_the_Streptococcus.gff', + 'TestParsingGenePresenceAbsenceFile/Zion_the_Streptococcus.gff', + 'TestParsingGenePresenceAbsenceFile/Dina_the_Shigella.gff', + 'TestParsingGenePresenceAbsenceFile/Silas_the_Legionella.gff', + 'TestParsingGenePresenceAbsenceFile/Lilly_the_Shigella.gff'] + tmp_folder_path = 'TestParsingGenePresenceAbsenceFile/' + + + + core_gene_dict, low_freq_gene_dict, \ + acc_gene_dict, annotation_dict = \ + parse_gene_presence_absence.read_gene_presence_absence( + file_name, core_gene_presence, + low_freq_gene, source_program, + input_gffs, tmp_folder_path) + + 
expected_core_gene_dict = {'Silas_the_Salmonella': {'Silas_the_Salmonella_tag-1-1': "A", + 'Silas_the_Salmonella_tag-1-2.1': "B", + 'Silas_the_Salmonella_tag-1-2.2': "B"}, + 'Christina_the_Streptococcus': {'Christina_the_Streptococcus_tag-2-1': "A", + 'Christina_the_Streptococcus_tag-2-2': "B"}, + 'Ajwa_the_Shigella': {'Ajwa_the_Shigella_tag-3-1': "A", + 'Ajwa_the_Shigella_tag-3-2': "B"}, + 'Ajwa_the_Legionella': {'Ajwa_the_Legionella_tag-4-1': "A", + 'Ajwa_the_Legionella_tag-4-2': "B"}, + 'Cari_the_Listeria': {'Cari_the_Listeria_tag-5-1': "A", + 'Cari_the_Listeria_tag-5-2': "B"}, + 'Aman_the_Streptococcus': {'Aman_the_Streptococcus_tag-6-1': "A", + 'Aman_the_Streptococcus_tag-6-2': "B"}, + 'Zion_the_Streptococcus': {'Zion_the_Streptococcus_tag-7-1': "A", + 'Zion_the_Streptococcus_tag-7-2': "B"}, + 'Dina_the_Shigella': {'Dina_the_Shigella_tag-8-1': "A", + 'Dina_the_Shigella_tag-8-2': "B"}, + 'Silas_the_Legionella': {'Silas_the_Legionella_tag-9-1': "A", + 'Silas_the_Legionella_tag-9-2': "B"}, + 'Lilly_the_Shigella': {'Lilly_the_Shigella_tag-10-1': "A", + 'Lilly_the_Shigella_tag-10-2': "B"}} + expected_low_freq_gene_dict = {'Silas_the_Salmonella': {'Silas_the_Salmonella_tag-1-7': "G"}, + 'Christina_the_Streptococcus': {}, + 'Ajwa_the_Shigella': {}, + 'Ajwa_the_Legionella': {}, + 'Cari_the_Listeria': {}, + 'Aman_the_Streptococcus': {}, + 'Zion_the_Streptococcus': {}, + 'Dina_the_Shigella': {}, + 'Silas_the_Legionella': {}, + 'Lilly_the_Shigella': {'Lilly_the_Shigella_tag-10-6': "F"}} + expected_acc_gene_dict = {'Silas_the_Salmonella': {'Silas_the_Salmonella_tag-1-3': 'C', + 'Silas_the_Salmonella_tag-1-4.1': 'D', + 'Silas_the_Salmonella_tag-1-4.2': 'D', + 'Silas_the_Salmonella_tag-1-5.1': 'E', + 'Silas_the_Salmonella_tag-1-5.2': 'E'}, + 'Christina_the_Streptococcus': {'Christina_the_Streptococcus_tag-2-3': "C", + 'Christina_the_Streptococcus_tag-2-4': "D", + 'Christina_the_Streptococcus_tag-2-5': "E"}, + 'Ajwa_the_Shigella': {"Ajwa_the_Shigella_tag-3-3": "C", + 
"Ajwa_the_Shigella_tag-3-4": "D", + "Ajwa_the_Shigella_tag-3-5": "E"}, + 'Ajwa_the_Legionella': {'Ajwa_the_Legionella_tag-4-3': "C", + 'Ajwa_the_Legionella_tag-4-4': "D", + 'Ajwa_the_Legionella_tag-4-5': "E"}, + 'Cari_the_Listeria': {"Cari_the_Listeria_tag-5-3": "C", + "Cari_the_Listeria_tag-5-4": "D", + "Cari_the_Listeria_tag-5-5": "E"}, + 'Aman_the_Streptococcus': {"Aman_the_Streptococcus_tag-6-3": "C", + "Aman_the_Streptococcus_tag-6-4": "D", + "Aman_the_Streptococcus_tag-6-5": "E"}, + 'Zion_the_Streptococcus': {"Zion_the_Streptococcus_tag-7-3": "C", + "Zion_the_Streptococcus_tag-7-4": "D", + "Zion_the_Streptococcus_tag-7-5": "E"}, + 'Dina_the_Shigella': {"Dina_the_Shigella_tag-8-3": "C", + "Dina_the_Shigella_tag-8-4": "D", + "Dina_the_Shigella_tag-8-5": "E"}, + 'Silas_the_Legionella': {"Silas_the_Legionella_tag-9-3": "C", + "Silas_the_Legionella_tag-9-4": "D", + "Silas_the_Legionella_tag-9-5": "E"}, + 'Lilly_the_Shigella': {'Lilly_the_Shigella_tag-10-5': "E"}} + + self.assertEqual(expected_core_gene_dict, core_gene_dict) + self.assertEqual(expected_low_freq_gene_dict, low_freq_gene_dict) + self.assertEqual(expected_acc_gene_dict, acc_gene_dict) + + def test_parsing_w_90_presence(self): + file_name = 'TestParsingGenePresenceAbsenceFile/gene_presence_absence_roary.csv' + core_gene_presence = 0.9 + low_freq_gene = 0.1 + source_program = 'Panaroo' + input_gffs = ['TestParsingGenePresenceAbsenceFile/Silas_the_Salmonella.gff', + 'TestParsingGenePresenceAbsenceFile/Christina_the_Streptococcus.gff', + 'TestParsingGenePresenceAbsenceFile/Ajwa_the_Shigella.gff', + 'TestParsingGenePresenceAbsenceFile/Ajwa_the_Legionella.gff', + 'TestParsingGenePresenceAbsenceFile/Cari_the_Listeria.gff', + 'TestParsingGenePresenceAbsenceFile/Aman_the_Streptococcus.gff', + 'TestParsingGenePresenceAbsenceFile/Zion_the_Streptococcus.gff', + 'TestParsingGenePresenceAbsenceFile/Dina_the_Shigella.gff', + 'TestParsingGenePresenceAbsenceFile/Silas_the_Legionella.gff', + 
'TestParsingGenePresenceAbsenceFile/Lilly_the_Shigella.gff'] + tmp_folder_path = 'TestParsingGenePresenceAbsenceFile/' + + core_gene_dict, low_freq_gene_dict, \ + acc_gene_dict, annotation_dict = \ + parse_gene_presence_absence.read_gene_presence_absence( + file_name, core_gene_presence, + low_freq_gene, source_program, + input_gffs, tmp_folder_path) + + expected_core_gene_dict = {'Silas_the_Salmonella': {'Silas_the_Salmonella_tag-1-1': "A", + 'Silas_the_Salmonella_tag-1-2.1': "B", + 'Silas_the_Salmonella_tag-1-2.2': "B", + 'Silas_the_Salmonella_tag-1-3': 'C', + 'Silas_the_Salmonella_tag-1-4.1': 'D', + 'Silas_the_Salmonella_tag-1-4.2': 'D',}, + 'Christina_the_Streptococcus': {'Christina_the_Streptococcus_tag-2-1': "A", + 'Christina_the_Streptococcus_tag-2-2': "B", + 'Christina_the_Streptococcus_tag-2-3': "C", + 'Christina_the_Streptococcus_tag-2-4': "D"}, + 'Ajwa_the_Shigella': {'Ajwa_the_Shigella_tag-3-1': "A", + 'Ajwa_the_Shigella_tag-3-2': "B", + "Ajwa_the_Shigella_tag-3-3": "C", + "Ajwa_the_Shigella_tag-3-4": "D"}, + 'Ajwa_the_Legionella': {'Ajwa_the_Legionella_tag-4-1': "A", + 'Ajwa_the_Legionella_tag-4-2': "B", + 'Ajwa_the_Legionella_tag-4-3': "C", + 'Ajwa_the_Legionella_tag-4-4': "D"}, + 'Cari_the_Listeria': {"Cari_the_Listeria_tag-5-3": "C", + "Cari_the_Listeria_tag-5-4": "D", + 'Cari_the_Listeria_tag-5-1': "A", + 'Cari_the_Listeria_tag-5-2': "B"}, + 'Aman_the_Streptococcus': {'Aman_the_Streptococcus_tag-6-1': "A", + 'Aman_the_Streptococcus_tag-6-2': "B", + "Aman_the_Streptococcus_tag-6-3": "C", + "Aman_the_Streptococcus_tag-6-4": "D"}, + 'Zion_the_Streptococcus': {"Zion_the_Streptococcus_tag-7-3": "C", + "Zion_the_Streptococcus_tag-7-4": "D", + 'Zion_the_Streptococcus_tag-7-1': "A", + 'Zion_the_Streptococcus_tag-7-2': "B"}, + 'Dina_the_Shigella': {"Dina_the_Shigella_tag-8-3": "C", + "Dina_the_Shigella_tag-8-4": "D", + 'Dina_the_Shigella_tag-8-1': "A", + 'Dina_the_Shigella_tag-8-2': "B"}, + 'Silas_the_Legionella': {"Silas_the_Legionella_tag-9-3": "C", + 
"Silas_the_Legionella_tag-9-4": "D", + 'Silas_the_Legionella_tag-9-1': "A", + 'Silas_the_Legionella_tag-9-2': "B"}, + 'Lilly_the_Shigella': {'Lilly_the_Shigella_tag-10-1': "A", + 'Lilly_the_Shigella_tag-10-2': "B"}} + expected_low_freq_gene_dict = {'Silas_the_Salmonella': {'Silas_the_Salmonella_tag-1-7': "G"}, + 'Christina_the_Streptococcus': {}, + 'Ajwa_the_Shigella': {}, + 'Ajwa_the_Legionella': {}, + 'Cari_the_Listeria': {}, + 'Aman_the_Streptococcus': {}, + 'Zion_the_Streptococcus': {}, + 'Dina_the_Shigella': {}, + 'Silas_the_Legionella': {}, + 'Lilly_the_Shigella': {'Lilly_the_Shigella_tag-10-6': "F"}} + expected_acc_gene_dict = {'Silas_the_Salmonella': {'Silas_the_Salmonella_tag-1-5.1': 'E', + 'Silas_the_Salmonella_tag-1-5.2': 'E'}, + 'Christina_the_Streptococcus': {'Christina_the_Streptococcus_tag-2-5': "E"}, + 'Ajwa_the_Shigella': {"Ajwa_the_Shigella_tag-3-5": "E"}, + 'Ajwa_the_Legionella': {'Ajwa_the_Legionella_tag-4-5': "E"}, + 'Cari_the_Listeria': {"Cari_the_Listeria_tag-5-5": "E"}, + 'Aman_the_Streptococcus': {"Aman_the_Streptococcus_tag-6-5": "E"}, + 'Zion_the_Streptococcus': {"Zion_the_Streptococcus_tag-7-5": "E"}, + 'Dina_the_Shigella': {"Dina_the_Shigella_tag-8-5": "E"}, + 'Silas_the_Legionella': {"Silas_the_Legionella_tag-9-5": "E"}, + 'Lilly_the_Shigella': {'Lilly_the_Shigella_tag-10-5': "E"}} + + self.assertEqual(expected_core_gene_dict, core_gene_dict) + self.assertEqual(expected_low_freq_gene_dict, low_freq_gene_dict) + self.assertEqual(expected_acc_gene_dict, acc_gene_dict) + + def test_parsing_w_90_presence_roary(self): + file_name = 'TestParsingGenePresenceAbsenceFile/gene_presence_absence.csv' + core_gene_presence = 0.90 + low_freq_gene = 0.1 + source_program = 'Roary' + input_gffs = ['TestParsingGenePresenceAbsenceFile/Silas_the_Salmonella.gff', + 'TestParsingGenePresenceAbsenceFile/Christina_the_Streptococcus.gff', + 'TestParsingGenePresenceAbsenceFile/Ajwa_the_Shigella.gff', + 'TestParsingGenePresenceAbsenceFile/Ajwa_the_Legionella.gff', 
+ 'TestParsingGenePresenceAbsenceFile/Cari_the_Listeria.gff', + 'TestParsingGenePresenceAbsenceFile/Aman_the_Streptococcus.gff', + 'TestParsingGenePresenceAbsenceFile/Zion_the_Streptococcus.gff', + 'TestParsingGenePresenceAbsenceFile/Dina_the_Shigella.gff', + 'TestParsingGenePresenceAbsenceFile/Silas_the_Legionella.gff', + 'TestParsingGenePresenceAbsenceFile/Lilly_the_Shigella.gff'] + tmp_folder_path = 'TestParsingGenePresenceAbsenceFile/' + + core_gene_dict, low_freq_gene_dict, \ + acc_gene_dict, annotation_dict = \ + parse_gene_presence_absence.read_gene_presence_absence( + file_name, core_gene_presence, + low_freq_gene, source_program, + input_gffs, tmp_folder_path) + + expected_core_gene_dict = {'Silas_the_Salmonella': {'Silas_the_Salmonella_tag-1-1': "A", + 'Silas_the_Salmonella_tag-1-2.1': "B", + 'Silas_the_Salmonella_tag-1-2.2': "B", + 'Silas_the_Salmonella_tag-1-3': 'C', + 'Silas_the_Salmonella_tag-1-4.1': 'D', + 'Silas_the_Salmonella_tag-1-4.2': 'D', }, + 'Christina_the_Streptococcus': {'Christina_the_Streptococcus_tag-2-1': "A", + 'Christina_the_Streptococcus_tag-2-2': "B", + 'Christina_the_Streptococcus_tag-2-3': "C", + 'Christina_the_Streptococcus_tag-2-4': "D"}, + 'Ajwa_the_Shigella': {'Ajwa_the_Shigella_tag-3-1': "A", + 'Ajwa_the_Shigella_tag-3-2': "B", + "Ajwa_the_Shigella_tag-3-3": "C", + "Ajwa_the_Shigella_tag-3-4": "D"}, + 'Ajwa_the_Legionella': {'Ajwa_the_Legionella_tag-4-1': "A", + 'Ajwa_the_Legionella_tag-4-2': "B", + 'Ajwa_the_Legionella_tag-4-3': "C", + 'Ajwa_the_Legionella_tag-4-4': "D"}, + 'Cari_the_Listeria': {"Cari_the_Listeria_tag-5-3": "C", + "Cari_the_Listeria_tag-5-4": "D", + 'Cari_the_Listeria_tag-5-1': "A", + 'Cari_the_Listeria_tag-5-2': "B"}, + 'Aman_the_Streptococcus': {'Aman_the_Streptococcus_tag-6-1': "A", + 'Aman_the_Streptococcus_tag-6-2': "B", + "Aman_the_Streptococcus_tag-6-3": "C", + "Aman_the_Streptococcus_tag-6-4": "D"}, + 'Zion_the_Streptococcus': {"Zion_the_Streptococcus_tag-7-3": "C", + 
"Zion_the_Streptococcus_tag-7-4": "D", + 'Zion_the_Streptococcus_tag-7-1': "A", + 'Zion_the_Streptococcus_tag-7-2': "B"}, + 'Dina_the_Shigella': {"Dina_the_Shigella_tag-8-3": "C", + "Dina_the_Shigella_tag-8-4": "D", + 'Dina_the_Shigella_tag-8-1': "A", + 'Dina_the_Shigella_tag-8-2': "B"}, + 'Silas_the_Legionella': {"Silas_the_Legionella_tag-9-3": "C", + "Silas_the_Legionella_tag-9-4": "D", + 'Silas_the_Legionella_tag-9-1': "A", + 'Silas_the_Legionella_tag-9-2': "B"}, + 'Lilly_the_Shigella': {'Lilly_the_Shigella_tag-10-1': "A", + 'Lilly_the_Shigella_tag-10-2': "B"}} + expected_low_freq_gene_dict = {'Silas_the_Salmonella': {'Silas_the_Salmonella_tag-1-7': "G"}, + 'Christina_the_Streptococcus': {}, + 'Ajwa_the_Shigella': {}, + 'Ajwa_the_Legionella': {}, + 'Cari_the_Listeria': {}, + 'Aman_the_Streptococcus': {}, + 'Zion_the_Streptococcus': {}, + 'Dina_the_Shigella': {}, + 'Silas_the_Legionella': {}, + 'Lilly_the_Shigella': {'Lilly_the_Shigella_tag-10-6': "F"}} + expected_acc_gene_dict = {'Silas_the_Salmonella': {'Silas_the_Salmonella_tag-1-5.1': 'E', + 'Silas_the_Salmonella_tag-1-5.2': 'E'}, + 'Christina_the_Streptococcus': {'Christina_the_Streptococcus_tag-2-5': "E"}, + 'Ajwa_the_Shigella': {"Ajwa_the_Shigella_tag-3-5": "E"}, + 'Ajwa_the_Legionella': {'Ajwa_the_Legionella_tag-4-5': "E"}, + 'Cari_the_Listeria': {"Cari_the_Listeria_tag-5-5": "E"}, + 'Aman_the_Streptococcus': {"Aman_the_Streptococcus_tag-6-5": "E"}, + 'Zion_the_Streptococcus': {"Zion_the_Streptococcus_tag-7-5": "E"}, + 'Dina_the_Shigella': {"Dina_the_Shigella_tag-8-5": "E"}, + 'Silas_the_Legionella': {"Silas_the_Legionella_tag-9-5": "E"}, + 'Lilly_the_Shigella': {'Lilly_the_Shigella_tag-10-5': "E"}} + + self.assertEqual(expected_core_gene_dict, core_gene_dict) + self.assertEqual(expected_low_freq_gene_dict, low_freq_gene_dict) + self.assertEqual(expected_acc_gene_dict, acc_gene_dict) + + if __name__ == '__main__': unittest.main() diff --git 
a/unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/Ajwa_the_Legionella.gff b/unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/Ajwa_the_Legionella.gff new file mode 100644 index 0000000..a8f1fee --- /dev/null +++ b/unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/Ajwa_the_Legionella.gff @@ -0,0 +1,8 @@ +contig_1 . CDS 1 90 . . . ID=Ajwa_the_Legionella_tag-4-1;locus_tag=Ajwa_the_Legionella_tag-4-1 +contig_1 . CDS 100 190 . . . ID=Ajwa_the_Legionella_tag-4-2;locus_tag=Ajwa_the_Legionella_tag-4-2 +contig_1 . CDS 200 290 . . . ID=Ajwa_the_Legionella_tag-4-3;locus_tag=Ajwa_the_Legionella_tag-4-3 +contig_1 . CDS 300 390 . . . ID=Ajwa_the_Legionella_tag-4-4;locus_tag=Ajwa_the_Legionella_tag-4-4 +contig_1 . CDS 400 490 . . . ID=Ajwa_the_Legionella_tag-4-5;locus_tag=Ajwa_the_Legionella_tag-4-5 +##FASTA +>contigdiff --git a/unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/Ajwa_the_Shigella.gff b/unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/Ajwa_the_Shigella.gff new file mode 100644 index 0000000..68c9fee --- /dev/null +++ b/unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/Ajwa_the_Shigella.gff @@ -0,0 +1,8 @@ +contig_1 . CDS 1 90 . . . ID=Ajwa_the_Shigella_tag-3-1;locus_tag=Ajwa_the_Shigella_tag-3-1 +contig_1 . CDS 100 190 . . . ID=Ajwa_the_Shigella_tag-3-2;locus_tag=Ajwa_the_Shigella_tag-3-2 +contig_1 . CDS 200 290 . . . ID=Ajwa_the_Shigella_tag-3-3;locus_tag=Ajwa_the_Shigella_tag-3-3 +contig_1 . CDS 300 390 . . . ID=Ajwa_the_Shigella_tag-3-4;locus_tag=Ajwa_the_Shigella_tag-3-4 +contig_1 . CDS 400 490 . . . 
ID=Ajwa_the_Shigella_tag-3-5;locus_tag=Ajwa_the_Shigella_tag-3-5 +##FASTA +>contig_1 +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN diff --git a/unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/Aman_the_Streptococcus.gff b/unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/Aman_the_Streptococcus.gff new file mode 100644 index 0000000..4123676 --- /dev/null +++ b/unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/Aman_the_Streptococcus.gff @@ -0,0 +1,8 @@ +contig_1 . CDS 1 90 . . . ID=Aman_the_Streptococcus_tag-6-1;locus_tag=Aman_the_Streptococcus_tag-6-1 +contig_1 . CDS 100 190 . . . ID=Aman_the_Streptococcus_tag-6-2;locus_tag=Aman_the_Streptococcus_tag-6-2 +contig_1 . CDS 200 290 . . . 
ID=Aman_the_Streptococcus_tag-6-3;locus_tag=Aman_the_Streptococcus_tag-6-3 +contig_1 . CDS 300 390 . . . ID=Aman_the_Streptococcus_tag-6-4;locus_tag=Aman_the_Streptococcus_tag-6-4 +contig_1 . CDS 400 490 . . . ID=Aman_the_Streptococcus_tag-6-5;locus_tag=Aman_the_Streptococcus_tag-6-5 +##FASTA +>contigdiff --git a/unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/Cari_the_Listeria.gff b/unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/Cari_the_Listeria.gff new file mode 100644 index 0000000..4388351 --- /dev/null +++ b/unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/Cari_the_Listeria.gff @@ -0,0 +1,8 @@ +contig_1 . CDS 1 90 . . . ID=Cari_the_Listeria_tag-5-1;locus_tag=Cari_the_Listeria_tag-5-1 +contig_1 . CDS 100 190 . . . ID=Cari_the_Listeria_tag-5-2;locus_tag=Cari_the_Listeria_tag-5-2 +contig_1 . CDS 200 290 . . . ID=Cari_the_Listeria_tag-5-3;locus_tag=Cari_the_Listeria_tag-5-3 +contig_1 . CDS 300 390 . . . ID=Cari_the_Listeria_tag-5-4;locus_tag=Cari_the_Listeria_tag-5-4 +contig_1 . CDS 400 490 . . . ID=Cari_the_Listeria_tag-5-5;locus_tag=Cari_the_Listeria_tag-5-5 +##FASTA +>contigdiff --git a/unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/Christina_the_Streptococcus.gff b/unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/Christina_the_Streptococcus.gff new file mode 100644 index 0000000..ebad391 --- /dev/null +++ b/unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/Christina_the_Streptococcus.gff @@ -0,0 +1,8 @@ +contig_1 . CDS 1 90 . . . ID=Christina_the_Streptococcus_tag-2-1;locus_tag=Christina_the_Streptococcus_tag-2-1 +contig_1 . CDS 100 190 . . . ID=Christina_the_Streptococcus_tag-2-2;locus_tag=Christina_the_Streptococcus_tag-2-2 +contig_1 . CDS 200 290 . . . ID=Christina_the_Streptococcus_tag-2-3;locus_tag=Christina_the_Streptococcus_tag-2-3 +contig_1 . CDS 300 390 . . . ID=Christina_the_Streptococcus_tag-2-4;locus_tag=Christina_the_Streptococcus_tag-2-4 +contig_1 . CDS 400 490 . . . 
ID=Christina_the_Streptococcus_tag-2-5;locus_tag=Christina_the_Streptococcus_tag-2-5 +##FASTA +>contigdiff --git a/unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/Dina_the_Shigella.gff b/unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/Dina_the_Shigella.gff new file mode 100644 index 0000000..7b29dda --- /dev/null +++ b/unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/Dina_the_Shigella.gff @@ -0,0 +1,8 @@ +contig_1 . CDS 1 90 . . . ID=Dina_the_Shigella_tag-8-1;locus_tag=Dina_the_Shigella_tag-8-1 +contig_1 . CDS 100 190 . . . ID=Dina_the_Shigella_tag-8-2;locus_tag=Dina_the_Shigella_tag-8-2 +contig_1 . CDS 200 290 . . . ID=Dina_the_Shigella_tag-8-3;locus_tag=Dina_the_Shigella_tag-8-3 +contig_1 . CDS 300 390 . . . ID=Dina_the_Shigella_tag-8-4;locus_tag=Dina_the_Shigella_tag-8-4 +contig_1 . CDS 400 490 . . . ID=Dina_the_Shigella_tag-8-5;locus_tag=Dina_the_Shigella_tag-8-5 +##FASTA +>contigo newline at end of file diff --git a/unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/Lilly_the_Shigella.gff b/unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/Lilly_the_Shigella.gff new file mode 100644 index 0000000..3c17dd1 --- /dev/null +++ b/unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/Lilly_the_Shigella.gff @@ -0,0 +1,7 @@ +contig_1 . CDS 1 90 . . . ID=Lilly_the_Shigella_tag-10-1;locus_tag=Lilly_the_Shigella_tag-10-1 +contig_1 . CDS 100 190 . . . ID=Lilly_the_Shigella_tag-10-2;locus_tag=Lilly_the_Shigella_tag-10-2 +contig_1 . CDS 200 290 . . . ID=Lilly_the_Shigella_tag-10-5;locus_tag=Lilly_the_Shigella_tag-10-5 +contig_1 . CDS 300 390 . . . 
ID=Lilly_the_Shigella_tag-10-6;locus_tag=Lilly_the_Shigella_tag-10-6 +##FASTA +>contigdiff --git a/unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/Silas_the_Legionella.gff b/unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/Silas_the_Legionella.gff new file mode 100644 index 0000000..5ee9f6a --- /dev/null +++ b/unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/Silas_the_Legionella.gff @@ -0,0 +1,8 @@ +contig_1 . CDS 1 90 . . . ID=Silas_the_Legionella_tag-9-1;locus_tag=Silas_the_Legionella_tag-9-1 +contig_1 . CDS 100 190 . . . ID=Silas_the_Legionella_tag-9-2;locus_tag=Silas_the_Legionella_tag-9-2 +contig_1 . CDS 200 290 . . . ID=Silas_the_Legionella_tag-9-3;locus_tag=Silas_the_Legionella_tag-9-3 +contig_1 . CDS 300 390 . . . ID=Silas_the_Legionella_tag-9-4;locus_tag=Silas_the_Legionella_tag-9-4 +contig_1 . CDS 400 490 . . . ID=Silas_the_Legionella_tag-9-5;locus_tag=Silas_the_Legionella_tag-9-5 +##FASTA +>contigdiff --git a/unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/Silas_the_Salmonella.gff b/unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/Silas_the_Salmonella.gff new file mode 100644 index 0000000..9a3ece9 --- /dev/null +++ b/unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/Silas_the_Salmonella.gff @@ -0,0 +1,12 @@ +contig_1 . CDS 1 90 . . . ID=Silas_the_Salmonella_tag-1-1;locus_tag=Silas_the_Salmonella_tag-1-1 +contig_1 . CDS 100 190 . . . ID=Silas_the_Salmonella_tag-1-2.1;locus_tag=Silas_the_Salmonella_tag-1-2.1 +contig_1 . CDS 200 290 . . . ID=Silas_the_Salmonella_tag-1-2.2;locus_tag=Silas_the_Salmonella_tag-1-2.2 +contig_1 . CDS 300 390 . . . ID=Silas_the_Salmonella_tag-1-3;locus_tag=Silas_the_Salmonella_tag-1-3 +contig_1 . CDS 400 490 . . . ID=Silas_the_Salmonella_tag-1-4.1;locus_tag=Silas_the_Salmonella_tag-1-4.1 +contig_1 . CDS 500 590 . . . ID=Silas_the_Salmonella_tag-1-4.2;locus_tag=Silas_the_Salmonella_tag-1-4.2 +contig_1 . CDS 600 690 . . . 
ID=Silas_the_Salmonella_tag-1-5.1;locus_tag=Silas_the_Salmonella_tag-1-5.1 +contig_1 . CDS 700 790 . . . ID=Silas_the_Salmonella_tag-1.7;locus_tag=Silas_the_Salmonella_tag-1.7 +contig_1 . CDS 800 890 . . . ID=Silas_the_Salmonella_tag-1-5.2;locus_tag=Silas_the_Salmonella_tag-1-5.2 +##FASTA +>contigdiff --git a/unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/Zion_the_Streptococcus.gff b/unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/Zion_the_Streptococcus.gff new file mode 100644 index 0000000..36aa721 --- /dev/null +++ b/unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/Zion_the_Streptococcus.gff @@ -0,0 +1,8 @@ +contig_1 . CDS 1 90 . . . ID=Zion_the_Streptococcus_tag-7-1;locus_tag=Zion_the_Streptococcus_tag-7-1 +contig_1 . CDS 100 190 . . . ID=Zion_the_Streptococcus_tag-7-2;locus_tag=Zion_the_Streptococcus_tag-7-2 +contig_1 . CDS 200 290 . . . ID=Zion_the_Streptococcus_tag-7-3;locus_tag=Zion_the_Streptococcus_tag-7-3 +contig_1 . CDS 300 390 . . . ID=Zion_the_Streptococcus_tag-7-4;locus_tag=Zion_the_Streptococcus_tag-7-4 +contig_1 . CDS 400 490 . . . 
ID=Zion_the_Streptococcus_tag-7-5;locus_tag=Zion_the_Streptococcus_tag-7-5 +##FASTA +>contigdiff --git a/unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/gene_presence_absence.csv b/unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/gene_presence_absence.csv new file mode 100644 index 0000000..43cb525 --- /dev/null +++ b/unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/gene_presence_absence.csv @@ -0,0 +1,8 @@ +"Gene","Non.unique.Gene.name","Annotation","No..isolates","No..sequences","Avg.sequences.per.isolate","Genome.Fragment","Order.within.Fragment","Accessory.Fragment","Accessory.Order.with.Fragment","QC","Min.group.size.nuc","Max.group.size.nuc","Avg.group.size.nuc","Silas_the_Salmonella","Christina_the_Streptococcus","Ajwa_the_Shigella","Ajwa_the_Legionella","Cari_the_Listeria","Aman_the_Streptococcus","Zion_the_Streptococcus","Dina_the_Shigella","Silas_the_Legionella","Lilly_the_Shigella" +"A",,,"10","10","1.0",,,,,,,,,"Silas_the_Salmonella_tag-1-1","Christina_the_Streptococcus_tag-2-1","Ajwa_the_Shigella_tag-3-1","Ajwa_the_Legionella_tag-4-1","Cari_the_Listeria_tag-5-1","Aman_the_Streptococcus_tag-6-1","Zion_the_Streptococcus_tag-7-1","Dina_the_Shigella_tag-8-1","Silas_the_Legionella_tag-9-1","Lilly_the_Shigella_tag-10-1" +"B",,,"10","11","1.2",,,,,,,,,"Silas_the_Salmonella_tag-1-2.1;Silas_the_Salmonella_tag-1-2.2","Christina_the_Streptococcus_tag-2-2","Ajwa_the_Shigella_tag-3-2","Ajwa_the_Legionella_tag-4-2","Cari_the_Listeria_tag-5-2","Aman_the_Streptococcus_tag-6-2","Zion_the_Streptococcus_tag-7-2","Dina_the_Shigella_tag-8-2","Silas_the_Legionella_tag-9-2","Lilly_the_Shigella_tag-10-2" +"C",,,"9","9","1.0",,,,,,,,,"Silas_the_Salmonella_tag-1-3","Christina_the_Streptococcus_tag-2-3","Ajwa_the_Shigella_tag-3-3","Ajwa_the_Legionella_tag-4-3","Cari_the_Listeria_tag-5-3","Aman_the_Streptococcus_tag-6-3","Zion_the_Streptococcus_tag-7-3","Dina_the_Shigella_tag-8-3","Silas_the_Legionella_tag-9-3","" 
+"D",,,"9","10","1.1",,,,,,,,,"Silas_the_Salmonella_tag-1-4.1;Silas_the_Salmonella_tag-1-4.2","Christina_the_Streptococcus_tag-2-4","Ajwa_the_Shigella_tag-3-4","Ajwa_the_Legionella_tag-4-4","Cari_the_Listeria_tag-5-4","Aman_the_Streptococcus_tag-6-4","Zion_the_Streptococcus_tag-7-4","Dina_the_Shigella_tag-8-4","Silas_the_Legionella_tag-9-4","" +"E",,,"10","11","1.2",,,,,,,,,"Silas_the_Salmonella_tag-1-5.1;Silas_the_Salmonella_tag-1-5.2","Christina_the_Streptococcus_tag-2-5","Ajwa_the_Shigella_tag-3-5","Ajwa_the_Legionella_tag-4-5","Cari_the_Listeria_tag-5-5","Aman_the_Streptococcus_tag-6-5","Zion_the_Streptococcus_tag-7-5","Dina_the_Shigella_tag-8-5","Silas_the_Legionella_tag-9-5","Lilly_the_Shigella_tag-10-5" +"F",,,"1","1","1.0",,,,,,,,,"","","","","","","","","","Lilly_the_Shigella_tag-10-6" +"G",,,"1","1","1.0",,,,,,,,,"Silas_the_Salmonella_tag-1-7","","","","","","","","","" diff --git a/unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/gene_presence_absence_roary.csv b/unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/gene_presence_absence_roary.csv new file mode 100644 index 0000000..92ba2d0 --- /dev/null +++ b/unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/gene_presence_absence_roary.csv @@ -0,0 +1,8 @@ +Gene,Non.unique.Gene.name,Annotation,No..isolates,No..sequences,Avg.sequences.per.isolate,Genome.Fragment,Order.within.Fragment,Accessory.Fragment,Accessory.Order.with.Fragment,QC,Min.group.size.nuc,Max.group.size.nuc,Avg.group.size.nuc,Silas_the_Salmonella,Christina_the_Streptococcus,Ajwa_the_Shigella,Ajwa_the_Legionella,Cari_the_Listeria,Aman_the_Streptococcus,Zion_the_Streptococcus,Dina_the_Shigella,Silas_the_Legionella,Lilly_the_Shigella 
+A,,,10,10,1,,,,,,,,,Silas_the_Salmonella_tag-1-1,Christina_the_Streptococcus_tag-2-1,Ajwa_the_Shigella_tag-3-1,Ajwa_the_Legionella_tag-4-1,Cari_the_Listeria_tag-5-1,Aman_the_Streptococcus_tag-6-1,Zion_the_Streptococcus_tag-7-1,Dina_the_Shigella_tag-8-1,Silas_the_Legionella_tag-9-1,Lilly_the_Shigella_tag-10-1 +B,,,10,11,1.2,,,,,,,,,Silas_the_Salmonella_tag-1-2.1;Silas_the_Salmonella_tag-1-2.2,Christina_the_Streptococcus_tag-2-2,Ajwa_the_Shigella_tag-3-2,Ajwa_the_Legionella_tag-4-2,Cari_the_Listeria_tag-5-2,Aman_the_Streptococcus_tag-6-2,Zion_the_Streptococcus_tag-7-2,Dina_the_Shigella_tag-8-2,Silas_the_Legionella_tag-9-2,Lilly_the_Shigella_tag-10-2 +C,,,9,9,1,,,,,,,,,Silas_the_Salmonella_tag-1-3,Christina_the_Streptococcus_tag-2-3,Ajwa_the_Shigella_tag-3-3,Ajwa_the_Legionella_tag-4-3,Cari_the_Listeria_tag-5-3,Aman_the_Streptococcus_tag-6-3,Zion_the_Streptococcus_tag-7-3,Dina_the_Shigella_tag-8-3,Silas_the_Legionella_tag-9-3, +D,,,9,10,1.1,,,,,,,,,Silas_the_Salmonella_tag-1-4.1;Silas_the_Salmonella_tag-1-4.2,Christina_the_Streptococcus_tag-2-4,Ajwa_the_Shigella_tag-3-4,Ajwa_the_Legionella_tag-4-4,Cari_the_Listeria_tag-5-4,Aman_the_Streptococcus_tag-6-4,Zion_the_Streptococcus_tag-7-4,Dina_the_Shigella_tag-8-4,Silas_the_Legionella_tag-9-4, +E,,,10,11,1.2,,,,,,,,,Silas_the_Salmonella_tag-1-5.1;Silas_the_Salmonella_tag-1-5.2,Christina_the_Streptococcus_tag-2-5,Ajwa_the_Shigella_tag-3-5,Ajwa_the_Legionella_tag-4-5,Cari_the_Listeria_tag-5-5,Aman_the_Streptococcus_tag-6-5,Zion_the_Streptococcus_tag-7-5,Dina_the_Shigella_tag-8-5,Silas_the_Legionella_tag-9-5,Lilly_the_Shigella_tag-10-5 +F,,,1,1,1,,,,,,,,,,,,,,,,,,Lilly_the_Shigella_tag-10-6 +G,,,1,1,1,,,,,,,,,Silas_the_Salmonella_tag-1-7,,,,,,,,, \ No newline at end of file diff --git a/unit_tests/unit_test_data/test_tmp_folder/tmp_file_in_tmp_folder b/unit_tests/unit_test_data/test_tmp_folder/tmp_file_in_tmp_folder new file mode 100644 index 0000000..e69de29 From 53b04e25a16645264db45f35794a3f6a09c8d0bf Mon Sep 17 00:00:00 
2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Thu, 30 Dec 2021 12:24:29 +1100 Subject: [PATCH 012/135] Add doc-strings for the funtions parsing the gene pres/abs file --- Corekaburra/parse_gene_presence_absence.py | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/Corekaburra/parse_gene_presence_absence.py b/Corekaburra/parse_gene_presence_absence.py index dab5ea0..70875a3 100644 --- a/Corekaburra/parse_gene_presence_absence.py +++ b/Corekaburra/parse_gene_presence_absence.py @@ -24,6 +24,13 @@ def add_gene_to_dict(main_dict, gene, pan_gene_name, genome): def check_fragmented_gene(fragments_in_line, input_gffs, tmp_folder_path): + """ + Function that check for that placement of fragmented gene parts, to determine if they are neighbouring or have some genomic feature between them + :param fragments_in_line: List of genes that are found to be fragmented, one composite of fragments for each index + :param input_gffs: A list of file-paths to the gff files given as input + :param tmp_folder_path: A file-path to the temporary folder of the Corekaburra run + :return: A List of booleans indicating if a fragments has nothing in between fragments (True) or not (False) + """ return_list = [] for fragment in fragments_in_line: # split the two fragments @@ -79,10 +86,17 @@ def check_fragmented_gene(fragments_in_line, input_gffs, tmp_folder_path): def read_gene_presence_absence(pres_abs_file, core_gene_presence, low_freq_gene, source_program, input_gffs, tmp_folder_path, verbose=True): - """Function that pass a Roary style gene presence/absence file. - Returns directories of core and low frequency genes, and a directory of pan genome clusters and their annotation""" - - # file = os.path.join("", pres_abs_file) + """ + Function that pass a Roary style gene presence/absence file. 
+ :param pres_abs_file: File path to the gene presence/absence file identified + :param core_gene_presence: The ratio of genomes in which a gene must present, to be seen as a core gene + :param low_freq_gene: The ratio of genomes in which a gene must not surpass, to be seen as a low-frequency gene + :param source_program: The program from which the pan-genome was produced + :param input_gffs: A list of file-paths to the gff files given as input + :param tmp_folder_path: A file-path to the temporary folder of the Corekaburra run + :param verbose: Indeicater on verbosety level # TODO - Likely change to logger! + :return: Directories of directories of core and low frequency genes, and a directory of pan genome clusters and their annotation. + """ # Open the presence/absense file to index gene into core, accessory, or low-frequency genes with open(pres_abs_file, 'r', newline='', ) as gene_presence_absence: From 11211ef2b94e9c8248f7bc5e8bbed94acd5f9431 Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Mon, 3 Jan 2022 11:40:00 +1100 Subject: [PATCH 013/135] Add in the functions for parsing gff files along with tests for these functions --- Corekaburra/__main__.py | 55 +- .../gff_parser.py | 422 ++-- .../merge_dicts.py | 0 Corekaburra/parse_gene_presence_absence.py | 5 +- unit_tests/Corekaburra_test.py | 1859 +++++++++++++++++ unit_tests/unit_test_data/.DS_Store | Bin 6148 -> 0 bytes .../Lilly_the_Shigella.gff | 7 + .../Silas_the_Legionella.gff | 8 + .../Silas_the_Salmonella.gff | 12 + .../Zion_the_Streptococcus.gff | 8 + .../multi_contig_unwrapped.txt | 6 + .../multi_contig_wrapped.txt | 33 + .../single_contig_unwrapped.txt | 4 + .../single_contig_wrapped.txt | 22 + .../TestPangenomeSourceProgram/.DS_Store | Bin 8196 -> 0 bytes .../Silas_the_Salmonella.gff | 12 + .../Silas_the_Salmonella_corrected.gff | 13 + .../test_double_chromosome.gff | 5 + .../test_single_chromosome.gff | 3 + .../test_triple_chromosome.gff | 7 + 20 files changed, 
2259 insertions(+), 222 deletions(-) rename {Code_to_transfer => Corekaburra}/gff_parser.py (59%) rename {Code_to_transfer => Corekaburra}/merge_dicts.py (100%) delete mode 100644 unit_tests/unit_test_data/.DS_Store create mode 100644 unit_tests/unit_test_data/TestCheckingFragmentedGenes/Lilly_the_Shigella.gff create mode 100644 unit_tests/unit_test_data/TestCheckingFragmentedGenes/Silas_the_Legionella.gff create mode 100644 unit_tests/unit_test_data/TestCheckingFragmentedGenes/Silas_the_Salmonella.gff create mode 100644 unit_tests/unit_test_data/TestCheckingFragmentedGenes/Zion_the_Streptococcus.gff create mode 100644 unit_tests/unit_test_data/TestGetContigLenth/multi_contig_unwrapped.txt create mode 100644 unit_tests/unit_test_data/TestGetContigLenth/multi_contig_wrapped.txt create mode 100644 unit_tests/unit_test_data/TestGetContigLenth/single_contig_unwrapped.txt create mode 100644 unit_tests/unit_test_data/TestGetContigLenth/single_contig_wrapped.txt delete mode 100644 unit_tests/unit_test_data/TestPangenomeSourceProgram/.DS_Store create mode 100644 unit_tests/unit_test_data/TestParsingGffFile/Silas_the_Salmonella.gff create mode 100644 unit_tests/unit_test_data/TestParsingGffFile/Silas_the_Salmonella_corrected.gff create mode 100644 unit_tests/unit_test_data/TestSegmentingMockGffs/test_double_chromosome.gff create mode 100644 unit_tests/unit_test_data/TestSegmentingMockGffs/test_single_chromosome.gff create mode 100644 unit_tests/unit_test_data/TestSegmentingMockGffs/test_triple_chromosome.gff diff --git a/Corekaburra/__main__.py b/Corekaburra/__main__.py index 8df32b6..ee15dff 100644 --- a/Corekaburra/__main__.py +++ b/Corekaburra/__main__.py @@ -13,6 +13,7 @@ import os import logging import time +import concurrent.futures try: from Corekaburra.commandline_interface import get_commandline_arguments @@ -34,6 +35,11 @@ except ModuleNotFoundError: from parse_gene_presence_absence import read_gene_presence_absence +try: + from Corekaburra.gff_parser import 
segment_genome_content +except ModuleNotFoundError: + from gff_parser import segment_genome_content + from argparse import ArgumentParser from math import floor import sys @@ -115,6 +121,7 @@ def main(): args = get_commandline_arguments(sys.argv[1:]) # TODO - Add in function(s) that will check all files to not be empty. - Andrew? + # TODO - Make Corekaburra take gzipped inputs # Check the presence of provided complete genomes among input GFFs if args.comp_genomes is not None: @@ -163,6 +170,52 @@ def main(): args.input_gffs, tmp_folder_path) -# If this script is run from the command line then call the main function. + # TODO - Add this into the multiprocessing loop to not doubble files + # TODO - Add a user command to keep and discard the corrected files (But still using them - Make mutually exclusive with -a option) + # Add in the refound genes into the gff files and print the corrected GFF files. + # if source_program == "Panaroo" and args.annotate: + # time_start = time.time() + # print(f"\n----------Adding in refound annotations for gff files---------") + # + # corrected_folder = correct_gffs(args.input_gffs, gene_data_path, args.output_path, attribute_dict, + # temp_folder_path) + # + # args.input_gffs = [join(corrected_folder, file) for file in listdir(corrected_folder) if '.gff' in file] + # if not args.quiet: + # time_calculator(time_start, time.time(), "add refound annotations to gff files") + + # Loop over all gffs and extract info from each of them. 
+ time_start = time.time() + # Initialise dictionaries to contain results from all gff files + core_neighbour_pairs = {} + core_neighbour_distance = {} + core_neighbour_accessory_count = {} + core_neighbour_low_freq = {} + master_info_total = {} + non_core_contig_info = {} + + with concurrent.futures.ProcessPoolExecutor(max_workers=15) as executor: # TODO - change the max workers to the user specified number + print(f"\n------Start core region identification of given gff files-----") + print(f'{len(args.input_gffs)} GFF files to process') + + results = [executor.submit(segment_genome_content, gff, core_dict, low_freq_dict, acc_gene_dict, i, comp_genomes) + for i, gff in enumerate(args.input_gffs)] + + for output in concurrent.futures.as_completed(results): + # Split the outputs + core_pairs, distance, acc_count, \ + low_freq, master_info_return, \ + core_less_contigs_return = output.result() + + # Merge results into single/master dictionaries + core_neighbour_pairs = merge_dicts_counts(core_neighbour_pairs, core_pairs) + core_neighbour_distance = merge_dicts_lists(core_neighbour_distance, distance) + core_neighbour_accessory_count = merge_dicts_lists(core_neighbour_accessory_count, acc_count) + core_neighbour_low_freq = merge_dicts_lists(core_neighbour_low_freq, low_freq) + master_info_total.update(master_info_return) + non_core_contig_info.update(core_less_contigs_return) + + time_calculator(time_start, time.time(), "searching gff files for core genes") + if __name__ == '__main__': main() diff --git a/Code_to_transfer/gff_parser.py b/Corekaburra/gff_parser.py similarity index 59% rename from Code_to_transfer/gff_parser.py rename to Corekaburra/gff_parser.py index 9899981..5f3b89e 100644 --- a/Code_to_transfer/gff_parser.py +++ b/Corekaburra/gff_parser.py @@ -3,7 +3,11 @@ def parse_gff(input_file): - """ Read a gff file and return it as a generator object that return all line containing CDS """ + """ + Read a gff file and return it as a generator object that return 
all line containing CDS + :param input_file: File-path to a given gff file to be processed + :return: Generator object returning CDS from a gff file + """ with open(input_file, 'r') as gff_file: for line in gff_file: if "##FASTA" in line: @@ -19,27 +23,18 @@ def parse_gff(input_file): else: gene_id = line[8][line[8].find('ID'):line[8].find(';')] - # Remove equal sign from id and add as identifyer for the returned gff line + # Remove equal sign from id and add as identifier for the returned gff line gene_id = gene_id[gene_id.find('=') + 1:] line[8] = gene_id yield line - -def get_genome_size_from_gff(input_file): #TODO - it should be possible to remove this and just use the directory given by get_contig_lengths function - """ Get the genome size from a GFF3 file by counting characters in fasta appendix""" - genome_size = 0 - fasta_reached = False - with open(input_file, 'r', ) as gff_file: - for line in gff_file: - if fasta_reached and '>' not in line: - genome_size += len(line.rstrip()) - if "##FASTA" in line: - fasta_reached = True - - return genome_size - - def get_contig_lengths(input_file): + """ + Function that takes an input gff file path and records the length of each contig in the file + :param input_file: File path for a given gff file + :return: directory with key being the contig name (before first white spae) + and value the size of the contig in base pairs + """ fasta_reached = False contig_size = 0 contig_size_dir = {} @@ -75,22 +70,36 @@ def get_contig_lengths(input_file): def record_core_core_region(core_genes, gff_name, gff_line, contig_end, previous_core_gene_id, previous_core_gene_end_coor, acc_genes_in_region, low_freq_genes_in_region, core_gene_pair_distance, accessory_gene_content, - low_freq_gene_content, core_gene_pairs, num_acc_genes_in_region, master_info): - - """ Function to record information about a core gene pair or a core gene and a sequence break, - along with accessory information between the two features""" - - # Examine if a 
fragmented core genome is given, if then skip the recording, reset the accessory counters and return the new end coordinate of the second fragment - if gff_line is not None and previous_core_gene_id is not "Sequence_break": + low_freq_gene_content, core_gene_pairs, master_info): + """ + Function to record information about a core gene pair or a core gene and a sequence break, + along with accessory information between the two features + :param core_genes: Dict of core genes passed to genomes and the pan-genome clusters. + :param gff_name: Name of the gff file currently being examined + :param gff_line: List for the gene currently being recorded. (Can be None, when previous gene is next to sequence break) + :param contig_end: Length of the contig in question, used to calculate the distance from last gene to end of contig + :param previous_core_gene_id: ID of the previous gene recorded in pair, can be Sequence_break + :param previous_core_gene_end_coor: End coordinate of the previously recorded genome (Can be None when new contig is initiated) + :param acc_genes_in_region: List of recorded accessory-frequency genes in the region being recorded + :param low_freq_genes_in_region: List of recorded low-frequency genes in the region being recorded + :param core_gene_pair_distance: Dict of distances between core pairs recorded. 
key being the pair and value the distance in base-pairs + :param accessory_gene_content: Dict of accessory frequency genes and their mapping to genomes + :param low_freq_gene_content: Dict of low frequency genes and their mapping to genomes + :param core_gene_pairs: List of core pairs recorded + :param master_info: Large dict holding key for each pair recorded with a list of info around the pair as value + :return: A tuple of dictionaries and lists, most of which are also given as input - See descriptions above + """ + + # Check that a line from gff is provided and previous gene is not a sequence break + if gff_line is not None and previous_core_gene_id != "Sequence_break": + # Check if core gene is fragmented if core_genes[gff_name][previous_core_gene_id] == core_genes[gff_name][gff_line[8]]: previous_core_gene_id = gff_line[8] previous_core_gene_end_coor = int(gff_line[4]) acc_genes_in_region = [] low_freq_genes_in_region = [] return (previous_core_gene_id, previous_core_gene_end_coor, acc_genes_in_region, low_freq_genes_in_region, - core_gene_pair_distance, accessory_gene_content, low_freq_gene_content, core_gene_pairs, - num_acc_genes_in_region, master_info) - + core_gene_pair_distance, accessory_gene_content, low_freq_gene_content, core_gene_pairs, master_info) # Set core cluster names # If no line from gff is given there is a sequence break, @@ -99,6 +108,7 @@ def record_core_core_region(core_genes, gff_name, gff_line, contig_end, previous current_core_gene_cluster = core_genes[gff_name][gff_line[8]] try: previous_core_gene_cluster = core_genes[gff_name][previous_core_gene_id] + # Catch is previous gene was a sequence break. 
except KeyError: previous_core_gene_cluster = previous_core_gene_id @@ -115,7 +125,7 @@ def record_core_core_region(core_genes, gff_name, gff_line, contig_end, previous # Set core neighbour distance # Check if measuring between two genes on same contig and not measuring from a sequence break - if gff_line is not None and previous_core_gene_cluster is not "Sequence_break": + if gff_line is not None and previous_core_gene_cluster != "Sequence_break": core_core_distance = int(gff_line[3]) - previous_core_gene_end_coor - 1 else: # Check if measuring between sequence break and first core gene on contig, if then set start coordinate to zero @@ -132,11 +142,12 @@ def record_core_core_region(core_genes, gff_name, gff_line, contig_end, previous # Add core neighbour distance core_gene_pair_distance[core_gene_neighbours_str] = core_core_distance - # Add number of accessory genes in region - #num_acc_genes_in_region[core_gene_neighbours_str] = len(acc_genes_in_region) + len(low_freq_genes_in_region) # Add counts and annotation for accessory and low frequency genes - accessory_gene_content[core_gene_neighbours_str] = acc_genes_in_region.copy() # Copies has been added here + acc_genes_in_region = list(set(acc_genes_in_region)) + low_freq_genes_in_region = list(set(low_freq_genes_in_region)) + + accessory_gene_content[core_gene_neighbours_str] = acc_genes_in_region.copy() low_freq_gene_content[core_gene_neighbours_str] = low_freq_genes_in_region.copy() # Add info to master dict @@ -162,8 +173,7 @@ def record_core_core_region(core_genes, gff_name, gff_line, contig_end, previous low_freq_genes_in_region = [] return (previous_core_gene_id, previous_core_gene_end_coor, acc_genes_in_region, low_freq_genes_in_region, - core_gene_pair_distance, accessory_gene_content, low_freq_gene_content, core_gene_pairs, - num_acc_genes_in_region, master_info) + core_gene_pair_distance, accessory_gene_content, low_freq_gene_content, core_gene_pairs, master_info) def 
connect_first_n_last_gene_on_contig(core_genes, gff_name, previous_core_gene_id, previous_core_gene_end_coor, @@ -171,26 +181,40 @@ def connect_first_n_last_gene_on_contig(core_genes, gff_name, previous_core_gene low_freq_genes_in_region, first_core_low_freq_genes, contig_size, core_gene_pairs, core_gene_pair_distance, accessory_gene_content, low_freq_gene_content, master_info): - - # TODO - Take first and last gene on chromosome and record them as connected. - - if previous_core_gene_id == 'Complete_genome_end_fail': - print('Complete_genome_end_fail given!') - previous_core_gene_id = first_core_gene_gff_line[8] - # raise ValueError(f'Complete_genome_end_fail was given as input!\n' - # f'gff: {gff_name}\n' - # f'last_core_gene_cluster: {core_genes[gff_name][first_core_gene_gff_line[8]]}') - + """ + Function to record the connection between the first and the last genome on a closed contig + + :param core_genes: A dict of dicts mapping genomes to gene IDs to pan-genome clusters of core genes. + :param gff_name: Name of the gff file currently being examined + :param previous_core_gene_id: ID of the previous gene recorded in pair, can be Sequence_break. + :param previous_core_gene_end_coor: End coordinate of the previously recorded genome. + :param first_core_gene_gff_line: List for the gene first encountered on the contig. + :param acc_genes_in_region: List of recorded accessory-frequency genes in the region after last core genes + :param first_core_accessory_content: List of recorded accessory-frequency genes in the region before first core genes + :param low_freq_genes_in_region: List of recorded low-frequency genes in the region after last core genes + :param first_core_low_freq_genes: List of recorded low-frequency genes in the region before first core genes + :param contig_size: Size of the contig currently being looked at. + :param core_gene_pairs: List of core pairs recorded + :param core_gene_pair_distance: Dict of distances between core pairs recorded. 
key being the pair and value the distance in base-pairs + :param accessory_gene_content: Dict of accessory frequency genes and their mapping to genomes + :param low_freq_gene_content: Dict of low frequency genes and their mapping to genomes + :param master_info: Large dict holding key for each pair recorded with a list of info around the pair as value + + :return previous_core_gene_id: String of the last core genes ID/locus_tag + :return previous_core_gene_end_coor: Int of the end coordinate for the latest core gene + :return acc_genes_in_region: Empty list to store accessory genes + :return low_freq_genes_in_region: Empty list to store low-frequency genes + :return core_gene_pairs: List of core gene pairs recorded + :return core_gene_pair_distance: A dict of distances between a specific pair of core genes. + :return accessory_gene_content: A dict of the accessory genes found between a pair of core genes. + :return low_freq_gene_content: A dict of the low-frequency genes found between a pair of core genes. + :return master_info: A dict of multiple pieces of info for each core gene pair. + """ last_core_gene_cluster = core_genes[gff_name][previous_core_gene_id] first_core_gene_cluster = core_genes[gff_name][first_core_gene_gff_line[8]] - # TODO - Handle core gene that is on contig alone? if first_core_gene_cluster == last_core_gene_cluster: - print('Same gene') - # raise ValueError(f'First and last core gene is the same!\n' - # f'gff: {gff_name}\n' - # f'first_core_gene_cluster: {first_core_gene_cluster}\n' - # f'last_core_gene_cluster: {last_core_gene_cluster}') + print('Same gene') # TODO - Log this? report or what? 
# Add core neighbours core_gene_neighbours = sorted([last_core_gene_cluster, first_core_gene_cluster]) @@ -198,8 +222,7 @@ def connect_first_n_last_gene_on_contig(core_genes, gff_name, previous_core_gene core_gene_pairs.append(core_gene_neighbours_str) # Add core neighbour distance - core_core_distance = contig_size - previous_core_gene_end_coor \ - + int(first_core_gene_gff_line[3]) # TODO - make sure this -1 is correct + core_core_distance = contig_size - previous_core_gene_end_coor + int(first_core_gene_gff_line[3]) - 1 core_gene_pair_distance[core_gene_neighbours_str] = core_core_distance @@ -207,7 +230,10 @@ def connect_first_n_last_gene_on_contig(core_genes, gff_name, previous_core_gene last_first_accessory_content = acc_genes_in_region.copy() + first_core_accessory_content.copy() last_first_low_freq_count = low_freq_genes_in_region.copy() + first_core_low_freq_genes.copy() - accessory_gene_content[core_gene_neighbours_str] = last_first_accessory_content.copy() # Copies has been added here + last_first_accessory_content = list(set(last_first_accessory_content)) + last_first_low_freq_count = list(set(last_first_low_freq_count)) + + accessory_gene_content[core_gene_neighbours_str] = last_first_accessory_content.copy() low_freq_gene_content[core_gene_neighbours_str] = last_first_low_freq_count.copy() # Add to master info dict @@ -220,8 +246,8 @@ def connect_first_n_last_gene_on_contig(core_genes, gff_name, previous_core_gene last_first_accessory_content, last_first_low_freq_count] - previous_core_gene_id = 'Complete_genome_end_fail' - previous_core_gene_end_coor = int(first_core_gene_gff_line[4]) + previous_core_gene_id = "" + previous_core_gene_end_coor = int(first_core_gene_gff_line[4]) # TODO - should this be the end or just some random large number? 
acc_genes_in_region = [] low_freq_genes_in_region = [] @@ -229,33 +255,48 @@ def connect_first_n_last_gene_on_contig(core_genes, gff_name, previous_core_gene core_gene_pairs, core_gene_pair_distance, accessory_gene_content, low_freq_gene_content, master_info) +def record_coreless_contig(coreless_contigs, acc_genes_in_region, low_freq_genes_in_region, gff_name, contig_name): + acc_genes_in_region = list(set(acc_genes_in_region)) + low_freq_genes_in_region = list(set(low_freq_genes_in_region)) + if len(acc_genes_in_region) + len(low_freq_genes_in_region) > 0: + coreless_contigs[f'{gff_name}--{contig_name}'] = [acc_genes_in_region, low_freq_genes_in_region] + + return coreless_contigs + + def segment_gff_content(gff_generator, core_genes, low_freq_genes, gff_path, acc_genes, complete_genomes): - """ Function that takes a gff generator, core and low frequency genes and identify each core-core gene region + """ + Function that takes a gff generator, core, accessory, and low frequency genes and identify each core-core gene region counts the number of accessory genes in the region, records the number of low frequency genes in the region, - and records the distance from one core gene to the next""" + and records the distance from one core gene to the next. + :param gff_generator: A generator object providing each CDS line from a gff file to be segmented. + :param core_genes: A dict of dicts mapping genomes to gene IDs to pan-genome clusters of core genes. + :param low_freq_genes: Same structure as core_genes, but for low-frequency genes. + :param gff_path: List of file paths to gff files. + :param acc_genes: Same structure as core_genes, but for accessory genes. + :param complete_genomes: List of gff names that are to be handled as complete. + + :return core_gene_pairs: A list of core genes (or sequence breaks) that are found to be neighbouring each other in a given gff file. + :return core_gene_pair_distance: A dict of distances between a specific pair of core genes. 
+ :return accessory_gene_content: A dict of the accessory genes found between a pair of core genes. + :return low_freq_gene_content: A dict of the low-frequency genes found between a pair of core genes. + :return master_info: A dict of multiple pieces of info for each core gene pair (Gff file, core gene 1, core gene 2, distnace between them, genes between them, list of accesspry genes, list of low-frequency genes) + :return coreless_contigs: Dict of contigs found to not encode any core genes on them. The accessory and low-frequency genes are recorded. + """ # Initialize data structures to be returned - # List of sorted core gene neighbours separated by '--'. (Sorted as the orientation of contigs is sometimes random) core_gene_pairs = [] - # Dict of distances between core gene neighbours. Key is the core_gene_pairs, and values is the distance in basepairs from end to start for neighbouring genes. core_gene_pair_distance = {} - # Dict of number of accessory genes between core gene neighbours, Keys are core_gene_pairs and values are the number of accessory genes in the region - num_acc_genes_in_region = {} - # Dict of low fequency genes between core gene pairs. Key is value from core_gene_pairs, and value is the names of low frequency genes found in region low_freq_gene_content = {} - # Dict of lists with keys being the core_gene_pairs value with the value being the number of accessory genes found between the core genes accessory_gene_content = {} - # Dict containing master information - used to write comprehensive output files master_info = {} - # Dict to store info on accessory genes from contigs where no core gene is present. coreless_contigs = {} - # initiate variable that holds the first gene for file is genome is complete. - # How do you determine from a gff file if a genome is complete? 
- start_gene_cluster = False - # Examine if a complete genomes has been given + + # Examine if a complete genome has been given if complete_genomes is None: complete_genome = False else: - if os.path.basename(gff_path).replace('.gff', '').replace('_corrected', '') in complete_genomes: # TODO - Use remove instead of replace? + if os.path.basename(gff_path).replace('.gff', '').replace('_corrected', '') in complete_genomes: complete_genome = True else: complete_genome = False @@ -263,35 +304,32 @@ def segment_gff_content(gff_generator, core_genes, low_freq_genes, gff_path, acc # Get size of contigs for given gff contig_sizes = get_contig_lengths(gff_path) - # Split input path to gff to get genome name + # Split input path of gff to get genome name gff_name = gff_path.split('/')[-1] - gff_name = gff_name.split('.')[0] + gff_name = gff_name.replace('.gz', '').rsplit('.', 1)[0] if 'corrected' in gff_name: gff_name = gff_name.split('_corrected')[0] # Set that first core gene has not been found first_core_gene = True - # Set Initialise variable to be used globally + # Set Initialise variable for the ID of previous encountered core gene previous_core_gene_id = "" - # Set variable to determine of genome is complete or single contig, or if multi contigs are present - #single_contig = True # Set variable to get first contig first_contig = True - # Initialise the accessory gene counter and low frequency gene list + # Initialise the accessory and low-frequency gene lists low_freq_genes_in_region = [] acc_genes_in_region = [] # Go through each line of GFF file for line in gff_generator: - # Set first contig fofund in file + # Set first contig to fund in file if first_contig: previous_contig = line[0] first_contig = False - # Check if contig has changed - if then finish contig, if not examine next gene on contig gene + # Check if contig has changed - if then finish contig, if not examine next gene on contig if line[0] == previous_contig: - - # Check if gene is core - if then test 
if fist or not, else assume to be accessory gene + # Check if gene is core, if then test if it is the first, else assume to be accessory gene if line[8] in core_genes[gff_name]: # Check if core gene is the first observed in file - if then set information else record information @@ -309,8 +347,8 @@ def segment_gff_content(gff_generator, core_genes, low_freq_genes, gff_path, acc first_core_gene = False # Reset accessory and low frequency gene counters - acc_genes_in_region = [] low_freq_genes_in_region = [] + acc_genes_in_region = [] else: # Record core gene pair information @@ -322,12 +360,11 @@ def segment_gff_content(gff_generator, core_genes, low_freq_genes, gff_path, acc accessory_gene_content, low_freq_gene_content, core_gene_pairs, - num_acc_genes_in_region, master_info) = record_core_core_region(core_genes, gff_name, line, None, previous_core_gene_id, previous_core_gene_end_coor, acc_genes_in_region, low_freq_genes_in_region, core_gene_pair_distance, accessory_gene_content, low_freq_gene_content, - core_gene_pairs, num_acc_genes_in_region, master_info) + core_gene_pairs, master_info) else: # Check if accessory is low frequency - else just regular accessory @@ -336,17 +373,15 @@ def segment_gff_content(gff_generator, core_genes, low_freq_genes, gff_path, acc else: try: acc_genes_in_region.append(acc_genes[gff_name][line[8]]) - except KeyError: + except KeyError: # TODO - WHAT DOES THIS DO? - Likely search for fragment within composite, as fragments were previously storred in their composit strings. gene_key = [key for key in acc_genes[gff_name].keys() if line[8] in key] if len(gene_key) > 1: acc_genes_in_region.append(acc_genes[gff_name][gene_key][0]) else: - # Note that gff has multiple contigs - #single_contig = False # Check if there is a core gene on traversed contig or if a core gene is present on the first contig - # if then record it, if not record the accessory and low frequency genes found on contig and reset. 
- if previous_core_gene_id is not "Sequence_break" and previous_core_gene_id is not "": + if previous_core_gene_id != "Sequence_break" and previous_core_gene_id != "": if complete_genome: # TODO - Write a unit check for this good danm thing! - May be wrap in function to be used further down too, when last line in file has been read? *** (previous_core_gene_id, previous_core_gene_end_coor, @@ -368,38 +403,6 @@ def segment_gff_content(gff_generator, core_genes, low_freq_genes, gff_path, acc first_core_gene = True - # last_core_gene_cluster = core_genes[gff_name][previous_core_gene_id] - # first_core_gene_cluster = core_genes[gff_name][first_core_gene_gff_line[8]] - # - # # Add core neighbours - # core_gene_neighbours = sorted([last_core_gene_cluster, first_core_gene_cluster]) - # core_gene_neighbours_str = f'{core_gene_neighbours[0]}--{core_gene_neighbours[1]}' - # core_gene_pairs.append(core_gene_neighbours_str) - # - # # Add core neighbour distance - # core_core_distance = contig_sizes[first_core_gene_gff_line[0]] - previous_core_gene_end_coor \ - # + int(first_core_gene_gff_line[3]) # TODO - make sure this -1 is correct - # - # core_gene_pair_distance[core_gene_neighbours_str] = core_core_distance - # - # # Add accessory information from between last and first core gene - # last_first_accessory_content = acc_genes_in_region + first_core_accessory_content - # accessory_gene_content[core_gene_neighbours_str] = len(last_first_accessory_content) - # last_first_low_freq_count = low_freq_genes_in_region + first_core_low_freq_genes - # low_freq_gene_content[core_gene_neighbours_str] = last_first_low_freq_count - # - # # Add to master info dict - # master_info[f'{core_gene_neighbours_str}--{gff_name}'] = [gff_name, - # core_gene_neighbours[0], - # core_gene_neighbours[1], - # core_core_distance, - # len(last_first_accessory_content) + - # len(last_first_low_freq_count), - # last_first_accessory_content, - # last_first_low_freq_count] - - # TODO - check if genome is given 
as complete, if then record the fist and last gene as being neighbours and set first core gene to True to record a new first gene. - # Else add the core gene as being next to a seqeunce break else: # Record the core gene neighbouring a sequence break (previous_core_gene_id, @@ -410,69 +413,67 @@ def segment_gff_content(gff_generator, core_genes, low_freq_genes, gff_path, acc accessory_gene_content, low_freq_gene_content, core_gene_pairs, - num_acc_genes_in_region, - master_info) = record_core_core_region(core_genes, gff_name, None, contig_sizes[previous_contig], previous_core_gene_id, - previous_core_gene_end_coor, acc_genes_in_region, - low_freq_genes_in_region, core_gene_pair_distance, - accessory_gene_content, low_freq_gene_content, - core_gene_pairs, num_acc_genes_in_region, - master_info) - - # Check if first gene on contig is a core gene, if the record it. - if line[8] in core_genes[gff_name]: - previous_core_gene_id = "Sequence_break" - - # Get the starting position of the first core gene on contig to record the gene. - # Make it negative to fit the calculation of the distance between genes. 
Add one to further adjust - cur_core_gene_start = -int(line[3]) - - (previous_core_gene_id, - previous_core_gene_end_coor, - acc_genes_in_region, - low_freq_genes_in_region, - core_gene_pair_distance, - accessory_gene_content, - low_freq_gene_content, - core_gene_pairs, - num_acc_genes_in_region, - master_info) = record_core_core_region(core_genes, gff_name, line, 0, previous_core_gene_id, - cur_core_gene_start, acc_genes_in_region, - low_freq_genes_in_region, core_gene_pair_distance, - accessory_gene_content, low_freq_gene_content, - core_gene_pairs, num_acc_genes_in_region, - master_info) - - # Add as accessory - else: - # Check if accessory is low frequency - else just regular accessory - if line[8] in low_freq_genes[gff_name]: - low_freq_genes_in_region.append(low_freq_genes[gff_name][line[8]]) - else: - try: - acc_genes_in_region.append(acc_genes[gff_name][line[8]]) - except KeyError: - gene_key = [key for key in acc_genes[gff_name].keys() if line[8] in key] - if len(gene_key) > 1: - acc_genes_in_region.append(acc_genes[gff_name][gene_key][0]) - - # TDOD - Add an else that count the accessory/low frequency gene. 
- # Set new contig - previous_contig = line[0] - + master_info) = record_core_core_region(core_genes, gff_name, None, contig_sizes[previous_contig], + previous_core_gene_id, previous_core_gene_end_coor, + acc_genes_in_region, low_freq_genes_in_region, + core_gene_pair_distance, accessory_gene_content, + low_freq_gene_content, core_gene_pairs, master_info) else: # Record info on accessory genes on core-less contig, if any accessory genes are present - if len(acc_genes_in_region) + len(low_freq_genes_in_region) > 0: - coreless_contigs[f'{gff_name}--{previous_contig}'] = [acc_genes_in_region, low_freq_genes_in_region] + coreless_contigs = record_coreless_contig(coreless_contigs, acc_genes_in_region, + low_freq_genes_in_region, gff_name, previous_contig) + + # Reset accessory and low-frequency gene lists acc_genes_in_region = [] low_freq_genes_in_region = [] # Set new contig previous_contig = line[0] + # Check if first gene on new contig is core or not + + # Check if first gene on new contig is a core gene, if the record it. + if line[8] in core_genes[gff_name]: + previous_core_gene_id = "Sequence_break" + + # Get the starting position of the first core gene on contig to record the gene. + # Make it negative to fit the calculation of the distance between genes. # TODO - Do we need to add one to further adjust + cur_core_gene_start = -int(line[3]) + + (previous_core_gene_id, + previous_core_gene_end_coor, + acc_genes_in_region, + low_freq_genes_in_region, + core_gene_pair_distance, + accessory_gene_content, + low_freq_gene_content, + core_gene_pairs, + master_info) = record_core_core_region(core_genes, gff_name, line, 0, previous_core_gene_id, + cur_core_gene_start, acc_genes_in_region, + low_freq_genes_in_region, core_gene_pair_distance, + accessory_gene_content, low_freq_gene_content, + core_gene_pairs, master_info) # TODO - Why zero here? 
+ + # Add as accessory - if first gene is not core + else: + # Check if accessory is low frequency - else just regular accessory + if line[8] in low_freq_genes[gff_name]: + low_freq_genes_in_region.append(low_freq_genes[gff_name][line[8]]) + else: + try: + acc_genes_in_region.append(acc_genes[gff_name][line[8]]) + except KeyError: + gene_key = [key for key in acc_genes[gff_name].keys() if line[8] in key] + if len(gene_key) > 1: + acc_genes_in_region.append(acc_genes[gff_name][gene_key][0]) + + # Set new contig + previous_contig = line[0] + # Check if genome is complete or a single contig. If then add information for last and first core gene, if not # then add the first and last core gene as being neighbours to sequence breaks. if complete_genome: - if previous_core_gene_id is not 'Complete_genome_end_fail': + if previous_core_gene_id != "": (previous_core_gene_id, previous_core_gene_end_coor, acc_genes_in_region, @@ -492,42 +493,7 @@ def segment_gff_content(gff_generator, core_genes, low_freq_genes, gff_path, acc master_info) else: # Record info on accessory genes on core-less contig, if any accessory genes are present - if len(acc_genes_in_region) + len(low_freq_genes_in_region) > 0: - coreless_contigs[f'{gff_name}--{previous_contig}'] = [acc_genes_in_region, low_freq_genes_in_region] - - - # last_core_gene_cluster = core_genes[gff_name][previous_core_gene_id] - # first_core_gene_cluster = core_genes[gff_name][first_core_gene_gff_line[8]] - # - # # Add core neighbours - # core_gene_neighbours = sorted([last_core_gene_cluster, first_core_gene_cluster]) - # core_gene_neighbours_str = f'{core_gene_neighbours[0]}--{core_gene_neighbours[1]}' - # core_gene_pairs.append(core_gene_neighbours_str) - # - # # Add core neighbour distance - # core_core_distance = contig_sizes[first_core_gene_gff_line[0]] - previous_core_gene_end_coor \ - # + int(first_core_gene_gff_line[3]) # TODO - make sure this -1 is correct - # - # core_gene_pair_distance[core_gene_neighbours_str] = 
core_core_distance - # - # # Add accessory information from between last and first core gene - # last_first_accessory_content = acc_genes_in_region + first_core_accessory_content - # accessory_gene_content[core_gene_neighbours_str] = len(last_first_accessory_content) - # last_first_low_freq_count = low_freq_genes_in_region + first_core_low_freq_genes - # low_freq_gene_content[core_gene_neighbours_str] = last_first_low_freq_count - # - # # Add to master info dict - # master_info[f'{core_gene_neighbours_str}--{gff_name}'] = [gff_name, - # core_gene_neighbours[0], - # core_gene_neighbours[1], - # core_core_distance, - # len(last_first_accessory_content) + - # len(last_first_low_freq_count), - # last_first_accessory_content, - # last_first_low_freq_count] - - # Add first gene in file to list, to aid as start point for consensus core gene graph - # start_gene_cluster = first_core_gene_cluster + record_coreless_contig(coreless_contigs, acc_genes_in_region, low_freq_genes_in_region, gff_name, line[0]) else: # Add first core gene as being neighbour to a sequence break @@ -538,7 +504,7 @@ def segment_gff_content(gff_generator, core_genes, low_freq_genes, gff_path, acc core_gene_pair_distance, accessory_gene_content, low_freq_gene_content, core_gene_pairs, - num_acc_genes_in_region, master_info) = record_core_core_region(core_genes, gff_name, first_core_gene_gff_line, 0, + master_info) = record_core_core_region(core_genes, gff_name, first_core_gene_gff_line, 0, 'Sequence_break', 9999999999999999, first_core_accessory_content, @@ -547,11 +513,10 @@ def segment_gff_content(gff_generator, core_genes, low_freq_genes, gff_path, acc accessory_gene_content, low_freq_gene_content, core_gene_pairs, - num_acc_genes_in_region, master_info) # Add last core gene as being neighbour to a sequence break # if the last core gene has not been recorded already - if previous_core_gene_id is not "Sequence_break": + if previous_core_gene_id != "Sequence_break": (previous_core_gene_id, 
previous_core_gene_end_coor, acc_genes_in_region, @@ -559,7 +524,7 @@ def segment_gff_content(gff_generator, core_genes, low_freq_genes, gff_path, acc core_gene_pair_distance, accessory_gene_content, low_freq_gene_content, core_gene_pairs, - num_acc_genes_in_region, master_info) = record_core_core_region(core_genes, gff_name, None, contig_sizes[previous_contig], + master_info) = record_core_core_region(core_genes, gff_name, None, contig_sizes[previous_contig], previous_core_gene_id, previous_core_gene_end_coor, acc_genes_in_region, @@ -568,26 +533,43 @@ def segment_gff_content(gff_generator, core_genes, low_freq_genes, gff_path, acc accessory_gene_content, low_freq_gene_content, core_gene_pairs, - num_acc_genes_in_region, master_info) + else: + # Add a core-less contig if there has been accessory genes: + coreless_contigs = record_coreless_contig(coreless_contigs, acc_genes_in_region, low_freq_genes_in_region, gff_name, line[0]) return core_gene_pairs, core_gene_pair_distance, accessory_gene_content, \ - low_freq_gene_content, master_info, coreless_contigs, start_gene_cluster + low_freq_gene_content, master_info, coreless_contigs + + +def segment_genome_content(input_gff_file, core_genes, low_freq_genes, acc_gene_dict, i, complete_genomes): + """ + Single function segmenting the gff into core gene regions to be used for simple multi processing + :param input_gff_file: File-path to the given gff file to be segmented + :param core_genes: Dictionary over core genes + :param low_freq_genes: Dictionary over low-frequency genes + :param acc_gene_dict: Dictionary over accessory genes + :param i: The index of which this process is in loop + :param complete_genomes: Bool indicating if this genome should be considered as a complete genome + + :return input_gff_file: File path to the gff being searched + :return core_genes: Dict of core genes passed to genomes and the pan-genome clusters. + :return low_freq_genes: Same structure as core_genes, but for low-frequency genes. 
+ :return acc_gene_dict: Same structure as core_genes, but for accessory genes. + :return i: The index of the gff in the larger scheme of the analysis + :return complete_genomes: List of genomes given as complete by the user. + """ + if (i+1) % 25 == 0 or i == 0: + print(f"Determining core-core synteny for GFF file #{i+1}") # TODO - look what have been done for Magphi in recording progress! + # TODO - likely check if genome should be corrected at this point in the process. - Would require more inputs. -def segment_genome_content(input_file, core_genes, low_freq_genes, acc_gene_dict, i, complete_genomes): - """ Single function segmenting the gff into core gene regions to be used for simple multi processing""" - if (i+1) % 25 == 0 or i == 0: - print(f"Determining core-core synteny for GFF file #{i+1}") - gff_generator = parse_gff(input_file) + gff_generator = parse_gff(input_gff_file) return_data = segment_gff_content(gff_generator=gff_generator, - gff_path=input_file, + gff_path=input_gff_file, core_genes=core_genes, low_freq_genes=low_freq_genes, acc_genes=acc_gene_dict, complete_genomes=complete_genomes) return return_data - - -# TODO - Add in so that the length between a given core gene and a sequnece break is conserved and given in master output. 
\ No newline at end of file diff --git a/Code_to_transfer/merge_dicts.py b/Corekaburra/merge_dicts.py similarity index 100% rename from Code_to_transfer/merge_dicts.py rename to Corekaburra/merge_dicts.py diff --git a/Corekaburra/parse_gene_presence_absence.py b/Corekaburra/parse_gene_presence_absence.py index 70875a3..d6ff7ef 100644 --- a/Corekaburra/parse_gene_presence_absence.py +++ b/Corekaburra/parse_gene_presence_absence.py @@ -68,7 +68,7 @@ def check_fragmented_gene(fragments_in_line, input_gffs, tmp_folder_path): min_frag_coor = min(frag_coors) region = (first_fragment_contig, min_frag_coor, max_frag_coor) - # Find all features that are completly within the region + # Find all features that are completely within the region region_features = gff_database.region(region=region, completely_within=True) # find all genes that are not part of the fragmented gene @@ -80,8 +80,11 @@ def check_fragmented_gene(fragments_in_line, input_gffs, tmp_folder_path): return_list.append(False) else: return_list.append(True) + else: + return_list.append(False) return return_list + # TODO - find out what the non-closed file problem is here! Can be seen when running unit-tests. # TODO - Find out how the gff parser handles this? Does there need to be a check if a gene cluster is being paired to it self and if then drop it and change the end coordinates. 
diff --git a/unit_tests/Corekaburra_test.py b/unit_tests/Corekaburra_test.py index e944c5c..762a271 100644 --- a/unit_tests/Corekaburra_test.py +++ b/unit_tests/Corekaburra_test.py @@ -17,6 +17,7 @@ from Corekaburra import read_complete_genome_file from Corekaburra import check_inputs from Corekaburra import parse_gene_presence_absence +from Corekaburra import gff_parser @@ -272,6 +273,24 @@ def test_fragmented_gene_mutiple_genes_fasle(self): self.assertEqual(expected_return, return_bool) + def test_fragments_on_separate_contigs(self): + """ One gene fragmented with parts on separate contigs """ + fragments_in_line = ['Silas_the_Salmonella_tag-1-2.1;Silas_the_Salmonella_tag-1-2.2', + 'Silas_the_Salmonella_tag-1-5.1;Silas_the_Salmonella_tag-1-5.2'] + input_gffs = ['TestCheckingFragmentedGenes/Silas_the_Salmonella.gff', + 'TestCheckingFragmentedGenes/Zion_the_Streptococcus.gff', + 'TestCheckingFragmentedGenes/Silas_the_Legionella.gff', + 'TestCheckingFragmentedGenes/Lilly_the_Shigella.gff'] + tmp_folder_path = 'test_tmp_folder' + + expected_return = [False, False] + + return_bool = parse_gene_presence_absence.check_fragmented_gene(fragments_in_line, input_gffs, tmp_folder_path) + + self.assertEqual(expected_return, return_bool) + + # TODO - Can a fragmented gene be recognised, if spanning contigs? 
+ class TestParsingGenePresenceAbsenceFile(unittest.TestCase): """ @@ -641,5 +660,1845 @@ def test_parsing_w_90_presence_roary(self): self.assertEqual(expected_acc_gene_dict, acc_gene_dict) +class TestParsingGffFile(unittest.TestCase): + """ Test of the function that is used to pass a gff file and return a generator object of CDS lines """ + def test_gff_generator_generation_not_corrected(self): + input_gff_file = 'TestParsingGffFile/Silas_the_Salmonella.gff' + + expected_output = [['contig_1', '.', 'CDS', '1', '90', '.', '.', '.', 'Silas_the_Salmonella_tag-1-1'], + ['contig_1', '.', 'CDS', '100', '190', '.', '.', '.', 'Silas_the_Salmonella_tag-1-2.1'], + ['contig_1', '.', 'CDS', '200', '290', '.', '.', '.', 'Silas_the_Salmonella_tag-1-2.2'], + ['contig_1', '.', 'CDS', '300', '390', '.', '.', '.', 'Silas_the_Salmonella_tag-1-3'], + ['contig_1', '.', 'CDS', '400', '490', '.', '.', '.', 'Silas_the_Salmonella_tag-1-4.1'], + ['contig_1', '.', 'CDS', '500', '590', '.', '.', '.', 'Silas_the_Salmonella_tag-1-4.2'], + ['contig_1', '.', 'CDS', '600', '690', '.', '.', '.', 'Silas_the_Salmonella_tag-1-5.1'], + ['contig_1', '.', 'CDS', '700', '790', '.', '.', '.', 'Silas_the_Salmonella_tag-1.7'], + ['contig_1', '.', 'CDS', '800', '890', '.', '.', '.', "Silas_the_Salmonella_tag-1-5.2"]] + + return_generator = gff_parser.parse_gff(input_gff_file) + + for expected, generated in zip(expected_output, return_generator): + self.assertEqual(expected, generated) + + def test_gff_generator_generation_corrected_gff(self): + input_gff_file = 'TestParsingGffFile/Silas_the_Salmonella_corrected.gff' + + expected_output = [['contig_1', '.', 'CDS', '1', '90', '.', '.', '.', 'Silas_the_Salmonella_tag-1-1'], + ['contig_1', '.', 'CDS', '100', '190', '.', '.', '.', 'Silas_the_Salmonella_tag-1-2.1'], + ['contig_1', '.', 'CDS', '200', '290', '.', '.', '.', 'Silas_the_Salmonella_tag-1-2.2'], + ['contig_1', '.', 'CDS', '300', '390', '.', '.', '.', 'Silas_the_Salmonella_tag-1-3'], + ['contig_1', '.', 
'CDS', '400', '490', '.', '.', '.', 'Silas_the_Salmonella_tag-1-4.1'], + ['contig_1', '.', 'CDS', '500', '590', '.', '.', '.', 'Silas_the_Salmonella_tag-1-4.2'], + ['contig_1', '.', 'CDS', '600', '690', '.', '.', '.', 'Silas_the_Salmonella_tag-1-5.1'], + ['contig_1', '.', 'CDS', '700', '790', '.', '.', '.', 'Silas_the_Salmonella_tag-1.7'], + ['contig_1', '.', 'CDS', '800', '890', '.', '.', '.', "Silas_the_Salmonella_tag-1-5.2"], + ['contig_1', 'Panaroo', 'CDS', '900', '1000', '.', '+', '0', 'refound_gene_1']] + + return_generator = gff_parser.parse_gff(input_gff_file) + + for expected, generated in zip(expected_output, return_generator): + self.assertEqual(expected, generated) + + +class TestGetContigLenth(unittest.TestCase): + """ + Test function that passes a gff file and counts the length of each contig in attached genome + """ + def test_single_contig(self): + input_gff_path = 'TestGetContigLenth/single_contig_unwrapped.txt' + expected_dict = {'contig_1': 1300} + + return_dict = gff_parser.get_contig_lengths(input_gff_path) + + self.assertEqual(expected_dict, return_dict) + + def test_single_wrapped_contig(self): + input_gff_path = 'TestGetContigLenth/single_contig_wrapped.txt' + expected_dict = {'contig_1': 1300} + + return_dict = gff_parser.get_contig_lengths(input_gff_path) + + self.assertEqual(expected_dict, return_dict) + + def test_multiple_contigs(self): + input_gff_path = 'TestGetContigLenth/multi_contig_unwrapped.txt' + expected_dict = {'contig_1': 1300, + 'contig_2': 1300} + + return_dict = gff_parser.get_contig_lengths(input_gff_path) + + self.assertEqual(expected_dict, return_dict) + + def test_multiple_wrapped_contigs(self): + input_gff_path = 'TestGetContigLenth/multi_contig_wrapped.txt' + expected_dict = {'contig_1': 1300, + 'contig_2': 1300} + + return_dict = gff_parser.get_contig_lengths(input_gff_path) + + self.assertEqual(expected_dict, return_dict) + + +class TestRecordCoreCoreRegion(unittest.TestCase): + def 
test_recording_neighbouring_core_genes(self): + core_genes = {'gff_name': {'Core_ID_1': 'pan_gene_1', + 'Core_ID_2': 'pan_gene_2'}} + gff_name = 'gff_name' + gff_line = ['gff_name_contig_1', '.', 'CDS', '90', '150', '.', '.', '.', 'Core_ID_2'] + contig_end = 1500 + previous_core_gene_id = 'Core_ID_1' + previous_core_gene_end_coor = 10 + acc_genes_in_region = [] + low_freq_genes_in_region = [] + core_gene_pair_distance = {} + accessory_gene_content = {} + low_freq_gene_content = {} + core_gene_pairs = [] + master_info = {} + + # Set up the expected return values + expected_previous_core_gene_id = 'Core_ID_2' + expected_previous_core_gene_end_coor = 150 + expected_acc_genes_in_region = [] + expected_low_freq_genes_in_region = [] + expected_core_gene_pair_distance = {'pan_gene_1--pan_gene_2': 79} + expected_accessory_gene_content = {'pan_gene_1--pan_gene_2': []} + expected_low_freq_gene_content = {'pan_gene_1--pan_gene_2': []} + expected_core_gene_pairs = ['pan_gene_1--pan_gene_2'] + expected_master_info = {'pan_gene_1--pan_gene_2--gff_name': ['gff_name', 'pan_gene_1', 'pan_gene_2', + 79, 0, [], []]} + + return_previous_core_gene_id, return_previous_core_gene_end_coor, return_acc_genes_in_region, \ + return_low_freq_genes_in_region, return_core_gene_pair_distance, return_accessory_gene_content, \ + return_low_freq_gene_content, return_core_gene_pairs, \ + return_master_info = gff_parser.record_core_core_region(core_genes, gff_name, gff_line, contig_end, + previous_core_gene_id, previous_core_gene_end_coor, + acc_genes_in_region, low_freq_genes_in_region, + core_gene_pair_distance, accessory_gene_content, + low_freq_gene_content, core_gene_pairs, + master_info) + + self.assertEqual(expected_previous_core_gene_id, return_previous_core_gene_id) + self.assertEqual(expected_previous_core_gene_end_coor, return_previous_core_gene_end_coor) + self.assertEqual(expected_acc_genes_in_region, return_acc_genes_in_region) + self.assertEqual(expected_low_freq_genes_in_region, 
return_low_freq_genes_in_region) + self.assertEqual(expected_accessory_gene_content, return_accessory_gene_content) + self.assertEqual(expected_low_freq_gene_content, return_low_freq_gene_content) + self.assertEqual(expected_core_gene_pair_distance, return_core_gene_pair_distance) + self.assertEqual(expected_core_gene_pairs, return_core_gene_pairs) + self.assertEqual(expected_master_info, return_master_info) + + def test_recording_neighbouring_core_genes_w_accessory(self): + core_genes = {'gff_name': {'Core_ID_1': 'pan_gene_1', + 'Core_ID_2': 'pan_gene_2'}} + gff_name = 'gff_name' + gff_line = ['gff_name_contig_1', '.', 'CDS', '90', '150', '.', '.', '.', 'Core_ID_2'] + contig_end = 1500 + previous_core_gene_id = 'Core_ID_1' + previous_core_gene_end_coor = 10 + acc_genes_in_region = ['acc_gene_1', 'acc_gene_2'] + low_freq_genes_in_region = ['low_freq_1'] + core_gene_pair_distance = {} + accessory_gene_content = {} + low_freq_gene_content = {} + core_gene_pairs = [] + master_info = {} + + # Set up the expected return values + expected_previous_core_gene_id = 'Core_ID_2' + expected_previous_core_gene_end_coor = 150 + expected_acc_genes_in_region = [] + expected_low_freq_genes_in_region = [] + expected_core_gene_pair_distance = {'pan_gene_1--pan_gene_2': 79} + expected_accessory_gene_content = {'pan_gene_1--pan_gene_2': ['acc_gene_1', 'acc_gene_2']} + expected_low_freq_gene_content = {'pan_gene_1--pan_gene_2': ['low_freq_1']} + expected_core_gene_pairs = ['pan_gene_1--pan_gene_2'] + expected_master_info = {'pan_gene_1--pan_gene_2--gff_name': ['gff_name', 'pan_gene_1', 'pan_gene_2', + 79, 3, ['acc_gene_1', 'acc_gene_2'], ['low_freq_1']]} + + return_previous_core_gene_id, return_previous_core_gene_end_coor, return_acc_genes_in_region, \ + return_low_freq_genes_in_region, return_core_gene_pair_distance, return_accessory_gene_content, \ + return_low_freq_gene_content, return_core_gene_pairs, \ + return_master_info = gff_parser.record_core_core_region(core_genes, gff_name, 
gff_line, contig_end, + previous_core_gene_id, previous_core_gene_end_coor, + acc_genes_in_region, low_freq_genes_in_region, + core_gene_pair_distance, accessory_gene_content, + low_freq_gene_content, core_gene_pairs, + master_info) + + self.assertEqual(expected_previous_core_gene_id, return_previous_core_gene_id) + self.assertEqual(expected_previous_core_gene_end_coor, return_previous_core_gene_end_coor) + self.assertEqual(expected_acc_genes_in_region, return_acc_genes_in_region) + self.assertEqual(expected_low_freq_genes_in_region, return_low_freq_genes_in_region) + self.assertEqual(expected_accessory_gene_content, return_accessory_gene_content) + self.assertEqual(expected_low_freq_gene_content, return_low_freq_gene_content) + self.assertEqual(expected_core_gene_pair_distance, return_core_gene_pair_distance) + self.assertEqual(expected_core_gene_pairs, return_core_gene_pairs) + self.assertEqual(expected_master_info, return_master_info) + + def test_recording_w_fragment_given(self): + core_genes = {'gff_name': {'Core_ID_1': 'pan_gene_1', + 'Core_ID_2': 'pan_gene_1'}} + gff_name = 'gff_name' + gff_line = ['gff_name_contig_1', '.', 'CDS', '90', '150', '.', '.', '.', 'Core_ID_2'] + contig_end = 1500 + previous_core_gene_id = 'Core_ID_1' + previous_core_gene_end_coor = 10 + acc_genes_in_region = [] + low_freq_genes_in_region = [] + core_gene_pair_distance = {} + accessory_gene_content = {} + low_freq_gene_content = {} + core_gene_pairs = [] + master_info = {} + + # Set up the expected return values + expected_previous_core_gene_id = 'Core_ID_2' + expected_previous_core_gene_end_coor = 150 + expected_acc_genes_in_region = [] + expected_low_freq_genes_in_region = [] + expected_core_gene_pair_distance = {} + expected_accessory_gene_content = {} + expected_low_freq_gene_content = {} + expected_core_gene_pairs = [] + expected_master_info = {} + + return_previous_core_gene_id, return_previous_core_gene_end_coor, return_acc_genes_in_region, \ + 
return_low_freq_genes_in_region, return_core_gene_pair_distance, return_accessory_gene_content, \ + return_low_freq_gene_content, return_core_gene_pairs, \ + return_master_info = gff_parser.record_core_core_region(core_genes, gff_name, gff_line, contig_end, + previous_core_gene_id, previous_core_gene_end_coor, + acc_genes_in_region, low_freq_genes_in_region, + core_gene_pair_distance, accessory_gene_content, + low_freq_gene_content, core_gene_pairs, + master_info) + + self.assertEqual(expected_previous_core_gene_id, return_previous_core_gene_id) + self.assertEqual(expected_previous_core_gene_end_coor, return_previous_core_gene_end_coor) + self.assertEqual(expected_acc_genes_in_region, return_acc_genes_in_region) + self.assertEqual(expected_low_freq_genes_in_region, return_low_freq_genes_in_region) + self.assertEqual(expected_accessory_gene_content, return_accessory_gene_content) + self.assertEqual(expected_low_freq_gene_content, return_low_freq_gene_content) + self.assertEqual(expected_core_gene_pair_distance, return_core_gene_pair_distance) + self.assertEqual(expected_core_gene_pairs, return_core_gene_pairs) + self.assertEqual(expected_master_info, return_master_info) + + def test_recording_core_gene_before_seqeuncebreak(self): + core_genes = {'gff_name': {'Core_ID_1': 'pan_gene_1'}} + gff_name = 'gff_name' + gff_line = None + contig_end = 1500 + previous_core_gene_id = 'Core_ID_1' + previous_core_gene_end_coor = 150 + acc_genes_in_region = [] + low_freq_genes_in_region = [] + core_gene_pair_distance = {} + accessory_gene_content = {} + low_freq_gene_content = {} + core_gene_pairs = [] + master_info = {} + + # Set up the expected return values + expected_previous_core_gene_id = 'Sequence_break' + expected_previous_core_gene_end_coor = 150 + expected_acc_genes_in_region = [] + expected_low_freq_genes_in_region = [] + expected_core_gene_pair_distance = {'pan_gene_1--Sequence_break': 1349} + expected_accessory_gene_content = {'pan_gene_1--Sequence_break': []} + 
expected_low_freq_gene_content = {'pan_gene_1--Sequence_break': []} + expected_core_gene_pairs = ['pan_gene_1--Sequence_break'] + expected_master_info = {'pan_gene_1--Sequence_break--gff_name': ['gff_name', 'pan_gene_1', 'Sequence_break', + 1349, 0, [], []]} + + return_previous_core_gene_id, return_previous_core_gene_end_coor, return_acc_genes_in_region, \ + return_low_freq_genes_in_region, return_core_gene_pair_distance, return_accessory_gene_content, \ + return_low_freq_gene_content, return_core_gene_pairs, \ + return_master_info = gff_parser.record_core_core_region(core_genes, gff_name, gff_line, contig_end, + previous_core_gene_id, previous_core_gene_end_coor, + acc_genes_in_region, low_freq_genes_in_region, + core_gene_pair_distance, accessory_gene_content, + low_freq_gene_content, core_gene_pairs, + master_info) + + self.assertEqual(expected_previous_core_gene_id, return_previous_core_gene_id) + self.assertEqual(expected_previous_core_gene_end_coor, return_previous_core_gene_end_coor) + self.assertEqual(expected_acc_genes_in_region, return_acc_genes_in_region) + self.assertEqual(expected_low_freq_genes_in_region, return_low_freq_genes_in_region) + self.assertEqual(expected_accessory_gene_content, return_accessory_gene_content) + self.assertEqual(expected_low_freq_gene_content, return_low_freq_gene_content) + self.assertEqual(expected_core_gene_pair_distance, return_core_gene_pair_distance) + self.assertEqual(expected_core_gene_pairs, return_core_gene_pairs) + self.assertEqual(expected_master_info, return_master_info) + + def test_recording_core_gene_before_seqeuncebreak_w_accessory(self): + core_genes = {'gff_name': {'Core_ID_1': 'pan_gene_1'}} + gff_name = 'gff_name' + gff_line = None + contig_end = 1500 + previous_core_gene_id = 'Core_ID_1' + previous_core_gene_end_coor = 150 + acc_genes_in_region = ['acc_1'] + low_freq_genes_in_region = ['low_1', "low_2"] + core_gene_pair_distance = {} + accessory_gene_content = {} + low_freq_gene_content = {} + 
core_gene_pairs = [] + master_info = {} + + # Set up the expected return values + expected_previous_core_gene_id = 'Sequence_break' + expected_previous_core_gene_end_coor = 150 + expected_acc_genes_in_region = [] + expected_low_freq_genes_in_region = [] + expected_core_gene_pair_distance = {'pan_gene_1--Sequence_break': 1349} + expected_accessory_gene_content = {'pan_gene_1--Sequence_break': ['acc_1']} + expected_low_freq_gene_content = {'pan_gene_1--Sequence_break': ['low_1', "low_2"]} + expected_core_gene_pairs = ['pan_gene_1--Sequence_break'] + expected_master_info = {'pan_gene_1--Sequence_break--gff_name': ['gff_name', 'pan_gene_1', 'Sequence_break', + 1349, 3, ['acc_1'], ['low_1', "low_2"]]} + + return_previous_core_gene_id, return_previous_core_gene_end_coor, return_acc_genes_in_region, \ + return_low_freq_genes_in_region, return_core_gene_pair_distance, return_accessory_gene_content, \ + return_low_freq_gene_content, return_core_gene_pairs, \ + return_master_info = gff_parser.record_core_core_region(core_genes, gff_name, gff_line, contig_end, + previous_core_gene_id, previous_core_gene_end_coor, + acc_genes_in_region, low_freq_genes_in_region, + core_gene_pair_distance, accessory_gene_content, + low_freq_gene_content, core_gene_pairs, + master_info) + + self.assertEqual(expected_previous_core_gene_id, return_previous_core_gene_id) + self.assertEqual(expected_previous_core_gene_end_coor, return_previous_core_gene_end_coor) + self.assertEqual(expected_acc_genes_in_region, return_acc_genes_in_region) + self.assertEqual(expected_low_freq_genes_in_region, return_low_freq_genes_in_region) + self.assertEqual(expected_accessory_gene_content, return_accessory_gene_content) + self.assertEqual(expected_low_freq_gene_content, return_low_freq_gene_content) + self.assertEqual(expected_core_gene_pair_distance, return_core_gene_pair_distance) + self.assertEqual(expected_core_gene_pairs, return_core_gene_pairs) + self.assertEqual(expected_master_info, return_master_info) + + 
def test_recording_first_core_gene_on_contig_as_first_gene(self): + core_genes = {'gff_name': {'Core_ID_1': 'pan_gene_1'}} + gff_name = 'gff_name' + gff_line = ['gff_name_contig_1', '.', 'CDS', '90', '180', '.', '.', '.', 'Core_ID_1'] + contig_end = 0 + previous_core_gene_id = 'Sequence_break' + previous_core_gene_end_coor = 150 + acc_genes_in_region = [] + low_freq_genes_in_region = [] + core_gene_pair_distance = {} + accessory_gene_content = {} + low_freq_gene_content = {} + core_gene_pairs = [] + master_info = {} + + # Set up the expected return values + expected_previous_core_gene_id = 'Core_ID_1' + expected_previous_core_gene_end_coor = 180 + expected_acc_genes_in_region = [] + expected_low_freq_genes_in_region = [] + expected_core_gene_pair_distance = {'Sequence_break--pan_gene_1': 89} + expected_accessory_gene_content = {'Sequence_break--pan_gene_1': []} + expected_low_freq_gene_content = {'Sequence_break--pan_gene_1': []} + expected_core_gene_pairs = ['Sequence_break--pan_gene_1'] + expected_master_info = {'Sequence_break--pan_gene_1--gff_name': ['gff_name', 'Sequence_break', 'pan_gene_1', + 89, 0, [], []]} + + return_previous_core_gene_id, return_previous_core_gene_end_coor, return_acc_genes_in_region, \ + return_low_freq_genes_in_region, return_core_gene_pair_distance, return_accessory_gene_content, \ + return_low_freq_gene_content, return_core_gene_pairs, \ + return_master_info = gff_parser.record_core_core_region(core_genes, gff_name, gff_line, contig_end, + previous_core_gene_id, previous_core_gene_end_coor, + acc_genes_in_region, low_freq_genes_in_region, + core_gene_pair_distance, accessory_gene_content, + low_freq_gene_content, core_gene_pairs, + master_info) + + self.assertEqual(expected_previous_core_gene_id, return_previous_core_gene_id) + self.assertEqual(expected_previous_core_gene_end_coor, return_previous_core_gene_end_coor) + self.assertEqual(expected_acc_genes_in_region, return_acc_genes_in_region) + 
self.assertEqual(expected_low_freq_genes_in_region, return_low_freq_genes_in_region) + self.assertEqual(expected_accessory_gene_content, return_accessory_gene_content) + self.assertEqual(expected_low_freq_gene_content, return_low_freq_gene_content) + self.assertEqual(expected_core_gene_pair_distance, return_core_gene_pair_distance) + self.assertEqual(expected_core_gene_pairs, return_core_gene_pairs) + self.assertEqual(expected_master_info, return_master_info) + + def test_recording_first_core_gene_on_contig_w_accessory(self): + core_genes = {'gff_name': {'Core_ID_1': 'pan_gene_1'}} + gff_name = 'gff_name' + gff_line = ['gff_name_contig_1', '.', 'CDS', '90', '180', '.', '.', '.', 'Core_ID_1'] + contig_end = None + previous_core_gene_id = 'Sequence_break' + previous_core_gene_end_coor = 150 + acc_genes_in_region = ['acc_1', 'acc_2', 'acc_3'] + low_freq_genes_in_region = [] + core_gene_pair_distance = {} + accessory_gene_content = {} + low_freq_gene_content = {} + core_gene_pairs = [] + master_info = {} + + # Set up the expected return values + expected_previous_core_gene_id = 'Core_ID_1' + expected_previous_core_gene_end_coor = 180 + expected_acc_genes_in_region = [] + expected_low_freq_genes_in_region = [] + expected_core_gene_pair_distance = {'Sequence_break--pan_gene_1': 89} + expected_accessory_gene_content = {'Sequence_break--pan_gene_1': ['acc_1', 'acc_2', 'acc_3']} + expected_low_freq_gene_content = {'Sequence_break--pan_gene_1': []} + expected_core_gene_pairs = ['Sequence_break--pan_gene_1'] + expected_master_info = {'Sequence_break--pan_gene_1--gff_name': ['gff_name', 'Sequence_break', 'pan_gene_1', + 89, 3, ['acc_1', 'acc_2', 'acc_3'], []]} + + return_previous_core_gene_id, return_previous_core_gene_end_coor, return_acc_genes_in_region, \ + return_low_freq_genes_in_region, return_core_gene_pair_distance, return_accessory_gene_content, \ + return_low_freq_gene_content, return_core_gene_pairs, \ + return_master_info = 
gff_parser.record_core_core_region(core_genes, gff_name, gff_line, contig_end, + previous_core_gene_id, previous_core_gene_end_coor, + acc_genes_in_region, low_freq_genes_in_region, + core_gene_pair_distance, accessory_gene_content, + low_freq_gene_content, core_gene_pairs, + master_info) + + self.assertEqual(expected_previous_core_gene_id, return_previous_core_gene_id) + self.assertEqual(expected_previous_core_gene_end_coor, return_previous_core_gene_end_coor) + self.assertEqual(expected_acc_genes_in_region, return_acc_genes_in_region) + self.assertEqual(expected_low_freq_genes_in_region, return_low_freq_genes_in_region) + self.assertEqual(expected_accessory_gene_content, return_accessory_gene_content) + self.assertEqual(expected_low_freq_gene_content, return_low_freq_gene_content) + self.assertEqual(expected_core_gene_pair_distance, return_core_gene_pair_distance) + self.assertEqual(expected_core_gene_pairs, return_core_gene_pairs) + self.assertEqual(expected_master_info, return_master_info) + + +class TestConnectFirstNLastGeneOnContig(unittest.TestCase): + """ + Test for the function recordning connections between the first and the last gene on a contig in a complete genome + """ + def test_connect_last_n_first_gene_different_genes_no_accessory(self): + core_genes = {'gff_name': {'Core_ID_1': 'pan_gene_1', + 'Core_ID_2': 'pan_gene_2'}} + gff_name = 'gff_name' + previous_core_gene_id = "Core_ID_2" + previous_core_gene_end_coor = 1450 + first_core_gene_gff_line = ['gff_name_contig_1', '.', 'CDS', '90', '180', '.', '.', '.', 'Core_ID_1'] + acc_genes_in_region = [] + first_core_accessory_content = [] + low_freq_genes_in_region = [] + first_core_low_freq_genes = [] + contig_size = 1500 + core_gene_pairs = [] + core_gene_pair_distance = {} + accessory_gene_content = {} + low_freq_gene_content = {} + master_info = {} + + # Set up the expected return values + expected_previous_core_gene_id = "" + expected_previous_core_gene_end_coor = 180 + expected_acc_genes_in_region 
= [] + expected_low_freq_genes_in_region = [] + expected_core_gene_pairs = ['pan_gene_1--pan_gene_2'] + expected_core_gene_pair_distance = {'pan_gene_1--pan_gene_2': 139} + expected_accessory_gene_content = {'pan_gene_1--pan_gene_2': []} + expected_low_freq_gene_content = {'pan_gene_1--pan_gene_2': []} + expected_master_info = {'pan_gene_1--pan_gene_2--gff_name': ['gff_name', 'pan_gene_1', 'pan_gene_2', 139, 0, [], []]} + + return_previous_core_gene_id, return_previous_core_gene_end_coor, return_acc_genes_in_region, \ + return_low_freq_genes_in_region, return_core_gene_pairs, return_core_gene_pair_distance, \ + return_accessory_gene_content, return_low_freq_gene_content, \ + return_master_info = gff_parser.connect_first_n_last_gene_on_contig(core_genes, gff_name, previous_core_gene_id, + previous_core_gene_end_coor, + first_core_gene_gff_line, + acc_genes_in_region, + first_core_accessory_content, + low_freq_genes_in_region, + first_core_low_freq_genes, contig_size, + core_gene_pairs, core_gene_pair_distance, + accessory_gene_content, + low_freq_gene_content, master_info) + + self.assertEqual(expected_previous_core_gene_id, return_previous_core_gene_id) + self.assertEqual(expected_previous_core_gene_end_coor, return_previous_core_gene_end_coor) + self.assertEqual(expected_acc_genes_in_region, return_acc_genes_in_region) + self.assertEqual(expected_low_freq_genes_in_region, return_low_freq_genes_in_region) + self.assertEqual(expected_accessory_gene_content, return_accessory_gene_content) + self.assertEqual(expected_low_freq_gene_content, return_low_freq_gene_content) + self.assertEqual(expected_core_gene_pair_distance, return_core_gene_pair_distance) + self.assertEqual(expected_core_gene_pairs, return_core_gene_pairs) + self.assertEqual(expected_master_info, return_master_info) + + def test_connect_last_n_first_gene_different_genes_w_accessory(self): + core_genes = {'gff_name': {'Core_ID_1': 'pan_gene_1', + 'Core_ID_2': 'pan_gene_2'}} + gff_name = 'gff_name' + 
previous_core_gene_id = "Core_ID_2" + previous_core_gene_end_coor = 1450 + first_core_gene_gff_line = ['gff_name_contig_1', '.', 'CDS', '90', '180', '.', '.', '.', 'Core_ID_1'] + acc_genes_in_region = ['acc_1'] + first_core_accessory_content = ['first_acc_1'] + low_freq_genes_in_region = ['low_acc_1'] + first_core_low_freq_genes = ['first_low_1'] + contig_size = 1500 + core_gene_pairs = [] + core_gene_pair_distance = {} + accessory_gene_content = {} + low_freq_gene_content = {} + master_info = {} + + # Set up the expected return values + expected_previous_core_gene_id = "" + expected_previous_core_gene_end_coor = 180 + expected_acc_genes_in_region = [] + expected_low_freq_genes_in_region = [] + expected_core_gene_pairs = ['pan_gene_1--pan_gene_2'] + expected_core_gene_pair_distance = {'pan_gene_1--pan_gene_2': 139} + expected_accessory_gene_content = {'pan_gene_1--pan_gene_2': ['acc_1', 'first_acc_1']} + expected_low_freq_gene_content = {'pan_gene_1--pan_gene_2': ['low_acc_1', 'first_low_1']} + expected_master_info = {'pan_gene_1--pan_gene_2--gff_name': ['gff_name', 'pan_gene_1', 'pan_gene_2', 139, 4, ['acc_1', 'first_acc_1'], ['low_acc_1', 'first_low_1']]} + + return_previous_core_gene_id, return_previous_core_gene_end_coor, return_acc_genes_in_region, \ + return_low_freq_genes_in_region, return_core_gene_pairs, return_core_gene_pair_distance, \ + return_accessory_gene_content, return_low_freq_gene_content, \ + return_master_info = gff_parser.connect_first_n_last_gene_on_contig(core_genes, gff_name, previous_core_gene_id, + previous_core_gene_end_coor, + first_core_gene_gff_line, + acc_genes_in_region, + first_core_accessory_content, + low_freq_genes_in_region, + first_core_low_freq_genes, contig_size, + core_gene_pairs, core_gene_pair_distance, + accessory_gene_content, + low_freq_gene_content, master_info) + + self.assertEqual(expected_previous_core_gene_id, return_previous_core_gene_id) + self.assertEqual(expected_previous_core_gene_end_coor, 
return_previous_core_gene_end_coor) + self.assertEqual(expected_acc_genes_in_region, return_acc_genes_in_region) + self.assertEqual(expected_low_freq_genes_in_region, return_low_freq_genes_in_region) + self.assertEqual(expected_accessory_gene_content, return_accessory_gene_content) + self.assertEqual(expected_low_freq_gene_content, return_low_freq_gene_content) + self.assertEqual(expected_core_gene_pair_distance, return_core_gene_pair_distance) + self.assertEqual(expected_core_gene_pairs, return_core_gene_pairs) + self.assertEqual(expected_master_info, return_master_info) + + def test_connect_same_gene_as_last_n_first_gene_no_accessory(self): + core_genes = {'gff_name': {'Core_ID_1': 'pan_gene_1', + 'Core_ID_2': 'pan_gene_2'}} + gff_name = 'gff_name' + previous_core_gene_id = "Core_ID_1" + previous_core_gene_end_coor = 180 + first_core_gene_gff_line = ['gff_name_contig_1', '.', 'CDS', '90', '180', '.', '.', '.', 'Core_ID_1'] + acc_genes_in_region = [] + first_core_accessory_content = [] + low_freq_genes_in_region = [] + first_core_low_freq_genes = [] + contig_size = 1500 + core_gene_pairs = [] + core_gene_pair_distance = {} + accessory_gene_content = {} + low_freq_gene_content = {} + master_info = {} + + # Set up the expected return values + expected_previous_core_gene_id = "" + expected_previous_core_gene_end_coor = 180 + expected_acc_genes_in_region = [] + expected_low_freq_genes_in_region = [] + expected_core_gene_pairs = ['pan_gene_1--pan_gene_1'] + expected_core_gene_pair_distance = {'pan_gene_1--pan_gene_1': 1409} + expected_accessory_gene_content = {'pan_gene_1--pan_gene_1': []} + expected_low_freq_gene_content = {'pan_gene_1--pan_gene_1': []} + expected_master_info = {'pan_gene_1--pan_gene_1--gff_name': ['gff_name', 'pan_gene_1', 'pan_gene_1', 1409, 0, [], []]} + + return_previous_core_gene_id, return_previous_core_gene_end_coor, return_acc_genes_in_region, \ + return_low_freq_genes_in_region, return_core_gene_pairs, return_core_gene_pair_distance, \ + 
return_accessory_gene_content, return_low_freq_gene_content, \ + return_master_info = gff_parser.connect_first_n_last_gene_on_contig(core_genes, gff_name, previous_core_gene_id, + previous_core_gene_end_coor, + first_core_gene_gff_line, + acc_genes_in_region, + first_core_accessory_content, + low_freq_genes_in_region, + first_core_low_freq_genes, contig_size, + core_gene_pairs, core_gene_pair_distance, + accessory_gene_content, + low_freq_gene_content, master_info) + + self.assertEqual(expected_previous_core_gene_id, return_previous_core_gene_id) + self.assertEqual(expected_previous_core_gene_end_coor, return_previous_core_gene_end_coor) + self.assertEqual(expected_acc_genes_in_region, return_acc_genes_in_region) + self.assertEqual(expected_low_freq_genes_in_region, return_low_freq_genes_in_region) + self.assertEqual(expected_accessory_gene_content, return_accessory_gene_content) + self.assertEqual(expected_low_freq_gene_content, return_low_freq_gene_content) + self.assertEqual(expected_core_gene_pair_distance, return_core_gene_pair_distance) + self.assertEqual(expected_core_gene_pairs, return_core_gene_pairs) + self.assertEqual(expected_master_info, return_master_info) + + def test_connect_same_gene_as_last_n_first_gene_w_accessory(self): + core_genes = {'gff_name': {'Core_ID_1': 'pan_gene_1', + 'Core_ID_2': 'pan_gene_2'}} + gff_name = 'gff_name' + previous_core_gene_id = "Core_ID_1" + previous_core_gene_end_coor = 180 + first_core_gene_gff_line = ['gff_name_contig_1', '.', 'CDS', '90', '180', '.', '.', '.', 'Core_ID_1'] + acc_genes_in_region = ['acc_2', 'acc_3'] + first_core_accessory_content = ['acc_1'] + low_freq_genes_in_region = ['low_1', 'low_2'] + first_core_low_freq_genes = ['low_3'] + contig_size = 1500 + core_gene_pairs = [] + core_gene_pair_distance = {} + accessory_gene_content = {} + low_freq_gene_content = {} + master_info = {} + + # Set up the expected return values + expected_previous_core_gene_id = "" + expected_previous_core_gene_end_coor = 180 + 
expected_acc_genes_in_region = [] + expected_low_freq_genes_in_region = [] + expected_core_gene_pairs = ['pan_gene_1--pan_gene_1'] + expected_core_gene_pair_distance = {'pan_gene_1--pan_gene_1': 1409} + expected_accessory_gene_content = {'pan_gene_1--pan_gene_1': ['acc_2', 'acc_3', 'acc_1']} + expected_low_freq_gene_content = {'pan_gene_1--pan_gene_1': ['low_1', 'low_2', 'low_3']} + expected_master_info = {'pan_gene_1--pan_gene_1--gff_name': ['gff_name', 'pan_gene_1', 'pan_gene_1', 1409, 6, ['acc_2', 'acc_3', 'acc_1'], ['low_1', 'low_2', 'low_3']]} + + return_previous_core_gene_id, return_previous_core_gene_end_coor, return_acc_genes_in_region, \ + return_low_freq_genes_in_region, return_core_gene_pairs, return_core_gene_pair_distance, \ + return_accessory_gene_content, return_low_freq_gene_content, \ + return_master_info = gff_parser.connect_first_n_last_gene_on_contig(core_genes, gff_name, previous_core_gene_id, + previous_core_gene_end_coor, + first_core_gene_gff_line, + acc_genes_in_region, + first_core_accessory_content, + low_freq_genes_in_region, + first_core_low_freq_genes, contig_size, + core_gene_pairs, core_gene_pair_distance, + accessory_gene_content, + low_freq_gene_content, master_info) + + self.assertEqual(expected_previous_core_gene_id, return_previous_core_gene_id) + self.assertEqual(expected_previous_core_gene_end_coor, return_previous_core_gene_end_coor) + self.assertEqual(expected_acc_genes_in_region, return_acc_genes_in_region) + self.assertEqual(expected_low_freq_genes_in_region, return_low_freq_genes_in_region) + self.assertEqual(expected_accessory_gene_content, return_accessory_gene_content) + self.assertEqual(expected_low_freq_gene_content, return_low_freq_gene_content) + self.assertEqual(expected_core_gene_pair_distance, return_core_gene_pair_distance) + self.assertEqual(expected_core_gene_pairs, return_core_gene_pairs) + self.assertEqual(expected_master_info, return_master_info) + + +class TestRecordCorelessContig(unittest.TestCase): + 
def test_adding_coreless_contig(self): + coreless_contigs = {} + acc_genes_in_region = ['acc_1'] + low_freq_genes_in_region = ['low_1'] + gff_name = 'gff_name' + contig_name = 'gff_contig_1' + + expected_return = {'gff_name--gff_contig_1': [['acc_1'], ['low_1']]} + + return_dict = gff_parser.record_coreless_contig(coreless_contigs, acc_genes_in_region, low_freq_genes_in_region, gff_name, contig_name) + + self.assertEqual(expected_return, return_dict) + + def test_not_adding_coreless_contig(self): + coreless_contigs = {} + acc_genes_in_region = [] + low_freq_genes_in_region = [] + gff_name = 'gff_name' + contig_name = 'gff_contig_1' + + expected_return = {} + + return_dict = gff_parser.record_coreless_contig(coreless_contigs, acc_genes_in_region, + low_freq_genes_in_region, gff_name, contig_name) + + self.assertEqual(expected_return, return_dict) + + +class TestSegmentingMockGffs(unittest.TestCase): + def test_single_chromosome_complete(self): + # Set up input + gff_generator = [['gff_name_contig_1', '.', 'CDS', '90', '180', '.', '.', '.', 'acc_ID_1'], + ['gff_name_contig_1', '.', 'CDS', '179', '250', '.', '.', '.', 'Core_ID_1'], + ['gff_name_contig_1', '.', 'CDS', '300', '425', '.', '.', '.', 'low_freq_1'], + ['gff_name_contig_1', '.', 'CDS', '450', '500', '.', '.', '.', 'acc_ID_2'], + ['gff_name_contig_1', '.', 'CDS', '610', '680', '.', '.', '.', 'Core_ID_2'], + ['gff_name_contig_1', '.', 'CDS', '700', '850', '.', '.', '.', 'low_freq_2'], + ['gff_name_contig_1', '.', 'CDS', '950', '1000', '.', '.', '.', 'Core_ID_3'], + ['gff_name_contig_1', '.', 'CDS', '1100', '1250', '.', '.', '.', 'low_freq_3']] + core_genes = {'test_single_chromosome': {'Core_ID_1': 'pan_gene_2', + 'Core_ID_2': 'pan_gene_5', + 'Core_ID_3': 'pan_gene_7'}} + low_freq_genes = {'test_single_chromosome': {'low_freq_1': 'pan_gene_3', + 'low_freq_2': 'pan_gene_6', + 'low_freq_3': 'pan_gene_8'}} + gff_path = 'TestSegmentingMockGffs/test_single_chromosome.gff' + acc_genes = {'test_single_chromosome': 
{'acc_ID_1': 'pan_gene_1', + 'acc_ID_2': 'pan_gene_4'}} + complete_genomes = ['test_single_chromosome'] + + # Set up expected outputs + core_gene_pairs = ['pan_gene_2--pan_gene_5', 'pan_gene_5--pan_gene_7', 'pan_gene_2--pan_gene_7'] + core_gene_pair_distance = {'pan_gene_2--pan_gene_5': 359, + 'pan_gene_5--pan_gene_7': 269, + 'pan_gene_2--pan_gene_7': 478} + + accessory_gene_content = {'pan_gene_2--pan_gene_5': ['pan_gene_4'], + 'pan_gene_5--pan_gene_7': [], + 'pan_gene_2--pan_gene_7': ['pan_gene_1']} + + low_freq_gene_content = {'pan_gene_2--pan_gene_5': ['pan_gene_3'], + 'pan_gene_5--pan_gene_7': ['pan_gene_6'], + 'pan_gene_2--pan_gene_7': ['pan_gene_8']} + + master_info = {'pan_gene_2--pan_gene_5--test_single_chromosome': ['test_single_chromosome', 'pan_gene_2', 'pan_gene_5', 359, 2, ['pan_gene_4'], ['pan_gene_3'], ], + 'pan_gene_5--pan_gene_7--test_single_chromosome': ['test_single_chromosome', 'pan_gene_5', 'pan_gene_7', 269, 1, [], ['pan_gene_6']], + 'pan_gene_2--pan_gene_7--test_single_chromosome': ['test_single_chromosome', 'pan_gene_2', 'pan_gene_7', 478, 2, ['pan_gene_1'], ['pan_gene_8']]} + + coreless_contigs = {} + + # Run function + return_core_gene_pairs, return_core_gene_pair_distance, \ + return_accessory_gene_content, return_low_freq_gene_content, \ + return_master_info, return_coreless_contigs = gff_parser.segment_gff_content(gff_generator, core_genes, + low_freq_genes, gff_path, + acc_genes, + complete_genomes) + + + # Evaluate + self.assertEqual(core_gene_pairs, return_core_gene_pairs) + self.assertEqual(core_gene_pair_distance, return_core_gene_pair_distance) + self.assertEqual(accessory_gene_content, return_accessory_gene_content) + self.assertEqual(low_freq_gene_content, return_low_freq_gene_content) + self.assertEqual(master_info, return_master_info) + self.assertEqual(coreless_contigs, return_coreless_contigs) + + def test_single_chromosome_draft(self): + # Set up input + gff_generator = [['gff_name_contig_1', '.', 'CDS', '90', '180', '.', 
'.', '.', 'acc_ID_1'], + ['gff_name_contig_1', '.', 'CDS', '179', '250', '.', '.', '.', 'Core_ID_1'], + ['gff_name_contig_1', '.', 'CDS', '300', '425', '.', '.', '.', 'low_freq_1'], + ['gff_name_contig_1', '.', 'CDS', '450', '500', '.', '.', '.', 'acc_ID_2'], + ['gff_name_contig_1', '.', 'CDS', '610', '680', '.', '.', '.', 'Core_ID_2'], + ['gff_name_contig_1', '.', 'CDS', '700', '850', '.', '.', '.', 'low_freq_2'], + ['gff_name_contig_1', '.', 'CDS', '950', '1000', '.', '.', '.', 'Core_ID_3'], + ['gff_name_contig_1', '.', 'CDS', '1100', '1250', '.', '.', '.', 'low_freq_3']] + core_genes = {'test_single_chromosome': {'Core_ID_1': 'pan_gene_2', + 'Core_ID_2': 'pan_gene_5', + 'Core_ID_3': 'pan_gene_7'}} + low_freq_genes = {'test_single_chromosome': {'low_freq_1': 'pan_gene_3', + 'low_freq_2': 'pan_gene_6', + 'low_freq_3': 'pan_gene_8'}} + gff_path = 'TestSegmentingMockGffs/test_single_chromosome.gff' + acc_genes = {'test_single_chromosome': {'acc_ID_1': 'pan_gene_1', + 'acc_ID_2': 'pan_gene_4'}} + complete_genomes = [] + + # Set up expected outputs + core_gene_pairs = ['pan_gene_2--pan_gene_5', 'pan_gene_5--pan_gene_7', 'Sequence_break--pan_gene_2', 'pan_gene_7--Sequence_break'] + core_gene_pair_distance = {'pan_gene_2--pan_gene_5': 359, + 'pan_gene_5--pan_gene_7': 269, + 'Sequence_break--pan_gene_2': 178, + 'pan_gene_7--Sequence_break': 299} + + accessory_gene_content = {'pan_gene_2--pan_gene_5': ['pan_gene_4'], + 'pan_gene_5--pan_gene_7': [], + 'Sequence_break--pan_gene_2': ['pan_gene_1'], + 'pan_gene_7--Sequence_break': []} + + low_freq_gene_content = {'pan_gene_2--pan_gene_5': ['pan_gene_3'], + 'pan_gene_5--pan_gene_7': ['pan_gene_6'], + 'Sequence_break--pan_gene_2': [], + 'pan_gene_7--Sequence_break': ['pan_gene_8']} + + master_info = { + 'pan_gene_2--pan_gene_5--test_single_chromosome': ['test_single_chromosome', 'pan_gene_2', 'pan_gene_5', 359, 2, ['pan_gene_4'], ['pan_gene_3'], ], + 'pan_gene_5--pan_gene_7--test_single_chromosome': ['test_single_chromosome', 
'pan_gene_5', 'pan_gene_7', 269, 1, [], ['pan_gene_6']], + 'Sequence_break--pan_gene_2--test_single_chromosome': ['test_single_chromosome', 'Sequence_break', 'pan_gene_2', 178, 1, ['pan_gene_1'], []], + 'pan_gene_7--Sequence_break--test_single_chromosome': ['test_single_chromosome', 'pan_gene_7', 'Sequence_break', 299, 1, [], ['pan_gene_8']]} + + coreless_contigs = {} + + # Run function + return_core_gene_pairs, return_core_gene_pair_distance, \ + return_accessory_gene_content, return_low_freq_gene_content, \ + return_master_info, return_coreless_contigs = gff_parser.segment_gff_content(gff_generator, core_genes, + low_freq_genes, gff_path, + acc_genes, + complete_genomes) + + # Evaluate + self.assertEqual(core_gene_pairs, return_core_gene_pairs) + self.assertEqual(core_gene_pair_distance, return_core_gene_pair_distance) + self.assertEqual(accessory_gene_content, return_accessory_gene_content) + self.assertEqual(low_freq_gene_content, return_low_freq_gene_content) + self.assertEqual(master_info, return_master_info) + self.assertEqual(coreless_contigs, return_coreless_contigs) + + def test_two_chromosomes_complete(self): + # Set up input + gff_generator = [ + # Contig 1 annotations + ['gff_name_contig_1', '.', 'CDS', '90', '180', '.', '.', '.', 'acc_ID_1'], + ['gff_name_contig_1', '.', 'CDS', '179', '250', '.', '.', '.', 'Core_ID_1'], + ['gff_name_contig_1', '.', 'CDS', '300', '425', '.', '.', '.', 'low_freq_1'], + ['gff_name_contig_1', '.', 'CDS', '450', '500', '.', '.', '.', 'acc_ID_2'], + ['gff_name_contig_1', '.', 'CDS', '610', '680', '.', '.', '.', 'Core_ID_2'], + ['gff_name_contig_1', '.', 'CDS', '700', '850', '.', '.', '.', 'low_freq_2'], + ['gff_name_contig_1', '.', 'CDS', '950', '1000', '.', '.', '.', 'Core_ID_3'], + ['gff_name_contig_1', '.', 'CDS', '1100', '1250', '.', '.', '.', 'low_freq_3'], + # Contig 2 annotations + ['gff_name_contig_2', '.', 'CDS', '90', '180', '.', '.', '.', 'acc_ID_3'], + ['gff_name_contig_2', '.', 'CDS', '179', '250', '.', '.', 
'.', 'Core_ID_4'], + ['gff_name_contig_2', '.', 'CDS', '300', '425', '.', '.', '.', 'low_freq_4'], + ['gff_name_contig_2', '.', 'CDS', '450', '500', '.', '.', '.', 'acc_ID_4'], + ['gff_name_contig_2', '.', 'CDS', '610', '680', '.', '.', '.', 'Core_ID_5'], + ['gff_name_contig_2', '.', 'CDS', '700', '850', '.', '.', '.', 'low_freq_5'], + ['gff_name_contig_2', '.', 'CDS', '950', '1000', '.', '.', '.', 'Core_ID_6'], + ['gff_name_contig_2', '.', 'CDS', '1100', '1250', '.', '.', '.', 'low_freq_6'] + ] + core_genes = {'test_double_chromosome': {'Core_ID_1': 'pan_gene_2', + 'Core_ID_2': 'pan_gene_5', + 'Core_ID_3': 'pan_gene_7', + 'Core_ID_4': 'pan_gene_10', + 'Core_ID_5': 'pan_gene_13', + 'Core_ID_6': 'pan_gene_15' + }} + low_freq_genes = {'test_double_chromosome': {'low_freq_1': 'pan_gene_3', + 'low_freq_2': 'pan_gene_6', + 'low_freq_3': 'pan_gene_8', + 'low_freq_4': 'pan_gene_11', + 'low_freq_5': 'pan_gene_14', + 'low_freq_6': 'pan_gene_16'}} + gff_path = 'TestSegmentingMockGffs/test_double_chromosome.gff' + acc_genes = {'test_double_chromosome': {'acc_ID_1': 'pan_gene_1', + 'acc_ID_2': 'pan_gene_4', + 'acc_ID_3': 'pan_gene_9', + 'acc_ID_4': 'pan_gene_12'}} + complete_genomes = ['test_double_chromosome'] + + # Set up expected outputs + core_gene_pairs = ['pan_gene_2--pan_gene_5', 'pan_gene_5--pan_gene_7', 'pan_gene_2--pan_gene_7', + 'pan_gene_10--pan_gene_13', 'pan_gene_13--pan_gene_15', 'pan_gene_10--pan_gene_15'] + core_gene_pair_distance = {'pan_gene_2--pan_gene_5': 359, + 'pan_gene_5--pan_gene_7': 269, + 'pan_gene_2--pan_gene_7': 478, + 'pan_gene_10--pan_gene_13': 359, + 'pan_gene_13--pan_gene_15': 269, + 'pan_gene_10--pan_gene_15': 478} + + accessory_gene_content = {'pan_gene_2--pan_gene_5': ['pan_gene_4'], + 'pan_gene_5--pan_gene_7': [], + 'pan_gene_2--pan_gene_7': ['pan_gene_1'], + 'pan_gene_10--pan_gene_13': ['pan_gene_12'], + 'pan_gene_13--pan_gene_15': [], + 'pan_gene_10--pan_gene_15': ['pan_gene_9'] + } + + low_freq_gene_content = {'pan_gene_2--pan_gene_5': 
['pan_gene_3'], + 'pan_gene_5--pan_gene_7': ['pan_gene_6'], + 'pan_gene_2--pan_gene_7': ['pan_gene_8'], + 'pan_gene_10--pan_gene_13': ['pan_gene_11'], + 'pan_gene_13--pan_gene_15': ['pan_gene_14'], + 'pan_gene_10--pan_gene_15': ['pan_gene_16'], + } + + master_info = { + 'pan_gene_2--pan_gene_5--test_double_chromosome': ['test_double_chromosome', 'pan_gene_2', 'pan_gene_5', 359, 2, ['pan_gene_4'], ['pan_gene_3'], ], + 'pan_gene_5--pan_gene_7--test_double_chromosome': ['test_double_chromosome', 'pan_gene_5', 'pan_gene_7', 269, 1, [], ['pan_gene_6']], + 'pan_gene_2--pan_gene_7--test_double_chromosome': ['test_double_chromosome', 'pan_gene_2', 'pan_gene_7', 478, 2, ['pan_gene_1'], ['pan_gene_8']], + 'pan_gene_10--pan_gene_13--test_double_chromosome': ['test_double_chromosome', 'pan_gene_10', 'pan_gene_13', 359, 2, ['pan_gene_12'], ['pan_gene_11']], + 'pan_gene_13--pan_gene_15--test_double_chromosome': ['test_double_chromosome', 'pan_gene_13', 'pan_gene_15', 269, 1, [], ['pan_gene_14']], + 'pan_gene_10--pan_gene_15--test_double_chromosome': ['test_double_chromosome', 'pan_gene_10', 'pan_gene_15', 478, 2, ['pan_gene_9'], ['pan_gene_16']] + } + + coreless_contigs = {} + + # Run function + return_core_gene_pairs, return_core_gene_pair_distance, \ + return_accessory_gene_content, return_low_freq_gene_content, \ + return_master_info, return_coreless_contigs = gff_parser.segment_gff_content(gff_generator, core_genes, + low_freq_genes, gff_path, + acc_genes, + complete_genomes) + + # Evaluate + self.assertEqual(core_gene_pairs, return_core_gene_pairs) + self.assertEqual(core_gene_pair_distance, return_core_gene_pair_distance) + self.assertEqual(accessory_gene_content, return_accessory_gene_content) + self.assertEqual(low_freq_gene_content, return_low_freq_gene_content) + self.assertEqual(master_info, return_master_info) + self.assertEqual(coreless_contigs, return_coreless_contigs) + + def test_two_daft_contigs(self): + # Set up input + gff_generator = [ + # Contig 1 
annotations + ['gff_name_contig_1', '.', 'CDS', '90', '180', '.', '.', '.', 'acc_ID_1'], + ['gff_name_contig_1', '.', 'CDS', '179', '250', '.', '.', '.', 'Core_ID_1'], + ['gff_name_contig_1', '.', 'CDS', '300', '425', '.', '.', '.', 'low_freq_1'], + ['gff_name_contig_1', '.', 'CDS', '450', '500', '.', '.', '.', 'acc_ID_2'], + ['gff_name_contig_1', '.', 'CDS', '610', '680', '.', '.', '.', 'Core_ID_2'], + ['gff_name_contig_1', '.', 'CDS', '700', '850', '.', '.', '.', 'low_freq_2'], + ['gff_name_contig_1', '.', 'CDS', '950', '1000', '.', '.', '.', 'Core_ID_3'], + ['gff_name_contig_1', '.', 'CDS', '1100', '1250', '.', '.', '.', 'low_freq_3'], + # Contig 2 annotations + ['gff_name_contig_2', '.', 'CDS', '179', '250', '.', '.', '.', 'Core_ID_4'], + ['gff_name_contig_2', '.', 'CDS', '300', '425', '.', '.', '.', 'low_freq_4'], + ['gff_name_contig_2', '.', 'CDS', '450', '500', '.', '.', '.', 'acc_ID_4'], + ['gff_name_contig_2', '.', 'CDS', '610', '680', '.', '.', '.', 'Core_ID_5'], + ['gff_name_contig_2', '.', 'CDS', '700', '850', '.', '.', '.', 'low_freq_5'], + ['gff_name_contig_2', '.', 'CDS', '950', '1000', '.', '.', '.', 'Core_ID_6'], + ] + core_genes = {'test_double_chromosome': {'Core_ID_1': 'pan_gene_2', + 'Core_ID_2': 'pan_gene_5', + 'Core_ID_3': 'pan_gene_7', + 'Core_ID_4': 'pan_gene_10', + 'Core_ID_5': 'pan_gene_13', + 'Core_ID_6': 'pan_gene_15' + }} + low_freq_genes = {'test_double_chromosome': {'low_freq_1': 'pan_gene_3', + 'low_freq_2': 'pan_gene_6', + 'low_freq_3': 'pan_gene_8', + 'low_freq_4': 'pan_gene_11', + 'low_freq_5': 'pan_gene_14'}} + gff_path = 'TestSegmentingMockGffs/test_double_chromosome.gff' + acc_genes = {'test_double_chromosome': {'acc_ID_1': 'pan_gene_1', + 'acc_ID_2': 'pan_gene_4', + 'acc_ID_4': 'pan_gene_12'}} + complete_genomes = [] + + # Set up expected outputs + core_gene_pairs = ['pan_gene_2--pan_gene_5', 'pan_gene_5--pan_gene_7', + 'Sequence_break--pan_gene_2', 'pan_gene_7--Sequence_break', + 'pan_gene_10--pan_gene_13', 
'pan_gene_13--pan_gene_15', + 'Sequence_break--pan_gene_10', 'pan_gene_15--Sequence_break'] + core_gene_pair_distance = {'pan_gene_2--pan_gene_5': 359, + 'pan_gene_5--pan_gene_7': 269, + 'Sequence_break--pan_gene_2': 178, + 'pan_gene_7--Sequence_break': 299, + 'pan_gene_10--pan_gene_13': 359, + 'pan_gene_13--pan_gene_15': 269, + 'Sequence_break--pan_gene_10': 178, + 'pan_gene_15--Sequence_break': 299} + + accessory_gene_content = {'pan_gene_2--pan_gene_5': ['pan_gene_4'], + 'pan_gene_5--pan_gene_7': [], + 'Sequence_break--pan_gene_2': ['pan_gene_1'], + 'pan_gene_7--Sequence_break': [], + 'pan_gene_10--pan_gene_13': ['pan_gene_12'], + 'pan_gene_13--pan_gene_15': [], + 'Sequence_break--pan_gene_10': [], + 'pan_gene_15--Sequence_break': [] + } + + low_freq_gene_content = {'pan_gene_2--pan_gene_5': ['pan_gene_3'], + 'pan_gene_5--pan_gene_7': ['pan_gene_6'], + 'Sequence_break--pan_gene_2': [], + 'pan_gene_7--Sequence_break': ['pan_gene_8'], + 'pan_gene_10--pan_gene_13': ['pan_gene_11'], + 'pan_gene_13--pan_gene_15': ['pan_gene_14'], + 'Sequence_break--pan_gene_10': [], + 'pan_gene_15--Sequence_break': [] + } + + master_info = { + 'pan_gene_2--pan_gene_5--test_double_chromosome': ['test_double_chromosome', 'pan_gene_2', 'pan_gene_5', + 359, 2, ['pan_gene_4'], ['pan_gene_3'], ], + 'pan_gene_5--pan_gene_7--test_double_chromosome': ['test_double_chromosome', 'pan_gene_5', 'pan_gene_7', + 269, 1, [], ['pan_gene_6']], + 'Sequence_break--pan_gene_2--test_double_chromosome': ['test_double_chromosome', 'Sequence_break', 'pan_gene_2', 178, 1, ['pan_gene_1'], []], + 'pan_gene_7--Sequence_break--test_double_chromosome': ['test_double_chromosome', 'pan_gene_7', 'Sequence_break', 299, 1, [], ['pan_gene_8']], + 'pan_gene_10--pan_gene_13--test_double_chromosome': ['test_double_chromosome', 'pan_gene_10', 'pan_gene_13', + 359, 2, ['pan_gene_12'], ['pan_gene_11']], + 'pan_gene_13--pan_gene_15--test_double_chromosome': ['test_double_chromosome', 'pan_gene_13', 'pan_gene_15', + 269, 1, [], 
['pan_gene_14']], + 'Sequence_break--pan_gene_10--test_double_chromosome': ['test_double_chromosome', 'Sequence_break', 'pan_gene_10', 178, 0, [], []], + 'pan_gene_15--Sequence_break--test_double_chromosome': ['test_double_chromosome', 'pan_gene_15', 'Sequence_break', 299, 0, [], []] + + } + + coreless_contigs = {} + + # Run function + return_core_gene_pairs, return_core_gene_pair_distance, \ + return_accessory_gene_content, return_low_freq_gene_content, \ + return_master_info, return_coreless_contigs = gff_parser.segment_gff_content(gff_generator, core_genes, + low_freq_genes, gff_path, + acc_genes, + complete_genomes) + + # Evaluate + self.assertEqual(core_gene_pairs.sort(), return_core_gene_pairs.sort()) + self.assertEqual(core_gene_pair_distance, return_core_gene_pair_distance) + self.assertEqual(accessory_gene_content, return_accessory_gene_content) + self.assertEqual(low_freq_gene_content, return_low_freq_gene_content) + self.assertEqual(master_info, return_master_info) + self.assertEqual(coreless_contigs, return_coreless_contigs) + + def test_with_coreless_contig_draft_last_contig(self): + # Set up input + gff_generator = [ + # Contig 1 annotations + ['gff_name_contig_1', '.', 'CDS', '90', '180', '.', '.', '.', 'acc_ID_1'], + ['gff_name_contig_1', '.', 'CDS', '179', '250', '.', '.', '.', 'Core_ID_1'], + ['gff_name_contig_1', '.', 'CDS', '300', '425', '.', '.', '.', 'low_freq_1'], + ['gff_name_contig_1', '.', 'CDS', '450', '500', '.', '.', '.', 'acc_ID_2'], + ['gff_name_contig_1', '.', 'CDS', '610', '680', '.', '.', '.', 'Core_ID_2'], + ['gff_name_contig_1', '.', 'CDS', '700', '850', '.', '.', '.', 'low_freq_2'], + ['gff_name_contig_1', '.', 'CDS', '950', '1000', '.', '.', '.', 'Core_ID_3'], + ['gff_name_contig_1', '.', 'CDS', '1100', '1250', '.', '.', '.', 'low_freq_3'], + # Contig 2 annotations + ['gff_name_contig_2', '.', 'CDS', '300', '425', '.', '.', '.', 'low_freq_4'], + ['gff_name_contig_2', '.', 'CDS', '450', '500', '.', '.', '.', 'acc_ID_4'], + 
['gff_name_contig_2', '.', 'CDS', '700', '850', '.', '.', '.', 'low_freq_5'], + ] + core_genes = {'test_double_chromosome': {'Core_ID_1': 'pan_gene_2', + 'Core_ID_2': 'pan_gene_5', + 'Core_ID_3': 'pan_gene_7'}} + low_freq_genes = {'test_double_chromosome': {'low_freq_1': 'pan_gene_3', + 'low_freq_2': 'pan_gene_6', + 'low_freq_3': 'pan_gene_8', + 'low_freq_4': 'pan_gene_11', + 'low_freq_5': 'pan_gene_14'}} + gff_path = 'TestSegmentingMockGffs/test_double_chromosome.gff' + acc_genes = {'test_double_chromosome': {'acc_ID_1': 'pan_gene_1', + 'acc_ID_2': 'pan_gene_4', + 'acc_ID_4': 'pan_gene_12'}} + complete_genomes = [] + + # Set up expected outputs + core_gene_pairs = ['pan_gene_2--pan_gene_5', 'pan_gene_5--pan_gene_7', + 'Sequence_break--pan_gene_2', 'pan_gene_7--Sequence_break'] + core_gene_pair_distance = {'pan_gene_2--pan_gene_5': 359, + 'pan_gene_5--pan_gene_7': 269, + 'Sequence_break--pan_gene_2': 178, + 'pan_gene_7--Sequence_break': 299} + + accessory_gene_content = {'pan_gene_2--pan_gene_5': ['pan_gene_4'], + 'pan_gene_5--pan_gene_7': [], + 'Sequence_break--pan_gene_2': ['pan_gene_1'], + 'pan_gene_7--Sequence_break': [] + } + + low_freq_gene_content = {'pan_gene_2--pan_gene_5': ['pan_gene_3'], + 'pan_gene_5--pan_gene_7': ['pan_gene_6'], + 'Sequence_break--pan_gene_2': [], + 'pan_gene_7--Sequence_break': ['pan_gene_8'] + } + + master_info = { + 'pan_gene_2--pan_gene_5--test_double_chromosome': ['test_double_chromosome', 'pan_gene_2', 'pan_gene_5', + 359, 2, ['pan_gene_4'], ['pan_gene_3'], ], + 'pan_gene_5--pan_gene_7--test_double_chromosome': ['test_double_chromosome', 'pan_gene_5', 'pan_gene_7', + 269, 1, [], ['pan_gene_6']], + 'Sequence_break--pan_gene_2--test_double_chromosome': ['test_double_chromosome', 'Sequence_break', + 'pan_gene_2', 178, 1, ['pan_gene_1'], []], + 'pan_gene_7--Sequence_break--test_double_chromosome': ['test_double_chromosome', 'pan_gene_7', + 'Sequence_break', 299, 1, [], ['pan_gene_8']] + } + + coreless_contigs = 
{'test_double_chromosome--gff_name_contig_2': [['pan_gene_12'], ['pan_gene_11', 'pan_gene_14']]} + + # Run function + return_core_gene_pairs, return_core_gene_pair_distance, \ + return_accessory_gene_content, return_low_freq_gene_content, \ + return_master_info, return_coreless_contigs = gff_parser.segment_gff_content(gff_generator, core_genes, + low_freq_genes, gff_path, + acc_genes, + complete_genomes) + + # Evaluate + self.assertEqual(core_gene_pairs.sort(), return_core_gene_pairs.sort()) + self.assertEqual(core_gene_pair_distance, return_core_gene_pair_distance) + self.assertEqual(accessory_gene_content, return_accessory_gene_content) + self.assertEqual(low_freq_gene_content, return_low_freq_gene_content) + self.assertEqual(master_info, return_master_info) + self.assertEqual(coreless_contigs, return_coreless_contigs) + + def test_with_coreless_contig_complete_last_contig(self): + # Set up input + gff_generator = [ + # Contig 1 annotations + ['gff_name_contig_1', '.', 'CDS', '90', '180', '.', '.', '.', 'acc_ID_1'], + ['gff_name_contig_1', '.', 'CDS', '179', '250', '.', '.', '.', 'Core_ID_1'], + ['gff_name_contig_1', '.', 'CDS', '300', '425', '.', '.', '.', 'low_freq_1'], + ['gff_name_contig_1', '.', 'CDS', '450', '500', '.', '.', '.', 'acc_ID_2'], + ['gff_name_contig_1', '.', 'CDS', '610', '680', '.', '.', '.', 'Core_ID_2'], + ['gff_name_contig_1', '.', 'CDS', '700', '850', '.', '.', '.', 'low_freq_2'], + ['gff_name_contig_1', '.', 'CDS', '950', '1000', '.', '.', '.', 'Core_ID_3'], + ['gff_name_contig_1', '.', 'CDS', '1100', '1250', '.', '.', '.', 'low_freq_3'], + # Contig 2 annotations + ['gff_name_contig_2', '.', 'CDS', '300', '425', '.', '.', '.', 'low_freq_4'], + ['gff_name_contig_2', '.', 'CDS', '450', '500', '.', '.', '.', 'acc_ID_4'], + ['gff_name_contig_2', '.', 'CDS', '700', '850', '.', '.', '.', 'low_freq_5'], + ] + core_genes = {'test_double_chromosome': {'Core_ID_1': 'pan_gene_2', + 'Core_ID_2': 'pan_gene_5', + 'Core_ID_3': 'pan_gene_7'}} + 
low_freq_genes = {'test_double_chromosome': {'low_freq_1': 'pan_gene_3', + 'low_freq_2': 'pan_gene_6', + 'low_freq_3': 'pan_gene_8', + 'low_freq_4': 'pan_gene_11', + 'low_freq_5': 'pan_gene_14'}} + gff_path = 'TestSegmentingMockGffs/test_double_chromosome.gff' + acc_genes = {'test_double_chromosome': {'acc_ID_1': 'pan_gene_1', + 'acc_ID_2': 'pan_gene_4', + 'acc_ID_4': 'pan_gene_12'}} + complete_genomes = ['test_double_chromosome'] + + # Set up expected outputs + core_gene_pairs = ['pan_gene_2--pan_gene_5', 'pan_gene_5--pan_gene_7', + 'pan_gene_2--pan_gene_7'] + core_gene_pair_distance = {'pan_gene_2--pan_gene_5': 359, + 'pan_gene_5--pan_gene_7': 269, + 'pan_gene_2--pan_gene_7': 478} + + accessory_gene_content = {'pan_gene_2--pan_gene_5': ['pan_gene_4'], + 'pan_gene_5--pan_gene_7': [], + 'pan_gene_2--pan_gene_7': ['pan_gene_1'] + } + + low_freq_gene_content = {'pan_gene_2--pan_gene_5': ['pan_gene_3'], + 'pan_gene_5--pan_gene_7': ['pan_gene_6'], + 'pan_gene_2--pan_gene_7': ['pan_gene_8'] + } + + master_info = { + 'pan_gene_2--pan_gene_5--test_double_chromosome': ['test_double_chromosome', 'pan_gene_2', 'pan_gene_5', + 359, 2, ['pan_gene_4'], ['pan_gene_3'], ], + 'pan_gene_5--pan_gene_7--test_double_chromosome': ['test_double_chromosome', 'pan_gene_5', 'pan_gene_7', + 269, 1, [], ['pan_gene_6']], + 'pan_gene_2--pan_gene_7--test_double_chromosome': ['test_double_chromosome', 'pan_gene_2', 'pan_gene_7', 478, 2, ['pan_gene_1'], ['pan_gene_8']] + } + + coreless_contigs = { + 'test_double_chromosome--gff_name_contig_2': [['pan_gene_12'], ['pan_gene_11', 'pan_gene_14']]} + + # Run function + return_core_gene_pairs, return_core_gene_pair_distance, \ + return_accessory_gene_content, return_low_freq_gene_content, \ + return_master_info, return_coreless_contigs = gff_parser.segment_gff_content(gff_generator, core_genes, + low_freq_genes, gff_path, + acc_genes, + complete_genomes) + + # Evaluate + self.assertEqual(core_gene_pairs.sort(), return_core_gene_pairs.sort()) + 
self.assertEqual(core_gene_pair_distance, return_core_gene_pair_distance) + self.assertEqual(accessory_gene_content, return_accessory_gene_content) + self.assertEqual(low_freq_gene_content, return_low_freq_gene_content) + self.assertEqual(master_info, return_master_info) + self.assertEqual(coreless_contigs, return_coreless_contigs) + + def test_with_coreless_contig_draft_first_contig(self): + # Set up input + gff_generator = [ + # Contig 1 annotations + ['gff_name_contig_1', '.', 'CDS', '300', '425', '.', '.', '.', 'low_freq_4'], + ['gff_name_contig_1', '.', 'CDS', '450', '500', '.', '.', '.', 'acc_ID_4'], + ['gff_name_contig_1', '.', 'CDS', '700', '850', '.', '.', '.', 'low_freq_5'], + # Contig 2 annotations + ['gff_name_contig_2', '.', 'CDS', '90', '180', '.', '.', '.', 'acc_ID_1'], + ['gff_name_contig_2', '.', 'CDS', '179', '250', '.', '.', '.', 'Core_ID_1'], + ['gff_name_contig_2', '.', 'CDS', '300', '425', '.', '.', '.', 'low_freq_1'], + ['gff_name_contig_2', '.', 'CDS', '450', '500', '.', '.', '.', 'acc_ID_2'], + ['gff_name_contig_2', '.', 'CDS', '610', '680', '.', '.', '.', 'Core_ID_2'], + ['gff_name_contig_2', '.', 'CDS', '700', '850', '.', '.', '.', 'low_freq_2'], + ['gff_name_contig_2', '.', 'CDS', '950', '1000', '.', '.', '.', 'Core_ID_3'], + ['gff_name_contig_2', '.', 'CDS', '1100', '1250', '.', '.', '.', 'low_freq_3'] + ] + core_genes = {'test_double_chromosome': {'Core_ID_1': 'pan_gene_2', + 'Core_ID_2': 'pan_gene_5', + 'Core_ID_3': 'pan_gene_7'}} + low_freq_genes = {'test_double_chromosome': {'low_freq_1': 'pan_gene_3', + 'low_freq_2': 'pan_gene_6', + 'low_freq_3': 'pan_gene_8', + 'low_freq_4': 'pan_gene_11', + 'low_freq_5': 'pan_gene_14'}} + gff_path = 'TestSegmentingMockGffs/test_double_chromosome.gff' + acc_genes = {'test_double_chromosome': {'acc_ID_1': 'pan_gene_1', + 'acc_ID_2': 'pan_gene_4', + 'acc_ID_4': 'pan_gene_12'}} + complete_genomes = [] + + # Set up expected outputs + core_gene_pairs = ['pan_gene_2--pan_gene_5', 
'pan_gene_5--pan_gene_7', + 'Sequence_break--pan_gene_2', 'pan_gene_7--Sequence_break'] + core_gene_pair_distance = {'pan_gene_2--pan_gene_5': 359, + 'pan_gene_5--pan_gene_7': 269, + 'Sequence_break--pan_gene_2': 178, + 'pan_gene_7--Sequence_break': 299} + + accessory_gene_content = {'pan_gene_2--pan_gene_5': ['pan_gene_4'], + 'pan_gene_5--pan_gene_7': [], + 'Sequence_break--pan_gene_2': ['pan_gene_1'], + 'pan_gene_7--Sequence_break': [] + } + + low_freq_gene_content = {'pan_gene_2--pan_gene_5': ['pan_gene_3'], + 'pan_gene_5--pan_gene_7': ['pan_gene_6'], + 'Sequence_break--pan_gene_2': [], + 'pan_gene_7--Sequence_break': ['pan_gene_8'] + } + + master_info = { + 'pan_gene_2--pan_gene_5--test_double_chromosome': ['test_double_chromosome', 'pan_gene_2', 'pan_gene_5', + 359, 2, ['pan_gene_4'], ['pan_gene_3'], ], + 'pan_gene_5--pan_gene_7--test_double_chromosome': ['test_double_chromosome', 'pan_gene_5', 'pan_gene_7', + 269, 1, [], ['pan_gene_6']], + 'Sequence_break--pan_gene_2--test_double_chromosome': ['test_double_chromosome', 'Sequence_break', + 'pan_gene_2', 178, 1, ['pan_gene_1'], []], + 'pan_gene_7--Sequence_break--test_double_chromosome': ['test_double_chromosome', 'pan_gene_7', + 'Sequence_break', 299, 1, [], ['pan_gene_8']] + } + + coreless_contigs = { + 'test_double_chromosome--gff_name_contig_1': [['pan_gene_12'], ['pan_gene_11', 'pan_gene_14']]} + + # Run function + return_core_gene_pairs, return_core_gene_pair_distance, \ + return_accessory_gene_content, return_low_freq_gene_content, \ + return_master_info, return_coreless_contigs = gff_parser.segment_gff_content(gff_generator, core_genes, + low_freq_genes, gff_path, + acc_genes, + complete_genomes) + + # Evaluate + self.assertEqual(core_gene_pairs.sort(), return_core_gene_pairs.sort()) + self.assertEqual(core_gene_pair_distance, return_core_gene_pair_distance) + self.assertEqual(accessory_gene_content, return_accessory_gene_content) + self.assertEqual(low_freq_gene_content, return_low_freq_gene_content) + 
self.assertEqual(master_info, return_master_info) + self.assertEqual(coreless_contigs, return_coreless_contigs) + + def test_with_coreless_contig_middle_contig(self): + # Set up input + gff_generator = [ + # Contig 1 annotations + ['gff_name_contig_1', '.', 'CDS', '90', '180', '.', '.', '.', 'acc_ID_1'], + ['gff_name_contig_1', '.', 'CDS', '179', '250', '.', '.', '.', 'Core_ID_1'], + ['gff_name_contig_1', '.', 'CDS', '300', '425', '.', '.', '.', 'low_freq_1'], + ['gff_name_contig_1', '.', 'CDS', '450', '500', '.', '.', '.', 'acc_ID_2'], + ['gff_name_contig_1', '.', 'CDS', '610', '680', '.', '.', '.', 'Core_ID_2'], + ['gff_name_contig_1', '.', 'CDS', '700', '850', '.', '.', '.', 'low_freq_2'], + ['gff_name_contig_1', '.', 'CDS', '950', '1000', '.', '.', '.', 'Core_ID_3'], + ['gff_name_contig_1', '.', 'CDS', '1100', '1250', '.', '.', '.', 'low_freq_3'], + # Contig 2 annotations + ['gff_name_contig_2', '.', 'CDS', '300', '425', '.', '.', '.', 'low_freq_4'], + ['gff_name_contig_2', '.', 'CDS', '450', '500', '.', '.', '.', 'acc_ID_4'], + ['gff_name_contig_2', '.', 'CDS', '700', '850', '.', '.', '.', 'low_freq_5'], + # Contig 3 annotations + ['gff_name_contig_3', '.', 'CDS', '179', '250', '.', '.', '.', 'Core_ID_4'], + ['gff_name_contig_3', '.', 'CDS', '610', '680', '.', '.', '.', 'Core_ID_5'] + ] + core_genes = {'test_triple_chromosome': {'Core_ID_1': 'pan_gene_2', + 'Core_ID_2': 'pan_gene_5', + 'Core_ID_3': 'pan_gene_7', + 'Core_ID_4': 'pan_gene_10', + 'Core_ID_5': 'pan_gene_13' + }} + low_freq_genes = {'test_triple_chromosome': {'low_freq_1': 'pan_gene_3', + 'low_freq_2': 'pan_gene_6', + 'low_freq_3': 'pan_gene_8', + 'low_freq_4': 'pan_gene_11', + 'low_freq_5': 'pan_gene_14'}} + gff_path = 'TestSegmentingMockGffs/test_triple_chromosome.gff' + acc_genes = {'test_triple_chromosome': {'acc_ID_1': 'pan_gene_1', + 'acc_ID_2': 'pan_gene_4', + 'acc_ID_4': 'pan_gene_12'}} + complete_genomes = [] + + # Construct expected results + core_gene_pairs = ['pan_gene_2--pan_gene_5', 
'pan_gene_5--pan_gene_7', + 'Sequence_break--pan_gene_2', 'pan_gene_7--Sequence_break', + 'Sequence_break--pan_gene_10', 'pan_gene_10--pan_gene_13', 'pan_gene_13--Sequence_break'] + core_gene_pair_distance = {'pan_gene_2--pan_gene_5': 359, + 'pan_gene_5--pan_gene_7': 269, + 'Sequence_break--pan_gene_2': 178, + 'pan_gene_7--Sequence_break': 299, + 'Sequence_break--pan_gene_10': 178, + 'pan_gene_10--pan_gene_13': 359, + 'pan_gene_13--Sequence_break': 619} + + accessory_gene_content = {'pan_gene_2--pan_gene_5': ['pan_gene_4'], + 'pan_gene_5--pan_gene_7': [], + 'Sequence_break--pan_gene_2': ['pan_gene_1'], + 'pan_gene_7--Sequence_break': [], + 'Sequence_break--pan_gene_10': [], + 'pan_gene_10--pan_gene_13': [], + 'pan_gene_13--Sequence_break': [] + } + + low_freq_gene_content = {'pan_gene_2--pan_gene_5': ['pan_gene_3'], + 'pan_gene_5--pan_gene_7': ['pan_gene_6'], + 'Sequence_break--pan_gene_2': [], + 'pan_gene_7--Sequence_break': ['pan_gene_8'], + 'Sequence_break--pan_gene_10': [], + 'pan_gene_10--pan_gene_13': [], + 'pan_gene_13--Sequence_break': [] + } + + master_info = { + 'pan_gene_2--pan_gene_5--test_triple_chromosome': ['test_triple_chromosome', 'pan_gene_2', 'pan_gene_5', + 359, 2, ['pan_gene_4'], ['pan_gene_3'], ], + 'pan_gene_5--pan_gene_7--test_triple_chromosome': ['test_triple_chromosome', 'pan_gene_5', 'pan_gene_7', + 269, 1, [], ['pan_gene_6']], + 'Sequence_break--pan_gene_2--test_triple_chromosome': ['test_triple_chromosome', 'Sequence_break', + 'pan_gene_2', 178, 1, ['pan_gene_1'], []], + 'pan_gene_7--Sequence_break--test_triple_chromosome': ['test_triple_chromosome', 'pan_gene_7', + 'Sequence_break', 299, 1, [], ['pan_gene_8']], + 'Sequence_break--pan_gene_10--test_triple_chromosome': ['test_triple_chromosome', 'Sequence_break', 'pan_gene_10', 178, 0, [], []], + 'pan_gene_10--pan_gene_13--test_triple_chromosome': ['test_triple_chromosome', 'pan_gene_10', 'pan_gene_13', 359, 0, [], []], + 'pan_gene_13--Sequence_break--test_triple_chromosome': 
['test_triple_chromosome', 'pan_gene_13', 'Sequence_break', 619, 0, [], []] + } + + coreless_contigs = { + 'test_triple_chromosome--gff_name_contig_2': [['pan_gene_12'], ['pan_gene_11', 'pan_gene_14']]} + + # Run function + return_core_gene_pairs, return_core_gene_pair_distance, \ + return_accessory_gene_content, return_low_freq_gene_content, \ + return_master_info, return_coreless_contigs = gff_parser.segment_gff_content(gff_generator, core_genes, + low_freq_genes, gff_path, + acc_genes, + complete_genomes) + + # Evaluate + self.assertEqual(core_gene_pairs.sort(), return_core_gene_pairs.sort()) + self.assertEqual(core_gene_pair_distance, return_core_gene_pair_distance) + self.assertEqual(accessory_gene_content, return_accessory_gene_content) + self.assertEqual(low_freq_gene_content, return_low_freq_gene_content) + self.assertEqual(master_info, return_master_info) + self.assertEqual(coreless_contigs, return_coreless_contigs) + + def test_single_core_on_contig(self): + # Set up input + gff_generator = [ + # Contig 1 annotations + ['gff_name_contig_1', '.', 'CDS', '90', '180', '.', '.', '.', 'acc_ID_1'], + ['gff_name_contig_1', '.', 'CDS', '179', '250', '.', '.', '.', 'Core_ID_1'], + ['gff_name_contig_1', '.', 'CDS', '300', '425', '.', '.', '.', 'low_freq_1'], + ['gff_name_contig_1', '.', 'CDS', '450', '500', '.', '.', '.', 'acc_ID_2'], + ['gff_name_contig_1', '.', 'CDS', '610', '680', '.', '.', '.', 'Core_ID_2'], + ['gff_name_contig_1', '.', 'CDS', '700', '850', '.', '.', '.', 'low_freq_2'], + ['gff_name_contig_1', '.', 'CDS', '950', '1000', '.', '.', '.', 'Core_ID_3'], + ['gff_name_contig_1', '.', 'CDS', '1100', '1250', '.', '.', '.', 'low_freq_3'], + # Contig 2 annotations + ['gff_name_contig_2', '.', 'CDS', '300', '425', '.', '.', '.', 'low_freq_4'], + ['gff_name_contig_2', '.', 'CDS', '450', '500', '.', '.', '.', 'acc_ID_4'], + ['gff_name_contig_2', '.', 'CDS', '700', '850', '.', '.', '.', 'low_freq_5'], + ['gff_name_contig_2', '.', 'CDS', '950', '1000', '.', 
'.', '.', 'Core_ID_4'] + ] + core_genes = {'test_double_chromosome': {'Core_ID_1': 'pan_gene_2', + 'Core_ID_2': 'pan_gene_5', + 'Core_ID_3': 'pan_gene_7', + 'Core_ID_4': 'pan_gene_15'}} + low_freq_genes = {'test_double_chromosome': {'low_freq_1': 'pan_gene_3', + 'low_freq_2': 'pan_gene_6', + 'low_freq_3': 'pan_gene_8', + 'low_freq_4': 'pan_gene_11', + 'low_freq_5': 'pan_gene_14'}} + gff_path = 'TestSegmentingMockGffs/test_double_chromosome.gff' + acc_genes = {'test_double_chromosome': {'acc_ID_1': 'pan_gene_1', + 'acc_ID_2': 'pan_gene_4', + 'acc_ID_4': 'pan_gene_12'}} + complete_genomes = ['test_double_chromosome'] + + # Set up expected outputs + core_gene_pairs = ['pan_gene_2--pan_gene_5', 'pan_gene_5--pan_gene_7', + 'pan_gene_2--pan_gene_7', 'pan_gene_15--pan_gene_15'] + core_gene_pair_distance = {'pan_gene_2--pan_gene_5': 359, + 'pan_gene_5--pan_gene_7': 269, + 'pan_gene_2--pan_gene_7': 478, + 'pan_gene_15--pan_gene_15': 1249} + + accessory_gene_content = {'pan_gene_2--pan_gene_5': ['pan_gene_4'], + 'pan_gene_5--pan_gene_7': [], + 'pan_gene_2--pan_gene_7': ['pan_gene_1'], + 'pan_gene_15--pan_gene_15': ['pan_gene_12'] + } + + low_freq_gene_content = {'pan_gene_2--pan_gene_5': ['pan_gene_3'], + 'pan_gene_5--pan_gene_7': ['pan_gene_6'], + 'pan_gene_2--pan_gene_7': ['pan_gene_8'], + 'pan_gene_15--pan_gene_15': ['pan_gene_11', 'pan_gene_14'] + } + + master_info = { + 'pan_gene_2--pan_gene_5--test_double_chromosome': ['test_double_chromosome', 'pan_gene_2', 'pan_gene_5', + 359, 2, ['pan_gene_4'], ['pan_gene_3'], ], + 'pan_gene_5--pan_gene_7--test_double_chromosome': ['test_double_chromosome', 'pan_gene_5', 'pan_gene_7', + 269, 1, [], ['pan_gene_6']], + 'pan_gene_2--pan_gene_7--test_double_chromosome': ['test_double_chromosome', 'pan_gene_2', 'pan_gene_7', + 478, 2, ['pan_gene_1'], ['pan_gene_8']], + 'pan_gene_15--pan_gene_15--test_double_chromosome': ['test_double_chromosome', 'pan_gene_15', 'pan_gene_15', + 1249, 3, ['pan_gene_12'], ['pan_gene_11', 'pan_gene_14']] + 
} + + coreless_contigs = {} + + # Run function + return_core_gene_pairs, return_core_gene_pair_distance, \ + return_accessory_gene_content, return_low_freq_gene_content, \ + return_master_info, return_coreless_contigs = gff_parser.segment_gff_content(gff_generator, core_genes, + low_freq_genes, gff_path, + acc_genes, + complete_genomes) + + # Evaluate + self.assertEqual(core_gene_pairs.sort(), return_core_gene_pairs.sort()) + self.assertEqual(core_gene_pair_distance, return_core_gene_pair_distance) + self.assertEqual(accessory_gene_content, return_accessory_gene_content) + self.assertEqual(low_freq_gene_content, return_low_freq_gene_content) + self.assertEqual(master_info, return_master_info) + self.assertEqual(coreless_contigs, return_coreless_contigs) + + def test_segmentation_of_fragmented_core_gene(self): + # Set up input + gff_generator = [['gff_name_contig_1', '.', 'CDS', '90', '180', '.', '.', '.', 'acc_ID_1'], + ['gff_name_contig_1', '.', 'CDS', '179', '250', '.', '.', '.', 'Core_ID_1_1'], + ['gff_name_contig_1', '.', 'CDS', '251', '270', '.', '.', '.', 'Core_ID_1_2'], + ['gff_name_contig_1', '.', 'CDS', '300', '425', '.', '.', '.', 'low_freq_1'], + ['gff_name_contig_1', '.', 'CDS', '450', '500', '.', '.', '.', 'acc_ID_2'], + ['gff_name_contig_1', '.', 'CDS', '610', '680', '.', '.', '.', 'Core_ID_2'], + ['gff_name_contig_1', '.', 'CDS', '700', '850', '.', '.', '.', 'low_freq_2'], + ['gff_name_contig_1', '.', 'CDS', '950', '1000', '.', '.', '.', 'Core_ID_3'], + ['gff_name_contig_1', '.', 'CDS', '1100', '1250', '.', '.', '.', 'low_freq_3']] + core_genes = {'test_single_chromosome': {'Core_ID_1_1': 'pan_gene_2', + 'Core_ID_1_2': 'pan_gene_2', + 'Core_ID_2': 'pan_gene_5', + 'Core_ID_3': 'pan_gene_7'}} + low_freq_genes = {'test_single_chromosome': {'low_freq_1': 'pan_gene_3', + 'low_freq_2': 'pan_gene_6', + 'low_freq_3': 'pan_gene_8'}} + gff_path = 'TestSegmentingMockGffs/test_single_chromosome.gff' + acc_genes = {'test_single_chromosome': {'acc_ID_1': 
'pan_gene_1', + 'acc_ID_2': 'pan_gene_4'}} + complete_genomes = ['test_single_chromosome'] + + # Set up expected outputs + core_gene_pairs = ['pan_gene_2--pan_gene_5', 'pan_gene_5--pan_gene_7', 'pan_gene_2--pan_gene_7'] + core_gene_pair_distance = {'pan_gene_2--pan_gene_5': 339, + 'pan_gene_5--pan_gene_7': 269, + 'pan_gene_2--pan_gene_7': 478} + + accessory_gene_content = {'pan_gene_2--pan_gene_5': ['pan_gene_4'], + 'pan_gene_5--pan_gene_7': [], + 'pan_gene_2--pan_gene_7': ['pan_gene_1']} + + low_freq_gene_content = {'pan_gene_2--pan_gene_5': ['pan_gene_3'], + 'pan_gene_5--pan_gene_7': ['pan_gene_6'], + 'pan_gene_2--pan_gene_7': ['pan_gene_8']} + + master_info = { + 'pan_gene_2--pan_gene_5--test_single_chromosome': ['test_single_chromosome', 'pan_gene_2', 'pan_gene_5', + 339, 2, ['pan_gene_4'], ['pan_gene_3'], ], + 'pan_gene_5--pan_gene_7--test_single_chromosome': ['test_single_chromosome', 'pan_gene_5', 'pan_gene_7', + 269, 1, [], ['pan_gene_6']], + 'pan_gene_2--pan_gene_7--test_single_chromosome': ['test_single_chromosome', 'pan_gene_2', 'pan_gene_7', + 478, 2, ['pan_gene_1'], ['pan_gene_8']]} + + coreless_contigs = {} + + # Run function + return_core_gene_pairs, return_core_gene_pair_distance, \ + return_accessory_gene_content, return_low_freq_gene_content, \ + return_master_info, return_coreless_contigs = gff_parser.segment_gff_content(gff_generator, core_genes, + low_freq_genes, gff_path, + acc_genes, + complete_genomes) + + # Evaluate + self.assertEqual(core_gene_pairs, return_core_gene_pairs) + self.assertEqual(core_gene_pair_distance, return_core_gene_pair_distance) + self.assertEqual(accessory_gene_content, return_accessory_gene_content) + self.assertEqual(low_freq_gene_content, return_low_freq_gene_content) + self.assertEqual(master_info, return_master_info) + self.assertEqual(coreless_contigs, return_coreless_contigs) + + def test_segmentation_of_fragmented_core_gene_lone_contig(self): + # Set up input + gff_generator = [ + # Contig 1 annotations + 
['gff_name_contig_1', '.', 'CDS', '90', '180', '.', '.', '.', 'acc_ID_1'], + ['gff_name_contig_1', '.', 'CDS', '179', '250', '.', '.', '.', 'Core_ID_1'], + ['gff_name_contig_1', '.', 'CDS', '300', '425', '.', '.', '.', 'low_freq_1'], + ['gff_name_contig_1', '.', 'CDS', '450', '500', '.', '.', '.', 'acc_ID_2'], + ['gff_name_contig_1', '.', 'CDS', '610', '680', '.', '.', '.', 'Core_ID_2'], + ['gff_name_contig_1', '.', 'CDS', '700', '850', '.', '.', '.', 'low_freq_2'], + ['gff_name_contig_1', '.', 'CDS', '950', '1000', '.', '.', '.', 'Core_ID_3'], + ['gff_name_contig_1', '.', 'CDS', '1100', '1250', '.', '.', '.', 'low_freq_3'], + # Contig 2 annotations + ['gff_name_contig_2', '.', 'CDS', '300', '425', '.', '.', '.', 'low_freq_4'], + ['gff_name_contig_2', '.', 'CDS', '450', '500', '.', '.', '.', 'acc_ID_4'], + ['gff_name_contig_2', '.', 'CDS', '700', '850', '.', '.', '.', 'low_freq_5'], + ['gff_name_contig_2', '.', 'CDS', '950', '1000', '.', '.', '.', 'Core_ID_4_1'], + ['gff_name_contig_2', '.', 'CDS', '10020', '1030', '.', '.', '.', 'Core_ID_4_2'] + ] + core_genes = {'test_double_chromosome': {'Core_ID_1': 'pan_gene_2', + 'Core_ID_2': 'pan_gene_5', + 'Core_ID_3': 'pan_gene_7', + 'Core_ID_4_1': 'pan_gene_15', + 'Core_ID_4_2': 'pan_gene_15'}} + low_freq_genes = {'test_double_chromosome': {'low_freq_1': 'pan_gene_3', + 'low_freq_2': 'pan_gene_6', + 'low_freq_3': 'pan_gene_8', + 'low_freq_4': 'pan_gene_11', + 'low_freq_5': 'pan_gene_14'}} + gff_path = 'TestSegmentingMockGffs/test_double_chromosome.gff' + acc_genes = {'test_double_chromosome': {'acc_ID_1': 'pan_gene_1', + 'acc_ID_2': 'pan_gene_4', + 'acc_ID_4': 'pan_gene_12'}} + complete_genomes = ['test_double_chromosome'] + + # Set up expected outputs + core_gene_pairs = ['pan_gene_2--pan_gene_5', 'pan_gene_5--pan_gene_7', + 'pan_gene_2--pan_gene_7', 'pan_gene_15--pan_gene_15'] + core_gene_pair_distance = {'pan_gene_2--pan_gene_5': 359, + 'pan_gene_5--pan_gene_7': 269, + 'pan_gene_2--pan_gene_7': 478, + 
'pan_gene_15--pan_gene_15': 1219} + + accessory_gene_content = {'pan_gene_2--pan_gene_5': ['pan_gene_4'], + 'pan_gene_5--pan_gene_7': [], + 'pan_gene_2--pan_gene_7': ['pan_gene_1'], + 'pan_gene_15--pan_gene_15': ['pan_gene_12'] + } + + low_freq_gene_content = {'pan_gene_2--pan_gene_5': ['pan_gene_3'], + 'pan_gene_5--pan_gene_7': ['pan_gene_6'], + 'pan_gene_2--pan_gene_7': ['pan_gene_8'], + 'pan_gene_15--pan_gene_15': ['pan_gene_11', 'pan_gene_14'] + } + + master_info = { + 'pan_gene_2--pan_gene_5--test_double_chromosome': ['test_double_chromosome', 'pan_gene_2', 'pan_gene_5', + 359, 2, ['pan_gene_4'], ['pan_gene_3'], ], + 'pan_gene_5--pan_gene_7--test_double_chromosome': ['test_double_chromosome', 'pan_gene_5', 'pan_gene_7', + 269, 1, [], ['pan_gene_6']], + 'pan_gene_2--pan_gene_7--test_double_chromosome': ['test_double_chromosome', 'pan_gene_2', 'pan_gene_7', + 478, 2, ['pan_gene_1'], ['pan_gene_8']], + 'pan_gene_15--pan_gene_15--test_double_chromosome': ['test_double_chromosome', 'pan_gene_15', 'pan_gene_15', + 1219, 3, ['pan_gene_12'], + ['pan_gene_11', 'pan_gene_14']] + } + + coreless_contigs = {} + + # Run function + return_core_gene_pairs, return_core_gene_pair_distance, \ + return_accessory_gene_content, return_low_freq_gene_content, \ + return_master_info, return_coreless_contigs = gff_parser.segment_gff_content(gff_generator, core_genes, + low_freq_genes, gff_path, + acc_genes, + complete_genomes) + + # Evaluate + self.assertEqual(core_gene_pairs.sort(), return_core_gene_pairs.sort()) + self.assertEqual(core_gene_pair_distance, return_core_gene_pair_distance) + self.assertEqual(accessory_gene_content, return_accessory_gene_content) + self.assertEqual(low_freq_gene_content, return_low_freq_gene_content) + self.assertEqual(master_info, return_master_info) + self.assertEqual(coreless_contigs, return_coreless_contigs) + + def test_segmentation_of_fragmented_acc_gene(self): + # Set up input + gff_generator = [['gff_name_contig_1', '.', 'CDS', '90', '180', '.', 
'.', '.', 'acc_ID_1'], + ['gff_name_contig_1', '.', 'CDS', '179', '270', '.', '.', '.', 'Core_ID_1'], + ['gff_name_contig_1', '.', 'CDS', '300', '425', '.', '.', '.', 'low_freq_1'], + ['gff_name_contig_1', '.', 'CDS', '450', '460', '.', '.', '.', 'acc_ID_2_1'], + ['gff_name_contig_1', '.', 'CDS', '470', '500', '.', '.', '.', 'acc_ID_2_2'], + ['gff_name_contig_1', '.', 'CDS', '610', '680', '.', '.', '.', 'Core_ID_2'], + ['gff_name_contig_1', '.', 'CDS', '700', '850', '.', '.', '.', 'low_freq_2'], + ['gff_name_contig_1', '.', 'CDS', '950', '1000', '.', '.', '.', 'Core_ID_3'], + ['gff_name_contig_1', '.', 'CDS', '1100', '1250', '.', '.', '.', 'low_freq_3']] + core_genes = {'test_single_chromosome': {'Core_ID_1': 'pan_gene_2', + 'Core_ID_2': 'pan_gene_5', + 'Core_ID_3': 'pan_gene_7'}} + low_freq_genes = {'test_single_chromosome': {'low_freq_1': 'pan_gene_3', + 'low_freq_2': 'pan_gene_6', + 'low_freq_3': 'pan_gene_8'}} + gff_path = 'TestSegmentingMockGffs/test_single_chromosome.gff' + acc_genes = {'test_single_chromosome': {'acc_ID_1': 'pan_gene_1', + 'acc_ID_2_1': 'pan_gene_4', + 'acc_ID_2_2': 'pan_gene_4'}} + complete_genomes = ['test_single_chromosome'] + + # Set up expected outputs + core_gene_pairs = ['pan_gene_2--pan_gene_5', 'pan_gene_5--pan_gene_7', 'pan_gene_2--pan_gene_7'] + core_gene_pair_distance = {'pan_gene_2--pan_gene_5': 339, + 'pan_gene_5--pan_gene_7': 269, + 'pan_gene_2--pan_gene_7': 478} + + accessory_gene_content = {'pan_gene_2--pan_gene_5': ['pan_gene_4'], + 'pan_gene_5--pan_gene_7': [], + 'pan_gene_2--pan_gene_7': ['pan_gene_1']} + + low_freq_gene_content = {'pan_gene_2--pan_gene_5': ['pan_gene_3'], + 'pan_gene_5--pan_gene_7': ['pan_gene_6'], + 'pan_gene_2--pan_gene_7': ['pan_gene_8']} + + master_info = { + 'pan_gene_2--pan_gene_5--test_single_chromosome': ['test_single_chromosome', 'pan_gene_2', 'pan_gene_5', + 339, 2, ['pan_gene_4'], ['pan_gene_3'], ], + 'pan_gene_5--pan_gene_7--test_single_chromosome': ['test_single_chromosome', 'pan_gene_5', 
'pan_gene_7', + 269, 1, [], ['pan_gene_6']], + 'pan_gene_2--pan_gene_7--test_single_chromosome': ['test_single_chromosome', 'pan_gene_2', 'pan_gene_7', + 478, 2, ['pan_gene_1'], ['pan_gene_8']]} + + coreless_contigs = {} + + # Run function + return_core_gene_pairs, return_core_gene_pair_distance, \ + return_accessory_gene_content, return_low_freq_gene_content, \ + return_master_info, return_coreless_contigs = gff_parser.segment_gff_content(gff_generator, core_genes, + low_freq_genes, gff_path, + acc_genes, + complete_genomes) + + # Evaluate + self.assertEqual(core_gene_pairs, return_core_gene_pairs) + self.assertEqual(core_gene_pair_distance, return_core_gene_pair_distance) + self.assertEqual(accessory_gene_content, return_accessory_gene_content) + self.assertEqual(low_freq_gene_content, return_low_freq_gene_content) + self.assertEqual(master_info, return_master_info) + self.assertEqual(coreless_contigs, return_coreless_contigs) + + def test_segmentation_of_fragmented_acc_gene_between_first_n_last_gene(self): + + # Set up input + gff_generator = [['gff_name_contig_1', '.', 'CDS', '90', '180', '.', '.', '.', 'acc_ID_1'], + ['gff_name_contig_1', '.', 'CDS', '179', '250', '.', '.', '.', 'Core_ID_1'], + ['gff_name_contig_1', '.', 'CDS', '300', '425', '.', '.', '.', 'low_freq_1'], + ['gff_name_contig_1', '.', 'CDS', '450', '500', '.', '.', '.', 'acc_ID_2'], + ['gff_name_contig_1', '.', 'CDS', '610', '680', '.', '.', '.', 'Core_ID_2'], + ['gff_name_contig_1', '.', 'CDS', '700', '850', '.', '.', '.', 'low_freq_2'], + ['gff_name_contig_1', '.', 'CDS', '950', '1000', '.', '.', '.', 'Core_ID_3'], + ['gff_name_contig_1', '.', 'CDS', '1100', '1150', '.', '.', '.', 'low_freq_3_1'], + ['gff_name_contig_1', '.', 'CDS', '1170', '1250', '.', '.', '.', 'low_freq_3_2']] + core_genes = {'test_single_chromosome': {'Core_ID_1': 'pan_gene_2', + 'Core_ID_2': 'pan_gene_5', + 'Core_ID_3': 'pan_gene_7'}} + low_freq_genes = {'test_single_chromosome': {'low_freq_1': 'pan_gene_3', + 
'low_freq_2': 'pan_gene_6', + 'low_freq_3_1': 'pan_gene_8', + 'low_freq_3_2': 'pan_gene_8'}} + gff_path = 'TestSegmentingMockGffs/test_single_chromosome.gff' + acc_genes = {'test_single_chromosome': {'acc_ID_1': 'pan_gene_1', + 'acc_ID_2': 'pan_gene_4'}} + complete_genomes = ['test_single_chromosome'] + + # Set up expected outputs + core_gene_pairs = ['pan_gene_2--pan_gene_5', 'pan_gene_5--pan_gene_7', 'pan_gene_2--pan_gene_7'] + core_gene_pair_distance = {'pan_gene_2--pan_gene_5': 359, + 'pan_gene_5--pan_gene_7': 269, + 'pan_gene_2--pan_gene_7': 478} + + accessory_gene_content = {'pan_gene_2--pan_gene_5': ['pan_gene_4'], + 'pan_gene_5--pan_gene_7': [], + 'pan_gene_2--pan_gene_7': ['pan_gene_1']} + + low_freq_gene_content = {'pan_gene_2--pan_gene_5': ['pan_gene_3'], + 'pan_gene_5--pan_gene_7': ['pan_gene_6'], + 'pan_gene_2--pan_gene_7': ['pan_gene_8']} + + master_info = {'pan_gene_2--pan_gene_5--test_single_chromosome': ['test_single_chromosome', 'pan_gene_2', 'pan_gene_5', 359, 2, ['pan_gene_4'], ['pan_gene_3'], ], + 'pan_gene_5--pan_gene_7--test_single_chromosome': ['test_single_chromosome', 'pan_gene_5', 'pan_gene_7', 269, 1, [], ['pan_gene_6']], + 'pan_gene_2--pan_gene_7--test_single_chromosome': ['test_single_chromosome', 'pan_gene_2', 'pan_gene_7', 478, 2, ['pan_gene_1'], ['pan_gene_8']]} + + coreless_contigs = {} + + # Run function + return_core_gene_pairs, return_core_gene_pair_distance, \ + return_accessory_gene_content, return_low_freq_gene_content, \ + return_master_info, return_coreless_contigs = gff_parser.segment_gff_content(gff_generator, core_genes, + low_freq_genes, gff_path, + acc_genes, + complete_genomes) + + # Evaluate + self.assertEqual(core_gene_pairs, return_core_gene_pairs) + self.assertEqual(core_gene_pair_distance, return_core_gene_pair_distance) + self.assertEqual(accessory_gene_content, return_accessory_gene_content) + self.assertEqual(low_freq_gene_content, return_low_freq_gene_content) + self.assertEqual(master_info, 
return_master_info) + self.assertEqual(coreless_contigs, return_coreless_contigs) + + def test_segmentation_of_fragmented_acc_gene_on_coreless_contig(self): + + # Set up input + gff_generator = [ + # Contig 1 annotations + ['gff_name_contig_1', '.', 'CDS', '90', '180', '.', '.', '.', 'acc_ID_1'], + ['gff_name_contig_1', '.', 'CDS', '179', '250', '.', '.', '.', 'Core_ID_1'], + ['gff_name_contig_1', '.', 'CDS', '300', '425', '.', '.', '.', 'low_freq_1'], + ['gff_name_contig_1', '.', 'CDS', '450', '500', '.', '.', '.', 'acc_ID_2'], + ['gff_name_contig_1', '.', 'CDS', '610', '680', '.', '.', '.', 'Core_ID_2'], + ['gff_name_contig_1', '.', 'CDS', '700', '850', '.', '.', '.', 'low_freq_2'], + ['gff_name_contig_1', '.', 'CDS', '950', '1000', '.', '.', '.', 'Core_ID_3'], + ['gff_name_contig_1', '.', 'CDS', '1100', '1250', '.', '.', '.', 'low_freq_3'], + # Contig 2 annotations + ['gff_name_contig_2', '.', 'CDS', '300', '425', '.', '.', '.', 'low_freq_4'], + ['gff_name_contig_2', '.', 'CDS', '450', '500', '.', '.', '.', 'acc_ID_4_1'], + ['gff_name_contig_2', '.', 'CDS', '501', '503', '.', '.', '.', 'acc_ID_4_2'], + ['gff_name_contig_2', '.', 'CDS', '700', '850', '.', '.', '.', 'low_freq_5'], + # Contig 3 annotations + ['gff_name_contig_3', '.', 'CDS', '179', '250', '.', '.', '.', 'Core_ID_4'], + ['gff_name_contig_3', '.', 'CDS', '610', '680', '.', '.', '.', 'Core_ID_5'] + ] + core_genes = {'test_triple_chromosome': {'Core_ID_1': 'pan_gene_2', + 'Core_ID_2': 'pan_gene_5', + 'Core_ID_3': 'pan_gene_7', + 'Core_ID_4': 'pan_gene_10', + 'Core_ID_5': 'pan_gene_13' + }} + low_freq_genes = {'test_triple_chromosome': {'low_freq_1': 'pan_gene_3', + 'low_freq_2': 'pan_gene_6', + 'low_freq_3': 'pan_gene_8', + 'low_freq_4': 'pan_gene_11', + 'low_freq_5': 'pan_gene_14'}} + gff_path = 'TestSegmentingMockGffs/test_triple_chromosome.gff' + acc_genes = {'test_triple_chromosome': {'acc_ID_1': 'pan_gene_1', + 'acc_ID_2': 'pan_gene_4', + 'acc_ID_4_1': 'pan_gene_12', + 'acc_ID_4_2': 
'pan_gene_12'}} + complete_genomes = [] + + # Construct expected results + core_gene_pairs = ['pan_gene_2--pan_gene_5', 'pan_gene_5--pan_gene_7', + 'Sequence_break--pan_gene_2', 'pan_gene_7--Sequence_break', + 'Sequence_break--pan_gene_10', 'pan_gene_10--pan_gene_13', 'pan_gene_13--Sequence_break'] + core_gene_pair_distance = {'pan_gene_2--pan_gene_5': 359, + 'pan_gene_5--pan_gene_7': 269, + 'Sequence_break--pan_gene_2': 178, + 'pan_gene_7--Sequence_break': 299, + 'Sequence_break--pan_gene_10': 178, + 'pan_gene_10--pan_gene_13': 359, + 'pan_gene_13--Sequence_break': 619} + + accessory_gene_content = {'pan_gene_2--pan_gene_5': ['pan_gene_4'], + 'pan_gene_5--pan_gene_7': [], + 'Sequence_break--pan_gene_2': ['pan_gene_1'], + 'pan_gene_7--Sequence_break': [], + 'Sequence_break--pan_gene_10': [], + 'pan_gene_10--pan_gene_13': [], + 'pan_gene_13--Sequence_break': [] + } + + low_freq_gene_content = {'pan_gene_2--pan_gene_5': ['pan_gene_3'], + 'pan_gene_5--pan_gene_7': ['pan_gene_6'], + 'Sequence_break--pan_gene_2': [], + 'pan_gene_7--Sequence_break': ['pan_gene_8'], + 'Sequence_break--pan_gene_10': [], + 'pan_gene_10--pan_gene_13': [], + 'pan_gene_13--Sequence_break': [] + } + + master_info = { + 'pan_gene_2--pan_gene_5--test_triple_chromosome': ['test_triple_chromosome', 'pan_gene_2', 'pan_gene_5', + 359, 2, ['pan_gene_4'], ['pan_gene_3'], ], + 'pan_gene_5--pan_gene_7--test_triple_chromosome': ['test_triple_chromosome', 'pan_gene_5', 'pan_gene_7', + 269, 1, [], ['pan_gene_6']], + 'Sequence_break--pan_gene_2--test_triple_chromosome': ['test_triple_chromosome', 'Sequence_break', + 'pan_gene_2', 178, 1, ['pan_gene_1'], []], + 'pan_gene_7--Sequence_break--test_triple_chromosome': ['test_triple_chromosome', 'pan_gene_7', + 'Sequence_break', 299, 1, [], ['pan_gene_8']], + 'Sequence_break--pan_gene_10--test_triple_chromosome': ['test_triple_chromosome', 'Sequence_break', 'pan_gene_10', 178, 0, [], []], + 'pan_gene_10--pan_gene_13--test_triple_chromosome': 
['test_triple_chromosome', 'pan_gene_10', 'pan_gene_13', 359, 0, [], []], + 'pan_gene_13--Sequence_break--test_triple_chromosome': ['test_triple_chromosome', 'pan_gene_13', 'Sequence_break', 619, 0, [], []] + } + + coreless_contigs = { + 'test_triple_chromosome--gff_name_contig_2': [['pan_gene_12'], ['pan_gene_11', 'pan_gene_14']]} + + # Run function + return_core_gene_pairs, return_core_gene_pair_distance, \ + return_accessory_gene_content, return_low_freq_gene_content, \ + return_master_info, return_coreless_contigs = gff_parser.segment_gff_content(gff_generator, core_genes, + low_freq_genes, gff_path, + acc_genes, + complete_genomes) + + # Evaluate + self.assertEqual(core_gene_pairs.sort(), return_core_gene_pairs.sort()) + self.assertEqual(core_gene_pair_distance, return_core_gene_pair_distance) + self.assertEqual(accessory_gene_content, return_accessory_gene_content) + self.assertEqual(low_freq_gene_content, return_low_freq_gene_content) + self.assertEqual(master_info, return_master_info) + self.assertEqual(coreless_contigs, return_coreless_contigs) + + def test_single_fragmented_gene_on_either_side_of_core_gene(self): + + # Set up input + gff_generator = [['gff_name_contig_1', '.', 'CDS', '90', '180', '.', '.', '.', 'acc_ID_1'], + ['gff_name_contig_1', '.', 'CDS', '179', '270', '.', '.', '.', 'Core_ID_1'], + ['gff_name_contig_1', '.', 'CDS', '300', '425', '.', '.', '.', 'low_freq_1'], + ['gff_name_contig_1', '.', 'CDS', '450', '460', '.', '.', '.', 'acc_ID_2_1'], + ['gff_name_contig_1', '.', 'CDS', '610', '680', '.', '.', '.', 'Core_ID_2'], + ['gff_name_contig_1', '.', 'CDS', '685', '690', '.', '.', '.', 'acc_ID_2_2'], + ['gff_name_contig_1', '.', 'CDS', '700', '850', '.', '.', '.', 'low_freq_2'], + ['gff_name_contig_1', '.', 'CDS', '950', '1000', '.', '.', '.', 'Core_ID_3'], + ['gff_name_contig_1', '.', 'CDS', '1100', '1250', '.', '.', '.', 'low_freq_3']] + core_genes = {'test_single_chromosome': {'Core_ID_1': 'pan_gene_2', + 'Core_ID_2': 'pan_gene_5', + 
'Core_ID_3': 'pan_gene_7'}} + low_freq_genes = {'test_single_chromosome': {'low_freq_1': 'pan_gene_3', + 'low_freq_2': 'pan_gene_6', + 'low_freq_3': 'pan_gene_8'}} + gff_path = 'TestSegmentingMockGffs/test_single_chromosome.gff' + acc_genes = {'test_single_chromosome': {'acc_ID_1': 'pan_gene_1', + 'acc_ID_2_1': 'pan_gene_4', + 'acc_ID_2_2': 'pan_gene_4'}} + complete_genomes = ['test_single_chromosome'] + + # Set up expected outputs + core_gene_pairs = ['pan_gene_2--pan_gene_5', 'pan_gene_5--pan_gene_7', 'pan_gene_2--pan_gene_7'] + core_gene_pair_distance = {'pan_gene_2--pan_gene_5': 339, + 'pan_gene_5--pan_gene_7': 269, + 'pan_gene_2--pan_gene_7': 478} + + accessory_gene_content = {'pan_gene_2--pan_gene_5': ['pan_gene_4'], + 'pan_gene_5--pan_gene_7': ['pan_gene_4'], + 'pan_gene_2--pan_gene_7': ['pan_gene_1']} + + low_freq_gene_content = {'pan_gene_2--pan_gene_5': ['pan_gene_3'], + 'pan_gene_5--pan_gene_7': ['pan_gene_6'], + 'pan_gene_2--pan_gene_7': ['pan_gene_8']} + + master_info = { + 'pan_gene_2--pan_gene_5--test_single_chromosome': ['test_single_chromosome', 'pan_gene_2', 'pan_gene_5', + 339, 2, ['pan_gene_4'], ['pan_gene_3'], ], + 'pan_gene_5--pan_gene_7--test_single_chromosome': ['test_single_chromosome', 'pan_gene_5', 'pan_gene_7', + 269, 2, ['pan_gene_4'], ['pan_gene_6']], + 'pan_gene_2--pan_gene_7--test_single_chromosome': ['test_single_chromosome', 'pan_gene_2', 'pan_gene_7', + 478, 2, ['pan_gene_1'], ['pan_gene_8']]} + + coreless_contigs = {} + + # Run function + return_core_gene_pairs, return_core_gene_pair_distance, \ + return_accessory_gene_content, return_low_freq_gene_content, \ + return_master_info, return_coreless_contigs = gff_parser.segment_gff_content(gff_generator, core_genes, + low_freq_genes, gff_path, + acc_genes, + complete_genomes) + + # Evaluate + self.assertEqual(core_gene_pairs, return_core_gene_pairs) + self.assertEqual(core_gene_pair_distance, return_core_gene_pair_distance) + self.assertEqual(accessory_gene_content, 
return_accessory_gene_content) + self.assertEqual(low_freq_gene_content, return_low_freq_gene_content) + self.assertEqual(master_info, return_master_info) + self.assertEqual(coreless_contigs, return_coreless_contigs) + + def test_something(self): # TODO - What other wired and wonderfull examples can we come up with? + pass + + if __name__ == '__main__': unittest.main() diff --git a/unit_tests/unit_test_data/.DS_Store b/unit_tests/unit_test_data/.DS_Store deleted file mode 100644 index 879bb33168d979b7daa04763aeeac8c8ff81a80c..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeHK%}T>S5T4Z@ih8Jc5kbf+^bKN75j+VMJXZTJG)=J;yqU8P;G1cjWLVH5<7 ze$+r_v3Y;@as2vtkpH=1)LJ2X*h~HDZ8~K>SgERX19oFNv$2PJRsmXMaLtjGcontig_1 +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN diff --git a/unit_tests/unit_test_data/TestCheckingFragmentedGenes/Silas_the_Legionella.gff b/unit_tests/unit_test_data/TestCheckingFragmentedGenes/Silas_the_Legionella.gff new file mode 100644 index 0000000..5ee9f6a --- /dev/null +++ b/unit_tests/unit_test_data/TestCheckingFragmentedGenes/Silas_the_Legionella.gff @@ -0,0 +1,8 @@ +contig_1 . CDS 1 90 . . . ID=Silas_the_Legionella_tag-9-1;locus_tag=Silas_the_Legionella_tag-9-1 +contig_1 . CDS 100 190 . . . ID=Silas_the_Legionella_tag-9-2;locus_tag=Silas_the_Legionella_tag-9-2 +contig_1 . CDS 200 290 . . . ID=Silas_the_Legionella_tag-9-3;locus_tag=Silas_the_Legionella_tag-9-3 +contig_1 . CDS 300 390 . . . ID=Silas_the_Legionella_tag-9-4;locus_tag=Silas_the_Legionella_tag-9-4 +contig_1 . CDS 400 490 . . . ID=Silas_the_Legionella_tag-9-5;locus_tag=Silas_the_Legionella_tag-9-5 +##FASTA +>contigdiff --git a/unit_tests/unit_test_data/TestCheckingFragmentedGenes/Silas_the_Salmonella.gff b/unit_tests/unit_test_data/TestCheckingFragmentedGenes/Silas_the_Salmonella.gff new file mode 100644 index 0000000..72dc88e --- /dev/null +++ b/unit_tests/unit_test_data/TestCheckingFragmentedGenes/Silas_the_Salmonella.gff @@ -0,0 +1,12 @@ +contig_1 . CDS 1 90 . . . ID=Silas_the_Salmonella_tag-1-1;locus_tag=Silas_the_Salmonella_tag-1-1 +contig_1 . CDS 100 190 . . . ID=Silas_the_Salmonella_tag-1-2.1;locus_tag=Silas_the_Salmonella_tag-1-2.1 +contig_2 . CDS 200 290 . . . ID=Silas_the_Salmonella_tag-1-2.2;locus_tag=Silas_the_Salmonella_tag-1-2.2 +contig_2 . CDS 300 390 . . . ID=Silas_the_Salmonella_tag-1-3;locus_tag=Silas_the_Salmonella_tag-1-3 +contig_2 . CDS 400 490 . . . ID=Silas_the_Salmonella_tag-1-4.1;locus_tag=Silas_the_Salmonella_tag-1-4.1 +contig_2 . CDS 500 590 . . . ID=Silas_the_Salmonella_tag-1-4.2;locus_tag=Silas_the_Salmonella_tag-1-4.2 +contig_2 . CDS 600 690 . . . ID=Silas_the_Salmonella_tag-1-5.1;locus_tag=Silas_the_Salmonella_tag-1-5.1 +contig_2 . CDS 700 790 . . . 
ID=Silas_the_Salmonella_tag-1.7;locus_tag=Silas_the_Salmonella_tag-1.7 +contig_2 . CDS 800 890 . . . ID=Silas_the_Salmonella_tag-1-5.2;locus_tag=Silas_the_Salmonella_tag-1-5.2 +##FASTA +>contigdiff --git a/unit_tests/unit_test_data/TestCheckingFragmentedGenes/Zion_the_Streptococcus.gff b/unit_tests/unit_test_data/TestCheckingFragmentedGenes/Zion_the_Streptococcus.gff new file mode 100644 index 0000000..36aa721 --- /dev/null +++ b/unit_tests/unit_test_data/TestCheckingFragmentedGenes/Zion_the_Streptococcus.gff @@ -0,0 +1,8 @@ +contig_1 . CDS 1 90 . . . ID=Zion_the_Streptococcus_tag-7-1;locus_tag=Zion_the_Streptococcus_tag-7-1 +contig_1 . CDS 100 190 . . . ID=Zion_the_Streptococcus_tag-7-2;locus_tag=Zion_the_Streptococcus_tag-7-2 +contig_1 . CDS 200 290 . . . ID=Zion_the_Streptococcus_tag-7-3;locus_tag=Zion_the_Streptococcus_tag-7-3 +contig_1 . CDS 300 390 . . . ID=Zion_the_Streptococcus_tag-7-4;locus_tag=Zion_the_Streptococcus_tag-7-4 +contig_1 . CDS 400 490 . . . ID=Zion_the_Streptococcus_tag-7-5;locus_tag=Zion_the_Streptococcus_tag-7-5 +##FASTA +>contigdiff --git a/unit_tests/unit_test_data/TestGetContigLenth/multi_contig_unwrapped.txt b/unit_tests/unit_test_data/TestGetContigLenth/multi_contig_unwrapped.txt new file mode 100644 index 0000000..0f5e5e2 --- /dev/null +++ b/unit_tests/unit_test_data/TestGetContigLenth/multi_contig_unwrapped.txt @@ -0,0 +1,6 @@ +contig_1 . CDS 1 90 . . . 
ID=Silas_the_Salmonella_tag-1-1;locus_tag=Silas_the_Salmonella_tag-1-1 +##FASTA +>contigcontig_2 +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN \ No newline at end of file diff --git a/unit_tests/unit_test_data/TestGetContigLenth/multi_contig_wrapped.txt b/unit_tests/unit_test_data/TestGetContigLenth/multi_contig_wrapped.txt new file mode 100644 index 0000000..90f02c2 --- /dev/null +++ b/unit_tests/unit_test_data/TestGetContigLenth/multi_contig_wrapped.txt @@ -0,0 +1,33 @@ +contig_1 . CDS 1 90 . . . 
ID=Silas_the_Salmonella_tag-1-1;locus_tag=Silas_the_Salmonella_tag-1-1 +##FASTA +>contigcontigo newline at end of file diff --git a/unit_tests/unit_test_data/TestGetContigLenth/single_contig_unwrapped.txt b/unit_tests/unit_test_data/TestGetContigLenth/single_contig_unwrapped.txt new file mode 100644 index 0000000..4e804fe --- /dev/null +++ b/unit_tests/unit_test_data/TestGetContigLenth/single_contig_unwrapped.txt @@ -0,0 +1,4 @@ +contig_1 . CDS 1 90 . . . ID=Silas_the_Salmonella_tag-1-1;locus_tag=Silas_the_Salmonella_tag-1-1 +##FASTA +>contigo newline at end of file diff --git a/unit_tests/unit_test_data/TestGetContigLenth/single_contig_wrapped.txt b/unit_tests/unit_test_data/TestGetContigLenth/single_contig_wrapped.txt new file mode 100644 index 0000000..b315d87 --- /dev/null +++ b/unit_tests/unit_test_data/TestGetContigLenth/single_contig_wrapped.txt @@ -0,0 +1,22 @@ +contig_1 . CDS 1 90 . . . ID=Silas_the_Salmonella_tag-1-1;locus_tag=Silas_the_Salmonella_tag-1-1 +##FASTA +>contig_1 testo newline at end of file diff --git a/unit_tests/unit_test_data/TestPangenomeSourceProgram/.DS_Store b/unit_tests/unit_test_data/TestPangenomeSourceProgram/.DS_Store deleted file mode 100644 index 69ff4c2450e0078397ed92420f5908fc7a8d045e..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 8196 zcmeI1ze@u#6vtn*Xt5504h0czaC4A~lRBK`90jq9oAwt~Y`s!-=yr>%e~*H8@sD!# z`;xSG<#HBVDS|JMeD8AaC9j``OJ7Pvs{Gg}5fzCjkIK4MK;tOJ_bJDz)N4)v`@MvXfu<4(%XtZas&^z88S zv^go?sI)>r2&4&c?!HQ=6jGgT$MW}(!gi}vsfV3z!Y$Lb69z%89oAr*e+{Sz?Pcy@4 z$qnaw`hI4^Jb z(?RxseBDxnz#IuInz@Rc|DR-^|IaZZnMw!contigdiff --git a/unit_tests/unit_test_data/TestParsingGffFile/Silas_the_Salmonella_corrected.gff b/unit_tests/unit_test_data/TestParsingGffFile/Silas_the_Salmonella_corrected.gff new file mode 100644 index 0000000..0979504 --- /dev/null +++ b/unit_tests/unit_test_data/TestParsingGffFile/Silas_the_Salmonella_corrected.gff @@ -0,0 +1,13 @@ +contig_1 . CDS 1 90 . . . 
ID=Silas_the_Salmonella_tag-1-1;locus_tag=Silas_the_Salmonella_tag-1-1 +contig_1 . CDS 100 190 . . . ID=Silas_the_Salmonella_tag-1-2.1;locus_tag=Silas_the_Salmonella_tag-1-2.1 +contig_1 . CDS 200 290 . . . ID=Silas_the_Salmonella_tag-1-2.2;locus_tag=Silas_the_Salmonella_tag-1-2.2 +contig_1 . CDS 300 390 . . . ID=Silas_the_Salmonella_tag-1-3;locus_tag=Silas_the_Salmonella_tag-1-3 +contig_1 . CDS 400 490 . . . ID=Silas_the_Salmonella_tag-1-4.1;locus_tag=Silas_the_Salmonella_tag-1-4.1 +contig_1 . CDS 500 590 . . . ID=Silas_the_Salmonella_tag-1-4.2;locus_tag=Silas_the_Salmonella_tag-1-4.2 +contig_1 . CDS 600 690 . . . ID=Silas_the_Salmonella_tag-1-5.1;locus_tag=Silas_the_Salmonella_tag-1-5.1 +contig_1 . CDS 700 790 . . . ID=Silas_the_Salmonella_tag-1.7;locus_tag=Silas_the_Salmonella_tag-1.7 +contig_1 . CDS 800 890 . . . ID=Silas_the_Salmonella_tag-1-5.2;locus_tag=Silas_the_Salmonella_tag-1-5.2 +contig_1 Panaroo CDS 900 1000 . + 0 ID=Silas_the_Salmonella_tag-1000;annotaitons=CureToCancer;locus_tag=Silas_the_Salmonella_tag-1000;old_locus_tag=refound_gene_1 +##FASTA +>contigdiff --git a/unit_tests/unit_test_data/TestSegmentingMockGffs/test_double_chromosome.gff b/unit_tests/unit_test_data/TestSegmentingMockGffs/test_double_chromosome.gff new file mode 100644 index 0000000..be71c95 --- /dev/null +++ b/unit_tests/unit_test_data/TestSegmentingMockGffs/test_double_chromosome.gff @@ -0,0 +1,5 @@ +##FASTA +>gff_name_contiggff_name_contig_2 
+NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN \ No newline at end of file diff --git a/unit_tests/unit_test_data/TestSegmentingMockGffs/test_single_chromosome.gff b/unit_tests/unit_test_data/TestSegmentingMockGffs/test_single_chromosome.gff new file mode 100644 index 0000000..0cfdd43 --- /dev/null +++ b/unit_tests/unit_test_data/TestSegmentingMockGffs/test_single_chromosome.gff @@ -0,0 +1,3 @@ +##FASTA +>gff_name_contigo newline at end of file diff --git a/unit_tests/unit_test_data/TestSegmentingMockGffs/test_triple_chromosome.gff b/unit_tests/unit_test_data/TestSegmentingMockGffs/test_triple_chromosome.gff new file mode 100644 index 0000000..14eb0d3 --- /dev/null +++ 
b/unit_tests/unit_test_data/TestSegmentingMockGffs/test_triple_chromosome.gff @@ -0,0 +1,7 @@ +##FASTA +>gff_name_contiggff_name_contiggff_name_contigo newline at end of file From 1f7501bfe15d41c1b6be7d653eaed86a71316c44 Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Mon, 3 Jan 2022 12:53:15 +1100 Subject: [PATCH 014/135] Add a sort step when unique accessory genes are identified, as list(set()) is non-deterministic in its order. --- .travis/unit-test.sh | 2 +- Corekaburra/__main__.py | 16 ++++++++-------- Corekaburra/gff_parser.py | 12 ++++++------ unit_tests/Corekaburra_test.py | 24 ++++++++++++++++++++---- 4 files changed, 35 insertions(+), 19 deletions(-) diff --git a/.travis/unit-test.sh b/.travis/unit-test.sh index 0dad5f1..f27bbac 100755 --- a/.travis/unit-test.sh +++ b/.travis/unit-test.sh @@ -4,7 +4,7 @@ set -e errors=0 # Run unit tests -python Corekaburra/Corekaburra_test.py || { +python unit_tests/Corekaburra_test.py || { echo "'python python/Corekaburra/Corekaburra_test.py' failed" let errors+=1 } diff --git a/Corekaburra/__main__.py b/Corekaburra/__main__.py index ee15dff..b34d80e 100644 --- a/Corekaburra/__main__.py +++ b/Corekaburra/__main__.py @@ -208,14 +208,14 @@ def main(): core_less_contigs_return = output.result() # Merge results into single/master dictionaries - core_neighbour_pairs = merge_dicts_counts(core_neighbour_pairs, core_pairs) - core_neighbour_distance = merge_dicts_lists(core_neighbour_distance, distance) - core_neighbour_accessory_count = merge_dicts_lists(core_neighbour_accessory_count, acc_count) - core_neighbour_low_freq = merge_dicts_lists(core_neighbour_low_freq, low_freq) - master_info_total.update(master_info_return) - non_core_contig_info.update(core_less_contigs_return) - - time_calculator(time_start, time.time(), "searching gff files for core genes") + # core_neighbour_pairs = merge_dicts_counts(core_neighbour_pairs, core_pairs) + # core_neighbour_distance = 
merge_dicts_lists(core_neighbour_distance, distance) + # core_neighbour_accessory_count = merge_dicts_lists(core_neighbour_accessory_count, acc_count) + # core_neighbour_low_freq = merge_dicts_lists(core_neighbour_low_freq, low_freq) + # master_info_total.update(master_info_return) + # non_core_contig_info.update(core_less_contigs_return) + # + # time_calculator(time_start, time.time(), "searching gff files for core genes") if __name__ == '__main__': main() diff --git a/Corekaburra/gff_parser.py b/Corekaburra/gff_parser.py index 5f3b89e..ac90ed4 100644 --- a/Corekaburra/gff_parser.py +++ b/Corekaburra/gff_parser.py @@ -144,8 +144,8 @@ def record_core_core_region(core_genes, gff_name, gff_line, contig_end, previous core_gene_pair_distance[core_gene_neighbours_str] = core_core_distance # Add counts and annotation for accessory and low frequency genes - acc_genes_in_region = list(set(acc_genes_in_region)) - low_freq_genes_in_region = list(set(low_freq_genes_in_region)) + acc_genes_in_region = sorted(list(set(acc_genes_in_region))) + low_freq_genes_in_region = sorted(list(set(low_freq_genes_in_region))) accessory_gene_content[core_gene_neighbours_str] = acc_genes_in_region.copy() low_freq_gene_content[core_gene_neighbours_str] = low_freq_genes_in_region.copy() @@ -230,8 +230,8 @@ def connect_first_n_last_gene_on_contig(core_genes, gff_name, previous_core_gene last_first_accessory_content = acc_genes_in_region.copy() + first_core_accessory_content.copy() last_first_low_freq_count = low_freq_genes_in_region.copy() + first_core_low_freq_genes.copy() - last_first_accessory_content = list(set(last_first_accessory_content)) - last_first_low_freq_count = list(set(last_first_low_freq_count)) + last_first_accessory_content = sorted(list(set(last_first_accessory_content))) + last_first_low_freq_count = sorted(list(set(last_first_low_freq_count))) accessory_gene_content[core_gene_neighbours_str] = last_first_accessory_content.copy() low_freq_gene_content[core_gene_neighbours_str] 
= last_first_low_freq_count.copy() @@ -256,8 +256,8 @@ def connect_first_n_last_gene_on_contig(core_genes, gff_name, previous_core_gene def record_coreless_contig(coreless_contigs, acc_genes_in_region, low_freq_genes_in_region, gff_name, contig_name): - acc_genes_in_region = list(set(acc_genes_in_region)) - low_freq_genes_in_region = list(set(low_freq_genes_in_region)) + acc_genes_in_region = sorted(list(set(acc_genes_in_region))) + low_freq_genes_in_region = sorted(list(set(low_freq_genes_in_region))) if len(acc_genes_in_region) + len(low_freq_genes_in_region) > 0: coreless_contigs[f'{gff_name}--{contig_name}'] = [acc_genes_in_region, low_freq_genes_in_region] diff --git a/unit_tests/Corekaburra_test.py b/unit_tests/Corekaburra_test.py index 762a271..25011ea 100644 --- a/unit_tests/Corekaburra_test.py +++ b/unit_tests/Corekaburra_test.py @@ -1155,8 +1155,8 @@ def test_connect_last_n_first_gene_different_genes_w_accessory(self): expected_core_gene_pairs = ['pan_gene_1--pan_gene_2'] expected_core_gene_pair_distance = {'pan_gene_1--pan_gene_2': 139} expected_accessory_gene_content = {'pan_gene_1--pan_gene_2': ['acc_1', 'first_acc_1']} - expected_low_freq_gene_content = {'pan_gene_1--pan_gene_2': ['low_acc_1', 'first_low_1']} - expected_master_info = {'pan_gene_1--pan_gene_2--gff_name': ['gff_name', 'pan_gene_1', 'pan_gene_2', 139, 4, ['acc_1', 'first_acc_1'], ['low_acc_1', 'first_low_1']]} + expected_low_freq_gene_content = {'pan_gene_1--pan_gene_2': ['first_low_1', 'low_acc_1']} + expected_master_info = {'pan_gene_1--pan_gene_2--gff_name': ['gff_name', 'pan_gene_1', 'pan_gene_2', 139, 4, ['acc_1', 'first_acc_1'], ['first_low_1', 'low_acc_1']]} return_previous_core_gene_id, return_previous_core_gene_end_coor, return_acc_genes_in_region, \ return_low_freq_genes_in_region, return_core_gene_pairs, return_core_gene_pair_distance, \ @@ -1172,6 +1172,7 @@ def test_connect_last_n_first_gene_different_genes_w_accessory(self): accessory_gene_content, low_freq_gene_content, 
master_info) + # Assert expected against returned self.assertEqual(expected_previous_core_gene_id, return_previous_core_gene_id) self.assertEqual(expected_previous_core_gene_end_coor, return_previous_core_gene_end_coor) self.assertEqual(expected_acc_genes_in_region, return_acc_genes_in_region) @@ -1260,9 +1261,9 @@ def test_connect_same_gene_as_last_n_first_gene_w_accessory(self): expected_low_freq_genes_in_region = [] expected_core_gene_pairs = ['pan_gene_1--pan_gene_1'] expected_core_gene_pair_distance = {'pan_gene_1--pan_gene_1': 1409} - expected_accessory_gene_content = {'pan_gene_1--pan_gene_1': ['acc_2', 'acc_3', 'acc_1']} + expected_accessory_gene_content = {'pan_gene_1--pan_gene_1': ['acc_1', 'acc_2', 'acc_3']} expected_low_freq_gene_content = {'pan_gene_1--pan_gene_1': ['low_1', 'low_2', 'low_3']} - expected_master_info = {'pan_gene_1--pan_gene_1--gff_name': ['gff_name', 'pan_gene_1', 'pan_gene_1', 1409, 6, ['acc_2', 'acc_3', 'acc_1'], ['low_1', 'low_2', 'low_3']]} + expected_master_info = {'pan_gene_1--pan_gene_1--gff_name': ['gff_name', 'pan_gene_1', 'pan_gene_1', 1409, 6, ['acc_1', 'acc_2', 'acc_3'], ['low_1', 'low_2', 'low_3']]} return_previous_core_gene_id, return_previous_core_gene_end_coor, return_acc_genes_in_region, \ return_low_freq_genes_in_region, return_core_gene_pairs, return_core_gene_pair_distance, \ @@ -1967,6 +1968,21 @@ def test_with_coreless_contig_middle_contig(self): low_freq_genes, gff_path, acc_genes, complete_genomes) + # Sort expected and returned lists in dicts + low_freq_gene_content = {x: sorted(low_freq_gene_content[x]) for x in + low_freq_gene_content.keys()} + accessory_gene_content = {x: sorted(accessory_gene_content[x]) for x in + accessory_gene_content.keys()} + master_info = { + x: [sorted(element) if element is list else element for element in master_info[x]] for x in + master_info.keys()} + + return_low_freq_gene_content = {x: sorted(return_low_freq_gene_content[x]) for x in + return_low_freq_gene_content.keys()} + 
return_accessory_gene_content = {x: sorted(return_accessory_gene_content[x]) for x in + return_accessory_gene_content.keys()} + return_master_info = {x: [sorted(element) if element is list else element for element in return_master_info[x]] + for x in return_master_info.keys()} # Evaluate self.assertEqual(core_gene_pairs.sort(), return_core_gene_pairs.sort()) From f943abaac8839d1414ffa6ab2b586c2f81f23229 Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Mon, 3 Jan 2022 13:21:07 +1100 Subject: [PATCH 015/135] Add in functions for merging dictionaries along with tests for these. Fix the setup.py script --- Corekaburra/__main__.py | 17 +++-- Corekaburra/merge_dicts.py | 43 +++++------ setup.py | 2 +- unit_tests/Corekaburra_test.py | 132 +++++++++++++++++++++++++++++++++ 4 files changed, 163 insertions(+), 31 deletions(-) diff --git a/Corekaburra/__main__.py b/Corekaburra/__main__.py index b34d80e..dbaa621 100644 --- a/Corekaburra/__main__.py +++ b/Corekaburra/__main__.py @@ -40,6 +40,11 @@ except ModuleNotFoundError: from gff_parser import segment_genome_content +try: + from Corekaburra.merge_dicts import merge_dicts_lists, merge_dicts_counts +except ModuleNotFoundError: + from merge_dicts import merge_dicts_lists, merge_dicts_counts + from argparse import ArgumentParser from math import floor import sys @@ -208,12 +213,12 @@ def main(): core_less_contigs_return = output.result() # Merge results into single/master dictionaries - # core_neighbour_pairs = merge_dicts_counts(core_neighbour_pairs, core_pairs) - # core_neighbour_distance = merge_dicts_lists(core_neighbour_distance, distance) - # core_neighbour_accessory_count = merge_dicts_lists(core_neighbour_accessory_count, acc_count) - # core_neighbour_low_freq = merge_dicts_lists(core_neighbour_low_freq, low_freq) - # master_info_total.update(master_info_return) - # non_core_contig_info.update(core_less_contigs_return) + core_neighbour_pairs = 
merge_dicts_counts(core_neighbour_pairs, core_pairs) + core_neighbour_distance = merge_dicts_lists(core_neighbour_distance, distance) + core_neighbour_accessory_count = merge_dicts_lists(core_neighbour_accessory_count, acc_count) + core_neighbour_low_freq = merge_dicts_lists(core_neighbour_low_freq, low_freq) + master_info_total.update(master_info_return) + non_core_contig_info.update(core_less_contigs_return) # # time_calculator(time_start, time.time(), "searching gff files for core genes") diff --git a/Corekaburra/merge_dicts.py b/Corekaburra/merge_dicts.py index 2f802c3..55f492b 100644 --- a/Corekaburra/merge_dicts.py +++ b/Corekaburra/merge_dicts.py @@ -1,11 +1,16 @@ -import numpy +def merge_dicts_counts(parent_dict, merge_object): + """ + Function that can merge two dicts by keys and adding 1 to the value each time key is observed + :param parent_dict: Dict to which the second with should be merged into + :param merge_object: Dict or List to be merged into the first. -def merge_dicts_counts(parent_dict, merge_dict): - """ Function that can merge two dicts by keys and adding 1 to the value each time key is observed""" - if isinstance(merge_dict, dict): - keys = merge_dict.keys() - elif isinstance(merge_dict, list): - keys = merge_dict + :return: Resulting dict following merge + """ + + if isinstance(merge_object, dict): + keys = merge_object.keys() + elif isinstance(merge_object, list): + keys = merge_object for key in keys: if key in parent_dict: @@ -17,7 +22,13 @@ def merge_dicts_counts(parent_dict, merge_dict): def merge_dicts_lists(parent_dict, merge_dict): - """ Function to add two dictionaries by adding lists of matching keys """ + """ + Function to add two dictionaries by adding lists of matching keys + :param parent_dict: The Dict to which the second dict should be merged with + :param merge_dict: Dict to be merge with the parent + + :return: Dict having the two inputs merged + """ for key in merge_dict.keys(): # Check if key is present, if then append 
the value to the key @@ -35,19 +46,3 @@ def merge_dicts_lists(parent_dict, merge_dict): parent_dict[key] = merge_dict[key] return parent_dict - - -def merge_first_genes(start_gene_cluster, merged_start_gene_clusters, merged_second_gene_clusters, first_core_pair): - """ Function that merge first genes and find second genes found in complete genomes, - used to inform core-genome synteny direction and start gene """ - # Check if first gene is available - if start_gene_cluster: - # Record first gene - merged_start_gene_clusters = merged_start_gene_clusters + [start_gene_cluster] - - # Record second gene in file - second_gene = first_core_pair.split('--') - second_gene.remove(start_gene_cluster) - merged_second_gene_clusters = merged_second_gene_clusters + second_gene - - return merged_start_gene_clusters, merged_second_gene_clusters diff --git a/setup.py b/setup.py index aa6edca..e7fe7c4 100644 --- a/setup.py +++ b/setup.py @@ -19,7 +19,7 @@ packages=['Corekaburra'], package_dir={'Corekaburra': 'Corekaburra'}, entry_points={ - 'console_scripts': ['Corekaburra = Corekaburra.Corekaburra:main'] + 'console_scripts': ['Corekaburra = Corekaburra.__main__:main'] }, url='https://github.com/milnus/Corekaburra', license='LICENSE', diff --git a/unit_tests/Corekaburra_test.py b/unit_tests/Corekaburra_test.py index 25011ea..890cfe0 100644 --- a/unit_tests/Corekaburra_test.py +++ b/unit_tests/Corekaburra_test.py @@ -18,6 +18,7 @@ from Corekaburra import check_inputs from Corekaburra import parse_gene_presence_absence from Corekaburra import gff_parser +from Corekaburra import merge_dicts @@ -2516,5 +2517,136 @@ def test_something(self): # TODO - What other wired and wonderfull examples can pass +class TestergingDicts(unittest.TestCase): + """ Functions to merge dictionaries and lists into dictionaries """ + # Test merge_dicts_counts + def test_merge_dicts_counts_list_empty(self): + input_dict = {} + input_list = ['x', 'y', 'z'] + + expected_dict = {'x': 1, + 'y': 1, + 'z': 1} + + 
return_dict = merge_dicts.merge_dicts_counts(input_dict, input_list) + + self.assertEqual(return_dict, expected_dict) + + def test_merge_dicts_counts_dict_empty(self): + input_dict = {} + input_list = {'x': 2, 'y': 2, 'z': 2} + + expected_dict = {'x': 1, + 'y': 1, + 'z': 1} + + return_dict = merge_dicts.merge_dicts_counts(input_dict, input_list) + + self.assertEqual(return_dict, expected_dict) + + def test_merge_dicts_counts_list_adding(self): + input_dict = {'x': 1, + 'y': 1, + 'z': 1} + input_list = ['x', 'y', 'z'] + + expected_dict = {'x': 2, + 'y': 2, + 'z': 2} + + return_dict = merge_dicts.merge_dicts_counts(input_dict, input_list) + + self.assertEqual(return_dict, expected_dict) + + def test_merge_dicts_counts_dict_adding(self): + input_dict = {'x': 1, + 'y': 1, + 'z': 1} + input_list = {'x': 1, 'y': 1, 'z': 1} + + expected_dict = {'x': 2, + 'y': 2, + 'z': 2} + + return_dict = merge_dicts.merge_dicts_counts(input_dict, input_list) + + self.assertEqual(return_dict, expected_dict) + + def test_merge_dicts_counts_dict_mix(self): + input_dict = {'x': 1, + 'y': 1, + 'z': 1} + input_list = {'x': 1, 'y': 1} + + expected_dict = {'x': 2, + 'y': 2, + 'z': 1} + + return_dict = merge_dicts.merge_dicts_counts(input_dict, input_list) + + self.assertEqual(return_dict, expected_dict) + + def test_merge_dicts_counts_list_mix(self): + input_dict = {'x': 1, + 'y': 1} + input_list = ['x', 'y', 'z'] + + expected_dict = {'x': 2, + 'y': 2, + 'z': 1} + + return_dict = merge_dicts.merge_dicts_counts(input_dict, input_list) + + self.assertEqual(return_dict, expected_dict) + + # Test merge_dicts_lists + def test_merge_dicts_lists_empty(self): + input_dict = {} + merge_dict = {'x': ['test_3'], + 'y': ['test_2'], + 'z': ['test_1']} + + expected_dict = {'x': ['test_3'], + 'y': ['test_2'], + 'z': ['test_1']} + + return_dict = merge_dicts.merge_dicts_lists(input_dict, merge_dict) + + self.assertEqual(expected_dict, return_dict) + + def test_merge_dicts_lists_adding(self): + input_dict = 
{'x': ['init_3'], + 'y': ['init_2'], + 'z': ['init_1']} + + merge_dict = {'x': ['test_3'], + 'y': ['test_2'], + 'z': ['test_1']} + + expected_dict = {'x': ['init_3', 'test_3'], + 'y': ['init_2', 'test_2'], + 'z': ['init_1', 'test_1']} + + return_dict = merge_dicts.merge_dicts_lists(input_dict, merge_dict) + + self.assertEqual(expected_dict, return_dict) + + def test_merge_dicts_lists_mix(self): + input_dict = {'x': ['init_3'], + 'y': ['init_2']} + + merge_dict = {'x': ['test_3'], + 'y': ['test_2'], + 'z': ['test_1']} + + expected_dict = {'x': ['init_3', 'test_3'], + 'y': ['init_2', 'test_2'], + 'z': ['test_1']} + + return_dict = merge_dicts.merge_dicts_lists(input_dict, merge_dict) + + self.assertEqual(expected_dict, return_dict) + + if __name__ == '__main__': unittest.main() From d9df0b5c1766f9220aa0ee093bd8932e81950a28 Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Tue, 4 Jan 2022 15:32:28 +1100 Subject: [PATCH 016/135] Add in functions to identify segments in the genome, with and without accessory genes. 
Additionally add in tests for these functions --- Code_to_transfer/consesus_core_genome.py | 991 ------------------ Corekaburra/__main__.py | 23 +- Corekaburra/commandline_interface.py | 2 +- Corekaburra/consesus_core_genome.py | 281 +++++ .../test_data/empty_file.expected | 33 +- unit_tests/Corekaburra_test.py | 476 ++++++++- 6 files changed, 810 insertions(+), 996 deletions(-) delete mode 100644 Code_to_transfer/consesus_core_genome.py create mode 100644 Corekaburra/consesus_core_genome.py diff --git a/Code_to_transfer/consesus_core_genome.py b/Code_to_transfer/consesus_core_genome.py deleted file mode 100644 index d589f21..0000000 --- a/Code_to_transfer/consesus_core_genome.py +++ /dev/null @@ -1,991 +0,0 @@ -import networkx as nx -import numpy as np -from os.path import basename, join -import concurrent.futures -from itertools import repeat -from time import time - - -def construct_core_graph(core_neighbour_pairs): - # Initiate core gene graph - G = nx.Graph() - - # Add all core gene pairs and their edges - for core_set in core_neighbour_pairs.keys(): - # split core genes - core_genes = core_set.split('--') - - # Check that sequence break is not present: - if 'Sequence_break' != core_genes[0] and 'Sequence_break' != core_genes[1]: - # Construct edge list - core_genes = [(core_genes[0], core_genes[1], {'weight': core_neighbour_pairs[core_set]})] - # Add edge - G.add_edges_from(ebunch_to_add=core_genes) - - return G - - -def clean_core_graph(core_gene_graph, output_path): - """ Function to clean the core genome for any nodes that has more than two degrees. This also resolves any cliques. 
- Additionally all genes with more than two dregrees are noted as being possible rearrangement spots.""" - - # Get all nodes in graph - graph_nodes = list(core_gene_graph.nodes()) - - # Initialise gene and pas counters - j = 0 - n_pas = 1 - - # Initialise list to hold possible rearrangements - possible_rearrangement_genes = [] - - # Get the number of cliques and nodes that do not have 2 degrees - n_cliques = len([clique for clique in nx.enumerate_all_cliques(core_gene_graph) if len(clique) >= 3]) - n_degrees = sum([1 for node in core_gene_graph.degree if node[1] != 2]) - - # Print initial info - print(f'Cleaning core genome graph\n') #TODO - MAKE MORE CLEAR - - print(f'Pas number: {n_pas}') - print(f'Number of cliques present in graph: {n_cliques}') - print(f'Number of nodes not having degree of 2 present in graph: {n_degrees}\n') - - # Start cleaning core gene graph until done - while True: - neighbours = 0 - - # Search until a node with at least three degrees is found - while neighbours < 3: - # Check if the last node is reached and if there are no more cliques and if all nodes has 2 degrees, - # if then return the graph - all is done - if j == len(graph_nodes) and n_cliques == 0 and n_degrees == 0: - print('Finished cleaning core genome graph. 
No more cliques and all nodes has two edges') - nx.write_gml(core_gene_graph, join(output_path, 'cleaned_core_graph.gml')) - return core_gene_graph, possible_rearrangement_genes - - # Check if last node is reached, then restart as not all nodes has been resolved for additional degrees - elif j == len(graph_nodes): - j = 0 - n_pas += 1 - - nx.write_gml(core_gene_graph, join(output_path, f'prelim_core_graph_{n_pas}.gml')) - print(f'Pas number: {n_pas}') - print(f'Number of cliques present in graph: {n_cliques}') - print(f'Number of nodes not having degree of 2 present in graph: {n_degrees}\n') - - if n_pas == 50: - cluster_2_degrees = [node[0] for node in core_gene_graph.degree if node[1] != 2] - - nx.write_gml(core_gene_graph, join(output_path, 'prelim_core_graph.gml')) - - raise NotImplementedError(f'The core graph cleaning process reached 50 rounds. Is it stuck? ' - f'Please check that the number of cliques and degrees not having 2 edges ' - f'has gone down during the couple of passes!\n' - f'Core genes with multiple edges: {cluster_2_degrees}') - - # If the final node is not reached fetch the next node in line and examine its number of degrees - else: - edges_oi = core_gene_graph.edges(graph_nodes[j]) - - neighbours = len(edges_oi) - - j += 1 - - # Get the weight of each edge leading for a given node - edge_weights = [core_gene_graph.get_edge_data(edge[0], edge[1])['weight'] for edge in edges_oi] - - # Find the least weighted edge - min_edge_index = list(np.where(edge_weights == np.amin(edge_weights))[0]) - - # If a single minimum weight can be found or - # if the number of degrees is at least three larger than the number of degrees with the minimum weight, - # then pressed remove the degrees - if len(min_edge_index) == 1 or len(edges_oi) > len(min_edge_index) + 3: - edges_oi = list(edges_oi) - - minimum_edge_pairs = [edges_oi[index] for index in min_edge_index] - # print(f"minimum_edge_pairs {minimum_edge_pairs}") - # Check if node at the end of edge has at 
least 3 or more edges, if then remove if not then keep edge - # print(f'Node in question: {graph_nodes[j - 1]}') - # neighbour_nodes = set([node for tup in minimum_edge_pairs for node in tup]) - - # neighbour_nodes.remove(graph_nodes[j - 1]) - # print(f'Nodes round with low edge weight: {neighbour_nodes}') - # minimum_edge_pairs = [node for node in neighbour_nodes if core_gene_graph.degree(node) > 2] - minimum_edge_pairs = [pair for pair in minimum_edge_pairs if core_gene_graph.degree(pair[1]) > 2] - # print(f'Number of degrees for neighbours: {[core_gene_graph.degree(node) for node in neighbour_nodes]}') - # print(f'Nodes to which edges should be removed: {minimum_edge_pairs}') - - # [core_gene_graph.remove_edge(*list(edges_oi)[index]) for index in minimum_edge_pairs] - [core_gene_graph.remove_edge(*pair) for pair in minimum_edge_pairs] - - # possible_rearrangement_genes += [edges_oi[index] for index in minimum_edge_pairs] - possible_rearrangement_genes += [pair for pair in minimum_edge_pairs] - - # Recalculate degree and clique numbers - n_cliques = len([clique for clique in nx.enumerate_all_cliques(core_gene_graph) if len(clique) >= 3]) - n_degrees = sum([1 for node in core_gene_graph.degree if node[1] != 2]) - - -def get_most_frequent_gene(gene_list): - genes, frequency = np.unique(gene_list, return_counts=True) - max_index = np.where(frequency == np.amax(frequency)) - most_frequent_gene = genes[max_index][0] - - return most_frequent_gene - - -def get_new_neighbours(current_gene, visited_genes, core_gene_graph): - """ Function to get new neighbours and their weights and filter away already visited neighbours """ - neighbours = core_gene_graph.neighbors(current_gene) - - new_neighbours = [neighbour for neighbour in neighbours if neighbour not in visited_genes] - - edge_weights = [core_gene_graph.get_edge_data(current_gene, neighbour)['weight'] for neighbour in new_neighbours] - - return new_neighbours, edge_weights - - -def 
find_core_gene_synteny(core_gene_graph, start_gene_clusters, second_gene_cluster): - """ Function to identify core gene synteny from core gene graph - Also returns a set of genes that possibly are associated with rearrangements""" - # Check that at least one start gene is found, if then get most frequent gene in first and second position. - if len(start_gene_clusters) > 0: - start_gene = get_most_frequent_gene(start_gene_clusters) - second_gene = get_most_frequent_gene(second_gene_cluster) - - print(f"Most frequent first gene in complete genomes and " - f"chosen as starting gene for consensus core gene synteny is: {start_gene}") - print(f"Most frequent second gene in complete genes and used to inform core gene synteny is: {second_gene}") - - # Get place to start core gene synteny - visited_list = [start_gene, second_gene] - new_neighbours, edge_weights = get_new_neighbours(second_gene, visited_list, core_gene_graph) - - # Initialise list to hold the weights for core-core edges. - # Connection to first core gene - _, edge_weights_first = get_new_neighbours(second_gene, [], core_gene_graph) - # Connection to second core gene - third_gene, edge_weights_second = get_new_neighbours(second_gene, [start_gene], core_gene_graph) - # start core coverage output - core_path_coverage = [[start_gene, second_gene, edge_weights_first[0]], - [second_gene, third_gene[0], edge_weights_second[0]]] - - # TODO - Else, when only contigged genomes are pressent (Search for dnaA? first naively using regex in list then more sofisticated) - - # Initialise list to store genes of interest and narrow the search for rearrangements - possible_rearrangement_genes = [] - - # Search Graph for path - # TODO - The logic of running until new_neoghbours is zero can be dangerous if only incomplete genomes are available - # as a break in the same place in all of them may lead to a truncated consensus. Implement a check to see - # if all core genes are included. 
- - # TODO - Fix so that all nodes are visited in graph to constuct consensus core genome. - while len(new_neighbours) > 0: - # Check if more than one neighbour is present if, then choose the one with largest edge weight, - # Else walk along only path available - if len(new_neighbours) > 1: - max_weight = np.max(edge_weights) - largest_weight_index = np.where([index == max_weight for index in edge_weights])[0]#[0] - if len(largest_weight_index) == 1: - best_neighbour = new_neighbours[largest_weight_index[0]] - else: - raise ValueError(f'Multiple core gene neighbours were equally likely to neighbour to: {visited_list[-1]},\n' - f'The new neighbours variable was: {new_neighbours}\n' - f'The edge weights were: {edge_weights}.') - # TODO - Handle better! - - else: - best_neighbour = new_neighbours[0] - - # Add next node in core genome synteny - visited_list.append(best_neighbour) - new_neighbours, edge_weights = get_new_neighbours(best_neighbour, visited_list, core_gene_graph) - coverage_pairs = map(lambda best, new, cov: - [best, new, cov], - [best_neighbour]*len(new_neighbours), new_neighbours, edge_weights) - # for i in coverage_pairs: - # print(i) - # print(list(coverage_pairs)) - # core_path_coverage = core_path_coverage + [edge_weights] - core_path_coverage = core_path_coverage + list(coverage_pairs) - # TODO - Use core_path_coverage to produce a plot that show the how much coverage each edge has. - # TDOD - possibly produce a list and an output file that contain only the coverage of consensus core gene synteny. 
- - # Check if more than one neighbour is available due to possible rearrangements - if len(new_neighbours) > 1: - # Map possible combinations of branch in core-core genome path - possible_pairs = list(map(lambda e: [best_neighbour, e], new_neighbours)) - possible_rearrangement_genes = possible_rearrangement_genes + possible_pairs - - # Insert connections from last gene to first gene - _, end_to_start_coverage = get_new_neighbours(visited_list[-1], [visited_list[-2]], core_gene_graph) - core_path_coverage = core_path_coverage + [[visited_list[-1], start_gene, end_to_start_coverage[0]]] - - print(f"visited_list[-1] {visited_list[-1]}") - print(f"start_gene{start_gene}") - - print(f"Number of nodes in core graph: {core_gene_graph.number_of_nodes()}") - print(f'Number of nodes visited: {len(visited_list)}') - - return visited_list, possible_rearrangement_genes, core_path_coverage - - -def identify_no_accessory_segments(double_edge_segements, combined_acc_gene_count): - # Go through each segment flanked by a core gene with >2 edges and identify segments with no accessory between them - - # Create dict of subsegments of the larger segments - # DO not use dict.fromkeys() with value being a list, as this uses the same list in all keys. - sub_segment_dict = {key: [] for key in double_edge_segements} - - # Go through segments - for segment in double_edge_segements: - empty_segment_genes = [] - - cur_segment = double_edge_segements[segment] - # Check each region of the segment for core genes - for i in range(0, len(cur_segment)-1): - core_neighbours = sorted([cur_segment[i], cur_segment[i+1]]) - core_region = f'{core_neighbours[0]}--{core_neighbours[1]}' - # Get accessory genes in region - core_region_acc_genes = combined_acc_gene_count[core_region] - - # If number of core genes is zero then add the core pair to current segment and increment the counter for the length of current pair - # Else add the segment to the directory of sub segments and reset counters. 
- - # If core region does not contain accessory genes, add to current segment. Else add the segment and start a new - if core_region_acc_genes == 0: - # If first pair in segment add both, if not first only add the last gene - if len(empty_segment_genes) == 0: - empty_segment_genes += [cur_segment[i], cur_segment[i+1]] - else: - empty_segment_genes += [cur_segment[i+1]] - - else: - # Check if first pair in subsegment and add first gene as being 'lonely' - if len(empty_segment_genes) == 0: - empty_segment_genes += [cur_segment[i]] - - # Record the segment and reset the subsegment to contain no core genes - sub_segment_dict[segment].append(empty_segment_genes) - empty_segment_genes = [] - - # Check of segment end has been reached and more than two genes are in the segment, if then add the segment - if i == len(cur_segment) - 2 and len(empty_segment_genes) >= 2: - sub_segment_dict[segment].append(empty_segment_genes) - empty_segment_genes = [] - # Check if the second gene in pair is last in segment, and accessory genes are present between second to last and last core gene, - # if then add the last gene as being 'lonely' - elif i == len(cur_segment) - 2: - empty_segment_genes += [cur_segment[i + 1]] - sub_segment_dict[segment].append(empty_segment_genes) - - return sub_segment_dict - - -def determine_genome_segments(core_neighbour_pairs, combined_acc_gene_count, num_gffs): - # Construct a graph from core gene neighbours - core_graph = construct_core_graph(core_neighbour_pairs) - - # Find segments in the genome between core genes with multiple neighbors - double_edge_segements, connect_dict, multi_edge_nodes = identify_segments(core_graph, num_gffs) - - if double_edge_segements is not None: - # Find segments of core genes with no accessory in between - no_acc_segments = identify_no_accessory_segments(double_edge_segements, combined_acc_gene_count) - else: - no_acc_segments = None - - return double_edge_segements, no_acc_segments - - -def 
determine_core_gene_consesus(core_neighbour_pairs, start_gene_cluster, second_gene_cluster, output_path): - core_gene_graph = construct_core_graph(core_neighbour_pairs, output_path) - - # TODO - Clean up graph and find alternative core pairs - core_gene_graph, possible_rearrangement_genes = clean_core_graph(core_gene_graph, output_path) - - consensus_core_genome, \ - _, \ - core_path_coverage = find_core_gene_synteny(core_gene_graph, - start_gene_cluster, - second_gene_cluster) - - # TODO - Guess size of rearrangement using distance from genome and fill in with average, if missing. - - return consensus_core_genome, possible_rearrangement_genes, core_path_coverage - - -def assign_core_gene_synteny_types(alternative_core_pairs, gff_names): - """ Function that assigns a type to each genome based on its core genome synteny and the consensus. - returns a dict with each genome as a key and the value being the consensus core genome type. - a type = 1 is consensus everything else is not consensus.""" - # Get name of genomes - gff_names = [basename(gff).split('.')[0] for gff in gff_names] - if '_corrected' in gff_names[0]: - gff_names = [gff.split('_corrected')[0] for gff in gff_names] - - # construct dict with each genome as key and value as core gene synteny type - type_dict = dict.fromkeys(gff_names) - - # Get list of all genomes with an alternative core pair - alternative_genomes = alternative_core_pairs.keys() - - # Construct dict to hold all alternative core-pair and their associated type - alt_core_comp_types = {} - - # Initialise alternative type counter - type_counter = 2 - # Go through all genomes and assign core gene synteny type dependant on their composition - for key in type_dict.keys(): - # Check if alternative core gene neighbours are present, - # If they give search for type, if not then give consencus type, 1. 
- if key in alternative_genomes: - # extract alternative core neighbours and sort and make string to be possible key in dict - current_pairs = alternative_core_pairs[key] - current_pairs.sort() - current_pairs = str(current_pairs) - - # Check if combination of alternative core gene neighbours has been assigned a type, - # if not assign new type. - if current_pairs in list(alt_core_comp_types.keys()): - type_dict[key] = alt_core_comp_types[current_pairs] - else: - alt_core_comp_types[str(alternative_core_pairs[key])] = type_counter - type_dict[key] = type_counter - type_counter = type_counter + 1 - else: - type_dict[key] = 1 - - return type_dict, alt_core_comp_types - - -def identify_rearrangements(consensus_core_genome, possible_rearrangement_genes, master_info_dict, gff_names): - - # Correct gff file names - # if '_corrected' in gff_names[0]: - # gff_names = [name.split('corrected') for name in gff_names] - - # Pair all neighbouring genes in the consensus core genome - core_genome_pairs = [] - for i in range(len(consensus_core_genome)): - sorted_neighbours = sorted([consensus_core_genome[i], consensus_core_genome[i-1]]) - core_genome_pairs.append(f'{sorted_neighbours[0]}--{sorted_neighbours[1]}') - - # Make searchable set of all core gene pairs - core_genome_pairs = set(core_genome_pairs) - - # TODO! - # Search each key that contain a gene with possible rearrangements. - # If two core gene pairs is not present then predict rearrangement and size. - store genome and rearrangements - # If only one is present predict possible rearrangements, but unsure due to only one core pair identified - # If three or more then it is complex and no further prediction can be made. - Possible contamination? 
- - # initialize dict of alternative core gene neighbours - alt_core_pairs = {} - - # Identify the alternative core gene neighbours for each genome: - for key in master_info_dict.keys(): - split_key = key.split("--") - if "Sequence_break" != split_key[0] and "Sequence_break" != split_key[1]: - core_pair = {f"{split_key[0]}--{split_key[1]}"} - - difference = core_pair.difference(core_genome_pairs) - if len(difference): - try: - alt_core_pairs[split_key[2]] = alt_core_pairs[split_key[2]] + list(difference) - except KeyError: - alt_core_pairs[split_key[2]] = list(difference) - - - print(f'Number of genomes with alternative core neighbours {len(alt_core_pairs.keys())} - ' - f'{round(len(alt_core_pairs.keys())/len(gff_names) * 100, ndigits=1)}%\n') # TODO - Change to say number out of total instead of a percentage - - # Record the number of times each alternative core pair occur - alt_core_pair_count = {} - for key in alt_core_pairs.keys(): - for pair in alt_core_pairs[key]: - if pair in list(alt_core_pair_count.keys()): - alt_core_pair_count[pair] += 1 - else: - alt_core_pair_count[pair] = 1 - - # Assign core gene synteny types to genomes - core_genome_types, alt_core_comp_types = assign_core_gene_synteny_types(alt_core_pairs, gff_names) - - return alt_core_pairs, alt_core_pair_count, core_genome_types, alt_core_comp_types - - -def simple_rearrangement_prediction(gene_pairs, consensus_core_genome): - # print("predicting rearrangements") - # Pair all neighbouring genes in consensus sequence - core_genome_pairs = [] - for i in range(len(consensus_core_genome)): - sorted_neighbours = sorted([consensus_core_genome[i], consensus_core_genome[i - 1]]) - core_genome_pairs.append(f'{sorted_neighbours[0]}--{sorted_neighbours[1]}') - - - # TODO - Check if alternative neighbours are neighbours in the concensus if paired differently - individual_genes = gene_pairs[0].split('--') + gene_pairs[1].split('--') - - new_gene_pairs = [f'{individual_genes[0]}--{individual_genes[2]}', - 
f'{individual_genes[1]}--{individual_genes[3]}'] - - if new_gene_pairs[0] in core_genome_pairs and new_gene_pairs[1] in core_genome_pairs: - print("Solution found") - else: - new_gene_pairs = [f'{individual_genes[1]}--{individual_genes[2]}', - f'{individual_genes[0]}--{individual_genes[3]}'] - - if new_gene_pairs[0] in consensus_core_genome and new_gene_pairs[1] in consensus_core_genome: - print("Solution found") - - -def characterise_rearrangements(alt_core_pairs, consensus_core_genome): - print("characterising rearrangements") - rearrangement_predictions = [[], []] - - # Go through each genome with alternative core pairs - for genome in alt_core_pairs.keys(): - - # If only one alternative pair is found, too little info is available. (May be due to contig breaks) - if len(alt_core_pairs[genome]) == 1: - rearrangement_predictions[0].append(genome) - rearrangement_predictions[1].append('Too little information') - - # If exactly two alternative core pairs are present it is possible to make a prediction, with some uncertainty. - if len(alt_core_pairs[genome]) == 2: - rearrangement_predictions[0].append(genome) - rearrangement_predictions[1].append('Possible prediction') - - simple_rearrangement_prediction(alt_core_pairs[genome], consensus_core_genome) - - # If more than two are present then more than one solution is possible and no prediction can be made. - if len(alt_core_pairs[genome]) > 2: - rearrangement_predictions[0].append(genome) - rearrangement_predictions[1].append('Possible prediction') - # TODO - Look at predicting more complex rearrangements. - - '''Use consensus core synteny to work out how rearrangements may have occured. 
- If A and B are connected in the concesus and C and D - are connected, then if A and C are connected, and B and D are found next to - sequence breaks then it may be a possible recombination''' - - - return rearrangement_predictions # TODO - Use this output - - -def determine_partners_neighbours(alt_core_gene, core_genome_pairs, cur_genome_pairs): - # Find consensus neighbours for genes in the alternative pair - consensus_pairs = [gene for gene in core_genome_pairs if gene[0] is alt_core_gene or gene[1] is alt_core_gene] - - # Remove consensus pairs found in the current genome - consensus_pairs = [pair for pair in consensus_pairs if pair not in cur_genome_pairs] - - # Flatten list of list to list - consensus_neighbours = [gene for pair in consensus_pairs for gene in pair] - - # Find remove the gene from the alternative pair and keep consensus neighbours - consensus_neighbours = [gene for gene in consensus_neighbours if gene is not alt_core_gene] - - # Find neighbours of the consesus neighbours in the current genome - cur_consensus_neighbours = [pair for pair in cur_genome_pairs if pair[1] in consensus_neighbours - or pair[0] in consensus_neighbours] - - # Flatten the list of neighbours to the consensus neighbours - consensus_neighbours = [gene for pair in cur_consensus_neighbours for gene in pair] - - # Count the number of sequence breaks - partner_sequence_breaks = consensus_neighbours.count('Sequence_break') - - return partner_sequence_breaks - - -def core_pair_matrix(core_genome_types, alt_core_comp_types, alt_core_pair_count, master_info_total, consensus_genome): - """ Function to produce a matrix of presence or absence of alternative core neighbours to be viewed in Phandango""" - # Construct header for output file - header = ['genome'] - for pair in alt_core_pair_count.keys(): - header.append(pair) - - # Initialise the dict that will become the matrix with gene pairs as coluns and genomes as rows. 
- alt_core_pair_matrix = {} - for genome in core_genome_types: - alt_core_pair_matrix[genome] = dict.fromkeys(header[1:], 0) - alt_core_pair_matrix[genome].update({'genome': genome}) - - # Construct dict that contain the genome type as key and alternative core neghbours associated with it as values. - genome_type_gene_dict = {} - for gene_pairs in alt_core_comp_types: - core_gene_type = alt_core_comp_types[gene_pairs] - gene_pairs = gene_pairs.replace('[', '') - gene_pairs = gene_pairs.replace(']', '') - gene_pairs = gene_pairs.split(', ') - - gene_pairs = [eval(gene_pair) for gene_pair in gene_pairs] - - genome_type_gene_dict[core_gene_type] = gene_pairs - - # Fill the matrix with presence (3) or leave as absent (0) - for key in core_genome_types: - genome_type = core_genome_types[key] - if genome_type is not 1: - for core_pair in genome_type_gene_dict[genome_type]: - alt_core_pair_matrix[key][core_pair] = 3 - - # TODO - Go though and see of an isolate contain a sequence break by both genes in an alternative pair. - # if so then mark it as 0.5 or probable. - # Find core genes where both genes are next to a sequence break and can be found in a alternative pair - sequence_break_pairs = [pair.split('--') for pair in master_info_total.keys() if 'Sequence_break' in pair] - # Remove the Sequence_break enteries. Keep gene neighbouring sequnece break and genome name - sequence_break_pairs = [[element for element in pair if element != 'Sequence_break'] for pair in sequence_break_pairs] - - # Pair all neighbouring genes in the consensus core genome - core_genome_pairs = [] - for i in range(len(consensus_genome)): - sorted_neighbours = sorted([consensus_genome[i], consensus_genome[i - 1]]) - core_genome_pairs.append(sorted_neighbours) - - - # TODO - predict if a consensus - core-core neighbourship is possible. 
- # Initiate counters to keep track of core-core variants searched and found - core_variants_predicted_weak = 1 - core_variants_predicted_strong = 1 - core_variants_predicted = 1 - genomes_searched = 1 - # Go through each header and find genomes that contain both core genes of a alternative pair next to a sequence break - for cur_genome in alt_core_pair_matrix.keys(): - # Find breaks for current genome - breaks_cur_genome = [pair for pair in sequence_break_pairs if cur_genome in pair] - if len(breaks_cur_genome): - # Isolate core gene neighbour pairs for genome - cur_genome_core_pairs = [pair.split('--')[0:2] for pair in master_info_total.keys() if cur_genome in pair] - - # Go through all alternative core pairs and examine the ones not present in the current genome. - for alt_core_pair in alt_core_pair_matrix[cur_genome]: - # Check that genome entry is not checked - if alt_core_pair is not 'genome': - # Check that the alternative core neighbours are not present - if alt_core_pair_matrix[cur_genome][alt_core_pair] != 2: - # Split alternative variant into two elements in list - core_pair = alt_core_pair.split('--') - - # Search if a combination of core genes near sequnce breaks can match the alternative pair, - # If then set evidence level to 1. 
- if len([gene for gene in breaks_cur_genome if gene[0] in core_pair[0]]) and \ - len([gene for gene in breaks_cur_genome if gene[0] in core_pair[1]]) \ - or \ - len([gene for gene in breaks_cur_genome if gene[0] in core_pair[1]]) and \ - len([gene for gene in breaks_cur_genome if gene[0] in core_pair[0]]): - - # Check that the possible alternative pair is not because of many breaks in genome - alt_core_pair_possible = False - - seq_breaks_gene_1 = determine_partners_neighbours(core_pair[0], - core_genome_pairs, - cur_genome_core_pairs) - seq_breaks_gene_2 = determine_partners_neighbours(core_pair[1], - core_genome_pairs, - cur_genome_core_pairs) - if seq_breaks_gene_1 > 0 and seq_breaks_gene_2 > 0: - alt_core_pair_possible = True - - if alt_core_pair_possible: - alt_core_pair_matrix[cur_genome][alt_core_pair] = 1 - core_variants_predicted_weak += 1 - else: - alt_core_pair_matrix[cur_genome][alt_core_pair] = 2 - core_variants_predicted_strong += 1 - - core_variants_predicted += 1 - - - # Increment the counter for variants searched - genomes_searched += 1 - - # TODO - set as verbose operated: - print(f'A total of {genomes_searched} genomes were searched for alternative' - f' core-core neighbours separated by a sequence break') - print(f'{core_variants_predicted} alternative core-core neighbours were predicted, when examining genomes with ' - f'sequnce breaks.') - print(f'{core_variants_predicted_weak} of the predicted alternative core neighbours had weak evidence') - print(f'{core_variants_predicted_strong} of the predicted alternative core neighbours had strong evidence') - - # Convert to list to be writen as output file - matrix_list = [dict_content for dict_content in alt_core_pair_matrix.values()] - - return [matrix_list, header] - - -def construct_directed_graph(core_graph): - # Find a node that contains only two edges - node_degrees = list(core_graph.degree) - degree_counter = 0 - - n_degrees = node_degrees[degree_counter][1] - while True: - degree_counter += 
1 - - if degree_counter == len(node_degrees): - raise NotImplementedError("No nodes in the network has two degrees!") - - n_degrees = node_degrees[degree_counter][1] - - if n_degrees == 2: - neighbours = list(core_graph.neighbors(node_degrees[degree_counter][0])) - neighbour_1_ndegree, neighbour_2_ndegree = [core_graph.degree(neighbour) for neighbour in neighbours] - - if neighbour_1_ndegree == 2: - split_index = 0 - - elif neighbour_2_ndegree == 2: - split_index = 1 - - split_neighbour = neighbours[split_index] - break - - # Extract the node with only two degrees - split_node = node_degrees[degree_counter][0] - - print(f'split_node: {split_node}') - print(f'split_neighbour: {split_neighbour}') - # Remove the edge between the two nodes: - core_graph.remove_edge(split_node, split_neighbour) - - # Go through and construct all edges in directed graph - di_core_graph = nx.DiGraph() - - previous_nodes = [split_node] - all_visited_nodes = [] - last_visited_nodes = [] - all_edges_added = [] - alternative_edges = [] - - print(f'Number of of nodes in input graph: {len(core_graph.nodes)}') - # REMOVE - counter = 0 - - while previous_nodes != [split_neighbour] and len(previous_nodes) != 0: - if counter == 200: - print(counter) - break - - next_pairs = [(node, gene) for node in previous_nodes for gene in get_new_neighbours(node, last_visited_nodes, core_graph)[0]] - - # filter edges that are already added with direction - next_pairs = [pair for pair in next_pairs if (pair[1], pair[0]) not in all_edges_added] - - dup_next_pair = [pair for pair in next_pairs if (pair[1], pair[0]) in next_pairs] - - # print(f'Number of duplicated pairs: {len(set(dup_next_pair))}') - dup_next_pair = list(set(dup_next_pair)) - # Go though all duplicated pairs until they have been resolved - while len(dup_next_pair) > 0: - cur_pair = dup_next_pair.pop() - - # Constuct the alternative - alt_dup_pair = (cur_pair[1], cur_pair[0]) - - # Remove the pair and the alternative - next_pairs = [pair for pair 
in next_pairs if pair != alt_dup_pair] - next_pairs = [pair for pair in next_pairs if pair != cur_pair] - - alternative_edges.append((cur_pair[1], cur_pair[0])) - - dup_next_pair.remove((cur_pair[1], cur_pair[0])) - - next_pairs.append(cur_pair) - - [di_core_graph.add_edge(*pair, weight=core_graph.get_edge_data(*pair)['weight']) for pair in next_pairs] - - all_visited_nodes += [pair[1] for pair in next_pairs] - all_edges_added += next_pairs - last_visited_nodes = [pair[0] for pair in next_pairs] - previous_nodes = [pair[1] for pair in next_pairs] - - counter += 1 - - print(f'Number of node in di_core_graph: {len(di_core_graph.nodes)}') - print(f'Number of edges {len(di_core_graph.edges)}') - - # print([node if node in enumerate(di_core_graph.degree) if node[1] > ]) - degrees = core_graph.degree - for node in di_core_graph.degree: - for ori_node in degrees: - if ori_node[0] == node[0] and ori_node[1] < node[1]: - print(f'node {node}') - print(f'ori_node {ori_node}') - - return di_core_graph, alternative_edges - # Remove one of the edges from that node - - # Remember what the nodes' names were - - # Make all edges from one node to the other directed with weight - - -def identify_segments(core_graph, num_gffs): - """ Identify all segments (paths) of the graph the goes from one node with >2 degrees to the next, - where all nodes in between contain only two degrees and thus the path has no ambiguity """ - - # Identify all nodes that contain more than two degrees. - multi_edge_nodes = [node for node, connections in core_graph.degree if connections > 2] - - # Check if any multi node edges are present, if not then return. - if len(multi_edge_nodes) == 0: - return None, None, None - # raise NotImplementedError("A core gene graph with no nodes having more than two degrees was constructed.") - - # identify the multi connected nodes that are connected to one another, as these do not need to be searched for a simple path between them. 
- connect_dict = {} - - # Identify neighbouring nodes with >2 degrees for all the nodes with >2 degrees themself - for node in multi_edge_nodes: - connect_dict[node] = [neighbor for neighbor in core_graph.neighbors(node) if neighbor in multi_edge_nodes] - - # Turn the weight into a 'distance' or number of times not found together. - for edge in core_graph.edges(data=True): - core_graph[edge[0]][edge[1]]['weight'] = num_gffs - core_graph[edge[0]][edge[1]]['weight'] - - # find all simple paths between nodes with >2 degrees - double_edge_segements = {} - - # Go through all source and taget nodes and see if a path can be found where all nodes between them have only two degrees - for source_node in multi_edge_nodes: - for target_node in multi_edge_nodes: - if target_node != source_node and target_node not in connect_dict[source_node]: - # Get path (segment) segment from source to target - segment = nx.shortest_path(core_graph, source_node, target_node, weight='weight') - - # Get length of path - segment_length = len(segment) - - # Get length of segment with multi nodes removed - two_degree_segment_length = len([node for node in segment if node not in multi_edge_nodes]) - - # Check if no node between the source and target has more than two edges, if not then record the segment/path - if segment_length - 2 == two_degree_segment_length: - # Construct name for path - source_target_name = sorted([source_node, target_node]) - source_target_name = f'{source_target_name[0]}--{source_target_name[1]}' - - # Check that path has not been recorded in the opposite direction, if not then record it - if source_target_name not in double_edge_segements: - double_edge_segements[source_target_name] = segment - else: - if double_edge_segements[source_target_name] != segment[::-1]: - raise NotImplementedError("Path from one node to another was found, but did not match previously found path!") - - # Calculate the expected number of paths - total_edges_from_multi_edge_nodes = sum([connections 
for _, connections in core_graph.degree if connections > 2]) - num_edges_between_multi_edge_nodes = sum([len(connect_dict[key]) for key in connect_dict]) - expected_segment_number = int((total_edges_from_multi_edge_nodes / 2) - (num_edges_between_multi_edge_nodes / 2)) - - # Check if less than the number of expected paths has been found, if then try to identify missing paths - if expected_segment_number != len(double_edge_segements): - # Get number of expected edges for each node that has more than two edges - expected_edge_num_dict = {node: connections for node, connections in core_graph.degree if connections > 2} - - # Get the number of edges directly between multi connected nodes - identified_edge_num_dict = {node: len(connect_dict[node]) for node in connect_dict} - - # Get the number of paths connecting multi connected nodes via segments found in previous loop - for connection in double_edge_segements: - connection_nodes = connection.split('--') - for node in connection_nodes: - identified_edge_num_dict[node] += 1 - - # Compare the identified number of connections expected to the identified, to find nodes that are missing connections - nodes_missing_connections = [] - for node in expected_edge_num_dict: - if identified_edge_num_dict[node] != expected_edge_num_dict[node]: - nodes_missing_connections.append(node) - - # Go through nodes that are missing at least one path and try to identify missing paths - for node in nodes_missing_connections: - - for current_target_node in nodes_missing_connections: - # Check that the current target node is not a neighbouring node or the current node itself - if current_target_node not in connect_dict[node] and current_target_node != node: - - # Copy the graph to manipulate it - core_graph_copy = core_graph.copy() - - # Extract the nodes for source and target - source_node = node - target_node = current_target_node - - # Construct a pair name - suspected_pair = sorted([source_node, target_node]) - suspected_pair = 
f'{suspected_pair[0]}--{suspected_pair[1]}' - - # Check that the pair has not been found in a previous run - if suspected_pair not in double_edge_segements: - # Counter to stop loop - counter = 0 - # Identifier to see if path has been found, to stop loop - path_identified = False - while not path_identified: - counter += 1 - - # Get all shortest path between source and target. - all_shortest_paths = nx.all_shortest_paths(core_graph_copy, source_node, target_node) - - # Go through each path to see if is satisfies the criteria - try: - for index, path in enumerate(all_shortest_paths): - # Get length of path - segment_length = len(path) - - # Get length of segment with multi nodes removed - two_degree_segment_length = len([node for node in path if node not in multi_edge_nodes]) - - # Check that the path does not contain nodes with >2 degrees outside of source and target, - # if then add path if not then find nodes that has >2 edges and remove an edge that leads to the to break path for next run through loop - if segment_length - 2 == two_degree_segment_length and two_degree_segment_length != 0: - # Add in path - double_edge_segements[suspected_pair] = path - path_identified = True - pass - - else: - # Check if path is length >2, if then find >2 degree nodes and remove an edge to them, if not just remove edge found between nodes. - if len(path) > 2: - multi_node_in_path = [[path[index], path[index+1]] for index, node in enumerate(path) if node in multi_edge_nodes and node != source_node and node != target_node] - # print(f'list(set(multi_node_in_path)) {list(set(multi_node_in_path))}') - for multi_node_pair in multi_node_in_path: - # Try to remove edge found to multi node, if already removed move on. 
- try: - core_graph_copy.remove_edge(*multi_node_pair) - except nx.exception.NetworkXError: - continue - # raise ValueError("NetworkX was not able to remove an edge in the core graph network during segment identification!") - - else: - core_graph_copy.remove_edge(*path) - - if counter == 1000: - raise IndexError("Counter reached limit! in detecting a new path for pair.") - except nx.NetworkXNoPath: - # No simple paths could be found for the source and target thus the while loop is terminated. - path_identified = True - - return double_edge_segements, connect_dict, multi_edge_nodes - - - - # TODO - Add in genes that contain more than two edges and are only connected to other genes with more than two edges. These should be added as segments! - - # This idea scales exponentially with the number of degrees that has >2 degrees. - - # Another idea could be to move out from a node, until another node is identifies and then save the path. - -def find_min_path(simple_path, recquired_len, i): - print(f'process {i}') - print(simple_path) - print(recquired_len) - if len(simple_path) == recquired_len: - print("nice length") - return True - else: - print("too short") - return False - - -def connect_segments(double_edge_segements, connect_dict, multi_edge_nodes, core_graph): - # Construct some represenatation of edge weights between end nodes in segments - # Calculate the best path through them. All simple paths, that fulfill the number of all nodes and then search for largest value? 
- - # Build simplified network - segment_graph = nx.Graph() - - for node in connect_dict: - # Add in direct connections - for conencted_node in connect_dict[node]: - conenction_weight = core_graph.get_edge_data(node, conencted_node)['weight'] - conenction_distance = 532 - conenction_weight + 1 - - segment_graph.add_weighted_edges_from([(node, conencted_node, conenction_distance)]) - - # Add in segment connections - for segment in double_edge_segements: - source, target = segment.split('--') - conenction_distance = 0 - - segment_graph.add_weighted_edges_from([(source, target, conenction_distance)]) - - # Remove all edges that connect a core gene, between two segments, to another core gene elsewhere in the graph. - - nx.write_gml(G=segment_graph, path='/Users/mjespersen/Documents/Davies_scripts/segment_graph.gml') - - all_simple_paths = nx.all_simple_paths(segment_graph, source, target) - - num_nodes = len(segment_graph.nodes) - num_segments = sum([1 for *_, info in list(segment_graph.edges(data=True)) if info['weight'] == 0]) - - # TODO - TRY TO SPEED UP THIS PROCEESS BY MULTIPROCESSING - # TODO - insert check if more than one core is given, if then use multiprocessing, if not then run in for-loop. - # with concurrent.futures.ProcessPoolExecutor(max_workers=2) as executor: - # results = [executor.submit(find_min_path, path, num_nodes, i) for i, path in enumerate(all_simple_paths)] - # - # print(results) - # for re_value in concurrent.futures.as_completed(results): - # print(re_value.result()) - - # for index, path in enumerate(all_simple_paths): - # if index == 100: - # exit() - # - # print(find_min_path(path, num_nodes)) - - # counter = 1 - min_path = [] - min_path_weight = 999999999 - best_path_weights = [] - alternative_bests = [] - # TODO - speed this up by multi proccessing. Divide the list of all simple paths into smaller chunks and get the smallest value as the return from them. 
- start_time = time() - for index, path in enumerate(all_simple_paths): - if len(path) == num_nodes: - - sum_cur_path_weight = 0 - cur_path_weights = [] - for pair in zip(path[1:], path[:-1]): - path_weight = segment_graph.get_edge_data(*pair)['weight'] - sum_cur_path_weight += path_weight - cur_path_weights.append(path_weight) - - # Check if the found path has a accumulated wight small than the previous best edge and if all segments has been visited - # if min_path_weight > sum_cur_path_weight and cur_path_weights.count(0) == num_segments: - if sum_cur_path_weight < min_path_weight: # cur_path_weights.count(0) == num_segments: - print('new best path') - min_path = path - min_path_weight = sum_cur_path_weight - best_path_weights = cur_path_weights - num_segments = cur_path_weights.count(0) - print(f"num_segments {num_segments}") - - # Rest alternative bests - alternative_bests = [] - print(best_path_weights) - - elif min_path_weight > sum_cur_path_weight: - alternative_bests.append(path) - - if index % 10000000 == 0: - print(f'Round: {index} reached') - - print(f"num_segments {num_segments}") - print(f'time: {time() - start_time}') - -if __name__ == '__main__': - G = nx.read_gml('/Users/mjespersen/Downloads/core_graph.gml') - - double_edge_segements, connect_dict, multi_edge_nodes = identify_segments(G) - - connect_segments(double_edge_segements, connect_dict, multi_edge_nodes, G) - -# TODO - Find a way to give the segments in an output. -# * Find all segments that contain no core genes with >2 degrees -# * Find all segments that contain no accesory genes between core genes. 
\ No newline at end of file diff --git a/Corekaburra/__main__.py b/Corekaburra/__main__.py index dbaa621..3fba4e6 100644 --- a/Corekaburra/__main__.py +++ b/Corekaburra/__main__.py @@ -45,6 +45,11 @@ except ModuleNotFoundError: from merge_dicts import merge_dicts_lists, merge_dicts_counts +try: + from Corekaburra.consesus_core_genome import determine_genome_segments +except ModuleNotFoundError: + from consesus_core_genome import determine_genome_segments + from argparse import ArgumentParser from math import floor import sys @@ -142,7 +147,7 @@ def main(): source_program, input_pres_abs_file_path = define_pangenome_program(args.input_pan) # Check if gene_data file is present if Panaroo input is given an gffs should be annotated - if args.annotate and source_program is 'Panaroo': + if args.annotate and source_program == 'Panaroo': gene_data_path = check_gene_data(args.input_pan) if not args.quiet: print(f"Pan genome determined to come from {source_program}") @@ -222,5 +227,21 @@ def main(): # # time_calculator(time_start, time.time(), "searching gff files for core genes") + print(f"\n--------------Identifying segments in pan genome--------------") + time_start = time.time() + # Count number of unique accessory genes inserted into a core-core region across the genomes + acc_region_count = {key: len(set(core_neighbour_low_freq[key])) for key in core_neighbour_low_freq} + # Count number of unique low frequency genes inserted into a core-core region across the genomes + low_frew_region_count = {key: len(set(core_neighbour_accessory_count[key])) for key in + core_neighbour_accessory_count} + + # Combine the accessory and low frequency counts: + combined_acc_gene_count = {key: low_frew_region_count[key] + acc_region_count[key] for key in low_frew_region_count} + + double_edge_segements, no_acc_segments = determine_genome_segments(core_neighbour_pairs, combined_acc_gene_count, + len(args.input_gffs), core_dict) + + # time_calculator(time_start, time.time(), "identifying 
segments in pan genome") + if __name__ == '__main__': main() diff --git a/Corekaburra/commandline_interface.py b/Corekaburra/commandline_interface.py index 74f138d..35e79dc 100644 --- a/Corekaburra/commandline_interface.py +++ b/Corekaburra/commandline_interface.py @@ -21,8 +21,8 @@ def get_commandline_arguments(args): parser.add_argument('-ig', '--input_gffs', help='Path to gff files used for pan-genome', - required=True, metavar='file_1.gff ... file_n.gff', + required=True, dest='input_gffs', nargs='+') diff --git a/Corekaburra/consesus_core_genome.py b/Corekaburra/consesus_core_genome.py new file mode 100644 index 0000000..ba293f2 --- /dev/null +++ b/Corekaburra/consesus_core_genome.py @@ -0,0 +1,281 @@ +import networkx as nx + + +def construct_core_graph(core_neighbour_pairs): + """ + Function to construct a graph from the core pairs and number of times each is observed. + :param core_neighbour_pairs: Dict of core pairs and the number of times each is observed + :return: A graph with nodes being core genes, edges being a connection between them with a weight of the number of times they are connected + """ + # Initiate core gene graph + G = nx.Graph() + + # Add all core gene pairs and their edges + for core_set in core_neighbour_pairs.keys(): + # split core genes + core_genes = core_set.split('--') + + # Check that sequence break is not present: + if 'Sequence_break' != core_genes[0] and 'Sequence_break' != core_genes[1]: + # Construct edge list + core_genes = [(core_genes[0], core_genes[1], {'weight': core_neighbour_pairs[core_set]})] + # Add edge + G.add_edges_from(ebunch_to_add=core_genes) + + return G + + +def gene_co_occurrence(core_gene_dict, two_gene_segment): + """ + Function to find the number of genomes in which two genes co-occur across the input genomes. 
+ :param core_gene_dict: Dictionary over core genes mapped from genome, to locus_tag, to pan-genome cluster + :param two_gene_segment: List of two genes forming a segment + :return: Int - number of co-occurrences for the two genes in the input two_gene_segment + """ + co_occurrence = 0 + + # Get pan-genome clusters for all genomes in a list of lists + core_gene_presences = [list(core_genes.values()) for core_genes in core_gene_dict.values()] + + # Go through all genomes and check if genes co-occur + for core_gene_set in core_gene_presences: + if set(two_gene_segment).issubset(core_gene_set): + co_occurrence += 1 + + return co_occurrence + + +def identify_no_accessory_segments(double_edge_segements, combined_acc_gene_count): + """ + Function that takes segments between multi connected core genes, and a Dict of number of accessory genes in core-core regions. + Divides segments into smaller subsegments, in which no accessory genes can be found between core pairs. + :param double_edge_segements: Dict of segments of core genes identified. Keys are genes at edges of segments. Value is a List of genes in the segment from one side to the other. + :param combined_acc_gene_count: Dict of the number of accessory genes (value) identified between a set core genes (Key) + :return: Dict of subsegments. Same keys as for the segment dict, but keys are a list of lists. Each sub-list is a subsegment. 
+ """ + # Create dict of subsegments of the larger segments + sub_segment_dict = {key: [] for key in double_edge_segements} + + # Go through segments to identify subsegments + for segment in double_edge_segements: + empty_segment_genes = [] + + cur_segment = double_edge_segements[segment] + # Check each region of the segment for core genes + for i in range(0, len(cur_segment)-1): + core_neighbours = sorted([cur_segment[i], cur_segment[i+1]]) + core_region = f'{core_neighbours[0]}--{core_neighbours[1]}' + # Get accessory genes in region + core_region_acc_genes = combined_acc_gene_count[core_region] + + # If core region does not contain accessory genes, add to current segment. + # Else add the segment and start a new + if core_region_acc_genes == 0: + # If first pair in segment add both, if not first only add the last gene + if len(empty_segment_genes) == 0: + empty_segment_genes += [cur_segment[i], cur_segment[i+1]] + else: + empty_segment_genes += [cur_segment[i+1]] + + else: + # Check if first pair in subsegment and add first gene as being 'lonely' + if len(empty_segment_genes) == 0: + empty_segment_genes += [cur_segment[i]] + + # Record the segment and reset the subsegment to contain no core genes + sub_segment_dict[segment].append(empty_segment_genes) + empty_segment_genes = [] + + # Check of segment end has been reached and more than two genes are in the segment, if then add the segment + if i == len(cur_segment) - 2 and len(empty_segment_genes) >= 2: + sub_segment_dict[segment].append(empty_segment_genes) + empty_segment_genes = [] + # Check if the second gene in pair is last in segment, and accessory genes are present between second to last and last core gene, + # if then add the last gene as being 'lonely' + elif i == len(cur_segment) - 2: + empty_segment_genes += [cur_segment[i + 1]] + sub_segment_dict[segment].append(empty_segment_genes) + + return sub_segment_dict + + +def determine_genome_segments(core_neighbour_pairs, combined_acc_gene_count, num_gffs, 
core_gene_dict): + """ + Function to be called from main that collects the functions for determining core segments in pan-genome + + :param core_neighbour_pairs: Dict of the number of times core pairs have been detected + :param combined_acc_gene_count: Number of accessory and low-frequency genes detected between core gene pairs + :param num_gffs: Number of inputted gff files # TODO - Should this be the minimum number determined by the input cut-off for core genes + + :return double_edge_segements: + :return no_acc_segments: + """ + # Construct a graph from core gene neighbours + core_graph = construct_core_graph(core_neighbour_pairs) + + # Find segments in the genome between core genes with multiple neighbors + double_edge_segements = identify_segments(core_graph, num_gffs, core_gene_dict) + + if double_edge_segements is not None: + # Find segments of core genes with no accessory in between + no_acc_segments = identify_no_accessory_segments(double_edge_segements, combined_acc_gene_count) + else: + no_acc_segments = None + + return double_edge_segements, no_acc_segments + + +def identify_segments(core_graph, num_gffs, core_gene_dict): + """ + Function to identify stretches of core genes between core genes neighbouring multiple different genes + :param core_graph: Graph over core genes with weights being the number of connections between the genes + :param num_gffs: Number of gffs inputted + :return: Dict over stretches of core genes found in the core gene graph. + """ + + # Identify all nodes that contain more than two degrees. + multi_edge_nodes = [node for node, connections in core_graph.degree if connections > 2] + + # Check if any node have multiple edges, if not then return. + if len(multi_edge_nodes) == 0: + return None, None, None # TODO - log and report better that this is the outcome! 
+ + # Dict to hold connections between >2 edge nodes + connect_dict = {} + + # for all nodes with >2 degrees themself, identify neighbouring nodes with >2 degrees + for node in multi_edge_nodes: + connect_dict[node] = [neighbor for neighbor in core_graph.neighbors(node) if neighbor in multi_edge_nodes] + + # Turn the weight into a 'distance' or number of times not found together. + for edge in core_graph.edges(data=True): + core_graph[edge[0]][edge[1]]['weight'] = num_gffs - core_graph[edge[0]][edge[1]]['weight'] + + # find all simple paths between nodes with >2 degrees + double_edge_segements = {} + + # Go through all source and taget nodes, + # see if a path can be found where all nodes between them have only two degrees + for source_node in multi_edge_nodes: + for target_node in multi_edge_nodes: + if target_node != source_node: #and target_node not in connect_dict[source_node]: + # Get path (segment) from source to target + segment = nx.shortest_path(core_graph, source_node, target_node, weight='weight', method='dijkstra') # bellman-ford or dijkstra + + # Get length of path + segment_length = len(segment) + + # Get length of segment with multi nodes removed + two_degree_segment_length = len([node for node in segment if node not in multi_edge_nodes]) + + # Check if no node between the source and target has more than two edges, + # if then move to record the segment/path + if segment_length - 2 == two_degree_segment_length: + # Check if two gene segment occur in every possible genome, if not then skip + if segment_length == 2: + if num_gffs - core_graph[segment[0]][segment[1]]['weight'] < gene_co_occurrence(core_gene_dict, segment): + continue + + # Construct name for path + source_target_name = sorted([source_node, target_node]) + source_target_name = f'{source_target_name[0]}--{source_target_name[1]}' + + # Check that path has not been recorded in the opposite direction, if not then record it + if source_target_name not in double_edge_segements: + 
double_edge_segements[source_target_name] = segment + else: + if double_edge_segements[source_target_name] != segment[::-1]: + raise NotImplementedError("Path from one node to another was found, but did not match previously found path!") # TODO log and nice exit! + + # Calculate the expected number of paths + total_edges_from_multi_edge_nodes = sum([connections for _, connections in core_graph.degree if connections > 2]) + num_edges_between_multi_edge_nodes = sum([len(connect_dict[key]) for key in connect_dict]) + expected_segment_number = int((total_edges_from_multi_edge_nodes / 2) - (num_edges_between_multi_edge_nodes / 2)) + + # Check if less than the number of expected paths has been found, + # if then try to identify missing paths + if expected_segment_number != len(double_edge_segements): + # Get number of expected edges for each node that has more than two edges + expected_edge_num_dict = {node: connections for node, connections in core_graph.degree if connections > 2} + + # Get the number of edges directly between multi connected nodes + identified_edge_num_dict = {node: len(connect_dict[node]) for node in connect_dict} + + # Get the number of paths connecting multi connected nodes via segments found in previous loop + for connection in double_edge_segements: + connection_nodes = connection.split('--') + for node in connection_nodes: + identified_edge_num_dict[node] += 1 + + # Compare the number of connections expected to the number identified, to find nodes that are miss connections + nodes_missing_connections = [] + for node in expected_edge_num_dict: + if identified_edge_num_dict[node] != expected_edge_num_dict[node]: + nodes_missing_connections.append(node) + + # Go through nodes that are missing at least one path and try to identify missing paths + for source_node in nodes_missing_connections: + for target_node in nodes_missing_connections: + # Check that the current target node is not a neighbouring node or the current node itself + if target_node not 
in connect_dict[source_node] and target_node != source_node: + # Copy the graph to manipulate it + core_graph_copy = core_graph.copy() + + # Construct a pair name + suspected_pair = sorted([source_node, target_node]) + suspected_pair = f'{suspected_pair[0]}--{suspected_pair[1]}' + + # Check that the pair has not been found in a previous run + if suspected_pair not in double_edge_segements: + # Counter to stop loop + counter = 0 + # Identifier to see if path has been found, to stop loop + path_identified = False + while not path_identified: + counter += 1 + + # Get all shortest path between source and target. + all_shortest_paths = nx.all_shortest_paths(core_graph_copy, source_node, target_node) + + # Go through each path to see if is satisfies the criteria + try: + for index, path in enumerate(all_shortest_paths): + # Get length of path + segment_length = len(path) + + # Get length of segment with multi nodes removed + two_degree_segment_length = len([node for node in path if node not in multi_edge_nodes]) + + # Check that the path does not contain nodes with >2 degrees outside of source and target, + # if then add path, + # else then find nodes that has >2 edges and remove an edge that leads to the node, to break the path for next run through loop + if segment_length - 2 == two_degree_segment_length and two_degree_segment_length != 0: # TODO - should this != 0 be here? + double_edge_segements[suspected_pair] = path + path_identified = True + continue + else: + # Check if path is length >2, + # if then find >2 degree nodes and remove an edge to them, + # else just remove edge found between nodes. + if len(path) > 2: + multi_node_in_path = [[path[index], path[index+1]] for index, node in enumerate(path) if node in multi_edge_nodes and node != source_node and node != target_node] + for multi_node_pair in multi_node_in_path: + # Try to remove edge found to multi node, if already removed move on. 
+ try: + core_graph_copy.remove_edge(*multi_node_pair) + except nx.exception.NetworkXError: + continue + else: + core_graph_copy.remove_edge(*path) + + if counter == 1000: + raise IndexError("Counter reached limit! in detecting a new path for pair.") + except nx.NetworkXNoPath: + # No simple paths could be found for the source and target thus the while loop is terminated. + path_identified = True + + return double_edge_segements + + +if __name__ == '__main__': + pass \ No newline at end of file diff --git a/functional_tests/test_data/empty_file.expected b/functional_tests/test_data/empty_file.expected index 629a47f..0f6b828 100644 --- a/functional_tests/test_data/empty_file.expected +++ b/functional_tests/test_data/empty_file.expected @@ -1,2 +1,31 @@ -FILENAME NUMSEQ TOTAL MIN AVG MAX -empty_file 0 0 - - - +usage: __main__.py [-h] -ig file_1.gff ... file_n.gff + [file_1.gff ... file_n.gff ...] -ip path/to/pan_genome + [-cg complete_genomes.txt] [-o path/to/output] + [-p OUTPUT_PREFIX] [-a] [-c int] [-l | -q] + +Welcome to Corekaburra! Program to determine consensus core sequence from +multiple genomes. Outputs consensus core gene alignment, distance between core +genes, number of accessory genes between core genes and low frequency genes +between core genes + +optional arguments: + -h, --help show this help message and exit + -ig file_1.gff ... file_n.gff [file_1.gff ... file_n.gff ...], --input_gffs file_1.gff ... file_n.gff [file_1.gff ... file_n.gff ...] 
+ Path to gff files used for pan-genome + -ip path/to/pan_genome, --input_pangenome path/to/pan_genome + Path to the folder produced by Panaroo or Roary + -cg complete_genomes.txt, --complete_genomes complete_genomes.txt + text file containing names of genomes that are to be + handled as complete genomes + -o path/to/output, --output path/to/output + Path to where output files will be placed [default: + current folder] + -p OUTPUT_PREFIX, --prefix OUTPUT_PREFIX + Prefix for output files, if any is desired + -a, --no_annotate_refound + Flag to toggle off the creation of new gff files, with + annotation of refound genes. Only done if input + pangenome is detected as comming from Panaroo + -c int, --cpu int Give max number of CPUs [default: 1] + -l, --log Record program progress in for debugging purpose + -q, --quiet Only print warnings diff --git a/unit_tests/Corekaburra_test.py b/unit_tests/Corekaburra_test.py index 890cfe0..7459b1e 100644 --- a/unit_tests/Corekaburra_test.py +++ b/unit_tests/Corekaburra_test.py @@ -19,6 +19,7 @@ from Corekaburra import parse_gene_presence_absence from Corekaburra import gff_parser from Corekaburra import merge_dicts +from Corekaburra import consesus_core_genome @@ -741,6 +742,9 @@ def test_multiple_wrapped_contigs(self): class TestRecordCoreCoreRegion(unittest.TestCase): + """ + Test function that is used to record information of a region identified between two core genes. + """ def test_recording_neighbouring_core_genes(self): core_genes = {'gff_name': {'Core_ID_1': 'pan_gene_1', 'Core_ID_2': 'pan_gene_2'}} @@ -1292,6 +1296,9 @@ def test_connect_same_gene_as_last_n_first_gene_w_accessory(self): class TestRecordCorelessContig(unittest.TestCase): + """ + Test for function that records a contig on which there is no core genes. 
+ """ def test_adding_coreless_contig(self): coreless_contigs = {} acc_genes_in_region = ['acc_1'] @@ -1321,6 +1328,9 @@ def test_not_adding_coreless_contig(self): class TestSegmentingMockGffs(unittest.TestCase): + """ + Tests for function that takes in a gff file and segments it into core-core regions + """ def test_single_chromosome_complete(self): # Set up input gff_generator = [['gff_name_contig_1', '.', 'CDS', '90', '180', '.', '.', '.', 'acc_ID_1'], @@ -2517,7 +2527,7 @@ def test_something(self): # TODO - What other wired and wonderfull examples can pass -class TestergingDicts(unittest.TestCase): +class TestMergingDicts(unittest.TestCase): """ Functions to merge dictionaries and lists into dictionaries """ # Test merge_dicts_counts def test_merge_dicts_counts_list_empty(self): @@ -2648,5 +2658,469 @@ def test_merge_dicts_lists_mix(self): self.assertEqual(expected_dict, return_dict) +class TestCoreGraphConstruction(unittest.TestCase): + """ + Test the construction of a network made from core gene pairs and their number of connections. 
+ """ + def test_core_gene_graph_construction_circle_case(self): + expected_edges = [('pan_cluster_1', 'pan_cluster_2'), ('pan_cluster_1', 'pan_cluster_6'), ('pan_cluster_2', 'pan_cluster_3'), + ('pan_cluster_3', 'pan_cluster_4'), ('pan_cluster_4', 'pan_cluster_5'), ('pan_cluster_5', 'pan_cluster_6')] + + expected_degrees = [('pan_cluster_1', 2), ('pan_cluster_2', 2), ('pan_cluster_3', 2), ('pan_cluster_4', 2), ('pan_cluster_5', 2), ('pan_cluster_6', 2)] + + expected_edge_weights = [10, 10, 10, 10, 10, 10] + + core_neighbour_pairs = {'pan_cluster_1--pan_cluster_2': 10, + 'pan_cluster_2--pan_cluster_3': 10, + 'pan_cluster_3--pan_cluster_4': 10, + 'pan_cluster_4--pan_cluster_5': 10, + 'pan_cluster_5--pan_cluster_6': 10, + 'pan_cluster_1--pan_cluster_6': 10} + + core_graph = consesus_core_genome.construct_core_graph(core_neighbour_pairs) + + # Get edge weights: + edge_weights = [core_graph.get_edge_data(edge[0], edge[1])['weight'] for edge in list(core_graph.edges)] + + # Assert outputs + self.assertEqual(expected_edges, list(core_graph.edges)) + self.assertEqual(expected_degrees, list(core_graph.degree)) + self.assertEqual(expected_edge_weights, edge_weights) + + def test_core_gene_graph_construction_circle_case_with_single_break(self): + expected_edges = [('pan_cluster_1', 'pan_cluster_2'), ('pan_cluster_1', 'pan_cluster_6'), ('pan_cluster_2', 'pan_cluster_3'), + ('pan_cluster_3', 'pan_cluster_4'), ('pan_cluster_4', 'pan_cluster_5'), ('pan_cluster_5', 'pan_cluster_6')] + + expected_degrees = [('pan_cluster_1', 2), ('pan_cluster_2', 2), ('pan_cluster_3', 2), ('pan_cluster_4', 2), ('pan_cluster_5', 2), ('pan_cluster_6', 2)] + + expected_edge_weights = [9, 10, 9, 10, 10, 10] + + core_neighbour_pairs = {'pan_cluster_1--pan_cluster_2': 9, + 'pan_cluster_1--Sequence_break': 1, + 'Sequence_break--pan_cluster_2': 1, + 'pan_cluster_2--pan_cluster_3': 9, + 'pan_cluster_3--pan_cluster_4': 10, + 'pan_cluster_4--pan_cluster_5': 10, + 'pan_cluster_5--pan_cluster_6': 10, + 
'pan_cluster_1--pan_cluster_6': 10} + + core_graph = consesus_core_genome.construct_core_graph(core_neighbour_pairs) + + # Get edge weights: + edge_weights = [core_graph.get_edge_data(edge[0], edge[1])['weight'] for edge in list(core_graph.edges)] + + # Assert outputs + self.assertEqual(expected_edges, list(core_graph.edges)) + self.assertEqual(expected_degrees, list(core_graph.degree)) + self.assertEqual(expected_edge_weights, edge_weights) + + def test_core_gene_graph_construction_three_degree_case(self): + expected_edges = [('pan_cluster_1', 'pan_cluster_2'), ('pan_cluster_1', 'pan_cluster_6'), ('pan_cluster_1', 'pan_cluster_4'), ('pan_cluster_2', 'pan_cluster_3'), + ('pan_cluster_3', 'pan_cluster_4'), ('pan_cluster_4', 'pan_cluster_5'), ('pan_cluster_5', 'pan_cluster_6')] + + expected_degrees = [('pan_cluster_1', 3), ('pan_cluster_2', 2), ('pan_cluster_3', 2), ('pan_cluster_4', 3), ('pan_cluster_5', 2), ('pan_cluster_6', 2)] + + expected_edge_weights = [10, 8, 2, 10, 8, 10, 10] + + core_neighbour_pairs = {'pan_cluster_1--pan_cluster_2': 10, + 'pan_cluster_2--pan_cluster_3': 10, + 'pan_cluster_3--pan_cluster_4': 8, + 'pan_cluster_4--pan_cluster_5': 10, + 'pan_cluster_5--pan_cluster_6': 10, + 'pan_cluster_1--pan_cluster_6': 8, + 'pan_cluster_1--pan_cluster_4': 2} + + core_graph = consesus_core_genome.construct_core_graph(core_neighbour_pairs) + + # Get edge weights: + edge_weights = [core_graph.get_edge_data(edge[0], edge[1])['weight'] for edge in list(core_graph.edges)] + + # Assert outputs + self.assertEqual(expected_edges, list(core_graph.edges)) + self.assertEqual(expected_degrees, list(core_graph.degree)) + self.assertEqual(expected_edge_weights, edge_weights) + + def test_core_gene_graph_construction_three_degree_n_sequence_breaks_case(self): + expected_edges = [('pan_cluster_1', 'pan_cluster_2'), ('pan_cluster_1', 'pan_cluster_6'), ('pan_cluster_1', 'pan_cluster_4'), ('pan_cluster_2', 'pan_cluster_3'), + ('pan_cluster_3', 'pan_cluster_4'), 
('pan_cluster_4', 'pan_cluster_5'), ('pan_cluster_5', 'pan_cluster_6')] + + expected_degrees = [('pan_cluster_1', 3), ('pan_cluster_2', 2), ('pan_cluster_3', 2), ('pan_cluster_4', 3), ('pan_cluster_5', 2), ('pan_cluster_6', 2)] + + expected_edge_weights = [10, 8, 2, 10, 8, 10, 10] + + core_neighbour_pairs = {'pan_cluster_1--pan_cluster_2': 10, + 'pan_cluster_2--pan_cluster_3': 10, + 'pan_cluster_3--pan_cluster_4': 8, + 'pan_cluster_4--pan_cluster_5': 10, + 'pan_cluster_5--pan_cluster_6': 10, + 'pan_cluster_1--pan_cluster_6': 8, + 'pan_cluster_1--pan_cluster_4': 2, + 'pan_cluster_3--Sequence_break': 2, + 'pan_cluster_6--Sequence_break': 2} + + core_graph = consesus_core_genome.construct_core_graph(core_neighbour_pairs) + + # Get edge weights: + edge_weights = [core_graph.get_edge_data(edge[0], edge[1])['weight'] for edge in list(core_graph.edges)] + + # Assert outputs + self.assertEqual(expected_edges, list(core_graph.edges)) + self.assertEqual(expected_degrees, list(core_graph.degree)) + self.assertEqual(expected_edge_weights, edge_weights) + + +class TestGeneCoOccurrence(unittest.TestCase): + """ + Test function that identifies the number of genomes in which two core genes co-occur. 
+ """ + def test_gene_co_occurrence(self): + core_gene_dict = {'Silas_the_Salmonella': {'Silas_the_Salmonella_tag-1-1': "A", + 'Silas_the_Salmonella_tag-1-2.1': "B", + 'Silas_the_Salmonella_tag-1-2.2': "B"}, + 'Christina_the_Streptococcus': {'Christina_the_Streptococcus_tag-2-1': "A", + 'Christina_the_Streptococcus_tag-2-2': "B"}, + 'Ajwa_the_Shigella': {'Ajwa_the_Shigella_tag-3-1': "A", + 'Ajwa_the_Shigella_tag-3-2': "B"}, + 'Ajwa_the_Legionella': {'Ajwa_the_Legionella_tag-4-1': "A", + 'Ajwa_the_Legionella_tag-4-2': "B"}, + 'Cari_the_Listeria': {'Cari_the_Listeria_tag-5-1': "A", + 'Cari_the_Listeria_tag-5-2': "B"}, + 'Aman_the_Streptococcus': {'Aman_the_Streptococcus_tag-6-1': "A", + 'Aman_the_Streptococcus_tag-6-2': "B"}, + 'Zion_the_Streptococcus': {'Zion_the_Streptococcus_tag-7-1': "A", + 'Zion_the_Streptococcus_tag-7-2': "B"}, + 'Dina_the_Shigella': {'Dina_the_Shigella_tag-8-1': "A", + 'Dina_the_Shigella_tag-8-2': "B"}, + 'Silas_the_Legionella': {'Silas_the_Legionella_tag-9-1': "A", + 'Silas_the_Legionella_tag-9-2': "B"}, + 'Lilly_the_Shigella': {'Lilly_the_Shigella_tag-10-1': "A", + 'Lilly_the_Shigella_tag-10-2': "B"}} + + segment = ["B", "A"] + + expected_value = 10 + + return_value = consesus_core_genome.gene_co_occurrence(core_gene_dict, segment) + + self.assertEqual(expected_value, return_value) + + def test_gene_co_occurrence_no_occurence(self): + core_gene_dict = {'Silas_the_Salmonella': {'Silas_the_Salmonella_tag-1-2.1': "B", + 'Silas_the_Salmonella_tag-1-2.2': "B"}, + 'Christina_the_Streptococcus': {'Christina_the_Streptococcus_tag-2-1': "A"}, + 'Ajwa_the_Shigella': {'Ajwa_the_Shigella_tag-3-2': "B"}, + 'Ajwa_the_Legionella': {'Ajwa_the_Legionella_tag-4-2': "B"}, + 'Cari_the_Listeria': {'Cari_the_Listeria_tag-5-1': "A"}, + 'Aman_the_Streptococcus': {'Aman_the_Streptococcus_tag-6-2': "B"}, + 'Zion_the_Streptococcus': {'Zion_the_Streptococcus_tag-7-1': "A"}, + 'Dina_the_Shigella': {'Dina_the_Shigella_tag-8-1': "A"}, + 'Silas_the_Legionella': 
{'Silas_the_Legionella_tag-9-2': "B"}, + 'Lilly_the_Shigella': {'Lilly_the_Shigella_tag-10-1': "A"}} + + segment = ["B", "A"] + + expected_value = 0 + + return_value = consesus_core_genome.gene_co_occurrence(core_gene_dict, segment) + + self.assertEqual(expected_value, return_value) + + +class TestSegmentationIdentification(unittest.TestCase): + """ + Test the function that identifies core gene segments from a pan-genome. + """ + def test_double_edge_segment_identification_all_2_degree_input(self): + core_neighbour_pairs = {'pan_cluster_1--pan_cluster_2': 10, + 'pan_cluster_2--pan_cluster_3': 10, + 'pan_cluster_3--pan_cluster_4': 10, + 'pan_cluster_4--pan_cluster_5': 10, + 'pan_cluster_5--pan_cluster_6': 10, + 'pan_cluster_6--pan_cluster_1': 10} + + core_graph = consesus_core_genome.construct_core_graph(core_neighbour_pairs) + + return_1, return_2, return_3 = consesus_core_genome.identify_segments(core_graph, 10, {}) + + self.assertEqual(None, return_1) + self.assertEqual(None, return_2) + self.assertEqual(None, return_3) + + def test_double_edge_segment_identification_two_segments(self): + expected_segments = {'pan_cluster_1--pan_cluster_5': ['pan_cluster_1', 'pan_cluster_6', 'pan_cluster_5'], 'pan_cluster_2--pan_cluster_4': ['pan_cluster_2', 'pan_cluster_3', 'pan_cluster_4']} + + core_neighbour_pairs = {'pan_cluster_1--pan_cluster_2': 9, + 'pan_cluster_1--pan_cluster_4': 1, + 'pan_cluster_2--pan_cluster_3': 10, + 'pan_cluster_3--pan_cluster_4': 10, + 'pan_cluster_2--pan_cluster_5': 1, + 'pan_cluster_4--pan_cluster_5': 9, + 'pan_cluster_5--pan_cluster_6': 10, + 'pan_cluster_6--pan_cluster_1': 10} + + core_gene_dict = {'genome_1': {'tag_1': 'pan_cluster_1', 'tag_2': 'pan_cluster_4', 'tag_3': 'pan_cluster_2', 'tag_4': 'pan_cluster_5'}, + 'genome_2': {'tag_1': 'pan_cluster_1', 'tag_2': 'pan_cluster_4', 'tag_3': 'pan_cluster_2', 'tag_4': 'pan_cluster_5'}, + 'genome_3': {'tag_1': 'pan_cluster_1', 'tag_2': 'pan_cluster_4', 'tag_3': 'pan_cluster_2', 'tag_4': 
'pan_cluster_5'}, + 'genome_4': {'tag_1': 'pan_cluster_1', 'tag_2': 'pan_cluster_4', 'tag_3': 'pan_cluster_2', 'tag_4': 'pan_cluster_5'}, + 'genome_5': {'tag_1': 'pan_cluster_1', 'tag_2': 'pan_cluster_4', 'tag_3': 'pan_cluster_2', 'tag_4': 'pan_cluster_5'}, + 'genome_6': {'tag_1': 'pan_cluster_1', 'tag_2': 'pan_cluster_4', 'tag_3': 'pan_cluster_2', 'tag_4': 'pan_cluster_5'}, + 'genome_7': {'tag_1': 'pan_cluster_1', 'tag_2': 'pan_cluster_4', 'tag_3': 'pan_cluster_2', 'tag_4': 'pan_cluster_5'}, + 'genome_8': {'tag_1': 'pan_cluster_1', 'tag_2': 'pan_cluster_4', 'tag_3': 'pan_cluster_2', 'tag_4': 'pan_cluster_5'}, + 'genome_9': {'tag_1': 'pan_cluster_1', 'tag_2': 'pan_cluster_4', 'tag_3': 'pan_cluster_2', 'tag_4': 'pan_cluster_5'}, + 'genome_10': {'tag_1': 'pan_cluster_1', 'tag_2': 'pan_cluster_4', 'tag_3': 'pan_cluster_2', 'tag_4': 'pan_cluster_5'},} + + core_graph = consesus_core_genome.construct_core_graph(core_neighbour_pairs) + + double_edge_segements = consesus_core_genome.identify_segments(core_graph, 10, core_gene_dict) + + self.assertEqual(expected_segments, double_edge_segements) + + def test_double_edge_segment_identification_four_segments(self): + expected_segments = {'pan_cluster_1--pan_cluster_3': ['pan_cluster_1', 'pan_cluster_2', 'pan_cluster_3'], + 'pan_cluster_1--pan_cluster_9': ['pan_cluster_1', 'pan_cluster_10', 'pan_cluster_9'], + 'pan_cluster_3--pan_cluster_6': ['pan_cluster_6', 'pan_cluster_5', 'pan_cluster_4', 'pan_cluster_3'], + 'pan_cluster_6--pan_cluster_9': ['pan_cluster_6', 'pan_cluster_7', 'pan_cluster_8', 'pan_cluster_9']} + + core_neighbour_pairs = {'pan_cluster_1--pan_cluster_2': 9, + 'pan_cluster_1--pan_cluster_6': 1, + 'pan_cluster_2--pan_cluster_3': 10, + 'pan_cluster_3--pan_cluster_4': 9, + 'pan_cluster_3--pan_cluster_9': 1, + 'pan_cluster_4--pan_cluster_5': 10, + 'pan_cluster_5--pan_cluster_6': 10, + 'pan_cluster_6--pan_cluster_7': 10, + 'pan_cluster_7--pan_cluster_8': 10, + 'pan_cluster_8--pan_cluster_9': 9, + 
'pan_cluster_9--pan_cluster_10': 10, + 'pan_cluster_1--pan_cluster_10': 10} + + core_graph = consesus_core_genome.construct_core_graph(core_neighbour_pairs) + + double_edge_segements = consesus_core_genome.identify_segments(core_graph, 10, {}) + + self.assertEqual(expected_segments, double_edge_segements) + + def test_double_edge_segment_identification_segments_node_w_four_degrees(self): + # expected_segments = {'pan_cluster_4--pan_cluster_6': ['pan_cluster_4', 'pan_cluster_5', 'pan_cluster_6']} + expected_segments = {'pan_cluster_2--pan_cluster_4': ['pan_cluster_2', + 'pan_cluster_3', + 'pan_cluster_4'], + 'pan_cluster_2--pan_cluster_6': ['pan_cluster_2', + 'pan_cluster_1', + 'pan_cluster_6'], + 'pan_cluster_4--pan_cluster_6': ['pan_cluster_4', + 'pan_cluster_5', + 'pan_cluster_6']} + + core_neighbour_pairs = {'pan_cluster_1--pan_cluster_2': 9, + 'pan_cluster_2--pan_cluster_3': 9, + 'pan_cluster_2--pan_cluster_4': 1, + 'pan_cluster_2--pan_cluster_6': 1, + 'pan_cluster_3--pan_cluster_4': 9, + 'pan_cluster_4--pan_cluster_5': 10, + 'pan_cluster_5--pan_cluster_6': 10, + 'pan_cluster_6--pan_cluster_1': 9} + + core_graph = consesus_core_genome.construct_core_graph(core_neighbour_pairs) + double_edge_segements = consesus_core_genome.identify_segments(core_graph, 10, {}) + + self.assertEqual(expected_segments, double_edge_segements) + + def test_double_edge_segment_identification_segments_node_w_challenging_paths(self): + expected_segments = {'pan_cluster_A--pan_cluster_B': ['pan_cluster_A', 'pan_cluster_E', 'pan_cluster_F', 'pan_cluster_G', 'pan_cluster_B']} + + core_neighbour_pairs = {'pan_cluster_A--pan_cluster_C': 4, + 'pan_cluster_A--pan_cluster_D': 4, + 'pan_cluster_A--pan_cluster_E': 2, + 'pan_cluster_B--pan_cluster_C': 5, + 'pan_cluster_B--pan_cluster_D': 3, + 'pan_cluster_B--pan_cluster_G': 2, + 'pan_cluster_C--pan_cluster_D': 1, + 'pan_cluster_E--pan_cluster_F': 2, + 'pan_cluster_F--pan_cluster_G': 2, + } + core_gene_dict = {'genome_1': {'tag_4': 
'pan_cluster_D', 'tag_3': 'pan_cluster_C', 'tag_2': 'pan_cluster_B', 'tag_1': 'pan_cluster_A', }, + 'genome_2': {'tag_4': 'pan_cluster_D', 'tag_3': 'pan_cluster_C', 'tag_2': 'pan_cluster_B', 'tag_1': 'pan_cluster_A', }, + 'genome_3': {'tag_4': 'pan_cluster_D', 'tag_3': 'pan_cluster_C', 'tag_2': 'pan_cluster_B', 'tag_1': 'pan_cluster_A', }, + 'genome_4': {'tag_4': 'pan_cluster_D', 'tag_3': 'pan_cluster_C', 'tag_2': 'pan_cluster_B', 'tag_1': 'pan_cluster_A', }, + 'genome_5': {'tag_4': 'pan_cluster_D', 'tag_3': 'pan_cluster_C', 'tag_2': 'pan_cluster_B', 'tag_1': 'pan_cluster_A', }} + + core_graph = consesus_core_genome.construct_core_graph(core_neighbour_pairs) + double_edge_segements = consesus_core_genome.identify_segments(core_graph, 5, core_gene_dict) + + self.assertEqual(expected_segments, double_edge_segements)#TODO + + def test_double_edge_segment_identification_segments_node_w_challenging_paths_2(self): + expected_segments = {'pan_cluster_A--pan_cluster_B': ['pan_cluster_A', 'pan_cluster_F', 'pan_cluster_B'], + 'pan_cluster_B--pan_cluster_C': ['pan_cluster_B', 'pan_cluster_I', 'pan_cluster_C']} + + core_neighbour_pairs = {'pan_cluster_A--pan_cluster_D': 2, + 'pan_cluster_A--pan_cluster_E': 1, + 'pan_cluster_A--pan_cluster_F': 7, + 'pan_cluster_B--pan_cluster_F': 7, + 'pan_cluster_B--pan_cluster_I': 8, + 'pan_cluster_B--pan_cluster_D': 1, + 'pan_cluster_C--pan_cluster_E': 1, + 'pan_cluster_C--pan_cluster_D': 1, + 'pan_cluster_C--pan_cluster_I': 8, + 'pan_cluster_D--pan_cluster_E': 1 + } + core_gene_dict = {'genome_1': {'tag_5': 'pan_cluster_E', 'tag_4': 'pan_cluster_D', 'tag_3': 'pan_cluster_C', 'tag_2': 'pan_cluster_B', 'tag_1': 'pan_cluster_A', }, + 'genome_2': {'tag_5': 'pan_cluster_E', 'tag_4': 'pan_cluster_D', 'tag_3': 'pan_cluster_C', 'tag_2': 'pan_cluster_B', 'tag_1': 'pan_cluster_A', }, + 'genome_3': {'tag_5': 'pan_cluster_E', 'tag_4': 'pan_cluster_D', 'tag_3': 'pan_cluster_C', 'tag_2': 'pan_cluster_B', 'tag_1': 'pan_cluster_A', }, + 'genome_4': 
{'tag_5': 'pan_cluster_E', 'tag_4': 'pan_cluster_D', 'tag_3': 'pan_cluster_C', 'tag_2': 'pan_cluster_B', 'tag_1': 'pan_cluster_A', }, + 'genome_5': {'tag_5': 'pan_cluster_E', 'tag_4': 'pan_cluster_D', 'tag_3': 'pan_cluster_C', 'tag_2': 'pan_cluster_B', 'tag_1': 'pan_cluster_A', }, + 'genome_6': {'tag_5': 'pan_cluster_E', 'tag_4': 'pan_cluster_D', 'tag_3': 'pan_cluster_C', 'tag_2': 'pan_cluster_B', 'tag_1': 'pan_cluster_A', }, + 'genome_7': {'tag_5': 'pan_cluster_E', 'tag_4': 'pan_cluster_D', 'tag_3': 'pan_cluster_C', 'tag_2': 'pan_cluster_B', 'tag_1': 'pan_cluster_A', }, + 'genome_8': {'tag_5': 'pan_cluster_E', 'tag_4': 'pan_cluster_D', 'tag_3': 'pan_cluster_C', 'tag_2': 'pan_cluster_B', 'tag_1': 'pan_cluster_A', }} + + core_graph = consesus_core_genome.construct_core_graph(core_neighbour_pairs) + double_edge_segements = consesus_core_genome.identify_segments(core_graph, 8, core_gene_dict) + + self.assertEqual(expected_segments, double_edge_segements) + + def test_double_edge_segment_identification_segments_node_w_all_challenging_paths(self): + expected_segments = {'pan_cluster_A--pan_cluster_D': ['pan_cluster_A', 'pan_cluster_G', 'pan_cluster_F', 'pan_cluster_E', 'pan_cluster_D'], + 'pan_cluster_B--pan_cluster_C': ['pan_cluster_B', 'pan_cluster_H', 'pan_cluster_I', 'pan_cluster_J', 'pan_cluster_C']}#,} + + core_neighbour_pairs = {'pan_cluster_A--pan_cluster_B': 4, + 'pan_cluster_A--pan_cluster_K': 4, + 'pan_cluster_A--pan_cluster_G': 2, + 'pan_cluster_B--pan_cluster_H': 2, + 'pan_cluster_B--pan_cluster_L': 4, + 'pan_cluster_C--pan_cluster_J': 2, + 'pan_cluster_C--pan_cluster_K': 4, + 'pan_cluster_D--pan_cluster_C': 4, + 'pan_cluster_D--pan_cluster_L': 4, + 'pan_cluster_D--pan_cluster_E': 2, + 'pan_cluster_E--pan_cluster_F': 2, + 'pan_cluster_F--pan_cluster_G': 2, + 'pan_cluster_H--pan_cluster_I': 2, + 'pan_cluster_I--pan_cluster_J': 2, + 'pan_cluster_K--pan_cluster_L': 1 + } + core_gene_dict = {'genome_1': {'tag_5': 'pan_cluster_K', 'tag_4': 'pan_cluster_L', 
'tag_3': 'pan_cluster_A', 'tag_2': 'pan_cluster_B', 'tag_1': 'pan_cluster_A', 'tag_6': 'pan_cluster_C', 'tag_7': 'pan_cluster_D'}, + 'genome_2': {'tag_5': 'pan_cluster_K', 'tag_4': 'pan_cluster_L', 'tag_3': 'pan_cluster_A', 'tag_2': 'pan_cluster_B', 'tag_1': 'pan_cluster_A', 'tag_6': 'pan_cluster_C', 'tag_7': 'pan_cluster_D'}, + 'genome_3': {'tag_5': 'pan_cluster_K', 'tag_4': 'pan_cluster_L', 'tag_3': 'pan_cluster_A', 'tag_2': 'pan_cluster_B', 'tag_1': 'pan_cluster_A', 'tag_6': 'pan_cluster_C', 'tag_7': 'pan_cluster_D'}, + 'genome_4': {'tag_5': 'pan_cluster_K', 'tag_4': 'pan_cluster_L', 'tag_3': 'pan_cluster_A', 'tag_2': 'pan_cluster_B', 'tag_1': 'pan_cluster_A', 'tag_6': 'pan_cluster_C', 'tag_7': 'pan_cluster_D'}, + 'genome_5': {'tag_5': 'pan_cluster_K', 'tag_4': 'pan_cluster_L', 'tag_3': 'pan_cluster_A', 'tag_2': 'pan_cluster_B', 'tag_1': 'pan_cluster_A', 'tag_6': 'pan_cluster_C', 'tag_7': 'pan_cluster_D'}} + + core_graph = consesus_core_genome.construct_core_graph(core_neighbour_pairs) + double_edge_segements = consesus_core_genome.identify_segments(core_graph, 5, core_gene_dict) + + self.assertEqual(expected_segments, double_edge_segements) + + def test_double_edge_segment_identification_segments_node_w_less_than_all_present(self): + expected_segments = {'pan_cluster_B--pan_cluster_D': ['pan_cluster_B', 'pan_cluster_C', 'pan_cluster_D'], + 'pan_cluster_F--pan_cluster_H': ['pan_cluster_H', 'pan_cluster_G', 'pan_cluster_F'], + 'pan_cluster_B--pan_cluster_H': ['pan_cluster_B', 'pan_cluster_A', 'pan_cluster_H'], + 'pan_cluster_D--pan_cluster_F': ['pan_cluster_D', 'pan_cluster_E', 'pan_cluster_F']} + + core_neighbour_pairs = {'pan_cluster_A--pan_cluster_B': 9, + 'pan_cluster_A--pan_cluster_H': 9, + 'pan_cluster_B--pan_cluster_H': 1, + 'pan_cluster_B--pan_cluster_C': 10, + 'pan_cluster_C--pan_cluster_D': 10, + 'pan_cluster_D--pan_cluster_E': 9, + 'pan_cluster_D--pan_cluster_F': 1, + 'pan_cluster_E--pan_cluster_F': 9, + 'pan_cluster_F--pan_cluster_G': 10, + 
'pan_cluster_G--pan_cluster_H': 10, + 'pan_cluster_H--pan_cluster_A': 9, + } + + core_graph = consesus_core_genome.construct_core_graph(core_neighbour_pairs) + double_edge_segements = consesus_core_genome.identify_segments(core_graph, 10, {}) + + self.assertEqual(expected_segments, double_edge_segements) + + def test_double_edge_segment_identification_segments_node_w_two_gene_segment(self): # TODO - see TODO on line 791 in consensus_core_genome! + expected_segments = {'pan_cluster_A--pan_cluster_B': ['pan_cluster_A', 'pan_cluster_B'], + 'pan_cluster_A--pan_cluster_G': ['pan_cluster_A', 'pan_cluster_I', 'pan_cluster_H', 'pan_cluster_G'], + 'pan_cluster_B--pan_cluster_E': ['pan_cluster_B', 'pan_cluster_C', 'pan_cluster_D', 'pan_cluster_E'], + 'pan_cluster_E--pan_cluster_G': ['pan_cluster_G', 'pan_cluster_F', 'pan_cluster_E']} + + core_neighbour_pairs = {'pan_cluster_A--pan_cluster_B': 3, + 'pan_cluster_A--pan_cluster_I': 2, + 'pan_cluster_A--pan_cluster_G': 1, + 'pan_cluster_B--pan_cluster_C': 2, + 'pan_cluster_B--pan_cluster_E': 1, + 'pan_cluster_C--pan_cluster_D': 3, + 'pan_cluster_D--pan_cluster_E': 3, + 'pan_cluster_E--pan_cluster_F': 2, + 'pan_cluster_F--pan_cluster_G': 2, + 'pan_cluster_G--pan_cluster_H': 3, + 'pan_cluster_H--pan_cluster_I': 3 + } + core_gene_dict = {'genome_1': {'gene_1': 'pan_cluster_A', 'gene_2': 'pan_cluster_B', 'gene_3': 'pan_cluster_E', 'gene_4': 'pan_cluster_G', 'gene_5': 'pan_cluster_D', 'gene_7': 'pan_cluster_H'}, + 'genome_2': {'gene_1': 'pan_cluster_A', 'gene_2': 'pan_cluster_B', 'gene_3': 'pan_cluster_E', 'gene_4': 'pan_cluster_G', 'gene_5': 'pan_cluster_D', 'gene_7': 'pan_cluster_H'}, + 'genome_3': {'gene_1': 'pan_cluster_A', 'gene_2': 'pan_cluster_B', 'gene_3': 'pan_cluster_E', 'gene_4': 'pan_cluster_G', 'gene_5': 'pan_cluster_D', 'gene_7': 'pan_cluster_H'}} + + core_graph = consesus_core_genome.construct_core_graph(core_neighbour_pairs) + double_edge_segements = consesus_core_genome.identify_segments(core_graph, 3, 
core_gene_dict) + + self.assertEqual(expected_segments, double_edge_segements) + + # TODO - Chat to Andrew about this function how it works and how we can test it more - possibly just run some things to see if it breaks + + +class TestNoAccessorySegmentIdentifcation(unittest.TestCase): + """ + Test the function that takes in segments of core genes and divide them into sub-segments based on the accessory content between core genes in segment. + """ + def test_no_accessory_genes_in_segment(self): + expected_sub_sgments = {'pan_cluster_1--pan_cluster_5': [['pan_cluster_1', 'pan_cluster_6', 'pan_cluster_5']], + 'pan_cluster_2--pan_cluster_4': [['pan_cluster_2', 'pan_cluster_3', 'pan_cluster_4']]} + + double_edge_segements = {'pan_cluster_1--pan_cluster_5': ['pan_cluster_1', 'pan_cluster_6', 'pan_cluster_5'], + 'pan_cluster_2--pan_cluster_4': ['pan_cluster_2', 'pan_cluster_3', 'pan_cluster_4']} + combined_acc_gene_count = {'pan_cluster_1--pan_cluster_6': 0, 'pan_cluster_5--pan_cluster_6': 0, 'pan_cluster_2--pan_cluster_3': 0, 'pan_cluster_3--pan_cluster_4': 0} + + sub_segment_dict = consesus_core_genome.identify_no_accessory_segments(double_edge_segements, combined_acc_gene_count) + + self.assertEqual(sub_segment_dict, expected_sub_sgments) + + def test_accessory_genes_in_segment_first_gene_lonely(self): + expected_sub_sgments = {'pan_cluster_1--pan_cluster_5': [['pan_cluster_1'], ['pan_cluster_6', 'pan_cluster_5']]} + + double_edge_segements = {'pan_cluster_1--pan_cluster_5': ['pan_cluster_1', 'pan_cluster_6', 'pan_cluster_5']} + combined_acc_gene_count = {'pan_cluster_1--pan_cluster_6': 1, 'pan_cluster_5--pan_cluster_6': 0} + + sub_segment_dict = consesus_core_genome.identify_no_accessory_segments(double_edge_segements, combined_acc_gene_count) + + self.assertEqual(sub_segment_dict, expected_sub_sgments) + + def test_accessory_genes_in_segment_last_gene_lonely(self): + expected_sub_sgments = {'pan_cluster_1--pan_cluster_5': [['pan_cluster_1', 'pan_cluster_6'], 
['pan_cluster_5']], + 'pan_cluster_2--pan_cluster_4': [['pan_cluster_2', 'pan_cluster_3', 'pan_cluster_4']]} + + double_edge_segements = {'pan_cluster_1--pan_cluster_5': ['pan_cluster_1', 'pan_cluster_6', 'pan_cluster_5'], + 'pan_cluster_2--pan_cluster_4': ['pan_cluster_2', 'pan_cluster_3', 'pan_cluster_4']} + combined_acc_gene_count = {'pan_cluster_1--pan_cluster_6': 0, 'pan_cluster_5--pan_cluster_6': 1, 'pan_cluster_2--pan_cluster_3': 0, 'pan_cluster_3--pan_cluster_4': 0} + + sub_segment_dict = consesus_core_genome.identify_no_accessory_segments(double_edge_segements, combined_acc_gene_count) + + self.assertEqual(sub_segment_dict, expected_sub_sgments) + + def test_accessory_genes_in_segment_middle(self): + expected_sub_sgments = {'pan_cluster_1--pan_cluster_4': [['pan_cluster_1', 'pan_cluster_2'], ['pan_cluster_3', 'pan_cluster_4']]} + + double_edge_segements = {'pan_cluster_1--pan_cluster_4': ['pan_cluster_1', 'pan_cluster_2', 'pan_cluster_3', 'pan_cluster_4']} + combined_acc_gene_count = {'pan_cluster_1--pan_cluster_2': 0, 'pan_cluster_2--pan_cluster_3': 1, 'pan_cluster_3--pan_cluster_4': 0} + + sub_segment_dict = consesus_core_genome.identify_no_accessory_segments(double_edge_segements, combined_acc_gene_count) + + self.assertEqual(sub_segment_dict, expected_sub_sgments) + + def test_accessory_genes_in_multiple_places(self): + expected_sub_sgments = {'pan_cluster_1--pan_cluster_4': [['pan_cluster_1'], ['pan_cluster_2', 'pan_cluster_3'], ['pan_cluster_4']]} + + double_edge_segements = {'pan_cluster_1--pan_cluster_4': ['pan_cluster_1', 'pan_cluster_2', 'pan_cluster_3', 'pan_cluster_4']} + combined_acc_gene_count = {'pan_cluster_1--pan_cluster_2': 1, 'pan_cluster_2--pan_cluster_3': 0, 'pan_cluster_3--pan_cluster_4': 1} + + sub_segment_dict = consesus_core_genome.identify_no_accessory_segments(double_edge_segements, combined_acc_gene_count) + + self.assertEqual(sub_segment_dict, expected_sub_sgments) + if __name__ == '__main__': unittest.main() From 
0822f8c9838b40cf161b816d417a65bfc3048c9a Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Tue, 4 Jan 2022 16:12:39 +1100 Subject: [PATCH 017/135] Add in an adjustment for when segments are added between two multi connected core genes, as this messed with the prediction of missing segments --- Corekaburra/consesus_core_genome.py | 14 +++++++++++--- unit_tests/Corekaburra_test.py | 6 ++++-- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/Corekaburra/consesus_core_genome.py b/Corekaburra/consesus_core_genome.py index ba293f2..b3d4ef3 100644 --- a/Corekaburra/consesus_core_genome.py +++ b/Corekaburra/consesus_core_genome.py @@ -153,12 +153,13 @@ def identify_segments(core_graph, num_gffs, core_gene_dict): # find all simple paths between nodes with >2 degrees double_edge_segements = {} + multi_edge_connect_adjust = [] # Go through all source and taget nodes, # see if a path can be found where all nodes between them have only two degrees for source_node in multi_edge_nodes: for target_node in multi_edge_nodes: - if target_node != source_node: #and target_node not in connect_dict[source_node]: + if target_node != source_node: # Get path (segment) from source to target segment = nx.shortest_path(core_graph, source_node, target_node, weight='weight', method='dijkstra') # bellman-ford or dijkstra @@ -175,6 +176,8 @@ def identify_segments(core_graph, num_gffs, core_gene_dict): if segment_length == 2: if num_gffs - core_graph[segment[0]][segment[1]]['weight'] < gene_co_occurrence(core_gene_dict, segment): continue + else: + if all([x != segment[::-1] for x in multi_edge_connect_adjust]): multi_edge_connect_adjust.append(segment) # Construct name for path source_target_name = sorted([source_node, target_node]) @@ -190,7 +193,7 @@ def identify_segments(core_graph, num_gffs, core_gene_dict): # Calculate the expected number of paths total_edges_from_multi_edge_nodes = sum([connections for _, connections in core_graph.degree if 
connections > 2]) num_edges_between_multi_edge_nodes = sum([len(connect_dict[key]) for key in connect_dict]) - expected_segment_number = int((total_edges_from_multi_edge_nodes / 2) - (num_edges_between_multi_edge_nodes / 2)) + expected_segment_number = int((total_edges_from_multi_edge_nodes / 2) - (num_edges_between_multi_edge_nodes / 2)) + len(multi_edge_connect_adjust) # Check if less than the number of expected paths has been found, # if then try to identify missing paths @@ -207,6 +210,11 @@ def identify_segments(core_graph, num_gffs, core_gene_dict): for node in connection_nodes: identified_edge_num_dict[node] += 1 + # Adjust for the number of segments identified between multi connected nodes + for segment in multi_edge_connect_adjust: + for node in segment: + identified_edge_num_dict[node] -= 1 + # Compare the number of connections expected to the number identified, to find nodes that are miss connections nodes_missing_connections = [] for node in expected_edge_num_dict: @@ -252,7 +260,7 @@ def identify_segments(core_graph, num_gffs, core_gene_dict): if segment_length - 2 == two_degree_segment_length and two_degree_segment_length != 0: # TODO - should this != 0 be here? 
double_edge_segements[suspected_pair] = path path_identified = True - continue + pass # TODO - is this correct else: # Check if path is length >2, # if then find >2 degree nodes and remove an edge to them, diff --git a/unit_tests/Corekaburra_test.py b/unit_tests/Corekaburra_test.py index 7459b1e..876eda2 100644 --- a/unit_tests/Corekaburra_test.py +++ b/unit_tests/Corekaburra_test.py @@ -2927,7 +2927,8 @@ def test_double_edge_segment_identification_segments_node_w_four_degrees(self): self.assertEqual(expected_segments, double_edge_segements) def test_double_edge_segment_identification_segments_node_w_challenging_paths(self): - expected_segments = {'pan_cluster_A--pan_cluster_B': ['pan_cluster_A', 'pan_cluster_E', 'pan_cluster_F', 'pan_cluster_G', 'pan_cluster_B']} + expected_segments = {'pan_cluster_A--pan_cluster_B': ['pan_cluster_A', 'pan_cluster_E', 'pan_cluster_F', 'pan_cluster_G', 'pan_cluster_B'], + 'pan_cluster_B--pan_cluster_C': ['pan_cluster_C', 'pan_cluster_B']} core_neighbour_pairs = {'pan_cluster_A--pan_cluster_C': 4, 'pan_cluster_A--pan_cluster_D': 4, @@ -2948,7 +2949,7 @@ def test_double_edge_segment_identification_segments_node_w_challenging_paths(se core_graph = consesus_core_genome.construct_core_graph(core_neighbour_pairs) double_edge_segements = consesus_core_genome.identify_segments(core_graph, 5, core_gene_dict) - self.assertEqual(expected_segments, double_edge_segements)#TODO + self.assertEqual(expected_segments, double_edge_segements) # TODO def test_double_edge_segment_identification_segments_node_w_challenging_paths_2(self): expected_segments = {'pan_cluster_A--pan_cluster_B': ['pan_cluster_A', 'pan_cluster_F', 'pan_cluster_B'], @@ -3122,5 +3123,6 @@ def test_accessory_genes_in_multiple_places(self): self.assertEqual(sub_segment_dict, expected_sub_sgments) + if __name__ == '__main__': unittest.main() From 4fcf553322681960a70ad7a80a7f10d5c8ff2d88 Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Tue, 4 
Jan 2022 16:20:59 +1100 Subject: [PATCH 018/135] Add in pylint exceptions that raise warnings for a networkx function call --- Corekaburra/consesus_core_genome.py | 1 + 1 file changed, 1 insertion(+) diff --git a/Corekaburra/consesus_core_genome.py b/Corekaburra/consesus_core_genome.py index b3d4ef3..ab03740 100644 --- a/Corekaburra/consesus_core_genome.py +++ b/Corekaburra/consesus_core_genome.py @@ -1,4 +1,5 @@ import networkx as nx +# pylint: disable=E1123, E1121 def construct_core_graph(core_neighbour_pairs): From ac36912cb35c6676d5491d933b515c73aa4e5d32 Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Wed, 5 Jan 2022 10:57:51 +1100 Subject: [PATCH 019/135] Add in the function for summarising information around core pairs, add in a small change to count number of occurrences of core genes. Additionally add unit-tests for the summary table function --- Code_to_transfer/summary_table.py | 59 ------------- Corekaburra/__main__.py | 10 +++ Corekaburra/consesus_core_genome.py | 15 +++- .../output_writer_functions.py | 0 Corekaburra/summary_table.py | 72 ++++++++++++++++ setup.py | 4 +- unit_tests/Corekaburra_test.py | 86 ++++++++++++++++++- 7 files changed, 178 insertions(+), 68 deletions(-) delete mode 100644 Code_to_transfer/summary_table.py rename {Code_to_transfer => Corekaburra}/output_writer_functions.py (100%) create mode 100644 Corekaburra/summary_table.py diff --git a/Code_to_transfer/summary_table.py b/Code_to_transfer/summary_table.py deleted file mode 100644 index 3bf2486..0000000 --- a/Code_to_transfer/summary_table.py +++ /dev/null @@ -1,59 +0,0 @@ -import numpy as np - - -def calculate_n_create_summaries(master_info): # TODO - Add in columns that gives that difference in length and acc genes from smallest to biggest. 
- # Create dict to hold the information for each core pair with a key being the pair name and the value a 2 dimentional numpy array - summary_dict = {} - - # Group the values of length of accessory gene content for the core pairs into the summary dict - for pair in master_info.keys(): - core_pair = pair.rsplit('--', 1)[0] - - # Try to add the values for a given core pair to the summary dict, - # if not possible then add a two column array to they key and add the values - try: - summary_dict[core_pair].append(master_info[pair][3:5]) - except KeyError: - summary_dict[core_pair] = [master_info[pair][3:5]] - # summary_dict[core_pair] = np.append(summary_dict[core_pair], master_info[pair][3:5]) - - # Take the mean over length or accesory genes for all core-pairs - for core_pair in summary_dict: - # cast the list into a numpy array, format is to have lengths and accessory gene count as a column each - summary_dict[core_pair] = np.array(summary_dict[core_pair]) - summary_dict[core_pair] = summary_dict[core_pair].reshape((-1, 2)) - - # Calculate summary statistics - pair_occurrence = summary_dict[core_pair].shape[0] - min_values = np.min(summary_dict[core_pair], axis=0) - max_values = np.max(summary_dict[core_pair], axis=0) - mean_values = np.mean(summary_dict[core_pair], axis=0) - median_values = np.median(summary_dict[core_pair], axis=0) - - # Add the summary to the dict, round mean and median to one decimal - summary_dict[core_pair] = [core_pair, pair_occurrence, - min_values[0], max_values[0], - round(mean_values[0], 1), round(median_values[0], 1), - min_values[1], max_values[1], - round(mean_values[1], 1), round(median_values[1], 1) - ] - - return summary_dict - -if __name__ == '__main__': - master_info = { - 'pan_cluster_1--pan_cluster_2--genome_1': ['genome_1', 'pan_cluster_1', 'pan_cluster_2', 99, 3, - ['Acc_1', 'Acc_2'], ['low_1']], - 'pan_cluster_1--pan_cluster_2--genome_2': ['genome_2', 'pan_cluster_1', 'pan_cluster_2', 99, 3, - ['Acc_1', 'Acc_2'], ['low_1']], - 
'pan_cluster_1--pan_cluster_2--genome_3': ['genome_3', 'pan_cluster_1', 'pan_cluster_2', 99, 3, - ['Acc_1', 'Acc_2'], ['low_1']], - 'pan_cluster_2--pan_cluster_3--genome_1': ['genome_1', 'pan_cluster_2', 'pan_cluster_3', 100, 2, - ['Acc_1', 'Acc_2'], []], - 'pan_cluster_2--pan_cluster_3--genome_2': ['genome_2', 'pan_cluster_2', 'pan_cluster_3', 150, 1, - ['Acc_1', ], []], - 'pan_cluster_2--pan_cluster_3--genome_3': ['genome_3', 'pan_cluster_2', 'pan_cluster_3', 200, 0, - [], []] - } - - calculate_n_create_summaries(master_info) \ No newline at end of file diff --git a/Corekaburra/__main__.py b/Corekaburra/__main__.py index 3fba4e6..e2bef53 100644 --- a/Corekaburra/__main__.py +++ b/Corekaburra/__main__.py @@ -50,6 +50,12 @@ except ModuleNotFoundError: from consesus_core_genome import determine_genome_segments +try: + from Corekaburra.summary_table import calculate_n_create_summaries +except ModuleNotFoundError: + from summary_table import calculate_n_create_summaries + + from argparse import ArgumentParser from math import floor import sys @@ -243,5 +249,9 @@ def main(): # time_calculator(time_start, time.time(), "identifying segments in pan genome") + # Produce dict containing summarised information from master info. + master_summary_info = calculate_n_create_summaries(master_info_total) + + if __name__ == '__main__': main() diff --git a/Corekaburra/consesus_core_genome.py b/Corekaburra/consesus_core_genome.py index ab03740..8cf7c4a 100644 --- a/Corekaburra/consesus_core_genome.py +++ b/Corekaburra/consesus_core_genome.py @@ -26,7 +26,7 @@ def construct_core_graph(core_neighbour_pairs): return G -def gene_co_occurrence(core_gene_dict, two_gene_segment): +def count_gene_co_occurrence(core_gene_dict, two_gene_segment): """ Function to find the number of genomes in which two genes co-occur across the input genomes. 
:param core_gene_dict: Dictionary over core genes mapped from genome, to locus_tag, to pan-genome cluster @@ -34,16 +34,24 @@ def gene_co_occurrence(core_gene_dict, two_gene_segment): :return: Int - number of co-occurrences for the two genes in the input two_gene_segment """ co_occurrence = 0 + gene_occurrence = dict.fromkeys(two_gene_segment, 0) # Get pan-genome clusters for all genomes in a list of lists core_gene_presences = [list(core_genes.values()) for core_genes in core_gene_dict.values()] # Go through all genomes and check if genes co-occur for core_gene_set in core_gene_presences: + # count the co-occurrences if set(two_gene_segment).issubset(core_gene_set): co_occurrence += 1 - return co_occurrence + # Count the individual occurrences + if two_gene_segment[0] in core_gene_set: + gene_occurrence[two_gene_segment[0]] += 1 + if two_gene_segment[1] in core_gene_set: + gene_occurrence[two_gene_segment[1]] += 1 + + return co_occurrence, gene_occurrence def identify_no_accessory_segments(double_edge_segements, combined_acc_gene_count): @@ -175,7 +183,8 @@ def identify_segments(core_graph, num_gffs, core_gene_dict): if segment_length - 2 == two_degree_segment_length: # Check if two gene segment occur in every possible genome, if not then skip if segment_length == 2: - if num_gffs - core_graph[segment[0]][segment[1]]['weight'] < gene_co_occurrence(core_gene_dict, segment): + gene_co_occurrences, _ = count_gene_co_occurrence(core_gene_dict, segment) + if num_gffs - core_graph[segment[0]][segment[1]]['weight'] < gene_co_occurrences: continue else: if all([x != segment[::-1] for x in multi_edge_connect_adjust]): multi_edge_connect_adjust.append(segment) diff --git a/Code_to_transfer/output_writer_functions.py b/Corekaburra/output_writer_functions.py similarity index 100% rename from Code_to_transfer/output_writer_functions.py rename to Corekaburra/output_writer_functions.py diff --git a/Corekaburra/summary_table.py b/Corekaburra/summary_table.py new file mode 100644 
index 0000000..5c4b6d9 --- /dev/null +++ b/Corekaburra/summary_table.py @@ -0,0 +1,72 @@ +import numpy as np + +try: + from Corekaburra.consesus_core_genome import count_gene_co_occurrence +except ModuleNotFoundError: + from consesus_core_genome import count_gene_co_occurrence + + +def calculate_n_create_summaries(master_info, core_gene_dict): + """ + Function used to calculate summary statistics for core gene pairs + :param master_info: Dict over master info from core pairs identified from gff files + :param core_gene_dict: Dict mapping genes to genomes and pan-genome cluster (genes) + :return: Dict containing the summary statistics for each core pair. + """ + # initialize the summary dict to be returned + summary_dict = {} + + # Group the values of length of accessory gene content for the core pairs into the summary dict + for pair in master_info: + core_pair = pair.rsplit('--', 1)[0] + + # Try to add the values for a given core pair to the summary dict, + # if not possible then add a two column array to they key and add the values + try: + summary_dict[core_pair].append(master_info[pair][3:5]) + except KeyError: + summary_dict[core_pair] = [master_info[pair][3:5]] + + occurrence_dict = {key: {} for key in summary_dict} + # dict.fromkeys(summary_dict.keys(), {}) + for core_pair in summary_dict: + gene_list = core_pair.split('--') + # Calculate the occurrence and co-occurrence of core genes + + # Calculate gene occurrence and co-occurrence + occurrence_dict[core_pair]['co_occurrence'], individual_occurrences = count_gene_co_occurrence(core_gene_dict, gene_list) + + for gene in gene_list: + occurrence_dict[core_pair][gene] = individual_occurrences[gene] + + # Take the mean over length of accessory genes for all core-pairs + for core_pair in summary_dict: + gene_list = core_pair.split('--') + # cast the list into a numpy array, format it to have lengths and accessory gene count as a column each + summary_dict[core_pair] = np.array(summary_dict[core_pair]) + 
summary_dict[core_pair] = summary_dict[core_pair].reshape((-1, 2)) + + # Calculate summary statistics + pair_occurrence = summary_dict[core_pair].shape[0] + min_values = np.min(summary_dict[core_pair], axis=0) + max_values = np.max(summary_dict[core_pair], axis=0) + mean_values = np.mean(summary_dict[core_pair], axis=0) + median_values = np.median(summary_dict[core_pair], axis=0) + + # Add the summary to the dict, round mean and median to one decimal + summary_dict[core_pair] = [core_pair.replace('--', '-'), + pair_occurrence, + occurrence_dict[core_pair][gene_list[0]], + occurrence_dict[core_pair][gene_list[1]], + occurrence_dict[core_pair]['co_occurrence'], # TODO - Add neighbour ratio? + min_values[0], max_values[0], + round(mean_values[0], 1), round(median_values[0], 1), + min_values[1], max_values[1], + round(mean_values[1], 1), round(median_values[1], 1) + ] # TODO - at the moment all sequence breaks are reported as zero and gives a co-occurrence equal to zero - is this acceptable? 
+ + return summary_dict + + +if __name__ == '__main__': + pass diff --git a/setup.py b/setup.py index e7fe7c4..8908109 100644 --- a/setup.py +++ b/setup.py @@ -13,7 +13,7 @@ setup( name='Corekaburra', - version='0.1.0.0', + version='0.0.1', author='Magnus Ganer Jespersen', author_email='magnus.ganer.j@gmail.com', packages=['Corekaburra'], @@ -25,5 +25,5 @@ license='LICENSE', description=('A prototypical bioinformatics command line tool'), long_description=(LONG_DESCRIPTION), - install_requires=["biopython", "networkx", "gffutils"], + install_requires=["biopython", "networkx", "gffutils", "numpy"], ) diff --git a/unit_tests/Corekaburra_test.py b/unit_tests/Corekaburra_test.py index 876eda2..5581b7e 100644 --- a/unit_tests/Corekaburra_test.py +++ b/unit_tests/Corekaburra_test.py @@ -20,6 +20,7 @@ from Corekaburra import gff_parser from Corekaburra import merge_dicts from Corekaburra import consesus_core_genome +from Corekaburra import summary_table @@ -2773,7 +2774,7 @@ class TestGeneCoOccurrence(unittest.TestCase): """ Test function that identifies the number of genomes in which two core genes co-occur. 
""" - def test_gene_co_occurrence(self): + def test_count_gene_co_occurrence(self): core_gene_dict = {'Silas_the_Salmonella': {'Silas_the_Salmonella_tag-1-1': "A", 'Silas_the_Salmonella_tag-1-2.1': "B", 'Silas_the_Salmonella_tag-1-2.2': "B"}, @@ -2799,12 +2800,16 @@ def test_gene_co_occurrence(self): segment = ["B", "A"] expected_value = 10 + a_occurrence = 10 + b_occurrence = 10 - return_value = consesus_core_genome.gene_co_occurrence(core_gene_dict, segment) + return_value, individual_occurrences = consesus_core_genome.count_gene_co_occurrence(core_gene_dict, segment) self.assertEqual(expected_value, return_value) + self.assertEqual(a_occurrence, individual_occurrences["A"]) + self.assertEqual(b_occurrence, individual_occurrences["B"]) - def test_gene_co_occurrence_no_occurence(self): + def test_count_gene_co_occurrence_no_occurence(self): core_gene_dict = {'Silas_the_Salmonella': {'Silas_the_Salmonella_tag-1-2.1': "B", 'Silas_the_Salmonella_tag-1-2.2': "B"}, 'Christina_the_Streptococcus': {'Christina_the_Streptococcus_tag-2-1': "A"}, @@ -2820,10 +2825,14 @@ def test_gene_co_occurrence_no_occurence(self): segment = ["B", "A"] expected_value = 0 + a_occurrence = 5 + b_occurrence = 5 - return_value = consesus_core_genome.gene_co_occurrence(core_gene_dict, segment) + return_value, individual_occurrences = consesus_core_genome.count_gene_co_occurrence(core_gene_dict, segment) self.assertEqual(expected_value, return_value) + self.assertEqual(a_occurrence, individual_occurrences["A"]) + self.assertEqual(b_occurrence, individual_occurrences["B"]) class TestSegmentationIdentification(unittest.TestCase): @@ -3124,5 +3133,74 @@ def test_accessory_genes_in_multiple_places(self): self.assertEqual(sub_segment_dict, expected_sub_sgments) +class TestSummaryTableConstruction(unittest.TestCase): + def test_summary_table_calculations(self): + master_info = { + 'pan_cluster_1--pan_cluster_2--genome_1': ['genome_1', 'pan_cluster_1', 'pan_cluster_2', 99, 3, + ['Acc_1', 'Acc_2'], 
['low_1']], + 'pan_cluster_1--pan_cluster_2--genome_2': ['genome_2', 'pan_cluster_1', 'pan_cluster_2', 99, 3, + ['Acc_1', 'Acc_2'], ['low_1']], + 'pan_cluster_1--pan_cluster_2--genome_3': ['genome_3', 'pan_cluster_1', 'pan_cluster_2', 99, 3, + ['Acc_1', 'Acc_2'], ['low_1']], + 'pan_cluster_2--pan_cluster_3--genome_1': ['genome_1', 'pan_cluster_2', 'pan_cluster_3', 100, 2, + ['Acc_1', 'Acc_2'], []], + 'pan_cluster_2--pan_cluster_4--genome_2': ['genome_2', 'pan_cluster_2', 'pan_cluster_3', 150, 1, + ['Acc_1', ], []], + 'pan_cluster_2--pan_cluster_3--genome_3': ['genome_3', 'pan_cluster_2', 'pan_cluster_3', 200, 0, + [], []], + 'pan_cluster_3--pan_cluster_4--genome_1': ['genome_1', 'pan_cluster_2', 'pan_cluster_3', -5, 0, + [], []], + 'pan_cluster_3--pan_cluster_4--genome_3': ['genome_3', 'pan_cluster_2', 'pan_cluster_3', -10, 0, + [], []] + } + + core_gene_dict = {'genome_1': {'gene_1': 'pan_cluster_1', 'gene_2': 'pan_cluster_2', 'gene_3': 'pan_cluster_3', 'gene_4': 'pan_cluster_4'}, + 'genome_2': {'gene_1': 'pan_cluster_1', 'gene_2': 'pan_cluster_2', 'gene_4': 'pan_cluster_4'}, + 'genome_3': {'gene_1': 'pan_cluster_1', 'gene_2': 'pan_cluster_2', 'gene_3': 'pan_cluster_3', 'gene_4': 'pan_cluster_4'}} + + expected_table = {'pan_cluster_1--pan_cluster_2': ['pan_cluster_1-pan_cluster_2', 3, 3, 3, 3, 99, 99, 99.0, 99.0, 3, 3, 3.0, 3.0], + 'pan_cluster_2--pan_cluster_3': ['pan_cluster_2-pan_cluster_3', 2, 3, 2, 2, 100, 200, 150.0, 150.0, 0, 2, 1.0, 1.0], + 'pan_cluster_2--pan_cluster_4': ['pan_cluster_2-pan_cluster_4', 1, 3, 3, 3, 150, 150, 150.0, 150.0, 1, 1, 1.0, 1.0], + 'pan_cluster_3--pan_cluster_4': ['pan_cluster_3-pan_cluster_4', 2, 2, 3, 2, -10, -5, -7.5, -7.5, 0, 0, 0.0, 0.0]} + + return_table = summary_table.calculate_n_create_summaries(master_info, core_gene_dict) + + self.assertEqual(expected_table, return_table) + + def test_summary_table_calculations_w_sequence_breaks(self): + master_info = { + 'pan_cluster_1--pan_cluster_2--genome_1': ['genome_1', 
'pan_cluster_1', 'pan_cluster_2', 99, 3, + ['Acc_1', 'Acc_2'], ['low_1']], + 'pan_cluster_1--Sequence_break--genome_2': ['genome_2', 'pan_cluster_1', 'Sequence_break', 100, 0, + [], []], + 'Sequence_break--pan_cluster_2--genome_2': ['genome_2', 'Sequence_break', 'pan_cluster_2', 99, 3, + ['Acc_1', 'Acc_2'], ['low_1']], + 'pan_cluster_1--pan_cluster_2--genome_3': ['genome_3', 'pan_cluster_1', 'pan_cluster_2', 99, 3, + ['Acc_1', 'Acc_2'], ['low_1']], + 'pan_cluster_2--pan_cluster_3--genome_1': ['genome_1', 'pan_cluster_2', 'pan_cluster_3', 100, 2, + ['Acc_1', 'Acc_2'], []], + 'pan_cluster_2--pan_cluster_3--genome_3': ['genome_3', 'pan_cluster_2', 'pan_cluster_3', 200, 0, + [], []], + 'pan_cluster_3--Sequence_break--genome_1': ['genome_1', 'pan_cluster_3', 'Sequence_break', 100, 2, + ['Acc_1', 'Acc_2'], []], + 'pan_cluster_3--Sequence_break--genome_3': ['genome_3', 'pan_cluster_3', 'Sequence_break', 200, 0, + [], []] + } + + core_gene_dict = {'genome_1': {'gene_1': 'pan_cluster_1', 'gene_2': 'pan_cluster_2', 'gene_3': 'pan_cluster_3'}, + 'genome_2': {'gene_1': 'pan_cluster_1', 'gene_2': 'pan_cluster_2'}, + 'genome_3': {'gene_1': 'pan_cluster_1', 'gene_2': 'pan_cluster_2', 'gene_3': 'pan_cluster_3'}} + + expected_table = {'pan_cluster_1--pan_cluster_2': ['pan_cluster_1-pan_cluster_2', 2, 3, 3, 3, 99, 99, 99.0, 99.0, 3, 3, 3.0, 3.0], + 'pan_cluster_1--Sequence_break': ['pan_cluster_1-Sequence_break', 1, 3, 0, 0, 100, 100, 100.0, 100.0, 0, 0, 0.0, 0.0], + 'Sequence_break--pan_cluster_2': ['Sequence_break-pan_cluster_2', 1, 0, 3, 0, 99, 99, 99.0, 99.0, 3, 3, 3.0, 3.0], + 'pan_cluster_2--pan_cluster_3': ['pan_cluster_2-pan_cluster_3', 2, 3, 2, 2, 100, 200, 150.0, 150.0, 0, 2, 1.0, 1.0], + 'pan_cluster_3--Sequence_break': ['pan_cluster_3-Sequence_break', 2, 2, 0, 0, 100, 200, 150.0, 150.0, 0, 2, 1.0, 1.0]} + + return_table = summary_table.calculate_n_create_summaries(master_info, core_gene_dict) + + self.assertEqual(expected_table, return_table) + + if __name__ == 
'__main__': unittest.main() From b6c121e8e5709cd1480dc3c830cf1243cefbc146 Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Wed, 5 Jan 2022 13:21:17 +1100 Subject: [PATCH 020/135] Add in the functions for writing outputs, along with tests for these functions --- Corekaburra/__main__.py | 23 ++++ Corekaburra/output_writer_functions.py | 111 +++++++---------- unit_tests/Corekaburra_test.py | 117 +++++++++++++++++- .../core_segments.txt | 10 ++ .../gene_content.txt | 13 ++ .../TestWritingOutputFunction/low_freq.txt | 9 ++ .../no_acc_segments.txt | 10 ++ .../summary_table.txt | 5 + 8 files changed, 229 insertions(+), 69 deletions(-) create mode 100644 unit_tests/unit_test_data/TestWritingOutputFunction/core_segments.txt create mode 100644 unit_tests/unit_test_data/TestWritingOutputFunction/gene_content.txt create mode 100644 unit_tests/unit_test_data/TestWritingOutputFunction/low_freq.txt create mode 100644 unit_tests/unit_test_data/TestWritingOutputFunction/no_acc_segments.txt create mode 100644 unit_tests/unit_test_data/TestWritingOutputFunction/summary_table.txt diff --git a/Corekaburra/__main__.py b/Corekaburra/__main__.py index e2bef53..0dca923 100644 --- a/Corekaburra/__main__.py +++ b/Corekaburra/__main__.py @@ -252,6 +252,29 @@ def main(): # Produce dict containing summarised information from master info. master_summary_info = calculate_n_create_summaries(master_info_total) + ### WRITE OUTPUTS ### + print(f"\n-----------------------Printing outputs-----------------------") + # Write master information to output file + time_start = time.time() + master_info_writer(master_info_total, args.output_path, args.output_prefix, args.quiet) + summary_info_writer(master_summary_info, args.output_path, args.output_prefix, args.quiet) + # TODO - Contruct output for segments - parent column. 
+ segment_writer(double_edge_segements, args.output_path, args.output_prefix, args.quiet) + no_acc_segment_writer(no_acc_segments, args.output_path, args.output_prefix, args.quiet) + # print(non_core_contig_info) TODO - Print core less contigs. + # TODO - Possibly output core gene graph. with segment annotations? + + # time_calculator(time_start, time.time(), "writing output files") + + # Finish up running + # time_calculator(total_time_start, time.time(), "running the entire program") + + # Remove temporary database holding gff databases + # TODO - Implement a nice crash function where the temporary folder is removed not to cause unessecary frustration for the user when trying to rerun the program. - do so in nice exit function + # print(isdir(temp_folder_path)) + # if isdir(temp_folder_path): + # rmdir(temp_folder_path) + if __name__ == '__main__': main() diff --git a/Corekaburra/output_writer_functions.py b/Corekaburra/output_writer_functions.py index 110b9f2..d980c5c 100644 --- a/Corekaburra/output_writer_functions.py +++ b/Corekaburra/output_writer_functions.py @@ -4,13 +4,21 @@ def master_info_writer(master_info, out_path, prefix, quiet): + """ + Function to write two output .tsv files related to regions content and size for each genome + :param master_info: Dict of info for each core gene pair across all genomes + :param out_path: Path to the output folder + :param prefix: A possible prefix for the output files. 
+ :param quiet: + :return: Nothing + """ if not quiet: print("Printing master output") # Write general content out_file_name = 'low_frequency_gene_placement.tsv' if prefix is not None: - out_file_name = prefix + '_' + out_file_name + out_file_name = f'{prefix}_{out_file_name}' with open(os.path.join(out_path, out_file_name), 'w', newline='', encoding='utf-8') as out_file: writer = csv.writer(out_file, delimiter="\t") @@ -24,12 +32,11 @@ def master_info_writer(master_info, out_path, prefix, quiet): info = master_info[key][0:5] writer.writerow(info) - out_file.close() # Write gene content in long format out_file_name = 'core_core_accessory_gene_content.tsv' if prefix is not None: - out_file_name = prefix + '_' + out_file_name + out_file_name = f'{prefix}_{out_file_name}' with open(os.path.join(out_path, out_file_name), 'w', newline='', encoding='utf-8') as out_file: writer = csv.writer(out_file, delimiter="\t") @@ -59,10 +66,16 @@ def master_info_writer(master_info, out_path, prefix, quiet): 'intermediate_frequency'] writer.writerow(row) - out_file.close() - def summary_info_writer(master_summary_info, out_path, prefix, quiet): + """ + Function for writing the summary table for regions identified across genomes + :param master_summary_info: Dict holding summary statistics for core pair region identified + :param out_path: Path to the output folder + :param prefix: Prefix for any output files + :param quiet: # TODO - log instead + :return: Nothing + """ if not quiet: print("Printing master output") @@ -77,6 +90,7 @@ def summary_info_writer(master_summary_info, out_path, prefix, quiet): # Create header header = ['Core_pair', 'n', + 'occurrence_core_1', 'occurrence_core_2', 'co_occurrence', 'min_dist', 'max_dist', 'mean_dist', 'median_dist', 'min_acc', 'max_acc', 'mean_acc', 'median_acc'] writer.writerow(header) @@ -86,9 +100,17 @@ def summary_info_writer(master_summary_info, out_path, prefix, quiet): info = master_summary_info[key] writer.writerow(info) - 
out_file.close() + def segment_writer(segments, out_path, prefix, quiet): + """ + Function to write segments of core genes identified across the pan-genome + :param segments: Dict of segments (lists) in values, under name of segments as keys. + :param out_path: Path to output folder + :param prefix: Prefix for any output files + :param quiet: # TODO - logger + :return: Nothing + """ if not quiet: print("Printing core segments") @@ -102,19 +124,26 @@ def segment_writer(segments, out_path, prefix, quiet): writer = csv.writer(out_file) # Create header - header = ['Segment_name', 'segment_position', 'core_gene'] + header = ['Segment_name', 'Segment_position', 'Core_gene'] writer.writerow(header) # Write remaining rows: for key in sorted(segments.keys()): for index, gene in enumerate(segments[key]): - info = [key, index+1, gene] + info = [key.replace('--', '-'), index+1, gene] writer.writerow(info) - out_file.close() def no_acc_segment_writer(no_acc_segments, out_path, prefix, quiet): + """ + Function for writing segments of core genes with no accessory between them. + :param no_acc_segments: Dict of segments with (lists) in values with sub-lists being segments with no accessory genes between them, under name of segments as keys. 
+ :param out_path: Path to output folder + :param prefix: Prefix for any output files + :param quiet: # TODO - logger + :return: Nothing + """ if not quiet: print("Printing core segments without accessory content") @@ -128,72 +157,18 @@ def no_acc_segment_writer(no_acc_segments, out_path, prefix, quiet): writer = csv.writer(out_file) # Create header - header = ['Parent_Segment_name', 'Sub_segment_name', 'Parent_segment_position', 'Sub_segment_position', 'core_gene'] + header = ['Parent_segment_name', 'Sub_segment_name', 'Parent_segment_position', 'Sub_segment_position', 'Core_gene'] writer.writerow(header) # Write remaining rows: for key in sorted(no_acc_segments.keys()): for sub_index, subsegment in enumerate(no_acc_segments[key]): - sub_name = f'{subsegment[0]}--{subsegment[-1]}' + sub_name = f'{subsegment[0]}-{subsegment[-1]}' for index, gene in enumerate(subsegment): - info = [key, sub_name, sub_index + 1, index + 1, gene] + info = [key.replace('--', '-'), sub_name, sub_index + 1, index + 1, gene] writer.writerow(info) - out_file.close() - - - - -def write_consensus_core_gene_synteny(core_gene_synteny): - with open('consensus_core_gene_synteny.txt', 'w', newline='', encoding='utf-8') as out_file: - for gene in core_gene_synteny: - out_file.write(f'{gene}\n') - out_file.close() - - -def write_core_gene_coverage(core_path_coverage): - """ Function to write """ - with open('core_gene_coverage.tsv', 'w', newline='', encoding='utf-8') as out_file: - writer = csv.writer(out_file, delimiter='\t') - - header = ['Core_gene_1', 'Core_gene_2', 'Connections'] - writer.writerow(header) - - for connection in core_path_coverage: - writer.writerow(connection) - out_file.close() - - -def write_alternative_core_gene_counts(alternative_core_gene_counts): - with open('alternative_core_pairs_count.tsv', 'w', newline='', encoding='utf-8') as out_file: - writer = csv.writer(out_file, delimiter='\t') - - header = ['Core_gene_1', 'Core_gene_2', 'Num._connections'] - 
writer.writerow(header) - - for key in alternative_core_gene_counts.keys(): - split_key = key.split('--') - - row_info = [split_key[0].strip(), split_key[1].strip(), alternative_core_gene_counts[key]] - - writer.writerow(row_info) - out_file.close() - - -def write_core_gene_types(core_genome_types, alt_core_pair_matrix): - with open('core_genome_synteny_types.csv', 'w', newline='', encoding='utf-8') as out_file: - header = ['Genome', 'Type'] - - writer = csv.writer(out_file, delimiter=',') - writer.writerow(header) - for key in core_genome_types: - writer.writerow([key, core_genome_types[key]]) - - out_file.close() - - with open('core_pair_matrix.csv', 'w', newline='', encoding='utf-8') as out_file: - writer = csv.DictWriter(out_file, fieldnames=alt_core_pair_matrix[1]) - writer.writeheader() - writer.writerows(alt_core_pair_matrix[0]) +if __name__ == "__main__": + pass \ No newline at end of file diff --git a/unit_tests/Corekaburra_test.py b/unit_tests/Corekaburra_test.py index 5581b7e..b8e9ae4 100644 --- a/unit_tests/Corekaburra_test.py +++ b/unit_tests/Corekaburra_test.py @@ -7,7 +7,7 @@ # import import unittest import os -import json +import io from shutil import copyfile import logging # pylint: disable=no-name-in-module @@ -21,6 +21,7 @@ from Corekaburra import merge_dicts from Corekaburra import consesus_core_genome from Corekaburra import summary_table +from Corekaburra import output_writer_functions @@ -3202,5 +3203,119 @@ def test_summary_table_calculations_w_sequence_breaks(self): self.assertEqual(expected_table, return_table) +class TestWritingOutputFunction(unittest.TestCase): + """ + Function to test the creation of output files + """ + def tearDown(self): + """ Class to remove created database files of gff files in tmp-folder""" + for file in os.listdir('TestWritingOutputFunction'): + if "sv" in file: + db_path = os.path.join('TestWritingOutputFunction', file) + os.remove(db_path) + + def test_master_info_writer(self): + master_info = { + 
'pan_cluster_1--pan_cluster_2--genome_1': ['genome_1', 'pan_cluster_1', 'pan_cluster_2', 99, 3, + ['Acc_1', 'Acc_2'], ['low_1']], + 'pan_cluster_1--pan_cluster_2--genome_2': ['genome_2', 'pan_cluster_1', 'pan_cluster_2', 99, 3, + ['Acc_1', 'Acc_2'], ['low_1']], + 'pan_cluster_1--pan_cluster_2--genome_3': ['genome_3', 'pan_cluster_1', 'pan_cluster_2', 99, 3, + ['Acc_1', 'Acc_2'], ['low_1']], + 'pan_cluster_2--pan_cluster_3--genome_1': ['genome_1', 'pan_cluster_2', 'pan_cluster_3', 100, 2, + ['Acc_1', 'Acc_2'], []], + 'pan_cluster_2--pan_cluster_4--genome_2': ['genome_2', 'pan_cluster_2', 'pan_cluster_4', 150, 1, + ['Acc_1', ], []], + 'pan_cluster_2--pan_cluster_3--genome_3': ['genome_3', 'pan_cluster_2', 'pan_cluster_3', 200, 0, + [], []], + 'pan_cluster_3--pan_cluster_4--genome_1': ['genome_1', 'pan_cluster_3', 'pan_cluster_4', -5, 0, + [], []], + 'pan_cluster_3--pan_cluster_4--genome_3': ['genome_3', 'pan_cluster_3', 'pan_cluster_4', -10, 0, + [], []] + } + out_path = 'TestWritingOutputFunction' + prefix = 'test' + + expected_low_freq = 'TestWritingOutputFunction/low_freq.txt' + expected_gene_content = 'TestWritingOutputFunction/gene_content.txt' + + output_writer_functions.master_info_writer(master_info, out_path, prefix, True) + + with open(expected_low_freq, 'r') as expected: + with open('TestWritingOutputFunction/test_low_frequency_gene_placement.tsv', 'r') as result: + self.assertEqual(expected.readlines(), result.readlines()) + + with open(expected_gene_content, 'r') as expected: + with open('TestWritingOutputFunction/test_core_core_accessory_gene_content.tsv', 'r') as result: + self.assertEqual(expected.readlines(), result.readlines()) + + def test_summary_info_writer(self): + + input_dict = { + 'pan_cluster_1--pan_cluster_2': ['pan_cluster_1-pan_cluster_2', 3, 3, 3, 3, 99, 99, 99.0, 99.0, 3, 3, 3.0, + 3.0], + 'pan_cluster_2--pan_cluster_3': ['pan_cluster_2-pan_cluster_3', 2, 3, 2, 2, 100, 200, 150.0, 150.0, 0, 2, + 1.0, 1.0], + 
'pan_cluster_2--pan_cluster_4': ['pan_cluster_2-pan_cluster_4', 1, 3, 3, 3, 150, 150, 150.0, 150.0, 1, 1, + 1.0, 1.0], + 'pan_cluster_3--pan_cluster_4': ['pan_cluster_3-pan_cluster_4', 2, 2, 3, 2, -10, -5, -7.5, -7.5, 0, 0, 0.0, + 0.0]} + + out_path = 'TestWritingOutputFunction' + prefix = 'test' + + expected_summary_table = 'TestWritingOutputFunction/summary_table.txt' + + output_writer_functions.summary_info_writer(input_dict, out_path, prefix, True) + + with open(expected_summary_table, 'r') as expected: + with open('TestWritingOutputFunction/test_core_pair_summary.csv', 'r') as result: + self.assertEqual(expected.readlines(), result.readlines()) + + def test_segment_writer(self): + input_segments = {'pan_cluster_2--pan_cluster_4': ['pan_cluster_2', + 'pan_cluster_3', + 'pan_cluster_4'], + 'pan_cluster_2--pan_cluster_6': ['pan_cluster_2', + 'pan_cluster_1', + 'pan_cluster_6'], + 'pan_cluster_4--pan_cluster_6': ['pan_cluster_4', + 'pan_cluster_5', + 'pan_cluster_6']} + + out_path = 'TestWritingOutputFunction' + prefix = 'test' + + expected_summary_table = 'TestWritingOutputFunction/core_segments.txt' + + output_writer_functions.segment_writer(input_segments, out_path, prefix, True) + + with open(expected_summary_table, 'r') as expected: + with open('TestWritingOutputFunction/test_core_segments.csv', 'r') as result: + self.assertEqual(expected.readlines(), result.readlines()) + + def test_no_acc_segment_writer(self): + input_segments = {'pan_cluster_2--pan_cluster_4': [['pan_cluster_2'], + ['pan_cluster_3', + 'pan_cluster_4']], + 'pan_cluster_2--pan_cluster_6': [['pan_cluster_2'], + ['pan_cluster_1'], + ['pan_cluster_6']], + 'pan_cluster_4--pan_cluster_6': [['pan_cluster_4', + 'pan_cluster_5'], + ['pan_cluster_6']]} + + out_path = 'TestWritingOutputFunction' + prefix = 'test' + + expected_summary_table = 'TestWritingOutputFunction/no_acc_segments.txt' + + output_writer_functions.no_acc_segment_writer(input_segments, out_path, prefix, True) + + with 
open(expected_summary_table, 'r') as expected: + with open('TestWritingOutputFunction/test_no_accessory_core_segments.csv', 'r') as result: + self.assertEqual(expected.readlines(), result.readlines()) + + if __name__ == '__main__': unittest.main() diff --git a/unit_tests/unit_test_data/TestWritingOutputFunction/core_segments.txt b/unit_tests/unit_test_data/TestWritingOutputFunction/core_segments.txt new file mode 100644 index 0000000..5898524 --- /dev/null +++ b/unit_tests/unit_test_data/TestWritingOutputFunction/core_segments.txt @@ -0,0 +1,10 @@ +Segment_name,Segment_position,Core_gene +pan_cluster_2-pan_cluster_4,1,pan_cluster_2 +pan_cluster_2-pan_cluster_4,2,pan_cluster_3 +pan_cluster_2-pan_cluster_4,3,pan_cluster_4 +pan_cluster_2-pan_cluster_6,1,pan_cluster_2 +pan_cluster_2-pan_cluster_6,2,pan_cluster_1 +pan_cluster_2-pan_cluster_6,3,pan_cluster_6 +pan_cluster_4-pan_cluster_6,1,pan_cluster_4 +pan_cluster_4-pan_cluster_6,2,pan_cluster_5 +pan_cluster_4-pan_cluster_6,3,pan_cluster_6 diff --git a/unit_tests/unit_test_data/TestWritingOutputFunction/gene_content.txt b/unit_tests/unit_test_data/TestWritingOutputFunction/gene_content.txt new file mode 100644 index 0000000..b35a276 --- /dev/null +++ b/unit_tests/unit_test_data/TestWritingOutputFunction/gene_content.txt @@ -0,0 +1,13 @@ +Gff Core_gene_1 Core_gene_2 gene type +genome_1 pan_cluster_1 pan_cluster_2 Acc_1 low_frequency +genome_1 pan_cluster_1 pan_cluster_2 Acc_2 low_frequency +genome_1 pan_cluster_1 pan_cluster_2 low_1 intermediate_frequency +genome_2 pan_cluster_1 pan_cluster_2 Acc_1 low_frequency +genome_2 pan_cluster_1 pan_cluster_2 Acc_2 low_frequency +genome_2 pan_cluster_1 pan_cluster_2 low_1 intermediate_frequency +genome_3 pan_cluster_1 pan_cluster_2 Acc_1 low_frequency +genome_3 pan_cluster_1 pan_cluster_2 Acc_2 low_frequency +genome_3 pan_cluster_1 pan_cluster_2 low_1 intermediate_frequency +genome_1 pan_cluster_2 pan_cluster_3 Acc_1 low_frequency +genome_1 pan_cluster_2 pan_cluster_3 Acc_2 
low_frequency +genome_2 pan_cluster_2 pan_cluster_4 Acc_1 low_frequency diff --git a/unit_tests/unit_test_data/TestWritingOutputFunction/low_freq.txt b/unit_tests/unit_test_data/TestWritingOutputFunction/low_freq.txt new file mode 100644 index 0000000..b36c0e8 --- /dev/null +++ b/unit_tests/unit_test_data/TestWritingOutputFunction/low_freq.txt @@ -0,0 +1,9 @@ +Gff Core_gene_1 Core_gene_2 Core_region_size Core_region_accessory_count +genome_1 pan_cluster_1 pan_cluster_2 99 3 +genome_2 pan_cluster_1 pan_cluster_2 99 3 +genome_3 pan_cluster_1 pan_cluster_2 99 3 +genome_1 pan_cluster_2 pan_cluster_3 100 2 +genome_2 pan_cluster_2 pan_cluster_4 150 1 +genome_3 pan_cluster_2 pan_cluster_3 200 0 +genome_1 pan_cluster_3 pan_cluster_4 -5 0 +genome_3 pan_cluster_3 pan_cluster_4 -10 0 diff --git a/unit_tests/unit_test_data/TestWritingOutputFunction/no_acc_segments.txt b/unit_tests/unit_test_data/TestWritingOutputFunction/no_acc_segments.txt new file mode 100644 index 0000000..6eba408 --- /dev/null +++ b/unit_tests/unit_test_data/TestWritingOutputFunction/no_acc_segments.txt @@ -0,0 +1,10 @@ +Parent_segment_name,Sub_segment_name,Parent_segment_position,Sub_segment_position,Core_gene +pan_cluster_2-pan_cluster_4,pan_cluster_2-pan_cluster_2,1,1,pan_cluster_2 +pan_cluster_2-pan_cluster_4,pan_cluster_3-pan_cluster_4,2,1,pan_cluster_3 +pan_cluster_2-pan_cluster_4,pan_cluster_3-pan_cluster_4,2,2,pan_cluster_4 +pan_cluster_2-pan_cluster_6,pan_cluster_2-pan_cluster_2,1,1,pan_cluster_2 +pan_cluster_2-pan_cluster_6,pan_cluster_1-pan_cluster_1,2,1,pan_cluster_1 +pan_cluster_2-pan_cluster_6,pan_cluster_6-pan_cluster_6,3,1,pan_cluster_6 +pan_cluster_4-pan_cluster_6,pan_cluster_4-pan_cluster_5,1,1,pan_cluster_4 +pan_cluster_4-pan_cluster_6,pan_cluster_4-pan_cluster_5,1,2,pan_cluster_5 +pan_cluster_4-pan_cluster_6,pan_cluster_6-pan_cluster_6,2,1,pan_cluster_6 diff --git a/unit_tests/unit_test_data/TestWritingOutputFunction/summary_table.txt 
b/unit_tests/unit_test_data/TestWritingOutputFunction/summary_table.txt new file mode 100644 index 0000000..64a6bb6 --- /dev/null +++ b/unit_tests/unit_test_data/TestWritingOutputFunction/summary_table.txt @@ -0,0 +1,5 @@ +Core_pair,n,occurrence_core_1,occurrence_core_2,co_occurrence,min_dist,max_dist,mean_dist,median_dist,min_acc,max_acc,mean_acc,median_acc +pan_cluster_1-pan_cluster_2,3,3,3,3,99,99,99.0,99.0,3,3,3.0,3.0 +pan_cluster_2-pan_cluster_3,2,3,2,2,100,200,150.0,150.0,0,2,1.0,1.0 +pan_cluster_2-pan_cluster_4,1,3,3,3,150,150,150.0,150.0,1,1,1.0,1.0 +pan_cluster_3-pan_cluster_4,2,2,3,2,-10,-5,-7.5,-7.5,0,0,0.0,0.0 From 8a5d9a21c29b365e9e7746a72218ff53e0585ec0 Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Wed, 5 Jan 2022 13:34:11 +1100 Subject: [PATCH 021/135] Remove standard Bioinitio files from functional test folder. Tidy unit-test script, and add in import of output function to main script --- Corekaburra/__main__.py | 4 +++ .../test_data/empty_file.expected | 31 ------------------- functional_tests/test_data/no_header | 1 - functional_tests/test_data/no_input.expected | 21 +++++++++---- functional_tests/test_data/one_sequence.fasta | 5 --- .../test_data/one_sequence.fasta.expected | 2 -- .../test_data/single_greater_than.fasta | 1 - functional_tests/test_data/two_sequence.fasta | 8 ----- .../test_data/two_sequence.fasta.expected | 2 -- .../two_sequence.fasta.minlen_1000.expected | 2 -- .../two_sequence.fasta.minlen_200.expected | 2 -- ...o_sequence.fasta.minlen_200.stdin.expected | 2 -- unit_tests/Corekaburra_test.py | 3 -- 13 files changed, 19 insertions(+), 65 deletions(-) delete mode 100644 functional_tests/test_data/empty_file.expected delete mode 100644 functional_tests/test_data/no_header delete mode 100644 functional_tests/test_data/one_sequence.fasta delete mode 100644 functional_tests/test_data/one_sequence.fasta.expected delete mode 100644 functional_tests/test_data/single_greater_than.fasta delete mode 
100644 functional_tests/test_data/two_sequence.fasta delete mode 100644 functional_tests/test_data/two_sequence.fasta.expected delete mode 100644 functional_tests/test_data/two_sequence.fasta.minlen_1000.expected delete mode 100644 functional_tests/test_data/two_sequence.fasta.minlen_200.expected delete mode 100644 functional_tests/test_data/two_sequence.fasta.minlen_200.stdin.expected diff --git a/Corekaburra/__main__.py b/Corekaburra/__main__.py index 0dca923..a465713 100644 --- a/Corekaburra/__main__.py +++ b/Corekaburra/__main__.py @@ -55,6 +55,10 @@ except ModuleNotFoundError: from summary_table import calculate_n_create_summaries +try: + from Corekaburra.output_writer_functions import master_info_writer, summary_info_writer, segment_writer, no_acc_segment_writer +except ModuleNotFoundError: + from output_writer_functions import master_info_writer, summary_info_writer, segment_writer, no_acc_segment_writer from argparse import ArgumentParser from math import floor diff --git a/functional_tests/test_data/empty_file.expected b/functional_tests/test_data/empty_file.expected deleted file mode 100644 index 0f6b828..0000000 --- a/functional_tests/test_data/empty_file.expected +++ /dev/null @@ -1,31 +0,0 @@ -usage: __main__.py [-h] -ig file_1.gff ... file_n.gff - [file_1.gff ... file_n.gff ...] -ip path/to/pan_genome - [-cg complete_genomes.txt] [-o path/to/output] - [-p OUTPUT_PREFIX] [-a] [-c int] [-l | -q] - -Welcome to Corekaburra! Program to determine consensus core sequence from -multiple genomes. Outputs consensus core gene alignment, distance between core -genes, number of accessory genes between core genes and low frequency genes -between core genes - -optional arguments: - -h, --help show this help message and exit - -ig file_1.gff ... file_n.gff [file_1.gff ... file_n.gff ...], --input_gffs file_1.gff ... file_n.gff [file_1.gff ... file_n.gff ...] 
- Path to gff files used for pan-genome - -ip path/to/pan_genome, --input_pangenome path/to/pan_genome - Path to the folder produced by Panaroo or Roary - -cg complete_genomes.txt, --complete_genomes complete_genomes.txt - text file containing names of genomes that are to be - handled as complete genomes - -o path/to/output, --output path/to/output - Path to where output files will be placed [default: - current folder] - -p OUTPUT_PREFIX, --prefix OUTPUT_PREFIX - Prefix for output files, if any is desired - -a, --no_annotate_refound - Flag to toggle off the creation of new gff files, with - annotation of refound genes. Only done if input - pangenome is detected as comming from Panaroo - -c int, --cpu int Give max number of CPUs [default: 1] - -l, --log Record program progress in for debugging purpose - -q, --quiet Only print warnings diff --git a/functional_tests/test_data/no_header b/functional_tests/test_data/no_header deleted file mode 100644 index 02ab7d2..0000000 --- a/functional_tests/test_data/no_header +++ /dev/null @@ -1 +0,0 @@ -ATGC diff --git a/functional_tests/test_data/no_input.expected b/functional_tests/test_data/no_input.expected index 02b4b8c..0f6b828 100644 --- a/functional_tests/test_data/no_input.expected +++ b/functional_tests/test_data/no_input.expected @@ -1,7 +1,12 @@ -usage: __main__.py [-h] -ig file_1.gff ... file_n.gff [file_1.gff ... file_n.gff ...] -ip path/to/pan_genome [-cg complete_genomes.txt] [-o path/to/output] [-p OUTPUT_PREFIX] [-a] [-c int] [-l | -q] +usage: __main__.py [-h] -ig file_1.gff ... file_n.gff + [file_1.gff ... file_n.gff ...] -ip path/to/pan_genome + [-cg complete_genomes.txt] [-o path/to/output] + [-p OUTPUT_PREFIX] [-a] [-c int] [-l | -q] -Welcome to Corekaburra! Program to determine consensus core sequence from multiple genomes. Outputs consensus core gene alignment, distance between core genes, number of accessory genes between core genes and low frequency genes between core -genes +Welcome to Corekaburra! 
Program to determine consensus core sequence from +multiple genomes. Outputs consensus core gene alignment, distance between core +genes, number of accessory genes between core genes and low frequency genes +between core genes optional arguments: -h, --help show this help message and exit @@ -10,13 +15,17 @@ optional arguments: -ip path/to/pan_genome, --input_pangenome path/to/pan_genome Path to the folder produced by Panaroo or Roary -cg complete_genomes.txt, --complete_genomes complete_genomes.txt - text file containing names of genomes that are to be handled as complete genomes + text file containing names of genomes that are to be + handled as complete genomes -o path/to/output, --output path/to/output - Path to where output files will be placed [default: current folder] + Path to where output files will be placed [default: + current folder] -p OUTPUT_PREFIX, --prefix OUTPUT_PREFIX Prefix for output files, if any is desired -a, --no_annotate_refound - Flag to toggle off the creation of new gff files, with annotation of refound genes. Only done if input pangenome is detected as comming from Panaroo + Flag to toggle off the creation of new gff files, with + annotation of refound genes. 
Only done if input + pangenome is detected as comming from Panaroo -c int, --cpu int Give max number of CPUs [default: 1] -l, --log Record program progress in for debugging purpose -q, --quiet Only print warnings diff --git a/functional_tests/test_data/one_sequence.fasta b/functional_tests/test_data/one_sequence.fasta deleted file mode 100644 index ad03ad2..0000000 --- a/functional_tests/test_data/one_sequence.fasta +++ /dev/null @@ -1,5 +0,0 @@ ->SEQUENCE_1 -MTEITAAMVKELRESTGAGMMDCKNALSETNGDFDKAVQLLREKGLGKAAKKADRLAAEG -LVSVKVSDDFTIAAMRPSYLSYEDLDMTFVENEYKALVAELEKENEERRRLKDPNKPEHK -IPQFASRKQLSDAILKEAEEKIKEELKAQGKPEKIWDNIIPGKMNSFIADNSQLDSKLTL -MGQFYVMDDKKTVEQVIAEKEKEFGGKIKIVEFICFEVGEGLEKKTEDFAAEVAAQL diff --git a/functional_tests/test_data/one_sequence.fasta.expected b/functional_tests/test_data/one_sequence.fasta.expected deleted file mode 100644 index fabcc26..0000000 --- a/functional_tests/test_data/one_sequence.fasta.expected +++ /dev/null @@ -1,2 +0,0 @@ -FILENAME NUMSEQ TOTAL MIN AVG MAX -one_sequence.fasta 1 237 237 237 237 diff --git a/functional_tests/test_data/single_greater_than.fasta b/functional_tests/test_data/single_greater_than.fasta deleted file mode 100644 index 0817502..0000000 --- a/functional_tests/test_data/single_greater_than.fasta +++ /dev/null @@ -1 +0,0 @@ -> \ No newline at end of file diff --git a/functional_tests/test_data/two_sequence.fasta b/functional_tests/test_data/two_sequence.fasta deleted file mode 100644 index 0570cc8..0000000 --- a/functional_tests/test_data/two_sequence.fasta +++ /dev/null @@ -1,8 +0,0 @@ ->SEQUENCE_1 -MTEITAAMVKELRESTGAGMMDCKNALSETNGDFDKAVQLLREKGLGKAAKKADRLAAEG -LVSVKVSDDFTIAAMRPSYLSYEDLDMTFVENEYKALVAELEKENEERRRLKDPNKPEHK -IPQFASRKQLSDAILKEAEEKIKEELKAQGKPEKIWDNIIPGKMNSFIADNSQLDSKLTL -MGQFYVMDDKKTVEQVIAEKEKEFGGKIKIVEFICFEVGEGLEKKTEDFAAEVAAQL ->SEQUENCE_2 -SATVSEINSETDFVAKNDQFIALTKDTTAHIQSNSLQSVEELHSSTINGVKFEEYLKSQI -ATIGENLVVRRFATLKAGANGVVNGYIHTNGRVGVVIAAACDSAEVASKSRDLLRQICMH diff --git 
a/functional_tests/test_data/two_sequence.fasta.expected b/functional_tests/test_data/two_sequence.fasta.expected deleted file mode 100644 index 4246c96..0000000 --- a/functional_tests/test_data/two_sequence.fasta.expected +++ /dev/null @@ -1,2 +0,0 @@ -FILENAME NUMSEQ TOTAL MIN AVG MAX -two_sequence.fasta 2 357 120 178 237 diff --git a/functional_tests/test_data/two_sequence.fasta.minlen_1000.expected b/functional_tests/test_data/two_sequence.fasta.minlen_1000.expected deleted file mode 100644 index 0f0ae9b..0000000 --- a/functional_tests/test_data/two_sequence.fasta.minlen_1000.expected +++ /dev/null @@ -1,2 +0,0 @@ -FILENAME NUMSEQ TOTAL MIN AVG MAX -two_sequence.fasta 0 0 - - - diff --git a/functional_tests/test_data/two_sequence.fasta.minlen_200.expected b/functional_tests/test_data/two_sequence.fasta.minlen_200.expected deleted file mode 100644 index 82e3549..0000000 --- a/functional_tests/test_data/two_sequence.fasta.minlen_200.expected +++ /dev/null @@ -1,2 +0,0 @@ -FILENAME NUMSEQ TOTAL MIN AVG MAX -two_sequence.fasta 1 237 237 237 237 diff --git a/functional_tests/test_data/two_sequence.fasta.minlen_200.stdin.expected b/functional_tests/test_data/two_sequence.fasta.minlen_200.stdin.expected deleted file mode 100644 index 2c269d8..0000000 --- a/functional_tests/test_data/two_sequence.fasta.minlen_200.stdin.expected +++ /dev/null @@ -1,2 +0,0 @@ -FILENAME NUMSEQ TOTAL MIN AVG MAX -stdin 1 237 237 237 237 diff --git a/unit_tests/Corekaburra_test.py b/unit_tests/Corekaburra_test.py index b8e9ae4..6ccfec1 100644 --- a/unit_tests/Corekaburra_test.py +++ b/unit_tests/Corekaburra_test.py @@ -7,7 +7,6 @@ # import import unittest import os -import io from shutil import copyfile import logging # pylint: disable=no-name-in-module @@ -23,8 +22,6 @@ from Corekaburra import summary_table from Corekaburra import output_writer_functions - - # move to folder with mock files. 
First try Github structure, then try pulled repository structure try: os.chdir('/Corekaburra/unit_tests/unit_test_data/') From 9f8685f1b59c31a733cbd8f1dac9c38fd64da22c Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Wed, 5 Jan 2022 13:36:59 +1100 Subject: [PATCH 022/135] Make change to functional change test data --- functional_tests/test_data/no_input.expected | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/functional_tests/test_data/no_input.expected b/functional_tests/test_data/no_input.expected index 0f6b828..237dea8 100644 --- a/functional_tests/test_data/no_input.expected +++ b/functional_tests/test_data/no_input.expected @@ -1,4 +1,4 @@ -usage: __main__.py [-h] -ig file_1.gff ... file_n.gff +usage: Corekaburra.py [-h] -ig file_1.gff ... file_n.gff [file_1.gff ... file_n.gff ...] -ip path/to/pan_genome [-cg complete_genomes.txt] [-o path/to/output] [-p OUTPUT_PREFIX] [-a] [-c int] [-l | -q] From 9b7d088d5c0468be8da6ac83109b7298e358cf21 Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Wed, 5 Jan 2022 13:38:55 +1100 Subject: [PATCH 023/135] Make change to functional change test data --- functional_tests/test_data/no_input.expected | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/functional_tests/test_data/no_input.expected b/functional_tests/test_data/no_input.expected index 237dea8..1a90bac 100644 --- a/functional_tests/test_data/no_input.expected +++ b/functional_tests/test_data/no_input.expected @@ -1,4 +1,4 @@ -usage: Corekaburra.py [-h] -ig file_1.gff ... file_n.gff +usage: Corekaburra [-h] -ig file_1.gff ... file_n.gff [file_1.gff ... file_n.gff ...] 
-ip path/to/pan_genome [-cg complete_genomes.txt] [-o path/to/output] [-p OUTPUT_PREFIX] [-a] [-c int] [-l | -q] From 7e423eb5cbf0fd993e1392d4d3a33b0f6b198597 Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Wed, 5 Jan 2022 14:08:22 +1100 Subject: [PATCH 024/135] Add in a functional test for exitting check of complete genomes --- Corekaburra/__main__.py | 4 +-- Corekaburra/check_inputs.py | 1 + Corekaburra/output_writer_functions.py | 2 +- functional_tests/Corekaburra-test.sh | 25 +++++++++++++++++++ .../complete_genome_double_chrom.gff | 0 .../complete_genome_single_chrom.gff | 1 + .../test_data/complete_genomes_file | 2 ++ 7 files changed, 31 insertions(+), 4 deletions(-) create mode 100644 functional_tests/test_data/complete_genome_double_chrom.gff create mode 100644 functional_tests/test_data/complete_genome_single_chrom.gff create mode 100644 functional_tests/test_data/complete_genomes_file diff --git a/Corekaburra/__main__.py b/Corekaburra/__main__.py index a465713..3446546 100644 --- a/Corekaburra/__main__.py +++ b/Corekaburra/__main__.py @@ -60,8 +60,6 @@ except ModuleNotFoundError: from output_writer_functions import master_info_writer, summary_info_writer, segment_writer, no_acc_segment_writer -from argparse import ArgumentParser -from math import floor import sys import pkg_resources @@ -141,6 +139,7 @@ def main(): args = get_commandline_arguments(sys.argv[1:]) # TODO - Add in function(s) that will check all files to not be empty. - Andrew? + # * Potentially add features that check for the validity of files? # TODO - Make Corekaburra take gzipped inputs # Check the presence of provided complete genomes among input GFFs @@ -167,7 +166,6 @@ def main(): # TODO - Make the program work with less than all files in the pangenome. Just make sure that all gff files supplied can be found in the pan genome. 
This will make is possible to look at hotspots and segments in different lineages check_gff_in_pan(args.input_gffs, input_pres_abs_file_path) - # Construct output folder try: os.mkdir(args.output_path) diff --git a/Corekaburra/check_inputs.py b/Corekaburra/check_inputs.py index 198238e..c9692fb 100644 --- a/Corekaburra/check_inputs.py +++ b/Corekaburra/check_inputs.py @@ -7,6 +7,7 @@ from exit_with_error import exit_with_error EXIT_INPUT_FILE_ERROR = 1 + def define_pangenome_program(folder): """ Function to examine if input pan genome folder stems from Roary or Panaroo. diff --git a/Corekaburra/output_writer_functions.py b/Corekaburra/output_writer_functions.py index d980c5c..8879ac8 100644 --- a/Corekaburra/output_writer_functions.py +++ b/Corekaburra/output_writer_functions.py @@ -171,4 +171,4 @@ def no_acc_segment_writer(no_acc_segments, out_path, prefix, quiet): if __name__ == "__main__": - pass \ No newline at end of file + pass diff --git a/functional_tests/Corekaburra-test.sh b/functional_tests/Corekaburra-test.sh index 234d1e8..2e89805 100755 --- a/functional_tests/Corekaburra-test.sh +++ b/functional_tests/Corekaburra-test.sh @@ -158,6 +158,31 @@ test_stdout_exit "$test_program -help" no_input.expected 0 call_new_test "Test exit status for a bad command line invocation" test_exit_status "$test_program --this_is_not_a_valid_argument > /dev/null 2>&1" 2 +# TODO - Test the exit upon a genome being provided as complete, but not being found in input gff files +test_exit_status "$test_program -ig complete_genome_double_chrom.gff -ip Crash_pan_folder -cg complete_genomes_file > /dev/null 2>&1" 1 + +# TODO - Test exit upon unsuccessful identification of source program + +# TODO - Test exit upon unsuccessful identification of gene_data, when -a is not given for Panaroo + +# TODO - Test exit upon gff not found in pan is provided as input + +# TODO - Test roary input + +# TODO - test Panaroo input + +# TODO - test Panaroo input w. 
correction + +# TODO - test Panaroo input with no correction + +# TODO - test for core genes being fragmented. + +# TODO - test for accessory genes being fragmented. + +# TODO - test complete genome with sinlge contig + +# TODO - test complete genome with multiple contigs (Simulate plasmids or two chromosomes) + # 3. End of testing - check if any errors occurrred if [ "$num_errors" -gt 0 ]; then diff --git a/functional_tests/test_data/complete_genome_double_chrom.gff b/functional_tests/test_data/complete_genome_double_chrom.gff new file mode 100644 index 0000000..e69de29 diff --git a/functional_tests/test_data/complete_genome_single_chrom.gff b/functional_tests/test_data/complete_genome_single_chrom.gff new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/functional_tests/test_data/complete_genome_single_chrom.gff @@ -0,0 +1 @@ + diff --git a/functional_tests/test_data/complete_genomes_file b/functional_tests/test_data/complete_genomes_file new file mode 100644 index 0000000..9b77cef --- /dev/null +++ b/functional_tests/test_data/complete_genomes_file @@ -0,0 +1,2 @@ +complete_genome_single_chrom.gff +complete_genome_double_chrom.gff \ No newline at end of file From e0cad8e5d6ab6205bb4545f40ba62ae0457d9e43 Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Wed, 5 Jan 2022 14:18:14 +1100 Subject: [PATCH 025/135] Add in functional test for the exit upon no pan-genome programm being identified as source --- functional_tests/Corekaburra-test.sh | 10 ++++++---- .../Crash_pan_folder/gene_presence_absence.csv | 1 + .../Crash_pan_folder/gene_presence_absence_roary.csv | 1 + 3 files changed, 8 insertions(+), 4 deletions(-) create mode 100644 functional_tests/test_data/Crash_pan_folder/gene_presence_absence.csv create mode 100644 functional_tests/test_data/Crash_pan_folder/gene_presence_absence_roary.csv diff --git a/functional_tests/Corekaburra-test.sh b/functional_tests/Corekaburra-test.sh index 2e89805..1fd43fa 100755 --- 
a/functional_tests/Corekaburra-test.sh +++ b/functional_tests/Corekaburra-test.sh @@ -148,20 +148,22 @@ parse_args $@ # 2. Change to test directory cd $test_data_dir # 2. Run tests -# Test output for no arguments + call_new_test "Test output for no arguments" test_stdout_exit "$test_program" no_input.expected 2 -# Test output for -help argument given + call_new_test "Test output for -help argument given" test_stdout_exit "$test_program -help" no_input.expected 0 -# Test exit status for a bad command line invocation + call_new_test "Test exit status for a bad command line invocation" test_exit_status "$test_program --this_is_not_a_valid_argument > /dev/null 2>&1" 2 -# TODO - Test the exit upon a genome being provided as complete, but not being found in input gff files +call_new_test "Test exit status for a complete genome not given as input gff file" test_exit_status "$test_program -ig complete_genome_double_chrom.gff -ip Crash_pan_folder -cg complete_genomes_file > /dev/null 2>&1" 1 # TODO - Test exit upon unsuccessful identification of source program +call_new_test "Test exit upon unsuccessful identification of source program" +test_exit_status "$test_program -ig complete_genome_double_chrom.gff -ip Crash_pan_folder > /dev/null 2>&1" 1 # TODO - Test exit upon unsuccessful identification of gene_data, when -a is not given for Panaroo diff --git a/functional_tests/test_data/Crash_pan_folder/gene_presence_absence.csv b/functional_tests/test_data/Crash_pan_folder/gene_presence_absence.csv new file mode 100644 index 0000000..00e5a5f --- /dev/null +++ b/functional_tests/test_data/Crash_pan_folder/gene_presence_absence.csv @@ -0,0 +1 @@ +empty_file... Or actually it is not! But why are you looking here? 
\ No newline at end of file diff --git a/functional_tests/test_data/Crash_pan_folder/gene_presence_absence_roary.csv b/functional_tests/test_data/Crash_pan_folder/gene_presence_absence_roary.csv new file mode 100644 index 0000000..ad5574e --- /dev/null +++ b/functional_tests/test_data/Crash_pan_folder/gene_presence_absence_roary.csv @@ -0,0 +1 @@ +Another empty file! \ No newline at end of file From 9ba2959fda5c8531442a6ee2a176889a1bce34b4 Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Wed, 5 Jan 2022 14:31:33 +1100 Subject: [PATCH 026/135] Add functional test for the exit upon no gene_data.csv being identified and modify the test for no source programm being found --- functional_tests/Corekaburra-test.sh | 5 +++-- .../gene_presence_absence.csv | 0 .../gene_presence_absence_roary.csv | 0 3 files changed, 3 insertions(+), 2 deletions(-) rename functional_tests/test_data/{Crash_pan_folder => Crash_panaroo_folder}/gene_presence_absence.csv (100%) rename functional_tests/test_data/{Crash_pan_folder => Crash_panaroo_folder}/gene_presence_absence_roary.csv (100%) diff --git a/functional_tests/Corekaburra-test.sh b/functional_tests/Corekaburra-test.sh index 1fd43fa..3a01300 100755 --- a/functional_tests/Corekaburra-test.sh +++ b/functional_tests/Corekaburra-test.sh @@ -161,11 +161,12 @@ test_exit_status "$test_program --this_is_not_a_valid_argument > /dev/null 2>&1" call_new_test "Test exit status for a complete genome not given as input gff file" test_exit_status "$test_program -ig complete_genome_double_chrom.gff -ip Crash_pan_folder -cg complete_genomes_file > /dev/null 2>&1" 1 -# TODO - Test exit upon unsuccessful identification of source program call_new_test "Test exit upon unsuccessful identification of source program" -test_exit_status "$test_program -ig complete_genome_double_chrom.gff -ip Crash_pan_folder > /dev/null 2>&1" 1 +test_exit_status "$test_program -ig complete_genome_double_chrom.gff -ip Crash_pan_folder" 1 # 
TODO - Test exit upon unsuccessful identification of gene_data, when -a is not given for Panaroo +call_new_test "Test exit upon unsuccessful identification of gene_data, when -a is not given for Panaroo" +test_exit_status "$test_program -ig complete_genome_double_chrom.gff -ip Crash_panaroo_folder" 1 # TODO - Test exit upon gff not found in pan is provided as input diff --git a/functional_tests/test_data/Crash_pan_folder/gene_presence_absence.csv b/functional_tests/test_data/Crash_panaroo_folder/gene_presence_absence.csv similarity index 100% rename from functional_tests/test_data/Crash_pan_folder/gene_presence_absence.csv rename to functional_tests/test_data/Crash_panaroo_folder/gene_presence_absence.csv diff --git a/functional_tests/test_data/Crash_pan_folder/gene_presence_absence_roary.csv b/functional_tests/test_data/Crash_panaroo_folder/gene_presence_absence_roary.csv similarity index 100% rename from functional_tests/test_data/Crash_pan_folder/gene_presence_absence_roary.csv rename to functional_tests/test_data/Crash_panaroo_folder/gene_presence_absence_roary.csv From 35a8623588d81d1cb42031145a53628be09f557b Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Wed, 5 Jan 2022 14:41:47 +1100 Subject: [PATCH 027/135] Add in a functional test for a gff file being provided, but not found in the gene_presence_absence file --- functional_tests/Corekaburra-test.sh | 7 ++++--- .../test_data/Crash_gff_folder/gene_presence_absence.csv | 1 + 2 files changed, 5 insertions(+), 3 deletions(-) create mode 100644 functional_tests/test_data/Crash_gff_folder/gene_presence_absence.csv diff --git a/functional_tests/Corekaburra-test.sh b/functional_tests/Corekaburra-test.sh index 3a01300..4eb5d62 100755 --- a/functional_tests/Corekaburra-test.sh +++ b/functional_tests/Corekaburra-test.sh @@ -162,13 +162,14 @@ call_new_test "Test exit status for a complete genome not given as input gff fil test_exit_status "$test_program -ig 
complete_genome_double_chrom.gff -ip Crash_pan_folder -cg complete_genomes_file > /dev/null 2>&1" 1 call_new_test "Test exit upon unsuccessful identification of source program" -test_exit_status "$test_program -ig complete_genome_double_chrom.gff -ip Crash_pan_folder" 1 +test_exit_status "$test_program -ig complete_genome_double_chrom.gff -ip Crash_pan_folder > /dev/null 2>&1" 1 -# TODO - Test exit upon unsuccessful identification of gene_data, when -a is not given for Panaroo call_new_test "Test exit upon unsuccessful identification of gene_data, when -a is not given for Panaroo" -test_exit_status "$test_program -ig complete_genome_double_chrom.gff -ip Crash_panaroo_folder" 1 +test_exit_status "$test_program -ig complete_genome_double_chrom.gff -ip Crash_panaroo_folder > /dev/null 2>&1" 1 # TODO - Test exit upon gff not found in pan is provided as input +call_new_test "Test exit upon gff not found in pan is provided as input" +test_exit_status "$test_program -ig complete_genome_single_chrom.gff complete_genome_double_chrom.gff -ip Crash_gff_folder > /dev/null 2>&1" 1 # TODO - Test roary input diff --git a/functional_tests/test_data/Crash_gff_folder/gene_presence_absence.csv b/functional_tests/test_data/Crash_gff_folder/gene_presence_absence.csv new file mode 100644 index 0000000..04db977 --- /dev/null +++ b/functional_tests/test_data/Crash_gff_folder/gene_presence_absence.csv @@ -0,0 +1 @@ +"","","","","","","","","","","","","","","","complete_genome_single_chrom" \ No newline at end of file From e867a1431c4fbfab95b76253efaaa74b525321ff Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Wed, 5 Jan 2022 15:41:39 +1100 Subject: [PATCH 028/135] Add in functional test for a small and simple Roary run. Adjust the calculation of last gene to end of contig. 
Adjust so that segments are noly written if any are present (at least one gene have more than two neighbours) --- Corekaburra/__main__.py | 9 ++-- Corekaburra/check_inputs.py | 1 + Corekaburra/consesus_core_genome.py | 53 +++++++++---------- Corekaburra/gff_parser.py | 2 +- functional_tests/Corekaburra-test.sh | 37 ++++++++++++- ...e_core_accessory_gene_content.tsv.expected | 1 + .../core_pair_summary.csv.expected | 5 ++ .../low_frequency_gene_placement.tsv.expected | 9 ++++ .../Roray_run/gene_presence_absence.csv | 4 ++ .../complete_genome_double_chrom.gff | 12 +++++ .../complete_genome_single_chrom.gff | 8 ++- .../complete_genome_single_chrom_2.gff | 7 +++ unit_tests/Corekaburra_test.py | 48 ++++++++--------- 13 files changed, 136 insertions(+), 60 deletions(-) create mode 100644 functional_tests/test_data/Roary_run_expected/core_core_accessory_gene_content.tsv.expected create mode 100644 functional_tests/test_data/Roary_run_expected/core_pair_summary.csv.expected create mode 100644 functional_tests/test_data/Roary_run_expected/low_frequency_gene_placement.tsv.expected create mode 100644 functional_tests/test_data/Roray_run/gene_presence_absence.csv create mode 100644 functional_tests/test_data/complete_genome_single_chrom_2.gff diff --git a/Corekaburra/__main__.py b/Corekaburra/__main__.py index 3446546..aadfaa2 100644 --- a/Corekaburra/__main__.py +++ b/Corekaburra/__main__.py @@ -138,8 +138,6 @@ def main(): # get arguments from the commandline args = get_commandline_arguments(sys.argv[1:]) - # TODO - Add in function(s) that will check all files to not be empty. - Andrew? - # * Potentially add features that check for the validity of files? # TODO - Make Corekaburra take gzipped inputs # Check the presence of provided complete genomes among input GFFs @@ -252,7 +250,7 @@ def main(): # time_calculator(time_start, time.time(), "identifying segments in pan genome") # Produce dict containing summarised information from master info. 
- master_summary_info = calculate_n_create_summaries(master_info_total) + master_summary_info = calculate_n_create_summaries(master_info_total, core_dict) ### WRITE OUTPUTS ### print(f"\n-----------------------Printing outputs-----------------------") @@ -261,8 +259,9 @@ def main(): master_info_writer(master_info_total, args.output_path, args.output_prefix, args.quiet) summary_info_writer(master_summary_info, args.output_path, args.output_prefix, args.quiet) # TODO - Contruct output for segments - parent column. - segment_writer(double_edge_segements, args.output_path, args.output_prefix, args.quiet) - no_acc_segment_writer(no_acc_segments, args.output_path, args.output_prefix, args.quiet) + if double_edge_segements is not None: + segment_writer(double_edge_segements, args.output_path, args.output_prefix, args.quiet) + no_acc_segment_writer(no_acc_segments, args.output_path, args.output_prefix, args.quiet) # print(non_core_contig_info) TODO - Print core less contigs. # TODO - Possibly output core gene graph. with segment annotations? 
diff --git a/Corekaburra/check_inputs.py b/Corekaburra/check_inputs.py index c9692fb..d896d9b 100644 --- a/Corekaburra/check_inputs.py +++ b/Corekaburra/check_inputs.py @@ -55,6 +55,7 @@ def check_gff_in_pan(file_list, gene_presence_absence_path): pan_header_line = pan_file.readline() pan_header_line = pan_header_line.strip().split(',') genome_names = pan_header_line[14:] + genome_names = [name.replace('"', '') for name in genome_names] file_list = [os.path.basename(file) for file in file_list] file_list_no_suffix = [file.rstrip('.gff') for file in file_list] diff --git a/Corekaburra/consesus_core_genome.py b/Corekaburra/consesus_core_genome.py index 8cf7c4a..5af46c1 100644 --- a/Corekaburra/consesus_core_genome.py +++ b/Corekaburra/consesus_core_genome.py @@ -108,32 +108,6 @@ def identify_no_accessory_segments(double_edge_segements, combined_acc_gene_coun return sub_segment_dict -def determine_genome_segments(core_neighbour_pairs, combined_acc_gene_count, num_gffs, core_gene_dict): - """ - Function to be called from main that collects the functions for determining core segments in pan-genome - - :param core_neighbour_pairs: Dict of the number of times core pairs have been detected - :param combined_acc_gene_count: Number of accessory and low-frequency genes detected between core gene pairs - :param num_gffs: Number of inputted gff files # TODO - Should this be the minimum number determined by the input cut-off for core genes - - :return double_edge_segements: - :return no_acc_segments: - """ - # Construct a graph from core gene neighbours - core_graph = construct_core_graph(core_neighbour_pairs) - - # Find segments in the genome between core genes with multiple neighbors - double_edge_segements = identify_segments(core_graph, num_gffs, core_gene_dict) - - if double_edge_segements is not None: - # Find segments of core genes with no accessory in between - no_acc_segments = identify_no_accessory_segments(double_edge_segements, combined_acc_gene_count) - else: - 
no_acc_segments = None - - return double_edge_segements, no_acc_segments - - def identify_segments(core_graph, num_gffs, core_gene_dict): """ Function to identify stretches of core genes between core genes neighbouring multiple different genes @@ -147,7 +121,7 @@ def identify_segments(core_graph, num_gffs, core_gene_dict): # Check if any node have multiple edges, if not then return. if len(multi_edge_nodes) == 0: - return None, None, None # TODO - log and report better that this is the outcome! + return None # TODO - log and report better that this is the outcome! # Dict to hold connections between >2 edge nodes connect_dict = {} @@ -295,5 +269,30 @@ def identify_segments(core_graph, num_gffs, core_gene_dict): return double_edge_segements +def determine_genome_segments(core_neighbour_pairs, combined_acc_gene_count, num_gffs, core_gene_dict): + """ + Function to be called from main that collects the functions for determining core segments in pan-genome + + :param core_neighbour_pairs: Dict of the number of times core pairs have been detected + :param combined_acc_gene_count: Number of accessory and low-frequency genes detected between core gene pairs + :param num_gffs: Number of inputted gff files # TODO - Should this be the minimum number determined by the input cut-off for core genes + + :return double_edge_segements: + :return no_acc_segments: + """ + # Construct a graph from core gene neighbours + core_graph = construct_core_graph(core_neighbour_pairs) + + # Find segments in the genome between core genes with multiple neighbors + double_edge_segements = identify_segments(core_graph, num_gffs, core_gene_dict) + + if double_edge_segements is not None: + # Find segments of core genes with no accessory in between + no_acc_segments = identify_no_accessory_segments(double_edge_segements, combined_acc_gene_count) + else: + no_acc_segments = None + + return double_edge_segements, no_acc_segments + if __name__ == '__main__': pass \ No newline at end of file diff --git 
a/Corekaburra/gff_parser.py b/Corekaburra/gff_parser.py index ac90ed4..67c8b94 100644 --- a/Corekaburra/gff_parser.py +++ b/Corekaburra/gff_parser.py @@ -135,7 +135,7 @@ def record_core_core_region(core_genes, gff_name, gff_line, contig_end, previous if core_core_distance < 0: core_core_distance = 0 elif current_core_gene_cluster == 'Sequence_break': - core_core_distance = abs(contig_end - previous_core_gene_end_coor - 1) + core_core_distance = abs(contig_end - previous_core_gene_end_coor) else: NotImplementedError( 'An error occured when measuring the distance between core gene and contig end. Something went wrong!') diff --git a/functional_tests/Corekaburra-test.sh b/functional_tests/Corekaburra-test.sh index 4eb5d62..310ec31 100755 --- a/functional_tests/Corekaburra-test.sh +++ b/functional_tests/Corekaburra-test.sh @@ -115,6 +115,30 @@ function test_stdout_exit { fi } +# Run a command and check that the output file is +# exactly equal the contents of a specified file +# ARG1: A file returned from program after running +# ARG2: a file path containing the expected output +function test_output_file { + let num_tests+=1 + output=$1 + expected_output_file=$2 + verbose_message "Testing output file: $1" + verbose_message "Expected file path: $2" + difference=$(diff $output $expected_output_file) || let num_errors+=1 + if [ -n "$difference" ]; then + let num_errors+=1 + echo "Test output failed: $1" + echo "Actual output:" + cat $output + expected_output=$(cat $expected_output_file) + echo "Expected output:" + echo "$expected_output" + echo "Difference:" + echo "$difference" + fi +} + # Run a command and check that the exit status is # equal to an expected value # exactly equal the contents of a specified file @@ -167,11 +191,16 @@ test_exit_status "$test_program -ig complete_genome_double_chrom.gff -ip Crash_p call_new_test "Test exit upon unsuccessful identification of gene_data, when -a is not given for Panaroo" test_exit_status "$test_program -ig 
complete_genome_double_chrom.gff -ip Crash_panaroo_folder > /dev/null 2>&1" 1 -# TODO - Test exit upon gff not found in pan is provided as input call_new_test "Test exit upon gff not found in pan is provided as input" test_exit_status "$test_program -ig complete_genome_single_chrom.gff complete_genome_double_chrom.gff -ip Crash_gff_folder > /dev/null 2>&1" 1 # TODO - Test roary input +call_new_test "Test roary input" +Corekaburra -ig complete_genome_single_chrom.gff complete_genome_single_chrom_2.gff -ip Roray_run -o test_out_folder > /dev/null 2>&1 +test_output_file test_out_folder/core_core_accessory_gene_content.tsv Roary_run_expected/core_core_accessory_gene_content.tsv.expected +test_output_file test_out_folder/low_frequency_gene_placement.tsv Roary_run_expected/low_frequency_gene_placement.tsv.expected +test_output_file test_out_folder/core_pair_summary.csv Roary_run_expected/core_pair_summary.csv.expected +rm -r test_out_folder # TODO - test Panaroo input @@ -187,6 +216,12 @@ test_exit_status "$test_program -ig complete_genome_single_chrom.gff complete_ge # TODO - test complete genome with multiple contigs (Simulate plasmids or two chromosomes) +# TODO - test with accessory genes + +# TODO - test with segments + +# TODO - test with decreased core-gene cutoff + # 3. 
End of testing - check if any errors occurrred if [ "$num_errors" -gt 0 ]; then diff --git a/functional_tests/test_data/Roary_run_expected/core_core_accessory_gene_content.tsv.expected b/functional_tests/test_data/Roary_run_expected/core_core_accessory_gene_content.tsv.expected new file mode 100644 index 0000000..fee984c --- /dev/null +++ b/functional_tests/test_data/Roary_run_expected/core_core_accessory_gene_content.tsv.expected @@ -0,0 +1 @@ +Gff Core_gene_1 Core_gene_2 gene type diff --git a/functional_tests/test_data/Roary_run_expected/core_pair_summary.csv.expected b/functional_tests/test_data/Roary_run_expected/core_pair_summary.csv.expected new file mode 100644 index 0000000..e48e928 --- /dev/null +++ b/functional_tests/test_data/Roary_run_expected/core_pair_summary.csv.expected @@ -0,0 +1,5 @@ +Core_pair,n,occurrence_core_1,occurrence_core_2,co_occurrence,min_dist,max_dist,mean_dist,median_dist,min_acc,max_acc,mean_acc,median_acc +A-B,2,2,2,2,9,9,9.0,9.0,0,0,0.0,0.0 +A-Sequence_break,2,2,0,0,0,0,0.0,0.0,0,0,0.0,0.0 +B-C,2,2,2,2,9,9,9.0,9.0,0,0,0.0,0.0 +C-Sequence_break,2,2,0,0,10,10,10.0,10.0,0,0,0.0,0.0 diff --git a/functional_tests/test_data/Roary_run_expected/low_frequency_gene_placement.tsv.expected b/functional_tests/test_data/Roary_run_expected/low_frequency_gene_placement.tsv.expected new file mode 100644 index 0000000..9e8becf --- /dev/null +++ b/functional_tests/test_data/Roary_run_expected/low_frequency_gene_placement.tsv.expected @@ -0,0 +1,9 @@ +Gff Core_gene_1 Core_gene_2 Core_region_size Core_region_accessory_count +complete_genome_single_chrom A B 9 0 +complete_genome_single_chrom B C 9 0 +complete_genome_single_chrom A Sequence_break 0 0 +complete_genome_single_chrom C Sequence_break 10 0 +complete_genome_single_chrom_2 A B 9 0 +complete_genome_single_chrom_2 B C 9 0 +complete_genome_single_chrom_2 A Sequence_break 0 0 +complete_genome_single_chrom_2 C Sequence_break 10 0 diff --git 
a/functional_tests/test_data/Roray_run/gene_presence_absence.csv b/functional_tests/test_data/Roray_run/gene_presence_absence.csv new file mode 100644 index 0000000..85fbed5 --- /dev/null +++ b/functional_tests/test_data/Roray_run/gene_presence_absence.csv @@ -0,0 +1,4 @@ +"","","","","","","","","","","","","","","complete_genome_single_chrom","complete_genome_single_chrom_2" +"A","","","2","2","1","","","","","","","","","single_comp_A","single_comp_2_A" +"B","","","2","2","1","","","","","","","","","single_comp_B","single_comp_2_B" +"C","","","2","2","1","","","","","","","","","single_comp_C","single_comp_2_C" \ No newline at end of file diff --git a/functional_tests/test_data/complete_genome_double_chrom.gff b/functional_tests/test_data/complete_genome_double_chrom.gff index e69de29..e7d483b 100644 --- a/functional_tests/test_data/complete_genome_double_chrom.gff +++ b/functional_tests/test_data/complete_genome_double_chrom.gff @@ -0,0 +1,12 @@ +##gff-version3 +contig_1 . CDS 1 90 . . . ID=Ajwa_the_Shigella_A;Other_info +contig_1 . CDS 100 190 . . . ID=Ajwa_the_Shigella_B;Other_info +contig_1 . CDS 200 290 . . . ID=Ajwa_the_Shigella_C;Other_info +contig_2 . CDS 1 90 . . . ID=Ajwa_the_Shigella_D;Other_info +contig_2 . CDS 100 190 . . . ID=Ajwa_the_Shigella_E;Other_info +contig_2 . CDS 200 290 . . . 
ID=Ajwa_the_Shigella_F;Other_info +##FASTA +>contig_1 +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +>contig_2 +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN \ No newline at end of file diff --git a/functional_tests/test_data/complete_genome_single_chrom.gff b/functional_tests/test_data/complete_genome_single_chrom.gff index 8b13789..ded01c4 100644 --- a/functional_tests/test_data/complete_genome_single_chrom.gff +++ b/functional_tests/test_data/complete_genome_single_chrom.gff @@ -1 +1,7 @@ - +##gff-version3 +contig_1 . CDS 1 90 . . . ID=single_comp_A;Other_info +contig_1 . CDS 100 190 . . . ID=single_comp_B;Other_info +contig_1 . CDS 200 290 . . . ID=single_comp_C;Other_info +##FASTA +>contig_1 +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN \ No newline at end of file diff --git a/functional_tests/test_data/complete_genome_single_chrom_2.gff b/functional_tests/test_data/complete_genome_single_chrom_2.gff new file mode 100644 index 0000000..fe98ff1 --- /dev/null +++ b/functional_tests/test_data/complete_genome_single_chrom_2.gff @@ -0,0 +1,7 @@ +##gff-version3 +contig_1 . CDS 1 90 . . . ID=single_comp_2_A;Other_info +contig_1 . CDS 100 190 . . . ID=single_comp_2_B;Other_info +contig_1 . CDS 200 290 . . . 
ID=single_comp_2_C;Other_info +##FASTA +>contig_1 +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN \ No newline at end of file diff --git a/unit_tests/Corekaburra_test.py b/unit_tests/Corekaburra_test.py index 6ccfec1..5258de2 100644 --- a/unit_tests/Corekaburra_test.py +++ b/unit_tests/Corekaburra_test.py @@ -907,12 +907,12 @@ def test_recording_core_gene_before_seqeuncebreak(self): expected_previous_core_gene_end_coor = 150 expected_acc_genes_in_region = [] expected_low_freq_genes_in_region = [] - expected_core_gene_pair_distance = {'pan_gene_1--Sequence_break': 1349} + expected_core_gene_pair_distance = {'pan_gene_1--Sequence_break': 1350} expected_accessory_gene_content = {'pan_gene_1--Sequence_break': []} expected_low_freq_gene_content = {'pan_gene_1--Sequence_break': []} expected_core_gene_pairs = ['pan_gene_1--Sequence_break'] expected_master_info = {'pan_gene_1--Sequence_break--gff_name': ['gff_name', 'pan_gene_1', 'Sequence_break', - 1349, 0, [], []]} + 1350, 0, [], []]} return_previous_core_gene_id, return_previous_core_gene_end_coor, return_acc_genes_in_region, \ return_low_freq_genes_in_region, return_core_gene_pair_distance, return_accessory_gene_content, \ @@ -954,12 +954,12 @@ def test_recording_core_gene_before_seqeuncebreak_w_accessory(self): expected_previous_core_gene_end_coor = 150 expected_acc_genes_in_region = [] expected_low_freq_genes_in_region = [] - expected_core_gene_pair_distance = {'pan_gene_1--Sequence_break': 1349} + expected_core_gene_pair_distance = {'pan_gene_1--Sequence_break': 1350} expected_accessory_gene_content = {'pan_gene_1--Sequence_break': ['acc_1']} expected_low_freq_gene_content = {'pan_gene_1--Sequence_break': ['low_1', "low_2"]} expected_core_gene_pairs = 
['pan_gene_1--Sequence_break'] expected_master_info = {'pan_gene_1--Sequence_break--gff_name': ['gff_name', 'pan_gene_1', 'Sequence_break', - 1349, 3, ['acc_1'], ['low_1', "low_2"]]} + 1350, 3, ['acc_1'], ['low_1', "low_2"]]} return_previous_core_gene_id, return_previous_core_gene_end_coor, return_acc_genes_in_region, \ return_low_freq_genes_in_region, return_core_gene_pair_distance, return_accessory_gene_content, \ @@ -1414,7 +1414,7 @@ def test_single_chromosome_draft(self): core_gene_pair_distance = {'pan_gene_2--pan_gene_5': 359, 'pan_gene_5--pan_gene_7': 269, 'Sequence_break--pan_gene_2': 178, - 'pan_gene_7--Sequence_break': 299} + 'pan_gene_7--Sequence_break': 300} accessory_gene_content = {'pan_gene_2--pan_gene_5': ['pan_gene_4'], 'pan_gene_5--pan_gene_7': [], @@ -1430,7 +1430,7 @@ def test_single_chromosome_draft(self): 'pan_gene_2--pan_gene_5--test_single_chromosome': ['test_single_chromosome', 'pan_gene_2', 'pan_gene_5', 359, 2, ['pan_gene_4'], ['pan_gene_3'], ], 'pan_gene_5--pan_gene_7--test_single_chromosome': ['test_single_chromosome', 'pan_gene_5', 'pan_gene_7', 269, 1, [], ['pan_gene_6']], 'Sequence_break--pan_gene_2--test_single_chromosome': ['test_single_chromosome', 'Sequence_break', 'pan_gene_2', 178, 1, ['pan_gene_1'], []], - 'pan_gene_7--Sequence_break--test_single_chromosome': ['test_single_chromosome', 'pan_gene_7', 'Sequence_break', 299, 1, [], ['pan_gene_8']]} + 'pan_gene_7--Sequence_break--test_single_chromosome': ['test_single_chromosome', 'pan_gene_7', 'Sequence_break', 300, 1, [], ['pan_gene_8']]} coreless_contigs = {} @@ -1591,11 +1591,11 @@ def test_two_daft_contigs(self): core_gene_pair_distance = {'pan_gene_2--pan_gene_5': 359, 'pan_gene_5--pan_gene_7': 269, 'Sequence_break--pan_gene_2': 178, - 'pan_gene_7--Sequence_break': 299, + 'pan_gene_7--Sequence_break': 300, 'pan_gene_10--pan_gene_13': 359, 'pan_gene_13--pan_gene_15': 269, 'Sequence_break--pan_gene_10': 178, - 'pan_gene_15--Sequence_break': 299} + 
'pan_gene_15--Sequence_break': 300} accessory_gene_content = {'pan_gene_2--pan_gene_5': ['pan_gene_4'], 'pan_gene_5--pan_gene_7': [], @@ -1623,13 +1623,13 @@ def test_two_daft_contigs(self): 'pan_gene_5--pan_gene_7--test_double_chromosome': ['test_double_chromosome', 'pan_gene_5', 'pan_gene_7', 269, 1, [], ['pan_gene_6']], 'Sequence_break--pan_gene_2--test_double_chromosome': ['test_double_chromosome', 'Sequence_break', 'pan_gene_2', 178, 1, ['pan_gene_1'], []], - 'pan_gene_7--Sequence_break--test_double_chromosome': ['test_double_chromosome', 'pan_gene_7', 'Sequence_break', 299, 1, [], ['pan_gene_8']], + 'pan_gene_7--Sequence_break--test_double_chromosome': ['test_double_chromosome', 'pan_gene_7', 'Sequence_break', 300, 1, [], ['pan_gene_8']], 'pan_gene_10--pan_gene_13--test_double_chromosome': ['test_double_chromosome', 'pan_gene_10', 'pan_gene_13', 359, 2, ['pan_gene_12'], ['pan_gene_11']], 'pan_gene_13--pan_gene_15--test_double_chromosome': ['test_double_chromosome', 'pan_gene_13', 'pan_gene_15', 269, 1, [], ['pan_gene_14']], 'Sequence_break--pan_gene_10--test_double_chromosome': ['test_double_chromosome', 'Sequence_break', 'pan_gene_10', 178, 0, [], []], - 'pan_gene_15--Sequence_break--test_double_chromosome': ['test_double_chromosome', 'pan_gene_15', 'Sequence_break', 299, 0, [], []] + 'pan_gene_15--Sequence_break--test_double_chromosome': ['test_double_chromosome', 'pan_gene_15', 'Sequence_break', 300, 0, [], []] } @@ -1688,7 +1688,7 @@ def test_with_coreless_contig_draft_last_contig(self): core_gene_pair_distance = {'pan_gene_2--pan_gene_5': 359, 'pan_gene_5--pan_gene_7': 269, 'Sequence_break--pan_gene_2': 178, - 'pan_gene_7--Sequence_break': 299} + 'pan_gene_7--Sequence_break': 300} accessory_gene_content = {'pan_gene_2--pan_gene_5': ['pan_gene_4'], 'pan_gene_5--pan_gene_7': [], @@ -1710,7 +1710,7 @@ def test_with_coreless_contig_draft_last_contig(self): 'Sequence_break--pan_gene_2--test_double_chromosome': ['test_double_chromosome', 'Sequence_break', 
'pan_gene_2', 178, 1, ['pan_gene_1'], []], 'pan_gene_7--Sequence_break--test_double_chromosome': ['test_double_chromosome', 'pan_gene_7', - 'Sequence_break', 299, 1, [], ['pan_gene_8']] + 'Sequence_break', 300, 1, [], ['pan_gene_8']] } coreless_contigs = {'test_double_chromosome--gff_name_contig_2': [['pan_gene_12'], ['pan_gene_11', 'pan_gene_14']]} @@ -1843,7 +1843,7 @@ def test_with_coreless_contig_draft_first_contig(self): core_gene_pair_distance = {'pan_gene_2--pan_gene_5': 359, 'pan_gene_5--pan_gene_7': 269, 'Sequence_break--pan_gene_2': 178, - 'pan_gene_7--Sequence_break': 299} + 'pan_gene_7--Sequence_break': 300} accessory_gene_content = {'pan_gene_2--pan_gene_5': ['pan_gene_4'], 'pan_gene_5--pan_gene_7': [], @@ -1865,7 +1865,7 @@ def test_with_coreless_contig_draft_first_contig(self): 'Sequence_break--pan_gene_2--test_double_chromosome': ['test_double_chromosome', 'Sequence_break', 'pan_gene_2', 178, 1, ['pan_gene_1'], []], 'pan_gene_7--Sequence_break--test_double_chromosome': ['test_double_chromosome', 'pan_gene_7', - 'Sequence_break', 299, 1, [], ['pan_gene_8']] + 'Sequence_break', 300, 1, [], ['pan_gene_8']] } coreless_contigs = { @@ -1931,10 +1931,10 @@ def test_with_coreless_contig_middle_contig(self): core_gene_pair_distance = {'pan_gene_2--pan_gene_5': 359, 'pan_gene_5--pan_gene_7': 269, 'Sequence_break--pan_gene_2': 178, - 'pan_gene_7--Sequence_break': 299, + 'pan_gene_7--Sequence_break': 300, 'Sequence_break--pan_gene_10': 178, 'pan_gene_10--pan_gene_13': 359, - 'pan_gene_13--Sequence_break': 619} + 'pan_gene_13--Sequence_break': 620} accessory_gene_content = {'pan_gene_2--pan_gene_5': ['pan_gene_4'], 'pan_gene_5--pan_gene_7': [], @@ -1962,10 +1962,10 @@ def test_with_coreless_contig_middle_contig(self): 'Sequence_break--pan_gene_2--test_triple_chromosome': ['test_triple_chromosome', 'Sequence_break', 'pan_gene_2', 178, 1, ['pan_gene_1'], []], 'pan_gene_7--Sequence_break--test_triple_chromosome': ['test_triple_chromosome', 'pan_gene_7', - 
'Sequence_break', 299, 1, [], ['pan_gene_8']], + 'Sequence_break', 300, 1, [], ['pan_gene_8']], 'Sequence_break--pan_gene_10--test_triple_chromosome': ['test_triple_chromosome', 'Sequence_break', 'pan_gene_10', 178, 0, [], []], 'pan_gene_10--pan_gene_13--test_triple_chromosome': ['test_triple_chromosome', 'pan_gene_10', 'pan_gene_13', 359, 0, [], []], - 'pan_gene_13--Sequence_break--test_triple_chromosome': ['test_triple_chromosome', 'pan_gene_13', 'Sequence_break', 619, 0, [], []] + 'pan_gene_13--Sequence_break--test_triple_chromosome': ['test_triple_chromosome', 'pan_gene_13', 'Sequence_break', 620, 0, [], []] } coreless_contigs = { @@ -2402,10 +2402,10 @@ def test_segmentation_of_fragmented_acc_gene_on_coreless_contig(self): core_gene_pair_distance = {'pan_gene_2--pan_gene_5': 359, 'pan_gene_5--pan_gene_7': 269, 'Sequence_break--pan_gene_2': 178, - 'pan_gene_7--Sequence_break': 299, + 'pan_gene_7--Sequence_break': 300, 'Sequence_break--pan_gene_10': 178, 'pan_gene_10--pan_gene_13': 359, - 'pan_gene_13--Sequence_break': 619} + 'pan_gene_13--Sequence_break': 620} accessory_gene_content = {'pan_gene_2--pan_gene_5': ['pan_gene_4'], 'pan_gene_5--pan_gene_7': [], @@ -2433,10 +2433,10 @@ def test_segmentation_of_fragmented_acc_gene_on_coreless_contig(self): 'Sequence_break--pan_gene_2--test_triple_chromosome': ['test_triple_chromosome', 'Sequence_break', 'pan_gene_2', 178, 1, ['pan_gene_1'], []], 'pan_gene_7--Sequence_break--test_triple_chromosome': ['test_triple_chromosome', 'pan_gene_7', - 'Sequence_break', 299, 1, [], ['pan_gene_8']], + 'Sequence_break', 300, 1, [], ['pan_gene_8']], 'Sequence_break--pan_gene_10--test_triple_chromosome': ['test_triple_chromosome', 'Sequence_break', 'pan_gene_10', 178, 0, [], []], 'pan_gene_10--pan_gene_13--test_triple_chromosome': ['test_triple_chromosome', 'pan_gene_10', 'pan_gene_13', 359, 0, [], []], - 'pan_gene_13--Sequence_break--test_triple_chromosome': ['test_triple_chromosome', 'pan_gene_13', 'Sequence_break', 619, 0, [], []] 
+ 'pan_gene_13--Sequence_break--test_triple_chromosome': ['test_triple_chromosome', 'pan_gene_13', 'Sequence_break', 620, 0, [], []] } coreless_contigs = { @@ -2847,11 +2847,9 @@ def test_double_edge_segment_identification_all_2_degree_input(self): core_graph = consesus_core_genome.construct_core_graph(core_neighbour_pairs) - return_1, return_2, return_3 = consesus_core_genome.identify_segments(core_graph, 10, {}) + return_1 = consesus_core_genome.identify_segments(core_graph, 10, {}) self.assertEqual(None, return_1) - self.assertEqual(None, return_2) - self.assertEqual(None, return_3) def test_double_edge_segment_identification_two_segments(self): expected_segments = {'pan_cluster_1--pan_cluster_5': ['pan_cluster_1', 'pan_cluster_6', 'pan_cluster_5'], 'pan_cluster_2--pan_cluster_4': ['pan_cluster_2', 'pan_cluster_3', 'pan_cluster_4']} From d433aa7716461359eb11613372795bdb0c1dc787 Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Wed, 5 Jan 2022 15:47:17 +1100 Subject: [PATCH 029/135] Add in simple Panaroo run to functional tests --- functional_tests/Corekaburra-test.sh | 9 +++++++-- .../test_data/Panaroo_run/gene_presence_absence.csv | 1 + .../Panaroo_run/gene_presence_absence_roary.csv | 4 ++++ 3 files changed, 12 insertions(+), 2 deletions(-) create mode 100644 functional_tests/test_data/Panaroo_run/gene_presence_absence.csv create mode 100644 functional_tests/test_data/Panaroo_run/gene_presence_absence_roary.csv diff --git a/functional_tests/Corekaburra-test.sh b/functional_tests/Corekaburra-test.sh index 310ec31..b1c544d 100755 --- a/functional_tests/Corekaburra-test.sh +++ b/functional_tests/Corekaburra-test.sh @@ -194,8 +194,7 @@ test_exit_status "$test_program -ig complete_genome_double_chrom.gff -ip Crash_p call_new_test "Test exit upon gff not found in pan is provided as input" test_exit_status "$test_program -ig complete_genome_single_chrom.gff complete_genome_double_chrom.gff -ip Crash_gff_folder > /dev/null 2>&1" 1 
-# TODO - Test roary input -call_new_test "Test roary input" +call_new_test "Test Roary input" Corekaburra -ig complete_genome_single_chrom.gff complete_genome_single_chrom_2.gff -ip Roray_run -o test_out_folder > /dev/null 2>&1 test_output_file test_out_folder/core_core_accessory_gene_content.tsv Roary_run_expected/core_core_accessory_gene_content.tsv.expected test_output_file test_out_folder/low_frequency_gene_placement.tsv Roary_run_expected/low_frequency_gene_placement.tsv.expected @@ -203,6 +202,12 @@ test_output_file test_out_folder/core_pair_summary.csv Roary_run_expected/core_p rm -r test_out_folder # TODO - test Panaroo input +call_new_test "Test Panaroo input" +Corekaburra -ig complete_genome_single_chrom.gff complete_genome_single_chrom_2.gff -ip Panaroo_run -o test_out_folder > /dev/null 2>&1 +test_output_file test_out_folder/core_core_accessory_gene_content.tsv Roary_run_expected/core_core_accessory_gene_content.tsv.expected +test_output_file test_out_folder/low_frequency_gene_placement.tsv Roary_run_expected/low_frequency_gene_placement.tsv.expected +test_output_file test_out_folder/core_pair_summary.csv Roary_run_expected/core_pair_summary.csv.expected +rm -r test_out_folder # TODO - test Panaroo input w. correction diff --git a/functional_tests/test_data/Panaroo_run/gene_presence_absence.csv b/functional_tests/test_data/Panaroo_run/gene_presence_absence.csv new file mode 100644 index 0000000..00e5a5f --- /dev/null +++ b/functional_tests/test_data/Panaroo_run/gene_presence_absence.csv @@ -0,0 +1 @@ +empty_file... Or actually it is not! But why are you looking here? 
\ No newline at end of file diff --git a/functional_tests/test_data/Panaroo_run/gene_presence_absence_roary.csv b/functional_tests/test_data/Panaroo_run/gene_presence_absence_roary.csv new file mode 100644 index 0000000..d6b4441 --- /dev/null +++ b/functional_tests/test_data/Panaroo_run/gene_presence_absence_roary.csv @@ -0,0 +1,4 @@ +,,,,,,,,,,,,,,complete_genome_single_chrom,complete_genome_single_chrom_2 +A,,,2,2,1,,,,,,,,,single_comp_A,single_comp_2_A +B,,,2,2,1,,,,,,,,,single_comp_B,single_comp_2_B +C,,,2,2,1,,,,,,,,,single_comp_C,single_comp_2_C \ No newline at end of file From 3d84b11f28135580cd4b2b0e772cfc3f0bd65f00 Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Wed, 5 Jan 2022 15:54:53 +1100 Subject: [PATCH 030/135] Add in a -a for simple panaroo run to avoid reannoation --- functional_tests/Corekaburra-test.sh | 14 +++++++------- .../core_core_accessory_gene_content.tsv.expected | 1 - .../core_pair_summary.csv.expected | 5 ----- .../low_frequency_gene_placement.tsv.expected | 9 --------- 4 files changed, 7 insertions(+), 22 deletions(-) delete mode 100644 functional_tests/test_data/Roary_run_expected/core_core_accessory_gene_content.tsv.expected delete mode 100644 functional_tests/test_data/Roary_run_expected/core_pair_summary.csv.expected delete mode 100644 functional_tests/test_data/Roary_run_expected/low_frequency_gene_placement.tsv.expected diff --git a/functional_tests/Corekaburra-test.sh b/functional_tests/Corekaburra-test.sh index b1c544d..1361e07 100755 --- a/functional_tests/Corekaburra-test.sh +++ b/functional_tests/Corekaburra-test.sh @@ -196,17 +196,17 @@ test_exit_status "$test_program -ig complete_genome_single_chrom.gff complete_ge call_new_test "Test Roary input" Corekaburra -ig complete_genome_single_chrom.gff complete_genome_single_chrom_2.gff -ip Roray_run -o test_out_folder > /dev/null 2>&1 -test_output_file test_out_folder/core_core_accessory_gene_content.tsv 
Roary_run_expected/core_core_accessory_gene_content.tsv.expected -test_output_file test_out_folder/low_frequency_gene_placement.tsv Roary_run_expected/low_frequency_gene_placement.tsv.expected -test_output_file test_out_folder/core_pair_summary.csv Roary_run_expected/core_pair_summary.csv.expected +test_output_file test_out_folder/core_core_accessory_gene_content.tsv Simple_run_expected/core_core_accessory_gene_content.tsv.expected +test_output_file test_out_folder/low_frequency_gene_placement.tsv Simple_run_expected/low_frequency_gene_placement.tsv.expected +test_output_file test_out_folder/core_pair_summary.csv Simple_run_expected/core_pair_summary.csv.expected rm -r test_out_folder # TODO - test Panaroo input call_new_test "Test Panaroo input" -Corekaburra -ig complete_genome_single_chrom.gff complete_genome_single_chrom_2.gff -ip Panaroo_run -o test_out_folder > /dev/null 2>&1 -test_output_file test_out_folder/core_core_accessory_gene_content.tsv Roary_run_expected/core_core_accessory_gene_content.tsv.expected -test_output_file test_out_folder/low_frequency_gene_placement.tsv Roary_run_expected/low_frequency_gene_placement.tsv.expected -test_output_file test_out_folder/core_pair_summary.csv Roary_run_expected/core_pair_summary.csv.expected +Corekaburra -ig complete_genome_single_chrom.gff complete_genome_single_chrom_2.gff -ip Panaroo_run -o test_out_folder -a > /dev/null 2>&1 +test_output_file test_out_folder/core_core_accessory_gene_content.tsv Simple_run_expected/core_core_accessory_gene_content.tsv.expected +test_output_file test_out_folder/low_frequency_gene_placement.tsv Simple_run_expected/low_frequency_gene_placement.tsv.expected +test_output_file test_out_folder/core_pair_summary.csv Simple_run_expected/core_pair_summary.csv.expected rm -r test_out_folder # TODO - test Panaroo input w. 
correction diff --git a/functional_tests/test_data/Roary_run_expected/core_core_accessory_gene_content.tsv.expected b/functional_tests/test_data/Roary_run_expected/core_core_accessory_gene_content.tsv.expected deleted file mode 100644 index fee984c..0000000 --- a/functional_tests/test_data/Roary_run_expected/core_core_accessory_gene_content.tsv.expected +++ /dev/null @@ -1 +0,0 @@ -Gff Core_gene_1 Core_gene_2 gene type diff --git a/functional_tests/test_data/Roary_run_expected/core_pair_summary.csv.expected b/functional_tests/test_data/Roary_run_expected/core_pair_summary.csv.expected deleted file mode 100644 index e48e928..0000000 --- a/functional_tests/test_data/Roary_run_expected/core_pair_summary.csv.expected +++ /dev/null @@ -1,5 +0,0 @@ -Core_pair,n,occurrence_core_1,occurrence_core_2,co_occurrence,min_dist,max_dist,mean_dist,median_dist,min_acc,max_acc,mean_acc,median_acc -A-B,2,2,2,2,9,9,9.0,9.0,0,0,0.0,0.0 -A-Sequence_break,2,2,0,0,0,0,0.0,0.0,0,0,0.0,0.0 -B-C,2,2,2,2,9,9,9.0,9.0,0,0,0.0,0.0 -C-Sequence_break,2,2,0,0,10,10,10.0,10.0,0,0,0.0,0.0 diff --git a/functional_tests/test_data/Roary_run_expected/low_frequency_gene_placement.tsv.expected b/functional_tests/test_data/Roary_run_expected/low_frequency_gene_placement.tsv.expected deleted file mode 100644 index 9e8becf..0000000 --- a/functional_tests/test_data/Roary_run_expected/low_frequency_gene_placement.tsv.expected +++ /dev/null @@ -1,9 +0,0 @@ -Gff Core_gene_1 Core_gene_2 Core_region_size Core_region_accessory_count -complete_genome_single_chrom A B 9 0 -complete_genome_single_chrom B C 9 0 -complete_genome_single_chrom A Sequence_break 0 0 -complete_genome_single_chrom C Sequence_break 10 0 -complete_genome_single_chrom_2 A B 9 0 -complete_genome_single_chrom_2 B C 9 0 -complete_genome_single_chrom_2 A Sequence_break 0 0 -complete_genome_single_chrom_2 C Sequence_break 10 0 From 2f86cab3bcd5e728b1871dfb53fc1da06c401d6a Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> 
Date: Wed, 5 Jan 2022 16:01:18 +1100 Subject: [PATCH 031/135] Add in the renamed folder for the outputs of simple Roary and Panaroo runs --- functional_tests/Corekaburra-test.sh | 2 -- .../core_core_accessory_gene_content.tsv.expected | 1 + .../Simple_run_expected/core_pair_summary.csv.expected | 5 +++++ .../low_frequency_gene_placement.tsv.expected | 9 +++++++++ 4 files changed, 15 insertions(+), 2 deletions(-) create mode 100644 functional_tests/test_data/Simple_run_expected/core_core_accessory_gene_content.tsv.expected create mode 100644 functional_tests/test_data/Simple_run_expected/core_pair_summary.csv.expected create mode 100644 functional_tests/test_data/Simple_run_expected/low_frequency_gene_placement.tsv.expected diff --git a/functional_tests/Corekaburra-test.sh b/functional_tests/Corekaburra-test.sh index 1361e07..a425661 100755 --- a/functional_tests/Corekaburra-test.sh +++ b/functional_tests/Corekaburra-test.sh @@ -211,8 +211,6 @@ rm -r test_out_folder # TODO - test Panaroo input w. correction -# TODO - test Panaroo input with no correction - # TODO - test for core genes being fragmented. # TODO - test for accessory genes being fragmented. 
diff --git a/functional_tests/test_data/Simple_run_expected/core_core_accessory_gene_content.tsv.expected b/functional_tests/test_data/Simple_run_expected/core_core_accessory_gene_content.tsv.expected new file mode 100644 index 0000000..fee984c --- /dev/null +++ b/functional_tests/test_data/Simple_run_expected/core_core_accessory_gene_content.tsv.expected @@ -0,0 +1 @@ +Gff Core_gene_1 Core_gene_2 gene type diff --git a/functional_tests/test_data/Simple_run_expected/core_pair_summary.csv.expected b/functional_tests/test_data/Simple_run_expected/core_pair_summary.csv.expected new file mode 100644 index 0000000..e48e928 --- /dev/null +++ b/functional_tests/test_data/Simple_run_expected/core_pair_summary.csv.expected @@ -0,0 +1,5 @@ +Core_pair,n,occurrence_core_1,occurrence_core_2,co_occurrence,min_dist,max_dist,mean_dist,median_dist,min_acc,max_acc,mean_acc,median_acc +A-B,2,2,2,2,9,9,9.0,9.0,0,0,0.0,0.0 +A-Sequence_break,2,2,0,0,0,0,0.0,0.0,0,0,0.0,0.0 +B-C,2,2,2,2,9,9,9.0,9.0,0,0,0.0,0.0 +C-Sequence_break,2,2,0,0,10,10,10.0,10.0,0,0,0.0,0.0 diff --git a/functional_tests/test_data/Simple_run_expected/low_frequency_gene_placement.tsv.expected b/functional_tests/test_data/Simple_run_expected/low_frequency_gene_placement.tsv.expected new file mode 100644 index 0000000..9e8becf --- /dev/null +++ b/functional_tests/test_data/Simple_run_expected/low_frequency_gene_placement.tsv.expected @@ -0,0 +1,9 @@ +Gff Core_gene_1 Core_gene_2 Core_region_size Core_region_accessory_count +complete_genome_single_chrom A B 9 0 +complete_genome_single_chrom B C 9 0 +complete_genome_single_chrom A Sequence_break 0 0 +complete_genome_single_chrom C Sequence_break 10 0 +complete_genome_single_chrom_2 A B 9 0 +complete_genome_single_chrom_2 B C 9 0 +complete_genome_single_chrom_2 A Sequence_break 0 0 +complete_genome_single_chrom_2 C Sequence_break 10 0 From a2dee3f2808f2c599f1144642a2ef472d4da6196 Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Wed, 5 
Jan 2022 16:41:50 +1100 Subject: [PATCH 032/135] Add in some adjustments to how recording of genes are handleded when a genome has multiple chromosomes that are complete. Add functional test for multiple complete chromosomes --- Corekaburra/gff_parser.py | 16 ++++++++++--- functional_tests/Corekaburra-test.sh | 22 ++++++++++++++---- .../test_data/Complete_double_chromosomes.txt | 1 + .../gene_presence_absence.csv | 7 ++++++ .../complete_genome_double_chrom.gff | 12 +++++----- .../complete_genome_double_chrom_2.gff | 12 ++++++++++ .../.DS_Store | Bin 0 -> 6148 bytes ...e_core_accessory_gene_content.tsv.expected | 1 + .../core_pair_summary.csv.expected | 7 ++++++ .../low_frequency_gene_placement.tsv.expected | 13 +++++++++++ 10 files changed, 77 insertions(+), 14 deletions(-) create mode 100644 functional_tests/test_data/Complete_double_chromosomes.txt create mode 100644 functional_tests/test_data/complete_double_chromoosme_run/gene_presence_absence.csv create mode 100644 functional_tests/test_data/complete_genome_double_chrom_2.gff create mode 100644 functional_tests/test_data/double_comple_chromosome_expected/.DS_Store create mode 100644 functional_tests/test_data/double_comple_chromosome_expected/core_core_accessory_gene_content.tsv.expected create mode 100644 functional_tests/test_data/double_comple_chromosome_expected/core_pair_summary.csv.expected create mode 100644 functional_tests/test_data/double_comple_chromosome_expected/low_frequency_gene_placement.tsv.expected diff --git a/Corekaburra/gff_parser.py b/Corekaburra/gff_parser.py index 67c8b94..76ef5f7 100644 --- a/Corekaburra/gff_parser.py +++ b/Corekaburra/gff_parser.py @@ -430,10 +430,20 @@ def segment_gff_content(gff_generator, core_genes, low_freq_genes, gff_path, acc # Set new contig previous_contig = line[0] - # Check if first gene on new contig is core or not + # Check if fist gene is core on complete genome, if then record details. 
+ if line[8] in core_genes[gff_name] and complete_genome: + # Set information on first core gene to be used when finishing search + first_core_gene_gff_line = line - # Check if first gene on new contig is a core gene, if the record it. - if line[8] in core_genes[gff_name]: + # Set information to be used with next core gene neighbour + previous_core_gene_end_coor = int(line[4]) + previous_core_gene_id = line[8] + + # Set that first core gene has been observed + first_core_gene = False + + # Check if first gene on new contig is a core gene, if the record it. + elif line[8] in core_genes[gff_name]: previous_core_gene_id = "Sequence_break" # Get the starting position of the first core gene on contig to record the gene. diff --git a/functional_tests/Corekaburra-test.sh b/functional_tests/Corekaburra-test.sh index a425661..a783749 100755 --- a/functional_tests/Corekaburra-test.sh +++ b/functional_tests/Corekaburra-test.sh @@ -201,7 +201,6 @@ test_output_file test_out_folder/low_frequency_gene_placement.tsv Simple_run_exp test_output_file test_out_folder/core_pair_summary.csv Simple_run_expected/core_pair_summary.csv.expected rm -r test_out_folder -# TODO - test Panaroo input call_new_test "Test Panaroo input" Corekaburra -ig complete_genome_single_chrom.gff complete_genome_single_chrom_2.gff -ip Panaroo_run -o test_out_folder -a > /dev/null 2>&1 test_output_file test_out_folder/core_core_accessory_gene_content.tsv Simple_run_expected/core_core_accessory_gene_content.tsv.expected @@ -209,16 +208,29 @@ test_output_file test_out_folder/low_frequency_gene_placement.tsv Simple_run_exp test_output_file test_out_folder/core_pair_summary.csv Simple_run_expected/core_pair_summary.csv.expected rm -r test_out_folder +# TODO - test complete genome with single contig + + +# TODO - Test complete genome with multiple contigs (Simulate plasmids or two chromosomes) +call_new_test "Test complete genome with multiple contigs (Simulate plasmids or two chromosomes)" +Corekaburra -ig 
complete_genome_double_chrom_2.gff complete_genome_double_chrom.gff -ip complete_double_chromoosme_run -o test_out_folder -cg Complete_double_chromosomes.txt > /dev/null 2>&1 +test_output_file test_out_folder/core_core_accessory_gene_content.tsv double_comple_chromosome_expected/low_frequency_gene_placement.tsv.expected +test_output_file test_out_folder/low_frequency_gene_placement.tsv double_comple_chromosome_expected/core_core_accessory_gene_content.tsv.expected +test_output_file test_out_folder/core_pair_summary.csv double_comple_chromosome_expected/core_pair_summary.csv.expected +rm -r test_out_folder + + + + + + # TODO - test Panaroo input w. correction +# TODO - Add in corrections before this! # TODO - test for core genes being fragmented. # TODO - test for accessory genes being fragmented. -# TODO - test complete genome with sinlge contig - -# TODO - test complete genome with multiple contigs (Simulate plasmids or two chromosomes) - # TODO - test with accessory genes # TODO - test with segments diff --git a/functional_tests/test_data/Complete_double_chromosomes.txt b/functional_tests/test_data/Complete_double_chromosomes.txt new file mode 100644 index 0000000..5a7358b --- /dev/null +++ b/functional_tests/test_data/Complete_double_chromosomes.txt @@ -0,0 +1 @@ +complete_genome_double_chrom_2.gff complete_genome_double_chrom.gff \ No newline at end of file diff --git a/functional_tests/test_data/complete_double_chromoosme_run/gene_presence_absence.csv b/functional_tests/test_data/complete_double_chromoosme_run/gene_presence_absence.csv new file mode 100644 index 0000000..18a85d8 --- /dev/null +++ b/functional_tests/test_data/complete_double_chromoosme_run/gene_presence_absence.csv @@ -0,0 +1,7 @@ +"","","","","","","","","","","","","","","complete_genome_double_chrom","complete_genome_double_chrom_2" +"A","","","2","2","1","","","","","","","","","dub_chrom_A","dub_chrom_2_A" +"B","","","2","2","1","","","","","","","","","dub_chrom_B","dub_chrom_2_B" 
+"C","","","2","2","1","","","","","","","","","dub_chrom_C","dub_chrom_2_C" +"D","","","2","2","1","","","","","","","","","dub_chrom_D","dub_chrom_2_D" +"E","","","2","2","1","","","","","","","","","dub_chrom_E","dub_chrom_2_E" +"F","","","2","2","1","","","","","","","","","dub_chrom_F","dub_chrom_2_F" \ No newline at end of file diff --git a/functional_tests/test_data/complete_genome_double_chrom.gff b/functional_tests/test_data/complete_genome_double_chrom.gff index e7d483b..fc59b4a 100644 --- a/functional_tests/test_data/complete_genome_double_chrom.gff +++ b/functional_tests/test_data/complete_genome_double_chrom.gff @@ -1,10 +1,10 @@ ##gff-version3 -contig_1 . CDS 1 90 . . . ID=Ajwa_the_Shigella_A;Other_info -contig_1 . CDS 100 190 . . . ID=Ajwa_the_Shigella_B;Other_info -contig_1 . CDS 200 290 . . . ID=Ajwa_the_Shigella_C;Other_info -contig_2 . CDS 1 90 . . . ID=Ajwa_the_Shigella_D;Other_info -contig_2 . CDS 100 190 . . . ID=Ajwa_the_Shigella_E;Other_info -contig_2 . CDS 200 290 . . . ID=Ajwa_the_Shigella_F;Other_info +contig_1 . CDS 1 90 . . . ID=dub_chrom_A;Other_info +contig_1 . CDS 100 190 . . . ID=dub_chrom_B;Other_info +contig_1 . CDS 200 290 . . . ID=dub_chrom_C;Other_info +contig_2 . CDS 1 90 . . . ID=dub_chrom_D;Other_info +contig_2 . CDS 100 190 . . . ID=dub_chrom_E;Other_info +contig_2 . CDS 200 290 . . . 
ID=dub_chrom_F;Other_info ##FASTA >contig_1 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN diff --git a/functional_tests/test_data/complete_genome_double_chrom_2.gff b/functional_tests/test_data/complete_genome_double_chrom_2.gff new file mode 100644 index 0000000..b536247 --- /dev/null +++ b/functional_tests/test_data/complete_genome_double_chrom_2.gff @@ -0,0 +1,12 @@ +##gff-version3 +contig_1 . CDS 1 90 . . . ID=dub_chrom_2_A;Other_info +contig_1 . CDS 100 190 . . . ID=dub_chrom_2_B;Other_info +contig_1 . CDS 200 290 . . . ID=dub_chrom_2_C;Other_info +contig_2 . CDS 1 90 . . . ID=dub_chrom_2_D;Other_info +contig_2 . CDS 100 190 . . . ID=dub_chrom_2_E;Other_info +contig_2 . CDS 200 290 . . . ID=dub_chrom_2_F;Other_info +##FASTA +>contig_1 +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +>contig_2 +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN \ No newline at end of file diff --git a/functional_tests/test_data/double_comple_chromosome_expected/.DS_Store b/functional_tests/test_data/double_comple_chromosome_expected/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..5008ddfcf53c02e82d7eee2e57c38e5672ef89f6 GIT binary patch literal 6148 
zcmeH~Jr2S!425mzP>H1@V-^m;4Wg<&0T*E43hX&L&p$$qDprKhvt+--jT7}7np#A3 zem<@ulZcFPQ@L2!n>{z**++&mCkOWA81W14cNZlEfg7;MkzE(HCqgga^y>{tEnwC%0;vJ&^%eQ zLs35+`xjp>T0 Date: Wed, 5 Jan 2022 16:44:37 +1100 Subject: [PATCH 033/135] Change the comparison of output files to compare correctly --- functional_tests/Corekaburra-test.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/functional_tests/Corekaburra-test.sh b/functional_tests/Corekaburra-test.sh index a783749..7f276b8 100755 --- a/functional_tests/Corekaburra-test.sh +++ b/functional_tests/Corekaburra-test.sh @@ -214,8 +214,8 @@ rm -r test_out_folder # TODO - Test complete genome with multiple contigs (Simulate plasmids or two chromosomes) call_new_test "Test complete genome with multiple contigs (Simulate plasmids or two chromosomes)" Corekaburra -ig complete_genome_double_chrom_2.gff complete_genome_double_chrom.gff -ip complete_double_chromoosme_run -o test_out_folder -cg Complete_double_chromosomes.txt > /dev/null 2>&1 -test_output_file test_out_folder/core_core_accessory_gene_content.tsv double_comple_chromosome_expected/low_frequency_gene_placement.tsv.expected -test_output_file test_out_folder/low_frequency_gene_placement.tsv double_comple_chromosome_expected/core_core_accessory_gene_content.tsv.expected +test_output_file test_out_folder/core_core_accessory_gene_content.tsv double_comple_chromosome_expected/core_core_accessory_gene_content.tsv.expected +test_output_file test_out_folder/low_frequency_gene_placement.tsv double_comple_chromosome_expected/low_frequency_gene_placement.tsv.expected test_output_file test_out_folder/core_pair_summary.csv double_comple_chromosome_expected/core_pair_summary.csv.expected rm -r test_out_folder From 1158edccad11a03deabfb8e0e5b004884d702f92 Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Thu, 6 Jan 2022 09:13:45 +1100 Subject: [PATCH 034/135] Add in functional test for a single chromosome being complete --- 
functional_tests/Corekaburra-test.sh | 14 ++++++++------ .../test_data/Complete_single_chromosome.txt | 1 + .../core_core_accessory_gene_content.tsv | 1 + .../core_pair_summary.csv | 6 ++++++ .../low_frequency_gene_placement.tsv | 8 ++++++++ 5 files changed, 24 insertions(+), 6 deletions(-) create mode 100644 functional_tests/test_data/Complete_single_chromosome.txt create mode 100644 functional_tests/test_data/Single_comple_chromosome_expected/core_core_accessory_gene_content.tsv create mode 100644 functional_tests/test_data/Single_comple_chromosome_expected/core_pair_summary.csv create mode 100644 functional_tests/test_data/Single_comple_chromosome_expected/low_frequency_gene_placement.tsv diff --git a/functional_tests/Corekaburra-test.sh b/functional_tests/Corekaburra-test.sh index 7f276b8..32bdaf3 100755 --- a/functional_tests/Corekaburra-test.sh +++ b/functional_tests/Corekaburra-test.sh @@ -209,20 +209,22 @@ test_output_file test_out_folder/core_pair_summary.csv Simple_run_expected/core_ rm -r test_out_folder # TODO - test complete genome with single contig - +call_new_test "Test complete genome with multiple contigs (Simulate plasmids or two chromosomes)" +Corekaburra -ig complete_genome_double_chrom_2.gff complete_genome_double_chrom.gff -ip complete_double_chromoosme_run -o test_out_folder -cg Complete_double_chromosomes.txt > /dev/null 2>&1 +test_output_file test_out_folder/core_core_accessory_gene_content.tsv Single_comple_chromosome_expected/core_core_accessory_gene_content.tsv.expected +test_output_file test_out_folder/low_frequency_gene_placement.tsv Single_comple_chromosome_expected/low_frequency_gene_placement.tsv.expected +test_output_file test_out_folder/core_pair_summary.csv Single_comple_chromosome_expected/core_pair_summary.csv.expected +rm -r test_out_folder # TODO - Test complete genome with multiple contigs (Simulate plasmids or two chromosomes) call_new_test "Test complete genome with multiple contigs (Simulate plasmids or two chromosomes)" 
-Corekaburra -ig complete_genome_double_chrom_2.gff complete_genome_double_chrom.gff -ip complete_double_chromoosme_run -o test_out_folder -cg Complete_double_chromosomes.txt > /dev/null 2>&1 +Corekaburra -ig complete_genome_single_chrom.gff complete_genome_single_chrom_2.gff -ip Roray_run -o test_out_folder -cg Complete_single_chromosome.txt > /dev/null 2>&1 test_output_file test_out_folder/core_core_accessory_gene_content.tsv double_comple_chromosome_expected/core_core_accessory_gene_content.tsv.expected test_output_file test_out_folder/low_frequency_gene_placement.tsv double_comple_chromosome_expected/low_frequency_gene_placement.tsv.expected test_output_file test_out_folder/core_pair_summary.csv double_comple_chromosome_expected/core_pair_summary.csv.expected rm -r test_out_folder - - - - +# TODO - Test with single genome in input being complete # TODO - test Panaroo input w. correction # TODO - Add in corrections before this! diff --git a/functional_tests/test_data/Complete_single_chromosome.txt b/functional_tests/test_data/Complete_single_chromosome.txt new file mode 100644 index 0000000..dab063b --- /dev/null +++ b/functional_tests/test_data/Complete_single_chromosome.txt @@ -0,0 +1 @@ +complete_genome_single_chrom.gff \ No newline at end of file diff --git a/functional_tests/test_data/Single_comple_chromosome_expected/core_core_accessory_gene_content.tsv b/functional_tests/test_data/Single_comple_chromosome_expected/core_core_accessory_gene_content.tsv new file mode 100644 index 0000000..fee984c --- /dev/null +++ b/functional_tests/test_data/Single_comple_chromosome_expected/core_core_accessory_gene_content.tsv @@ -0,0 +1 @@ +Gff Core_gene_1 Core_gene_2 gene type diff --git a/functional_tests/test_data/Single_comple_chromosome_expected/core_pair_summary.csv b/functional_tests/test_data/Single_comple_chromosome_expected/core_pair_summary.csv new file mode 100644 index 0000000..57bef6d --- /dev/null +++ 
b/functional_tests/test_data/Single_comple_chromosome_expected/core_pair_summary.csv @@ -0,0 +1,6 @@ +Core_pair,n,occurrence_core_1,occurrence_core_2,co_occurrence,min_dist,max_dist,mean_dist,median_dist,min_acc,max_acc,mean_acc,median_acc +A-B,2,2,2,2,9,9,9.0,9.0,0,0,0.0,0.0 +A-C,1,2,2,2,10,10,10.0,10.0,0,0,0.0,0.0 +A-Sequence_break,1,2,0,0,0,0,0.0,0.0,0,0,0.0,0.0 +B-C,2,2,2,2,9,9,9.0,9.0,0,0,0.0,0.0 +C-Sequence_break,1,2,0,0,10,10,10.0,10.0,0,0,0.0,0.0 diff --git a/functional_tests/test_data/Single_comple_chromosome_expected/low_frequency_gene_placement.tsv b/functional_tests/test_data/Single_comple_chromosome_expected/low_frequency_gene_placement.tsv new file mode 100644 index 0000000..703697f --- /dev/null +++ b/functional_tests/test_data/Single_comple_chromosome_expected/low_frequency_gene_placement.tsv @@ -0,0 +1,8 @@ +Gff Core_gene_1 Core_gene_2 Core_region_size Core_region_accessory_count +complete_genome_single_chrom A B 9 0 +complete_genome_single_chrom B C 9 0 +complete_genome_single_chrom A C 10 0 +complete_genome_single_chrom_2 A B 9 0 +complete_genome_single_chrom_2 B C 9 0 +complete_genome_single_chrom_2 A Sequence_break 0 0 +complete_genome_single_chrom_2 C Sequence_break 10 0 From d5c7fbeb5fc6d2a14124d6b5d474807c6344ccd7 Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Thu, 6 Jan 2022 09:16:41 +1100 Subject: [PATCH 035/135] Add in .expected extension to expected files --- ..._content.tsv => core_core_accessory_gene_content.tsv.expected} | 0 .../{core_pair_summary.csv => core_pair_summary.csv.expected} | 0 ...ne_placement.tsv => low_frequency_gene_placement.tsv.expected} | 0 3 files changed, 0 insertions(+), 0 deletions(-) rename functional_tests/test_data/Single_comple_chromosome_expected/{core_core_accessory_gene_content.tsv => core_core_accessory_gene_content.tsv.expected} (100%) rename functional_tests/test_data/Single_comple_chromosome_expected/{core_pair_summary.csv => core_pair_summary.csv.expected} 
(100%) rename functional_tests/test_data/Single_comple_chromosome_expected/{low_frequency_gene_placement.tsv => low_frequency_gene_placement.tsv.expected} (100%) diff --git a/functional_tests/test_data/Single_comple_chromosome_expected/core_core_accessory_gene_content.tsv b/functional_tests/test_data/Single_comple_chromosome_expected/core_core_accessory_gene_content.tsv.expected similarity index 100% rename from functional_tests/test_data/Single_comple_chromosome_expected/core_core_accessory_gene_content.tsv rename to functional_tests/test_data/Single_comple_chromosome_expected/core_core_accessory_gene_content.tsv.expected diff --git a/functional_tests/test_data/Single_comple_chromosome_expected/core_pair_summary.csv b/functional_tests/test_data/Single_comple_chromosome_expected/core_pair_summary.csv.expected similarity index 100% rename from functional_tests/test_data/Single_comple_chromosome_expected/core_pair_summary.csv rename to functional_tests/test_data/Single_comple_chromosome_expected/core_pair_summary.csv.expected diff --git a/functional_tests/test_data/Single_comple_chromosome_expected/low_frequency_gene_placement.tsv b/functional_tests/test_data/Single_comple_chromosome_expected/low_frequency_gene_placement.tsv.expected similarity index 100% rename from functional_tests/test_data/Single_comple_chromosome_expected/low_frequency_gene_placement.tsv rename to functional_tests/test_data/Single_comple_chromosome_expected/low_frequency_gene_placement.tsv.expected From 8dccb0b00c0a2fe36d94642a24a5c8c410e879c2 Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Thu, 6 Jan 2022 09:20:43 +1100 Subject: [PATCH 036/135] change around the command line Corekaburra calls for single and double complete chromosomes --- functional_tests/Corekaburra-test.sh | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/functional_tests/Corekaburra-test.sh b/functional_tests/Corekaburra-test.sh index 32bdaf3..65e354f 100755 --- 
a/functional_tests/Corekaburra-test.sh +++ b/functional_tests/Corekaburra-test.sh @@ -210,7 +210,7 @@ rm -r test_out_folder # TODO - test complete genome with single contig call_new_test "Test complete genome with multiple contigs (Simulate plasmids or two chromosomes)" -Corekaburra -ig complete_genome_double_chrom_2.gff complete_genome_double_chrom.gff -ip complete_double_chromoosme_run -o test_out_folder -cg Complete_double_chromosomes.txt > /dev/null 2>&1 +Corekaburra -ig complete_genome_single_chrom.gff complete_genome_single_chrom_2.gff -ip Roray_run -o test_out_folder -cg Complete_single_chromosome.txt > /dev/null 2>&1 test_output_file test_out_folder/core_core_accessory_gene_content.tsv Single_comple_chromosome_expected/core_core_accessory_gene_content.tsv.expected test_output_file test_out_folder/low_frequency_gene_placement.tsv Single_comple_chromosome_expected/low_frequency_gene_placement.tsv.expected test_output_file test_out_folder/core_pair_summary.csv Single_comple_chromosome_expected/core_pair_summary.csv.expected @@ -218,14 +218,12 @@ rm -r test_out_folder # TODO - Test complete genome with multiple contigs (Simulate plasmids or two chromosomes) call_new_test "Test complete genome with multiple contigs (Simulate plasmids or two chromosomes)" -Corekaburra -ig complete_genome_single_chrom.gff complete_genome_single_chrom_2.gff -ip Roray_run -o test_out_folder -cg Complete_single_chromosome.txt > /dev/null 2>&1 +Corekaburra -ig complete_genome_double_chrom_2.gff complete_genome_double_chrom.gff -ip complete_double_chromoosme_run -o test_out_folder -cg Complete_double_chromosomes.txt > /dev/null 2>&1 test_output_file test_out_folder/core_core_accessory_gene_content.tsv double_comple_chromosome_expected/core_core_accessory_gene_content.tsv.expected test_output_file test_out_folder/low_frequency_gene_placement.tsv double_comple_chromosome_expected/low_frequency_gene_placement.tsv.expected test_output_file test_out_folder/core_pair_summary.csv 
double_comple_chromosome_expected/core_pair_summary.csv.expected rm -r test_out_folder -# TODO - Test with single genome in input being complete - # TODO - test Panaroo input w. correction # TODO - Add in corrections before this! From 00af299060d7fceb2348894e3e3fed56b8f3c2a4 Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Thu, 6 Jan 2022 09:24:18 +1100 Subject: [PATCH 037/135] Change round the order of the input complete double chromosome gff files --- functional_tests/Corekaburra-test.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/functional_tests/Corekaburra-test.sh b/functional_tests/Corekaburra-test.sh index 65e354f..9e7d502 100755 --- a/functional_tests/Corekaburra-test.sh +++ b/functional_tests/Corekaburra-test.sh @@ -209,7 +209,7 @@ test_output_file test_out_folder/core_pair_summary.csv Simple_run_expected/core_ rm -r test_out_folder # TODO - test complete genome with single contig -call_new_test "Test complete genome with multiple contigs (Simulate plasmids or two chromosomes)" +call_new_test "Test complete genome with single contig" Corekaburra -ig complete_genome_single_chrom.gff complete_genome_single_chrom_2.gff -ip Roray_run -o test_out_folder -cg Complete_single_chromosome.txt > /dev/null 2>&1 test_output_file test_out_folder/core_core_accessory_gene_content.tsv Single_comple_chromosome_expected/core_core_accessory_gene_content.tsv.expected test_output_file test_out_folder/low_frequency_gene_placement.tsv Single_comple_chromosome_expected/low_frequency_gene_placement.tsv.expected @@ -218,7 +218,7 @@ rm -r test_out_folder # TODO - Test complete genome with multiple contigs (Simulate plasmids or two chromosomes) call_new_test "Test complete genome with multiple contigs (Simulate plasmids or two chromosomes)" -Corekaburra -ig complete_genome_double_chrom_2.gff complete_genome_double_chrom.gff -ip complete_double_chromoosme_run -o test_out_folder -cg Complete_double_chromosomes.txt > 
/dev/null 2>&1 +Corekaburra -ig complete_genome_double_chrom.gff complete_genome_double_chrom_2.gff -ip complete_double_chromoosme_run -o test_out_folder -cg Complete_double_chromosomes.txt > /dev/null 2>&1 test_output_file test_out_folder/core_core_accessory_gene_content.tsv double_comple_chromosome_expected/core_core_accessory_gene_content.tsv.expected test_output_file test_out_folder/low_frequency_gene_placement.tsv double_comple_chromosome_expected/low_frequency_gene_placement.tsv.expected test_output_file test_out_folder/core_pair_summary.csv double_comple_chromosome_expected/core_pair_summary.csv.expected From 7af0d74b47653f8bf018725f228afdd7b37ef4e2 Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Thu, 6 Jan 2022 09:29:32 +1100 Subject: [PATCH 038/135] Change order of output in double chromosome test --- .../low_frequency_gene_placement.tsv.expected | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/functional_tests/test_data/double_comple_chromosome_expected/low_frequency_gene_placement.tsv.expected b/functional_tests/test_data/double_comple_chromosome_expected/low_frequency_gene_placement.tsv.expected index 9fa941b..d97ec42 100644 --- a/functional_tests/test_data/double_comple_chromosome_expected/low_frequency_gene_placement.tsv.expected +++ b/functional_tests/test_data/double_comple_chromosome_expected/low_frequency_gene_placement.tsv.expected @@ -1,13 +1,13 @@ Gff Core_gene_1 Core_gene_2 Core_region_size Core_region_accessory_count +complete_genome_double_chrom A B 9 0 +complete_genome_double_chrom B C 9 0 +complete_genome_double_chrom A C 10 0 +complete_genome_double_chrom D E 9 0 +complete_genome_double_chrom E F 9 0 +complete_genome_double_chrom D F 10 0 complete_genome_double_chrom_2 A B 9 0 complete_genome_double_chrom_2 B C 9 0 complete_genome_double_chrom_2 A C 10 0 complete_genome_double_chrom_2 D E 9 0 complete_genome_double_chrom_2 E F 9 0 complete_genome_double_chrom_2 D F 10 0 
-complete_genome_double_chrom A B 9 0 -complete_genome_double_chrom B C 9 0 -complete_genome_double_chrom A C 10 0 -complete_genome_double_chrom D E 9 0 -complete_genome_double_chrom E F 9 0 -complete_genome_double_chrom D F 10 0 From 1d333501c8974e03b8dc0e8a0ed984c2bde0fca8 Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Thu, 6 Jan 2022 09:33:55 +1100 Subject: [PATCH 039/135] Change order of output in double chromosome test --- .../low_frequency_gene_placement.tsv.expected | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/functional_tests/test_data/double_comple_chromosome_expected/low_frequency_gene_placement.tsv.expected b/functional_tests/test_data/double_comple_chromosome_expected/low_frequency_gene_placement.tsv.expected index d97ec42..1df4948 100644 --- a/functional_tests/test_data/double_comple_chromosome_expected/low_frequency_gene_placement.tsv.expected +++ b/functional_tests/test_data/double_comple_chromosome_expected/low_frequency_gene_placement.tsv.expected @@ -4,7 +4,7 @@ complete_genome_double_chrom B C 9 0 complete_genome_double_chrom A C 10 0 complete_genome_double_chrom D E 9 0 complete_genome_double_chrom E F 9 0 -complete_genome_double_chrom D F 10 0 +complete_genome_double_chrom D F 10 0 complete_genome_double_chrom_2 A B 9 0 complete_genome_double_chrom_2 B C 9 0 complete_genome_double_chrom_2 A C 10 0 From 157f88669054f3e4cfaf19c7702a7cda0772dbf1 Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Thu, 6 Jan 2022 10:36:39 +1100 Subject: [PATCH 040/135] Add in functional tests for accessory, segments and sub-segments --- functional_tests/Corekaburra-test.sh | 29 ++++++++++++++----- .../gene_presence_absence.csv | 9 ++++++ ...e_core_accessory_gene_content.tsv.expected | 5 ++++ .../core_pair_summary.csv.expected | 4 +++ .../low_frequency_gene_placement.tsv.expected | 7 +++++ .../gene_presence_absence.csv | 8 +++++ 
...e_core_accessory_gene_content.tsv.expected | 2 ++ .../core_pair_summary.csv.expected | 9 ++++++ .../core_segments.csv.expected | 7 +++++ .../low_frequency_gene_placement.tsv.expected | 13 +++++++++ .../no_accessory_core_segments.csv.expected | 7 +++++ .../test_data/complete_larger_genome_list.txt | 2 ++ .../test_data/genome_single_chrom_larger.gff | 11 +++++++ .../genome_single_chrom_larger_rearrange.gff | 10 +++++++ 14 files changed, 115 insertions(+), 8 deletions(-) create mode 100644 functional_tests/test_data/Accessory_chrom_run/gene_presence_absence.csv create mode 100644 functional_tests/test_data/Accessory_chrom_run_expected/core_core_accessory_gene_content.tsv.expected create mode 100644 functional_tests/test_data/Accessory_chrom_run_expected/core_pair_summary.csv.expected create mode 100644 functional_tests/test_data/Accessory_chrom_run_expected/low_frequency_gene_placement.tsv.expected create mode 100644 functional_tests/test_data/Rearrangement_run/gene_presence_absence.csv create mode 100644 functional_tests/test_data/Rearrangement_run_expected/core_core_accessory_gene_content.tsv.expected create mode 100644 functional_tests/test_data/Rearrangement_run_expected/core_pair_summary.csv.expected create mode 100644 functional_tests/test_data/Rearrangement_run_expected/core_segments.csv.expected create mode 100644 functional_tests/test_data/Rearrangement_run_expected/low_frequency_gene_placement.tsv.expected create mode 100644 functional_tests/test_data/Rearrangement_run_expected/no_accessory_core_segments.csv.expected create mode 100644 functional_tests/test_data/complete_larger_genome_list.txt create mode 100644 functional_tests/test_data/genome_single_chrom_larger.gff create mode 100644 functional_tests/test_data/genome_single_chrom_larger_rearrange.gff diff --git a/functional_tests/Corekaburra-test.sh b/functional_tests/Corekaburra-test.sh index 9e7d502..b4ef55f 100755 --- a/functional_tests/Corekaburra-test.sh +++ b/functional_tests/Corekaburra-test.sh 
@@ -208,15 +208,13 @@ test_output_file test_out_folder/low_frequency_gene_placement.tsv Simple_run_exp test_output_file test_out_folder/core_pair_summary.csv Simple_run_expected/core_pair_summary.csv.expected rm -r test_out_folder -# TODO - test complete genome with single contig -call_new_test "Test complete genome with single contig" +call_new_test "Test complete genome with single contig and single complete genome among input" Corekaburra -ig complete_genome_single_chrom.gff complete_genome_single_chrom_2.gff -ip Roray_run -o test_out_folder -cg Complete_single_chromosome.txt > /dev/null 2>&1 test_output_file test_out_folder/core_core_accessory_gene_content.tsv Single_comple_chromosome_expected/core_core_accessory_gene_content.tsv.expected test_output_file test_out_folder/low_frequency_gene_placement.tsv Single_comple_chromosome_expected/low_frequency_gene_placement.tsv.expected test_output_file test_out_folder/core_pair_summary.csv Single_comple_chromosome_expected/core_pair_summary.csv.expected rm -r test_out_folder -# TODO - Test complete genome with multiple contigs (Simulate plasmids or two chromosomes) call_new_test "Test complete genome with multiple contigs (Simulate plasmids or two chromosomes)" Corekaburra -ig complete_genome_double_chrom.gff complete_genome_double_chrom_2.gff -ip complete_double_chromoosme_run -o test_out_folder -cg Complete_double_chromosomes.txt > /dev/null 2>&1 test_output_file test_out_folder/core_core_accessory_gene_content.tsv double_comple_chromosome_expected/core_core_accessory_gene_content.tsv.expected @@ -224,6 +222,26 @@ test_output_file test_out_folder/low_frequency_gene_placement.tsv double_comple_ test_output_file test_out_folder/core_pair_summary.csv double_comple_chromosome_expected/core_pair_summary.csv.expected rm -r test_out_folder +# TODO - test with accessory genes +call_new_test "Test with accessory genes" +Corekaburra -ig genome_single_chrom_larger_rearrange.gff genome_single_chrom_larger.gff -ip 
Accessory_chrom_run -o test_out_folder -cg complete_larger_genome_list.txt > /dev/null 2>&1 +test_output_file test_out_folder/core_core_accessory_gene_content.tsv Accessory_chrom_run_expected/core_core_accessory_gene_content.tsv.expected +test_output_file test_out_folder/low_frequency_gene_placement.tsv Accessory_chrom_run_expected/low_frequency_gene_placement.tsv.expected +test_output_file test_out_folder/core_pair_summary.csv Accessory_chrom_run_expected/core_pair_summary.csv.expected +rm -r test_out_folder + +# TODO - test with segments +call_new_test "Test with segments and sub-segments" +Corekaburra -ig genome_single_chrom_larger.gff genome_single_chrom_larger_rearrange.gff -ip Rearrangement_run -o test_out_folder -cg complete_larger_genome_list.txt > /dev/null 2>&1 +test_output_file test_out_folder/core_core_accessory_gene_content.tsv Accessory_chrom_run_expected/core_core_accessory_gene_content.tsv.expected +test_output_file test_out_folder/low_frequency_gene_placement.tsv Accessory_chrom_run_expected/low_frequency_gene_placement.tsv.expected +test_output_file test_out_folder/core_pair_summary.csv Accessory_chrom_run_expected/core_pair_summary.csv.expected +test_output_file test_out_folder/core_segments.csv Accessory_chrom_run_expected/core_segments.csv.expected +test_output_file test_out_folder/no_accessory_core_segments.csv Accessory_chrom_run_expected/no_accessory_core_segments.csv.expected +rm -r test_out_folder + +# TODO - test with decreased core-gene cutoff + # TODO - test Panaroo input w. correction # TODO - Add in corrections before this! @@ -231,11 +249,6 @@ rm -r test_out_folder # TODO - test for accessory genes being fragmented. -# TODO - test with accessory genes - -# TODO - test with segments - -# TODO - test with decreased core-gene cutoff # 3. 
End of testing - check if any errors occurrred diff --git a/functional_tests/test_data/Accessory_chrom_run/gene_presence_absence.csv b/functional_tests/test_data/Accessory_chrom_run/gene_presence_absence.csv new file mode 100644 index 0000000..08056c7 --- /dev/null +++ b/functional_tests/test_data/Accessory_chrom_run/gene_presence_absence.csv @@ -0,0 +1,9 @@ +"","","","","","","","","","","","","","","genome_single_chrom_larger","genome_single_chrom_larger_rearrange" +"A","","","2","2","1","","","","","","","","","single_comp_A","single_comp_2_A" +"B_1","","","1","1","1","","","","","","","","","single_comp_B","" +"B_2","","","1","1","1","","","","","","","","","","single_comp_2_B" +"C","","","2","2","1","","","","","","","","","single_comp_C","single_comp_2_C" +"D_1","","","1","1","1","","","","","","","","","","single_comp_2_D" +"D_2","","","1","1","1","","","","","","","","","single_comp_D","" +"E","","","2","2","1","","","","","","","","","single_comp_E","single_comp_2_E" +"E","","","2","2","1","","","","","","","","","single_comp_F","single_comp_2_F" \ No newline at end of file diff --git a/functional_tests/test_data/Accessory_chrom_run_expected/core_core_accessory_gene_content.tsv.expected b/functional_tests/test_data/Accessory_chrom_run_expected/core_core_accessory_gene_content.tsv.expected new file mode 100644 index 0000000..aa226db --- /dev/null +++ b/functional_tests/test_data/Accessory_chrom_run_expected/core_core_accessory_gene_content.tsv.expected @@ -0,0 +1,5 @@ +Gff Core_gene_1 Core_gene_2 gene type +genome_single_chrom_larger A C B_1 intermediate_frequency +genome_single_chrom_larger C E D_2 intermediate_frequency +genome_single_chrom_larger_rearrange A C D_1 intermediate_frequency +genome_single_chrom_larger_rearrange C E B_2 intermediate_frequency diff --git a/functional_tests/test_data/Accessory_chrom_run_expected/core_pair_summary.csv.expected b/functional_tests/test_data/Accessory_chrom_run_expected/core_pair_summary.csv.expected new file mode 
100644 index 0000000..acec5a7 --- /dev/null +++ b/functional_tests/test_data/Accessory_chrom_run_expected/core_pair_summary.csv.expected @@ -0,0 +1,4 @@ +Core_pair,n,occurrence_core_1,occurrence_core_2,co_occurrence,min_dist,max_dist,mean_dist,median_dist,min_acc,max_acc,mean_acc,median_acc +A-C,2,2,2,2,109,109,109.0,109.0,1,1,1.0,1.0 +A-E,2,2,2,2,10,10,10.0,10.0,0,0,0.0,0.0 +C-E,2,2,2,2,109,109,109.0,109.0,1,1,1.0,1.0 diff --git a/functional_tests/test_data/Accessory_chrom_run_expected/low_frequency_gene_placement.tsv.expected b/functional_tests/test_data/Accessory_chrom_run_expected/low_frequency_gene_placement.tsv.expected new file mode 100644 index 0000000..b2ae7e9 --- /dev/null +++ b/functional_tests/test_data/Accessory_chrom_run_expected/low_frequency_gene_placement.tsv.expected @@ -0,0 +1,7 @@ +Gff Core_gene_1 Core_gene_2 Core_region_size Core_region_accessory_count +genome_single_chrom_larger A C 109 1 +genome_single_chrom_larger C E 109 1 +genome_single_chrom_larger A E 10 0 +genome_single_chrom_larger_rearrange A C 109 1 +genome_single_chrom_larger_rearrange C E 109 1 +genome_single_chrom_larger_rearrange A E 10 0 diff --git a/functional_tests/test_data/Rearrangement_run/gene_presence_absence.csv b/functional_tests/test_data/Rearrangement_run/gene_presence_absence.csv new file mode 100644 index 0000000..75e7154 --- /dev/null +++ b/functional_tests/test_data/Rearrangement_run/gene_presence_absence.csv @@ -0,0 +1,8 @@ +"","","","","","","","","","","","","","","genome_single_chrom_larger","genome_single_chrom_larger_rearrange" +"A","","","2","2","1","","","","","","","","","single_comp_A","single_comp_2_A" +"B","","","2","2","1","","","","","","","","","single_comp_B","single_comp_2_B" +"C","","","2","2","1","","","","","","","","","single_comp_C","single_comp_2_C" +"D","","","2","2","1","","","","","","","","","single_comp_D","single_comp_2_D" +"E","","","2","2","1","","","","","","","","","single_comp_E","single_comp_2_E" 
+"F","","","2","2","1","","","","","","","","","single_comp_F","single_comp_2_F" +"G","","","1","1","1","","","","","","","","","single_comp_G","" \ No newline at end of file diff --git a/functional_tests/test_data/Rearrangement_run_expected/core_core_accessory_gene_content.tsv.expected b/functional_tests/test_data/Rearrangement_run_expected/core_core_accessory_gene_content.tsv.expected new file mode 100644 index 0000000..cdef650 --- /dev/null +++ b/functional_tests/test_data/Rearrangement_run_expected/core_core_accessory_gene_content.tsv.expected @@ -0,0 +1,2 @@ +Gff Core_gene_1 Core_gene_2 gene type +genome_single_chrom_larger A F G intermediate_frequency diff --git a/functional_tests/test_data/Rearrangement_run_expected/core_pair_summary.csv.expected b/functional_tests/test_data/Rearrangement_run_expected/core_pair_summary.csv.expected new file mode 100644 index 0000000..c434187 --- /dev/null +++ b/functional_tests/test_data/Rearrangement_run_expected/core_pair_summary.csv.expected @@ -0,0 +1,9 @@ +Core_pair,n,occurrence_core_1,occurrence_core_2,co_occurrence,min_dist,max_dist,mean_dist,median_dist,min_acc,max_acc,mean_acc,median_acc +A-B,1,2,2,2,9,9,9.0,9.0,0,0,0.0,0.0 +A-D,1,2,2,2,9,9,9.0,9.0,0,0,0.0,0.0 +A-F,2,2,2,2,10,10,10.0,10.0,0,1,0.5,0.5 +B-C,2,2,2,2,9,9,9.0,9.0,0,0,0.0,0.0 +B-E,1,2,2,2,9,9,9.0,9.0,0,0,0.0,0.0 +C-D,2,2,2,2,9,9,9.0,9.0,0,0,0.0,0.0 +D-E,1,2,2,2,9,9,9.0,9.0,0,0,0.0,0.0 +E-F,2,2,2,2,9,9,9.0,9.0,0,0,0.0,0.0 diff --git a/functional_tests/test_data/Rearrangement_run_expected/core_segments.csv.expected b/functional_tests/test_data/Rearrangement_run_expected/core_segments.csv.expected new file mode 100644 index 0000000..9830e19 --- /dev/null +++ b/functional_tests/test_data/Rearrangement_run_expected/core_segments.csv.expected @@ -0,0 +1,7 @@ +Segment_name,Segment_position,Core_gene +A-E,1,A +A-E,2,F +A-E,3,E +B-D,1,B +B-D,2,C +B-D,3,D diff --git a/functional_tests/test_data/Rearrangement_run_expected/low_frequency_gene_placement.tsv.expected 
b/functional_tests/test_data/Rearrangement_run_expected/low_frequency_gene_placement.tsv.expected new file mode 100644 index 0000000..1798da0 --- /dev/null +++ b/functional_tests/test_data/Rearrangement_run_expected/low_frequency_gene_placement.tsv.expected @@ -0,0 +1,13 @@ +Gff Core_gene_1 Core_gene_2 Core_region_size Core_region_accessory_count +genome_single_chrom_larger A B 9 0 +genome_single_chrom_larger B C 9 0 +genome_single_chrom_larger C D 9 0 +genome_single_chrom_larger D E 9 0 +genome_single_chrom_larger E F 9 0 +genome_single_chrom_larger A F 10 1 +genome_single_chrom_larger_rearrange A D 9 0 +genome_single_chrom_larger_rearrange C D 9 0 +genome_single_chrom_larger_rearrange B C 9 0 +genome_single_chrom_larger_rearrange B E 9 0 +genome_single_chrom_larger_rearrange E F 9 0 +genome_single_chrom_larger_rearrange A F 10 0 diff --git a/functional_tests/test_data/Rearrangement_run_expected/no_accessory_core_segments.csv.expected b/functional_tests/test_data/Rearrangement_run_expected/no_accessory_core_segments.csv.expected new file mode 100644 index 0000000..2521634 --- /dev/null +++ b/functional_tests/test_data/Rearrangement_run_expected/no_accessory_core_segments.csv.expected @@ -0,0 +1,7 @@ +Parent_segment_name,Sub_segment_name,Parent_segment_position,Sub_segment_position,Core_gene +A-E,A-A,1,1,A +A-E,F-E,2,1,F +A-E,F-E,2,2,E +B-D,B-D,1,1,B +B-D,B-D,1,2,C +B-D,B-D,1,3,D diff --git a/functional_tests/test_data/complete_larger_genome_list.txt b/functional_tests/test_data/complete_larger_genome_list.txt new file mode 100644 index 0000000..d9e72fa --- /dev/null +++ b/functional_tests/test_data/complete_larger_genome_list.txt @@ -0,0 +1,2 @@ +genome_single_chrom_larger_rearrange.gff +genome_single_chrom_larger.gff \ No newline at end of file diff --git a/functional_tests/test_data/genome_single_chrom_larger.gff b/functional_tests/test_data/genome_single_chrom_larger.gff new file mode 100644 index 0000000..4f9d711 --- /dev/null +++ 
b/functional_tests/test_data/genome_single_chrom_larger.gff @@ -0,0 +1,11 @@ +##gff-version3 +contig_1 . CDS 1 90 . . . ID=single_comp_A;Other_info +contig_1 . CDS 100 190 . . . ID=single_comp_B;Other_info +contig_1 . CDS 200 290 . . . ID=single_comp_C;Other_info +contig_1 . CDS 300 390 . . . ID=single_comp_D;Other_info +contig_1 . CDS 400 490 . . . ID=single_comp_E;Other_info +contig_1 . CDS 500 590 . . . ID=single_comp_F;Other_info +contig_1 . CDS 591 592 . . . ID=single_comp_G;Other_info +##FASTA +>contigo newline at end of file diff --git a/functional_tests/test_data/genome_single_chrom_larger_rearrange.gff b/functional_tests/test_data/genome_single_chrom_larger_rearrange.gff new file mode 100644 index 0000000..7a794b5 --- /dev/null +++ b/functional_tests/test_data/genome_single_chrom_larger_rearrange.gff @@ -0,0 +1,10 @@ +##gff-version3 +contig_1 . CDS 1 90 . . . ID=single_comp_2_A;Other_info +contig_1 . CDS 100 190 . . . ID=single_comp_2_D;Other_info +contig_1 . CDS 200 290 . . . ID=single_comp_2_C;Other_info +contig_1 . CDS 300 390 . . . ID=single_comp_2_B;Other_info +contig_1 . CDS 400 490 . . . ID=single_comp_2_E;Other_info +contig_1 . CDS 500 590 . . . 
ID=single_comp_2_F;Other_info +##FASTA +>contigo newline at end of file From 18bc11414faabda7fcb29ea9db7a7773752b99a8 Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Thu, 6 Jan 2022 10:50:20 +1100 Subject: [PATCH 041/135] Make adjustments to the functional tests for accessory and segments --- functional_tests/Corekaburra-test.sh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/functional_tests/Corekaburra-test.sh b/functional_tests/Corekaburra-test.sh index b4ef55f..c8d7b30 100755 --- a/functional_tests/Corekaburra-test.sh +++ b/functional_tests/Corekaburra-test.sh @@ -224,7 +224,7 @@ rm -r test_out_folder # TODO - test with accessory genes call_new_test "Test with accessory genes" -Corekaburra -ig genome_single_chrom_larger_rearrange.gff genome_single_chrom_larger.gff -ip Accessory_chrom_run -o test_out_folder -cg complete_larger_genome_list.txt > /dev/null 2>&1 +Corekaburra -ig genome_single_chrom_larger.gff genome_single_chrom_larger_rearrange.gff -ip Accessory_chrom_run -o test_out_folder -cg complete_larger_genome_list.txt > /dev/null 2>&1 test_output_file test_out_folder/core_core_accessory_gene_content.tsv Accessory_chrom_run_expected/core_core_accessory_gene_content.tsv.expected test_output_file test_out_folder/low_frequency_gene_placement.tsv Accessory_chrom_run_expected/low_frequency_gene_placement.tsv.expected test_output_file test_out_folder/core_pair_summary.csv Accessory_chrom_run_expected/core_pair_summary.csv.expected @@ -233,11 +233,11 @@ rm -r test_out_folder # TODO - test with segments call_new_test "Test with segments and sub-segments" Corekaburra -ig genome_single_chrom_larger.gff genome_single_chrom_larger_rearrange.gff -ip Rearrangement_run -o test_out_folder -cg complete_larger_genome_list.txt > /dev/null 2>&1 -test_output_file test_out_folder/core_core_accessory_gene_content.tsv Accessory_chrom_run_expected/core_core_accessory_gene_content.tsv.expected -test_output_file 
test_out_folder/low_frequency_gene_placement.tsv Accessory_chrom_run_expected/low_frequency_gene_placement.tsv.expected -test_output_file test_out_folder/core_pair_summary.csv Accessory_chrom_run_expected/core_pair_summary.csv.expected -test_output_file test_out_folder/core_segments.csv Accessory_chrom_run_expected/core_segments.csv.expected -test_output_file test_out_folder/no_accessory_core_segments.csv Accessory_chrom_run_expected/no_accessory_core_segments.csv.expected +test_output_file test_out_folder/core_core_accessory_gene_content.tsv Rearrangement_run_expected/core_core_accessory_gene_content.tsv.expected +test_output_file test_out_folder/low_frequency_gene_placement.tsv Rearrangement_run_expected/low_frequency_gene_placement.tsv.expected +test_output_file test_out_folder/core_pair_summary.csv Rearrangement_run_expected/core_pair_summary.csv.expected +test_output_file test_out_folder/core_segments.csv Rearrangement_run_expected/core_segments.csv.expected +test_output_file test_out_folder/no_accessory_core_segments.csv Rearrangement_run_expected/no_accessory_core_segments.csv.expected rm -r test_out_folder # TODO - test with decreased core-gene cutoff From 2cd605293a8910d6f1541f8299a893d2513e885b Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Thu, 6 Jan 2022 12:06:08 +1100 Subject: [PATCH 042/135] Add in a nicer commandline interface and options to set gene group cutoffs. 
Add function to check cutoffs and tests for these --- Corekaburra/__main__.py | 19 ++- Corekaburra/check_inputs.py | 10 ++ Corekaburra/commandline_interface.py | 129 ++++++++++++------- functional_tests/Corekaburra-test.sh | 11 +- functional_tests/test_data/no_input.expected | 34 +++-- unit_tests/Corekaburra_test.py | 14 ++ 6 files changed, 149 insertions(+), 68 deletions(-) diff --git a/Corekaburra/__main__.py b/Corekaburra/__main__.py index aadfaa2..4655f19 100644 --- a/Corekaburra/__main__.py +++ b/Corekaburra/__main__.py @@ -26,9 +26,9 @@ from read_complete_genome_file import parse_complete_genome_file try: - from Corekaburra.check_inputs import define_pangenome_program, check_gene_data, check_gff_in_pan + from Corekaburra.check_inputs import define_pangenome_program, check_gene_data, check_gff_in_pan, check_cutoffs except ModuleNotFoundError: - from check_inputs import define_pangenome_program, check_gene_data, check_gff_in_pan + from check_inputs import define_pangenome_program, check_gene_data, check_gff_in_pan, check_cutoffs try: from Corekaburra.parse_gene_presence_absence import read_gene_presence_absence @@ -138,7 +138,11 @@ def main(): # get arguments from the commandline args = get_commandline_arguments(sys.argv[1:]) + # Check that low-frequency cutoff and core cutoff are as expected + check_cutoffs(args.low_cutoff, args.core_cutoff) + # TODO - Make Corekaburra take gzipped inputs + # TODO - Add so that a single gff file can only be given as input once and not multiple times? # Check the presence of provided complete genomes among input GFFs if args.comp_genomes is not None: @@ -181,8 +185,12 @@ def main(): ## Read in gene presence absence file time_start = time.time() # TODO - Add the user specified thresholds for core and low frequency genes. + # TODO - ATM the column with presence of gene in genomes is used to define what is core and not. Is it better to use the number of input gffs instead? + # - There are upsides to the current. 
You can use the same genome to find segments for two different populations with in the dataset using the same reference of core-genes + # - Making it depend on the input is not viable for comparing runs, even within the same pan-genome, when using different sets of gff files. + # TODO - Some day it would be awesome to be able to provide a clustering/population structure which could divide genes into the 13 definitions outlined by Horesh et al. [DOI: 10.1099/mgen.0.000670] core_dict, low_freq_dict, acc_gene_dict, attribute_dict = read_gene_presence_absence(input_pres_abs_file_path, - 1, 0.05, source_program, + args.core_cutoff, args.low_cutoff, source_program, args.input_gffs, tmp_folder_path) @@ -272,9 +280,8 @@ def main(): # Remove temporary database holding gff databases # TODO - Implement a nice crash function where the temporary folder is removed not to cause unessecary frustration for the user when trying to rerun the program. - do so in nice exit function - # print(isdir(temp_folder_path)) - # if isdir(temp_folder_path): - # rmdir(temp_folder_path) + if os.path.isdir(tmp_folder_path): + os.rmdir(tmp_folder_path) if __name__ == '__main__': diff --git a/Corekaburra/check_inputs.py b/Corekaburra/check_inputs.py index d896d9b..1dd8010 100644 --- a/Corekaburra/check_inputs.py +++ b/Corekaburra/check_inputs.py @@ -6,8 +6,18 @@ except ModuleNotFoundError: from exit_with_error import exit_with_error EXIT_INPUT_FILE_ERROR = 1 +EXIT_COMMAND_LINE_ERROR = 2 +def check_cutoffs(low_cutoff, core_cutoff): + if (low_cutoff == 0 or 0 < low_cutoff < core_cutoff) and core_cutoff < 1: + return + else: + exit_with_error('Something is wrong with cutoffs for core and low-frequency genes!\n' + 'Make sure the cutoff for core genes is larger than for low-frequency, and is >0 but <1.\n' + 'Also make sure that the low-frequency gene cutoff is either equal to 0 or <1', + EXIT_COMMAND_LINE_ERROR) + def define_pangenome_program(folder): """ Function to examine if input pan genome folder 
stems from Roary or Panaroo. diff --git a/Corekaburra/commandline_interface.py b/Corekaburra/commandline_interface.py index 35e79dc..b51440e 100644 --- a/Corekaburra/commandline_interface.py +++ b/Corekaburra/commandline_interface.py @@ -16,57 +16,81 @@ def get_commandline_arguments(args): 'Program to determine consensus core sequence from multiple genomes.\n' 'Outputs consensus core gene alignment, distance between core genes, ' 'number of accessory genes between core genes and low frequency genes ' - 'between core genes') #TODO - Change - - parser.add_argument('-ig', - '--input_gffs', - help='Path to gff files used for pan-genome', - metavar='file_1.gff ... file_n.gff', - required=True, - dest='input_gffs', - nargs='+') - - parser.add_argument('-ip', - '--input_pangenome', - help='Path to the folder produced by Panaroo or Roary', - metavar='path/to/pan_genome', - required=True, - dest='input_pan') - - parser.add_argument('-cg', - '--complete_genomes', - help='text file containing names of genomes that are to be handled as complete genomes', - required=False, - metavar='complete_genomes.txt', - default=None, - dest='comp_genomes') + 'between core genes', + add_help=False) #TODO - Change - parser.add_argument('-o', - '--output', - help='Path to where output files will be placed [default: current folder]', - required=False, - type=str, - metavar='path/to/output', - default='.', - dest='output_path') - - parser.add_argument('-p', - '--prefix', - help='Prefix for output files, if any is desired', - required=False, - default=None, - dest='output_prefix') + required = parser.add_argument_group('Required arguments') + run_mods = parser.add_argument_group('Analysis modifiers') + output_control = parser.add_argument_group('Output control') + rem_args = parser.add_argument_group('Other arguments') - parser.add_argument('-a', - '--no_annotate_refound', - help='Flag to toggle off the creation of new gff files, with annotation of refound genes.\n' - 'Only done if input 
pangenome is detected as comming from Panaroo', - required=False, - default=True, - action='store_false', - dest='annotate') + required.add_argument('-ig', + '--input_gffs', + help='Path to gff files used for pan-genome', + metavar='file.gff', + required=True, + dest='input_gffs', + nargs='+') + + required.add_argument('-ip', + '--input_pangenome', + help='Path to the folder produced by Panaroo or Roary', + metavar='path/to/pan_genome', + required=True, + dest='input_pan') + + run_mods.add_argument('-cg', + '--complete_genomes', + help='text file containing names of genomes that are to be handled as complete genomes', + required=False, + metavar='complete_genomes.txt', + default=None, + dest='comp_genomes') + + run_mods.add_argument('-a', + '--no_annotate_refound', + help='Flag to toggle off the creation of new gff files, with annotation of refound genes.\n' + 'Only done if input pangenome is detected as coming from Panaroo', + required=False, + default=True, + action='store_false', + dest='annotate') - parser.add_argument('-c', + run_mods.add_argument('-cc', + '--core_cutoff', + help='Percentage of isolates in which a core gene must be present [default: 1.0]', + required=False, + metavar='1.0', + type=float, + default=1.0, + dest='core_cutoff') + + run_mods.add_argument('-lc', + '--low_cutoff', + help='Percentage of isolates where genes found in less than these are seen as low-frequency genes [default: 0.05]', + required=False, + metavar='0.05', + type=float, + default=0.05, + dest='low_cutoff') + + output_control.add_argument('-o', + '--output', + help='Path to where output files will be placed [default: current folder]', + required=False, + type=str, + metavar='path/to/output', + default='.', + dest='output_path') + + output_control.add_argument('-p', + '--prefix', + help='Prefix for output files, if any is desired', + required=False, + default=None, + dest='output_prefix') + + rem_args.add_argument('-c', '--cpu', help='Give max number of CPUs [default: 1]', 
required=False, @@ -75,7 +99,7 @@ def get_commandline_arguments(args): type=int, dest='cpu') - logger_level = parser.add_mutually_exclusive_group() + logger_level = rem_args.add_mutually_exclusive_group() logger_level.add_argument('-l', '--log', help='Record program progress in for debugging purpose', @@ -90,6 +114,13 @@ def get_commandline_arguments(args): default=False, required=False) + rem_args.add_argument('-h', + '--help', + action='help', + help='Show help function') + + + # Check if any thing is given as input otherwise warn and print help if len(args) < 1: parser.print_help() diff --git a/functional_tests/Corekaburra-test.sh b/functional_tests/Corekaburra-test.sh index c8d7b30..1b3f332 100755 --- a/functional_tests/Corekaburra-test.sh +++ b/functional_tests/Corekaburra-test.sh @@ -182,6 +182,15 @@ test_stdout_exit "$test_program -help" no_input.expected 0 call_new_test "Test exit status for a bad command line invocation" test_exit_status "$test_program --this_is_not_a_valid_argument > /dev/null 2>&1" 2 +call_new_test "Test exit status for a bad cutoffs provided - core lower than low-frequency" +test_exit_status "$test_program -ig complete_genome_double_chrom.gff -ip Crash_pan_folder -cg complete_genomes_file -cc 0.1 -lc 0.2 > /dev/null 2>&1" 2 + +call_new_test "Test exit status for a bad cutoffs provided - core above range" +test_exit_status "$test_program -ig complete_genome_double_chrom.gff -ip Crash_pan_folder -cg complete_genomes_file -cc 1.1 -lc 0.2 > /dev/null 2>&1" 2 + +call_new_test "Test exit status for a bad cutoffs provided - low-frequency below range" +test_exit_status "$test_program -ig complete_genome_double_chrom.gff -ip Crash_pan_folder -cg complete_genomes_file -cc 1 -lc -0.2 > /dev/null 2>&1" 2 + call_new_test "Test exit status for a complete genome not given as input gff file" test_exit_status "$test_program -ig complete_genome_double_chrom.gff -ip Crash_pan_folder -cg complete_genomes_file > /dev/null 2>&1" 1 @@ -222,7 +231,6 @@ 
test_output_file test_out_folder/low_frequency_gene_placement.tsv double_comple_ test_output_file test_out_folder/core_pair_summary.csv double_comple_chromosome_expected/core_pair_summary.csv.expected rm -r test_out_folder -# TODO - test with accessory genes call_new_test "Test with accessory genes" Corekaburra -ig genome_single_chrom_larger.gff genome_single_chrom_larger_rearrange.gff -ip Accessory_chrom_run -o test_out_folder -cg complete_larger_genome_list.txt > /dev/null 2>&1 test_output_file test_out_folder/core_core_accessory_gene_content.tsv Accessory_chrom_run_expected/core_core_accessory_gene_content.tsv.expected @@ -230,7 +238,6 @@ test_output_file test_out_folder/low_frequency_gene_placement.tsv Accessory_chro test_output_file test_out_folder/core_pair_summary.csv Accessory_chrom_run_expected/core_pair_summary.csv.expected rm -r test_out_folder -# TODO - test with segments call_new_test "Test with segments and sub-segments" Corekaburra -ig genome_single_chrom_larger.gff genome_single_chrom_larger_rearrange.gff -ip Rearrangement_run -o test_out_folder -cg complete_larger_genome_list.txt > /dev/null 2>&1 test_output_file test_out_folder/core_core_accessory_gene_content.tsv Rearrangement_run_expected/core_core_accessory_gene_content.tsv.expected diff --git a/functional_tests/test_data/no_input.expected b/functional_tests/test_data/no_input.expected index 1a90bac..26455d1 100644 --- a/functional_tests/test_data/no_input.expected +++ b/functional_tests/test_data/no_input.expected @@ -1,31 +1,43 @@ -usage: Corekaburra [-h] -ig file_1.gff ... file_n.gff - [file_1.gff ... file_n.gff ...] -ip path/to/pan_genome - [-cg complete_genomes.txt] [-o path/to/output] - [-p OUTPUT_PREFIX] [-a] [-c int] [-l | -q] +usage: Corekaburra -ig file.gff [file.gff ...] -ip path/to/pan_genome + [-cg complete_genomes.txt] [-a] [-cc 1.0] [-lc 0.05] + [-o path/to/output] [-p OUTPUT_PREFIX] [-c int] [-l | -q] + [-h] Welcome to Corekaburra! 
Program to determine consensus core sequence from multiple genomes. Outputs consensus core gene alignment, distance between core genes, number of accessory genes between core genes and low frequency genes between core genes -optional arguments: - -h, --help show this help message and exit - -ig file_1.gff ... file_n.gff [file_1.gff ... file_n.gff ...], --input_gffs file_1.gff ... file_n.gff [file_1.gff ... file_n.gff ...] +Required arguments: + -ig file.gff [file.gff ...], --input_gffs file.gff [file.gff ...] Path to gff files used for pan-genome -ip path/to/pan_genome, --input_pangenome path/to/pan_genome Path to the folder produced by Panaroo or Roary + +Analysis modifiers: -cg complete_genomes.txt, --complete_genomes complete_genomes.txt text file containing names of genomes that are to be handled as complete genomes + -a, --no_annotate_refound + Flag to toggle off the creation of new gff files, with + annotation of refound genes. Only done if input + pangenome is detected as coming from Panaroo + -cc 1.0, --core_cutoff 1.0 + Percentage of isolates in which a core gene must be + present [default: 1.0] + -lc 0.05, --low_cutoff 0.05 + Percentage of isolates where genes found in less than + these are seen as low-frequency genes [default: 0.05] + +Output control: -o path/to/output, --output path/to/output Path to where output files will be placed [default: current folder] -p OUTPUT_PREFIX, --prefix OUTPUT_PREFIX Prefix for output files, if any is desired - -a, --no_annotate_refound - Flag to toggle off the creation of new gff files, with - annotation of refound genes. 
Only done if input - pangenome is detected as comming from Panaroo + +Other arguments: -c int, --cpu int Give max number of CPUs [default: 1] -l, --log Record program progress in for debugging purpose -q, --quiet Only print warnings + -h, --help Show help function diff --git a/unit_tests/Corekaburra_test.py b/unit_tests/Corekaburra_test.py index 5258de2..0db16fc 100644 --- a/unit_tests/Corekaburra_test.py +++ b/unit_tests/Corekaburra_test.py @@ -49,6 +49,20 @@ def test_exit_w_tmp_folder_deletion(self): os.rename(tmp_folder_copy, tmp_folder) +class TestCutOffViolations(unittest.TestCase): + def test_low_below_range(self): + with self.assertRaises(SystemExit): + check_inputs.check_cutoffs(-0.1, 1) + + def test_core_above_range(self): + with self.assertRaises(SystemExit): + check_inputs.check_cutoffs(0.05, 1.1) + + def test_low_larger_than_core(self): + with self.assertRaises(SystemExit): + check_inputs.check_cutoffs(0.6, 0.4) + + class TestParsingCompleteGenomes(unittest.TestCase): """ Test for the passing of input file containing names of complete genome and checking their presence in the pan-genome """ def test_all_files_found(self): From f50c53577ce39ca7f581bc1eef87b8de5535133c Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Thu, 6 Jan 2022 12:16:16 +1100 Subject: [PATCH 043/135] Make adjustment to logic in checking cutoffs --- Corekaburra/check_inputs.py | 4 ++-- Corekaburra/commandline_interface.py | 2 -- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/Corekaburra/check_inputs.py b/Corekaburra/check_inputs.py index 1dd8010..8bc5dab 100644 --- a/Corekaburra/check_inputs.py +++ b/Corekaburra/check_inputs.py @@ -10,11 +10,11 @@ def check_cutoffs(low_cutoff, core_cutoff): - if (low_cutoff == 0 or 0 < low_cutoff < core_cutoff) and core_cutoff < 1: + if 0 <= low_cutoff < core_cutoff <= 1: return else: exit_with_error('Something is wrong with cutoffs for core and low-frequency genes!\n' - 'Make sure the cutoff for 
core genes is larger than for low-frequency, and is >0 but <1.\n' + 'Make sure the cutoff for core genes is larger than for low-frequency, and is >0 or =1.\n' 'Also make sure that the low-frequency gene cutoff is either equal to 0 or <1', EXIT_COMMAND_LINE_ERROR) diff --git a/Corekaburra/commandline_interface.py b/Corekaburra/commandline_interface.py index b51440e..e5f4960 100644 --- a/Corekaburra/commandline_interface.py +++ b/Corekaburra/commandline_interface.py @@ -59,7 +59,6 @@ def get_commandline_arguments(args): run_mods.add_argument('-cc', '--core_cutoff', help='Percentage of isolates in which a core gene must be present [default: 1.0]', - required=False, metavar='1.0', type=float, default=1.0, @@ -68,7 +67,6 @@ def get_commandline_arguments(args): run_mods.add_argument('-lc', '--low_cutoff', help='Percentage of isolates where genes found in less than these are seen as low-frequency genes [default: 0.05]', - required=False, metavar='0.05', type=float, default=0.05, From 362590bbdc1b66d194a380d3fb3b4b219f09e96d Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Thu, 6 Jan 2022 12:44:18 +1100 Subject: [PATCH 044/135] Rectify mistake in nameing low-frequenct and intermediate genes in the output writen, as they were swapped. 
Add in functional tests where the cutoff for core and low-frequency genes are varied --- Corekaburra/output_writer_functions.py | 4 ++-- functional_tests/Corekaburra-test.sh | 22 +++++++++++++++++++ ...e_core_accessory_gene_content.tsv.expected | 8 +++---- .../Change_cutoffs/gene_presence_absence.csv | 8 +++++++ ...e_core_accessory_gene_content.tsv.expected | 8 +++++++ .../core_pair_summary.csv.expected | 4 ++++ .../low_frequency_gene_placement.tsv.expected | 10 +++++++++ ...e_core_accessory_gene_content.tsv.expected | 2 +- ...e_core_accessory_gene_content.tsv.expected | 4 ++++ .../core_pair_summary.csv.expected | 8 +++++++ .../low_frequency_gene_placement.tsv.expected | 14 ++++++++++++ .../core_core_accessory_gene_content.tsv | 8 +++++++ .../low_freq_cutoff_0/core_pair_summary.csv | 4 ++++ .../low_frequency_gene_placement.tsv | 10 +++++++++ 14 files changed, 107 insertions(+), 7 deletions(-) create mode 100644 functional_tests/test_data/Change_cutoffs/gene_presence_absence.csv create mode 100644 functional_tests/test_data/Increase_low_cutoff/core_core_accessory_gene_content.tsv.expected create mode 100644 functional_tests/test_data/Increase_low_cutoff/core_pair_summary.csv.expected create mode 100644 functional_tests/test_data/Increase_low_cutoff/low_frequency_gene_placement.tsv.expected create mode 100644 functional_tests/test_data/core_90_cutoff_expected/core_core_accessory_gene_content.tsv.expected create mode 100644 functional_tests/test_data/core_90_cutoff_expected/core_pair_summary.csv.expected create mode 100644 functional_tests/test_data/core_90_cutoff_expected/low_frequency_gene_placement.tsv.expected create mode 100644 functional_tests/test_data/low_freq_cutoff_0/core_core_accessory_gene_content.tsv create mode 100644 functional_tests/test_data/low_freq_cutoff_0/core_pair_summary.csv create mode 100644 functional_tests/test_data/low_freq_cutoff_0/low_frequency_gene_placement.tsv diff --git a/Corekaburra/output_writer_functions.py 
b/Corekaburra/output_writer_functions.py index 8879ac8..19add65 100644 --- a/Corekaburra/output_writer_functions.py +++ b/Corekaburra/output_writer_functions.py @@ -54,7 +54,7 @@ def master_info_writer(master_info, out_path, prefix, quiet): core_core_region[1], core_core_region[2], gene, - 'low_frequency'] + 'intermediate_frequency'] writer.writerow(row) if len(core_core_region[6]): @@ -63,7 +63,7 @@ def master_info_writer(master_info, out_path, prefix, quiet): core_core_region[1], core_core_region[2], gene, - 'intermediate_frequency'] + 'low_frequency'] writer.writerow(row) diff --git a/functional_tests/Corekaburra-test.sh b/functional_tests/Corekaburra-test.sh index 1b3f332..1bb4742 100755 --- a/functional_tests/Corekaburra-test.sh +++ b/functional_tests/Corekaburra-test.sh @@ -248,6 +248,28 @@ test_output_file test_out_folder/no_accessory_core_segments.csv Rearrangement_ru rm -r test_out_folder # TODO - test with decreased core-gene cutoff +call_new_test "Test with decreased core-gene cutoff" +Corekaburra -ig complete_genome_single_chrom.gff complete_genome_single_chrom_2.gff genome_single_chrom_larger.gff -ip Change_cutoffs -o test_out_folder -cc 0.9 > /dev/null 2>&1 +test_output_file test_out_folder/core_core_accessory_gene_content.tsv core_90_cutoff_expected/core_core_accessory_gene_content.tsv.expected +test_output_file test_out_folder/low_frequency_gene_placement.tsv core_90_cutoff_expected/low_frequency_gene_placement.tsv.expected +test_output_file test_out_folder/core_pair_summary.csv core_90_cutoff_expected/core_pair_summary.csv.expected +rm -r test_out_folder + +# TODO - test with increase low cutoff +call_new_test "Test with increase low cutoff" +Corekaburra -ig complete_genome_single_chrom.gff complete_genome_single_chrom_2.gff genome_single_chrom_larger.gff -ip Change_cutoffs -o test_out_folder -lc 0.4 > /dev/null 2>&1 +test_output_file test_out_folder/core_core_accessory_gene_content.tsv 
Increase_low_cutoff/core_core_accessory_gene_content.tsv.expected +test_output_file test_out_folder/low_frequency_gene_placement.tsv Increase_low_cutoff/low_frequency_gene_placement.tsv.expected +test_output_file test_out_folder/core_pair_summary.csv Increase_low_cutoff/core_pair_summary.csv.expected +rm -r test_out_folder + +# TODO - test with zero low cutoff +call_new_test "Test with zero low cutoff" +Corekaburra -ig complete_genome_single_chrom.gff complete_genome_single_chrom_2.gff genome_single_chrom_larger.gff -ip Change_cutoffs -o test_out_folder -lc 0 > /dev/null 2>&1 +test_output_file test_out_folder/core_core_accessory_gene_content.tsv low_freq_cutoff_0/core_core_accessory_gene_content.tsv.expected +test_output_file test_out_folder/low_frequency_gene_placement.tsv low_freq_cutoff_0/low_frequency_gene_placement.tsv.expected +test_output_file test_out_folder/core_pair_summary.csv low_freq_cutoff_0/core_pair_summary.csv.expected +rm -r test_out_folder # TODO - test Panaroo input w. correction # TODO - Add in corrections before this! 
diff --git a/functional_tests/test_data/Accessory_chrom_run_expected/core_core_accessory_gene_content.tsv.expected b/functional_tests/test_data/Accessory_chrom_run_expected/core_core_accessory_gene_content.tsv.expected index aa226db..f1c04da 100644 --- a/functional_tests/test_data/Accessory_chrom_run_expected/core_core_accessory_gene_content.tsv.expected +++ b/functional_tests/test_data/Accessory_chrom_run_expected/core_core_accessory_gene_content.tsv.expected @@ -1,5 +1,5 @@ Gff Core_gene_1 Core_gene_2 gene type -genome_single_chrom_larger A C B_1 intermediate_frequency -genome_single_chrom_larger C E D_2 intermediate_frequency -genome_single_chrom_larger_rearrange A C D_1 intermediate_frequency -genome_single_chrom_larger_rearrange C E B_2 intermediate_frequency +genome_single_chrom_larger A C B_1 low_frequency +genome_single_chrom_larger C E D_2 low_frequency +genome_single_chrom_larger_rearrange A C D_1 low_frequency +genome_single_chrom_larger_rearrange C E B_2 low_frequency diff --git a/functional_tests/test_data/Change_cutoffs/gene_presence_absence.csv b/functional_tests/test_data/Change_cutoffs/gene_presence_absence.csv new file mode 100644 index 0000000..06bdf25 --- /dev/null +++ b/functional_tests/test_data/Change_cutoffs/gene_presence_absence.csv @@ -0,0 +1,8 @@ +"","","","","","","","","","","","","","","genome_single_chrom_larger","complete_genome_single_chrom_2","complete_genome_single_chrom" +"A","","","3","3","1","","","","","","","","","single_comp_A","single_comp_2_A","single_comp_A" +"B","","","2","2","1","","","","","","","","","single_comp_B","","single_comp_B" +"C","","","3","3","1","","","","","","","","","single_comp_C","single_comp_2_C","single_comp_C" +"D","","","1","1","1","","","","","","","","","single_comp_D","","" +"E","","","2","2","1","","","","","","","","","single_comp_E","single_comp_2_B","" +"F","","","1","1","1","","","","","","","","","single_comp_F","","" +"G","","","1","1","1","","","","","","","","","single_comp_G","","" \ 
No newline at end of file diff --git a/functional_tests/test_data/Increase_low_cutoff/core_core_accessory_gene_content.tsv.expected b/functional_tests/test_data/Increase_low_cutoff/core_core_accessory_gene_content.tsv.expected new file mode 100644 index 0000000..1b36d42 --- /dev/null +++ b/functional_tests/test_data/Increase_low_cutoff/core_core_accessory_gene_content.tsv.expected @@ -0,0 +1,8 @@ +Gff Core_gene_1 Core_gene_2 gene type +complete_genome_single_chrom A C B low_frequency +complete_genome_single_chrom_2 A C E low_frequency +genome_single_chrom_larger A C B low_frequency +genome_single_chrom_larger C Sequence_break D low_frequency +genome_single_chrom_larger C Sequence_break E low_frequency +genome_single_chrom_larger C Sequence_break F low_frequency +genome_single_chrom_larger C Sequence_break G low_frequency diff --git a/functional_tests/test_data/Increase_low_cutoff/core_pair_summary.csv.expected b/functional_tests/test_data/Increase_low_cutoff/core_pair_summary.csv.expected new file mode 100644 index 0000000..551faf3 --- /dev/null +++ b/functional_tests/test_data/Increase_low_cutoff/core_pair_summary.csv.expected @@ -0,0 +1,4 @@ +Core_pair,n,occurrence_core_1,occurrence_core_2,co_occurrence,min_dist,max_dist,mean_dist,median_dist,min_acc,max_acc,mean_acc,median_acc +A-C,3,3,3,3,109,109,109.0,109.0,1,1,1.0,1.0 +A-Sequence_break,3,3,0,0,0,0,0.0,0.0,0,0,0.0,0.0 +C-Sequence_break,3,3,0,0,10,310,110.0,10.0,0,4,1.3,0.0 diff --git a/functional_tests/test_data/Increase_low_cutoff/low_frequency_gene_placement.tsv.expected b/functional_tests/test_data/Increase_low_cutoff/low_frequency_gene_placement.tsv.expected new file mode 100644 index 0000000..8335562 --- /dev/null +++ b/functional_tests/test_data/Increase_low_cutoff/low_frequency_gene_placement.tsv.expected @@ -0,0 +1,10 @@ +Gff Core_gene_1 Core_gene_2 Core_region_size Core_region_accessory_count +complete_genome_single_chrom A C 109 1 +complete_genome_single_chrom A Sequence_break 0 0 
+complete_genome_single_chrom C Sequence_break 10 0 +complete_genome_single_chrom_2 A C 109 1 +complete_genome_single_chrom_2 A Sequence_break 0 0 +complete_genome_single_chrom_2 C Sequence_break 10 0 +genome_single_chrom_larger A C 109 1 +genome_single_chrom_larger A Sequence_break 0 0 +genome_single_chrom_larger C Sequence_break 310 4 diff --git a/functional_tests/test_data/Rearrangement_run_expected/core_core_accessory_gene_content.tsv.expected b/functional_tests/test_data/Rearrangement_run_expected/core_core_accessory_gene_content.tsv.expected index cdef650..111ba56 100644 --- a/functional_tests/test_data/Rearrangement_run_expected/core_core_accessory_gene_content.tsv.expected +++ b/functional_tests/test_data/Rearrangement_run_expected/core_core_accessory_gene_content.tsv.expected @@ -1,2 +1,2 @@ Gff Core_gene_1 Core_gene_2 gene type -genome_single_chrom_larger A F G intermediate_frequency +genome_single_chrom_larger A F G low_frequency diff --git a/functional_tests/test_data/core_90_cutoff_expected/core_core_accessory_gene_content.tsv.expected b/functional_tests/test_data/core_90_cutoff_expected/core_core_accessory_gene_content.tsv.expected new file mode 100644 index 0000000..66e8a15 --- /dev/null +++ b/functional_tests/test_data/core_90_cutoff_expected/core_core_accessory_gene_content.tsv.expected @@ -0,0 +1,4 @@ +Gff Core_gene_1 Core_gene_2 gene type +genome_single_chrom_larger C E D low_frequency +genome_single_chrom_larger E Sequence_break F low_frequency +genome_single_chrom_larger E Sequence_break G low_frequency diff --git a/functional_tests/test_data/core_90_cutoff_expected/core_pair_summary.csv.expected b/functional_tests/test_data/core_90_cutoff_expected/core_pair_summary.csv.expected new file mode 100644 index 0000000..89a2209 --- /dev/null +++ b/functional_tests/test_data/core_90_cutoff_expected/core_pair_summary.csv.expected @@ -0,0 +1,8 @@ 
+Core_pair,n,occurrence_core_1,occurrence_core_2,co_occurrence,min_dist,max_dist,mean_dist,median_dist,min_acc,max_acc,mean_acc,median_acc +A-B,2,3,2,2,9,9,9.0,9.0,0,0,0.0,0.0 +A-E,1,3,2,2,9,9,9.0,9.0,0,0,0.0,0.0 +A-Sequence_break,3,3,0,0,0,0,0.0,0.0,0,0,0.0,0.0 +B-C,2,2,3,2,9,9,9.0,9.0,0,0,0.0,0.0 +C-E,2,3,2,2,9,109,59.0,59.0,0,1,0.5,0.5 +C-Sequence_break,2,3,0,0,10,10,10.0,10.0,0,0,0.0,0.0 +E-Sequence_break,1,2,0,0,110,110,110.0,110.0,2,2,2.0,2.0 diff --git a/functional_tests/test_data/core_90_cutoff_expected/low_frequency_gene_placement.tsv.expected b/functional_tests/test_data/core_90_cutoff_expected/low_frequency_gene_placement.tsv.expected new file mode 100644 index 0000000..8e047e6 --- /dev/null +++ b/functional_tests/test_data/core_90_cutoff_expected/low_frequency_gene_placement.tsv.expected @@ -0,0 +1,14 @@ +Gff Core_gene_1 Core_gene_2 Core_region_size Core_region_accessory_count +complete_genome_single_chrom A B 9 0 +complete_genome_single_chrom B C 9 0 +complete_genome_single_chrom A Sequence_break 0 0 +complete_genome_single_chrom C Sequence_break 10 0 +complete_genome_single_chrom_2 A E 9 0 +complete_genome_single_chrom_2 C E 9 0 +complete_genome_single_chrom_2 A Sequence_break 0 0 +complete_genome_single_chrom_2 C Sequence_break 10 0 +genome_single_chrom_larger A B 9 0 +genome_single_chrom_larger B C 9 0 +genome_single_chrom_larger C E 109 1 +genome_single_chrom_larger A Sequence_break 0 0 +genome_single_chrom_larger E Sequence_break 110 2 diff --git a/functional_tests/test_data/low_freq_cutoff_0/core_core_accessory_gene_content.tsv b/functional_tests/test_data/low_freq_cutoff_0/core_core_accessory_gene_content.tsv new file mode 100644 index 0000000..03e8ab7 --- /dev/null +++ b/functional_tests/test_data/low_freq_cutoff_0/core_core_accessory_gene_content.tsv @@ -0,0 +1,8 @@ +Gff Core_gene_1 Core_gene_2 gene type +complete_genome_single_chrom A C B intermediate_frequency +complete_genome_single_chrom_2 A C E intermediate_frequency 
+genome_single_chrom_larger A C B intermediate_frequency +genome_single_chrom_larger C Sequence_break D intermediate_frequency +genome_single_chrom_larger C Sequence_break E intermediate_frequency +genome_single_chrom_larger C Sequence_break F intermediate_frequency +genome_single_chrom_larger C Sequence_break G intermediate_frequency diff --git a/functional_tests/test_data/low_freq_cutoff_0/core_pair_summary.csv b/functional_tests/test_data/low_freq_cutoff_0/core_pair_summary.csv new file mode 100644 index 0000000..551faf3 --- /dev/null +++ b/functional_tests/test_data/low_freq_cutoff_0/core_pair_summary.csv @@ -0,0 +1,4 @@ +Core_pair,n,occurrence_core_1,occurrence_core_2,co_occurrence,min_dist,max_dist,mean_dist,median_dist,min_acc,max_acc,mean_acc,median_acc +A-C,3,3,3,3,109,109,109.0,109.0,1,1,1.0,1.0 +A-Sequence_break,3,3,0,0,0,0,0.0,0.0,0,0,0.0,0.0 +C-Sequence_break,3,3,0,0,10,310,110.0,10.0,0,4,1.3,0.0 diff --git a/functional_tests/test_data/low_freq_cutoff_0/low_frequency_gene_placement.tsv b/functional_tests/test_data/low_freq_cutoff_0/low_frequency_gene_placement.tsv new file mode 100644 index 0000000..8335562 --- /dev/null +++ b/functional_tests/test_data/low_freq_cutoff_0/low_frequency_gene_placement.tsv @@ -0,0 +1,10 @@ +Gff Core_gene_1 Core_gene_2 Core_region_size Core_region_accessory_count +complete_genome_single_chrom A C 109 1 +complete_genome_single_chrom A Sequence_break 0 0 +complete_genome_single_chrom C Sequence_break 10 0 +complete_genome_single_chrom_2 A C 109 1 +complete_genome_single_chrom_2 A Sequence_break 0 0 +complete_genome_single_chrom_2 C Sequence_break 10 0 +genome_single_chrom_larger A C 109 1 +genome_single_chrom_larger A Sequence_break 0 0 +genome_single_chrom_larger C Sequence_break 310 4 From 541c66f3ac9adc21d404393ac5a48a428d2d29e2 Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Thu, 6 Jan 2022 12:52:18 +1100 Subject: [PATCH 045/135] Change unit test to reflect change in output 
functions --- functional_tests/Corekaburra-test.sh | 12 +++++----- ...e_core_accessory_gene_content.tsv.expected | 0 .../core_pair_summary.csv.expected | 0 .../low_frequency_gene_placement.tsv.expected | 0 ..._core_accessory_gene_content.tsv.expected} | 0 .../core_pair_summary.csv.expected} | 0 ...low_frequency_gene_placement.tsv.expected} | 0 .../gene_content.txt | 24 +++++++++---------- 8 files changed, 18 insertions(+), 18 deletions(-) rename functional_tests/test_data/{Increase_low_cutoff => Increase_low_cutoff_expected}/core_core_accessory_gene_content.tsv.expected (100%) rename functional_tests/test_data/{Increase_low_cutoff => Increase_low_cutoff_expected}/core_pair_summary.csv.expected (100%) rename functional_tests/test_data/{Increase_low_cutoff => Increase_low_cutoff_expected}/low_frequency_gene_placement.tsv.expected (100%) rename functional_tests/test_data/{low_freq_cutoff_0/core_core_accessory_gene_content.tsv => low_freq_cutoff_0_expected/core_core_accessory_gene_content.tsv.expected} (100%) rename functional_tests/test_data/{low_freq_cutoff_0/core_pair_summary.csv => low_freq_cutoff_0_expected/core_pair_summary.csv.expected} (100%) rename functional_tests/test_data/{low_freq_cutoff_0/low_frequency_gene_placement.tsv => low_freq_cutoff_0_expected/low_frequency_gene_placement.tsv.expected} (100%) diff --git a/functional_tests/Corekaburra-test.sh b/functional_tests/Corekaburra-test.sh index 1bb4742..e966ac3 100755 --- a/functional_tests/Corekaburra-test.sh +++ b/functional_tests/Corekaburra-test.sh @@ -258,17 +258,17 @@ rm -r test_out_folder # TODO - test with increase low cutoff call_new_test "Test with increase low cutoff" Corekaburra -ig complete_genome_single_chrom.gff complete_genome_single_chrom_2.gff genome_single_chrom_larger.gff -ip Change_cutoffs -o test_out_folder -lc 0.4 > /dev/null 2>&1 -test_output_file test_out_folder/core_core_accessory_gene_content.tsv Increase_low_cutoff/core_core_accessory_gene_content.tsv.expected -test_output_file 
test_out_folder/low_frequency_gene_placement.tsv Increase_low_cutoff/low_frequency_gene_placement.tsv.expected -test_output_file test_out_folder/core_pair_summary.csv Increase_low_cutoff/core_pair_summary.csv.expected +test_output_file test_out_folder/core_core_accessory_gene_content.tsv Increase_low_cutoff_expected/core_core_accessory_gene_content.tsv.expected +test_output_file test_out_folder/low_frequency_gene_placement.tsv Increase_low_cutoff_expected/low_frequency_gene_placement.tsv.expected +test_output_file test_out_folder/core_pair_summary.csv Increase_low_cutoff_expected/core_pair_summary.csv.expected rm -r test_out_folder # TODO - test with zero low cutoff call_new_test "Test with zero low cutoff" Corekaburra -ig complete_genome_single_chrom.gff complete_genome_single_chrom_2.gff genome_single_chrom_larger.gff -ip Change_cutoffs -o test_out_folder -lc 0 > /dev/null 2>&1 -test_output_file test_out_folder/core_core_accessory_gene_content.tsv low_freq_cutoff_0/core_core_accessory_gene_content.tsv.expected -test_output_file test_out_folder/low_frequency_gene_placement.tsv low_freq_cutoff_0/low_frequency_gene_placement.tsv.expected -test_output_file test_out_folder/core_pair_summary.csv low_freq_cutoff_0/core_pair_summary.csv.expected +test_output_file test_out_folder/core_core_accessory_gene_content.tsv low_freq_cutoff_0_expected/core_core_accessory_gene_content.tsv.expected +test_output_file test_out_folder/low_frequency_gene_placement.tsv low_freq_cutoff_0_expected/low_frequency_gene_placement.tsv.expected +test_output_file test_out_folder/core_pair_summary.csv low_freq_cutoff_0_expected/core_pair_summary.csv.expected rm -r test_out_folder # TODO - test Panaroo input w. 
correction diff --git a/functional_tests/test_data/Increase_low_cutoff/core_core_accessory_gene_content.tsv.expected b/functional_tests/test_data/Increase_low_cutoff_expected/core_core_accessory_gene_content.tsv.expected similarity index 100% rename from functional_tests/test_data/Increase_low_cutoff/core_core_accessory_gene_content.tsv.expected rename to functional_tests/test_data/Increase_low_cutoff_expected/core_core_accessory_gene_content.tsv.expected diff --git a/functional_tests/test_data/Increase_low_cutoff/core_pair_summary.csv.expected b/functional_tests/test_data/Increase_low_cutoff_expected/core_pair_summary.csv.expected similarity index 100% rename from functional_tests/test_data/Increase_low_cutoff/core_pair_summary.csv.expected rename to functional_tests/test_data/Increase_low_cutoff_expected/core_pair_summary.csv.expected diff --git a/functional_tests/test_data/Increase_low_cutoff/low_frequency_gene_placement.tsv.expected b/functional_tests/test_data/Increase_low_cutoff_expected/low_frequency_gene_placement.tsv.expected similarity index 100% rename from functional_tests/test_data/Increase_low_cutoff/low_frequency_gene_placement.tsv.expected rename to functional_tests/test_data/Increase_low_cutoff_expected/low_frequency_gene_placement.tsv.expected diff --git a/functional_tests/test_data/low_freq_cutoff_0/core_core_accessory_gene_content.tsv b/functional_tests/test_data/low_freq_cutoff_0_expected/core_core_accessory_gene_content.tsv.expected similarity index 100% rename from functional_tests/test_data/low_freq_cutoff_0/core_core_accessory_gene_content.tsv rename to functional_tests/test_data/low_freq_cutoff_0_expected/core_core_accessory_gene_content.tsv.expected diff --git a/functional_tests/test_data/low_freq_cutoff_0/core_pair_summary.csv b/functional_tests/test_data/low_freq_cutoff_0_expected/core_pair_summary.csv.expected similarity index 100% rename from functional_tests/test_data/low_freq_cutoff_0/core_pair_summary.csv rename to 
functional_tests/test_data/low_freq_cutoff_0_expected/core_pair_summary.csv.expected diff --git a/functional_tests/test_data/low_freq_cutoff_0/low_frequency_gene_placement.tsv b/functional_tests/test_data/low_freq_cutoff_0_expected/low_frequency_gene_placement.tsv.expected similarity index 100% rename from functional_tests/test_data/low_freq_cutoff_0/low_frequency_gene_placement.tsv rename to functional_tests/test_data/low_freq_cutoff_0_expected/low_frequency_gene_placement.tsv.expected diff --git a/unit_tests/unit_test_data/TestWritingOutputFunction/gene_content.txt b/unit_tests/unit_test_data/TestWritingOutputFunction/gene_content.txt index b35a276..a12ebc7 100644 --- a/unit_tests/unit_test_data/TestWritingOutputFunction/gene_content.txt +++ b/unit_tests/unit_test_data/TestWritingOutputFunction/gene_content.txt @@ -1,13 +1,13 @@ Gff Core_gene_1 Core_gene_2 gene type -genome_1 pan_cluster_1 pan_cluster_2 Acc_1 low_frequency -genome_1 pan_cluster_1 pan_cluster_2 Acc_2 low_frequency -genome_1 pan_cluster_1 pan_cluster_2 low_1 intermediate_frequency -genome_2 pan_cluster_1 pan_cluster_2 Acc_1 low_frequency -genome_2 pan_cluster_1 pan_cluster_2 Acc_2 low_frequency -genome_2 pan_cluster_1 pan_cluster_2 low_1 intermediate_frequency -genome_3 pan_cluster_1 pan_cluster_2 Acc_1 low_frequency -genome_3 pan_cluster_1 pan_cluster_2 Acc_2 low_frequency -genome_3 pan_cluster_1 pan_cluster_2 low_1 intermediate_frequency -genome_1 pan_cluster_2 pan_cluster_3 Acc_1 low_frequency -genome_1 pan_cluster_2 pan_cluster_3 Acc_2 low_frequency -genome_2 pan_cluster_2 pan_cluster_4 Acc_1 low_frequency +genome_1 pan_cluster_1 pan_cluster_2 Acc_1 intermediate_frequency +genome_1 pan_cluster_1 pan_cluster_2 Acc_2 intermediate_frequency +genome_1 pan_cluster_1 pan_cluster_2 low_1 low_frequency +genome_2 pan_cluster_1 pan_cluster_2 Acc_1 intermediate_frequency +genome_2 pan_cluster_1 pan_cluster_2 Acc_2 intermediate_frequency +genome_2 pan_cluster_1 pan_cluster_2 low_1 low_frequency +genome_3 
pan_cluster_1 pan_cluster_2 Acc_1 intermediate_frequency +genome_3 pan_cluster_1 pan_cluster_2 Acc_2 intermediate_frequency +genome_3 pan_cluster_1 pan_cluster_2 low_1 low_frequency +genome_1 pan_cluster_2 pan_cluster_3 Acc_1 intermediate_frequency +genome_1 pan_cluster_2 pan_cluster_3 Acc_2 intermediate_frequency +genome_2 pan_cluster_2 pan_cluster_4 Acc_1 intermediate_frequency From 21802772c8551a2e2095c56b0413b05f33afda62 Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Thu, 6 Jan 2022 13:02:03 +1100 Subject: [PATCH 046/135] Adjust the output for functional test around cutoff --- .../low_frequency_gene_placement.tsv.expected | 10 +++++----- .../core_core_accessory_gene_content.tsv.expected | 2 +- .../low_frequency_gene_placement.tsv.expected | 6 +++--- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/functional_tests/test_data/core_90_cutoff_expected/low_frequency_gene_placement.tsv.expected b/functional_tests/test_data/core_90_cutoff_expected/low_frequency_gene_placement.tsv.expected index 8e047e6..6171417 100644 --- a/functional_tests/test_data/core_90_cutoff_expected/low_frequency_gene_placement.tsv.expected +++ b/functional_tests/test_data/core_90_cutoff_expected/low_frequency_gene_placement.tsv.expected @@ -3,12 +3,12 @@ complete_genome_single_chrom A B 9 0 complete_genome_single_chrom B C 9 0 complete_genome_single_chrom A Sequence_break 0 0 complete_genome_single_chrom C Sequence_break 10 0 -complete_genome_single_chrom_2 A E 9 0 -complete_genome_single_chrom_2 C E 9 0 -complete_genome_single_chrom_2 A Sequence_break 0 0 -complete_genome_single_chrom_2 C Sequence_break 10 0 genome_single_chrom_larger A B 9 0 genome_single_chrom_larger B C 9 0 genome_single_chrom_larger C E 109 1 genome_single_chrom_larger A Sequence_break 0 0 -genome_single_chrom_larger E Sequence_break 110 2 +genome_single_chrom_larger E Sequence_break 110 2 +complete_genome_single_chrom_2 A E 9 0 +complete_genome_single_chrom_2 C E 9 0 
+complete_genome_single_chrom_2 A Sequence_break 0 0 +complete_genome_single_chrom_2 C Sequence_break 10 0 diff --git a/functional_tests/test_data/low_freq_cutoff_0_expected/core_core_accessory_gene_content.tsv.expected b/functional_tests/test_data/low_freq_cutoff_0_expected/core_core_accessory_gene_content.tsv.expected index 03e8ab7..bc483b0 100644 --- a/functional_tests/test_data/low_freq_cutoff_0_expected/core_core_accessory_gene_content.tsv.expected +++ b/functional_tests/test_data/low_freq_cutoff_0_expected/core_core_accessory_gene_content.tsv.expected @@ -1,8 +1,8 @@ Gff Core_gene_1 Core_gene_2 gene type complete_genome_single_chrom A C B intermediate_frequency -complete_genome_single_chrom_2 A C E intermediate_frequency genome_single_chrom_larger A C B intermediate_frequency genome_single_chrom_larger C Sequence_break D intermediate_frequency genome_single_chrom_larger C Sequence_break E intermediate_frequency genome_single_chrom_larger C Sequence_break F intermediate_frequency genome_single_chrom_larger C Sequence_break G intermediate_frequency +complete_genome_single_chrom_2 A C E intermediate_frequency diff --git a/functional_tests/test_data/low_freq_cutoff_0_expected/low_frequency_gene_placement.tsv.expected b/functional_tests/test_data/low_freq_cutoff_0_expected/low_frequency_gene_placement.tsv.expected index 8335562..fec55a8 100644 --- a/functional_tests/test_data/low_freq_cutoff_0_expected/low_frequency_gene_placement.tsv.expected +++ b/functional_tests/test_data/low_freq_cutoff_0_expected/low_frequency_gene_placement.tsv.expected @@ -2,9 +2,9 @@ Gff Core_gene_1 Core_gene_2 Core_region_size Core_region_accessory_count complete_genome_single_chrom A C 109 1 complete_genome_single_chrom A Sequence_break 0 0 complete_genome_single_chrom C Sequence_break 10 0 -complete_genome_single_chrom_2 A C 109 1 -complete_genome_single_chrom_2 A Sequence_break 0 0 -complete_genome_single_chrom_2 C Sequence_break 10 0 genome_single_chrom_larger A C 109 1 
genome_single_chrom_larger A Sequence_break 0 0 genome_single_chrom_larger C Sequence_break 310 4 +complete_genome_single_chrom_2 A C 109 1 +complete_genome_single_chrom_2 A Sequence_break 0 0 +complete_genome_single_chrom_2 C Sequence_break 10 0 From af13eafe86d2d879a1a3cfe5883b51d06fec4a23 Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Thu, 6 Jan 2022 13:07:28 +1100 Subject: [PATCH 047/135] adjust output --- .../core_core_accessory_gene_content.tsv.expected | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/functional_tests/test_data/Increase_low_cutoff_expected/core_core_accessory_gene_content.tsv.expected b/functional_tests/test_data/Increase_low_cutoff_expected/core_core_accessory_gene_content.tsv.expected index 1b36d42..9aad9eb 100644 --- a/functional_tests/test_data/Increase_low_cutoff_expected/core_core_accessory_gene_content.tsv.expected +++ b/functional_tests/test_data/Increase_low_cutoff_expected/core_core_accessory_gene_content.tsv.expected @@ -1,8 +1,9 @@ Gff Core_gene_1 Core_gene_2 gene type -complete_genome_single_chrom A C B low_frequency +complete_genome_single_chrom A C B low_frequency complete_genome_single_chrom_2 A C E low_frequency genome_single_chrom_larger A C B low_frequency genome_single_chrom_larger C Sequence_break D low_frequency genome_single_chrom_larger C Sequence_break E low_frequency genome_single_chrom_larger C Sequence_break F low_frequency -genome_single_chrom_larger C Sequence_break G low_frequency +genome_single_chrom_larger C Sequence_break G low_frequency + From 7f67e61fe965c66c564f3472003a85cd78245b9e Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Thu, 6 Jan 2022 13:17:02 +1100 Subject: [PATCH 048/135] Add in sorting of keys from dicts to make output the same between runs --- Corekaburra/output_writer_functions.py | 4 ++-- .../low_frequency_gene_placement.tsv.expected | 10 +++++----- 
.../core_core_accessory_gene_content.tsv.expected | 6 +++--- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/Corekaburra/output_writer_functions.py b/Corekaburra/output_writer_functions.py index 19add65..65260d9 100644 --- a/Corekaburra/output_writer_functions.py +++ b/Corekaburra/output_writer_functions.py @@ -28,7 +28,7 @@ def master_info_writer(master_info, out_path, prefix, quiet): writer.writerow(header) # Write remaining rows: - for key in master_info.keys(): + for key in sorted(master_info.keys()): info = master_info[key][0:5] writer.writerow(info) @@ -46,7 +46,7 @@ def master_info_writer(master_info, out_path, prefix, quiet): writer.writerow(header) # Write remaining rows: - for key in master_info.keys(): + for key in sorted(master_info.keys()): core_core_region = master_info[key] if len(core_core_region[5]): for gene in core_core_region[5]: diff --git a/functional_tests/test_data/core_90_cutoff_expected/low_frequency_gene_placement.tsv.expected b/functional_tests/test_data/core_90_cutoff_expected/low_frequency_gene_placement.tsv.expected index 6171417..4598f2f 100644 --- a/functional_tests/test_data/core_90_cutoff_expected/low_frequency_gene_placement.tsv.expected +++ b/functional_tests/test_data/core_90_cutoff_expected/low_frequency_gene_placement.tsv.expected @@ -2,13 +2,13 @@ Gff Core_gene_1 Core_gene_2 Core_region_size Core_region_accessory_count complete_genome_single_chrom A B 9 0 complete_genome_single_chrom B C 9 0 complete_genome_single_chrom A Sequence_break 0 0 -complete_genome_single_chrom C Sequence_break 10 0 +complete_genome_single_chrom C Sequence_break 10 0 +complete_genome_single_chrom_2 A E 9 0 +complete_genome_single_chrom_2 C E 9 0 +complete_genome_single_chrom_2 A Sequence_break 0 0 +complete_genome_single_chrom_2 C Sequence_break 10 0 genome_single_chrom_larger A B 9 0 genome_single_chrom_larger B C 9 0 genome_single_chrom_larger C E 109 1 genome_single_chrom_larger A Sequence_break 0 0 genome_single_chrom_larger E 
Sequence_break 110 2 -complete_genome_single_chrom_2 A E 9 0 -complete_genome_single_chrom_2 C E 9 0 -complete_genome_single_chrom_2 A Sequence_break 0 0 -complete_genome_single_chrom_2 C Sequence_break 10 0 diff --git a/functional_tests/test_data/low_freq_cutoff_0_expected/core_core_accessory_gene_content.tsv.expected b/functional_tests/test_data/low_freq_cutoff_0_expected/core_core_accessory_gene_content.tsv.expected index bc483b0..a3128c2 100644 --- a/functional_tests/test_data/low_freq_cutoff_0_expected/core_core_accessory_gene_content.tsv.expected +++ b/functional_tests/test_data/low_freq_cutoff_0_expected/core_core_accessory_gene_content.tsv.expected @@ -1,8 +1,8 @@ Gff Core_gene_1 Core_gene_2 gene type -complete_genome_single_chrom A C B intermediate_frequency +complete_genome_single_chrom A C B intermediate_frequency +complete_genome_single_chrom_2 A C E intermediate_frequency genome_single_chrom_larger A C B intermediate_frequency genome_single_chrom_larger C Sequence_break D intermediate_frequency genome_single_chrom_larger C Sequence_break E intermediate_frequency genome_single_chrom_larger C Sequence_break F intermediate_frequency -genome_single_chrom_larger C Sequence_break G intermediate_frequency -complete_genome_single_chrom_2 A C E intermediate_frequency +genome_single_chrom_larger C Sequence_break G intermediate_frequency From 3e89d2ce71f545c4749d5388b436a0d8f94a2024 Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Thu, 6 Jan 2022 13:22:31 +1100 Subject: [PATCH 049/135] Adjust output from functional tests --- ...core_core_accessory_gene_content.tsv.expected | 5 ++--- .../low_frequency_gene_placement.tsv.expected | 8 ++++---- .../low_frequency_gene_placement.tsv.expected | 16 ++++++++-------- ...core_core_accessory_gene_content.tsv.expected | 6 +++--- .../low_frequency_gene_placement.tsv.expected | 10 +++++----- 5 files changed, 22 insertions(+), 23 deletions(-) diff --git 
a/functional_tests/test_data/Increase_low_cutoff_expected/core_core_accessory_gene_content.tsv.expected b/functional_tests/test_data/Increase_low_cutoff_expected/core_core_accessory_gene_content.tsv.expected index 9aad9eb..1b36d42 100644 --- a/functional_tests/test_data/Increase_low_cutoff_expected/core_core_accessory_gene_content.tsv.expected +++ b/functional_tests/test_data/Increase_low_cutoff_expected/core_core_accessory_gene_content.tsv.expected @@ -1,9 +1,8 @@ Gff Core_gene_1 Core_gene_2 gene type -complete_genome_single_chrom A C B low_frequency +complete_genome_single_chrom A C B low_frequency complete_genome_single_chrom_2 A C E low_frequency genome_single_chrom_larger A C B low_frequency genome_single_chrom_larger C Sequence_break D low_frequency genome_single_chrom_larger C Sequence_break E low_frequency genome_single_chrom_larger C Sequence_break F low_frequency -genome_single_chrom_larger C Sequence_break G low_frequency - +genome_single_chrom_larger C Sequence_break G low_frequency diff --git a/functional_tests/test_data/Increase_low_cutoff_expected/low_frequency_gene_placement.tsv.expected b/functional_tests/test_data/Increase_low_cutoff_expected/low_frequency_gene_placement.tsv.expected index 8335562..e0313a2 100644 --- a/functional_tests/test_data/Increase_low_cutoff_expected/low_frequency_gene_placement.tsv.expected +++ b/functional_tests/test_data/Increase_low_cutoff_expected/low_frequency_gene_placement.tsv.expected @@ -1,10 +1,10 @@ Gff Core_gene_1 Core_gene_2 Core_region_size Core_region_accessory_count complete_genome_single_chrom A C 109 1 -complete_genome_single_chrom A Sequence_break 0 0 -complete_genome_single_chrom C Sequence_break 10 0 complete_genome_single_chrom_2 A C 109 1 -complete_genome_single_chrom_2 A Sequence_break 0 0 -complete_genome_single_chrom_2 C Sequence_break 10 0 genome_single_chrom_larger A C 109 1 +complete_genome_single_chrom A Sequence_break 0 0 +complete_genome_single_chrom_2 A Sequence_break 0 0 
genome_single_chrom_larger A Sequence_break 0 0 +complete_genome_single_chrom C Sequence_break 10 0 +complete_genome_single_chrom_2 C Sequence_break 10 0 genome_single_chrom_larger C Sequence_break 310 4 diff --git a/functional_tests/test_data/core_90_cutoff_expected/low_frequency_gene_placement.tsv.expected b/functional_tests/test_data/core_90_cutoff_expected/low_frequency_gene_placement.tsv.expected index 4598f2f..90a8203 100644 --- a/functional_tests/test_data/core_90_cutoff_expected/low_frequency_gene_placement.tsv.expected +++ b/functional_tests/test_data/core_90_cutoff_expected/low_frequency_gene_placement.tsv.expected @@ -1,14 +1,14 @@ Gff Core_gene_1 Core_gene_2 Core_region_size Core_region_accessory_count complete_genome_single_chrom A B 9 0 -complete_genome_single_chrom B C 9 0 -complete_genome_single_chrom A Sequence_break 0 0 -complete_genome_single_chrom C Sequence_break 10 0 +genome_single_chrom_larger A B 9 0 complete_genome_single_chrom_2 A E 9 0 -complete_genome_single_chrom_2 C E 9 0 +complete_genome_single_chrom A Sequence_break 0 0 complete_genome_single_chrom_2 A Sequence_break 0 0 -complete_genome_single_chrom_2 C Sequence_break 10 0 -genome_single_chrom_larger A B 9 0 +genome_single_chrom_larger A Sequence_break 0 0 +complete_genome_single_chrom B C 9 0 genome_single_chrom_larger B C 9 0 +complete_genome_single_chrom_2 C E 9 0 genome_single_chrom_larger C E 109 1 -genome_single_chrom_larger A Sequence_break 0 0 -genome_single_chrom_larger E Sequence_break 110 2 +complete_genome_single_chrom C Sequence_break 10 0 +complete_genome_single_chrom_2 C Sequence_break 10 0 +genome_single_chrom_larger E Sequence_break 110 2 diff --git a/functional_tests/test_data/low_freq_cutoff_0_expected/core_core_accessory_gene_content.tsv.expected b/functional_tests/test_data/low_freq_cutoff_0_expected/core_core_accessory_gene_content.tsv.expected index a3128c2..03e8ab7 100644 --- 
a/functional_tests/test_data/low_freq_cutoff_0_expected/core_core_accessory_gene_content.tsv.expected +++ b/functional_tests/test_data/low_freq_cutoff_0_expected/core_core_accessory_gene_content.tsv.expected @@ -1,8 +1,8 @@ Gff Core_gene_1 Core_gene_2 gene type -complete_genome_single_chrom A C B intermediate_frequency -complete_genome_single_chrom_2 A C E intermediate_frequency +complete_genome_single_chrom A C B intermediate_frequency +complete_genome_single_chrom_2 A C E intermediate_frequency genome_single_chrom_larger A C B intermediate_frequency genome_single_chrom_larger C Sequence_break D intermediate_frequency genome_single_chrom_larger C Sequence_break E intermediate_frequency genome_single_chrom_larger C Sequence_break F intermediate_frequency -genome_single_chrom_larger C Sequence_break G intermediate_frequency +genome_single_chrom_larger C Sequence_break G intermediate_frequency diff --git a/functional_tests/test_data/low_freq_cutoff_0_expected/low_frequency_gene_placement.tsv.expected b/functional_tests/test_data/low_freq_cutoff_0_expected/low_frequency_gene_placement.tsv.expected index fec55a8..e0313a2 100644 --- a/functional_tests/test_data/low_freq_cutoff_0_expected/low_frequency_gene_placement.tsv.expected +++ b/functional_tests/test_data/low_freq_cutoff_0_expected/low_frequency_gene_placement.tsv.expected @@ -1,10 +1,10 @@ Gff Core_gene_1 Core_gene_2 Core_region_size Core_region_accessory_count complete_genome_single_chrom A C 109 1 -complete_genome_single_chrom A Sequence_break 0 0 -complete_genome_single_chrom C Sequence_break 10 0 -genome_single_chrom_larger A C 109 1 -genome_single_chrom_larger A Sequence_break 0 0 -genome_single_chrom_larger C Sequence_break 310 4 complete_genome_single_chrom_2 A C 109 1 +genome_single_chrom_larger A C 109 1 +complete_genome_single_chrom A Sequence_break 0 0 complete_genome_single_chrom_2 A Sequence_break 0 0 +genome_single_chrom_larger A Sequence_break 0 0 +complete_genome_single_chrom C Sequence_break 10 0 
complete_genome_single_chrom_2 C Sequence_break 10 0 +genome_single_chrom_larger C Sequence_break 310 4 From adc5b91099de0a29efd42898f8bc103bd25faa2c Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Thu, 6 Jan 2022 13:29:40 +1100 Subject: [PATCH 050/135] Adjust unit test after adding sorting of ficts --- .../unit_test_data/TestWritingOutputFunction/low_freq.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unit_tests/unit_test_data/TestWritingOutputFunction/low_freq.txt b/unit_tests/unit_test_data/TestWritingOutputFunction/low_freq.txt index b36c0e8..da074fc 100644 --- a/unit_tests/unit_test_data/TestWritingOutputFunction/low_freq.txt +++ b/unit_tests/unit_test_data/TestWritingOutputFunction/low_freq.txt @@ -3,7 +3,7 @@ genome_1 pan_cluster_1 pan_cluster_2 99 3 genome_2 pan_cluster_1 pan_cluster_2 99 3 genome_3 pan_cluster_1 pan_cluster_2 99 3 genome_1 pan_cluster_2 pan_cluster_3 100 2 -genome_2 pan_cluster_2 pan_cluster_4 150 1 genome_3 pan_cluster_2 pan_cluster_3 200 0 +genome_2 pan_cluster_2 pan_cluster_4 150 1 genome_1 pan_cluster_3 pan_cluster_4 -5 0 genome_3 pan_cluster_3 pan_cluster_4 -10 0 From c9078610cc9787ba8e1ff4d51fea1082b6551c72 Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Thu, 6 Jan 2022 13:39:58 +1100 Subject: [PATCH 051/135] Adjust outputs following sort statement of dict keys in output functions --- ...re_core_accessory_gene_content.tsv.expected | 2 +- .../low_frequency_gene_placement.tsv.expected | 8 ++++---- .../low_frequency_gene_placement.tsv.expected | 18 +++++++++--------- .../low_frequency_gene_placement.tsv.expected | 10 +++++----- .../low_frequency_gene_placement.tsv.expected | 8 ++++---- .../low_frequency_gene_placement.tsv.expected | 18 ++++++++++-------- 6 files changed, 33 insertions(+), 31 deletions(-) diff --git a/functional_tests/test_data/Accessory_chrom_run_expected/core_core_accessory_gene_content.tsv.expected 
b/functional_tests/test_data/Accessory_chrom_run_expected/core_core_accessory_gene_content.tsv.expected index f1c04da..d3e2626 100644 --- a/functional_tests/test_data/Accessory_chrom_run_expected/core_core_accessory_gene_content.tsv.expected +++ b/functional_tests/test_data/Accessory_chrom_run_expected/core_core_accessory_gene_content.tsv.expected @@ -1,5 +1,5 @@ Gff Core_gene_1 Core_gene_2 gene type genome_single_chrom_larger A C B_1 low_frequency +genome_single_chrom_larger_rearrange A C D_1 low_frequency genome_single_chrom_larger C E D_2 low_frequency -genome_single_chrom_larger_rearrange A C D_1 low_frequency genome_single_chrom_larger_rearrange C E B_2 low_frequency diff --git a/functional_tests/test_data/Accessory_chrom_run_expected/low_frequency_gene_placement.tsv.expected b/functional_tests/test_data/Accessory_chrom_run_expected/low_frequency_gene_placement.tsv.expected index b2ae7e9..e0e75aa 100644 --- a/functional_tests/test_data/Accessory_chrom_run_expected/low_frequency_gene_placement.tsv.expected +++ b/functional_tests/test_data/Accessory_chrom_run_expected/low_frequency_gene_placement.tsv.expected @@ -1,7 +1,7 @@ Gff Core_gene_1 Core_gene_2 Core_region_size Core_region_accessory_count -genome_single_chrom_larger A C 109 1 -genome_single_chrom_larger C E 109 1 -genome_single_chrom_larger A E 10 0 +genome_single_chrom_larger A C 109 1 genome_single_chrom_larger_rearrange A C 109 1 +genome_single_chrom_larger A E 10 0 +genome_single_chrom_larger_rearrange A E 10 0 +genome_single_chrom_larger C E 109 1 genome_single_chrom_larger_rearrange C E 109 1 -genome_single_chrom_larger_rearrange A E 10 0 diff --git a/functional_tests/test_data/Rearrangement_run_expected/low_frequency_gene_placement.tsv.expected b/functional_tests/test_data/Rearrangement_run_expected/low_frequency_gene_placement.tsv.expected index 1798da0..fea1b41 100644 --- a/functional_tests/test_data/Rearrangement_run_expected/low_frequency_gene_placement.tsv.expected +++ 
b/functional_tests/test_data/Rearrangement_run_expected/low_frequency_gene_placement.tsv.expected @@ -1,13 +1,13 @@ Gff Core_gene_1 Core_gene_2 Core_region_size Core_region_accessory_count -genome_single_chrom_larger A B 9 0 -genome_single_chrom_larger B C 9 0 -genome_single_chrom_larger C D 9 0 -genome_single_chrom_larger D E 9 0 -genome_single_chrom_larger E F 9 0 -genome_single_chrom_larger A F 10 1 -genome_single_chrom_larger_rearrange A D 9 0 -genome_single_chrom_larger_rearrange C D 9 0 +genome_single_chrom_larger A B 9 0 +genome_single_chrom_larger_rearrange A D 9 0 +genome_single_chrom_larger A F 10 1 +genome_single_chrom_larger_rearrange A F 10 0 +genome_single_chrom_larger B C 9 0 genome_single_chrom_larger_rearrange B C 9 0 genome_single_chrom_larger_rearrange B E 9 0 +genome_single_chrom_larger C D 9 0 +genome_single_chrom_larger_rearrange C D 9 0 +genome_single_chrom_larger D E 9 0 +genome_single_chrom_larger E F 9 0 genome_single_chrom_larger_rearrange E F 9 0 -genome_single_chrom_larger_rearrange A F 10 0 diff --git a/functional_tests/test_data/Simple_run_expected/low_frequency_gene_placement.tsv.expected b/functional_tests/test_data/Simple_run_expected/low_frequency_gene_placement.tsv.expected index 9e8becf..3327fca 100644 --- a/functional_tests/test_data/Simple_run_expected/low_frequency_gene_placement.tsv.expected +++ b/functional_tests/test_data/Simple_run_expected/low_frequency_gene_placement.tsv.expected @@ -1,9 +1,9 @@ Gff Core_gene_1 Core_gene_2 Core_region_size Core_region_accessory_count -complete_genome_single_chrom A B 9 0 +complete_genome_single_chrom A B 9 0 +complete_genome_single_chrom A Sequence_break 0 0 +complete_genome_single_chrom_2 A B 9 0 +complete_genome_single_chrom_2 A Sequence_break 0 0 complete_genome_single_chrom B C 9 0 -complete_genome_single_chrom A Sequence_break 0 0 -complete_genome_single_chrom C Sequence_break 10 0 -complete_genome_single_chrom_2 A B 9 0 complete_genome_single_chrom_2 B C 9 0 
-complete_genome_single_chrom_2 A Sequence_break 0 0 +complete_genome_single_chrom C Sequence_break 10 0 complete_genome_single_chrom_2 C Sequence_break 10 0 diff --git a/functional_tests/test_data/Single_comple_chromosome_expected/low_frequency_gene_placement.tsv.expected b/functional_tests/test_data/Single_comple_chromosome_expected/low_frequency_gene_placement.tsv.expected index 703697f..383d390 100644 --- a/functional_tests/test_data/Single_comple_chromosome_expected/low_frequency_gene_placement.tsv.expected +++ b/functional_tests/test_data/Single_comple_chromosome_expected/low_frequency_gene_placement.tsv.expected @@ -1,8 +1,8 @@ Gff Core_gene_1 Core_gene_2 Core_region_size Core_region_accessory_count -complete_genome_single_chrom A B 9 0 -complete_genome_single_chrom B C 9 0 +complete_genome_single_chrom A B 9 0 +complete_genome_single_chrom_2 A B 9 0 complete_genome_single_chrom A C 10 0 -complete_genome_single_chrom_2 A B 9 0 -complete_genome_single_chrom_2 B C 9 0 complete_genome_single_chrom_2 A Sequence_break 0 0 +complete_genome_single_chrom B C 9 0 +complete_genome_single_chrom_2 B C 9 0 complete_genome_single_chrom_2 C Sequence_break 10 0 diff --git a/functional_tests/test_data/double_comple_chromosome_expected/low_frequency_gene_placement.tsv.expected b/functional_tests/test_data/double_comple_chromosome_expected/low_frequency_gene_placement.tsv.expected index 1df4948..f5d6e45 100644 --- a/functional_tests/test_data/double_comple_chromosome_expected/low_frequency_gene_placement.tsv.expected +++ b/functional_tests/test_data/double_comple_chromosome_expected/low_frequency_gene_placement.tsv.expected @@ -1,13 +1,15 @@ Gff Core_gene_1 Core_gene_2 Core_region_size Core_region_accessory_count -complete_genome_double_chrom A B 9 0 -complete_genome_double_chrom B C 9 0 -complete_genome_double_chrom A C 10 0 -complete_genome_double_chrom D E 9 0 -complete_genome_double_chrom E F 9 0 -complete_genome_double_chrom D F 10 0 +complete_genome_double_chrom A B 9 0 
complete_genome_double_chrom_2 A B 9 0 +complete_genome_double_chrom A C 10 0 +complete_genome_double_chrom_2 A C 10 0 +complete_genome_double_chrom B C 9 0 complete_genome_double_chrom_2 B C 9 0 -complete_genome_double_chrom_2 A C 10 0 +complete_genome_double_chrom D E 9 0 complete_genome_double_chrom_2 D E 9 0 -complete_genome_double_chrom_2 E F 9 0 +complete_genome_double_chrom D F 10 0 complete_genome_double_chrom_2 D F 10 0 +complete_genome_double_chrom E F 9 0 +complete_genome_double_chrom_2 E F 9 0 + + From 3114e40df598adbd1520e2ae6b12d00289a0c5bc Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Thu, 6 Jan 2022 13:47:33 +1100 Subject: [PATCH 052/135] Adjust outputs following sort statement of dict keys in output functions --- .../core_core_accessory_gene_content.tsv.expected | 2 +- .../low_frequency_gene_placement.tsv.expected | 6 +++--- .../low_frequency_gene_placement.tsv.expected | 10 +++++----- .../low_frequency_gene_placement.tsv.expected | 6 +++--- .../low_frequency_gene_placement.tsv.expected | 4 ++-- .../low_frequency_gene_placement.tsv.expected | 8 ++++---- 6 files changed, 18 insertions(+), 18 deletions(-) diff --git a/functional_tests/test_data/Accessory_chrom_run_expected/core_core_accessory_gene_content.tsv.expected b/functional_tests/test_data/Accessory_chrom_run_expected/core_core_accessory_gene_content.tsv.expected index d3e2626..8e9f48f 100644 --- a/functional_tests/test_data/Accessory_chrom_run_expected/core_core_accessory_gene_content.tsv.expected +++ b/functional_tests/test_data/Accessory_chrom_run_expected/core_core_accessory_gene_content.tsv.expected @@ -1,5 +1,5 @@ Gff Core_gene_1 Core_gene_2 gene type genome_single_chrom_larger A C B_1 low_frequency -genome_single_chrom_larger_rearrange A C D_1 low_frequency +genome_single_chrom_larger_rearrange A C D_1 low_frequency genome_single_chrom_larger C E D_2 low_frequency genome_single_chrom_larger_rearrange C E B_2 low_frequency diff --git 
a/functional_tests/test_data/Accessory_chrom_run_expected/low_frequency_gene_placement.tsv.expected b/functional_tests/test_data/Accessory_chrom_run_expected/low_frequency_gene_placement.tsv.expected index e0e75aa..bf27c68 100644 --- a/functional_tests/test_data/Accessory_chrom_run_expected/low_frequency_gene_placement.tsv.expected +++ b/functional_tests/test_data/Accessory_chrom_run_expected/low_frequency_gene_placement.tsv.expected @@ -1,7 +1,7 @@ Gff Core_gene_1 Core_gene_2 Core_region_size Core_region_accessory_count -genome_single_chrom_larger A C 109 1 +genome_single_chrom_larger A C 109 1 genome_single_chrom_larger_rearrange A C 109 1 -genome_single_chrom_larger A E 10 0 -genome_single_chrom_larger_rearrange A E 10 0 +genome_single_chrom_larger A E 10 0 +genome_single_chrom_larger_rearrange A E 10 0 genome_single_chrom_larger C E 109 1 genome_single_chrom_larger_rearrange C E 109 1 diff --git a/functional_tests/test_data/Rearrangement_run_expected/low_frequency_gene_placement.tsv.expected b/functional_tests/test_data/Rearrangement_run_expected/low_frequency_gene_placement.tsv.expected index fea1b41..d8026be 100644 --- a/functional_tests/test_data/Rearrangement_run_expected/low_frequency_gene_placement.tsv.expected +++ b/functional_tests/test_data/Rearrangement_run_expected/low_frequency_gene_placement.tsv.expected @@ -1,12 +1,12 @@ Gff Core_gene_1 Core_gene_2 Core_region_size Core_region_accessory_count -genome_single_chrom_larger A B 9 0 -genome_single_chrom_larger_rearrange A D 9 0 -genome_single_chrom_larger A F 10 1 +genome_single_chrom_larger A B 9 0 +genome_single_chrom_larger_rearrange A D 9 0 +genome_single_chrom_larger A F 10 1 genome_single_chrom_larger_rearrange A F 10 0 -genome_single_chrom_larger B C 9 0 +genome_single_chrom_larger B C 9 0 genome_single_chrom_larger_rearrange B C 9 0 genome_single_chrom_larger_rearrange B E 9 0 -genome_single_chrom_larger C D 9 0 +genome_single_chrom_larger C D 9 0 genome_single_chrom_larger_rearrange C D 9 0 
genome_single_chrom_larger D E 9 0 genome_single_chrom_larger E F 9 0 diff --git a/functional_tests/test_data/Simple_run_expected/low_frequency_gene_placement.tsv.expected b/functional_tests/test_data/Simple_run_expected/low_frequency_gene_placement.tsv.expected index 3327fca..9c3b77e 100644 --- a/functional_tests/test_data/Simple_run_expected/low_frequency_gene_placement.tsv.expected +++ b/functional_tests/test_data/Simple_run_expected/low_frequency_gene_placement.tsv.expected @@ -1,7 +1,7 @@ Gff Core_gene_1 Core_gene_2 Core_region_size Core_region_accessory_count -complete_genome_single_chrom A B 9 0 -complete_genome_single_chrom A Sequence_break 0 0 -complete_genome_single_chrom_2 A B 9 0 +complete_genome_single_chrom A B 9 0 +complete_genome_single_chrom_2 A B 9 0 +complete_genome_single_chrom A Sequence_break 0 0 complete_genome_single_chrom_2 A Sequence_break 0 0 complete_genome_single_chrom B C 9 0 complete_genome_single_chrom_2 B C 9 0 diff --git a/functional_tests/test_data/Single_comple_chromosome_expected/low_frequency_gene_placement.tsv.expected b/functional_tests/test_data/Single_comple_chromosome_expected/low_frequency_gene_placement.tsv.expected index 383d390..89bad36 100644 --- a/functional_tests/test_data/Single_comple_chromosome_expected/low_frequency_gene_placement.tsv.expected +++ b/functional_tests/test_data/Single_comple_chromosome_expected/low_frequency_gene_placement.tsv.expected @@ -1,6 +1,6 @@ Gff Core_gene_1 Core_gene_2 Core_region_size Core_region_accessory_count -complete_genome_single_chrom A B 9 0 -complete_genome_single_chrom_2 A B 9 0 +complete_genome_single_chrom A B 9 0 +complete_genome_single_chrom_2 A B 9 0 complete_genome_single_chrom A C 10 0 complete_genome_single_chrom_2 A Sequence_break 0 0 complete_genome_single_chrom B C 9 0 diff --git a/functional_tests/test_data/double_comple_chromosome_expected/low_frequency_gene_placement.tsv.expected 
b/functional_tests/test_data/double_comple_chromosome_expected/low_frequency_gene_placement.tsv.expected index f5d6e45..79986b9 100644 --- a/functional_tests/test_data/double_comple_chromosome_expected/low_frequency_gene_placement.tsv.expected +++ b/functional_tests/test_data/double_comple_chromosome_expected/low_frequency_gene_placement.tsv.expected @@ -1,11 +1,11 @@ Gff Core_gene_1 Core_gene_2 Core_region_size Core_region_accessory_count -complete_genome_double_chrom A B 9 0 +complete_genome_double_chrom A B 9 0 complete_genome_double_chrom_2 A B 9 0 -complete_genome_double_chrom A C 10 0 -complete_genome_double_chrom_2 A C 10 0 +complete_genome_double_chrom A C 10 0 +complete_genome_double_chrom_2 A C 10 0 complete_genome_double_chrom B C 9 0 complete_genome_double_chrom_2 B C 9 0 -complete_genome_double_chrom D E 9 0 +complete_genome_double_chrom D E 9 0 complete_genome_double_chrom_2 D E 9 0 complete_genome_double_chrom D F 10 0 complete_genome_double_chrom_2 D F 10 0 From 65a71a5b10b22f28ec46ebbbc0e6c3d83c026043 Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Thu, 6 Jan 2022 13:50:10 +1100 Subject: [PATCH 053/135] Remove extra new lines in file --- .../low_frequency_gene_placement.tsv.expected | 2 -- 1 file changed, 2 deletions(-) diff --git a/functional_tests/test_data/double_comple_chromosome_expected/low_frequency_gene_placement.tsv.expected b/functional_tests/test_data/double_comple_chromosome_expected/low_frequency_gene_placement.tsv.expected index 79986b9..47709a3 100644 --- a/functional_tests/test_data/double_comple_chromosome_expected/low_frequency_gene_placement.tsv.expected +++ b/functional_tests/test_data/double_comple_chromosome_expected/low_frequency_gene_placement.tsv.expected @@ -11,5 +11,3 @@ complete_genome_double_chrom D F 10 0 complete_genome_double_chrom_2 D F 10 0 complete_genome_double_chrom E F 9 0 complete_genome_double_chrom_2 E F 9 0 - - From 28b3a8258590af667683cc7101d790c67901d8de Mon Sep 17 
00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Fri, 7 Jan 2022 11:02:27 +1100 Subject: [PATCH 054/135] Add functional test for providing less than all gffs from pan-genome as input --- functional_tests/Corekaburra-test.sh | 13 ++++++++++--- .../Less_than_all_gffs/gene_presence_absence.csv | 8 ++++++++ .../core_core_accessory_gene_content.tsv.expected | 6 ++++++ .../core_pair_summary.csv.expected | 9 +++++++++ .../core_segments.csv.expected | 3 +++ .../low_frequency_gene_placement.tsv.expected | 15 +++++++++++++++ .../no_accessory_core_segments.csv.expected | 3 +++ 7 files changed, 54 insertions(+), 3 deletions(-) create mode 100644 functional_tests/test_data/Less_than_all_gffs/gene_presence_absence.csv create mode 100644 functional_tests/test_data/Less_than_all_gffs_run_expected/core_core_accessory_gene_content.tsv.expected create mode 100644 functional_tests/test_data/Less_than_all_gffs_run_expected/core_pair_summary.csv.expected create mode 100644 functional_tests/test_data/Less_than_all_gffs_run_expected/core_segments.csv.expected create mode 100644 functional_tests/test_data/Less_than_all_gffs_run_expected/low_frequency_gene_placement.tsv.expected create mode 100644 functional_tests/test_data/Less_than_all_gffs_run_expected/no_accessory_core_segments.csv.expected diff --git a/functional_tests/Corekaburra-test.sh b/functional_tests/Corekaburra-test.sh index e966ac3..f1cf4fe 100755 --- a/functional_tests/Corekaburra-test.sh +++ b/functional_tests/Corekaburra-test.sh @@ -247,7 +247,6 @@ test_output_file test_out_folder/core_segments.csv Rearrangement_run_expected/co test_output_file test_out_folder/no_accessory_core_segments.csv Rearrangement_run_expected/no_accessory_core_segments.csv.expected rm -r test_out_folder -# TODO - test with decreased core-gene cutoff call_new_test "Test with decreased core-gene cutoff" Corekaburra -ig complete_genome_single_chrom.gff complete_genome_single_chrom_2.gff genome_single_chrom_larger.gff -ip 
Change_cutoffs -o test_out_folder -cc 0.9 > /dev/null 2>&1 test_output_file test_out_folder/core_core_accessory_gene_content.tsv core_90_cutoff_expected/core_core_accessory_gene_content.tsv.expected @@ -255,7 +254,6 @@ test_output_file test_out_folder/low_frequency_gene_placement.tsv core_90_cutoff test_output_file test_out_folder/core_pair_summary.csv core_90_cutoff_expected/core_pair_summary.csv.expected rm -r test_out_folder -# TODO - test with increase low cutoff call_new_test "Test with increase low cutoff" Corekaburra -ig complete_genome_single_chrom.gff complete_genome_single_chrom_2.gff genome_single_chrom_larger.gff -ip Change_cutoffs -o test_out_folder -lc 0.4 > /dev/null 2>&1 test_output_file test_out_folder/core_core_accessory_gene_content.tsv Increase_low_cutoff_expected/core_core_accessory_gene_content.tsv.expected @@ -263,7 +261,6 @@ test_output_file test_out_folder/low_frequency_gene_placement.tsv Increase_low_c test_output_file test_out_folder/core_pair_summary.csv Increase_low_cutoff_expected/core_pair_summary.csv.expected rm -r test_out_folder -# TODO - test with zero low cutoff call_new_test "Test with zero low cutoff" Corekaburra -ig complete_genome_single_chrom.gff complete_genome_single_chrom_2.gff genome_single_chrom_larger.gff -ip Change_cutoffs -o test_out_folder -lc 0 > /dev/null 2>&1 test_output_file test_out_folder/core_core_accessory_gene_content.tsv low_freq_cutoff_0_expected/core_core_accessory_gene_content.tsv.expected @@ -271,6 +268,16 @@ test_output_file test_out_folder/low_frequency_gene_placement.tsv low_freq_cutof test_output_file test_out_folder/core_pair_summary.csv low_freq_cutoff_0_expected/core_pair_summary.csv.expected rm -r test_out_folder +# TODO - Test with less than all gffs from pan-genome provided +call_new_test "Test with less than all gffs from pan-genome provided" +Corekaburra -ig complete_genome_single_chrom.gff genome_single_chrom_larger.gff genome_single_chrom_larger_rearrange.gff -ip Less_than_all_gffs -o 
test_out_folder -cc 0.9 > /dev/null 2>&1 +test_output_file test_out_folder/core_core_accessory_gene_content.tsv Less_than_all_gffs_run_expected/core_core_accessory_gene_content.tsv.expected +test_output_file test_out_folder/low_frequency_gene_placement.tsv Less_than_all_gffs_run_expected/low_frequency_gene_placement.tsv.expected +test_output_file test_out_folder/core_pair_summary.csv Less_than_all_gffs_run_expected/core_pair_summary.csv.expected +test_output_file test_out_folder/core_segments.csv Less_than_all_gffs_run_expected/core_segments.csv.expected +test_output_file test_out_folder/no_accessory_core_segments.csv Less_than_all_gffs_run_expected/no_accessory_core_segments.csv.expected +rm -r test_out_folder + # TODO - test Panaroo input w. correction # TODO - Add in corrections before this! diff --git a/functional_tests/test_data/Less_than_all_gffs/gene_presence_absence.csv b/functional_tests/test_data/Less_than_all_gffs/gene_presence_absence.csv new file mode 100644 index 0000000..e6f2dad --- /dev/null +++ b/functional_tests/test_data/Less_than_all_gffs/gene_presence_absence.csv @@ -0,0 +1,8 @@ +"","","","","","","","","","","","","","","genome_single_chrom_larger","complete_genome_single_chrom_2","complete_genome_single_chrom","genome_single_chrom_larger_rearrange" +"A","","","4","4","1","","","","","","","","","single_comp_A","single_comp_2_A","single_comp_A","single_comp_2_A" +"B","","","3","3","1","","","","","","","","","single_comp_B","","single_comp_B","single_comp_2_B" +"C","","","4","4","1","","","","","","","","","single_comp_C","single_comp_2_C","single_comp_C","single_comp_2_C" +"D","","","2","2","1","","","","","","","","","single_comp_D","","","single_comp_2_D" +"E","","","3","3","1","","","","","","","","","single_comp_E","single_comp_2_B","","single_comp_2_E" +"F","","","2","2","1","","","","","","","","","single_comp_F","","","single_comp_2_F" +"G","","","1","1","1","","","","","","","","","single_comp_G","","","" \ No newline at end of file 
diff --git a/functional_tests/test_data/Less_than_all_gffs_run_expected/core_core_accessory_gene_content.tsv.expected b/functional_tests/test_data/Less_than_all_gffs_run_expected/core_core_accessory_gene_content.tsv.expected new file mode 100644 index 0000000..f28d2d7 --- /dev/null +++ b/functional_tests/test_data/Less_than_all_gffs_run_expected/core_core_accessory_gene_content.tsv.expected @@ -0,0 +1,6 @@ +Gff Core_gene_1 Core_gene_2 gene type +genome_single_chrom_larger_rearrange A C D intermediate_frequency +genome_single_chrom_larger C E D intermediate_frequency +genome_single_chrom_larger E Sequence_break F intermediate_frequency +genome_single_chrom_larger E Sequence_break G low_frequency +genome_single_chrom_larger_rearrange E Sequence_break F intermediate_frequency diff --git a/functional_tests/test_data/Less_than_all_gffs_run_expected/core_pair_summary.csv.expected b/functional_tests/test_data/Less_than_all_gffs_run_expected/core_pair_summary.csv.expected new file mode 100644 index 0000000..bac3707 --- /dev/null +++ b/functional_tests/test_data/Less_than_all_gffs_run_expected/core_pair_summary.csv.expected @@ -0,0 +1,9 @@ +Core_pair,n,occurrence_core_1,occurrence_core_2,co_occurrence,min_dist,max_dist,mean_dist,median_dist,min_acc,max_acc,mean_acc,median_acc +A-B,2,4,3,3,9,9,9.0,9.0,0,0,0.0,0.0 +A-C,1,4,4,4,109,109,109.0,109.0,1,1,1.0,1.0 +A-Sequence_break,3,4,0,0,0,0,0.0,0.0,0,0,0.0,0.0 +B-C,3,3,4,3,9,9,9.0,9.0,0,0,0.0,0.0 +B-E,1,3,3,2,9,9,9.0,9.0,0,0,0.0,0.0 +C-E,1,4,3,3,109,109,109.0,109.0,1,1,1.0,1.0 +C-Sequence_break,1,4,0,0,10,10,10.0,10.0,0,0,0.0,0.0 +E-Sequence_break,2,3,0,0,110,110,110.0,110.0,1,2,1.5,1.5 diff --git a/functional_tests/test_data/Less_than_all_gffs_run_expected/core_segments.csv.expected b/functional_tests/test_data/Less_than_all_gffs_run_expected/core_segments.csv.expected new file mode 100644 index 0000000..a380ae3 --- /dev/null +++ b/functional_tests/test_data/Less_than_all_gffs_run_expected/core_segments.csv.expected @@ -0,0 
+1,3 @@ +Segment_name,Segment_position,Core_gene +B-C,1,B +B-C,2,C diff --git a/functional_tests/test_data/Less_than_all_gffs_run_expected/low_frequency_gene_placement.tsv.expected b/functional_tests/test_data/Less_than_all_gffs_run_expected/low_frequency_gene_placement.tsv.expected new file mode 100644 index 0000000..929708d --- /dev/null +++ b/functional_tests/test_data/Less_than_all_gffs_run_expected/low_frequency_gene_placement.tsv.expected @@ -0,0 +1,15 @@ +Gff Core_gene_1 Core_gene_2 Core_region_size Core_region_accessory_count +complete_genome_single_chrom A B 9 0 +genome_single_chrom_larger A B 9 0 +genome_single_chrom_larger_rearrange A C 109 1 +complete_genome_single_chrom A Sequence_break 0 0 +genome_single_chrom_larger A Sequence_break 0 0 +genome_single_chrom_larger_rearrange A Sequence_break 0 0 +complete_genome_single_chrom B C 9 0 +genome_single_chrom_larger B C 9 0 +genome_single_chrom_larger_rearrange B C 9 0 +genome_single_chrom_larger_rearrange B E 9 0 +genome_single_chrom_larger C E 109 1 +complete_genome_single_chrom C Sequence_break 10 0 +genome_single_chrom_larger E Sequence_break 110 2 +genome_single_chrom_larger_rearrange E Sequence_break 110 1 diff --git a/functional_tests/test_data/Less_than_all_gffs_run_expected/no_accessory_core_segments.csv.expected b/functional_tests/test_data/Less_than_all_gffs_run_expected/no_accessory_core_segments.csv.expected new file mode 100644 index 0000000..8cdcbeb --- /dev/null +++ b/functional_tests/test_data/Less_than_all_gffs_run_expected/no_accessory_core_segments.csv.expected @@ -0,0 +1,3 @@ +Parent_segment_name,Sub_segment_name,Parent_segment_position,Sub_segment_position,Core_gene +B-C,B-C,1,1,B +B-C,B-C,1,2,C From 97afeed30a50689c43eb85df499d2c2b9e7e79a3 Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Sun, 9 Jan 2022 13:16:15 +1100 Subject: [PATCH 055/135] Add in function to propperly refind genes, unit test and functional tests for these. 
Add in function to discard corrected genes after being segmented --- Code_to_transfer/correct_gffs.py | 305 ----------------- Corekaburra/__main__.py | 42 ++- Corekaburra/check_inputs.py | 7 + Corekaburra/commandline_interface.py | 24 +- Corekaburra/consesus_core_genome.py | 4 +- Corekaburra/correct_gffs.py | 319 ++++++++++++++++++ Corekaburra/exit_with_error.py | 1 + Corekaburra/gff_parser.py | 34 +- Corekaburra/parse_gene_presence_absence.py | 13 +- functional_tests/Corekaburra-test.sh | 15 +- .../Reannotate_run_fail/gene_data.csv | 6 + .../gene_presence_absence.csv | 8 + .../gene_presence_absence_roary.csv | 8 + .../Reannotate_run_succes/gene_data.csv | 7 + .../gene_presence_absence.csv | 8 + .../gene_presence_absence_roary.csv | 8 + ...nome_single_chrom_2_corrected.gff.expected | 14 + ...genome_single_chrom_corrected.gff.expected | 13 + ...om_larger_rearrange_corrected.gff.expected | 21 ++ ...e_core_accessory_gene_content.tsv.expected | 16 + .../core_pair_summary.csv.expected | 4 + .../low_frequency_gene_placement.tsv.expected | 13 + .../complete_genome_single_chrom.gff | 8 +- .../complete_genome_single_chrom_2.gff | 8 +- .../test_data/genome_single_chrom_larger.gff | 14 +- .../genome_single_chrom_larger_rearrange.gff | 14 +- unit_tests/Corekaburra_test.py | 283 +++++++++++++++- .../TestPresenceOfGenedataFile/.DS_Store | Bin 6148 -> 0 bytes 28 files changed, 845 insertions(+), 372 deletions(-) delete mode 100644 Code_to_transfer/correct_gffs.py create mode 100644 Corekaburra/correct_gffs.py create mode 100644 functional_tests/test_data/Reannotate_run_fail/gene_data.csv create mode 100644 functional_tests/test_data/Reannotate_run_fail/gene_presence_absence.csv create mode 100644 functional_tests/test_data/Reannotate_run_fail/gene_presence_absence_roary.csv create mode 100644 functional_tests/test_data/Reannotate_run_succes/gene_data.csv create mode 100644 functional_tests/test_data/Reannotate_run_succes/gene_presence_absence.csv create mode 100644 
functional_tests/test_data/Reannotate_run_succes/gene_presence_absence_roary.csv create mode 100644 functional_tests/test_data/Reannotation_sucessful_expected/Corrected_gff_files/complete_genome_single_chrom_2_corrected.gff.expected create mode 100644 functional_tests/test_data/Reannotation_sucessful_expected/Corrected_gff_files/complete_genome_single_chrom_corrected.gff.expected create mode 100644 functional_tests/test_data/Reannotation_sucessful_expected/Corrected_gff_files/genome_single_chrom_larger_rearrange_corrected.gff.expected create mode 100644 functional_tests/test_data/Reannotation_sucessful_expected/core_core_accessory_gene_content.tsv.expected create mode 100644 functional_tests/test_data/Reannotation_sucessful_expected/core_pair_summary.csv.expected create mode 100644 functional_tests/test_data/Reannotation_sucessful_expected/low_frequency_gene_placement.tsv.expected delete mode 100644 unit_tests/unit_test_data/TestPresenceOfGenedataFile/.DS_Store diff --git a/Code_to_transfer/correct_gffs.py b/Code_to_transfer/correct_gffs.py deleted file mode 100644 index 2b00fa7..0000000 --- a/Code_to_transfer/correct_gffs.py +++ /dev/null @@ -1,305 +0,0 @@ -from Bio import Seq -from Bio.SeqRecord import SeqRecord -import gffutils -from gffutils.gffwriter import GFFWriter -import os -import concurrent.futures -# from Bio.Blast import -from time import time - -# REMOVE! 
-from parse_gene_presence_absence import read_gene_presence_absence - - -def read_gene_data(gene_data_file): - """ Function to read the gene_data.csv file outputted by Panaroo and - return a dict of genomes with their refound genes""" - # Construct dictionary to hold refound genes and sequences for these - gene_data_dict = {} - - # Read the gene_data.csv file and record all refound genes - with open(gene_data_file, 'r') as gene_data: - - for line in gene_data.readlines(): - # Split read line at commas - line = line.split(',') - - # Check if refound gene - if 'refound' in line[2]: - # Try to add the refound gene to the gene_data dict as a key to value being the DNA sequence, - # if the genome is not found in gene_data dict, then construct dict for the genome and add the gene - try: - gene_data_dict[line[0]][line[2]] = line[5] - except KeyError: - gene_data_dict[line[0]] = {} - gene_data_dict[line[0]][line[2]] = line[5] - - # TODO - Find example of gene with differnt length and a premature stop codon. 
- # if 'len' in line[2]: - # print(line) - # if 'stop' in line[2]: - # print(line) - gene_data.close() - - return gene_data_dict - - -def extract_genome_fasta(gff_name): - """ Function to read a gff3 file and extract the fasta seuqnce at the end - each contig is returned as an entry in a dictionary """ - # Initialise the two return variables a dict for refound gene's annotaton and the largest locus_tag - genome_fasta_dict = {} - largest_locus_tag = '' - header_lines = [] - - # Open the gff file and indicate that the FASTA sequence has not beed reached - with open(gff_name, 'r') as gff_file: - found_fasta = False - - # Go through gff file and find and read the fasta seuqence at the end after the ##FASTA mark - for line in gff_file.readlines(): - if found_fasta: - # Check if line is fasta header, - # If then construct new dict key if not append sequence to current key - if '>' in line: - line = line.split(' ')[0] - try: - current_contig = line.strip() - current_contig = current_contig.split(">")[1] - genome_fasta_dict[current_contig] = '' - except KeyError: - raise KeyError("Some contig names contain redundant names when seperated by white space") - else: - line = line.split('\n')[0] - genome_fasta_dict[current_contig] = genome_fasta_dict[current_contig] + line - - # Check if FASTA part of gff file has been found, - # if then indicate to start recording sequences, - # else compare locus_tag of the line - elif '##FASTA' in line: - found_fasta = True - - else: - # Compare the locus_tag with the previously largest locus_tag, - # save the largest of the two - if '#' not in line: - line = line.split('\t') - line = line[8].split(';') - line = [element for element in line if 'locus_tag' in element] - # Examien non empty locus_tags - if len(line) > 0: - line = line[0] - line = line.split('locus_tag=')[1] - if line > largest_locus_tag: - largest_locus_tag = line - # Save header lines - else: - header_lines.append(line) - - return genome_fasta_dict, largest_locus_tag, 
header_lines - - -def construct_gff_line(gene_oi, genome_oi, contig, strand, annotation, refound_gene_tag, largest_locus_tag): - # Prepare the start, end, and locus_tag of the gene feature line. - gene_start = genome_oi.find(gene_oi) + 1 - gene_end = gene_start + len(gene_oi) - locus_tag_parts = largest_locus_tag.split('_') - locus_tag_parts[1] = int(locus_tag_parts[1]) + 1 - new_locus_tag = f'{locus_tag_parts[0]}_{locus_tag_parts[1]}' - - # construct tab delimited string containing the features of the gene - gff_line = f'{contig}\t' \ - f'Panaroo\t' \ - f'CDS\t' \ - f'{gene_start}\t' \ - f'{gene_end}\t' \ - f'.\t' \ - f'{strand}\t' \ - f'0\t' \ - f'ID={new_locus_tag};annotation={annotation};locus_tag={new_locus_tag};old_locus_tag={refound_gene_tag}' - - return gff_line, new_locus_tag - - -def add_gene_to_gff(tmp_gff, gene_oi, genome_oi, contig, strand, annotation, refound_gene_tag, largest_locus_tag): - gene_start = genome_oi.find(gene_oi) + 1 - gene_end = gene_start + len(gene_oi) - locus_tag_parts = largest_locus_tag.rsplit('_', maxsplit=1) - locus_tag_parts[1] = int(locus_tag_parts[1]) + 1 - new_locus_tag = f'{locus_tag_parts[0]}_{locus_tag_parts[1]}' - - # construct tab delimited string containing the features of the gene - gff_line = f'{contig}\t' \ - f'Panaroo\t' \ - f'CDS\t' \ - f'{gene_start}\t' \ - f'{gene_end}\t' \ - f'.\t' \ - f'{strand}\t' \ - f'0\t' \ - f'ID={new_locus_tag};annotation={annotation};locus_tag={new_locus_tag};old_locus_tag={refound_gene_tag}' - - with open(tmp_gff, 'a') as tmp_gff_file: - tmp_gff_file.write(gff_line + '\n') - tmp_gff_file.close() - - return new_locus_tag - - -def write_contig(file, contig_name, sequnce): - # Write contig name - file.write(f'>{contig_name}\n') - - # Write bulk of sequence - for i in range(len(sequnce) // 60): - file.write(sequnce[0+60*i:60+60*i] + '\n') - - # Write remainder of sequence - remainder = len(sequnce) % 60 - genome_length = len(sequnce) - file.write(sequnce[len(sequnce) - 
remainder:genome_length+1] + '\n') - - -def annotate_refound_genes(gff_name, gene_data_dict, temp_folder_path, annotation_dict, corrected_gff_out_dir, i): - """ Function to annotate the genes refound by Panaroo in a gff3 file""" - # Print info on progress - if (i+1) % 25 == 0 or i == 0: - print(f"Correcting GFF file #{i+1}") - - # Read in a gff file - # Get base name of gff file and construct path to database in temporary folder - gff_file_name = os.path.basename(gff_name) - data_base = os.path.join(temp_folder_path, f'{gff_file_name}_db') - - # Create a database for the gff file - gffutils.create_db(gff_name, data_base) - # Attach database - gff_db = gffutils.FeatureDB(data_base) - - # Write quick tmp database for appending new genes. - tmp_gff = os.path.join(temp_folder_path, f'{gff_file_name.split(".gff")[0]}_tmp.gff') - with open(tmp_gff, 'w') as gff_file: - for feature in gff_db.all_features(): - gff_file.writelines(str(feature) + '\n') - gff_file.close() - - # Pass the gff file manually to extract the genome fasta sequence(s) and the largest locus_tag - fasta_genome, largest_locus_tag, header_lines = extract_genome_fasta(gff_name) - - # Find all refound genes for given genome in gene data file - genome_name = gff_file_name.split('.')[0] - - # Search for the refound genes and record their coordinate, strand, - for refound_gene in gene_data_dict[genome_name].keys(): - gene_oi = gene_data_dict[genome_name][refound_gene] - - strand = None - contig_counter = 0 - contigs = list(fasta_genome.keys()) - while strand is None: - contig = contigs[contig_counter] - genome_oi = fasta_genome[contig] - - if gene_oi in genome_oi: - strand = '+' - - else: - # get reverse complement of the gene - gene_oi = Seq.reverse_complement(gene_oi) - if gene_oi in genome_oi: - strand = '-' - - contig_counter += 1 - - if strand is not None: - # Add the gene to the gff file. 
- largest_locus_tag = add_gene_to_gff(tmp_gff, gene_oi, genome_oi, contig, strand, - annotation_dict[refound_gene], refound_gene, largest_locus_tag) - - else: - raise ValueError(f"When correcting gff {gff_name}, the gene: {refound_gene} " - f"did not have any hit in the genome!") - - # Construct a database from the temporary gff that contain the added annotations - path_tmp_gff_db = os.path.join(temp_folder_path, f'{gff_file_name}_tmp_db') - # make database - gffutils.create_db(tmp_gff, path_tmp_gff_db) - # Attach database - tmp_gff_db = gffutils.FeatureDB(path_tmp_gff_db) - - # Print the final GFF3 file - with open(os.path.join(corrected_gff_out_dir, f'{gff_file_name.split(".gff")[0]}_corrected.gff'), 'w') as gff_file: - # Write initial lines - for line in header_lines: - gff_file.write(line) - - # ADD the gff lines - for feature in tmp_gff_db.all_features(order_by=('seqid', 'start')): - gff_file.writelines(str(feature) + '\n') - - # Write line to seperate genome fasta - gff_file.write("##FASTA\n") - - # Write the genome fasta - for contig_name in fasta_genome.keys(): - write_contig(gff_file, contig_name, fasta_genome[contig_name]) - - # remove database for gff file and the temporary gff file in temporary folder - os.remove(data_base) - os.remove(tmp_gff) - os.remove(path_tmp_gff_db) - - -def correct_gffs(gffs, gene_data_file, output_folder, annotation_dict, temp_folder_path): - # TODO - Make verbose - print("Reading gene_data.csv") - # Read Gene_data.csv file into dict with a dict of refound genes for each genome - gene_data_dict = read_gene_data(gene_data_file) - - # Construct directory to hold corrected gff files: - corrected_gff_out_dir = os.path.join(output_folder, 'Corrected_gff_files') - # Try and construct folder, if present check if content matches input to avoid process - try: - os.mkdir(corrected_gff_out_dir) - except FileExistsError: - corrected_folder_content = os.listdir(corrected_gff_out_dir) - - gff_names = [os.path.basename(gff) for gff in gffs] - 
- corrected_files = [file for file in corrected_folder_content if f'{file.split("_corrected")[0]}.gff' in gff_names] - if len(corrected_files) == len(gffs): - os.rmdir(temp_folder_path) - return corrected_gff_out_dir - - else: - os.rmdir('genome_corer_tmp') - raise FileNotFoundError - #TODO - RAISE ERROR THAT SOME FILES ARE NOT IN THE corrected - # Find the files and process them - # If none of the files match then abort. - - - # TODO - Make verbose - print("Start correcting GFF files with refound genes from Panaroo") - # Multi process the annotation of the genomes. (Process has been found to be better than threads) - total_time = time() - with concurrent.futures.ProcessPoolExecutor(max_workers=15) as executor: - result = [executor.submit(annotate_refound_genes, gff, gene_data_dict, temp_folder_path, - annotation_dict, corrected_gff_out_dir, i) for i, gff in enumerate(gffs)] - - for f in concurrent.futures.as_completed(result): - f.result() - - - print(f"total time for correction {time() - total_time}") - - return corrected_gff_out_dir - - -if __name__ == '__main__': - _, _, attribute_dict = read_gene_presence_absence('/Users/mjespersen/OneDrive - The University of Melbourne/Phd/Parts/Recombination_hotspots/Code/Between_core_variation/data_for_unit_tests/test_pan_genome/50_refseq_pan_split_paralogs/gene_presence_absence_roary.csv', - 1, 0.05) - - correct_gffs(['/Users/mjespersen/OneDrive - The University of Melbourne/Phd/Parts/Recombination_hotspots/Code/Between_core_variation/data_for_unit_tests/test_pan_genome/50_refseq_genomes/GCA_008694005.gff'], '/Users/mjespersen/OneDrive - The University of Melbourne/Phd/Parts/Recombination_hotspots/Code/Between_core_variation/data_for_unit_tests/test_pan_genome/50_refseq_pan_split_paralogs/gene_data.csv', - "/Users/mjespersen/OneDrive - The University of Melbourne/Phd/Parts/Recombination_hotspots/Code/Between_core_variation/data_for_unit_tests", attribute_dict) - # genome_dict = 
extract_genome_fasta('/Users/mjespersen/OneDrive - The University of Melbourne/Phd/Parts/Recombination_hotspots/Code/Between_core_variation/data_for_unit_tests/test_pan_genome/50_refseq_genomes/GCA_000006785.gff') diff --git a/Corekaburra/__main__.py b/Corekaburra/__main__.py index 4655f19..dafbd49 100644 --- a/Corekaburra/__main__.py +++ b/Corekaburra/__main__.py @@ -35,6 +35,11 @@ except ModuleNotFoundError: from parse_gene_presence_absence import read_gene_presence_absence +try: + from Corekaburra.correct_gffs import prepair_for_reannotation +except ModuleNotFoundError: + from correct_gffs import prepair_for_reannotation + try: from Corekaburra.gff_parser import segment_genome_content except ModuleNotFoundError: @@ -65,7 +70,7 @@ EXIT_INPUT_FILE_ERROR = 1 EXIT_COMMAND_LINE_ERROR = 2 -EXIT_FASTA_FILE_ERROR = 3 +EXIT_GFF_REANNOTATION_ERROR = 3 DEFAULT_MIN_LEN = 0 DEFAULT_VERBOSE = False PROGRAM_NAME = "Corekaburra" @@ -160,6 +165,9 @@ def main(): # Check if gene_data file is present if Panaroo input is given an gffs should be annotated if args.annotate and source_program == 'Panaroo': gene_data_path = check_gene_data(args.input_pan) + else: + gene_data_path = None + if not args.quiet: print(f"Pan genome determined to come from {source_program}") print("All files found, let's move on!\n") @@ -184,16 +192,21 @@ def main(): ## Read in gene presence absence file time_start = time.time() - # TODO - Add the user specified thresholds for core and low frequency genes. # TODO - ATM the column with presence of gene in genomes is used to define what is core and not. Is it better to use the number of input gffs instead? # - There are upsides to the current. You can use the same genome to find segments for two different populations with in the dataset using the same reference of core-genes # - Making it depend on the input is not viable for comparing runs, even within the same pan-genome, when using different sets of gff files. 
# TODO - Some day it would be awesome to be able to provide a clustering/population structure which could divide genes into the 13 definitions outlined by Horesh et al. [DOI: 10.1099/mgen.0.000670] - core_dict, low_freq_dict, acc_gene_dict, attribute_dict = read_gene_presence_absence(input_pres_abs_file_path, + core_dict, low_freq_dict, acc_gene_dict = read_gene_presence_absence(input_pres_abs_file_path, args.core_cutoff, args.low_cutoff, source_program, args.input_gffs, tmp_folder_path) + if source_program == "Panaroo" and args.annotate: + gene_data_dict, corrected_dict, args.input_gffs = prepair_for_reannotation(gene_data_path, args.output_path, args.input_gffs) + else: + gene_data_dict = None + corrected_dict = None + # TODO - Add this into the multiprocessing loop to not doubble files # TODO - Add a user command to keep and discard the corrected files (But still using them - Make mutually exclusive with -a option) # Add in the refound genes into the gff files and print the corrected GFF files. 
@@ -218,14 +231,26 @@ def main(): master_info_total = {} non_core_contig_info = {} - with concurrent.futures.ProcessPoolExecutor(max_workers=15) as executor: # TODO - change the max workers to the user specified number + progress_counter = 0 + if len(args.input_gffs) > 10: + progress_update = len(args.input_gffs) / 10 + else: + progress_update = 1 + + with concurrent.futures.ProcessPoolExecutor(max_workers=args.cpu) as executor: print(f"\n------Start core region identification of given gff files-----") print(f'{len(args.input_gffs)} GFF files to process') - results = [executor.submit(segment_genome_content, gff, core_dict, low_freq_dict, acc_gene_dict, i, comp_genomes) - for i, gff in enumerate(args.input_gffs)] + results = [executor.submit(segment_genome_content, gff, core_dict, low_freq_dict, acc_gene_dict, comp_genomes, + source_program, args.annotate, gene_data_dict, corrected_dict, tmp_folder_path, args.discard_gffs) + for gff in args.input_gffs] for output in concurrent.futures.as_completed(results): + progress_counter += 1 + if progress_counter % progress_update == 0 or progress_counter == 1: + print( + f"GFF file #{progress_counter} has been processed") + # Split the outputs core_pairs, distance, acc_count, \ low_freq, master_info_return, \ @@ -266,11 +291,9 @@ def main(): time_start = time.time() master_info_writer(master_info_total, args.output_path, args.output_prefix, args.quiet) summary_info_writer(master_summary_info, args.output_path, args.output_prefix, args.quiet) - # TODO - Contruct output for segments - parent column. if double_edge_segements is not None: segment_writer(double_edge_segements, args.output_path, args.output_prefix, args.quiet) no_acc_segment_writer(no_acc_segments, args.output_path, args.output_prefix, args.quiet) - # print(non_core_contig_info) TODO - Print core less contigs. # TODO - Possibly output core gene graph. with segment annotations? 
# time_calculator(time_start, time.time(), "writing output files") @@ -279,9 +302,10 @@ def main(): # time_calculator(total_time_start, time.time(), "running the entire program") # Remove temporary database holding gff databases - # TODO - Implement a nice crash function where the temporary folder is removed not to cause unessecary frustration for the user when trying to rerun the program. - do so in nice exit function if os.path.isdir(tmp_folder_path): os.rmdir(tmp_folder_path) + if args.discard_gffs: + os.rmdir(os.path.join(args.output_path, 'Corrected_gff_files')) if __name__ == '__main__': diff --git a/Corekaburra/check_inputs.py b/Corekaburra/check_inputs.py index 8bc5dab..9a489ae 100644 --- a/Corekaburra/check_inputs.py +++ b/Corekaburra/check_inputs.py @@ -10,6 +10,12 @@ def check_cutoffs(low_cutoff, core_cutoff): + """ + Function to check the given cutoffs are legal, otherwise provide more info. + :param low_cutoff: Cutoff for low-frequency genes + :param core_cutoff: Cutoff for core genes + :return: Nothing + """ if 0 <= low_cutoff < core_cutoff <= 1: return else: @@ -18,6 +24,7 @@ def check_cutoffs(low_cutoff, core_cutoff): 'Also make sure that the low-frequency gene cutoff is either equal to 0 or <1', EXIT_COMMAND_LINE_ERROR) + def define_pangenome_program(folder): """ Function to examine if input pan genome folder stems from Roary or Panaroo. 
diff --git a/Corekaburra/commandline_interface.py b/Corekaburra/commandline_interface.py index e5f4960..817265f 100644 --- a/Corekaburra/commandline_interface.py +++ b/Corekaburra/commandline_interface.py @@ -88,14 +88,22 @@ def get_commandline_arguments(args): default=None, dest='output_prefix') + output_control.add_argument('-d', + '--discard_corrected', + help='Discard gff files corrected with refound genes identified by Panaroo - Only compativle if pan-genome comes from Panaroo [Default: Corrected files are kept]', + required=False, + default=False, + action='store_true', + dest='discard_gffs') + rem_args.add_argument('-c', - '--cpu', - help='Give max number of CPUs [default: 1]', - required=False, - metavar='int', - default=1, - type=int, - dest='cpu') + '--cpu', + help='Give max number of CPUs [default: 1]', + required=False, + metavar='int', + default=1, + type=int, + dest='cpu') logger_level = rem_args.add_mutually_exclusive_group() logger_level.add_argument('-l', @@ -126,8 +134,6 @@ def get_commandline_arguments(args): elif '-help' in args: parser.print_help() sys.exit(0) - if '--check' in args: - sys.exit(1) #TODO write script that checks for dependencies! args = parser.parse_args(args) diff --git a/Corekaburra/consesus_core_genome.py b/Corekaburra/consesus_core_genome.py index 5af46c1..ec34761 100644 --- a/Corekaburra/consesus_core_genome.py +++ b/Corekaburra/consesus_core_genome.py @@ -62,6 +62,7 @@ def identify_no_accessory_segments(double_edge_segements, combined_acc_gene_coun :param combined_acc_gene_count: Dict of the number of accessory genes (value) identified between a set core genes (Key) :return: Dict of subsegments. Same keys as for the segment dict, but keys are a list of lists. Each sub-list is a subsegment. """ + # TODO - ATM this does not occur if every core gene only has two connections. Should is still occur to let a complete static genome synteny be divided into no-accessory segments? 
# Create dict of subsegments of the larger segments sub_segment_dict = {key: [] for key in double_edge_segements} @@ -244,7 +245,7 @@ def identify_segments(core_graph, num_gffs, core_gene_dict): if segment_length - 2 == two_degree_segment_length and two_degree_segment_length != 0: # TODO - should this != 0 be here? double_edge_segements[suspected_pair] = path path_identified = True - pass # TODO - is this correct + continue else: # Check if path is length >2, # if then find >2 degree nodes and remove an edge to them, @@ -294,5 +295,6 @@ def determine_genome_segments(core_neighbour_pairs, combined_acc_gene_count, num return double_edge_segements, no_acc_segments + if __name__ == '__main__': pass \ No newline at end of file diff --git a/Corekaburra/correct_gffs.py b/Corekaburra/correct_gffs.py new file mode 100644 index 0000000..f3f9d23 --- /dev/null +++ b/Corekaburra/correct_gffs.py @@ -0,0 +1,319 @@ +from Bio import Seq +from Bio.SeqRecord import SeqRecord +import gffutils +from gffutils.gffwriter import GFFWriter +import os +import concurrent.futures +# from Bio.Blast import +from time import time + +try: + from Corekaburra.exit_with_error import exit_with_error +except ModuleNotFoundError: + from exit_with_error import exit_with_error +EXIT_GFF_REANNOTATION_ERROR = 3 + + +def read_gene_data(gene_data_file): + """ + Function to read the gene_data.csv file outputted by Panaroo and + :param gene_data_file: File path to the gene_data.csv file + :return: A dict of genomes with their refound genes + """ + + # Construct dictionary to hold refound genes and sequences for these + gene_data_dict = {} + + # Read the gene_data.csv file and record all refound genes + with open(gene_data_file, 'r') as gene_data: + + for line in gene_data.readlines(): + # Split read line at commas + line = line.split(',') + + # Check if refound gene + if 'refound' in line[2]: + # Try to add the refound gene to the gene_data dict as a second key, value being the DNA sequence, + # if the first 
key (genome) is not found in gene_data dict, + # then construct dict for the genome and add the gene + try: + gene_data_dict[line[0]][line[2]] = [line[5], line[6], line[7].strip()] + except KeyError: + gene_data_dict[line[0]] = {line[2]: [line[5], line[6], line[7].strip()]} + + return gene_data_dict + + +def prepair_for_reannotation(gene_data_path, output_folder, gffs): + """ + Function for creating an output folder for corrected genomes, check if any are present, and if then which. + :param gene_data_path: Path to the gene_data.csv file from Panaroo + :param output_folder: Folder designated as the output folder for Corekaburra + :param gffs: List of file-paths to gff files. + + :return gene_data_dict: Dict containing the information expected from the gene_data.csv file + :return corrected_gff_out_dir: File path to the created or identified directory of corrected gff files + :return gffs: List of gff files, some may be altered to be the corrected verison from prior runs/ + """ + # Read Gene_data.csv file into dict with a dict of refound genes for each genome + gene_data_dict = read_gene_data(gene_data_path) + + # Construct directory to hold corrected gff files: + corrected_gff_out_dir = os.path.join(output_folder, 'Corrected_gff_files') + # Try and construct folder, + # if present check if content matches input to avoid process + try: + os.mkdir(corrected_gff_out_dir) + except FileExistsError: + corrected_folder_content = os.listdir(corrected_gff_out_dir) + + gff_names = [os.path.basename(gff) for gff in gffs] + + corrected_files = [file for file in corrected_folder_content if + f'{file.split("_corrected")[0]}.gff' in gff_names] + + if len(corrected_files) > 0: + gffs = [file for file in gff_names if f'{file.replace(".gff", "")}_corrected.gff' not in corrected_files] + gffs = gffs + corrected_files + + return gene_data_dict, corrected_gff_out_dir, gffs + + +def extract_genome_fasta(gff_name): + """ + Function to read and extract information from a gff3 file + 
:param gff_name: File path to a gff file + + :return genome_fasta_dict: Dict over contig names as keys, and values being the contig sequence + :return largest_locus_tag: the largest locus_tag identified in gff file + :return header_lines: the header lines proceeding annotations. + """ + + # Initialise the two return variables a dict for refound gene's annotaton and the largest locus_tag + genome_fasta_dict = {} + largest_locus_tag = '' + header_lines = [] + + # Open the gff file and indicate that the FASTA sequence has not beed reached + with open(gff_name, 'r') as gff_file: + found_fasta = False + + # Go through gff file and find and read the fasta seuqence at the end after the ##FASTA mark + for line in gff_file.readlines(): + if found_fasta: + # Check if line is fasta header, + # If then construct new dict key if not append sequence to current key + if '>' in line: + line = line.split(' ')[0] + try: + current_contig = line.strip() + current_contig = current_contig.split(">")[1] + genome_fasta_dict[current_contig] = '' + except KeyError: + raise KeyError("Some contig names contain redundant names when seperated by white space") + else: + line = line.split('\n')[0] + genome_fasta_dict[current_contig] = genome_fasta_dict[current_contig] + line + + # Check if FASTA part of gff file has been found, + # if then indicate to start recording sequences, + # else compare locus_tag of the line + elif '##FASTA' in line: + found_fasta = True + + else: + # Compare the locus_tag with the previously largest locus_tag, + # save the largest of the two + if '#' not in line: + line = line.split('\t') + line = line[8].split(';') + line = [element for element in line if 'locus_tag' in element] + # Examine non empty locus_tags + if len(line) > 0: + line = line[0] + line = line.split('locus_tag=')[1] + line = line.strip() + if line > largest_locus_tag: + largest_locus_tag = line + # Save header lines + else: + header_lines.append(line) + + return genome_fasta_dict, largest_locus_tag, 
header_lines + + +def add_gene_to_gff(tmp_gff, gene_oi, genome_oi, contig, strand, refound_gene_tag, annotation, largest_locus_tag): + """ + Function to construct and append a line to a file. + :param tmp_gff: An open file to append a line to + :param gene_oi: Gene in question + :param genome_oi: Genome in question + :param contig: Contig of the genome + :param strand: Strand of the gene in question + :param annotation: Any annotation found in Panaroo + :param refound_gene_tag: Tag given by Panaroo + :param largest_locus_tag: The current largest locus_tag + :return: The new largest locus_tag + """ + gene_start = genome_oi.find(gene_oi) + 1 + gene_end = gene_start + len(gene_oi) - 1 + locus_tag_parts = largest_locus_tag.rsplit('_', maxsplit=1) + tag_length = len(locus_tag_parts[1]) + + locus_tag_parts[1] = int(locus_tag_parts[1]) + 1 + preceding_zeros = str(0) * (tag_length - len(str(locus_tag_parts[1]))) + # Add preceding zeros + locus_tag_parts[1] = f'{preceding_zeros}{locus_tag_parts[1]}' + new_locus_tag = f'{locus_tag_parts[0]}_{locus_tag_parts[1]}' + description = annotation[1] + name = annotation[0] + + # Construct gff field 9 + info_field = f'ID={new_locus_tag};locus_tag={new_locus_tag};old_locus_tag={refound_gene_tag}' + + if name != '': + info_field += f';name={name}' + if description != '': + info_field += f';annotation={description}' + + # construct tab delimited string containing the features of the gene + gff_line = f'{contig}\t' \ + f'Panaroo\t' \ + f'CDS\t' \ + f'{gene_start}\t' \ + f'{gene_end}\t' \ + f'.\t' \ + f'{strand}\t' \ + f'0\t' \ + f'{info_field}' + + tmp_gff.write(gff_line + '\n') + + return new_locus_tag + + +def write_contig(file, contig_name, sequence): + """ + Write contig into file + :param file: Open file for appending contig to + :param contig_name: Name of the contig to be added + :param sequence: Sequence of contig to be added + :return: Nothing + """ + # Write contig name + file.write(f'>{contig_name}\n') + + # Write bulk of 
sequence + for i in range(len(sequence) // 60): + file.write(sequence[0+60*i:60+60*i] + '\n') + + # Write remainder of sequence + remainder = len(sequence) % 60 + genome_length = len(sequence) + file.write(sequence[len(sequence) - remainder:genome_length+1] + '\n') + + +def annotate_refound_genes(gff_name, gene_data_dict, tmp_folder_path, corrected_gff_out_dir): + """ + Function to add back in genes that are refound by Panaroo into gff files. + :param gff_name: File path of gff to be corrected + :param gene_data_dict: Dict of refound genes identified from gene_presence_absence.csv file + :param tmp_folder_path: File path to the temporary folder + :param corrected_gff_out_dir: File path to the folder where corrected genomes should be place + :return: Nothing. + """ + """ Function to annotate the genes refound by Panaroo in a gff3 file""" + # Read in a gff file + # Get base name of gff file and construct path to database in temporary folder + gff_file_name = os.path.basename(gff_name) + data_base = os.path.join(tmp_folder_path, f'{gff_file_name}_db') + + # Create a database for the gff file + gffutils.create_db(gff_name, data_base) + # Attach database + gff_db = gffutils.FeatureDB(data_base) + + # Write quick tmp database for appending new genes. 
+ tmp_gff = os.path.join(tmp_folder_path, f'{gff_file_name.split(".gff")[0]}_tmp.gff') + with open(tmp_gff, 'w') as gff_file: + for feature in gff_db.all_features(): + gff_file.writelines(str(feature) + '\n') + + # Pass the gff file manually to extract the genome fasta sequence(s) and the largest locus_tag + fasta_genome, largest_locus_tag, header_lines = extract_genome_fasta(gff_name) + + # Find all refound genes for given genome in gene data file + genome_name = gff_file_name.split('.')[0] + + # Search for the refound genes and record their coordinate, strand and add them to the gff file + with open(tmp_gff, 'a') as tmp_gff_file: + for refound_gene in gene_data_dict[genome_name]: # .keys() + gene_oi = gene_data_dict[genome_name][refound_gene][0] + + strand = None + contig_counter = 0 + contigs = list(fasta_genome) # .keys() + while strand is None and contig_counter < len(contigs) : + contig = contigs[contig_counter] + genome_oi = fasta_genome[contig] + + if gene_oi in genome_oi: + strand = '+' + + else: + # get reverse complement of the gene + gene_oi = Seq.reverse_complement(gene_oi) # TODO - Should evaluate if this is correct! Make test that test search for both forward and backward genes on first and second contig! + if gene_oi in genome_oi: + strand = '-' + + contig_counter += 1 + + if strand is not None: + # Add the gene to the gff file. 
+ largest_locus_tag = add_gene_to_gff(tmp_gff_file, gene_oi, genome_oi, contig, strand, + refound_gene, gene_data_dict[genome_name][refound_gene][1:], largest_locus_tag) + else: + exit_with_error(f"When correcting gff {gff_name}, the gene: {refound_gene} " + f"did not have any hit in the genome!", EXIT_GFF_REANNOTATION_ERROR) + + # Construct a database from the temporary gff that contain the added annotations + path_tmp_gff_db = os.path.join(tmp_folder_path, f'{gff_file_name}_tmp_db') + # make database + gffutils.create_db(tmp_gff, path_tmp_gff_db) + # Attach database + tmp_gff_db = gffutils.FeatureDB(path_tmp_gff_db) + + # Print the final GFF3 file + corrected_gff_file = os.path.join(corrected_gff_out_dir, f'{gff_file_name.split(".gff")[0]}_corrected.gff') + with open(corrected_gff_file, 'w') as gff_file: + # Write initial lines + for line in header_lines: + gff_file.write(line) + + # ADD the gff lines + for feature in tmp_gff_db.all_features(order_by=('seqid', 'start')): + gff_file.writelines(str(feature) + '\n') + + # Write line to separate genome fasta + gff_file.write("##FASTA\n") + + # Write the genome fasta + for contig_name in fasta_genome.keys(): + write_contig(gff_file, contig_name, fasta_genome[contig_name]) + + # remove database for gff file and the temporary gff file in temporary folder + os.remove(data_base) + os.remove(tmp_gff) + os.remove(path_tmp_gff_db) + + return corrected_gff_file + + +if __name__ == '__main__': + pass + # _, _, attribute_dict = read_gene_presence_absence('/Users/mjespersen/OneDrive - The University of Melbourne/Phd/Parts/Recombination_hotspots/Code/Between_core_variation/data_for_unit_tests/test_pan_genome/50_refseq_pan_split_paralogs/gene_presence_absence_roary.csv', + # 1, 0.05) + # + # correct_gffs(['/Users/mjespersen/OneDrive - The University of Melbourne/Phd/Parts/Recombination_hotspots/Code/Between_core_variation/data_for_unit_tests/test_pan_genome/50_refseq_genomes/GCA_008694005.gff'], '/Users/mjespersen/OneDrive - The 
University of Melbourne/Phd/Parts/Recombination_hotspots/Code/Between_core_variation/data_for_unit_tests/test_pan_genome/50_refseq_pan_split_paralogs/gene_data.csv', + # "/Users/mjespersen/OneDrive - The University of Melbourne/Phd/Parts/Recombination_hotspots/Code/Between_core_variation/data_for_unit_tests", attribute_dict) + # # genome_dict = extract_genome_fasta('/Users/mjespersen/OneDrive - The University of Melbourne/Phd/Parts/Recombination_hotspots/Code/Between_core_variation/data_for_unit_tests/test_pan_genome/50_refseq_genomes/GCA_000006785.gff') diff --git a/Corekaburra/exit_with_error.py b/Corekaburra/exit_with_error.py index fe1c46a..6412273 100644 --- a/Corekaburra/exit_with_error.py +++ b/Corekaburra/exit_with_error.py @@ -24,6 +24,7 @@ def exit_with_error(message, exit_status, tmp_folder=None): pass except FileNotFoundError: pass + # TODO - Implement a nice crash function where the temporary folder is removed not to cause unessecary frustration for the user when trying to rerun the program. 
- do so in nice exit function logging.error(message) print(f"Corekaburra ERROR: {message}, exiting", file=sys.stderr) diff --git a/Corekaburra/gff_parser.py b/Corekaburra/gff_parser.py index 76ef5f7..3acb7ec 100644 --- a/Corekaburra/gff_parser.py +++ b/Corekaburra/gff_parser.py @@ -1,6 +1,10 @@ -import numpy as np import os +try: + from Corekaburra.correct_gffs import annotate_refound_genes +except ModuleNotFoundError: + from correct_gffs import annotate_refound_genes + def parse_gff(input_file): """ @@ -20,6 +24,8 @@ def parse_gff(input_file): # See if refound gene or Prokka annotated and isolate ID in gene_presence_absence.csv accordingly if "old_locus_tag=" in line[8]: gene_id = line[8][line[8].find('old_locus_tag'):] + if ';' in gene_id: + gene_id = gene_id[:gene_id.find(';')] else: gene_id = line[8][line[8].find('ID'):line[8].find(';')] @@ -28,6 +34,7 @@ def parse_gff(input_file): line[8] = gene_id yield line + def get_contig_lengths(input_file): """ Function that takes an input gff file path and records the length of each contig in the file @@ -247,7 +254,7 @@ def connect_first_n_last_gene_on_contig(core_genes, gff_name, previous_core_gene last_first_low_freq_count] previous_core_gene_id = "" - previous_core_gene_end_coor = int(first_core_gene_gff_line[4]) # TODO - should this be the end or just some random large number? + previous_core_gene_end_coor = int(first_core_gene_gff_line[4]) acc_genes_in_region = [] low_freq_genes_in_region = [] @@ -382,7 +389,7 @@ def segment_gff_content(gff_generator, core_genes, low_freq_genes, gff_path, acc # Check if there is a core gene on traversed contig or if a core gene is present on the first contig - # if then record it, if not record the accessory and low frequency genes found on contig and reset. if previous_core_gene_id != "Sequence_break" and previous_core_gene_id != "": - if complete_genome: # TODO - Write a unit check for this good danm thing! 
- May be wrap in function to be used further down too, when last line in file has been read? *** + if complete_genome: (previous_core_gene_id, previous_core_gene_end_coor, acc_genes_in_region, @@ -447,7 +454,7 @@ def segment_gff_content(gff_generator, core_genes, low_freq_genes, gff_path, acc previous_core_gene_id = "Sequence_break" # Get the starting position of the first core gene on contig to record the gene. - # Make it negative to fit the calculation of the distance between genes. # TODO - Do we need to add one to further adjust + # Make it negative to fit the calculation of the distance between genes. cur_core_gene_start = -int(line[3]) (previous_core_gene_id, @@ -462,7 +469,7 @@ def segment_gff_content(gff_generator, core_genes, low_freq_genes, gff_path, acc cur_core_gene_start, acc_genes_in_region, low_freq_genes_in_region, core_gene_pair_distance, accessory_gene_content, low_freq_gene_content, - core_gene_pairs, master_info) # TODO - Why zero here? + core_gene_pairs, master_info) # Add as accessory - if first gene is not core else: @@ -552,7 +559,8 @@ def segment_gff_content(gff_generator, core_genes, low_freq_genes, gff_path, acc low_freq_gene_content, master_info, coreless_contigs -def segment_genome_content(input_gff_file, core_genes, low_freq_genes, acc_gene_dict, i, complete_genomes): +def segment_genome_content(input_gff_file, core_genes, low_freq_genes, acc_gene_dict, complete_genomes, source_program, + annotate, gene_data_dict, corrected_dict, tmp_folder_path, discard_corrected): """ Single function segmenting the gff into core gene regions to be used for simple multi processing :param input_gff_file: File-path to the given gff file to be segmented @@ -569,8 +577,13 @@ def segment_genome_content(input_gff_file, core_genes, low_freq_genes, acc_gene_ :return i: The index of the gff in the larger scheme of the analysis :return complete_genomes: List of genomes given as complete by the user. 
""" - if (i+1) % 25 == 0 or i == 0: - print(f"Determining core-core synteny for GFF file #{i+1}") # TODO - look what have been done for Magphi in recording progress! + + # Correct input gff file + # Add in the refound genes into the gff files and print the corrected GFF files. + if source_program == "Panaroo" and annotate: + # check if not already corrected file and if any gene is to be inserted at all + if "_corrected" not in input_gff_file and any([x in input_gff_file for x in list(gene_data_dict)]): + input_gff_file = annotate_refound_genes(input_gff_file, gene_data_dict, tmp_folder_path, corrected_dict) # TODO - likely check if genome should be corrected at this point in the process. - Would require more inputs. @@ -582,4 +595,9 @@ def segment_genome_content(input_gff_file, core_genes, low_freq_genes, acc_gene_ acc_genes=acc_gene_dict, complete_genomes=complete_genomes) + # TODO - Add in an if statement that checks if the corrected files should be kept! + # - If not then delete them and add an if statment that will delete the folder in the main script + if "_corrected" in input_gff_file and discard_corrected: + os.remove(input_gff_file) + return return_data diff --git a/Corekaburra/parse_gene_presence_absence.py b/Corekaburra/parse_gene_presence_absence.py index d6ff7ef..5d363b7 100644 --- a/Corekaburra/parse_gene_presence_absence.py +++ b/Corekaburra/parse_gene_presence_absence.py @@ -136,28 +136,19 @@ def read_gene_presence_absence(pres_abs_file, core_gene_presence, low_freq_gene, if verbose: print(f"\n------------Opening the gene presence/absence file------------\n") print(f"Core genes must be found in {core_gene_isolate_presence} or more isolates") - print(f"Low frequency genes must be found in {low_freq_gene_isolate_presence} or fewer isolates\n") + print(f"Low frequency genes must be found in less than {low_freq_gene_isolate_presence} isolates\n") # initialise dict of dicts to hold genes from each gffs and to be returned core_gene_dict = {item: {} for 
item in gff_file_names[14:]} low_freq_gene_dict = {item: {} for item in gff_file_names[14:]} acc_gene_dict = {item: {} for item in gff_file_names[14:]} - # Initialise dict that contain annotations - annotation_dict = {} - # Read lines from file and determine if core, low frequency or 'regular' accessory and record annotations for line in reader: # Remove quotes if Roary if source_program == 'Roary': line = [element.replace('"', '') for element in line] - # Record annotations of refound genes - if any(['refound' in gene for gene in line[14:]]): - refound_genes = [gene for gene in line[14:] if 'refound' in gene] - for gene in refound_genes: - annotation_dict[gene] = line[2] - # Get number of genes in line and average presence of genes in genomes gene_isolate_presence = int(line[3]) no_seq_presence = int(line[4]) @@ -234,7 +225,7 @@ def read_gene_presence_absence(pres_abs_file, core_gene_presence, low_freq_gene, gff_dbs = [file for file in files_in_tmp if '_db' in file] [os.remove(os.path.join(tmp_folder_path, db)) for db in gff_dbs] - return core_gene_dict, low_freq_gene_dict, acc_gene_dict, annotation_dict + return core_gene_dict, low_freq_gene_dict, acc_gene_dict if __name__ == '__main__': diff --git a/functional_tests/Corekaburra-test.sh b/functional_tests/Corekaburra-test.sh index f1cf4fe..4dff868 100755 --- a/functional_tests/Corekaburra-test.sh +++ b/functional_tests/Corekaburra-test.sh @@ -268,7 +268,6 @@ test_output_file test_out_folder/low_frequency_gene_placement.tsv low_freq_cutof test_output_file test_out_folder/core_pair_summary.csv low_freq_cutoff_0_expected/core_pair_summary.csv.expected rm -r test_out_folder -# TODO - Test with less than all gffs from pan-genome provided call_new_test "Test with less than all gffs from pan-genome provided" Corekaburra -ig complete_genome_single_chrom.gff genome_single_chrom_larger.gff genome_single_chrom_larger_rearrange.gff -ip Less_than_all_gffs -o test_out_folder -cc 0.9 > /dev/null 2>&1 test_output_file 
test_out_folder/core_core_accessory_gene_content.tsv Less_than_all_gffs_run_expected/core_core_accessory_gene_content.tsv.expected @@ -278,8 +277,20 @@ test_output_file test_out_folder/core_segments.csv Less_than_all_gffs_run_expect test_output_file test_out_folder/no_accessory_core_segments.csv Less_than_all_gffs_run_expected/no_accessory_core_segments.csv.expected rm -r test_out_folder +# TODO - Test unsuccessful reannotation of Panaroo +call_new_test "Test exit status for a bad command line invocation" +test_exit_status "$test_program -ig complete_genome_single_chrom.gff genome_single_chrom_larger.gff genome_single_chrom_larger_rearrange.gff complete_genome_single_chrom_2.gff -ip Reannotate_run_fail -o test_out_folder > /dev/null 2>&1" 3 + # TODO - test Panaroo input w. correction -# TODO - Add in corrections before this! +Corekaburra -ig complete_genome_single_chrom.gff genome_single_chrom_larger.gff genome_single_chrom_larger_rearrange.gff complete_genome_single_chrom_2.gff -ip Reannotate_run_succes/ -o test_out_folder/ +test_output_file test_out_folder/core_core_accessory_gene_content.tsv Reannotation_sucessful_expected/core_core_accessory_gene_content.tsv.expected +test_output_file test_out_folder/low_frequency_gene_placement.tsv Reannotation_sucessful_expected/low_frequency_gene_placement.tsv.expected +test_output_file test_out_folder/core_pair_summary.csv Reannotation_sucessful_expected/core_pair_summary.csv.expected +test_output_file test_out_folder/core_segments.csv Reannotation_sucessful_expected/core_segments.csv.expected +test_output_file test_out_folder/no_accessory_core_segments.csv Reannotation_sucessful_expected/no_accessory_core_segments.csv.expected +test_output_file test_out_folder/complete_genome_single_chrom_2_corrected.gff Reannotation_sucessful_expected/Corrected_gff_files/complete_genome_single_chrom_2_corrected.gff.expected +test_output_file test_out_folder/complete_genome_single_chrom_corrected.gff 
Reannotation_sucessful_expected/Corrected_gff_files/complete_genome_single_chrom_corrected.gff.expected +test_output_file test_out_folder/genome_single_chrom_larger_rearrange_corrected.gff Reannotation_sucessful_expected/Corrected_gff_files/genome_single_chrom_larger_rearrange_corrected.gff.expected # TODO - test for core genes being fragmented. diff --git a/functional_tests/test_data/Reannotate_run_fail/gene_data.csv b/functional_tests/test_data/Reannotate_run_fail/gene_data.csv new file mode 100644 index 0000000..bd38db5 --- /dev/null +++ b/functional_tests/test_data/Reannotate_run_fail/gene_data.csv @@ -0,0 +1,6 @@ +gff_file,scaffold_name,clustering_id,annotation_id,prot_sequence,dna_sequence,gene_name,description +complete_genome_single_chrom_2,,single_comp_2_A,single_comp_2_A,,NNNNNN,gene_name,gene_function +complete_genome_single_chrom_2,,0_refound_0,0_refound_0,,TATA,gene_name,gene_function +complete_genome_single_chrom,,0_refound_0,0_refound_0,,ATAT,gene_name, +genome_single_chrom_larger_rearrange,,2_refound_0,2_refound_0,,TTTTTTTTTTTTTTTTTTTTTTTTTAAAAATTTTTTTTTTTTT,,gene_function +complete_genome_single_chrom,,single_comp_C,single_comp_C,,NNNNNN,gene_name,gene_function diff --git a/functional_tests/test_data/Reannotate_run_fail/gene_presence_absence.csv b/functional_tests/test_data/Reannotate_run_fail/gene_presence_absence.csv new file mode 100644 index 0000000..7f4a02c --- /dev/null +++ b/functional_tests/test_data/Reannotate_run_fail/gene_presence_absence.csv @@ -0,0 +1,8 @@ +,,,,,,,,,,,,,,genome_single_chrom_larger,complete_genome_single_chrom_2,complete_genome_single_chrom,genome_single_chrom_larger_rearrange +A,,,4,4,1,,,,,,,,,single_comp_A,single_comp_2_A,single_comp_A,single_comp_2_A +B,,,3,3,1,,,,,,,,,single_comp_B,,single_comp_B,single_comp_2_B +C,,,4,4,1,,,,,,,,,single_comp_C,single_comp_2_C,single_comp_C,single_comp_2_C +D,,,3,3,1,,,,,,,,,single_comp_D,0_refound_0,,single_comp_2_D +E,,,3,3,1,,,,,,,,,single_comp_E,single_comp_2_B,,single_comp_2_E 
+F,,,3,3,1,,,,,,,,,single_comp_F,,1_refound_0,single_comp_2_F +G,,,1,1,1,,,,,,,,,single_comp_G,,,2_refound_0 \ No newline at end of file diff --git a/functional_tests/test_data/Reannotate_run_fail/gene_presence_absence_roary.csv b/functional_tests/test_data/Reannotate_run_fail/gene_presence_absence_roary.csv new file mode 100644 index 0000000..6a88004 --- /dev/null +++ b/functional_tests/test_data/Reannotate_run_fail/gene_presence_absence_roary.csv @@ -0,0 +1,8 @@ +"","","","","","","","","","","","","","","genome_single_chrom_larger","complete_genome_single_chrom_2","complete_genome_single_chrom","genome_single_chrom_larger_rearrange" +"A","","","4","4","1","","","","","","","","","single_comp_A","single_comp_2_A","single_comp_A","single_comp_2_A" +"B","","","3","3","1","","","","","","","","","single_comp_B","","single_comp_B","single_comp_2_B" +"C","","","4","4","1","","","","","","","","","single_comp_C","single_comp_2_C","single_comp_C","single_comp_2_C" +"D","","","3","3","1","","","","","","","","","single_comp_D","0_refound_0","","single_comp_2_D" +"E","","","3","3","1","","","","","","","","","single_comp_E","single_comp_2_B","","single_comp_2_E" +"F","","","3","3","1","","","","","","","","","single_comp_F","","1_refound_0","single_comp_2_F" +"G","","","1","1","1","","","","","","","","","single_comp_G","","","2_refound_0" \ No newline at end of file diff --git a/functional_tests/test_data/Reannotate_run_succes/gene_data.csv b/functional_tests/test_data/Reannotate_run_succes/gene_data.csv new file mode 100644 index 0000000..ad737aa --- /dev/null +++ b/functional_tests/test_data/Reannotate_run_succes/gene_data.csv @@ -0,0 +1,7 @@ +gff_file,scaffold_name,clustering_id,annotation_id,prot_sequence,dna_sequence,gene_name,description +complete_genome_single_chrom_2,,single_comp_2_A,single_comp_2_A,,TTTTTTT,gene_name,gene_function +complete_genome_single_chrom_2,,0_refound_0,0_refound_0,,TATGA,gene_name,gene_function 
+complete_genome_single_chrom_2,,0_refound_1,0_refound_1,,GGGG,,,, +complete_genome_single_chrom,,1_refound_0,1_refound_0,,ATAGT,gene_name, +genome_single_chrom_larger_rearrange,,2_refound_0,2_refound_0,,CCCCT,,gene_function +complete_genome_single_chrom,,single_comp_C,single_comp_C,,TTTTTTT,gene_name,gene_function \ No newline at end of file diff --git a/functional_tests/test_data/Reannotate_run_succes/gene_presence_absence.csv b/functional_tests/test_data/Reannotate_run_succes/gene_presence_absence.csv new file mode 100644 index 0000000..7f4a02c --- /dev/null +++ b/functional_tests/test_data/Reannotate_run_succes/gene_presence_absence.csv @@ -0,0 +1,8 @@ +,,,,,,,,,,,,,,genome_single_chrom_larger,complete_genome_single_chrom_2,complete_genome_single_chrom,genome_single_chrom_larger_rearrange +A,,,4,4,1,,,,,,,,,single_comp_A,single_comp_2_A,single_comp_A,single_comp_2_A +B,,,3,3,1,,,,,,,,,single_comp_B,,single_comp_B,single_comp_2_B +C,,,4,4,1,,,,,,,,,single_comp_C,single_comp_2_C,single_comp_C,single_comp_2_C +D,,,3,3,1,,,,,,,,,single_comp_D,0_refound_0,,single_comp_2_D +E,,,3,3,1,,,,,,,,,single_comp_E,single_comp_2_B,,single_comp_2_E +F,,,3,3,1,,,,,,,,,single_comp_F,,1_refound_0,single_comp_2_F +G,,,1,1,1,,,,,,,,,single_comp_G,,,2_refound_0 \ No newline at end of file diff --git a/functional_tests/test_data/Reannotate_run_succes/gene_presence_absence_roary.csv b/functional_tests/test_data/Reannotate_run_succes/gene_presence_absence_roary.csv new file mode 100644 index 0000000..fc64b90 --- /dev/null +++ b/functional_tests/test_data/Reannotate_run_succes/gene_presence_absence_roary.csv @@ -0,0 +1,8 @@ +,,,,,,,,,,,,,,genome_single_chrom_larger,complete_genome_single_chrom_2,complete_genome_single_chrom,genome_single_chrom_larger_rearrange +A,,,4,4,1,,,,,,,,,single_comp_A,single_comp_2_A,single_comp_A,single_comp_2_A +B,,,3,3,1,,,,,,,,,single_comp_B,,single_comp_B,single_comp_2_B +C,,,4,4,1,,,,,,,,,single_comp_C,single_comp_2_C,single_comp_C,single_comp_2_C 
+D,,,3,3,1,,,,,,,,,single_comp_D,0_refound_0,,single_comp_2_D +E,,,3,3,1,,,,,,,,,single_comp_E,single_comp_2_B,,single_comp_2_E +F,,,3,3,1,,,,,,,,,single_comp_F,,1_refound_0,single_comp_2_F +G,,,1,1,1,,,,,,,,,single_comp_G,0_refound_1,,2_refound_0 \ No newline at end of file diff --git a/functional_tests/test_data/Reannotation_sucessful_expected/Corrected_gff_files/complete_genome_single_chrom_2_corrected.gff.expected b/functional_tests/test_data/Reannotation_sucessful_expected/Corrected_gff_files/complete_genome_single_chrom_2_corrected.gff.expected new file mode 100644 index 0000000..8c226a9 --- /dev/null +++ b/functional_tests/test_data/Reannotation_sucessful_expected/Corrected_gff_files/complete_genome_single_chrom_2_corrected.gff.expected @@ -0,0 +1,14 @@ +##gff-version3 +contig_1 . CDS 1 90 . . . ID=single_comp_2_A;Other_info;locus_tag=locus_tag_0001 +contig_1 Panaroo CDS 76 79 . - 0 ID=locus_tag_0005;locus_tag=locus_tag_0005;old_locus_tag=0_refound_1 +contig_1 . CDS 100 190 . . . ID=single_comp_2_B;Other_info;locus_tag=locus_tag_0002 +contig_1 . CDS 200 290 . . . ID=single_comp_2_C;Other_info;locus_tag=locus_tag_0003 +contig_1 Panaroo CDS 296 300 . 
- 0 ID=locus_tag_0004;locus_tag=locus_tag_0004;old_locus_tag=0_refound_0;name=gene_name;annotation=gene_function +##FASTA +>contig_1 +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNCCCCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTCATA + diff --git a/functional_tests/test_data/Reannotation_sucessful_expected/Corrected_gff_files/complete_genome_single_chrom_corrected.gff.expected b/functional_tests/test_data/Reannotation_sucessful_expected/Corrected_gff_files/complete_genome_single_chrom_corrected.gff.expected new file mode 100644 index 0000000..c4cba46 --- /dev/null +++ b/functional_tests/test_data/Reannotation_sucessful_expected/Corrected_gff_files/complete_genome_single_chrom_corrected.gff.expected @@ -0,0 +1,13 @@ +##gff-version3 +contig_1 . CDS 1 90 . . . ID=single_comp_A;Other_info;locus_tag=locus_tag_0001 +contig_1 . CDS 100 190 . . . ID=single_comp_B;Other_info;locus_tag=locus_tag_0002 +contig_1 . CDS 200 290 . . . ID=single_comp_C;Other_info;locus_tag=locus_tag_0003 +contig_1 Panaroo CDS 296 300 . 
+ 0 ID=locus_tag_0004;locus_tag=locus_tag_0004;old_locus_tag=1_refound_0;name=gene_name +##FASTA +>contig_1 +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNATAGT + diff --git a/functional_tests/test_data/Reannotation_sucessful_expected/Corrected_gff_files/genome_single_chrom_larger_rearrange_corrected.gff.expected b/functional_tests/test_data/Reannotation_sucessful_expected/Corrected_gff_files/genome_single_chrom_larger_rearrange_corrected.gff.expected new file mode 100644 index 0000000..9a7eb2f --- /dev/null +++ b/functional_tests/test_data/Reannotation_sucessful_expected/Corrected_gff_files/genome_single_chrom_larger_rearrange_corrected.gff.expected @@ -0,0 +1,21 @@ +##gff-version3 +contig_1 . CDS 1 90 . . . ID=single_comp_2_A;Other_info;locus_tag=locus_tag_0001 +contig_1 . CDS 100 190 . . . ID=single_comp_2_D;Other_info;locus_tag=locus_tag_0002 +contig_1 . CDS 200 290 . . . ID=single_comp_2_C;Other_info;locus_tag=locus_tag_0003 +contig_1 . CDS 300 390 . . . ID=single_comp_2_B;Other_info;locus_tag=locus_tag_0004 +contig_1 . CDS 400 490 . . . ID=single_comp_2_E;Other_info;locus_tag=locus_tag_0005 +contig_1 . CDS 500 590 . . . ID=single_comp_2_F;Other_info;locus_tag=locus_tag_0006 +contig_1 Panaroo CDS 596 600 . 
- 0 ID=locus_tag_0007;locus_tag=locus_tag_0007;old_locus_tag=2_refound_0;annotation=gene_function +##FASTA +>contigdiff --git a/functional_tests/test_data/Reannotation_sucessful_expected/core_core_accessory_gene_content.tsv.expected b/functional_tests/test_data/Reannotation_sucessful_expected/core_core_accessory_gene_content.tsv.expected new file mode 100644 index 0000000..28e78c0 --- /dev/null +++ b/functional_tests/test_data/Reannotation_sucessful_expected/core_core_accessory_gene_content.tsv.expected @@ -0,0 +1,16 @@ +Gff Core_gene_1 Core_gene_2 gene type +complete_genome_single_chrom A C B intermediate_frequency +complete_genome_single_chrom_2 A C E intermediate_frequency +complete_genome_single_chrom_2 A C G low_frequency +genome_single_chrom_larger A C B intermediate_frequency +genome_single_chrom_larger_rearrange A C D intermediate_frequency +complete_genome_single_chrom C Sequence_break F intermediate_frequency +complete_genome_single_chrom_2 C Sequence_break D intermediate_frequency +genome_single_chrom_larger C Sequence_break D intermediate_frequency +genome_single_chrom_larger C Sequence_break E intermediate_frequency +genome_single_chrom_larger C Sequence_break F intermediate_frequency +genome_single_chrom_larger C Sequence_break G low_frequency +genome_single_chrom_larger_rearrange C Sequence_break B intermediate_frequency +genome_single_chrom_larger_rearrange C Sequence_break E intermediate_frequency +genome_single_chrom_larger_rearrange C Sequence_break F intermediate_frequency +genome_single_chrom_larger_rearrange C Sequence_break G low_frequency diff --git a/functional_tests/test_data/Reannotation_sucessful_expected/core_pair_summary.csv.expected b/functional_tests/test_data/Reannotation_sucessful_expected/core_pair_summary.csv.expected new file mode 100644 index 0000000..83f14d5 --- /dev/null +++ b/functional_tests/test_data/Reannotation_sucessful_expected/core_pair_summary.csv.expected @@ -0,0 +1,4 @@ 
+Core_pair,n,occurrence_core_1,occurrence_core_2,co_occurrence,min_dist,max_dist,mean_dist,median_dist,min_acc,max_acc,mean_acc,median_acc +A-C,4,4,4,4,109,109,109.0,109.0,1,2,1.2,1.0 +A-Sequence_break,4,4,0,0,0,0,0.0,0.0,0,0,0.0,0.0 +C-Sequence_break,4,4,0,0,10,310,160.0,160.0,1,4,2.5,2.5 diff --git a/functional_tests/test_data/Reannotation_sucessful_expected/low_frequency_gene_placement.tsv.expected b/functional_tests/test_data/Reannotation_sucessful_expected/low_frequency_gene_placement.tsv.expected new file mode 100644 index 0000000..b41a429 --- /dev/null +++ b/functional_tests/test_data/Reannotation_sucessful_expected/low_frequency_gene_placement.tsv.expected @@ -0,0 +1,13 @@ +Gff Core_gene_1 Core_gene_2 Core_region_size Core_region_accessory_count +complete_genome_single_chrom A C 109 1 +complete_genome_single_chrom_2 A C 109 2 +genome_single_chrom_larger A C 109 1 +genome_single_chrom_larger_rearrange A C 109 1 +complete_genome_single_chrom A Sequence_break 0 0 +complete_genome_single_chrom_2 A Sequence_break 0 0 +genome_single_chrom_larger A Sequence_break 0 0 +genome_single_chrom_larger_rearrange A Sequence_break 0 0 +complete_genome_single_chrom C Sequence_break 10 1 +complete_genome_single_chrom_2 C Sequence_break 10 1 +genome_single_chrom_larger C Sequence_break 310 4 +genome_single_chrom_larger_rearrange C Sequence_break 310 4 diff --git a/functional_tests/test_data/complete_genome_single_chrom.gff b/functional_tests/test_data/complete_genome_single_chrom.gff index ded01c4..3bbc95e 100644 --- a/functional_tests/test_data/complete_genome_single_chrom.gff +++ b/functional_tests/test_data/complete_genome_single_chrom.gff @@ -1,7 +1,7 @@ ##gff-version3 -contig_1 . CDS 1 90 . . . ID=single_comp_A;Other_info -contig_1 . CDS 100 190 . . . ID=single_comp_B;Other_info -contig_1 . CDS 200 290 . . . ID=single_comp_C;Other_info +contig_1 . CDS 1 90 . . . ID=single_comp_A;Other_info;locus_tag=locus_tag_0001 +contig_1 . CDS 100 190 . . . 
ID=single_comp_B;Other_info;locus_tag=locus_tag_0002 +contig_1 . CDS 200 290 . . . ID=single_comp_C;Other_info;locus_tag=locus_tag_0003 ##FASTA >contig_1 -NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN \ No newline at end of file +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNATAGT \ No newline at end of file diff --git a/functional_tests/test_data/complete_genome_single_chrom_2.gff b/functional_tests/test_data/complete_genome_single_chrom_2.gff index fe98ff1..7aa0808 100644 --- a/functional_tests/test_data/complete_genome_single_chrom_2.gff +++ b/functional_tests/test_data/complete_genome_single_chrom_2.gff @@ -1,7 +1,7 @@ ##gff-version3 -contig_1 . CDS 1 90 . . . ID=single_comp_2_A;Other_info -contig_1 . CDS 100 190 . . . ID=single_comp_2_B;Other_info -contig_1 . CDS 200 290 . . . ID=single_comp_2_C;Other_info +contig_1 . CDS 1 90 . . . ID=single_comp_2_A;Other_info;locus_tag=locus_tag_0001 +contig_1 . CDS 100 190 . . . ID=single_comp_2_B;Other_info;locus_tag=locus_tag_0002 +contig_1 . CDS 200 290 . . . 
ID=single_comp_2_C;Other_info;locus_tag=locus_tag_0003 ##FASTA >contig_1 -NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN \ No newline at end of file +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCCCCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTCATA \ No newline at end of file diff --git a/functional_tests/test_data/genome_single_chrom_larger.gff b/functional_tests/test_data/genome_single_chrom_larger.gff index 4f9d711..742094b 100644 --- a/functional_tests/test_data/genome_single_chrom_larger.gff +++ b/functional_tests/test_data/genome_single_chrom_larger.gff @@ -1,11 +1,11 @@ ##gff-version3 -contig_1 . CDS 1 90 . . . ID=single_comp_A;Other_info -contig_1 . CDS 100 190 . . . ID=single_comp_B;Other_info -contig_1 . CDS 200 290 . . . ID=single_comp_C;Other_info -contig_1 . CDS 300 390 . . . ID=single_comp_D;Other_info -contig_1 . CDS 400 490 . . . ID=single_comp_E;Other_info -contig_1 . CDS 500 590 . . . ID=single_comp_F;Other_info -contig_1 . CDS 591 592 . . . ID=single_comp_G;Other_info +contig_1 . CDS 1 90 . . . ID=single_comp_A;Other_info;locus_tag=locus_tag_0001 +contig_1 . CDS 100 190 . . . ID=single_comp_B;Other_info;locus_tag=locus_tag_0002 +contig_1 . CDS 200 290 . . . ID=single_comp_C;Other_info;locus_tag=locus_tag_0003 +contig_1 . CDS 300 390 . . . ID=single_comp_D;Other_info;locus_tag=locus_tag_0004 +contig_1 . CDS 400 490 . . . ID=single_comp_E;Other_info;locus_tag=locus_tag_0005 +contig_1 . CDS 500 590 . . . ID=single_comp_F;Other_info;locus_tag=locus_tag_0006 +contig_1 . CDS 591 592 . . . 
ID=single_comp_G;Other_info;locus_tag=locus_tag_0007 ##FASTA >contigo newline at end of file diff --git a/functional_tests/test_data/genome_single_chrom_larger_rearrange.gff b/functional_tests/test_data/genome_single_chrom_larger_rearrange.gff index 7a794b5..9ce3680 100644 --- a/functional_tests/test_data/genome_single_chrom_larger_rearrange.gff +++ b/functional_tests/test_data/genome_single_chrom_larger_rearrange.gff @@ -1,10 +1,10 @@ ##gff-version3 -contig_1 . CDS 1 90 . . . ID=single_comp_2_A;Other_info -contig_1 . CDS 100 190 . . . ID=single_comp_2_D;Other_info -contig_1 . CDS 200 290 . . . ID=single_comp_2_C;Other_info -contig_1 . CDS 300 390 . . . ID=single_comp_2_B;Other_info -contig_1 . CDS 400 490 . . . ID=single_comp_2_E;Other_info -contig_1 . CDS 500 590 . . . ID=single_comp_2_F;Other_info +contig_1 . CDS 1 90 . . . ID=single_comp_2_A;Other_info;locus_tag=locus_tag_0001 +contig_1 . CDS 100 190 . . . ID=single_comp_2_D;Other_info;locus_tag=locus_tag_0002 +contig_1 . CDS 200 290 . . . ID=single_comp_2_C;Other_info;locus_tag=locus_tag_0003 +contig_1 . CDS 300 390 . . . ID=single_comp_2_B;Other_info;locus_tag=locus_tag_0004 +contig_1 . CDS 400 490 . . . ID=single_comp_2_E;Other_info;locus_tag=locus_tag_0005 +contig_1 . CDS 500 590 . . . 
ID=single_comp_2_F;Other_info;locus_tag=locus_tag_0006 ##FASTA >contigo newline at end of file +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAGGGG \ No newline at end of file diff --git a/unit_tests/Corekaburra_test.py b/unit_tests/Corekaburra_test.py index 0db16fc..6667f8e 100644 --- a/unit_tests/Corekaburra_test.py +++ b/unit_tests/Corekaburra_test.py @@ -21,6 +21,7 @@ from Corekaburra import consesus_core_genome from Corekaburra import summary_table from Corekaburra import output_writer_functions +from Corekaburra import correct_gffs # move to folder with mock files. First try Github structure, then try pulled repository structure try: @@ -389,10 +390,9 @@ def test_parsing_w_100_presence(self): "Silas_the_Legionella_tag-9-4": "D", "Silas_the_Legionella_tag-9-5": "E"}, 'Lilly_the_Shigella': {'Lilly_the_Shigella_tag-10-5': "E"}} - expected_annotation_dict = {} # None - Should be done and holds refounds! 
- TODO Make test for this core_gene_dict, low_freq_gene_dict, \ - acc_gene_dict, annotation_dict = \ + acc_gene_dict = \ parse_gene_presence_absence.read_gene_presence_absence( file_name, core_gene_presence, low_freq_gene, source_program, @@ -422,7 +422,7 @@ def test_parsing_w_100_presence_roary(self): core_gene_dict, low_freq_gene_dict, \ - acc_gene_dict, annotation_dict = \ + acc_gene_dict = \ parse_gene_presence_absence.read_gene_presence_absence( file_name, core_gene_presence, low_freq_gene, source_program, @@ -512,7 +512,7 @@ def test_parsing_w_90_presence(self): tmp_folder_path = 'TestParsingGenePresenceAbsenceFile/' core_gene_dict, low_freq_gene_dict, \ - acc_gene_dict, annotation_dict = \ + acc_gene_dict = \ parse_gene_presence_absence.read_gene_presence_absence( file_name, core_gene_presence, low_freq_gene, source_program, @@ -602,7 +602,7 @@ def test_parsing_w_90_presence_roary(self): tmp_folder_path = 'TestParsingGenePresenceAbsenceFile/' core_gene_dict, low_freq_gene_dict, \ - acc_gene_dict, annotation_dict = \ + acc_gene_dict = \ parse_gene_presence_absence.read_gene_presence_absence( file_name, core_gene_presence, low_freq_gene, source_program, @@ -675,6 +675,279 @@ def test_parsing_w_90_presence_roary(self): self.assertEqual(expected_acc_gene_dict, acc_gene_dict) +class TestReadGeneData(unittest.TestCase): + """ Function to test the passing of gene_data.csv file from Panaroo """ + def test_read_file(self): + expected_dict = {'PY_40': {'0_refound_0': ['CTCTTCCGATCTAATCAAGATTGAGAGGAATTGCTGTTTTTATTGGCAAGACAATTTTATTTTATCTGATTTTAATGTTGCTGGTCTATTTCTTTGGTTATCTAGGACATGGTCAAAGTAACTTTATTTATAATGAATTTTAG', 'gene_name', 'gene_function'], + '0_refound_100': ['CTCTTCCGATCTAATCAAGATTGAGAGGAATTGCTTTTTTTTTTGGCAAGACAATTTTATTTTATCTGATTTTAATGTTGCTGGTCTATTTCTTTGGTTATCTAGGACATGGTCAAAGTAACTTTATTTATAATGAATTTTAG', 'gene_name', 'gene_function'], + '0_refound_10': 
['CTCTTCCGATCTAATCAAGATTGAGAGGAATTGCGCCTTGGCAAGACAATTTTATTTTATCTGATTTTAATGTTGCTGGTCTATTTCTTTGGTTATCTAGGACATGGTCAAAGTAACTTTATTTATAATGAATTTTAG', 'gene_name', 'gene_function']}, + 'PY_41': {'0_refound_1': ['ATGTTGTAGGAAAATACTTGGAAGAATACGTTGACAGGGGTATTTTTGATAAGGAGCCGTTCCAGACCTTTGATCAGAAAGGGATTGGCCGTCTCTTTAGCCCGTTCGGTTAAGCCTGAGTTGAAACTTGGTATTTGTGGGGAACATGGTGGCGATCCTGCTTCCATTGACTTTTACCACAGCCAAGGCCTGACCTACGTTTCTTGTTCGCCATTTAGAGTGCCGCTTACTCGCTTGGCGGCTGCTCAGGCTGCCATCAAAGCTTCAGGCCACAGTCTTACCCAAGACAAATAG', 'gene_name', 'gene_function']}, + 'PY_42': {'0_refound_2': ['ATGTCACTACTGCATATTCATCACAATAAAAAAAAGACAATAGCCCTAATCGTGCTATTGTCTCAAAATCATTTATTTACTTGAAACTTTATCGTGTTACACCAACAGTTTAA', 'gene_name', 'gene_function']}, + 'PY_43': {'0_refound_4': ['ATGAAACGCTATCAACAAGATGCCCTGCTTTTCAAAAAAAATAGATAAAGAAAAGGCTGCGACAGTATCTGCAAGCAGGGCAAAAGAACTAGAAGATAGGCTCAGTCATCAGCCATTAATTGATGATTATCGAGAAAAGATGCAAGATGCAAGATGCAAGTGATGTGACTCAGTATATCACCAAACGTATAGAAGATCAGTTAAACAAGGAGTTAACAAATGGCAAAAACTAA', 'gene_name', 'gene_function']}} + + return_dict = correct_gffs.read_gene_data('TestReadGeneData/Mock_gene_data.csv') + + self.assertEqual(expected_dict, return_dict) + + +class TestPrepairForReannotation(unittest.TestCase): + """ Test for pre-pairing a folder for corrected genomes, and testing if any are present from previous runs """ + def tearDown(self): + try: + """ Class to remove created corrected output folder""" + os.rmdir('TestPrepairForReannotation/Corrected_gff_files') + except FileNotFoundError: + pass + + def test_no_files_annotated(self): + input_gffs = ['Mock_1.gff', 'Mock_2.gff'] + gene_data_dict_return, corrected_gff_out_dir_return, corrected_files_return = correct_gffs.prepair_for_reannotation('TestPrepairForReannotation/Mock_gene_data.csv', + 'TestPrepairForReannotation/', + input_gffs) + + self.assertTrue(os.path.isdir('TestPrepairForReannotation/Corrected_gff_files')) + self.assertEqual(input_gffs, corrected_files_return) + + def test_some_files_annotated(self): + input_gffs = 
['Mock_1.gff', 'Mock_2.gff'] + gene_data_dict_return, corrected_gff_out_dir_return, corrected_files_return = correct_gffs.prepair_for_reannotation( + 'TestPrepairForReannotation/Mock_gene_data.csv', + 'TestPrepairForReannotation/Some_genomes', + input_gffs) + + expected_gffs = ['Mock_2.gff', 'Mock_1_corrected.gff'] + + self.assertEqual(expected_gffs, corrected_files_return) + + def test_all_files_annotated(self): + input_gffs = ['Mock_1.gff', 'Mock_2.gff'] + gene_data_dict_return, corrected_gff_out_dir_return, corrected_files_return = correct_gffs.prepair_for_reannotation( + 'TestPrepairForReannotation/Mock_gene_data.csv', + 'TestPrepairForReannotation/All_genomes', + input_gffs) + + expected_gffs = ['Mock_1_corrected.gff', 'Mock_2_corrected.gff'] + + self.assertEqual(expected_gffs, corrected_files_return) + + +class TestAddGeneToGff(unittest.TestCase): + """ + Test of the function used to add a gene annotation (line) to a gff file + """ + # Make a setup and a teardown that copies and renames the mock file + def setUp(self): + """ Class to copy the mock gff before modifying""" + copyfile('TestAddGeneToGff/mocky_test_gff.gff', 'TestAddGeneToGff/mocky_test_gff.gff_copy') + + def tearDown(self): + """ Class to remove modified gff and rename the original""" + os.remove('TestAddGeneToGff/mocky_test_gff.gff') + os.rename('TestAddGeneToGff/mocky_test_gff.gff_copy', 'TestAddGeneToGff/mocky_test_gff.gff') + + def test_adding_a_gene_no_info(self): + tmp_gff_file = 'TestAddGeneToGff/mocky_test_gff.gff' + gene_oi = ['TATA', '', ''] + genome_oi = 'CCCCCCCCCCCCTATACCCCCCCC' + contig = 'test_contig_1' + strand = '+' + refound_gene_tag = '0_refound_0' + largest_locus_tag = 'fer_1432' + + expected_lines = ['##gff-version 3\n', '#test comment line\n', 'test_contig_1\tPanaroo\tCDS\t13\t16\t.\t+\t0\tID=fer_1433;locus_tag=fer_1433;old_locus_tag=0_refound_0\n'] + + with open(tmp_gff_file, 'a') as tmp_gff: + correct_gffs.add_gene_to_gff(tmp_gff, gene_oi[0], genome_oi, contig, strand, 
refound_gene_tag, gene_oi[1:], largest_locus_tag) + + with open('TestAddGeneToGff/mocky_test_gff.gff', 'r') as added_gff: + self.assertEqual(expected_lines, added_gff.readlines()) + + def test_adding_a_gene_name(self): + tmp_gff_file = 'TestAddGeneToGff/mocky_test_gff.gff' + gene_oi = ['TATA', 'Gene_name', ''] + genome_oi = 'CCCCCCCCCCCCTATACCCCCCCC' + contig = 'test_contig_1' + strand = '+' + refound_gene_tag = '0_refound_0' + largest_locus_tag = 'fer_1432' + + expected_lines = ['##gff-version 3\n', '#test comment line\n', 'test_contig_1\tPanaroo\tCDS\t13\t16\t.\t+\t0\tID=fer_1433;locus_tag=fer_1433;old_locus_tag=0_refound_0;name=Gene_name\n'] + + with open(tmp_gff_file, 'a') as tmp_gff: + correct_gffs.add_gene_to_gff(tmp_gff, gene_oi[0], genome_oi, contig, strand, refound_gene_tag, gene_oi[1:], largest_locus_tag) + + with open('TestAddGeneToGff/mocky_test_gff.gff', 'r') as added_gff: + self.assertEqual(expected_lines, added_gff.readlines()) + + def test_adding_a_gene_annotation(self): + tmp_gff_file = 'TestAddGeneToGff/mocky_test_gff.gff' + gene_oi = ['TATA', '', 'Gene_annotation'] + genome_oi = 'CCCCCCCCCCCCTATACCCCCCCC' + contig = 'test_contig_1' + strand = '+' + refound_gene_tag = '0_refound_0' + largest_locus_tag = 'fer_1432' + + expected_lines = ['##gff-version 3\n', '#test comment line\n', 'test_contig_1\tPanaroo\tCDS\t13\t16\t.\t+\t0\tID=fer_1433;locus_tag=fer_1433;old_locus_tag=0_refound_0;annotation=Gene_annotation\n'] + + with open(tmp_gff_file, 'a') as tmp_gff: + correct_gffs.add_gene_to_gff(tmp_gff, gene_oi[0], genome_oi, contig, strand, refound_gene_tag, gene_oi[1:], largest_locus_tag) + + with open('TestAddGeneToGff/mocky_test_gff.gff', 'r') as added_gff: + self.assertEqual(expected_lines, added_gff.readlines()) + + def test_adding_a_gene_name_and_annotation(self): + tmp_gff_file = 'TestAddGeneToGff/mocky_test_gff.gff' + gene_oi = ['TATA', 'Gene_name', 'Gene_annotation'] + genome_oi = 'CCCCCCCCCCCCTATACCCCCCCC' + contig = 'test_contig_1' + strand = 
'+' + refound_gene_tag = '0_refound_0' + largest_locus_tag = 'fer_1432' + + expected_lines = ['##gff-version 3\n', '#test comment line\n', 'test_contig_1\tPanaroo\tCDS\t13\t16\t.\t+\t0\tID=fer_1433;locus_tag=fer_1433;old_locus_tag=0_refound_0;name=Gene_name;annotation=Gene_annotation\n'] + + with open(tmp_gff_file, 'a') as tmp_gff: + correct_gffs.add_gene_to_gff(tmp_gff, gene_oi[0], genome_oi, contig, strand, refound_gene_tag, gene_oi[1:], largest_locus_tag) + + with open('TestAddGeneToGff/mocky_test_gff.gff', 'r') as added_gff: + self.assertEqual(expected_lines, added_gff.readlines()) + + +class TestWriteContig(unittest.TestCase): + """ + Test of the function used to write a contig in a gff file. + """ + # Make a setup and a teardown that copies and renames the mock file + def setUp(self): + """ Class to copy the mock gff before modifying""" + copyfile('TestWriteContig/mocky_test_gff.gff', 'TestWriteContig/mocky_test_gff.gff_copy') + + def tearDown(self): + """ Class to remove modified gff and rename the original""" + os.remove('TestWriteContig/mocky_test_gff.gff') + os.rename('TestWriteContig/mocky_test_gff.gff_copy', 'TestWriteContig/mocky_test_gff.gff') + + def test_writing_a_contig(self): + file_path = 'TestWriteContig/mocky_test_gff.gff' + contig_name = 'Test_contig_name space' + sequence = 'AAATAAATGGGCGGGCAAATAAATGGGCGGGCAAATAAATGGGCGGGCAAATAAATGGGCGGGCAAATAAATGGGCGGGCAAATAAATGGGCGGGC' + + expected_lines = ['##gff-version 3\n', '#test comment line\n', '>Test_contig_name space\n', + 'AAATAAATGGGCGGGCAAATAAATGGGCGGGCAAATAAATGGGCGGGCAAATAAATGGGC\n', + 'GGGCAAATAAATGGGCGGGCAAATAAATGGGCGGGC\n'] + + with open(file_path, 'a') as file: + correct_gffs.write_contig(file, contig_name, sequence) + + with open('TestWriteContig/mocky_test_gff.gff', 'r') as added_gff: + self.assertEqual(expected_lines, added_gff.readlines()) + + +class TestAnnotateRefoundGenomes(unittest.TestCase): + """ + Test of the function used to reannotate refound genes identified by panaroo in a 
gff file. + """ + def tearDown(self): + """ Class to remove modified gff and rename the original""" + try: + os.remove('TestAnnotateRefoundGenomes/reannotate_gff_corrected.gff') + except FileNotFoundError: + os.remove('TestAnnotateRefoundGenomes/reannotate_gff_tmp.gff') + os.remove('TestAnnotateRefoundGenomes/reannotate_gff.gff_db') + + def test_annotation_of_pos_stand_gene(self): + gff_name = 'TestAnnotateRefoundGenomes/reannotate_gff.gff' + gene_data_dict = {'reannotate_gff': {'0_refound_0': ['CTCTTCCGATCTAATCAAGATTGAGAGGAATTGCTGTTTTTATTGGCAAGACAATTTTATTTTATCTGATTTTAATGTTGCTGGTCTATTTCTTTGGTTATCTAGGACATGGTCAAAGTAACTTTATTTATAATGAATTTTAG', 'gene_name', 'gene_function'], + '0_refound_100': ['CTCTTCCGATCTAATCAAGATTGAGAGGAATTGCTTTTTTTTTTGGCAAGACAATTTTATTTTATCTGATTTTAATGTTGCTGGTCTATTTCTTTGGTTATCTAGGACATGGTCAAAGTAACTTTATTTATAATGAATTTTAG', '', 'gene_function'], + '0_refound_10': ['CTCTTCCGATCTAATCAAGATTGAGAGGAATTGCGCCTTGGCAAGACAATTTTATTTTATCTGATTTTAATGTTGCTGGTCTATTTCTTTGGTTATCTAGGACATGGTCAAAGTAACTTTATTTATAATGAATTTTAG', 'gene_name', '']}} + tmp_folder_path = 'TestAnnotateRefoundGenomes' + corrected_gff_out_dir = 'TestAnnotateRefoundGenomes' + + expected_lines = \ + ['##gff-version 3\n', + '#test comment line\n', + 'test_contig\tProkka\tCDS\t1\t10\t.\t+\t0\tlocus_tag=locus_tag_0097\n', + 'test_contig\tPanaroo\tCDS\t16\t158\t.\t+\t0\tID=locus_tag_0099;locus_tag=locus_tag_0099;old_locus_tag=0_refound_0;name=gene_name;annotation=gene_function\n', + 'test_contig\tPanaroo\tCDS\t174\t316\t.\t+\t0\tID=locus_tag_0100;locus_tag=locus_tag_0100;old_locus_tag=0_refound_100;annotation=gene_function\n', + 'test_contig\tPanaroo\tCDS\t332\t469\t.\t+\t0\tID=locus_tag_0101;locus_tag=locus_tag_0101;old_locus_tag=0_refound_10;name=gene_name\n', + 'test_contig\tProkka\tCDS\t474\t484\t.\t+\t0\tlocus_tag=locus_tag_0098\n', + '##FASTA\n', + '>test_contig\n', + 'TTTTTTTTTTTTTTTCTCTTCCGATCTAATCAAGATTGAGAGGAATTGCTGTTTTTATTG\n', + 'GCAAGACAATTTTATTTTATCTGATTTTAATGTTGCTGGTCTATTTCTTTGGTTATCTAG\n', + 
'GACATGGTCAAAGTAACTTTATTTATAATGAATTTTAGTTTTTTTTTTTTTTTCTCTTCC\n', + 'GATCTAATCAAGATTGAGAGGAATTGCTTTTTTTTTTGGCAAGACAATTTTATTTTATCT\n', + 'GATTTTAATGTTGCTGGTCTATTTCTTTGGTTATCTAGGACATGGTCAAAGTAACTTTAT\n', + 'TTATAATGAATTTTAGTTTTTTTTTTTTTTTCTCTTCCGATCTAATCAAGATTGAGAGGA\n', + 'ATTGCGCCTTGGCAAGACAATTTTATTTTATCTGATTTTAATGTTGCTGGTCTATTTCTT\n', + 'TGGTTATCTAGGACATGGTCAAAGTAACTTTATTTATAATGAATTTTAGTTTTTTTTTTT\n', + 'TTTT\n' + ] + + correct_gffs.annotate_refound_genes(gff_name, gene_data_dict, tmp_folder_path, corrected_gff_out_dir) + + with open('TestAnnotateRefoundGenomes/reannotate_gff_corrected.gff', 'r') as added_gff: + self.assertEqual(expected_lines, added_gff.readlines()) + + def test_annotation_of_neg_stand_gene(self): + gff_name = 'TestAnnotateRefoundGenomes/reannotate_gff.gff' + gene_data_dict = {'reannotate_gff': {'0_refound_0': ['CTAAAATTCATTATAAATAAAGTTACTTTGACCATGTCCTAGATAACCAAAGAAATAGACCAGCAACATTAAAATCAGATAAAATAAAATTGTCTTGCCAATAAAAACAGCAATTCCTCTCAATCTTGATTAGATCGGAAGAG', 'gene_name', 'gene_function'], + '0_refound_100': ['CTAAAATTCATTATAAATAAAGTTACTTTGACCATGTCCTAGATAACCAAAGAAATAGACCAGCAACATTAAAATCAGATAAAATAAAATTGTCTTGCCAAAAAAAAAAGCAATTCCTCTCAATCTTGATTAGATCGGAAGAG', '', 'gene_function'], + '0_refound_10': ['CTAAAATTCATTATAAATAAAGTTACTTTGACCATGTCCTAGATAACCAAAGAAATAGACCAGCAACATTAAAATCAGATAAAATAAAATTGTCTTGCCAAGGCGCAATTCCTCTCAATCTTGATTAGATCGGAAGAG', 'gene_name', '']}} + tmp_folder_path = 'TestAnnotateRefoundGenomes' + corrected_gff_out_dir = 'TestAnnotateRefoundGenomes' + expected_lines = ['##gff-version 3\n', + '#test comment line\n', + 'test_contig\tProkka\tCDS\t1\t10\t.\t+\t0\tlocus_tag=locus_tag_0097\n', + 'test_contig\tPanaroo\tCDS\t16\t158\t.\t-\t0\tID=locus_tag_0099;locus_tag=locus_tag_0099;old_locus_tag=0_refound_0;name=gene_name;annotation=gene_function\n', + 'test_contig\tPanaroo\tCDS\t174\t316\t.\t-\t0\tID=locus_tag_0100;locus_tag=locus_tag_0100;old_locus_tag=0_refound_100;annotation=gene_function\n', + 
'test_contig\tPanaroo\tCDS\t332\t469\t.\t-\t0\tID=locus_tag_0101;locus_tag=locus_tag_0101;old_locus_tag=0_refound_10;name=gene_name\n', + 'test_contig\tProkka\tCDS\t474\t484\t.\t+\t0\tlocus_tag=locus_tag_0098\n', + '##FASTA\n', + '>test_contig\n', + 'TTTTTTTTTTTTTTTCTCTTCCGATCTAATCAAGATTGAGAGGAATTGCTGTTTTTATTG\n', + 'GCAAGACAATTTTATTTTATCTGATTTTAATGTTGCTGGTCTATTTCTTTGGTTATCTAG\n', + 'GACATGGTCAAAGTAACTTTATTTATAATGAATTTTAGTTTTTTTTTTTTTTTCTCTTCC\n', + 'GATCTAATCAAGATTGAGAGGAATTGCTTTTTTTTTTGGCAAGACAATTTTATTTTATCT\n', + 'GATTTTAATGTTGCTGGTCTATTTCTTTGGTTATCTAGGACATGGTCAAAGTAACTTTAT\n', + 'TTATAATGAATTTTAGTTTTTTTTTTTTTTTCTCTTCCGATCTAATCAAGATTGAGAGGA\n', + 'ATTGCGCCTTGGCAAGACAATTTTATTTTATCTGATTTTAATGTTGCTGGTCTATTTCTT\n', + 'TGGTTATCTAGGACATGGTCAAAGTAACTTTATTTATAATGAATTTTAGTTTTTTTTTTT\n', + 'TTTT\n'] + + correct_gffs.annotate_refound_genes(gff_name, gene_data_dict, tmp_folder_path, corrected_gff_out_dir) + + with open('TestAnnotateRefoundGenomes/reannotate_gff_corrected.gff', 'r') as added_gff: + self.assertEqual(expected_lines, added_gff.readlines()) + + def test_gene_not_found(self): + gff_name = 'TestAnnotateRefoundGenomes/reannotate_gff.gff' + gene_data_dict = {'reannotate_gff': {'0_refound_0': [ + 'CCCCCCCCCCCCGGGGGGGGGGGGGGGCGGCGCGCGCGCGCGCGGCGCGCGCGGCGCGC', + 'gene_name', 'gene_function']}} + + tmp_folder_path = 'TestAnnotateRefoundGenomes' + corrected_gff_out_dir = 'TestAnnotateRefoundGenomes' + + with self.assertRaises(SystemExit): + correct_gffs.annotate_refound_genes(gff_name, gene_data_dict, tmp_folder_path, corrected_gff_out_dir) + + # TODO - Add test for annotating of second contig + +class TestExtractGenomeFasta(unittest.TestCase): + def test_extract_genome_fasta(self): + genome_fasta_dict_expected = {'contig} + largest_locus_tag_expected = 'fer_006' + header_lines_expected = ['##gff-version3\n', '#test-line\n'] + + genome_fasta_dict, largest_locus_tag, header_lines = correct_gffs.extract_genome_fasta('TestExtractGenomeFasta/Mock_gff.gff') + + 
self.assertEqual(genome_fasta_dict_expected, genome_fasta_dict) + self.assertEqual(largest_locus_tag_expected, largest_locus_tag) + self.assertEqual(header_lines_expected, header_lines) + + class TestParsingGffFile(unittest.TestCase): """ Test of the function that is used to pass a gff file and return a generator object of CDS lines """ def test_gff_generator_generation_not_corrected(self): diff --git a/unit_tests/unit_test_data/TestPresenceOfGenedataFile/.DS_Store b/unit_tests/unit_test_data/TestPresenceOfGenedataFile/.DS_Store deleted file mode 100644 index 4d1d3738e86b331026c69b1a36816d3145aed360..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeHKu};H447E!Ifi4{x?-%+9p$aoYXQUE}=+KY|%8K$U{0Iv_#`D=Kq?d>Vp{lYa z-@Ewii}NmuVuN?cC87)}I6BABBeE`<$jB_R$Z?M++HM|ai~G825^pygA_H=E zr*uapz0d>opI>Las*9rB)FtAwxAo2P;^Xsbe~+qO{p!=KllSC|8hS<+!60h%UhP`_7ImGkwoSXq?z!~@p22is_vWcRP&VV!E4D1+???ZqJ=7z0e{B&T5 zEdX!~a}>;_mynoXm>af=Sb?yH0yUJa#b6DGJ(yo^*eYr`u{9rTcV_EQINlxmhv-h6 zEBfdRI0IbSA_(%^$L@UIMf0g^#VSpWb4 From d8e108df2994473f92f3db4effd388b77f51fa0e Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Sun, 9 Jan 2022 13:17:46 +1100 Subject: [PATCH 056/135] Add data for unit- and functional tests for refinding genes --- .../TestAddGeneToGff/mocky_test_gff.gff | 2 ++ .../TestAnnotateRefoundGenomes/reannotate_gff.gff | 7 +++++++ .../TestExtractGenomeFasta/Mock_gff.gff | 14 ++++++++++++++ .../Corrected_gff_files/Mock_1_corrected.gff | 0 .../Corrected_gff_files/Mock_2_corrected.gff | 0 .../TestPrepairForReannotation/Mock_gene_data.csv | 2 ++ .../Corrected_gff_files/Mock_1_corrected.gff | 0 .../TestReadGeneData/Mock_gene_data.csv | 7 +++++++ .../TestWriteContig/mocky_test_gff.gff | 2 ++ 9 files changed, 34 insertions(+) create mode 100644 unit_tests/unit_test_data/TestAddGeneToGff/mocky_test_gff.gff create mode 100644 unit_tests/unit_test_data/TestAnnotateRefoundGenomes/reannotate_gff.gff create mode 100644 
unit_tests/unit_test_data/TestExtractGenomeFasta/Mock_gff.gff create mode 100644 unit_tests/unit_test_data/TestPrepairForReannotation/All_genomes/Corrected_gff_files/Mock_1_corrected.gff create mode 100644 unit_tests/unit_test_data/TestPrepairForReannotation/All_genomes/Corrected_gff_files/Mock_2_corrected.gff create mode 100644 unit_tests/unit_test_data/TestPrepairForReannotation/Mock_gene_data.csv create mode 100644 unit_tests/unit_test_data/TestPrepairForReannotation/Some_genomes/Corrected_gff_files/Mock_1_corrected.gff create mode 100644 unit_tests/unit_test_data/TestReadGeneData/Mock_gene_data.csv create mode 100644 unit_tests/unit_test_data/TestWriteContig/mocky_test_gff.gff diff --git a/unit_tests/unit_test_data/TestAddGeneToGff/mocky_test_gff.gff b/unit_tests/unit_test_data/TestAddGeneToGff/mocky_test_gff.gff new file mode 100644 index 0000000..8087950 --- /dev/null +++ b/unit_tests/unit_test_data/TestAddGeneToGff/mocky_test_gff.gff @@ -0,0 +1,2 @@ +##gff-version 3 +#test comment line diff --git a/unit_tests/unit_test_data/TestAnnotateRefoundGenomes/reannotate_gff.gff b/unit_tests/unit_test_data/TestAnnotateRefoundGenomes/reannotate_gff.gff new file mode 100644 index 0000000..649663f --- /dev/null +++ b/unit_tests/unit_test_data/TestAnnotateRefoundGenomes/reannotate_gff.gff @@ -0,0 +1,7 @@ +##gff-version 3 +#test comment line +test_contig Prokka CDS 1 10 . + 0 locus_tag=locus_tag_0097 +test_contig Prokka CDS 474 484 . 
+ 0 locus_tag=locus_tag_0098 +##FASTA +>test_contig +TTTTTTTTTTTTTTTCTCTTCCGATCTAATCAAGATTGAGAGGAATTGCTGTTTTTATTGGCAAGACAATTTTATTTTATCTGATTTTAATGTTGCTGGTCTATTTCTTTGGTTATCTAGGACATGGTCAAAGTAACTTTATTTATAATGAATTTTAGTTTTTTTTTTTTTTTCTCTTCCGATCTAATCAAGATTGAGAGGAATTGCTTTTTTTTTTGGCAAGACAATTTTATTTTATCTGATTTTAATGTTGCTGGTCTATTTCTTTGGTTATCTAGGACATGGTCAAAGTAACTTTATTTATAATGAATTTTAGTTTTTTTTTTTTTTTCTCTTCCGATCTAATCAAGATTGAGAGGAATTGCGCCTTGGCAAGACAATTTTATTTTATCTGATTTTAATGTTGCTGGTCTATTTCTTTGGTTATCTAGGACATGGTCAAAGTAACTTTATTTATAATGAATTTTAGTTTTTTTTTTTTTTT \ No newline at end of file diff --git a/unit_tests/unit_test_data/TestExtractGenomeFasta/Mock_gff.gff b/unit_tests/unit_test_data/TestExtractGenomeFasta/Mock_gff.gff new file mode 100644 index 0000000..cb64627 --- /dev/null +++ b/unit_tests/unit_test_data/TestExtractGenomeFasta/Mock_gff.gff @@ -0,0 +1,14 @@ +##gff-version3 +#test-line +contig_1 . CDS 1 90 . . . ID=single_comp_2_A;Other_info;locus_tag=fer_001 +contig_1 . CDS 100 190 . . . ID=single_comp_2_D;Other_info;locus_tag=fer_002 +contig_1 . CDS 200 290 . . . ID=single_comp_2_C;Other_info;locus_tag=fer_003 +contig_1 . CDS 300 390 . . . ID=single_comp_2_B;Other_info;locus_tag=fer_004 +contig_1 . CDS 400 490 . . . ID=single_comp_2_E;Other_info;locus_tag=fer_005 +contig_1 . CDS 500 590 . . . 
ID=single_comp_2_F;Other_info;locus_tag=fer_006 +##FASTA +>contigo newline at end of file diff --git a/unit_tests/unit_test_data/TestPrepairForReannotation/All_genomes/Corrected_gff_files/Mock_1_corrected.gff b/unit_tests/unit_test_data/TestPrepairForReannotation/All_genomes/Corrected_gff_files/Mock_1_corrected.gff new file mode 100644 index 0000000..e69de29 diff --git a/unit_tests/unit_test_data/TestPrepairForReannotation/All_genomes/Corrected_gff_files/Mock_2_corrected.gff b/unit_tests/unit_test_data/TestPrepairForReannotation/All_genomes/Corrected_gff_files/Mock_2_corrected.gff new file mode 100644 index 0000000..e69de29 diff --git a/unit_tests/unit_test_data/TestPrepairForReannotation/Mock_gene_data.csv b/unit_tests/unit_test_data/TestPrepairForReannotation/Mock_gene_data.csv new file mode 100644 index 0000000..ca73c65 --- /dev/null +++ b/unit_tests/unit_test_data/TestPrepairForReannotation/Mock_gene_data.csv @@ -0,0 +1,2 @@ +gff_file,scaffold_name,clustering_id,annotation_id,prot_sequence,dna_sequence,gene_name,description +PY_40,,0_refound_0,0_refound_0,LPI*SRLRGIAVFIGKTILFYLILMLLVYFFGYLGHGQSNFIYNEF*X,CTCTTCCGATCTAATCAAGATTGAGAGGAATTGCTGTTTTTATTGGCAAGACAATTTTATTTTATCTGATTTTAATGTTGCTGGTCTATTTCTTTGGTTATCTAGGACATGGTCAAAGTAACTTTATTTATAATGAATTTTAG,, diff --git a/unit_tests/unit_test_data/TestPrepairForReannotation/Some_genomes/Corrected_gff_files/Mock_1_corrected.gff b/unit_tests/unit_test_data/TestPrepairForReannotation/Some_genomes/Corrected_gff_files/Mock_1_corrected.gff new file mode 100644 index 0000000..e69de29 diff --git a/unit_tests/unit_test_data/TestReadGeneData/Mock_gene_data.csv b/unit_tests/unit_test_data/TestReadGeneData/Mock_gene_data.csv new file mode 100644 index 0000000..e9aec51 --- /dev/null +++ b/unit_tests/unit_test_data/TestReadGeneData/Mock_gene_data.csv @@ -0,0 +1,7 @@ +gff_file,scaffold_name,clustering_id,annotation_id,prot_sequence,dna_sequence,gene_name,description 
+PY_40,,0_refound_0,0_refound_0,LPI*SRLRGIAVFIGKTILFYLILMLLVYFFGYLGHGQSNFIYNEF*X,CTCTTCCGATCTAATCAAGATTGAGAGGAATTGCTGTTTTTATTGGCAAGACAATTTTATTTTATCTGATTTTAATGTTGCTGGTCTATTTCTTTGGTTATCTAGGACATGGTCAAAGTAACTTTATTTATAATGAATTTTAG,gene_name,gene_function +PY_40,,0_refound_100,0_refound_100,LPI*SRLRGIAVFIGKTILFYLILMLLVYFFGYLGHGQSNFIYNEF*X,CTCTTCCGATCTAATCAAGATTGAGAGGAATTGCTTTTTTTTTTGGCAAGACAATTTTATTTTATCTGATTTTAATGTTGCTGGTCTATTTCTTTGGTTATCTAGGACATGGTCAAAGTAACTTTATTTATAATGAATTTTAG,gene_name,gene_function +PY_40,,0_refound_10,0_refound_10,LPI*SRLRGIAVFIGKTILFYLILMLLVYFFGYLGHGQSNFIYNEF*X,CTCTTCCGATCTAATCAAGATTGAGAGGAATTGCGCCTTGGCAAGACAATTTTATTTTATCTGATTTTAATGTTGCTGGTCTATTTCTTTGGTTATCTAGGACATGGTCAAAGTAACTTTATTTATAATGAATTTTAG,gene_name,gene_function +PY_41,,0_refound_1,0_refound_1,ML*ENTWKNTLTGVFLIRSRSRPLIRKGLAVSLARSVKPELKLGICGEHGGDPASIDFYHSQGLTYVSCSPFRVPLTRLAAAQAAIKASGHSLTQDK*X,ATGTTGTAGGAAAATACTTGGAAGAATACGTTGACAGGGGTATTTTTGATAAGGAGCCGTTCCAGACCTTTGATCAGAAAGGGATTGGCCGTCTCTTTAGCCCGTTCGGTTAAGCCTGAGTTGAAACTTGGTATTTGTGGGGAACATGGTGGCGATCCTGCTTCCATTGACTTTTACCACAGCCAAGGCCTGACCTACGTTTCTTGTTCGCCATTTAGAGTGCCGCTTACTCGCTTGGCGGCTGCTCAGGCTGCCATCAAAGCTTCAGGCCACAGTCTTACCCAAGACAAATAG,gene_name,gene_function +PY_42,,0_refound_2,0_refound_2,VTTAYSSQ*KKDNSPNRAIVSKSFIYLKLYRVTPTV*X,ATGTCACTACTGCATATTCATCACAATAAAAAAAAGACAATAGCCCTAATCGTGCTATTGTCTCAAAATCATTTATTTACTTGAAACTTTATCGTGTTACACCAACAGTTTAA,gene_name,gene_function +PY_43,,0_refound_4,0_refound_4,*NAINKMPCFSKKIDKEKAATVSASRAKELEDRLSHQPLIDDYREKMQDARCK*CDSVYHQTYRRSVKQGVNKWQKLX,ATGAAACGCTATCAACAAGATGCCCTGCTTTTCAAAAAAAATAGATAAAGAAAAGGCTGCGACAGTATCTGCAAGCAGGGCAAAAGAACTAGAAGATAGGCTCAGTCATCAGCCATTAATTGATGATTATCGAGAAAAGATGCAAGATGCAAGATGCAAGTGATGTGACTCAGTATATCACCAAACGTATAGAAGATCAGTTAAACAAGGAGTTAACAAATGGCAAAAACTAA,gene_name,gene_function diff --git a/unit_tests/unit_test_data/TestWriteContig/mocky_test_gff.gff b/unit_tests/unit_test_data/TestWriteContig/mocky_test_gff.gff new file mode 100644 index 0000000..8087950 --- /dev/null +++ 
b/unit_tests/unit_test_data/TestWriteContig/mocky_test_gff.gff @@ -0,0 +1,2 @@ +##gff-version 3 +#test comment line From e46e536c8a0514258a57c5ef6b068d24b4fb450e Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Sun, 9 Jan 2022 13:20:44 +1100 Subject: [PATCH 057/135] Change help message expected for functional tests --- functional_tests/test_data/no_input.expected | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/functional_tests/test_data/no_input.expected b/functional_tests/test_data/no_input.expected index 26455d1..72e50ed 100644 --- a/functional_tests/test_data/no_input.expected +++ b/functional_tests/test_data/no_input.expected @@ -1,7 +1,7 @@ usage: Corekaburra -ig file.gff [file.gff ...] -ip path/to/pan_genome [-cg complete_genomes.txt] [-a] [-cc 1.0] [-lc 0.05] - [-o path/to/output] [-p OUTPUT_PREFIX] [-c int] [-l | -q] - [-h] + [-o path/to/output] [-p OUTPUT_PREFIX] [-d] [-c int] + [-l | -q] [-h] Welcome to Corekaburra! Program to determine consensus core sequence from multiple genomes. 
Outputs consensus core gene alignment, distance between core @@ -35,6 +35,10 @@ Output control: current folder] -p OUTPUT_PREFIX, --prefix OUTPUT_PREFIX Prefix for output files, if any is desired + -d, --discard_corrected + Discard gff files corrected with refound genes + identified by Panaroo - Only compativle if pan-genome + comes from Panaroo [Default: Corrected files are kept] Other arguments: -c int, --cpu int Give max number of CPUs [default: 1] From daac64771455f2e9d9de5f42fbf7ab869aee5162 Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Sun, 9 Jan 2022 13:23:09 +1100 Subject: [PATCH 058/135] Add changes to paths of outputs and delete output folder from intentially failing run --- functional_tests/Corekaburra-test.sh | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/functional_tests/Corekaburra-test.sh b/functional_tests/Corekaburra-test.sh index 4dff868..0125230 100755 --- a/functional_tests/Corekaburra-test.sh +++ b/functional_tests/Corekaburra-test.sh @@ -278,19 +278,22 @@ test_output_file test_out_folder/no_accessory_core_segments.csv Less_than_all_gf rm -r test_out_folder # TODO - Test unsuccessful reannotation of Panaroo -call_new_test "Test exit status for a bad command line invocation" +call_new_test "Test unsuccessful reannotation of Panaroo" test_exit_status "$test_program -ig complete_genome_single_chrom.gff genome_single_chrom_larger.gff genome_single_chrom_larger_rearrange.gff complete_genome_single_chrom_2.gff -ip Reannotate_run_fail -o test_out_folder > /dev/null 2>&1" 3 +rm -r test_out_folder # TODO - test Panaroo input w. 
correction +call_new_test "Test Panaroo input with correction of gff files" Corekaburra -ig complete_genome_single_chrom.gff genome_single_chrom_larger.gff genome_single_chrom_larger_rearrange.gff complete_genome_single_chrom_2.gff -ip Reannotate_run_succes/ -o test_out_folder/ test_output_file test_out_folder/core_core_accessory_gene_content.tsv Reannotation_sucessful_expected/core_core_accessory_gene_content.tsv.expected test_output_file test_out_folder/low_frequency_gene_placement.tsv Reannotation_sucessful_expected/low_frequency_gene_placement.tsv.expected test_output_file test_out_folder/core_pair_summary.csv Reannotation_sucessful_expected/core_pair_summary.csv.expected test_output_file test_out_folder/core_segments.csv Reannotation_sucessful_expected/core_segments.csv.expected test_output_file test_out_folder/no_accessory_core_segments.csv Reannotation_sucessful_expected/no_accessory_core_segments.csv.expected -test_output_file test_out_folder/complete_genome_single_chrom_2_corrected.gff Reannotation_sucessful_expected/Corrected_gff_files/complete_genome_single_chrom_2_corrected.gff.expected -test_output_file test_out_folder/complete_genome_single_chrom_corrected.gff Reannotation_sucessful_expected/Corrected_gff_files/complete_genome_single_chrom_corrected.gff.expected -test_output_file test_out_folder/genome_single_chrom_larger_rearrange_corrected.gff Reannotation_sucessful_expected/Corrected_gff_files/genome_single_chrom_larger_rearrange_corrected.gff.expected +test_output_file test_out_folder/Corrected_gff_files/complete_genome_single_chrom_2_corrected.gff Reannotation_sucessful_expected/Corrected_gff_files/complete_genome_single_chrom_2_corrected.gff.expected +test_output_file test_out_folder/Corrected_gff_files/complete_genome_single_chrom_corrected.gff Reannotation_sucessful_expected/Corrected_gff_files/complete_genome_single_chrom_corrected.gff.expected +test_output_file 
test_out_folder/Corrected_gff_files/genome_single_chrom_larger_rearrange_corrected.gff Reannotation_sucessful_expected/Corrected_gff_files/genome_single_chrom_larger_rearrange_corrected.gff.expected +rm -r test_out_folder # TODO - test for core genes being fragmented. From 78d5867c22cb829a9f1e091170d64df99068a080 Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Sun, 9 Jan 2022 13:26:54 +1100 Subject: [PATCH 059/135] Remove the expecteation of segments from refound correction test, as none are found --- functional_tests/Corekaburra-test.sh | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/functional_tests/Corekaburra-test.sh b/functional_tests/Corekaburra-test.sh index 0125230..2a79efe 100755 --- a/functional_tests/Corekaburra-test.sh +++ b/functional_tests/Corekaburra-test.sh @@ -284,12 +284,10 @@ rm -r test_out_folder # TODO - test Panaroo input w. correction call_new_test "Test Panaroo input with correction of gff files" -Corekaburra -ig complete_genome_single_chrom.gff genome_single_chrom_larger.gff genome_single_chrom_larger_rearrange.gff complete_genome_single_chrom_2.gff -ip Reannotate_run_succes/ -o test_out_folder/ +Corekaburra -ig complete_genome_single_chrom.gff genome_single_chrom_larger.gff genome_single_chrom_larger_rearrange.gff complete_genome_single_chrom_2.gff -ip Reannotate_run_succes/ -o test_out_folder/ > /dev/null 2>&1 test_output_file test_out_folder/core_core_accessory_gene_content.tsv Reannotation_sucessful_expected/core_core_accessory_gene_content.tsv.expected test_output_file test_out_folder/low_frequency_gene_placement.tsv Reannotation_sucessful_expected/low_frequency_gene_placement.tsv.expected test_output_file test_out_folder/core_pair_summary.csv Reannotation_sucessful_expected/core_pair_summary.csv.expected -test_output_file test_out_folder/core_segments.csv Reannotation_sucessful_expected/core_segments.csv.expected -test_output_file 
test_out_folder/no_accessory_core_segments.csv Reannotation_sucessful_expected/no_accessory_core_segments.csv.expected test_output_file test_out_folder/Corrected_gff_files/complete_genome_single_chrom_2_corrected.gff Reannotation_sucessful_expected/Corrected_gff_files/complete_genome_single_chrom_2_corrected.gff.expected test_output_file test_out_folder/Corrected_gff_files/complete_genome_single_chrom_corrected.gff Reannotation_sucessful_expected/Corrected_gff_files/complete_genome_single_chrom_corrected.gff.expected test_output_file test_out_folder/Corrected_gff_files/genome_single_chrom_larger_rearrange_corrected.gff Reannotation_sucessful_expected/Corrected_gff_files/genome_single_chrom_larger_rearrange_corrected.gff.expected From 7775325c6649aa56f5f554750cf051c081e1dd8d Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Mon, 10 Jan 2022 08:43:41 +1100 Subject: [PATCH 060/135] Add in an adjustments for identifying fragmneted genes and their position in genomes. Add in adjsuted unit-test for aformentioned. 
Add in functional tests for handling single core gene on draft and complete genome, and fragmeneted core and accessory --- Corekaburra/parse_gene_presence_absence.py | 32 ++++++++--------- functional_tests/Corekaburra-test.sh | 36 +++++++++++++++++-- .../gene_presence_absence.csv | 8 +++++ ...e_core_accessory_gene_content.tsv.expected | 11 ++++++ .../core_pair_summary.csv.expected | 4 +++ .../low_frequency_gene_placement.tsv.expected | 13 +++++++ .../gene_presence_absence.csv | 8 +++++ .../gene_presence_absence.csv | 8 +++++ ...e_core_accessory_gene_content.tsv.expected | 11 ++++++ .../core_pair_summary.csv.expected | 4 +++ .../low_frequency_gene_placement.tsv.expected | 13 +++++++ .../gene_presence_absence.csv | 7 ++++ .../complete_genome_double_chrom.gff | 12 +++---- ...e_core_accessory_gene_content.tsv.expected | 4 +++ .../core_pair_summary.csv.expected | 5 +++ .../low_frequency_gene_placement.tsv.expected | 6 ++++ ...e_core_accessory_gene_content.tsv.expected | 3 ++ .../core_pair_summary.csv.expected | 6 ++++ .../low_frequency_gene_placement.tsv.expected | 9 +++++ unit_tests/Corekaburra_test.py | 20 ++++++----- 20 files changed, 186 insertions(+), 34 deletions(-) create mode 100644 functional_tests/test_data/Fragmented_accessory_gene_run/gene_presence_absence.csv create mode 100644 functional_tests/test_data/Fragmented_accessory_gene_run_expected/core_core_accessory_gene_content.tsv.expected create mode 100644 functional_tests/test_data/Fragmented_accessory_gene_run_expected/core_pair_summary.csv.expected create mode 100644 functional_tests/test_data/Fragmented_accessory_gene_run_expected/low_frequency_gene_placement.tsv.expected create mode 100644 functional_tests/test_data/Fragmented_core_gene_break_run/gene_presence_absence.csv create mode 100644 functional_tests/test_data/Fragmented_core_gene_run/gene_presence_absence.csv create mode 100644 functional_tests/test_data/Fragmented_core_run_expected/core_core_accessory_gene_content.tsv.expected create mode 
100644 functional_tests/test_data/Fragmented_core_run_expected/core_pair_summary.csv.expected create mode 100644 functional_tests/test_data/Fragmented_core_run_expected/low_frequency_gene_placement.tsv.expected create mode 100644 functional_tests/test_data/Single_core_contig/gene_presence_absence.csv create mode 100644 functional_tests/test_data/single_core_contig_complete_expected/core_core_accessory_gene_content.tsv.expected create mode 100644 functional_tests/test_data/single_core_contig_complete_expected/core_pair_summary.csv.expected create mode 100644 functional_tests/test_data/single_core_contig_complete_expected/low_frequency_gene_placement.tsv.expected create mode 100644 functional_tests/test_data/single_core_contig_draft_expected/core_core_accessory_gene_content.tsv.expected create mode 100644 functional_tests/test_data/single_core_contig_draft_expected/core_pair_summary.csv.expected create mode 100644 functional_tests/test_data/single_core_contig_draft_expected/low_frequency_gene_placement.tsv.expected diff --git a/Corekaburra/parse_gene_presence_absence.py b/Corekaburra/parse_gene_presence_absence.py index 5d363b7..1125ded 100644 --- a/Corekaburra/parse_gene_presence_absence.py +++ b/Corekaburra/parse_gene_presence_absence.py @@ -13,7 +13,6 @@ def add_gene_to_dict(main_dict, gene, pan_gene_name, genome): :param genome: The name of the genome in question :return: returns the dict to be used further """ - if ';' in gene: for gene_part in gene.split(';'): # TODO - NOTE! 
HERE BOTH GENES IN A PAIR IS ADDED as separate key/value-pairs main_dict[genome][gene_part] = pan_gene_name @@ -23,25 +22,25 @@ def add_gene_to_dict(main_dict, gene, pan_gene_name, genome): return main_dict -def check_fragmented_gene(fragments_in_line, input_gffs, tmp_folder_path): +def check_fragmented_gene(fragment_info, input_gffs, tmp_folder_path): """ Function that check for that placement of fragmented gene parts, to determine if they are neighbouring or have some genomic feature between them - :param fragments_in_line: List of genes that are found to be fragmented, one composite of fragments for each index + :param fragment_info: List of genes that are found to be fragmented, one composite of fragments for each index :param input_gffs: A list of file-paths to the gff files given as input :param tmp_folder_path: A file-path to the temporary folder of the Corekaburra run :return: A List of booleans indicating if a fragments has nothing in between fragments (True) or not (False) """ return_list = [] - for fragment in fragments_in_line: + for fragment in fragment_info: # split the two fragments - fragments = fragment.split(';') + fragment_pieces = fragment[0].split(';') # Get the name of the genome - genome = fragments[0].rsplit("_", 1)[0] + genome = fragment[1] # Get the gff and its path try: - gff_file = [file for file in input_gffs if f'{genome}.gff' in file][0] + gff_file = [file for file in input_gffs if f'{genome}.gff' in file][0] # TODO - fix that using a locus_tag it is not possible to identify genes. How do we make it so that we can? except IndexError: raise NotImplementedError(f'No gff match was found when searching fragments for genome: {genome}') @@ -54,12 +53,12 @@ def check_fragmented_gene(fragments_in_line, input_gffs, tmp_folder_path): gff_database = gffutils.FeatureDB(db_name) # Check that all fragments are on the same contig. 
- first_fragment_contig = gff_database[fragments[0]][0] - frag_same_contig = all([first_fragment_contig == gff_database[fragment][0] for fragment in fragments]) + first_fragment_contig = gff_database[fragment_pieces[0]][0] + frag_same_contig = all([first_fragment_contig == gff_database[fragment][0] for fragment in fragment_pieces]) if frag_same_contig: # Get all coordinates frag_coors = [] - for frag in fragments: + for frag in fragment_pieces: frag_coors.append(gff_database[frag][3]) frag_coors.append(gff_database[frag][4]) @@ -73,7 +72,7 @@ def check_fragmented_gene(fragments_in_line, input_gffs, tmp_folder_path): # find all genes that are not part of the fragmented gene region_locus_tags = set([feature[8]['locus_tag'][0] for feature in region_features]) - excess_genes = region_locus_tags.difference(fragments) + excess_genes = region_locus_tags.difference(fragment_pieces) # check the number of excess genes, if any then False to being core if len(excess_genes) > 0: @@ -154,28 +153,27 @@ def read_gene_presence_absence(pres_abs_file, core_gene_presence, low_freq_gene, no_seq_presence = int(line[4]) # Check if core gene, if then add annotations to genomes - # TODO - Handle genes that have a paralog and are concatenated by ';', and check if neighbours # Check if gene is present in all genomes and no one gene is fragmented if core_gene_isolate_presence <= gene_isolate_presence == no_seq_presence: # Add gene cluster to genomes - for genome in core_gene_dict.keys(): # TODO - Change this to go through genomes with something in them - so that core threshold can be lower + for genome in core_gene_dict.keys(): # Check if there is an annotation for the given genome if len(line[14 + gff_file_dict[genome]]) > 0: core_gene_dict[genome][line[14+gff_file_dict[genome]]] = line[0] core_gene_number += 1 - # Check if gene is present in all genomes, but more than one copy is pressent + # Check if gene is present in all genomes, but more than one copy is present elif 
core_gene_isolate_presence <= gene_isolate_presence: # Identify annotations for genomes that are fragmented genes - fragments_in_line = [genes for genes in line[14:] if ';' in genes] + fragment_info = [[genes, gff] for genes, gff in zip(line[14:], gff_file_names[14:]) if ';' in genes] # Check that each annotation is neighboring the other annotation. - return_list = check_fragmented_gene(fragments_in_line, input_gffs, tmp_folder_path) + return_list = check_fragmented_gene(fragment_info, input_gffs, tmp_folder_path) # TODO - If a core gene is found to be made up of fragments not places close enough (With something in between) should this then not be subtracted from the core gene count? - How would this be handled if there is a gff that is not given as input? # Check if gene was found to be a core gene if all(return_list): # Add the gene to the annotation dict - for genome in core_gene_dict.keys(): + for genome in core_gene_dict.keys(): # TODO - Check if .keys can be omitted # Get the annoations for a specific genome genes_in_genome = line[14 + gff_file_dict[genome]] # If there is an annotation add id diff --git a/functional_tests/Corekaburra-test.sh b/functional_tests/Corekaburra-test.sh index 2a79efe..54c8428 100755 --- a/functional_tests/Corekaburra-test.sh +++ b/functional_tests/Corekaburra-test.sh @@ -277,12 +277,10 @@ test_output_file test_out_folder/core_segments.csv Less_than_all_gffs_run_expect test_output_file test_out_folder/no_accessory_core_segments.csv Less_than_all_gffs_run_expected/no_accessory_core_segments.csv.expected rm -r test_out_folder -# TODO - Test unsuccessful reannotation of Panaroo call_new_test "Test unsuccessful reannotation of Panaroo" test_exit_status "$test_program -ig complete_genome_single_chrom.gff genome_single_chrom_larger.gff genome_single_chrom_larger_rearrange.gff complete_genome_single_chrom_2.gff -ip Reannotate_run_fail -o test_out_folder > /dev/null 2>&1" 3 rm -r test_out_folder -# TODO - test Panaroo input w. 
correction call_new_test "Test Panaroo input with correction of gff files" Corekaburra -ig complete_genome_single_chrom.gff genome_single_chrom_larger.gff genome_single_chrom_larger_rearrange.gff complete_genome_single_chrom_2.gff -ip Reannotate_run_succes/ -o test_out_folder/ > /dev/null 2>&1 test_output_file test_out_folder/core_core_accessory_gene_content.tsv Reannotation_sucessful_expected/core_core_accessory_gene_content.tsv.expected @@ -293,11 +291,45 @@ test_output_file test_out_folder/Corrected_gff_files/complete_genome_single_chro test_output_file test_out_folder/Corrected_gff_files/genome_single_chrom_larger_rearrange_corrected.gff Reannotation_sucessful_expected/Corrected_gff_files/genome_single_chrom_larger_rearrange_corrected.gff.expected rm -r test_out_folder +# TODO - Set up test with a single core gene on a contig that is not complete +call_new_test "Test Panaroo input with correction of gff files" +Corekaburra -ig complete_genome_single_chrom.gff complete_genome_double_chrom.gff -ip Single_core_contig/ -o test_out_folder/ > /dev/null 2>&1 +test_output_file test_out_folder/core_core_accessory_gene_content.tsv single_core_contig_draft_expected/core_core_accessory_gene_content.tsv.expected +test_output_file test_out_folder/low_frequency_gene_placement.tsv single_core_contig_draft_expected/low_frequency_gene_placement.tsv.expected +test_output_file test_out_folder/core_pair_summary.csv single_core_contig_draft_expected/core_pair_summary.csv.expected +rm -r test_out_folder + +# TODO - Set up test with a single core gene on a contig that is complete +call_new_test "Test Panaroo input with correction of gff files" +Corekaburra -ig complete_genome_single_chrom.gff complete_genome_double_chrom.gff -ip Single_core_contig/ -o test_out_folder/ -cg complete_genomes_file > /dev/null 2>&1 +test_output_file test_out_folder/core_core_accessory_gene_content.tsv single_core_contig_complete_expected/core_core_accessory_gene_content.tsv.expected +test_output_file 
test_out_folder/low_frequency_gene_placement.tsv single_core_contig_complete_expected/low_frequency_gene_placement.tsv.expected +test_output_file test_out_folder/core_pair_summary.csv single_core_contig_complete_expected/core_pair_summary.csv.expected +rm -r test_out_folder + # TODO - test for core genes being fragmented. +call_new_test "Test Panaroo input with correction of gff files" +Corekaburra -ig complete_genome_single_chrom.gff genome_single_chrom_larger.gff genome_single_chrom_larger_rearrange.gff complete_genome_single_chrom_2.gff -ip Fragmented_core_gene_run/ -o test_out_folder/ > /dev/null 2>&1 +test_output_file test_out_folder/core_core_accessory_gene_content.tsv Fragmented_core_run_expected/core_core_accessory_gene_content.tsv.expected +test_output_file test_out_folder/low_frequency_gene_placement.tsv Fragmented_core_run_expected/low_frequency_gene_placement.tsv.expected +test_output_file test_out_folder/core_pair_summary.csv Fragmented_core_run_expected/core_pair_summary.csv.expected +rm -r test_out_folder + +# TODO Test a fragmented core gene not accepted as core +#Corekaburra -ig complete_genome_single_chrom.gff genome_single_chrom_larger.gff genome_single_chrom_larger_rearrange.gff complete_genome_single_chrom_2.gff -ip Fragmented_core_gene_break_run/ -o test_out_folder/ +# TODO - run the test check results and transfer to expected folder +#rm -r test_out_folder # TODO - test for accessory genes being fragmented. 
+call_new_test "Test Panaroo input with correction of gff files" +Corekaburra -ig complete_genome_single_chrom.gff genome_single_chrom_larger.gff genome_single_chrom_larger_rearrange.gff complete_genome_single_chrom_2.gff -ip Fragmented_accessory_gene_run/ -o test_out_folder/ > /dev/null 2>&1 +test_output_file test_out_folder/core_core_accessory_gene_content.tsv Fragmented_accessory_gene_run_expected/core_core_accessory_gene_content.tsv.expected +test_output_file test_out_folder/low_frequency_gene_placement.tsv Fragmented_accessory_gene_run_expected/low_frequency_gene_placement.tsv.expected +test_output_file test_out_folder/core_pair_summary.csv Fragmented_accessory_gene_run_expected/core_pair_summary.csv.expected +rm -r test_out_folder +# TODO - set up a test with a core-less contig. # 3. End of testing - check if any errors occurrred if [ "$num_errors" -gt 0 ]; then diff --git a/functional_tests/test_data/Fragmented_accessory_gene_run/gene_presence_absence.csv b/functional_tests/test_data/Fragmented_accessory_gene_run/gene_presence_absence.csv new file mode 100644 index 0000000..fe60eb6 --- /dev/null +++ b/functional_tests/test_data/Fragmented_accessory_gene_run/gene_presence_absence.csv @@ -0,0 +1,8 @@ +"","","","","","","","","","","","","","","genome_single_chrom_larger","complete_genome_single_chrom_2","complete_genome_single_chrom","genome_single_chrom_larger_rearrange" +"A","","","4","4","1","","","","","","","","","single_comp_A","single_comp_2_A","single_comp_A","single_comp_2_A" +"B","","","3","3","1","","","","","","","","","single_comp_B","","single_comp_B","single_comp_2_B" +"C","","","4","4","1","","","","","","","","","single_comp_C","single_comp_2_C","single_comp_C","single_comp_2_C" +"D","","","2","2","1","","","","","","","","","","","","single_comp_2_D" +"E","","","3","4","1.33","","","","","","","","","single_comp_E;single_comp_D","single_comp_2_B","","single_comp_2_E" 
+"F","","","2","2","1","","","","","","","","","single_comp_F","","","single_comp_2_F" +"G","","","1","1","1","","","","","","","","","single_comp_G","","","" \ No newline at end of file diff --git a/functional_tests/test_data/Fragmented_accessory_gene_run_expected/core_core_accessory_gene_content.tsv.expected b/functional_tests/test_data/Fragmented_accessory_gene_run_expected/core_core_accessory_gene_content.tsv.expected new file mode 100644 index 0000000..b8cf701 --- /dev/null +++ b/functional_tests/test_data/Fragmented_accessory_gene_run_expected/core_core_accessory_gene_content.tsv.expected @@ -0,0 +1,11 @@ +Gff Core_gene_1 Core_gene_2 gene type +complete_genome_single_chrom A C B intermediate_frequency +complete_genome_single_chrom_2 A C E intermediate_frequency +genome_single_chrom_larger A C B intermediate_frequency +genome_single_chrom_larger_rearrange A C D intermediate_frequency +genome_single_chrom_larger C Sequence_break E intermediate_frequency +genome_single_chrom_larger C Sequence_break F intermediate_frequency +genome_single_chrom_larger C Sequence_break G low_frequency +genome_single_chrom_larger_rearrange C Sequence_break B intermediate_frequency +genome_single_chrom_larger_rearrange C Sequence_break E intermediate_frequency +genome_single_chrom_larger_rearrange C Sequence_break F intermediate_frequency diff --git a/functional_tests/test_data/Fragmented_accessory_gene_run_expected/core_pair_summary.csv.expected b/functional_tests/test_data/Fragmented_accessory_gene_run_expected/core_pair_summary.csv.expected new file mode 100644 index 0000000..f4afb24 --- /dev/null +++ b/functional_tests/test_data/Fragmented_accessory_gene_run_expected/core_pair_summary.csv.expected @@ -0,0 +1,4 @@ +Core_pair,n,occurrence_core_1,occurrence_core_2,co_occurrence,min_dist,max_dist,mean_dist,median_dist,min_acc,max_acc,mean_acc,median_acc +A-C,4,4,4,4,109,109,109.0,109.0,1,1,1.0,1.0 +A-Sequence_break,4,4,0,0,0,0,0.0,0.0,0,0,0.0,0.0 
+C-Sequence_break,4,4,0,0,10,310,160.0,160.0,0,3,1.5,1.5 diff --git a/functional_tests/test_data/Fragmented_accessory_gene_run_expected/low_frequency_gene_placement.tsv.expected b/functional_tests/test_data/Fragmented_accessory_gene_run_expected/low_frequency_gene_placement.tsv.expected new file mode 100644 index 0000000..e8df429 --- /dev/null +++ b/functional_tests/test_data/Fragmented_accessory_gene_run_expected/low_frequency_gene_placement.tsv.expected @@ -0,0 +1,13 @@ +Gff Core_gene_1 Core_gene_2 Core_region_size Core_region_accessory_count +complete_genome_single_chrom A C 109 1 +complete_genome_single_chrom_2 A C 109 1 +genome_single_chrom_larger A C 109 1 +genome_single_chrom_larger_rearrange A C 109 1 +complete_genome_single_chrom A Sequence_break 0 0 +complete_genome_single_chrom_2 A Sequence_break 0 0 +genome_single_chrom_larger A Sequence_break 0 0 +genome_single_chrom_larger_rearrange A Sequence_break 0 0 +complete_genome_single_chrom C Sequence_break 10 0 +complete_genome_single_chrom_2 C Sequence_break 10 0 +genome_single_chrom_larger C Sequence_break 310 3 +genome_single_chrom_larger_rearrange C Sequence_break 310 3 diff --git a/functional_tests/test_data/Fragmented_core_gene_break_run/gene_presence_absence.csv b/functional_tests/test_data/Fragmented_core_gene_break_run/gene_presence_absence.csv new file mode 100644 index 0000000..f2904ea --- /dev/null +++ b/functional_tests/test_data/Fragmented_core_gene_break_run/gene_presence_absence.csv @@ -0,0 +1,8 @@ +"","","","","","","","","","","","","","","genome_single_chrom_larger","complete_genome_single_chrom_2","complete_genome_single_chrom","genome_single_chrom_larger_rearrange" +"A","","","4","5","1.25","","","","","","","","","single_comp_A;single_comp_D","single_comp_2_A","single_comp_A","single_comp_2_A" +"B","","","3","3","1","","","","","","","","","single_comp_B","","single_comp_B","single_comp_2_B" 
+"C","","","4","4","1","","","","","","","","","single_comp_C","single_comp_2_C","single_comp_C","single_comp_2_C" +"D","","","1","1","1","","","","","","","","","","","","single_comp_2_D" +"E","","","3","3","1","","","","","","","","","single_comp_E","single_comp_2_B","","single_comp_2_E" +"F","","","2","2","1","","","","","","","","","single_comp_F","","","single_comp_2_F" +"G","","","1","1","1","","","","","","","","","single_comp_G","","","" \ No newline at end of file diff --git a/functional_tests/test_data/Fragmented_core_gene_run/gene_presence_absence.csv b/functional_tests/test_data/Fragmented_core_gene_run/gene_presence_absence.csv new file mode 100644 index 0000000..8223354 --- /dev/null +++ b/functional_tests/test_data/Fragmented_core_gene_run/gene_presence_absence.csv @@ -0,0 +1,8 @@ +"","","","","","","","","","","","","","","genome_single_chrom_larger","complete_genome_single_chrom_2","complete_genome_single_chrom","genome_single_chrom_larger_rearrange" +"A","","","4","5","1.25","","","","","","","","","single_comp_A;single_comp_B","single_comp_2_A","single_comp_A","single_comp_2_A" +"B","","","2","2","1","","","","","","","","","","","single_comp_B","single_comp_2_B" +"C","","","4","4","1","","","","","","","","","single_comp_C","single_comp_2_C","single_comp_C","single_comp_2_C" +"D","","","1","1","1","","","","","","","","","single_comp_D","","","single_comp_2_D" +"E","","","3","3","1","","","","","","","","","single_comp_E","single_comp_2_B","","single_comp_2_E" +"F","","","2","2","1","","","","","","","","","single_comp_F","","","single_comp_2_F" +"G","","","1","1","1","","","","","","","","","single_comp_G","","","" \ No newline at end of file diff --git a/functional_tests/test_data/Fragmented_core_run_expected/core_core_accessory_gene_content.tsv.expected b/functional_tests/test_data/Fragmented_core_run_expected/core_core_accessory_gene_content.tsv.expected new file mode 100644 index 0000000..7e1ce8b --- /dev/null +++ 
b/functional_tests/test_data/Fragmented_core_run_expected/core_core_accessory_gene_content.tsv.expected @@ -0,0 +1,11 @@ +Gff Core_gene_1 Core_gene_2 gene type +complete_genome_single_chrom A C B intermediate_frequency +complete_genome_single_chrom_2 A C E intermediate_frequency +genome_single_chrom_larger_rearrange A C D low_frequency +genome_single_chrom_larger C Sequence_break E intermediate_frequency +genome_single_chrom_larger C Sequence_break F intermediate_frequency +genome_single_chrom_larger C Sequence_break D low_frequency +genome_single_chrom_larger C Sequence_break G low_frequency +genome_single_chrom_larger_rearrange C Sequence_break B intermediate_frequency +genome_single_chrom_larger_rearrange C Sequence_break E intermediate_frequency +genome_single_chrom_larger_rearrange C Sequence_break F intermediate_frequency diff --git a/functional_tests/test_data/Fragmented_core_run_expected/core_pair_summary.csv.expected b/functional_tests/test_data/Fragmented_core_run_expected/core_pair_summary.csv.expected new file mode 100644 index 0000000..6167deb --- /dev/null +++ b/functional_tests/test_data/Fragmented_core_run_expected/core_pair_summary.csv.expected @@ -0,0 +1,4 @@ +Core_pair,n,occurrence_core_1,occurrence_core_2,co_occurrence,min_dist,max_dist,mean_dist,median_dist,min_acc,max_acc,mean_acc,median_acc +A-C,4,4,4,4,9,109,84.0,109.0,0,1,0.8,1.0 +A-Sequence_break,4,4,0,0,0,0,0.0,0.0,0,0,0.0,0.0 +C-Sequence_break,4,4,0,0,10,310,160.0,160.0,0,4,1.8,1.5 diff --git a/functional_tests/test_data/Fragmented_core_run_expected/low_frequency_gene_placement.tsv.expected b/functional_tests/test_data/Fragmented_core_run_expected/low_frequency_gene_placement.tsv.expected new file mode 100644 index 0000000..78dcc4c --- /dev/null +++ b/functional_tests/test_data/Fragmented_core_run_expected/low_frequency_gene_placement.tsv.expected @@ -0,0 +1,13 @@ +Gff Core_gene_1 Core_gene_2 Core_region_size Core_region_accessory_count +complete_genome_single_chrom A C 109 1 
+complete_genome_single_chrom_2 A C 109 1 +genome_single_chrom_larger A C 9 0 +genome_single_chrom_larger_rearrange A C 109 1 +complete_genome_single_chrom A Sequence_break 0 0 +complete_genome_single_chrom_2 A Sequence_break 0 0 +genome_single_chrom_larger A Sequence_break 0 0 +genome_single_chrom_larger_rearrange A Sequence_break 0 0 +complete_genome_single_chrom C Sequence_break 10 0 +complete_genome_single_chrom_2 C Sequence_break 10 0 +genome_single_chrom_larger C Sequence_break 310 4 +genome_single_chrom_larger_rearrange C Sequence_break 310 3 diff --git a/functional_tests/test_data/Single_core_contig/gene_presence_absence.csv b/functional_tests/test_data/Single_core_contig/gene_presence_absence.csv new file mode 100644 index 0000000..7ba0367 --- /dev/null +++ b/functional_tests/test_data/Single_core_contig/gene_presence_absence.csv @@ -0,0 +1,7 @@ +"","","","","","","","","","","","","","","complete_genome_single_chrom","complete_genome_double_chrom" +"A","","","2","2","1","","","","","","","","","single_comp_A","dub_chrom_A" +"B","","","2","2","1","","","","","","","","","single_comp_B","dub_chrom_B" +"C","","","1","1","1","","","","","","","","","","dub_chrom_C" +"D","","","1","1","1","","","","","","","","","","dub_chrom_D" +"E","","","2","2","1","","","","","","","","","single_comp_C","dub_chrom_E" +"F","","","1","1","1","","","","","","","","","","dub_chrom_F" diff --git a/functional_tests/test_data/complete_genome_double_chrom.gff b/functional_tests/test_data/complete_genome_double_chrom.gff index fc59b4a..73e361d 100644 --- a/functional_tests/test_data/complete_genome_double_chrom.gff +++ b/functional_tests/test_data/complete_genome_double_chrom.gff @@ -1,10 +1,10 @@ ##gff-version3 -contig_1 . CDS 1 90 . . . ID=dub_chrom_A;Other_info -contig_1 . CDS 100 190 . . . ID=dub_chrom_B;Other_info -contig_1 . CDS 200 290 . . . ID=dub_chrom_C;Other_info -contig_2 . CDS 1 90 . . . ID=dub_chrom_D;Other_info -contig_2 . CDS 100 190 . . . 
ID=dub_chrom_E;Other_info -contig_2 . CDS 200 290 . . . ID=dub_chrom_F;Other_info +contig_1 . CDS 1 90 . . . ID=dub_chrom_A;Other_info;locus_tag=locus_tag=0001 +contig_1 . CDS 100 190 . . . ID=dub_chrom_B;Other_info;locus_tag=locus_tag=0002 +contig_1 . CDS 200 290 . . . ID=dub_chrom_C;Other_info;locus_tag=locus_tag=0003 +contig_2 . CDS 1 90 . . . ID=dub_chrom_D;Other_info;locus_tag=locus_tag=0004 +contig_2 . CDS 100 190 . . . ID=dub_chrom_E;Other_info;locus_tag=locus_tag=0005 +contig_2 . CDS 200 290 . . . ID=dub_chrom_F;Other_info;locus_tag=locus_tag=0006 ##FASTA >contig_1 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN diff --git a/functional_tests/test_data/single_core_contig_complete_expected/core_core_accessory_gene_content.tsv.expected b/functional_tests/test_data/single_core_contig_complete_expected/core_core_accessory_gene_content.tsv.expected new file mode 100644 index 0000000..5ebfe10 --- /dev/null +++ b/functional_tests/test_data/single_core_contig_complete_expected/core_core_accessory_gene_content.tsv.expected @@ -0,0 +1,4 @@ +Gff Core_gene_1 Core_gene_2 gene type +complete_genome_double_chrom A B C low_frequency +complete_genome_double_chrom E E D low_frequency +complete_genome_double_chrom E E F low_frequency diff --git a/functional_tests/test_data/single_core_contig_complete_expected/core_pair_summary.csv.expected b/functional_tests/test_data/single_core_contig_complete_expected/core_pair_summary.csv.expected new file mode 100644 index 0000000..5ff94bd --- /dev/null +++ b/functional_tests/test_data/single_core_contig_complete_expected/core_pair_summary.csv.expected @@ -0,0 +1,5 @@ 
+Core_pair,n,occurrence_core_1,occurrence_core_2,co_occurrence,min_dist,max_dist,mean_dist,median_dist,min_acc,max_acc,mean_acc,median_acc +A-B,2,2,2,2,9,110,59.5,59.5,0,1,0.5,0.5 +A-E,1,2,2,2,10,10,10.0,10.0,0,0,0.0,0.0 +B-E,1,2,2,2,9,9,9.0,9.0,0,0,0.0,0.0 +E-E,1,4,4,2,209,209,209.0,209.0,2,2,2.0,2.0 diff --git a/functional_tests/test_data/single_core_contig_complete_expected/low_frequency_gene_placement.tsv.expected b/functional_tests/test_data/single_core_contig_complete_expected/low_frequency_gene_placement.tsv.expected new file mode 100644 index 0000000..f065c53 --- /dev/null +++ b/functional_tests/test_data/single_core_contig_complete_expected/low_frequency_gene_placement.tsv.expected @@ -0,0 +1,6 @@ +Gff Core_gene_1 Core_gene_2 Core_region_size Core_region_accessory_count +complete_genome_double_chrom A B 110 1 +complete_genome_single_chrom A B 9 0 +complete_genome_single_chrom A E 10 0 +complete_genome_single_chrom B E 9 0 +complete_genome_double_chrom E E 209 2 diff --git a/functional_tests/test_data/single_core_contig_draft_expected/core_core_accessory_gene_content.tsv.expected b/functional_tests/test_data/single_core_contig_draft_expected/core_core_accessory_gene_content.tsv.expected new file mode 100644 index 0000000..ec996bc --- /dev/null +++ b/functional_tests/test_data/single_core_contig_draft_expected/core_core_accessory_gene_content.tsv.expected @@ -0,0 +1,3 @@ +Gff Core_gene_1 Core_gene_2 gene type +complete_genome_double_chrom B Sequence_break C low_frequency +complete_genome_double_chrom E Sequence_break F low_frequency diff --git a/functional_tests/test_data/single_core_contig_draft_expected/core_pair_summary.csv.expected b/functional_tests/test_data/single_core_contig_draft_expected/core_pair_summary.csv.expected new file mode 100644 index 0000000..aacd5c6 --- /dev/null +++ b/functional_tests/test_data/single_core_contig_draft_expected/core_pair_summary.csv.expected @@ -0,0 +1,6 @@ 
+Core_pair,n,occurrence_core_1,occurrence_core_2,co_occurrence,min_dist,max_dist,mean_dist,median_dist,min_acc,max_acc,mean_acc,median_acc +A-B,2,2,2,2,9,9,9.0,9.0,0,0,0.0,0.0 +A-Sequence_break,2,2,0,0,0,0,0.0,0.0,0,0,0.0,0.0 +B-E,1,2,2,2,9,9,9.0,9.0,0,0,0.0,0.0 +B-Sequence_break,1,2,0,0,110,110,110.0,110.0,1,1,1.0,1.0 +E-Sequence_break,2,2,0,0,10,110,60.0,60.0,0,1,0.5,0.5 diff --git a/functional_tests/test_data/single_core_contig_draft_expected/low_frequency_gene_placement.tsv.expected b/functional_tests/test_data/single_core_contig_draft_expected/low_frequency_gene_placement.tsv.expected new file mode 100644 index 0000000..96337b1 --- /dev/null +++ b/functional_tests/test_data/single_core_contig_draft_expected/low_frequency_gene_placement.tsv.expected @@ -0,0 +1,9 @@ +Gff Core_gene_1 Core_gene_2 Core_region_size Core_region_accessory_count +complete_genome_double_chrom A B 9 0 +complete_genome_single_chrom A B 9 0 +complete_genome_double_chrom A Sequence_break 0 0 +complete_genome_single_chrom A Sequence_break 0 0 +complete_genome_single_chrom B E 9 0 +complete_genome_double_chrom B Sequence_break 110 1 +complete_genome_double_chrom E Sequence_break 110 1 +complete_genome_single_chrom E Sequence_break 10 0 diff --git a/unit_tests/Corekaburra_test.py b/unit_tests/Corekaburra_test.py index 6667f8e..d604907 100644 --- a/unit_tests/Corekaburra_test.py +++ b/unit_tests/Corekaburra_test.py @@ -228,7 +228,7 @@ def tearDown(self): def test_fragmented_gene_true(self): """ Gene is fragmented but found next to each other with nothing in between """ - fragments_in_line = ['Silas_the_Salmonella_tag-1-2.1;Silas_the_Salmonella_tag-1-2.2'] + fragments_info = [['Silas_the_Salmonella_tag-1-2.1;Silas_the_Salmonella_tag-1-2.2', 'Silas_the_Salmonella']] input_gffs =['TestParsingGenePresenceAbsenceFile/Silas_the_Salmonella.gff', 'TestParsingGenePresenceAbsenceFile/Christina_the_Streptococcus.gff', 'TestParsingGenePresenceAbsenceFile/Ajwa_the_Shigella.gff', @@ -243,13 +243,13 @@ def 
test_fragmented_gene_true(self): expected_return = [True] - return_bool = parse_gene_presence_absence.check_fragmented_gene(fragments_in_line, input_gffs, tmp_folder_path) + return_bool = parse_gene_presence_absence.check_fragmented_gene(fragments_info, input_gffs, tmp_folder_path) self.assertEqual(expected_return, return_bool) def test_fragmented_gene_fasle(self): """ Gene is fragmented but found next to each other with another gene in between """ - fragments_in_line = ['Silas_the_Salmonella_tag-1-5.1;Silas_the_Salmonella_tag-1-5.2'] + fragments_info = [['Silas_the_Salmonella_tag-1-5.1;Silas_the_Salmonella_tag-1-5.2', 'Silas_the_Salmonella']] input_gffs = ['TestParsingGenePresenceAbsenceFile/Silas_the_Salmonella.gff', 'TestParsingGenePresenceAbsenceFile/Christina_the_Streptococcus.gff', 'TestParsingGenePresenceAbsenceFile/Ajwa_the_Shigella.gff', @@ -264,13 +264,14 @@ def test_fragmented_gene_fasle(self): expected_return = [False] - return_bool = parse_gene_presence_absence.check_fragmented_gene(fragments_in_line, input_gffs, tmp_folder_path) + return_bool = parse_gene_presence_absence.check_fragmented_gene(fragments_info, input_gffs, tmp_folder_path) self.assertEqual(expected_return, return_bool) def test_fragmented_gene_mutiple_genes_fasle(self): """ Two genes fragmented with one having nothing and the other having something in between fragments """ - fragments_in_line = ['Silas_the_Salmonella_tag-1-2.1;Silas_the_Salmonella_tag-1-2.2', 'Silas_the_Salmonella_tag-1-5.1;Silas_the_Salmonella_tag-1-5.2'] + fragment_info = [['Silas_the_Salmonella_tag-1-2.1;Silas_the_Salmonella_tag-1-2.2', 'Silas_the_Salmonella'], + ['Silas_the_Salmonella_tag-1-5.1;Silas_the_Salmonella_tag-1-5.2', 'Silas_the_Salmonella']] input_gffs = ['TestParsingGenePresenceAbsenceFile/Silas_the_Salmonella.gff', 'TestParsingGenePresenceAbsenceFile/Christina_the_Streptococcus.gff', 'TestParsingGenePresenceAbsenceFile/Ajwa_the_Shigella.gff', @@ -285,14 +286,14 @@ def 
test_fragmented_gene_mutiple_genes_fasle(self): expected_return = [True, False] - return_bool = parse_gene_presence_absence.check_fragmented_gene(fragments_in_line, input_gffs, tmp_folder_path) + return_bool = parse_gene_presence_absence.check_fragmented_gene(fragment_info, input_gffs, tmp_folder_path) self.assertEqual(expected_return, return_bool) def test_fragments_on_separate_contigs(self): """ One gene fragmented with parts on separate contigs """ - fragments_in_line = ['Silas_the_Salmonella_tag-1-2.1;Silas_the_Salmonella_tag-1-2.2', - 'Silas_the_Salmonella_tag-1-5.1;Silas_the_Salmonella_tag-1-5.2'] + fragments_info = [['Silas_the_Salmonella_tag-1-2.1;Silas_the_Salmonella_tag-1-2.2', 'Silas_the_Salmonella'], + ['Silas_the_Salmonella_tag-1-5.1;Silas_the_Salmonella_tag-1-5.2', 'Silas_the_Salmonella']] input_gffs = ['TestCheckingFragmentedGenes/Silas_the_Salmonella.gff', 'TestCheckingFragmentedGenes/Zion_the_Streptococcus.gff', 'TestCheckingFragmentedGenes/Silas_the_Legionella.gff', @@ -301,7 +302,7 @@ def test_fragments_on_separate_contigs(self): expected_return = [False, False] - return_bool = parse_gene_presence_absence.check_fragmented_gene(fragments_in_line, input_gffs, tmp_folder_path) + return_bool = parse_gene_presence_absence.check_fragmented_gene(fragments_info, input_gffs, tmp_folder_path) self.assertEqual(expected_return, return_bool) @@ -935,6 +936,7 @@ def test_gene_not_found(self): # TODO - Add test for annotating of second contig + class TestExtractGenomeFasta(unittest.TestCase): def test_extract_genome_fasta(self): genome_fasta_dict_expected = {'contig} From 1ee3f7b068329e249966f0670750f1dd2dae7616 Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Mon, 10 Jan 2022 09:21:26 +1100 Subject: [PATCH 061/135] Make small changes to test for fragmented core genes to make it run correctly. 
Rename some tests --- functional_tests/Corekaburra-test.sh | 8 ++++---- .../gene_presence_absence.csv | 14 +++++++------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/functional_tests/Corekaburra-test.sh b/functional_tests/Corekaburra-test.sh index 54c8428..c180b73 100755 --- a/functional_tests/Corekaburra-test.sh +++ b/functional_tests/Corekaburra-test.sh @@ -292,7 +292,7 @@ test_output_file test_out_folder/Corrected_gff_files/genome_single_chrom_larger_ rm -r test_out_folder # TODO - Set up test with a single core gene on a contig that is not complete -call_new_test "Test Panaroo input with correction of gff files" +call_new_test "Test with a single core gene on a contig that is not complete" Corekaburra -ig complete_genome_single_chrom.gff complete_genome_double_chrom.gff -ip Single_core_contig/ -o test_out_folder/ > /dev/null 2>&1 test_output_file test_out_folder/core_core_accessory_gene_content.tsv single_core_contig_draft_expected/core_core_accessory_gene_content.tsv.expected test_output_file test_out_folder/low_frequency_gene_placement.tsv single_core_contig_draft_expected/low_frequency_gene_placement.tsv.expected @@ -300,7 +300,7 @@ test_output_file test_out_folder/core_pair_summary.csv single_core_contig_draft_ rm -r test_out_folder # TODO - Set up test with a single core gene on a contig that is complete -call_new_test "Test Panaroo input with correction of gff files" +call_new_test "Test with a single core gene on a contig that is complete" Corekaburra -ig complete_genome_single_chrom.gff complete_genome_double_chrom.gff -ip Single_core_contig/ -o test_out_folder/ -cg complete_genomes_file > /dev/null 2>&1 test_output_file test_out_folder/core_core_accessory_gene_content.tsv single_core_contig_complete_expected/core_core_accessory_gene_content.tsv.expected test_output_file test_out_folder/low_frequency_gene_placement.tsv single_core_contig_complete_expected/low_frequency_gene_placement.tsv.expected @@ -308,7 +308,7 @@ test_output_file 
test_out_folder/core_pair_summary.csv single_core_contig_comple rm -r test_out_folder # TODO - test for core genes being fragmented. -call_new_test "Test Panaroo input with correction of gff files" +call_new_test "Test for core genes being fragmented" Corekaburra -ig complete_genome_single_chrom.gff genome_single_chrom_larger.gff genome_single_chrom_larger_rearrange.gff complete_genome_single_chrom_2.gff -ip Fragmented_core_gene_run/ -o test_out_folder/ > /dev/null 2>&1 test_output_file test_out_folder/core_core_accessory_gene_content.tsv Fragmented_core_run_expected/core_core_accessory_gene_content.tsv.expected test_output_file test_out_folder/low_frequency_gene_placement.tsv Fragmented_core_run_expected/low_frequency_gene_placement.tsv.expected @@ -321,7 +321,7 @@ rm -r test_out_folder #rm -r test_out_folder # TODO - test for accessory genes being fragmented. -call_new_test "Test Panaroo input with correction of gff files" +call_new_test "Test for accessory genes being fragmented" Corekaburra -ig complete_genome_single_chrom.gff genome_single_chrom_larger.gff genome_single_chrom_larger_rearrange.gff complete_genome_single_chrom_2.gff -ip Fragmented_accessory_gene_run/ -o test_out_folder/ > /dev/null 2>&1 test_output_file test_out_folder/core_core_accessory_gene_content.tsv Fragmented_accessory_gene_run_expected/core_core_accessory_gene_content.tsv.expected test_output_file test_out_folder/low_frequency_gene_placement.tsv Fragmented_accessory_gene_run_expected/low_frequency_gene_placement.tsv.expected diff --git a/functional_tests/test_data/Fragmented_core_gene_run/gene_presence_absence.csv b/functional_tests/test_data/Fragmented_core_gene_run/gene_presence_absence.csv index 8223354..0418a20 100644 --- a/functional_tests/test_data/Fragmented_core_gene_run/gene_presence_absence.csv +++ b/functional_tests/test_data/Fragmented_core_gene_run/gene_presence_absence.csv @@ -1,8 +1,8 @@ 
-"","","","","","","","","","","","","","","genome_single_chrom_larger","complete_genome_single_chrom_2","complete_genome_single_chrom","genome_single_chrom_larger_rearrange" -"A","","","4","5","1.25","","","","","","","","","single_comp_A;single_comp_B","single_comp_2_A","single_comp_A","single_comp_2_A" +"","","","","","","","","","","","","","","genome_single_chrom_larger_2","complete_genome_single_chrom_2","complete_genome_single_chrom","genome_single_chrom_larger_rearrange" +"A","","","4","5","1.25","","","","","","","","","locus_tag_0001;locus_tag_0002","single_comp_2_A","single_comp_A","single_comp_2_A" "B","","","2","2","1","","","","","","","","","","","single_comp_B","single_comp_2_B" -"C","","","4","4","1","","","","","","","","","single_comp_C","single_comp_2_C","single_comp_C","single_comp_2_C" -"D","","","1","1","1","","","","","","","","","single_comp_D","","","single_comp_2_D" -"E","","","3","3","1","","","","","","","","","single_comp_E","single_comp_2_B","","single_comp_2_E" -"F","","","2","2","1","","","","","","","","","single_comp_F","","","single_comp_2_F" -"G","","","1","1","1","","","","","","","","","single_comp_G","","","" \ No newline at end of file +"C","","","4","4","1","","","","","","","","","locus_tag_0003","single_comp_2_C","single_comp_C","single_comp_2_C" +"D","","","1","1","1","","","","","","","","","locus_tag_0004","","","single_comp_2_D" +"E","","","3","3","1","","","","","","","","","locus_tag_0005","single_comp_2_B","","single_comp_2_E" +"F","","","2","2","1","","","","","","","","","locus_tag_0006","","","single_comp_2_F" +"G","","","1","1","1","","","","","","","","","locus_tag_0007","","","" \ No newline at end of file From 3dcd2e93de0b03fd52c2329da7900a7aa0a9b2c0 Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Mon, 10 Jan 2022 09:25:14 +1100 Subject: [PATCH 062/135] Add in the new genome for fragmented core gene test --- .../core_core_accessory_gene_content.tsv.expected | 8 ++++---- 
.../low_frequency_gene_placement.tsv.expected | 6 +++--- .../test_data/genome_single_chrom_larger_2.gff | 11 +++++++++++ 3 files changed, 18 insertions(+), 7 deletions(-) create mode 100644 functional_tests/test_data/genome_single_chrom_larger_2.gff diff --git a/functional_tests/test_data/Fragmented_core_run_expected/core_core_accessory_gene_content.tsv.expected b/functional_tests/test_data/Fragmented_core_run_expected/core_core_accessory_gene_content.tsv.expected index 7e1ce8b..eaff33b 100644 --- a/functional_tests/test_data/Fragmented_core_run_expected/core_core_accessory_gene_content.tsv.expected +++ b/functional_tests/test_data/Fragmented_core_run_expected/core_core_accessory_gene_content.tsv.expected @@ -2,10 +2,10 @@ Gff Core_gene_1 Core_gene_2 gene type complete_genome_single_chrom A C B intermediate_frequency complete_genome_single_chrom_2 A C E intermediate_frequency genome_single_chrom_larger_rearrange A C D low_frequency -genome_single_chrom_larger C Sequence_break E intermediate_frequency -genome_single_chrom_larger C Sequence_break F intermediate_frequency -genome_single_chrom_larger C Sequence_break D low_frequency -genome_single_chrom_larger C Sequence_break G low_frequency +genome_single_chrom_larger_2 C Sequence_break E intermediate_frequency +genome_single_chrom_larger_2 C Sequence_break F intermediate_frequency +genome_single_chrom_larger_2 C Sequence_break D low_frequency +genome_single_chrom_larger_2 C Sequence_break G low_frequency genome_single_chrom_larger_rearrange C Sequence_break B intermediate_frequency genome_single_chrom_larger_rearrange C Sequence_break E intermediate_frequency genome_single_chrom_larger_rearrange C Sequence_break F intermediate_frequency diff --git a/functional_tests/test_data/Fragmented_core_run_expected/low_frequency_gene_placement.tsv.expected b/functional_tests/test_data/Fragmented_core_run_expected/low_frequency_gene_placement.tsv.expected index 78dcc4c..5558b6d 100644 --- 
a/functional_tests/test_data/Fragmented_core_run_expected/low_frequency_gene_placement.tsv.expected +++ b/functional_tests/test_data/Fragmented_core_run_expected/low_frequency_gene_placement.tsv.expected @@ -1,13 +1,13 @@ Gff Core_gene_1 Core_gene_2 Core_region_size Core_region_accessory_count complete_genome_single_chrom A C 109 1 complete_genome_single_chrom_2 A C 109 1 -genome_single_chrom_larger A C 9 0 +genome_single_chrom_larger_2 A C 9 0 genome_single_chrom_larger_rearrange A C 109 1 complete_genome_single_chrom A Sequence_break 0 0 complete_genome_single_chrom_2 A Sequence_break 0 0 -genome_single_chrom_larger A Sequence_break 0 0 +genome_single_chrom_larger_2 A Sequence_break 0 0 genome_single_chrom_larger_rearrange A Sequence_break 0 0 complete_genome_single_chrom C Sequence_break 10 0 complete_genome_single_chrom_2 C Sequence_break 10 0 -genome_single_chrom_larger C Sequence_break 310 4 +genome_single_chrom_larger_2 C Sequence_break 310 4 genome_single_chrom_larger_rearrange C Sequence_break 310 3 diff --git a/functional_tests/test_data/genome_single_chrom_larger_2.gff b/functional_tests/test_data/genome_single_chrom_larger_2.gff new file mode 100644 index 0000000..099b216 --- /dev/null +++ b/functional_tests/test_data/genome_single_chrom_larger_2.gff @@ -0,0 +1,11 @@ +##gff-version3 +contig_1 . CDS 1 90 . . . ID=locus_tag_0001;Other_info;locus_tag=locus_tag_0001 +contig_1 . CDS 100 190 . . . ID=locus_tag_0002;Other_info;locus_tag=locus_tag_0002 +contig_1 . CDS 200 290 . . . ID=locus_tag_0003;Other_info;locus_tag=locus_tag_0003 +contig_1 . CDS 300 390 . . . ID=locus_tag_0004;Other_info;locus_tag=locus_tag_0004 +contig_1 . CDS 400 490 . . . ID=locus_tag_0005;Other_info;locus_tag=locus_tag_0005 +contig_1 . CDS 500 590 . . . ID=locus_tag_0006;Other_info;locus_tag=locus_tag_0006 +contig_1 . CDS 591 592 . . . 
ID=locus_tag_0007;Other_info;locus_tag=locus_tag_0007 +##FASTA +>contigo newline at end of file From 0f70eefbeb891c9278a42f71b2d6619d556081ab Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Mon, 10 Jan 2022 09:28:43 +1100 Subject: [PATCH 063/135] Add new genome to test --- functional_tests/Corekaburra-test.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/functional_tests/Corekaburra-test.sh b/functional_tests/Corekaburra-test.sh index c180b73..21e7ca0 100755 --- a/functional_tests/Corekaburra-test.sh +++ b/functional_tests/Corekaburra-test.sh @@ -309,7 +309,7 @@ rm -r test_out_folder # TODO - test for core genes being fragmented. call_new_test "Test for core genes being fragmented" -Corekaburra -ig complete_genome_single_chrom.gff genome_single_chrom_larger.gff genome_single_chrom_larger_rearrange.gff complete_genome_single_chrom_2.gff -ip Fragmented_core_gene_run/ -o test_out_folder/ > /dev/null 2>&1 +Corekaburra -ig complete_genome_single_chrom.gff genome_single_chrom_larger_2.gff genome_single_chrom_larger_rearrange.gff complete_genome_single_chrom_2.gff -ip Fragmented_core_gene_run/ -o test_out_folder/ > /dev/null 2>&1 test_output_file test_out_folder/core_core_accessory_gene_content.tsv Fragmented_core_run_expected/core_core_accessory_gene_content.tsv.expected test_output_file test_out_folder/low_frequency_gene_placement.tsv Fragmented_core_run_expected/low_frequency_gene_placement.tsv.expected test_output_file test_out_folder/core_pair_summary.csv Fragmented_core_run_expected/core_pair_summary.csv.expected From 3d21188f010f6e72629872998dd0aee9ff993a0e Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Mon, 10 Jan 2022 11:01:36 +1100 Subject: [PATCH 064/135] Add in the writing of output for contigs that does not contain a core gene but have accessory genes. 
Add functional and unit test for the function --- Corekaburra/__main__.py | 9 +++-- Corekaburra/output_writer_functions.py | 35 +++++++++++++++++++ functional_tests/Corekaburra-test.sh | 17 ++++++++- ...e_core_accessory_gene_content.tsv.expected | 1 + .../core_pair_summary.csv.expected | 4 +++ ...contig_accessory_gene_content.tsv.expected | 3 ++ .../low_frequency_gene_placement.tsv.expected | 7 ++++ .../gene_presence_absence.csv | 10 ++++++ ...e_core_accessory_gene_content.tsv.expected | 1 + .../core_pair_summary.csv.expected | 4 +++ ...contig_accessory_gene_content.tsv.expected | 3 ++ .../low_frequency_gene_placement.tsv.expected | 7 ++++ unit_tests/Corekaburra_test.py | 16 +++++++++ .../no_core_contigs.txt | 4 +++ 14 files changed, 118 insertions(+), 3 deletions(-) create mode 100644 functional_tests/test_data/Coreless_contig_complete_expected/core_core_accessory_gene_content.tsv.expected create mode 100644 functional_tests/test_data/Coreless_contig_complete_expected/core_pair_summary.csv.expected create mode 100644 functional_tests/test_data/Coreless_contig_complete_expected/coreless_contig_accessory_gene_content.tsv.expected create mode 100644 functional_tests/test_data/Coreless_contig_complete_expected/low_frequency_gene_placement.tsv.expected create mode 100644 functional_tests/test_data/Coreless_contig_run/gene_presence_absence.csv create mode 100644 functional_tests/test_data/coreless_contig_draft_expected/core_core_accessory_gene_content.tsv.expected create mode 100644 functional_tests/test_data/coreless_contig_draft_expected/core_pair_summary.csv.expected create mode 100644 functional_tests/test_data/coreless_contig_draft_expected/coreless_contig_accessory_gene_content.tsv.expected create mode 100644 functional_tests/test_data/coreless_contig_draft_expected/low_frequency_gene_placement.tsv.expected create mode 100644 unit_tests/unit_test_data/TestWritingOutputFunction/no_core_contigs.txt diff --git a/Corekaburra/__main__.py b/Corekaburra/__main__.py index 
dafbd49..c6fd1ca 100644 --- a/Corekaburra/__main__.py +++ b/Corekaburra/__main__.py @@ -61,9 +61,9 @@ from summary_table import calculate_n_create_summaries try: - from Corekaburra.output_writer_functions import master_info_writer, summary_info_writer, segment_writer, no_acc_segment_writer + from Corekaburra.output_writer_functions import master_info_writer, summary_info_writer, segment_writer, no_acc_segment_writer, non_core_contig_writer except ModuleNotFoundError: - from output_writer_functions import master_info_writer, summary_info_writer, segment_writer, no_acc_segment_writer + from output_writer_functions import master_info_writer, summary_info_writer, segment_writer, no_acc_segment_writer, non_core_contig_writer import sys import pkg_resources @@ -295,6 +295,11 @@ def main(): segment_writer(double_edge_segements, args.output_path, args.output_prefix, args.quiet) no_acc_segment_writer(no_acc_segments, args.output_path, args.output_prefix, args.quiet) # TODO - Possibly output core gene graph. with segment annotations? + # - Print summary number of genes and names + # - Should we print a low-freq, placement? 
+ if len(non_core_contig_info)> 0: + non_core_contig_writer(non_core_contig_info, args.output_path, args.output_prefix) + print(f'{non_core_contig_info = }') # time_calculator(time_start, time.time(), "writing output files") diff --git a/Corekaburra/output_writer_functions.py b/Corekaburra/output_writer_functions.py index 65260d9..9820a3e 100644 --- a/Corekaburra/output_writer_functions.py +++ b/Corekaburra/output_writer_functions.py @@ -170,5 +170,40 @@ def no_acc_segment_writer(no_acc_segments, out_path, prefix, quiet): writer.writerow(info) +def non_core_contig_writer(non_core_contigs, out_path, prefix,): + """ + Function to write output for contigs with no core gene, but with accessory genes + :param non_core_contigs: Dict of info for each contig with no core genes, values are list of lists with intermediate and low-frequency genes + :param out_path: Path to the output folder + :param prefix: A possible prefix for the output files. + :return: Nothing + """ + # if not quiet: # TODO - log + # print("Printing master output") + + # Write gene content in long format + out_file_name = 'coreless_contig_accessory_gene_content.tsv' + if prefix is not None: + out_file_name = f'{prefix}_{out_file_name}' + + with open(os.path.join(out_path, out_file_name), 'w', newline='', encoding='utf-8') as out_file: + writer = csv.writer(out_file, delimiter="\t") + + # Create header + header = ['Gff', 'Contig', 'Accessory_count', 'Intermediate_cunt', 'low_frequency_count'] + writer.writerow(header) + + # Write remaining rows: + for key in sorted(non_core_contigs): + genome, contig = key.split('--') + num_intermidiate = len(non_core_contigs[key][0]) + num_low = len(non_core_contigs[key][1]) + num_accessory = num_intermidiate + num_low + + info = [genome, contig, num_accessory, num_intermidiate, num_low] + + writer.writerow(info) + + if __name__ == "__main__": pass diff --git a/functional_tests/Corekaburra-test.sh b/functional_tests/Corekaburra-test.sh index 21e7ca0..84f8682 100755 --- 
a/functional_tests/Corekaburra-test.sh +++ b/functional_tests/Corekaburra-test.sh @@ -329,7 +329,22 @@ test_output_file test_out_folder/core_pair_summary.csv Fragmented_accessory_gene rm -r test_out_folder -# TODO - set up a test with a core-less contig. +# TODO - set up a test with a core-less contig draft. +# TODO - implement coreless contig output! +Corekaburra -ig complete_genome_double_chrom_2.gff complete_genome_double_chrom.gff -ip Coreless_contig_run/ -o test_out_folder/ +test_output_file test_out_folder/core_core_accessory_gene_content.tsv Fragmented_accessory_gene_run_expected/core_core_accessory_gene_content.tsv.expected +test_output_file test_out_folder/low_frequency_gene_placement.tsv Fragmented_accessory_gene_run_expected/low_frequency_gene_placement.tsv.expected +test_output_file test_out_folder/core_pair_summary.csv Fragmented_accessory_gene_run_expected/core_pair_summary.csv.expected +rm -r test_out_folder + +# TODO - set up a test with a core-less contig complete. +Corekaburra -ig complete_genome_double_chrom_2.gff complete_genome_double_chrom.gff -ip Coreless_contig_run/ -o test_out_folder/ -cg Complete_double_chromosomes.txt +test_output_file test_out_folder/core_core_accessory_gene_content.tsv Coreless_contig_complete_expected/core_core_accessory_gene_content.tsv.expected +test_output_file test_out_folder/low_frequency_gene_placement.tsv Coreless_contig_complete_expected/low_frequency_gene_placement.tsv.expected +test_output_file test_out_folder/core_pair_summary.csv Coreless_contig_complete_expected/core_pair_summary.csv.expected +test_output_file test_out_folder/coreless_contig_accessory_gene_content.tsv Coreless_contig_complete_expected/coreless_contig_accessory_gene_content.tsv.expected +rm -r test_out_folder + # 3. 
End of testing - check if any errors occurrred if [ "$num_errors" -gt 0 ]; then diff --git a/functional_tests/test_data/Coreless_contig_complete_expected/core_core_accessory_gene_content.tsv.expected b/functional_tests/test_data/Coreless_contig_complete_expected/core_core_accessory_gene_content.tsv.expected new file mode 100644 index 0000000..fee984c --- /dev/null +++ b/functional_tests/test_data/Coreless_contig_complete_expected/core_core_accessory_gene_content.tsv.expected @@ -0,0 +1 @@ +Gff Core_gene_1 Core_gene_2 gene type diff --git a/functional_tests/test_data/Coreless_contig_complete_expected/core_pair_summary.csv.expected b/functional_tests/test_data/Coreless_contig_complete_expected/core_pair_summary.csv.expected new file mode 100644 index 0000000..c62e46d --- /dev/null +++ b/functional_tests/test_data/Coreless_contig_complete_expected/core_pair_summary.csv.expected @@ -0,0 +1,4 @@ +Core_pair,n,occurrence_core_1,occurrence_core_2,co_occurrence,min_dist,max_dist,mean_dist,median_dist,min_acc,max_acc,mean_acc,median_acc +A-B,2,2,2,2,9,9,9.0,9.0,0,0,0.0,0.0 +A-C,2,2,2,2,10,10,10.0,10.0,0,0,0.0,0.0 +B-C,2,2,2,2,9,9,9.0,9.0,0,0,0.0,0.0 diff --git a/functional_tests/test_data/Coreless_contig_complete_expected/coreless_contig_accessory_gene_content.tsv.expected b/functional_tests/test_data/Coreless_contig_complete_expected/coreless_contig_accessory_gene_content.tsv.expected new file mode 100644 index 0000000..8fa9065 --- /dev/null +++ b/functional_tests/test_data/Coreless_contig_complete_expected/coreless_contig_accessory_gene_content.tsv.expected @@ -0,0 +1,3 @@ +Gff Contig Accessory_count Intermediate_cunt low_frequency_count +complete_genome_double_chrom contig_2 3 0 3 +complete_genome_double_chrom_2 contig_2 3 0 3 diff --git a/functional_tests/test_data/Coreless_contig_complete_expected/low_frequency_gene_placement.tsv.expected b/functional_tests/test_data/Coreless_contig_complete_expected/low_frequency_gene_placement.tsv.expected new file mode 100644 index 
0000000..0e7126e --- /dev/null +++ b/functional_tests/test_data/Coreless_contig_complete_expected/low_frequency_gene_placement.tsv.expected @@ -0,0 +1,7 @@ +Gff Core_gene_1 Core_gene_2 Core_region_size Core_region_accessory_count +complete_genome_double_chrom A B 9 0 +complete_genome_double_chrom_2 A B 9 0 +complete_genome_double_chrom A C 10 0 +complete_genome_double_chrom_2 A C 10 0 +complete_genome_double_chrom B C 9 0 +complete_genome_double_chrom_2 B C 9 0 diff --git a/functional_tests/test_data/Coreless_contig_run/gene_presence_absence.csv b/functional_tests/test_data/Coreless_contig_run/gene_presence_absence.csv new file mode 100644 index 0000000..1a57ba5 --- /dev/null +++ b/functional_tests/test_data/Coreless_contig_run/gene_presence_absence.csv @@ -0,0 +1,10 @@ +"","","","","","","","","","","","","","","complete_genome_double_chrom","complete_genome_double_chrom_2" +"A","","","2","2","1","","","","","","","","","dub_chrom_A","dub_chrom_2_A" +"B","","","2","2","1","","","","","","","","","dub_chrom_B","dub_chrom_2_B" +"C","","","2","2","1","","","","","","","","","dub_chrom_C","dub_chrom_2_C" +"D","","","1","1","1","","","","","","","","","","dub_chrom_2_D" +"E","","","1","1","1","","","","","","","","","","dub_chrom_2_E" +"F","","","1","1","1","","","","","","","","","","dub_chrom_2_F" +"G","","","1","1","1","","","","","","","","","dub_chrom_D","" +"H","","","1","1","1","","","","","","","","","dub_chrom_E","" +"I","","","1","1","1","","","","","","","","","dub_chrom_F","" \ No newline at end of file diff --git a/functional_tests/test_data/coreless_contig_draft_expected/core_core_accessory_gene_content.tsv.expected b/functional_tests/test_data/coreless_contig_draft_expected/core_core_accessory_gene_content.tsv.expected new file mode 100644 index 0000000..fee984c --- /dev/null +++ b/functional_tests/test_data/coreless_contig_draft_expected/core_core_accessory_gene_content.tsv.expected @@ -0,0 +1 @@ +Gff Core_gene_1 Core_gene_2 gene type diff --git 
a/functional_tests/test_data/coreless_contig_draft_expected/core_pair_summary.csv.expected b/functional_tests/test_data/coreless_contig_draft_expected/core_pair_summary.csv.expected new file mode 100644 index 0000000..c62e46d --- /dev/null +++ b/functional_tests/test_data/coreless_contig_draft_expected/core_pair_summary.csv.expected @@ -0,0 +1,4 @@ +Core_pair,n,occurrence_core_1,occurrence_core_2,co_occurrence,min_dist,max_dist,mean_dist,median_dist,min_acc,max_acc,mean_acc,median_acc +A-B,2,2,2,2,9,9,9.0,9.0,0,0,0.0,0.0 +A-C,2,2,2,2,10,10,10.0,10.0,0,0,0.0,0.0 +B-C,2,2,2,2,9,9,9.0,9.0,0,0,0.0,0.0 diff --git a/functional_tests/test_data/coreless_contig_draft_expected/coreless_contig_accessory_gene_content.tsv.expected b/functional_tests/test_data/coreless_contig_draft_expected/coreless_contig_accessory_gene_content.tsv.expected new file mode 100644 index 0000000..8fa9065 --- /dev/null +++ b/functional_tests/test_data/coreless_contig_draft_expected/coreless_contig_accessory_gene_content.tsv.expected @@ -0,0 +1,3 @@ +Gff Contig Accessory_count Intermediate_cunt low_frequency_count +complete_genome_double_chrom contig_2 3 0 3 +complete_genome_double_chrom_2 contig_2 3 0 3 diff --git a/functional_tests/test_data/coreless_contig_draft_expected/low_frequency_gene_placement.tsv.expected b/functional_tests/test_data/coreless_contig_draft_expected/low_frequency_gene_placement.tsv.expected new file mode 100644 index 0000000..0e7126e --- /dev/null +++ b/functional_tests/test_data/coreless_contig_draft_expected/low_frequency_gene_placement.tsv.expected @@ -0,0 +1,7 @@ +Gff Core_gene_1 Core_gene_2 Core_region_size Core_region_accessory_count +complete_genome_double_chrom A B 9 0 +complete_genome_double_chrom_2 A B 9 0 +complete_genome_double_chrom A C 10 0 +complete_genome_double_chrom_2 A C 10 0 +complete_genome_double_chrom B C 9 0 +complete_genome_double_chrom_2 B C 9 0 diff --git a/unit_tests/Corekaburra_test.py b/unit_tests/Corekaburra_test.py index d604907..6de2bcd 100644 
--- a/unit_tests/Corekaburra_test.py +++ b/unit_tests/Corekaburra_test.py @@ -3600,6 +3600,22 @@ def test_no_acc_segment_writer(self): with open('TestWritingOutputFunction/test_no_accessory_core_segments.csv', 'r') as result: self.assertEqual(expected.readlines(), result.readlines()) + def test_coreless_contig_writer(self): + coreless_contigs = {'gff_1--contig_x': [['pan_cluster_2'], ['pan_cluster_3']], + 'gff_1--contig_y': [['pan_cluster_2'], []], + 'gff_1--contig_z': [[], ['pan_cluster_6']]} + + out_path = 'TestWritingOutputFunction' + prefix = 'test' + + expected_summary_table = 'TestWritingOutputFunction/no_core_contigs.txt' + + output_writer_functions.non_core_contig_writer(coreless_contigs, out_path, prefix) + + with open(expected_summary_table, 'r') as expected: + with open('TestWritingOutputFunction/test_coreless_contig_accessory_gene_content.tsv', 'r') as result: + self.assertEqual(expected.readlines(), result.readlines()) + if __name__ == '__main__': unittest.main() diff --git a/unit_tests/unit_test_data/TestWritingOutputFunction/no_core_contigs.txt b/unit_tests/unit_test_data/TestWritingOutputFunction/no_core_contigs.txt new file mode 100644 index 0000000..9711848 --- /dev/null +++ b/unit_tests/unit_test_data/TestWritingOutputFunction/no_core_contigs.txt @@ -0,0 +1,4 @@ +Gff Contig Accessory_count Intermediate_cunt low_frequency_count +gff_1 contig_x 2 1 1 +gff_1 contig_y 1 1 0 +gff_1 contig_z 1 0 1 From 61757552b80d2612d968d235a73c29692d5dcc2f Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Mon, 10 Jan 2022 11:05:22 +1100 Subject: [PATCH 065/135] Add change to actually fetch correct files in testing functional output. 
Remove print statement from main --- Corekaburra/__main__.py | 1 - functional_tests/Corekaburra-test.sh | 7 ++++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Corekaburra/__main__.py b/Corekaburra/__main__.py index c6fd1ca..20357b9 100644 --- a/Corekaburra/__main__.py +++ b/Corekaburra/__main__.py @@ -299,7 +299,6 @@ def main(): # - Should we print a low-freq, placement? if len(non_core_contig_info)> 0: non_core_contig_writer(non_core_contig_info, args.output_path, args.output_prefix) - print(f'{non_core_contig_info = }') # time_calculator(time_start, time.time(), "writing output files") diff --git a/functional_tests/Corekaburra-test.sh b/functional_tests/Corekaburra-test.sh index 84f8682..328aecf 100755 --- a/functional_tests/Corekaburra-test.sh +++ b/functional_tests/Corekaburra-test.sh @@ -332,9 +332,10 @@ rm -r test_out_folder # TODO - set up a test with a core-less contig draft. # TODO - implement coreless contig output! Corekaburra -ig complete_genome_double_chrom_2.gff complete_genome_double_chrom.gff -ip Coreless_contig_run/ -o test_out_folder/ -test_output_file test_out_folder/core_core_accessory_gene_content.tsv Fragmented_accessory_gene_run_expected/core_core_accessory_gene_content.tsv.expected -test_output_file test_out_folder/low_frequency_gene_placement.tsv Fragmented_accessory_gene_run_expected/low_frequency_gene_placement.tsv.expected -test_output_file test_out_folder/core_pair_summary.csv Fragmented_accessory_gene_run_expected/core_pair_summary.csv.expected +test_output_file test_out_folder/core_core_accessory_gene_content.tsv coreless_contig_draft_expected/core_core_accessory_gene_content.tsv.expected +test_output_file test_out_folder/low_frequency_gene_placement.tsv coreless_contig_draft_expected/low_frequency_gene_placement.tsv.expected +test_output_file test_out_folder/core_pair_summary.csv coreless_contig_draft_expected/core_pair_summary.csv.expected +test_output_file 
test_out_folder/coreless_contig_accessory_gene_content.tsv coreless_contig_draft_expected/coreless_contig_accessory_gene_content.tsv.expected rm -r test_out_folder # TODO - set up a test with a core-less contig complete. From 9b1d2d777310350d0f049fff4df14748cd853d70 Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Mon, 10 Jan 2022 11:14:29 +1100 Subject: [PATCH 066/135] Change output files for draft genomes as they were for a complete genome --- functional_tests/Corekaburra-test.sh | 3 ++- .../core_pair_summary.csv.expected | 3 ++- .../low_frequency_gene_placement.tsv.expected | 6 ++++-- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/functional_tests/Corekaburra-test.sh b/functional_tests/Corekaburra-test.sh index 328aecf..29e5f56 100755 --- a/functional_tests/Corekaburra-test.sh +++ b/functional_tests/Corekaburra-test.sh @@ -330,7 +330,7 @@ rm -r test_out_folder # TODO - set up a test with a core-less contig draft. -# TODO - implement coreless contig output! +call_new_test "Test with a core-less contig draft" Corekaburra -ig complete_genome_double_chrom_2.gff complete_genome_double_chrom.gff -ip Coreless_contig_run/ -o test_out_folder/ test_output_file test_out_folder/core_core_accessory_gene_content.tsv coreless_contig_draft_expected/core_core_accessory_gene_content.tsv.expected test_output_file test_out_folder/low_frequency_gene_placement.tsv coreless_contig_draft_expected/low_frequency_gene_placement.tsv.expected @@ -339,6 +339,7 @@ test_output_file test_out_folder/coreless_contig_accessory_gene_content.tsv core rm -r test_out_folder # TODO - set up a test with a core-less contig complete. 
+call_new_test "Test with a core-less contig complete" Corekaburra -ig complete_genome_double_chrom_2.gff complete_genome_double_chrom.gff -ip Coreless_contig_run/ -o test_out_folder/ -cg Complete_double_chromosomes.txt test_output_file test_out_folder/core_core_accessory_gene_content.tsv Coreless_contig_complete_expected/core_core_accessory_gene_content.tsv.expected test_output_file test_out_folder/low_frequency_gene_placement.tsv Coreless_contig_complete_expected/low_frequency_gene_placement.tsv.expected diff --git a/functional_tests/test_data/coreless_contig_draft_expected/core_pair_summary.csv.expected b/functional_tests/test_data/coreless_contig_draft_expected/core_pair_summary.csv.expected index c62e46d..e48e928 100644 --- a/functional_tests/test_data/coreless_contig_draft_expected/core_pair_summary.csv.expected +++ b/functional_tests/test_data/coreless_contig_draft_expected/core_pair_summary.csv.expected @@ -1,4 +1,5 @@ Core_pair,n,occurrence_core_1,occurrence_core_2,co_occurrence,min_dist,max_dist,mean_dist,median_dist,min_acc,max_acc,mean_acc,median_acc A-B,2,2,2,2,9,9,9.0,9.0,0,0,0.0,0.0 -A-C,2,2,2,2,10,10,10.0,10.0,0,0,0.0,0.0 +A-Sequence_break,2,2,0,0,0,0,0.0,0.0,0,0,0.0,0.0 B-C,2,2,2,2,9,9,9.0,9.0,0,0,0.0,0.0 +C-Sequence_break,2,2,0,0,10,10,10.0,10.0,0,0,0.0,0.0 diff --git a/functional_tests/test_data/coreless_contig_draft_expected/low_frequency_gene_placement.tsv.expected b/functional_tests/test_data/coreless_contig_draft_expected/low_frequency_gene_placement.tsv.expected index 0e7126e..0aa11ff 100644 --- a/functional_tests/test_data/coreless_contig_draft_expected/low_frequency_gene_placement.tsv.expected +++ b/functional_tests/test_data/coreless_contig_draft_expected/low_frequency_gene_placement.tsv.expected @@ -1,7 +1,9 @@ Gff Core_gene_1 Core_gene_2 Core_region_size Core_region_accessory_count complete_genome_double_chrom A B 9 0 complete_genome_double_chrom_2 A B 9 0 -complete_genome_double_chrom A C 10 0 -complete_genome_double_chrom_2 A C 10 0 
+complete_genome_double_chrom A Sequence_break 0 0 +complete_genome_double_chrom_2 A Sequence_break 0 0 complete_genome_double_chrom B C 9 0 complete_genome_double_chrom_2 B C 9 0 +complete_genome_double_chrom C Sequence_break 10 0 +complete_genome_double_chrom_2 C Sequence_break 10 0 From 06de23acbe2dd2b04d2ea706dfaaccd5ceee8777 Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Mon, 10 Jan 2022 13:49:21 +1100 Subject: [PATCH 067/135] Add logging, also to unit tests. Remove a lot of done TODOs --- Corekaburra/__main__.py | 154 +++++++++++---------- Corekaburra/check_inputs.py | 38 +++-- Corekaburra/consesus_core_genome.py | 27 +++- Corekaburra/correct_gffs.py | 13 +- Corekaburra/exit_with_error.py | 7 +- Corekaburra/gff_parser.py | 19 +-- Corekaburra/output_writer_functions.py | 24 +--- Corekaburra/parse_gene_presence_absence.py | 30 ++-- Corekaburra/read_complete_genome_file.py | 7 +- functional_tests/Corekaburra-test.sh | 6 - unit_tests/Corekaburra_test.py | 117 +++++++++++----- 11 files changed, 258 insertions(+), 184 deletions(-) diff --git a/Corekaburra/__main__.py b/Corekaburra/__main__.py index 20357b9..6ffb42b 100644 --- a/Corekaburra/__main__.py +++ b/Corekaburra/__main__.py @@ -71,7 +71,7 @@ EXIT_INPUT_FILE_ERROR = 1 EXIT_COMMAND_LINE_ERROR = 2 EXIT_GFF_REANNOTATION_ERROR = 3 -DEFAULT_MIN_LEN = 0 +EXIT_SEGMENT_IDENTIFICATION_ERROR = 4 DEFAULT_VERBOSE = False PROGRAM_NAME = "Corekaburra" @@ -103,8 +103,8 @@ def init_logging(debug_log, quiet, out_path): file_logger = logging.getLogger(__name__) file_logger.setLevel(level) - formatter = logging.Formatter('[%(asctime)s] %(levelname)s - %(module)s - %(message)s', - datefmt="%Y-%m-%dT%H:%M:%S%z") + formatter = logging.Formatter('[%(asctime)s] - %(levelname)s - %(module)s: %(message)s', + datefmt="%Y-%m-%d %H:%M:%S%z") file_handler = logging.FileHandler(os.path.join(out_path, 'Corekaburra.log')) file_handler.setLevel(level) @@ -128,7 +128,7 @@ def stream_logging(file_logger): 
file_logger.addHandler(stream_handler) - file_logger.info('Processing started') + file_logger.info('\n----------------------Processing started----------------------\n') return file_logger @@ -140,88 +140,78 @@ def main(): """ total_time_start = time.time() + inital_check_time_start = time.time() + # get arguments from the commandline args = get_commandline_arguments(sys.argv[1:]) + # Construct output folder + try: + os.mkdir(args.output_path) + except FileExistsError: + pass + + # Run initialisation of logger: + logger = init_logging(args.log, args.quiet, args.output_path) # TODO - if not dependency check is done then it should be possible to add the stream logger following the logging of the command line in the initial logging function. + logger = stream_logging(logger) + # Check that low-frequency cutoff and core cutoff are as expected - check_cutoffs(args.low_cutoff, args.core_cutoff) + check_cutoffs(args.low_cutoff, args.core_cutoff, logger) # TODO - Make Corekaburra take gzipped inputs # TODO - Add so that a single gff file can only be given as input once and not multiple times? 
# Check the presence of provided complete genomes among input GFFs if args.comp_genomes is not None: - comp_genomes = parse_complete_genome_file(args.comp_genomes, args.input_gffs) + comp_genomes = parse_complete_genome_file(args.comp_genomes, args.input_gffs, logger) else: comp_genomes = None - # Check source program from pan-genome and presence of nessecary files - if not args.quiet: - print("\n----Checking presence of input files in pan genome folder----\n") - # Check if Panaroo or Roary input folder is given - source_program, input_pres_abs_file_path = define_pangenome_program(args.input_pan) + source_program, input_pres_abs_file_path = define_pangenome_program(args.input_pan, logger) # Check if gene_data file is present if Panaroo input is given an gffs should be annotated if args.annotate and source_program == 'Panaroo': - gene_data_path = check_gene_data(args.input_pan) + gene_data_path = check_gene_data(args.input_pan, logger) else: gene_data_path = None - if not args.quiet: - print(f"Pan genome determined to come from {source_program}") - print("All files found, let's move on!\n") - print("--------------------------------------------------------------\n") - - # TODO - Make the program work with less than all files in the pangenome. Just make sure that all gff files supplied can be found in the pan genome. This will make is possible to look at hotspots and segments in different lineages - check_gff_in_pan(args.input_gffs, input_pres_abs_file_path) - - # Construct output folder - try: - os.mkdir(args.output_path) - if not args.quiet: - print("Output folder constructed") - except FileExistsError: - if not args.quiet: - print("Output folder exists") + check_gff_in_pan(args.input_gffs, input_pres_abs_file_path, logger) # Construct temporary folder: # TODO - check that the temporary folder does not exist and that the user does not have a folder with same name already. (Maybe use a time stamp for the start to make it unique.) 
tmp_folder_path = os.path.join(args.output_path, 'Corekaburra_tmp') - os.mkdir(tmp_folder_path) + try: + os.mkdir(tmp_folder_path) + except FileExistsError: + for file in os.listdir(tmp_folder_path): + os.remove(file) + + logger.info('Initial checks successful\n') + inital_check_time_end = time.time() ## Read in gene presence absence file - time_start = time.time() + time_start_read_files = time.time() # TODO - ATM the column with presence of gene in genomes is used to define what is core and not. Is it better to use the number of input gffs instead? # - There are upsides to the current. You can use the same genome to find segments for two different populations with in the dataset using the same reference of core-genes # - Making it depend on the input is not viable for comparing runs, even within the same pan-genome, when using different sets of gff files. # TODO - Some day it would be awesome to be able to provide a clustering/population structure which could divide genes into the 13 definitions outlined by Horesh et al. 
[DOI: 10.1099/mgen.0.000670] - core_dict, low_freq_dict, acc_gene_dict = read_gene_presence_absence(input_pres_abs_file_path, - args.core_cutoff, args.low_cutoff, source_program, - args.input_gffs, - tmp_folder_path) + core_dict, low_freq_dict, acc_gene_dict = read_gene_presence_absence(input_pres_abs_file_path, args.core_cutoff, + args.low_cutoff, source_program, + args.input_gffs, tmp_folder_path, logger) + # Prepair folder for reannotated genes and examine if any are already present if source_program == "Panaroo" and args.annotate: - gene_data_dict, corrected_dict, args.input_gffs = prepair_for_reannotation(gene_data_path, args.output_path, args.input_gffs) + gene_data_dict, corrected_dir, args.input_gffs = prepair_for_reannotation(gene_data_path, args.output_path, + args.input_gffs, logger) else: gene_data_dict = None - corrected_dict = None - - # TODO - Add this into the multiprocessing loop to not doubble files - # TODO - Add a user command to keep and discard the corrected files (But still using them - Make mutually exclusive with -a option) - # Add in the refound genes into the gff files and print the corrected GFF files. - # if source_program == "Panaroo" and args.annotate: - # time_start = time.time() - # print(f"\n----------Adding in refound annotations for gff files---------") - # - # corrected_folder = correct_gffs(args.input_gffs, gene_data_path, args.output_path, attribute_dict, - # temp_folder_path) - # - # args.input_gffs = [join(corrected_folder, file) for file in listdir(corrected_folder) if '.gff' in file] - # if not args.quiet: - # time_calculator(time_start, time.time(), "add refound annotations to gff files") - - # Loop over all gffs and extract info from each of them. + corrected_dir = None + + time_end_read_files = time.time() + time_start_passing_gffs = time.time() + + # Loop over all Gffs and extract info from each of them. 
time_start = time.time() # Initialise dictionaries to contain results from all gff files core_neighbour_pairs = {} @@ -238,18 +228,17 @@ def main(): progress_update = 1 with concurrent.futures.ProcessPoolExecutor(max_workers=args.cpu) as executor: - print(f"\n------Start core region identification of given gff files-----") - print(f'{len(args.input_gffs)} GFF files to process') + logger.info(f"------Start core region identification of given gff files-----\n") + logger.info(f'{len(args.input_gffs)} GFF files to process') results = [executor.submit(segment_genome_content, gff, core_dict, low_freq_dict, acc_gene_dict, comp_genomes, - source_program, args.annotate, gene_data_dict, corrected_dict, tmp_folder_path, args.discard_gffs) + source_program, args.annotate, gene_data_dict, corrected_dir, tmp_folder_path, args.discard_gffs, logger) for gff in args.input_gffs] for output in concurrent.futures.as_completed(results): progress_counter += 1 if progress_counter % progress_update == 0 or progress_counter == 1: - print( - f"GFF file #{progress_counter} has been processed") + logger.info(f"GFF file #{progress_counter} has been processed") # Split the outputs core_pairs, distance, acc_count, \ @@ -263,10 +252,10 @@ def main(): core_neighbour_low_freq = merge_dicts_lists(core_neighbour_low_freq, low_freq) master_info_total.update(master_info_return) non_core_contig_info.update(core_less_contigs_return) - # - # time_calculator(time_start, time.time(), "searching gff files for core genes") - print(f"\n--------------Identifying segments in pan genome--------------") + time_end_passing_gffs = time.time() + time_start_segments_search = time.time() + time_start = time.time() # Count number of unique accessory genes inserted into a core-core region across the genomes acc_region_count = {key: len(set(core_neighbour_low_freq[key])) for key in core_neighbour_low_freq} @@ -278,32 +267,54 @@ def main(): combined_acc_gene_count = {key: low_frew_region_count[key] + acc_region_count[key] 
for key in low_frew_region_count} double_edge_segements, no_acc_segments = determine_genome_segments(core_neighbour_pairs, combined_acc_gene_count, - len(args.input_gffs), core_dict) + len(args.input_gffs), core_dict, logger) - # time_calculator(time_start, time.time(), "identifying segments in pan genome") + time_end_segments_search = time.time() # Produce dict containing summarised information from master info. + logger.debug("Commence on calculating summary output") master_summary_info = calculate_n_create_summaries(master_info_total, core_dict) ### WRITE OUTPUTS ### - print(f"\n-----------------------Printing outputs-----------------------") + logger.debug("-----------------------Printing outputs-----------------------") # Write master information to output file time_start = time.time() - master_info_writer(master_info_total, args.output_path, args.output_prefix, args.quiet) - summary_info_writer(master_summary_info, args.output_path, args.output_prefix, args.quiet) + logger.debug("Master outputs") + master_info_writer(master_info_total, args.output_path, args.output_prefix) + + logger.debug("Summary output") + summary_info_writer(master_summary_info, args.output_path, args.output_prefix) + if double_edge_segements is not None: - segment_writer(double_edge_segements, args.output_path, args.output_prefix, args.quiet) - no_acc_segment_writer(no_acc_segments, args.output_path, args.output_prefix, args.quiet) - # TODO - Possibly output core gene graph. with segment annotations? - # - Print summary number of genes and names - # - Should we print a low-freq, placement? + logger.debug("Segment output") + segment_writer(double_edge_segements, args.output_path, args.output_prefix) + + logger.debug("No Accessory segment output") + no_acc_segment_writer(no_acc_segments, args.output_path, args.output_prefix) + # TODO - Possibly output core gene graph. 
with segment annotations in colour - possibly info on edges using weight for conenctions and other atributes for acc content.? + + # TODO - Print summary number of genes and names for non-core contigs + # TODO - Should we print a low-freq, placement? if len(non_core_contig_info)> 0: + logger.debug("Non-core contig output") non_core_contig_writer(non_core_contig_info, args.output_path, args.output_prefix) # time_calculator(time_start, time.time(), "writing output files") # Finish up running - # time_calculator(total_time_start, time.time(), "running the entire program") + total_time = round(time.time() - total_time_start, 1) + initial_time = round(inital_check_time_end - inital_check_time_start, 1) + read_fies_time = round(time_end_read_files - time_start_read_files, 1) + passing_gffs_time = round(time_end_passing_gffs - time_start_passing_gffs, 1) + segment_search_time = round(time_end_segments_search - time_start_segments_search) + + logger.debug("-----------------------Time used in run-----------------------") + logger.debug(f"Total time used: {total_time}s") + logger.debug(f"Initial check time: {initial_time}s") + logger.debug(f"Reading pan-genome files time: {read_fies_time}s") + logger.debug(f"Passing over Gff files time: {passing_gffs_time}s") + logger.debug(f"Searching for segments time: {segment_search_time}s") + # Remove temporary database holding gff databases if os.path.isdir(tmp_folder_path): @@ -311,6 +322,5 @@ def main(): if args.discard_gffs: os.rmdir(os.path.join(args.output_path, 'Corrected_gff_files')) - if __name__ == '__main__': main() diff --git a/Corekaburra/check_inputs.py b/Corekaburra/check_inputs.py index 9a489ae..5035cfb 100644 --- a/Corekaburra/check_inputs.py +++ b/Corekaburra/check_inputs.py @@ -9,64 +9,82 @@ EXIT_COMMAND_LINE_ERROR = 2 -def check_cutoffs(low_cutoff, core_cutoff): +def check_cutoffs(low_cutoff, core_cutoff, logger): """ Function to check the given cutoffs are legal, otherwise provide more info. 
:param low_cutoff: Cutoff for low-frequency genes :param core_cutoff: Cutoff for core genes + :param logger: Program logger :return: Nothing """ if 0 <= low_cutoff < core_cutoff <= 1: + logger.debug(f'User provided cutoffs: {low_cutoff = } and {core_cutoff = } were accepted') return else: exit_with_error('Something is wrong with cutoffs for core and low-frequency genes!\n' 'Make sure the cutoff for core genes is larger than for low-frequency, and is >0 or =1.\n' 'Also make sure that the low-frequency gene cutoff is either equal to 0 or <1', - EXIT_COMMAND_LINE_ERROR) + EXIT_COMMAND_LINE_ERROR, logger) -def define_pangenome_program(folder): +def define_pangenome_program(folder, logger): """ Function to examine if input pan genome folder stems from Roary or Panaroo. :param folder: Input folder provided as pan-genome folder + :param logger: Program logger :return: The name of the program from which the pangenome is suspected to come from """ + logger.debug("----Checking presence of input files in pan genome folder----") + try: if os.path.isfile(os.path.join(folder, 'gene_presence_absence.csv')): # See if input is from Roary with open(os.path.join(folder, 'gene_presence_absence.csv'), 'r') as gene_pres_abs: if '"' in gene_pres_abs.readline(): gene_pres_abs_file_path = os.path.join(folder, 'gene_presence_absence.csv') + logger.debug('Pan-genome_program detected to be: Roary') return "Roary", gene_pres_abs_file_path # See if input is from Panaroo gene_pres_abs_file_path = os.path.join(folder, 'gene_presence_absence_roary.csv') if os.path.isfile(gene_pres_abs_file_path): + logger.debug('Pan-genome_program detected to be: Panaroo') return "Panaroo", gene_pres_abs_file_path else: - exit_with_error('No gene presence/absence file was found in given pan-genome folder', EXIT_INPUT_FILE_ERROR) + exit_with_error('No gene presence/absence file was found in given pan-genome folder', EXIT_INPUT_FILE_ERROR, logger) except FileNotFoundError: exit_with_error('No gene presence/absence 
file was found in given pan-genome folder', EXIT_INPUT_FILE_ERROR) -def check_gene_data(folder): +def check_gene_data(folder, logger): """ Check if the gene_data.csv file is present in the folder from a Panaroo pan-genome run :param folder: Input folder provided as pan-genome folder + :param logger: Program logger :return: Path to the identified gene_data.csv file """ + logger.debug('Identify gene_data.csv from Panaroo') + if os.path.isfile(os.path.join(folder, 'gene_data.csv')): + logger.debug('Gene_data.csv from Panaroo found!') return os.path.join(folder, 'gene_data.csv') else: exit_with_error('gene_data.csv file could not be located in the given pan genome input folder.\n' 'Please give the -a flag to omit this step or locate the gene_data.csv file.', - EXIT_INPUT_FILE_ERROR) + EXIT_INPUT_FILE_ERROR, logger) -def check_gff_in_pan(file_list, gene_presence_absence_path): +def check_gff_in_pan(file_list, gene_presence_absence_path, logger): + """ + Function to check if all given Gff files are in the given pan-genome + :param file_list: List of Gff file paths + :param gene_presence_absence_path: File path to the identified gene_presence_absence_file + :param logger: Program logger + :return: Bool used for unit testing + """ with open(gene_presence_absence_path, 'r') as pan_file: # Read the first line of the gene_presence_absence and extract the genome names pan_header_line = pan_file.readline() @@ -81,12 +99,10 @@ def check_gff_in_pan(file_list, gene_presence_absence_path): # if only a subset then raise warning if set(file_list).issubset(genome_names) or set(file_list_no_suffix).issubset(genome_names): if len(file_list) < len(genome_names): - warnings.warn( - "Not all gff in pan genome given as input. I will run with it but are you sure this is deliberate?") - # TODO - LOG above! + logger.info("\nNot all gff in pan genome given as input. 
I will run with it but are you sure this is deliberate?\n") return True # True used for unit testing # Exit with error is not all inputs can be found in the pan-genome presence absence file exit_with_error('Unexpected occurrence in the matching of input GFF files and the pan genome presence/absence file', - EXIT_INPUT_FILE_ERROR) + EXIT_INPUT_FILE_ERROR, logger) diff --git a/Corekaburra/consesus_core_genome.py b/Corekaburra/consesus_core_genome.py index ec34761..1326abe 100644 --- a/Corekaburra/consesus_core_genome.py +++ b/Corekaburra/consesus_core_genome.py @@ -1,4 +1,10 @@ import networkx as nx +try: + from Corekaburra.exit_with_error import exit_with_error +except ModuleNotFoundError: + from exit_with_error import exit_with_error +EXIT_SEGMENT_IDENTIFICATION_ERROR = 4 + # pylint: disable=E1123, E1121 @@ -122,7 +128,7 @@ def identify_segments(core_graph, num_gffs, core_gene_dict): # Check if any node have multiple edges, if not then return. if len(multi_edge_nodes) == 0: - return None # TODO - log and report better that this is the outcome! + return None # Dict to hold connections between >2 edge nodes connect_dict = {} @@ -173,7 +179,8 @@ def identify_segments(core_graph, num_gffs, core_gene_dict): double_edge_segements[source_target_name] = segment else: if double_edge_segements[source_target_name] != segment[::-1]: - raise NotImplementedError("Path from one node to another was found, but did not match previously found path!") # TODO log and nice exit! 
+ exit_with_error(EXIT_SEGMENT_IDENTIFICATION_ERROR, + f"Path from one node to another ({source_target_name}) was found, but did not match previously found path!") # Calculate the expected number of paths total_edges_from_multi_edge_nodes = sum([connections for _, connections in core_graph.degree if connections > 2]) @@ -242,7 +249,7 @@ def identify_segments(core_graph, num_gffs, core_gene_dict): # Check that the path does not contain nodes with >2 degrees outside of source and target, # if then add path, # else then find nodes that has >2 edges and remove an edge that leads to the node, to break the path for next run through loop - if segment_length - 2 == two_degree_segment_length and two_degree_segment_length != 0: # TODO - should this != 0 be here? + if segment_length - 2 == two_degree_segment_length and two_degree_segment_length != 0: double_edge_segements[suspected_pair] = path path_identified = True continue @@ -270,17 +277,21 @@ def identify_segments(core_graph, num_gffs, core_gene_dict): return double_edge_segements -def determine_genome_segments(core_neighbour_pairs, combined_acc_gene_count, num_gffs, core_gene_dict): +def determine_genome_segments(core_neighbour_pairs, combined_acc_gene_count, num_gffs, core_gene_dict, logger): """ Function to be called from main that collects the functions for determining core segments in pan-genome :param core_neighbour_pairs: Dict of the number of times core pairs have been detected :param combined_acc_gene_count: Number of accessory and low-frequency genes detected between core gene pairs :param num_gffs: Number of inputted gff files # TODO - Should this be the minimum number determined by the input cut-off for core genes + :param logger: Program logger :return double_edge_segements: :return no_acc_segments: """ + + logger.debug(f"--------------Searching for segments in pan genome--------------") + # Construct a graph from core gene neighbours core_graph = construct_core_graph(core_neighbour_pairs) @@ -288,9 +299,17 
@@ def determine_genome_segments(core_neighbour_pairs, combined_acc_gene_count, num double_edge_segements = identify_segments(core_graph, num_gffs, core_gene_dict) if double_edge_segements is not None: + logger.debug(f'A total of {len(double_edge_segements)} core genes were identified to have multiple neighbours.') + logger.debug(f'Genes with multiple neighbours: {double_edge_segements}') + + logger.debug('Search for Segments with no accessory genes starts now') + # Find segments of core genes with no accessory in between no_acc_segments = identify_no_accessory_segments(double_edge_segements, combined_acc_gene_count) + + logger.debug('Segments with no accessory genes is done') else: + logger.debug(f'No segments can be identified in given pan-genome\n') no_acc_segments = None return double_edge_segements, no_acc_segments diff --git a/Corekaburra/correct_gffs.py b/Corekaburra/correct_gffs.py index f3f9d23..dd6e5c7 100644 --- a/Corekaburra/correct_gffs.py +++ b/Corekaburra/correct_gffs.py @@ -44,17 +44,21 @@ def read_gene_data(gene_data_file): return gene_data_dict -def prepair_for_reannotation(gene_data_path, output_folder, gffs): +def prepair_for_reannotation(gene_data_path, output_folder, gffs, logger): """ Function for creating an output folder for corrected genomes, check if any are present, and if then which. :param gene_data_path: Path to the gene_data.csv file from Panaroo :param output_folder: Folder designated as the output folder for Corekaburra :param gffs: List of file-paths to gff files. 
+ :param logger: Program logger :return gene_data_dict: Dict containing the information expected from the gene_data.csv file :return corrected_gff_out_dir: File path to the created or identified directory of corrected gff files :return gffs: List of gff files, some may be altered to be the corrected verison from prior runs/ """ + + logger.debug('Initialise structures for reannotating genes found by Panaroo') + # Read Gene_data.csv file into dict with a dict of refound genes for each genome gene_data_dict = read_gene_data(gene_data_path) @@ -213,13 +217,14 @@ def write_contig(file, contig_name, sequence): file.write(sequence[len(sequence) - remainder:genome_length+1] + '\n') -def annotate_refound_genes(gff_name, gene_data_dict, tmp_folder_path, corrected_gff_out_dir): +def annotate_refound_genes(gff_name, gene_data_dict, tmp_folder_path, corrected_gff_out_dir, logger): """ Function to add back in genes that are refound by Panaroo into gff files. :param gff_name: File path of gff to be corrected :param gene_data_dict: Dict of refound genes identified from gene_presence_absence.csv file :param tmp_folder_path: File path to the temporary folder :param corrected_gff_out_dir: File path to the folder where corrected genomes should be place + :param logger: Program logger :return: Nothing. """ """ Function to annotate the genes refound by Panaroo in a gff3 file""" @@ -262,7 +267,7 @@ def annotate_refound_genes(gff_name, gene_data_dict, tmp_folder_path, corrected_ else: # get reverse complement of the gene - gene_oi = Seq.reverse_complement(gene_oi) # TODO - Should evaluate if this is correct! Make test that test search for both forward and backward genes on first and second contig! 
+ gene_oi = Seq.reverse_complement(gene_oi) if gene_oi in genome_oi: strand = '-' @@ -274,7 +279,7 @@ def annotate_refound_genes(gff_name, gene_data_dict, tmp_folder_path, corrected_ refound_gene, gene_data_dict[genome_name][refound_gene][1:], largest_locus_tag) else: exit_with_error(f"When correcting gff {gff_name}, the gene: {refound_gene} " - f"did not have any hit in the genome!", EXIT_GFF_REANNOTATION_ERROR) + f"did not have any hit in the genome!", EXIT_GFF_REANNOTATION_ERROR, logger) # Construct a database from the temporary gff that contain the added annotations path_tmp_gff_db = os.path.join(tmp_folder_path, f'{gff_file_name}_tmp_db') diff --git a/Corekaburra/exit_with_error.py b/Corekaburra/exit_with_error.py index 6412273..8403942 100644 --- a/Corekaburra/exit_with_error.py +++ b/Corekaburra/exit_with_error.py @@ -1,15 +1,17 @@ import sys import logging import os +from logging import getLogger -def exit_with_error(message, exit_status, tmp_folder=None): +def exit_with_error(message, exit_status, logger, tmp_folder=None): """ Print an error message to stderr, prefixed by the program name and 'ERROR'. Then exit program with supplied exit status. :param message: Message to give the user upon exit :param exit_status: Status returned as exit status :param tmp_folder: Temporary folder for Corekaburra to be deleted under some circumstances. + :param logger: Logger for program :return: None """ @@ -24,8 +26,7 @@ def exit_with_error(message, exit_status, tmp_folder=None): pass except FileNotFoundError: pass - # TODO - Implement a nice crash function where the temporary folder is removed not to cause unessecary frustration for the user when trying to rerun the program. 
- do so in nice exit function - logging.error(message) + logger.error(message) print(f"Corekaburra ERROR: {message}, exiting", file=sys.stderr) sys.exit(exit_status) diff --git a/Corekaburra/gff_parser.py b/Corekaburra/gff_parser.py index 3acb7ec..83079e5 100644 --- a/Corekaburra/gff_parser.py +++ b/Corekaburra/gff_parser.py @@ -220,8 +220,6 @@ def connect_first_n_last_gene_on_contig(core_genes, gff_name, previous_core_gene last_core_gene_cluster = core_genes[gff_name][previous_core_gene_id] first_core_gene_cluster = core_genes[gff_name][first_core_gene_gff_line[8]] - if first_core_gene_cluster == last_core_gene_cluster: - print('Same gene') # TODO - Log this? report or what? # Add core neighbours core_gene_neighbours = sorted([last_core_gene_cluster, first_core_gene_cluster]) @@ -378,6 +376,7 @@ def segment_gff_content(gff_generator, core_genes, low_freq_genes, gff_path, acc if line[8] in low_freq_genes[gff_name]: low_freq_genes_in_region.append(low_freq_genes[gff_name][line[8]]) else: + # acc_genes_in_region.append(acc_genes[gff_name][line[8]]) try: acc_genes_in_region.append(acc_genes[gff_name][line[8]]) except KeyError: # TODO - WHAT DOES THIS DO? - Likely search for fragment within composite, as fragments were previously storred in their composit strings. 
@@ -560,15 +559,21 @@ def segment_gff_content(gff_generator, core_genes, low_freq_genes, gff_path, acc def segment_genome_content(input_gff_file, core_genes, low_freq_genes, acc_gene_dict, complete_genomes, source_program, - annotate, gene_data_dict, corrected_dict, tmp_folder_path, discard_corrected): + annotate, gene_data_dict, corrected_dir, tmp_folder_path, discard_corrected, logger): """ Single function segmenting the gff into core gene regions to be used for simple multi processing :param input_gff_file: File-path to the given gff file to be segmented :param core_genes: Dictionary over core genes :param low_freq_genes: Dictionary over low-frequency genes :param acc_gene_dict: Dictionary over accessory genes - :param i: The index of which this process is in loop :param complete_genomes: Bool indicating if this genome should be considered as a complete genome + :param source_program: String indicating if program comes from Roary or Panaroo. + :param annotate: Bool to indicate if refound genes should be annotated + :param gene_data_dict: Dict of genes, annotations, names, and sequences found in the gene_data.csv file from Panaroo + :param corrected_dir: File path to directory where corrected Gff files are to be stored. + :param tmp_folder_path: Path to the temporary working folder. + :param discard_corrected: Bool indicating if corrected Gff files should be preserved as an output + :param logger: Progran logger :return input_gff_file: File path to the gff being searched :return core_genes: Dict of core genes passed to genomes and the pan-genome clusters. 
@@ -583,9 +588,7 @@ def segment_genome_content(input_gff_file, core_genes, low_freq_genes, acc_gene_ if source_program == "Panaroo" and annotate: # check if not already corrected file and if any gene is to be inserted at all if "_corrected" not in input_gff_file and any([x in input_gff_file for x in list(gene_data_dict)]): - input_gff_file = annotate_refound_genes(input_gff_file, gene_data_dict, tmp_folder_path, corrected_dict) - - # TODO - likely check if genome should be corrected at this point in the process. - Would require more inputs. + input_gff_file = annotate_refound_genes(input_gff_file, gene_data_dict, tmp_folder_path, corrected_dir, logger) gff_generator = parse_gff(input_gff_file) return_data = segment_gff_content(gff_generator=gff_generator, @@ -595,8 +598,6 @@ def segment_genome_content(input_gff_file, core_genes, low_freq_genes, acc_gene_ acc_genes=acc_gene_dict, complete_genomes=complete_genomes) - # TODO - Add in an if statement that checks if the corrected files should be kept! - # - If not then delete them and add an if statment that will delete the folder in the main script if "_corrected" in input_gff_file and discard_corrected: os.remove(input_gff_file) diff --git a/Corekaburra/output_writer_functions.py b/Corekaburra/output_writer_functions.py index 9820a3e..ed76269 100644 --- a/Corekaburra/output_writer_functions.py +++ b/Corekaburra/output_writer_functions.py @@ -3,17 +3,14 @@ import time -def master_info_writer(master_info, out_path, prefix, quiet): +def master_info_writer(master_info, out_path, prefix): """ Function to write two output .tsv files related to regions content and size for each genome :param master_info: Dict of info for each core gene pair across all genomes :param out_path: Path to the output folder :param prefix: A possible prefix for the output files. 
- :param quiet: :return: Nothing """ - if not quiet: - print("Printing master output") # Write general content out_file_name = 'low_frequency_gene_placement.tsv' @@ -67,18 +64,14 @@ def master_info_writer(master_info, out_path, prefix, quiet): writer.writerow(row) -def summary_info_writer(master_summary_info, out_path, prefix, quiet): +def summary_info_writer(master_summary_info, out_path, prefix): """ Function for writing the summary table for regions identified across genomes :param master_summary_info: Dict holding summary statistics for core pair region identified :param out_path: Path to the output folder :param prefix: Prefix for any output files - :param quiet: # TODO - log instead :return: Nothing """ - if not quiet: - print("Printing master output") - # Generate file name out_file_name = 'core_pair_summary.csv' if prefix is not None: @@ -102,17 +95,14 @@ def summary_info_writer(master_summary_info, out_path, prefix, quiet): writer.writerow(info) -def segment_writer(segments, out_path, prefix, quiet): +def segment_writer(segments, out_path, prefix): """ Function to write segments of core genes identified across the pan-genome :param segments: Dict of segments (lists) in values, under name of segments as keys. :param out_path: Path to output folder :param prefix: Prefix for any output files - :param quiet: # TODO - logger :return: Nothing """ - if not quiet: - print("Printing core segments") # Generate file name out_file_name = 'core_segments.csv' @@ -135,17 +125,14 @@ def segment_writer(segments, out_path, prefix, quiet): writer.writerow(info) -def no_acc_segment_writer(no_acc_segments, out_path, prefix, quiet): +def no_acc_segment_writer(no_acc_segments, out_path, prefix): """ Function for writing segments of core genes with no accessory between them. :param no_acc_segments: Dict of segments with (lists) in values with sub-lists being segments with no accessory genes between them, under name of segments as keys. 
:param out_path: Path to output folder :param prefix: Prefix for any output files - :param quiet: # TODO - logger :return: Nothing """ - if not quiet: - print("Printing core segments without accessory content") # Generate file name out_file_name = 'no_accessory_core_segments.csv' @@ -178,9 +165,6 @@ def non_core_contig_writer(non_core_contigs, out_path, prefix,): :param prefix: A possible prefix for the output files. :return: Nothing """ - # if not quiet: # TODO - log - # print("Printing master output") - # Write gene content in long format out_file_name = 'coreless_contig_accessory_gene_content.tsv' if prefix is not None: diff --git a/Corekaburra/parse_gene_presence_absence.py b/Corekaburra/parse_gene_presence_absence.py index 1125ded..d680ad1 100644 --- a/Corekaburra/parse_gene_presence_absence.py +++ b/Corekaburra/parse_gene_presence_absence.py @@ -40,7 +40,7 @@ def check_fragmented_gene(fragment_info, input_gffs, tmp_folder_path): # Get the gff and its path try: - gff_file = [file for file in input_gffs if f'{genome}.gff' in file][0] # TODO - fix that using a locus_tag it is not possible to identify genes. How do we make it so that we can? + gff_file = [file for file in input_gffs if f'{genome}.gff' in file][0] except IndexError: raise NotImplementedError(f'No gff match was found when searching fragments for genome: {genome}') @@ -84,10 +84,9 @@ def check_fragmented_gene(fragment_info, input_gffs, tmp_folder_path): return return_list # TODO - find out what the non-closed file problem is here! Can be seen when running unit-tests. - # TODO - Find out how the gff parser handles this? Does there need to be a check if a gene cluster is being paired to it self and if then drop it and change the end coordinates. 
-def read_gene_presence_absence(pres_abs_file, core_gene_presence, low_freq_gene, source_program, input_gffs, tmp_folder_path, verbose=True): +def read_gene_presence_absence(pres_abs_file, core_gene_presence, low_freq_gene, source_program, input_gffs, tmp_folder_path, logger): """ Function that pass a Roary style gene presence/absence file. :param pres_abs_file: File path to the gene presence/absence file identified @@ -96,7 +95,7 @@ def read_gene_presence_absence(pres_abs_file, core_gene_presence, low_freq_gene, :param source_program: The program from which the pan-genome was produced :param input_gffs: A list of file-paths to the gff files given as input :param tmp_folder_path: A file-path to the temporary folder of the Corekaburra run - :param verbose: Indeicater on verbosety level # TODO - Likely change to logger! + :param logger: Program logger :return: Directories of directories of core and low frequency genes, and a directory of pan genome clusters and their annotation. """ @@ -132,10 +131,9 @@ def read_gene_presence_absence(pres_abs_file, core_gene_presence, low_freq_gene, core_gene_isolate_presence = floor(len(gff_file_dict.keys()) * core_gene_presence) low_freq_gene_isolate_presence = ceil(len(gff_file_dict.keys()) * low_freq_gene) - if verbose: - print(f"\n------------Opening the gene presence/absence file------------\n") - print(f"Core genes must be found in {core_gene_isolate_presence} or more isolates") - print(f"Low frequency genes must be found in less than {low_freq_gene_isolate_presence} isolates\n") + logger.info(f"------------Opening the gene presence/absence file------------\n" + f"Core genes must be found in {core_gene_isolate_presence} or more genomes\n" + f"Low frequency genes must be found in less than {low_freq_gene_isolate_presence} genomes\n") # initialise dict of dicts to hold genes from each gffs and to be returned core_gene_dict = {item: {} for item in gff_file_names[14:]} @@ -173,7 +171,7 @@ def 
read_gene_presence_absence(pres_abs_file, core_gene_presence, low_freq_gene, # Check if gene was found to be a core gene if all(return_list): # Add the gene to the annotation dict - for genome in core_gene_dict.keys(): # TODO - Check if .keys can be omitted + for genome in core_gene_dict: # Get the annoations for a specific genome genes_in_genome = line[14 + gff_file_dict[genome]] # If there is an annotation add id @@ -186,7 +184,7 @@ def read_gene_presence_absence(pres_abs_file, core_gene_presence, low_freq_gene, else: # Check if low frequency, if then add else then add as normal accessory - if low_freq_gene_isolate_presence >= gene_isolate_presence == no_seq_presence: # TODO - review this == statement, should it be there? + if low_freq_gene_isolate_presence >= gene_isolate_presence == no_seq_presence: for genome in low_freq_gene_dict.keys(): if len(line[14 + gff_file_dict[genome]]) > 0: add_gene_to_dict(low_freq_gene_dict, line[14 + gff_file_dict[genome]], line[0], genome) @@ -198,7 +196,7 @@ def read_gene_presence_absence(pres_abs_file, core_gene_presence, low_freq_gene, acc_gene_number += 1 # Check if accessory if then add annotation to genomes - elif low_freq_gene_isolate_presence >= gene_isolate_presence == no_seq_presence: # TODO - review this == statement, should it be there? 
+ elif low_freq_gene_isolate_presence >= gene_isolate_presence == no_seq_presence: for genome in low_freq_gene_dict.keys(): if len(line[14+gff_file_dict[genome]]) > 0: add_gene_to_dict(low_freq_gene_dict, line[14 + gff_file_dict[genome]], line[0], genome) @@ -211,12 +209,10 @@ def read_gene_presence_absence(pres_abs_file, core_gene_presence, low_freq_gene, add_gene_to_dict(acc_gene_dict, line[14 + gff_file_dict[genome]], line[0], genome) acc_gene_number += 1 - if verbose: - print("A total of:") - print(f"{core_gene_number} core gene clusters were identified") - print(f"{low_freq_gene_number} low frequency gene clusters were identified") - print(f"{acc_gene_number} intermediate accessory gene clusters were identified\n") - + logger.info("A total of:\n" + f"{core_gene_number} core gene clusters were identified\n" + f"{low_freq_gene_number} low frequency gene clusters were identified\n" + f"{acc_gene_number} intermediate accessory gene clusters were identified\n") # Remove gff databases files_in_tmp = os.listdir(tmp_folder_path) diff --git a/Corekaburra/read_complete_genome_file.py b/Corekaburra/read_complete_genome_file.py index 08f0939..98c580b 100644 --- a/Corekaburra/read_complete_genome_file.py +++ b/Corekaburra/read_complete_genome_file.py @@ -7,11 +7,12 @@ EXIT_INPUT_FILE_ERROR = 1 -def parse_complete_genome_file(complete_genome_file, gff_files): +def parse_complete_genome_file(complete_genome_file, gff_files, logger): """ Function to check if all genomes given as complete genomes can be found in the pan genome. :param complete_genome_file: :param gff_files: + :param logger: Logger for program :return: a list of the base name of the complete genomes. 
""" @@ -30,9 +31,11 @@ def parse_complete_genome_file(complete_genome_file, gff_files): # If the complete genomes are found, return a list of complete genomes if complete_genome_status: + logger.debug(f'complete genomes: {complete_genomes = } were identified and accepted') return complete_genomes else: - exit_with_error('Genome given in Complete genomes was not identified in pan-genome!', EXIT_INPUT_FILE_ERROR) + logger.debug(f'complete genomes: {complete_genomes = } were identified but not accepted!') + exit_with_error('Genome given in Complete genomes was not identified in pan-genome!', EXIT_INPUT_FILE_ERROR, logger) if __name__ == '__main__': diff --git a/functional_tests/Corekaburra-test.sh b/functional_tests/Corekaburra-test.sh index 29e5f56..73cba70 100755 --- a/functional_tests/Corekaburra-test.sh +++ b/functional_tests/Corekaburra-test.sh @@ -291,7 +291,6 @@ test_output_file test_out_folder/Corrected_gff_files/complete_genome_single_chro test_output_file test_out_folder/Corrected_gff_files/genome_single_chrom_larger_rearrange_corrected.gff Reannotation_sucessful_expected/Corrected_gff_files/genome_single_chrom_larger_rearrange_corrected.gff.expected rm -r test_out_folder -# TODO - Set up test with a single core gene on a contig that is not complete call_new_test "Test with a single core gene on a contig that is not complete" Corekaburra -ig complete_genome_single_chrom.gff complete_genome_double_chrom.gff -ip Single_core_contig/ -o test_out_folder/ > /dev/null 2>&1 test_output_file test_out_folder/core_core_accessory_gene_content.tsv single_core_contig_draft_expected/core_core_accessory_gene_content.tsv.expected @@ -299,7 +298,6 @@ test_output_file test_out_folder/low_frequency_gene_placement.tsv single_core_co test_output_file test_out_folder/core_pair_summary.csv single_core_contig_draft_expected/core_pair_summary.csv.expected rm -r test_out_folder -# TODO - Set up test with a single core gene on a contig that is complete call_new_test "Test with a single 
core gene on a contig that is complete" Corekaburra -ig complete_genome_single_chrom.gff complete_genome_double_chrom.gff -ip Single_core_contig/ -o test_out_folder/ -cg complete_genomes_file > /dev/null 2>&1 test_output_file test_out_folder/core_core_accessory_gene_content.tsv single_core_contig_complete_expected/core_core_accessory_gene_content.tsv.expected @@ -307,7 +305,6 @@ test_output_file test_out_folder/low_frequency_gene_placement.tsv single_core_co test_output_file test_out_folder/core_pair_summary.csv single_core_contig_complete_expected/core_pair_summary.csv.expected rm -r test_out_folder -# TODO - test for core genes being fragmented. call_new_test "Test for core genes being fragmented" Corekaburra -ig complete_genome_single_chrom.gff genome_single_chrom_larger_2.gff genome_single_chrom_larger_rearrange.gff complete_genome_single_chrom_2.gff -ip Fragmented_core_gene_run/ -o test_out_folder/ > /dev/null 2>&1 test_output_file test_out_folder/core_core_accessory_gene_content.tsv Fragmented_core_run_expected/core_core_accessory_gene_content.tsv.expected @@ -320,7 +317,6 @@ rm -r test_out_folder # TODO - run the test check results and transfer to expected folder #rm -r test_out_folder -# TODO - test for accessory genes being fragmented. call_new_test "Test for accessory genes being fragmented" Corekaburra -ig complete_genome_single_chrom.gff genome_single_chrom_larger.gff genome_single_chrom_larger_rearrange.gff complete_genome_single_chrom_2.gff -ip Fragmented_accessory_gene_run/ -o test_out_folder/ > /dev/null 2>&1 test_output_file test_out_folder/core_core_accessory_gene_content.tsv Fragmented_accessory_gene_run_expected/core_core_accessory_gene_content.tsv.expected @@ -329,7 +325,6 @@ test_output_file test_out_folder/core_pair_summary.csv Fragmented_accessory_gene rm -r test_out_folder -# TODO - set up a test with a core-less contig draft. 
call_new_test "Test with a core-less contig draft" Corekaburra -ig complete_genome_double_chrom_2.gff complete_genome_double_chrom.gff -ip Coreless_contig_run/ -o test_out_folder/ test_output_file test_out_folder/core_core_accessory_gene_content.tsv coreless_contig_draft_expected/core_core_accessory_gene_content.tsv.expected @@ -338,7 +333,6 @@ test_output_file test_out_folder/core_pair_summary.csv coreless_contig_draft_exp test_output_file test_out_folder/coreless_contig_accessory_gene_content.tsv coreless_contig_draft_expected/coreless_contig_accessory_gene_content.tsv.expected rm -r test_out_folder -# TODO - set up a test with a core-less contig complete. call_new_test "Test with a core-less contig complete" Corekaburra -ig complete_genome_double_chrom_2.gff complete_genome_double_chrom.gff -ip Coreless_contig_run/ -o test_out_folder/ -cg Complete_double_chromosomes.txt test_output_file test_out_folder/core_core_accessory_gene_content.tsv Coreless_contig_complete_expected/core_core_accessory_gene_content.tsv.expected diff --git a/unit_tests/Corekaburra_test.py b/unit_tests/Corekaburra_test.py index 6de2bcd..88f151f 100644 --- a/unit_tests/Corekaburra_test.py +++ b/unit_tests/Corekaburra_test.py @@ -32,6 +32,11 @@ class TestExitWithError(unittest.TestCase): """ Test for the function carrying out a nice exit """ + @classmethod + def setUpClass(cls): + cls.logger = logging.getLogger('test_logger.log') + cls.logger.setLevel(logging.INFO) + def test_exit_w_tmp_folder_deletion(self): ''' Test the exit function is able to remove the temporary folder ''' @@ -45,27 +50,38 @@ def test_exit_w_tmp_folder_deletion(self): copyfile(os.path.join(tmp_folder, file), os.path.join(tmp_folder_copy, file)) with self.assertRaises(SystemExit): - exit_with_error.exit_with_error(exit_status=2, message='test msg', tmp_folder=tmp_folder) + exit_with_error.exit_with_error(exit_status=2, message='test msg', logger=self.logger, tmp_folder=tmp_folder) os.rename(tmp_folder_copy, tmp_folder) 
class TestCutOffViolations(unittest.TestCase): + """ Test for the function that examines the cutoffs given for core and low-frequency genes""" + @classmethod + def setUpClass(cls): + cls.logger = logging.getLogger('test_logger.log') + cls.logger.setLevel(logging.INFO) + def test_low_below_range(self): with self.assertRaises(SystemExit): - check_inputs.check_cutoffs(-0.1, 1) + check_inputs.check_cutoffs(-0.1, 1, self.logger) def test_core_above_range(self): with self.assertRaises(SystemExit): - check_inputs.check_cutoffs(0.05, 1.1) + check_inputs.check_cutoffs(0.05, 1.1, self.logger) def test_low_larger_than_core(self): with self.assertRaises(SystemExit): - check_inputs.check_cutoffs(0.6, 0.4) + check_inputs.check_cutoffs(0.6, 0.4, self.logger) class TestParsingCompleteGenomes(unittest.TestCase): """ Test for the passing of input file containing names of complete genome and checking their presence in the pan-genome """ + @classmethod + def setUpClass(cls): + cls.logger = logging.getLogger('test_logger.log') + cls.logger.setLevel(logging.INFO) + def test_all_files_found(self): gff_files = ['/path/to/complete_genome_1.gff', '/path/complete_genome_2.gff.gz', @@ -81,7 +97,7 @@ def test_all_files_found(self): 'complete_genome_3', 'complete_genome_4'] - return_object = read_complete_genome_file.parse_complete_genome_file(complete_genome_file, gff_files) + return_object = read_complete_genome_file.parse_complete_genome_file(complete_genome_file, gff_files, self.logger) self.assertEqual(return_object, expected_return) @@ -96,15 +112,20 @@ def test_correct_one_files_not_found(self): with self.assertRaises(SystemExit): read_complete_genome_file.parse_complete_genome_file(complete_genome_file, - gff_files) + gff_files, self.logger) class TestPangenomeSourceProgram(unittest.TestCase): """ Test of the function that determines the program from which the pan-genome originated """ + @classmethod + def setUpClass(cls): + cls.logger = logging.getLogger('test_logger.log') + 
cls.logger.setLevel(logging.INFO) + def test_roary_input(self): input_folder_path = 'TestPangenomeSourceProgram/Mock_roary' - return_program, return_path = check_inputs.define_pangenome_program(input_folder_path) + return_program, return_path = check_inputs.define_pangenome_program(input_folder_path, self.logger) self.assertEqual("Roary", return_program) self.assertEqual(input_folder_path + '/gene_presence_absence.csv', return_path) @@ -112,7 +133,7 @@ def test_roary_input(self): def test_panaroo_input(self): input_folder_path = 'TestPangenomeSourceProgram/Mock_panaroo' - return_program, return_path = check_inputs.define_pangenome_program(input_folder_path) + return_program, return_path = check_inputs.define_pangenome_program(input_folder_path, self.logger) self.assertEqual("Panaroo", return_program) self.assertEqual(input_folder_path + '/gene_presence_absence_roary.csv', return_path) @@ -129,14 +150,19 @@ def test_unknown_input(self): input_folder_path = 'TestPangenomeSourceProgram/Mock_unknwon' with self.assertRaises(SystemExit): - check_inputs.define_pangenome_program(input_folder_path) + check_inputs.define_pangenome_program(input_folder_path, self.logger) class TestPresenceOfGenedataFile(unittest.TestCase): """ Test the function that ensures the presence of the Gene_data.csv file produced by Panaroo """ + @classmethod + def setUpClass(cls): + cls.logger = logging.getLogger('test_logger.log') + cls.logger.setLevel(logging.INFO) + def test_Genedata_File_present(self): input_folder_path = 'TestPresenceOfGenedataFile/present' - return_path = check_inputs.check_gene_data(input_folder_path) + return_path = check_inputs.check_gene_data(input_folder_path, self.logger) self.assertEqual(return_path, input_folder_path +'/gene_data.csv') @@ -144,17 +170,22 @@ def test_Genedata_File_absent(self): input_folder_path = 'TestPresenceOfGenedataFile/absent' with self.assertRaises(SystemExit): - check_inputs.check_gene_data(input_folder_path) + 
check_inputs.check_gene_data(input_folder_path, self.logger) class TestPresenceOfGffsInPresAbsFile(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.logger = logging.getLogger('test_logger.log') + cls.logger.setLevel(logging.INFO) + """ Test the function that ensures all gffs given as input are included in the pan-genome provided """ # Test pairing of all files in pan genome def test_input_gff_pres_abs_pairing_all_gffs(self): input_pres_abs = 'TestPresenceOfGffsInPresAbsFile/gene_presence_absence_roary.csv' input_file_list = ['Silas_the_Salmonella', 'Christina_the_Streptococcus', 'Ajwa_the_Shigella'] - return_bool = check_inputs.check_gff_in_pan(input_file_list, input_pres_abs) + return_bool = check_inputs.check_gff_in_pan(input_file_list, input_pres_abs, self.logger) self.assertEqual(return_bool, True) @@ -163,8 +194,7 @@ def test_input_gff_pres_abs_pairing_some(self): input_pres_abs = 'TestPresenceOfGffsInPresAbsFile/gene_presence_absence_roary.csv' input_file_list = ['Silas_the_Salmonella.gff', 'Christina_the_Streptococcus.gff'] - with self.assertWarns(Warning): - return_bool = check_inputs.check_gff_in_pan(input_file_list, input_pres_abs) + return_bool = check_inputs.check_gff_in_pan(input_file_list, input_pres_abs, self.logger) self.assertEqual(return_bool, True) @@ -174,14 +204,14 @@ def test_input_gff_pres_abs_file_not_in_pan(self): input_file_list = ['not_found.gff', 'Silas_the_Salmonella.gff', 'Christina_the_Streptococcus.gff'] with self.assertRaises(SystemExit): - check_inputs.check_gff_in_pan(input_file_list, input_pres_abs) + check_inputs.check_gff_in_pan(input_file_list, input_pres_abs, self.logger) def test_input_gff_pres_abs_some_file_not_in_pan(self): input_pres_abs = 'TestPresenceOfGffsInPresAbsFile/gene_presence_absence_roary.csv' input_file_list = ['not_found.gff', 'also_not_found.gff', 'definitely_not_found.gff'] with self.assertRaises(SystemExit): - check_inputs.check_gff_in_pan(input_file_list, input_pres_abs) + 
check_inputs.check_gff_in_pan(input_file_list, input_pres_abs, self.logger) class TestAddingGeneToDict(unittest.TestCase): @@ -306,13 +336,16 @@ def test_fragments_on_separate_contigs(self): self.assertEqual(expected_return, return_bool) - # TODO - Can a fragmented gene be recognised, if spanning contigs? - class TestParsingGenePresenceAbsenceFile(unittest.TestCase): """ Tests for the function that passes the gene presence absence table from pan-genome program """ + @classmethod + def setUpClass(cls): + cls.logger = logging.getLogger('test_logger.log') + cls.logger.setLevel(logging.INFO) + def test_parsing_w_100_presence(self): file_name = 'TestParsingGenePresenceAbsenceFile/gene_presence_absence_roary.csv' core_gene_presence = 1 @@ -397,7 +430,7 @@ def test_parsing_w_100_presence(self): parse_gene_presence_absence.read_gene_presence_absence( file_name, core_gene_presence, low_freq_gene, source_program, - input_gffs, tmp_folder_path) + input_gffs, tmp_folder_path, self.logger) self.assertEqual(expected_core_gene_dict, core_gene_dict) self.assertEqual(expected_low_freq_gene_dict, low_freq_gene_dict) @@ -427,7 +460,7 @@ def test_parsing_w_100_presence_roary(self): parse_gene_presence_absence.read_gene_presence_absence( file_name, core_gene_presence, low_freq_gene, source_program, - input_gffs, tmp_folder_path) + input_gffs, tmp_folder_path, self.logger) expected_core_gene_dict = {'Silas_the_Salmonella': {'Silas_the_Salmonella_tag-1-1': "A", 'Silas_the_Salmonella_tag-1-2.1': "B", @@ -517,7 +550,7 @@ def test_parsing_w_90_presence(self): parse_gene_presence_absence.read_gene_presence_absence( file_name, core_gene_presence, low_freq_gene, source_program, - input_gffs, tmp_folder_path) + input_gffs, tmp_folder_path, self.logger) expected_core_gene_dict = {'Silas_the_Salmonella': {'Silas_the_Salmonella_tag-1-1': "A", 'Silas_the_Salmonella_tag-1-2.1': "B", @@ -607,7 +640,7 @@ def test_parsing_w_90_presence_roary(self): 
parse_gene_presence_absence.read_gene_presence_absence( file_name, core_gene_presence, low_freq_gene, source_program, - input_gffs, tmp_folder_path) + input_gffs, tmp_folder_path, self.logger) expected_core_gene_dict = {'Silas_the_Salmonella': {'Silas_the_Salmonella_tag-1-1': "A", 'Silas_the_Salmonella_tag-1-2.1': "B", @@ -693,6 +726,11 @@ def test_read_file(self): class TestPrepairForReannotation(unittest.TestCase): """ Test for pre-pairing a folder for corrected genomes, and testing if any are present from previous runs """ + @classmethod + def setUpClass(cls): + cls.logger = logging.getLogger('test_logger.log') + cls.logger.setLevel(logging.INFO) + def tearDown(self): try: """ Class to remove created corrected output folder""" @@ -702,9 +740,11 @@ def tearDown(self): def test_no_files_annotated(self): input_gffs = ['Mock_1.gff', 'Mock_2.gff'] - gene_data_dict_return, corrected_gff_out_dir_return, corrected_files_return = correct_gffs.prepair_for_reannotation('TestPrepairForReannotation/Mock_gene_data.csv', - 'TestPrepairForReannotation/', - input_gffs) + gene_data_dict_return, \ + corrected_gff_out_dir_return, \ + corrected_files_return = correct_gffs.prepair_for_reannotation('TestPrepairForReannotation/Mock_gene_data.csv', + 'TestPrepairForReannotation/', + input_gffs, self.logger) self.assertTrue(os.path.isdir('TestPrepairForReannotation/Corrected_gff_files')) self.assertEqual(input_gffs, corrected_files_return) @@ -714,7 +754,7 @@ def test_some_files_annotated(self): gene_data_dict_return, corrected_gff_out_dir_return, corrected_files_return = correct_gffs.prepair_for_reannotation( 'TestPrepairForReannotation/Mock_gene_data.csv', 'TestPrepairForReannotation/Some_genomes', - input_gffs) + input_gffs, self.logger) expected_gffs = ['Mock_2.gff', 'Mock_1_corrected.gff'] @@ -725,7 +765,7 @@ def test_all_files_annotated(self): gene_data_dict_return, corrected_gff_out_dir_return, corrected_files_return = correct_gffs.prepair_for_reannotation( 
'TestPrepairForReannotation/Mock_gene_data.csv', 'TestPrepairForReannotation/All_genomes', - input_gffs) + input_gffs, self.logger) expected_gffs = ['Mock_1_corrected.gff', 'Mock_2_corrected.gff'] @@ -849,6 +889,11 @@ class TestAnnotateRefoundGenomes(unittest.TestCase): """ Test of the function used to reannotate refound genes identified by panaroo in a gff file. """ + @classmethod + def setUpClass(cls): + cls.logger = logging.getLogger('test_logger.log') + cls.logger.setLevel(logging.INFO) + def tearDown(self): """ Class to remove modified gff and rename the original""" try: @@ -886,7 +931,7 @@ def test_annotation_of_pos_stand_gene(self): 'TTTT\n' ] - correct_gffs.annotate_refound_genes(gff_name, gene_data_dict, tmp_folder_path, corrected_gff_out_dir) + correct_gffs.annotate_refound_genes(gff_name, gene_data_dict, tmp_folder_path, corrected_gff_out_dir, self.logger) with open('TestAnnotateRefoundGenomes/reannotate_gff_corrected.gff', 'r') as added_gff: self.assertEqual(expected_lines, added_gff.readlines()) @@ -917,7 +962,7 @@ def test_annotation_of_neg_stand_gene(self): 'TGGTTATCTAGGACATGGTCAAAGTAACTTTATTTATAATGAATTTTAGTTTTTTTTTTT\n', 'TTTT\n'] - correct_gffs.annotate_refound_genes(gff_name, gene_data_dict, tmp_folder_path, corrected_gff_out_dir) + correct_gffs.annotate_refound_genes(gff_name, gene_data_dict, tmp_folder_path, corrected_gff_out_dir, self.logger) with open('TestAnnotateRefoundGenomes/reannotate_gff_corrected.gff', 'r') as added_gff: self.assertEqual(expected_lines, added_gff.readlines()) @@ -932,7 +977,7 @@ def test_gene_not_found(self): corrected_gff_out_dir = 'TestAnnotateRefoundGenomes' with self.assertRaises(SystemExit): - correct_gffs.annotate_refound_genes(gff_name, gene_data_dict, tmp_folder_path, corrected_gff_out_dir) + correct_gffs.annotate_refound_genes(gff_name, gene_data_dict, tmp_folder_path, corrected_gff_out_dir, self.logger) # TODO - Add test for annotating of second contig @@ -3243,7 +3288,7 @@ def 
test_double_edge_segment_identification_segments_node_w_challenging_paths(se core_graph = consesus_core_genome.construct_core_graph(core_neighbour_pairs) double_edge_segements = consesus_core_genome.identify_segments(core_graph, 5, core_gene_dict) - self.assertEqual(expected_segments, double_edge_segements) # TODO + self.assertEqual(expected_segments, double_edge_segements) def test_double_edge_segment_identification_segments_node_w_challenging_paths_2(self): expected_segments = {'pan_cluster_A--pan_cluster_B': ['pan_cluster_A', 'pan_cluster_F', 'pan_cluster_B'], @@ -3329,7 +3374,7 @@ def test_double_edge_segment_identification_segments_node_w_less_than_all_presen self.assertEqual(expected_segments, double_edge_segements) - def test_double_edge_segment_identification_segments_node_w_two_gene_segment(self): # TODO - see TODO on line 791 in consensus_core_genome! + def test_double_edge_segment_identification_segments_node_w_two_gene_segment(self): expected_segments = {'pan_cluster_A--pan_cluster_B': ['pan_cluster_A', 'pan_cluster_B'], 'pan_cluster_A--pan_cluster_G': ['pan_cluster_A', 'pan_cluster_I', 'pan_cluster_H', 'pan_cluster_G'], 'pan_cluster_B--pan_cluster_E': ['pan_cluster_B', 'pan_cluster_C', 'pan_cluster_D', 'pan_cluster_E'], @@ -3523,7 +3568,7 @@ def test_master_info_writer(self): expected_low_freq = 'TestWritingOutputFunction/low_freq.txt' expected_gene_content = 'TestWritingOutputFunction/gene_content.txt' - output_writer_functions.master_info_writer(master_info, out_path, prefix, True) + output_writer_functions.master_info_writer(master_info, out_path, prefix) with open(expected_low_freq, 'r') as expected: with open('TestWritingOutputFunction/test_low_frequency_gene_placement.tsv', 'r') as result: @@ -3550,7 +3595,7 @@ def test_summary_info_writer(self): expected_summary_table = 'TestWritingOutputFunction/summary_table.txt' - output_writer_functions.summary_info_writer(input_dict, out_path, prefix, True) + 
output_writer_functions.summary_info_writer(input_dict, out_path, prefix) with open(expected_summary_table, 'r') as expected: with open('TestWritingOutputFunction/test_core_pair_summary.csv', 'r') as result: @@ -3572,7 +3617,7 @@ def test_segment_writer(self): expected_summary_table = 'TestWritingOutputFunction/core_segments.txt' - output_writer_functions.segment_writer(input_segments, out_path, prefix, True) + output_writer_functions.segment_writer(input_segments, out_path, prefix) with open(expected_summary_table, 'r') as expected: with open('TestWritingOutputFunction/test_core_segments.csv', 'r') as result: @@ -3594,7 +3639,7 @@ def test_no_acc_segment_writer(self): expected_summary_table = 'TestWritingOutputFunction/no_acc_segments.txt' - output_writer_functions.no_acc_segment_writer(input_segments, out_path, prefix, True) + output_writer_functions.no_acc_segment_writer(input_segments, out_path, prefix) with open(expected_summary_table, 'r') as expected: with open('TestWritingOutputFunction/test_no_accessory_core_segments.csv', 'r') as result: From 644a23682f97fff16411f29d5b4f7819289f090f Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Tue, 11 Jan 2022 09:38:44 +1100 Subject: [PATCH 068/135] Add in a description for Corekaburra --- Corekaburra/__main__.py | 8 ++++++-- setup.py | 11 +++++------ 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/Corekaburra/__main__.py b/Corekaburra/__main__.py index 6ffb42b..d0af37c 100644 --- a/Corekaburra/__main__.py +++ b/Corekaburra/__main__.py @@ -6,8 +6,11 @@ Maintainer : magnus.ganer.j@gmail.com Portability : POSIX -The program reads one or more input FASTA files. For each file it computes a -variety of statistics, and then prints a summary of the statistics as output. # TODO - Change description +Corekaburra looks at the gene synteny across genomes used to build a pan-genome. Using syntenic information Corekaburra +identifies regions between core gene clusters. 
Regions are described in terms of their content of accessory gene clusters +and distance between core genes. Information from neighboring core genes is further used to identify stretches of core +gene clusters throughout the pan-genome that appear in all genomes given as input. Corekaburra is compatible with outputs +from standard pan-genome pipelines: Roary and Panaroo. ''' import os @@ -322,5 +325,6 @@ def main(): if args.discard_gffs: os.rmdir(os.path.join(args.output_path, 'Corrected_gff_files')) + if __name__ == '__main__': main() diff --git a/setup.py b/setup.py index 8908109..4513c01 100644 --- a/setup.py +++ b/setup.py @@ -3,12 +3,11 @@ from distutils.core import setup LONG_DESCRIPTION = \ -'''The program reads one or more input FASTA files. -For each file it computes a variety of statistics, and then -prints a summary of the statistics as output. - -The goal is to provide a solid foundation for new bioinformatics command line tools, -and is an ideal starting place for new projects.''' +'''Corekaburra looks at the gene synteny across genomes used to build a pan-genome. Using syntenic information Corekaburra +identifies regions between core gene clusters. Regions are described in terms of their content of accessory gene clusters +and distance between core genes. Information from neighboring core genes is further used to identify stretches of core +gene clusters throughout the pan-genome that appear in all genomes given as input. 
Corekaburra is compatible with outputs +from standard pan-genome pipelines: Roary and Panaroo.''' setup( From 7c744a1dc8e4e2620dc45074da777707c1ecb795 Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Tue, 11 Jan 2022 12:54:20 +1100 Subject: [PATCH 069/135] Add in a nice README, and unit-test for reading a complete genome file and ideas for additional functional test --- Corekaburra/commandline_interface.py | 11 +- Corekaburra/output_writer_functions.py | 6 +- README.md | 330 +++++------------- functional_tests/Corekaburra-test.sh | 2 + functional_tests/test_data/no_input.expected | 9 +- unit_tests/Corekaburra_test.py | 13 +- .../complete_genomes_file.txt | 1 + 7 files changed, 114 insertions(+), 258 deletions(-) diff --git a/Corekaburra/commandline_interface.py b/Corekaburra/commandline_interface.py index 817265f..72614e6 100644 --- a/Corekaburra/commandline_interface.py +++ b/Corekaburra/commandline_interface.py @@ -12,12 +12,11 @@ def get_commandline_arguments(args): :return: matched argument object for passing in main function. """ # Set up parser - parser = argparse.ArgumentParser(description='Welcome to Corekaburra!\n ' - 'Program to determine consensus core sequence from multiple genomes.\n' - 'Outputs consensus core gene alignment, distance between core genes, ' - 'number of accessory genes between core genes and low frequency genes ' - 'between core genes', - add_help=False) #TODO - Change + parser = argparse.ArgumentParser(description='Welcome to Corekaburra!' 
+ 'An extension to pan-genome analyses that summarise genomic regions ' + 'between core genes and segments of neighbouring core genes using ' + 'gene synteny from a set of input genomes and a pan-genome folder.', + add_help=False) required = parser.add_argument_group('Required arguments') run_mods = parser.add_argument_group('Analysis modifiers') diff --git a/Corekaburra/output_writer_functions.py b/Corekaburra/output_writer_functions.py index ed76269..9d8117a 100644 --- a/Corekaburra/output_writer_functions.py +++ b/Corekaburra/output_writer_functions.py @@ -13,7 +13,7 @@ def master_info_writer(master_info, out_path, prefix): """ # Write general content - out_file_name = 'low_frequency_gene_placement.tsv' + out_file_name = 'low_frequency_gene_placement.tsv' # Previously 'low_frequency_gene_placement.tsv' - Proposed name: core_region_content.tsv if prefix is not None: out_file_name = f'{prefix}_{out_file_name}' with open(os.path.join(out_path, out_file_name), 'w', newline='', encoding='utf-8') as out_file: @@ -31,7 +31,7 @@ def master_info_writer(master_info, out_path, prefix): writer.writerow(info) # Write gene content in long format - out_file_name = 'core_core_accessory_gene_content.tsv' + out_file_name = 'core_core_accessory_gene_content.tsv' # Previously core_core_accessory_gene_content.tsv - Proposed name: accessory_gene_placement.tsv if prefix is not None: out_file_name = f'{prefix}_{out_file_name}' @@ -73,7 +73,7 @@ def summary_info_writer(master_summary_info, out_path, prefix): :return: Nothing """ # Generate file name - out_file_name = 'core_pair_summary.csv' + out_file_name = 'core_pair_summary.csv' # Previously: core_pair_summary.csv - proposed name: core_region_summary.csv if prefix is not None: out_file_name = prefix + '_' + out_file_name diff --git a/README.md b/README.md index 8b886fe..720d766 100644 --- a/README.md +++ b/README.md @@ -1,280 +1,124 @@ -[![Build 
Status](https://app.travis-ci.com/milnus/Corekaburra.svg?token=28TZmx3MewJSs5GVS7VU&branch=main)](https://app.travis-ci.com/milnus/Corekaburra) +[![Test](https://github.com/milnus/Corekaburra/actions/workflows/Test.yml/badge.svg)](https://github.com/milnus/Corekaburra/actions/workflows/Test.yml) # Overview +Corekaburra looks at the gene synteny across genomes used to build a pan-genome. Using syntenic information Corekaburra +identifies regions between core gene clusters. Regions are described in terms of their content of accessory gene clusters +and distance between core genes. Information from neighboring core genes is further used to identify stretches of core +gene clusters throughout the pan-genome that appear in all genomes given as input. Corekaburra is compatible with outputs +from standard pan-genome pipelines: [Roary](https://academic.oup.com/bioinformatics/article/31/22/3691/240757) and [Panaroo](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-020-02090-4). -This program reads one or more input FASTA files. For each file it computes a variety of statistics, and then prints a summary of the statistics as output. +# When to use +Corekaburra fits into the existing frameworks of bioinformatics pipelines for pan-genomes. It does not reinvent a new pan-genome pipeline, but leverages the existing ones. Because of this, Corekaburra is built to be a natural extension to the analysis of pan-genomes by summarising information and inferring relationships in the pan-genome otherwise not easily accessible via pan-genome graphs. Other tools provide similar outputs or information, but in their own standalone pan-genome analysis framework or pipeline. Such frameworks/pipelines are [PPanGGolin](https://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1007732) and [Panakeia](https://www.biorxiv.org/content/biorxiv/early/2021/03/02/2021.03.02.433540.full.pdf).
By building on top of existing tools Corekaburra frees users from potentially cross-referencing between pan-genomes, which in itself is a challenging task. Corekaburra's workflow also allows it to be extended to any pan-genome tool, with an output similar to the gene_presence_absence.csv produced by Roary, making Corekaburra versatile for future implementations. -In the examples below, `$` indicates the command line prompt. +# Why use Corekaburra? -# Licence - -This program is released as open source software under the terms of [MIT License](https://raw.githubusercontent.com/milnus/Corekaburra/master/LICENSE). -# Installing +# Installation +Corekaburra can be installed via pip and conda. A Docker container is also available. +## pip +```Coming soon``` -You can install Corekaburra directly from the source code or build and run it from within Docker container. +## Conda +```Coming soon``` -## Installing directly from source code +## Docker +See the Wiki for more information (*** Link to wiki's Docker page ***)[] -Clone this repository: -``` -$ git clone https://github.com/milnus/Corekaburra -``` - -Move into the repository directory: -``` -$ cd Corekaburra +# Help ``` +usage: Corekabura -ig file.gff [file.gff ...] -ip path/to/pan_genome [-cg complete_genomes.txt] [-a] [-cc 1.0] [-lc 0.05] [-o path/to/output] [-p OUTPUT_PREFIX] [-d] [-c int] [-l | -q] [-h] -Python 3 is required for this software. - -Corekaburra can be installed using `pip` in a variety of ways (`$` indicates the command line prompt): - -1. Inside a virtual environment: -``` -$ python3 -m venv Corekaburra_dev -$ source Corekaburra_dev/bin/activate -$ pip install -U /path/to/Corekaburra -``` -2. Into the global package database for all users: -``` -$ pip install -U /path/to/Corekaburra -``` -3. Into the user package database (for the current user only): -``` -$ pip install -U --user /path/to/Corekaburra -``` +Welcome to Corekaburra!
An extension to pan-genome analyses that summarise genomic regions between core genes and segments of neighbouring core genes using gene synteny from a set of input genomes and a pan-genome folder. +Required arguments: + -ig file.gff [file.gff ...], --input_gffs file.gff [file.gff ...] + Path to gff files used for pan-genome + -ip path/to/pan_genome, --input_pangenome path/to/pan_genome + Path to the folder produced by Panaroo or Roary -## Building the Docker container +Analysis modifiers: + -cg complete_genomes.txt, --complete_genomes complete_genomes.txt + text file containing names of genomes that are to be handled as complete genomes + -a, --no_annotate_refound + Flag to toggle off the creation of new gff files, with annotation of refound genes. Only done if input pangenome is detected as coming from Panaroo + -cc 1.0, --core_cutoff 1.0 + Percentage of isolates in which a core gene must be present [default: 1.0] + -lc 0.05, --low_cutoff 0.05 + Percentage of isolates where genes found in less than these are seen as low-frequency genes [default: 0.05] -The file `Dockerfile` contains instructions for building a Docker container for Corekaburra. +Output control: + -o path/to/output, --output path/to/output + Path to where output files will be placed [default: current folder] + -p OUTPUT_PREFIX, --prefix OUTPUT_PREFIX + Prefix for output files, if any is desired + -d, --discard_corrected + Discard gff files corrected with refound genes identified by Panaroo - Only compativle if pan-genome comes from Panaroo [Default: Corrected files are kept] -If you have Docker installed on your computer you can build the container like so: +Other arguments: + -c int, --cpu int Give max number of CPUs [default: 1] + -l, --log Record program progress in for debugging purpose + -q, --quiet Only print warnings + -h, --help Show help function ``` -$ docker build -t Corekaburra . -``` -See below for information about running Corekaburra within the Docker container. 
- -# General behaviour - -Corekaburra accepts zero or more FASTA filenames on the command line. If zero filenames are specified it reads a single FASTA file from the standard input device (stdin). Otherwise it reads each named FASTA file in the order specified on the command line. Corekaburra reads each input FASTA file, computes various statistics about the contents of the file, and then displays a tab-delimited summary of the statistics as output. Each input file produces at most one output line of statistics. Each line of output is prefixed by the input filename or by the text "`stdin`" if the standard input device was used. - -Corekaburra processes each FASTA file one sequence at a time. Therefore the memory usage is proportional to the longest sequence in the file. - -An optional command line argument `--minlen` can be supplied. Sequences with length strictly less than the given value will be ignored by Corekaburra and do not contribute to the computed statistics. By default `--minlen` is set to zero. -These are the statistics computed by Corekaburra, for all sequences with length greater-than-or-equal-to `--minlen`: +# Inputs +## Gff files +Input Gff files must be included in the pan-genome gene_presence_absence.csv-style file. +The Gffs are also required to contain a ```##FASTA``` dividing the file into annotations at the top and the Fasta genome in the bottom of the file. +All coding sequences (CDS) annotated in the GFF must also carry an ```ID``` and a ```locus_tag```. +*** Input Gffs can be gzipped *** -* *NUMSEQ*: the number of sequences in the file satisfying the minimum length requirement. -* *TOTAL*: the total length of all the counted sequences. -* *MIN*: the minimum length of the counted sequences. -* *AVERAGE*: the average length of the counted sequences rounded down to an integer. -* *MAX*: the maximum length of the counted sequences. - -If there are zero sequences counted in a file, the values of MIN, AVERAGE and MAX cannot be computed. 
In that case Corekaburra will print a dash (`-`) in the place of the numerical value. Note that when `--minlen` is set to a value greater than zero it is possible that an input FASTA file does not contain any sequences with length greater-than-or-equal-to the specified value. If this situation arises Corekaburra acts in the same way as if there are no sequences in the file. - -## Help message - -Corekaburra can display usage information on the command line via the `-h` or `--help` argument: +## Pan-genome folder +This is the output folder from a Roary or Panaroo run, or a folder that at minimum contains the gene_presence_absence.csv from Roary or the gene_presence_absence_roary.csv from Panaroo. +## Complete genomes +If some input Gff files are to be processed as complete or closed genomes, a plain text file can be provided with the filenames of these. +Example: ``` -$ Corekaburra -h -usage: Corekaburra [-h] [--minlen N] [--version] [--log LOG_FILE] - [FASTA_FILE [FASTA_FILE ...]] - -Print fasta stats - -positional arguments: - FASTA_FILE Input FASTA files - -optional arguments: - -h, --help show this help message and exit - --minlen N Minimum length sequence to include in stats (default 0) - --version show program's version number and exit - --log LOG_FILE record program progress in LOG_FILE +complete_genome.gff +complete_genome.gff.gz +/paths/are/allowed/complete_genome.gff +complete_genome ``` +All files given in the plain text file of complete genomes must be found in a given gene presence/absence file, but are not required to be among the input gffs, meaning that a single plain text file of complete genomes can be used for analysing subsets of genomes in the pan-genome. -## Reading FASTA files named on the command line +## Adjusting cutoffs +To comply with common practice when handling pan-genomes, the cutoff for when a pan-genome cluster (gene) is perceived as core can be changed using the ```-cc``` argument with a ratio of gene presence required.
By default, this is set to a conservative 100% presence of core gene clusters. +A second argument dividing accessory genes into two groups (Low frequency and Intermediate frequency) can be controlled using the ```-lc``` argument, with the ratio indicating the maximum presence of a gene cluster to be identified as having a low frequency in the pan-genome. This division of low- and intermediate frequency can be disabled by ```-lc 0```, resulting in all genes being considered as intermediate. -Corekaburra accepts zero or more named FASTA files on the command line. These must be specified following all other command line arguments. If zero files are named, Corekaburra will read a single FASTA file from the standard input device (stdin). +# Outputs +Corekaburra outputs multiple files ranging from summaries to more fine-grained outputs. This is aimed at giving the user easy access to information, but still allowing for tailored or deep exploration. -There are no restrictions on the name of the FASTA files. Often FASTA filenames end in `.fa` or `.fasta`, but that is merely a convention, which is not enforced by Corekaburra. +## Core regions +A core region is defined by two core gene clusters flanking a stretch of the genome in at least one input genome (Gff). A core region can be described by a distance between the flanking core gene clusters (positive if nucleotides can be found between them, and negative if the two clusters overlap). A region can also be described by the number of encoded accessory genes, low- and intermediate frequency. Using core gene clusters as a reference for a region it is possible to compare the same region across genomes, and in the larger framework of the pan-genome. Additionally, with either or both the distance and number of encoded accessory genes in a region it is possible to identify regions of variability, due to horizontal genetic transfer, deletion or other genomic processes.
-The example below illustrates Corekaburra applied to a single named FASTA file called `file1.fa`: -``` -$ Corekaburra file1.fa -FILENAME NUMSEQ TOTAL MIN AVG MAX -file1.fa 5264 3801855 31 722 53540 -``` +```core_pair_summary.csv``` is a file that summarises the core regions identified across the input genomes (Gffs). Here information about occurrence and co-occurrence of each core gene pair, and individual core gene occurrences can be found, as well as distance and accessory gene summary statistics (minimum, maximum, mean, and median). +This file is a good entry point to the results in most analyses, and should give a good indication of which core regions could be of interest. -The example below illustrates Corekaburra applied to three FASTA files called `file1.fa`, `file2.fa` and `file3.fa`: -``` -$ Corekaburra file1.fa file2.fa file3.fa -FILENAME NUMSEQ TOTAL MIN AVG MAX -file1.fa 5264 3801855 31 722 53540 -file2.fa 1245 982374 8 393 928402 -file3.fa 64 8376 102 123 212 -``` +```core_core_accessory_gene_content.tsv``` gives the placement of each accessory gene identified in a core region across all genomes (Gff). It also states whether the accessory gene is identified as a low- or intermediate frequency gene. -## Reading a single FASTA file from standard input +```low_frequency_gene_placement.tsv``` summarises each core region across all genomes (Gff) with the distance between core gene clusters, and the number of accessory genes found between them. -The example below illustrates Corekaburra reading a FASTA file from standard input. In this example we have redirected the contents of a file called `file1.fa` into the standard input using the shell redirection operator `<`: +## Core segments +The two following files are only given if any core gene is found to have more than two different core gene clusters as neighbours across all input genomes (Gff). 
-``` -$ Corekaburra < file1.fa -FILENAME NUMSEQ TOTAL MIN AVG MAX -stdin 5264 3801855 31 722 53540 -``` +The file ```core_segments.csv``` contains all segments of at least two core genes identified in a pan-genome, where the start and end of a segment is defined by core gene clusters with more than two neighbours, meaning they could be a potential breakpoint of a genomic rearrangement in at least a single input genome (Gff), or be a misassembly. -Equivalently, you could achieve the same result by piping a FASTA file into Corekaburra: +```no_accessory_core_segments.csv``` divides the segments identified in ```core_segments.csv``` into potential smaller segments where core gene clusters must form regions with no accessory genes between them across all genomes. These segments could indicate potential operon structures or other stable genomic features that could be disturbed by insertion of accessory genes. -``` -$ cat file1.fa | Corekaburra -FILENAME NUMSEQ TOTAL MIN AVG MAX -stdin 5264 3801855 31 722 53540 -``` - -## Filtering sequences by length - -Corekaburra provides an optional command line argument `--minlen` which causes it to ignore (not count) any sequences in the input FASTA files with length strictly less than the supplied value. - -The example below illustrates Corekaburra applied to a single FASTA file called `file`.fa` with a `--minlen` filter of 1000. -``` -$ Corekaburra --minlen 1000 file.fa -FILENAME NUMSEQ TOTAL MIN AVG MAX -file1.fa 4711 2801855 1021 929 53540 -``` +## Core-less contigs +```core_segments.csv``` gives all contigs identified in genomes (Gff) that do not contain a core gene cluster, but only accessory genes. Each contig is given by contig name, its Gff file, and number of low- and intermediate frequency genes found on the contig. -## Logging - -If the ``--log FILE`` command line argument is specified, Corekaburra will output a log file containing information about program progress. 
The log file includes the command line used to execute the program, and a note indicating which files have been processes so far. Events in the log file are annotated with their date and time of occurrence. - -``` -$ Corekaburra --log bt.log file1.fasta file2.fasta -``` -``` -$ cat bt.log -2016-12-04T19:14:47 program started -2016-12-04T19:14:47 command line: /usr/local/bin/Corekaburra --log bt.log file1.fasta file2.fasta -2016-12-04T19:14:47 Processing FASTA file from file1.fasta -2016-12-04T19:14:47 Processing FASTA file from file2.fasta -``` - - -## Empty files - -It is possible that the input FASTA file contains zero sequences, or, when the `--minlen` command line argument is used, it is possible that the file contains no sequences of length greater-than-or-equal-to the supplied value. In both of those cases Corekaburra will not be able to compute minimum, maximum or average sequence lengths, and instead it shows output in the following way: - -The example below illustrates Corekaburra applied to a single FASTA file called `empty.fa` which contains zero sequences: -``` -$ Corekaburra empty.fa -FILENAME NUMSEQ TOTAL MIN AVG MAX -empty.fa 0 0 - - - -``` +## Corrected Gffs +A folder containing Gff files that have been corrected by annotating the genes refound by Panaroo. This folder is only expected when a pan-genome from Panaroo is provided, and the ```-a``` or ```-d``` arguments are not given as inputs. +**Notice this will duplicate your Gff files, meaning that ```-a``` or ```-d``` arguments should be used to avoid this, when dealing with memory issues or large datasets** -## Exit status values +# For more info +For more info on Corekaburra, its workings, inputs, outputs and more see the [wiki](*** Wiki link ***) -Corekaburra returns the following exit status values: - -* 0: The program completed successfully. -* 1: File I/O error. This can occur if at least one of the input FASTA files cannot be opened for reading. 
This can occur because the file does not exist at the specified path, or Corekaburra does not have permission to read from the file. -* 2: A command line error occurred. This can happen if the user specifies an incorrect command line argument. In this circumstance Corekaburra will also print a usage message to the standard error device (stderr). - -# Running within the Docker container - -The following section describes how to run Corekaburra within the Docker container. It assumes you have Docker installed on your computer and have built the container as described above. -The container behaves in the same way as the normal version of Corekaburra, however there are some Docker-specific details that you must be aware of. - -The general syntax for running Corekaburra within Docker is as follows: -``` -$ docker run -i Corekaburra CMD -``` -where CMD should be replaced by the specific command line invocation of Corekaburra. Specific examples are below. - -Display the help message: -``` -$ docker run -i Corekaburra Corekaburra -h -``` -Note: it may seem strange that `Corekaburra` is mentioned twice in the command. The first instance is the name of the Docker container and the second instance is the name of the Corekaburra executable that you want to run inside the container. - -Display the version number: -``` -$ docker run -i Corekaburra Corekaburra --version -``` - -Read from a single input FASTA file redirected from standard input: -``` -$ docker run -i Corekaburra Corekaburra < file.FASTA -``` - -Read from multuple input FASTA files named on the command line, where all the files are in the same directory. You must replace `DATA` with the absolute file path of the directory containing the FASTA files: -``` -$ docker run -i -v DATA:/in Corekaburra Corekaburra /in/file1.fasta /in/file2.fasta /in/file3.fasta -``` -The argument `DATA:/in` maps the directory called DATA on your local machine into the `/in` directory within the Docker container. 
- -Logging progress to a file in the directory OUT: -``` -$ docker run -i -v DATA:/in -v OUT:/out Corekaburra-c Corekaburra --log /out/logfile.txt /in/file1.fasta /in/file2.fasta /in/file3.fasta -``` -Replace `OUT` with the absolute path of the directory to write the log file. For example, if you want the log file written to the current working directory, replace `OUT` with `$PWD`. -As above, you will also need to replace `DATA` with the absolite path to the directory containing your input FASTA files. - -# Testing - -## Unit tests - -You can run the unit tests for Corekaburra with the following commands: -``` -$ cd Corekaburra/python/Corekaburra -$ python -m unittest -v Corekaburra_test -``` - -## Test suite - -Sample test input files are provided in the `functional_tests/test_data` folder. -``` -$ cd functional_tests/test_data -$ Corekaburra two_sequence.fasta -FILENAME TOTAL NUMSEQ MIN AVG MAX -two_sequence.fasta 2 357 120 178 237 -``` - -Automated tests can be run using the `functional_tests/Corekaburra-test.sh` script like so: - -``` -$ cd functional_tests -$ ./Corekaburra-test.sh -p Corekaburra -d test_data -``` - -The `-p` argument specifies the name of the program to test, the `-d` argument specifies the path of the directory containing test data. -The script will print the number of passed and failed test cases. More detailed information about each test case can be obtained -by requesting "verbose" output with the `-d` flag: - -``` -$ ./Corekaburra-test.sh -p Corekaburra -d test_data -v -``` - -The test script can also be run inside the Docker container: -``` -$ docker run Corekaburra /Corekaburra/functional_tests/Corekaburra-test.sh -p Corekaburra -d /Corekaburra/functional_tests/test_data -v -``` - -# Common Workflow Language (CWL) wrapper - -The [Common Workflow Language (CWL)](https://www.commonwl.org/) specifies a portable mechanism for running software tools and workflows across many different platforms. 
-We provide an example CWL wrapper for Corekaburra in the file `Corekaburra.cwl`. It invokes Corekaburra using the Docker container (described above). This wrapper allows you -to easily incorporate Corekaburra into CWL workflows, and can be executed by any CWL-supporting workflow engine. - -You can test the wrapper using the `cwltool` workflow runner, which is provided by the CWL project (see the CWL documentation for how to install this on your computer). - -``` -$ cwltool Corekaburra.cwl --fasta_file file.fasta -``` # Bug reporting and feature requests +Please submit bug reports and feature requests to the issue tracker on GitHub: [Corekaburra issue tracker](https://github.com/milnus/Corekaburra/issues) -Please submit bug reports and feature requests to the issue tracker on GitHub: - -[Corekaburra issue tracker](https://github.com/milnus/Corekaburra/issues) +# Licence +This program is released as open source software under the terms of the [MIT License](https://raw.githubusercontent.com/milnus/Corekaburra/master/LICENSE). diff --git a/functional_tests/Corekaburra-test.sh b/functional_tests/Corekaburra-test.sh index 73cba70..5b464ce 100755 --- a/functional_tests/Corekaburra-test.sh +++ b/functional_tests/Corekaburra-test.sh @@ -247,6 +247,8 @@ test_output_file test_out_folder/core_segments.csv Rearrangement_run_expected/co test_output_file test_out_folder/no_accessory_core_segments.csv Rearrangement_run_expected/no_accessory_core_segments.csv.expected rm -r test_out_folder +# TODO - Test that segments can be identified with a core-cutoff that is less than all genomes. 
+ call_new_test "Test with decreased core-gene cutoff" Corekaburra -ig complete_genome_single_chrom.gff complete_genome_single_chrom_2.gff genome_single_chrom_larger.gff -ip Change_cutoffs -o test_out_folder -cc 0.9 > /dev/null 2>&1 test_output_file test_out_folder/core_core_accessory_gene_content.tsv core_90_cutoff_expected/core_core_accessory_gene_content.tsv.expected diff --git a/functional_tests/test_data/no_input.expected b/functional_tests/test_data/no_input.expected index 72e50ed..251840c 100644 --- a/functional_tests/test_data/no_input.expected +++ b/functional_tests/test_data/no_input.expected @@ -1,12 +1,11 @@ -usage: Corekaburra -ig file.gff [file.gff ...] -ip path/to/pan_genome +usage: __main__.py -ig file.gff [file.gff ...] -ip path/to/pan_genome [-cg complete_genomes.txt] [-a] [-cc 1.0] [-lc 0.05] [-o path/to/output] [-p OUTPUT_PREFIX] [-d] [-c int] [-l | -q] [-h] -Welcome to Corekaburra! Program to determine consensus core sequence from -multiple genomes. Outputs consensus core gene alignment, distance between core -genes, number of accessory genes between core genes and low frequency genes -between core genes +Welcome to Corekaburra!An extension to pan-genome analyses that summarise +genomic regions between core genes and segments of neighbouring core genes +using gene synteny from a set of input genomes and a pan-genome folder. Required arguments: -ig file.gff [file.gff ...], --input_gffs file.gff [file.gff ...] 
diff --git a/unit_tests/Corekaburra_test.py b/unit_tests/Corekaburra_test.py index 88f151f..2b073ea 100644 --- a/unit_tests/Corekaburra_test.py +++ b/unit_tests/Corekaburra_test.py @@ -87,6 +87,7 @@ def test_all_files_found(self): '/path/complete_genome_2.gff.gz', 'complete_genome_3.gff.gz', 'complete_genome_4.gff', + 'complete_genome_5', 'dummy_index_1', 'dummy_index_2'] @@ -95,7 +96,8 @@ def test_all_files_found(self): expected_return = ['complete_genome_1', 'complete_genome_2', 'complete_genome_3', - 'complete_genome_4'] + 'complete_genome_4', + 'complete_genome_5'] return_object = read_complete_genome_file.parse_complete_genome_file(complete_genome_file, gff_files, self.logger) @@ -105,6 +107,7 @@ def test_correct_one_files_not_found(self): gff_files = ['/path/complete_genome_2.gff.gz', 'complete_genome_3.gff.gz', 'complete_genome_4.gff', + 'complete_genome_5', 'dummy_index_1', 'dummy_index_2'] @@ -138,6 +141,14 @@ def test_panaroo_input(self): self.assertEqual("Panaroo", return_program) self.assertEqual(input_folder_path + '/gene_presence_absence_roary.csv', return_path) + def test_minimal_panaroo_input(self): + input_folder_path = 'TestPangenomeSourceProgram/Mock_minimal_panaroo' + + return_program, return_path = check_inputs.define_pangenome_program(input_folder_path, self.logger) + + self.assertEqual("Panaroo", return_program) + self.assertEqual(input_folder_path + '/gene_presence_absence_roary.csv', return_path) + # def test_pirate_input(self): TODO - Make Corekaburra take Pirate input! 
# pass # input_folder_path = 'TestPangenomeSourceProgram/Mock_pirate' diff --git a/unit_tests/unit_test_data/TestParsingCompleteGenomes/complete_genomes_file.txt b/unit_tests/unit_test_data/TestParsingCompleteGenomes/complete_genomes_file.txt index 13d5bce..4bfd8f6 100644 --- a/unit_tests/unit_test_data/TestParsingCompleteGenomes/complete_genomes_file.txt +++ b/unit_tests/unit_test_data/TestParsingCompleteGenomes/complete_genomes_file.txt @@ -2,3 +2,4 @@ complete_genome_1.gff /test/path/complete_genome_2.gff complete_genome_3.gff.gz /test/path/complete_genome_4.gff.gz +complete_genome_5 \ No newline at end of file From 63eae62cbf6baeae728c4969e3334883257a7068 Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Tue, 11 Jan 2022 12:54:47 +1100 Subject: [PATCH 070/135] Add in data for new unit test. --- .../gene_presence_absence_roary.csv | 1775 +++++++++++++++++ 1 file changed, 1775 insertions(+) create mode 100644 unit_tests/unit_test_data/TestPangenomeSourceProgram/Mock_minimal_panaroo/gene_presence_absence_roary.csv diff --git a/unit_tests/unit_test_data/TestPangenomeSourceProgram/Mock_minimal_panaroo/gene_presence_absence_roary.csv b/unit_tests/unit_test_data/TestPangenomeSourceProgram/Mock_minimal_panaroo/gene_presence_absence_roary.csv new file mode 100644 index 0000000..a09ee88 --- /dev/null +++ b/unit_tests/unit_test_data/TestPangenomeSourceProgram/Mock_minimal_panaroo/gene_presence_absence_roary.csv @@ -0,0 +1,1775 @@ +Gene,Non-unique Gene name,Annotation,No. isolates,No. 
sequences,Avg sequences per isolate,Genome Fragment,Order within Fragment,Accessory Fragment,Accessory Order with Fragment,QC,Min group size nuc,Max group size nuc,Avg group size nuc,GCA_000006785,GCA_000006785_1,GCA_000006785_2,GCA_000006785_3,GCA_000006785_4 +group_1096,phiMGAS5005.2_8,phage protein,5,5,1.0,1,1774,,,,141,141,141.0,EEABGJJD_01234,EEABGJJD_01234,EEABGJJD_01234,EEABGJJD_01234,EEABGJJD_01234 +phiMGAS50052_8,phiMGAS5005.2_8,phage protein,5,5,1.0,1,1773,,,,141,141,141.0,EEABGJJD_00795,EEABGJJD_00795,EEABGJJD_00795,EEABGJJD_00795,EEABGJJD_00795 +group_1095,phiMGAS5005.2_7,phage protein,5,5,1.0,1,1772,,,,234,234,234.0,EEABGJJD_01235,EEABGJJD_01235,EEABGJJD_01235,EEABGJJD_01235,EEABGJJD_01235 +phiMGAS50052_7,phiMGAS5005.2_7,phage protein,5,5,1.0,1,1771,,,,234,234,234.0,EEABGJJD_00794,EEABGJJD_00794,EEABGJJD_00794,EEABGJJD_00794,EEABGJJD_00794 +group_1094,phiNCTC8198.1_3,DnaD domain-containing protein,5,5,1.0,1,1770,,,,387,387,387.0,EEABGJJD_01236,EEABGJJD_01236,EEABGJJD_01236,EEABGJJD_01236,EEABGJJD_01236 +phiNCTC81981_3,phiNCTC8198.1_3,DnaD domain-containing protein,5,5,1.0,1,1769,,,,387,387,387.0,EEABGJJD_00793,EEABGJJD_00793,EEABGJJD_00793,EEABGJJD_00793,EEABGJJD_00793 +group_1093,phiSF370.1_6,putative holin phage associated,5,5,1.0,1,1768,,,,228,228,228.0,EEABGJJD_01198,EEABGJJD_01198,EEABGJJD_01198,EEABGJJD_01198,EEABGJJD_01198 +phiSF3701_6,phiSF370.1_6,putative holin phage associated,5,5,1.0,1,1767,,,,228,228,228.0,EEABGJJD_00587,EEABGJJD_00587,EEABGJJD_00587,EEABGJJD_00587,EEABGJJD_00587 +group_1092,phiMGAS5005.2_2,phage protein,5,5,1.0,1,1766,,,,276,276,276.0,EEABGJJD_01199,EEABGJJD_01199,EEABGJJD_01199,EEABGJJD_01199,EEABGJJD_01199 +phiMGAS50052_2,phiMGAS5005.2_2,phage protein,5,5,1.0,1,1765,,,,276,276,276.0,EEABGJJD_00586,EEABGJJD_00586,EEABGJJD_00586,EEABGJJD_00586,EEABGJJD_00586 +group_1091,,hypothetical protein,5,5,1.0,1,1764,,,,618,618,618.0,EEABGJJD_01200,EEABGJJD_01200,EEABGJJD_01200,EEABGJJD_01200,EEABGJJD_01200 +group_1090,,hypothetical 
protein,5,5,1.0,1,1763,,,,618,618,618.0,EEABGJJD_00585,EEABGJJD_00585,EEABGJJD_00585,EEABGJJD_00585,EEABGJJD_00585 +group_1089,phiMGAS5005.2_1,phage protein,5,5,1.0,1,1762,,,,636,636,636.0,EEABGJJD_01225,EEABGJJD_01225,EEABGJJD_01225,EEABGJJD_01225,EEABGJJD_01225 +phiMGAS50052_1,phiMGAS5005.2_1,phage protein,5,5,1.0,1,1761,,,,636,636,636.0,EEABGJJD_00561,EEABGJJD_00561,EEABGJJD_00561,EEABGJJD_00561,EEABGJJD_00561 +group_1088,,ISAs1 family transposase,5,5,1.0,1,1760,,,,1134,1134,1134.0,EEABGJJD_01800,EEABGJJD_01800,EEABGJJD_01800,EEABGJJD_01800,EEABGJJD_01800 +group_1087,,ISAs1 family transposase,5,5,1.0,1,1759,,,,1134,1134,1134.0,EEABGJJD_01505,EEABGJJD_01505,EEABGJJD_01505,EEABGJJD_01505,EEABGJJD_01505 +group_1086,,ISAs1 family transposase,5,5,1.0,1,1758,,,,1134,1134,1134.0,EEABGJJD_01316,EEABGJJD_01316,EEABGJJD_01316,EEABGJJD_01316,EEABGJJD_01316 +group_1085,,ISAs1 family transposase,5,5,1.0,1,1757,,,,1134,1134,1134.0,EEABGJJD_00282,EEABGJJD_00282,EEABGJJD_00282,EEABGJJD_00282,EEABGJJD_00282 +group_1084,comX1.1_1,putative competence protein,5,5,1.0,1,1756,,,,486,486,486.0,EEABGJJD_01581,EEABGJJD_01581,EEABGJJD_01581,EEABGJJD_01581,EEABGJJD_01581 +comX11_1,comX1.1_1,putative competence protein,5,5,1.0,1,1755,,,,486,486,486.0,EEABGJJD_00281,EEABGJJD_00281,EEABGJJD_00281,EEABGJJD_00281,EEABGJJD_00281 +group_1083,,ISAs1 family transposase,5,5,1.0,1,1754,,,,459,459,459.0,EEABGJJD_00318,EEABGJJD_00318,EEABGJJD_00318,EEABGJJD_00318,EEABGJJD_00318 +group_1082,,ISAs1 family transposase,5,5,1.0,1,1753,,,,459,459,459.0,EEABGJJD_00215,EEABGJJD_00215,EEABGJJD_00215,EEABGJJD_00215,EEABGJJD_00215 +group_1081,,ISAs1 family transposase,5,5,1.0,1,1752,,,,705,705,705.0,EEABGJJD_00319,EEABGJJD_00319,EEABGJJD_00319,EEABGJJD_00319,EEABGJJD_00319 +group_1080,,ISAs1 family transposase,5,5,1.0,1,1751,,,,705,705,705.0,EEABGJJD_00214,EEABGJJD_00214,EEABGJJD_00214,EEABGJJD_00214,EEABGJJD_00214 +group_1079,,hypothetical 
protein,5,5,1.0,1,1750,,,,516,516,516.0,EEABGJJD_01120,EEABGJJD_01120,EEABGJJD_01120,EEABGJJD_01120,EEABGJJD_01120 +group_1078,,hypothetical protein,5,5,1.0,1,1749,,,,516,516,516.0,EEABGJJD_00201,EEABGJJD_00201,EEABGJJD_00201,EEABGJJD_00201,EEABGJJD_00201 +group_1077,,hypothetical protein,5,5,1.0,1,1748,,,,153,153,153.0,EEABGJJD_01584,EEABGJJD_01584,EEABGJJD_01584,EEABGJJD_01584,EEABGJJD_01584 +group_1076,,hypothetical protein,5,5,1.0,1,1747,,,,153,153,153.0,EEABGJJD_01348,EEABGJJD_01348,EEABGJJD_01348,EEABGJJD_01348,EEABGJJD_01348 +group_1075,,hypothetical protein,5,5,1.0,1,1746,,,,153,153,153.0,EEABGJJD_00278,EEABGJJD_00278,EEABGJJD_00278,EEABGJJD_00278,EEABGJJD_00278 +group_1074,,hypothetical protein,5,5,1.0,1,1745,,,,153,153,153.0,EEABGJJD_00095,EEABGJJD_00095,EEABGJJD_00095,EEABGJJD_00095,EEABGJJD_00095 +group_1073,,hypothetical protein,5,5,1.0,1,1744,,,,153,153,153.0,EEABGJJD_00027,EEABGJJD_00027,EEABGJJD_00027,EEABGJJD_00027,EEABGJJD_00027 +group_1072,,hypothetical protein,5,5,1.0,1,1743,,,,153,153,153.0,EEABGJJD_00016,EEABGJJD_00016,EEABGJJD_00016,EEABGJJD_00016,EEABGJJD_00016 +group_1071,,hypothetical protein,5,5,1.0,1,1742,,,,90,90,90.0,EEABGJJD_01421,EEABGJJD_01421,EEABGJJD_01421,EEABGJJD_01421,EEABGJJD_01421 +group_1070,,hypothetical protein,5,5,1.0,1,1741,,,,96,96,96.0,EEABGJJD_00056,EEABGJJD_00056,EEABGJJD_00056,EEABGJJD_00056,EEABGJJD_00056 +group_1069,,hypothetical protein,5,5,1.0,1,1740,,,,105,105,105.0,EEABGJJD_00413,EEABGJJD_00413,EEABGJJD_00413,EEABGJJD_00413,EEABGJJD_00413 +group_1068,,exfoliative toxin,5,5,1.0,1,1739,,,,105,105,105.0,EEABGJJD_00216,EEABGJJD_00216,EEABGJJD_00216,EEABGJJD_00216,EEABGJJD_00216 +group_1067,,hypothetical protein,5,5,1.0,1,1738,,,,108,108,108.0,EEABGJJD_00840,EEABGJJD_00840,EEABGJJD_00840,EEABGJJD_00840,EEABGJJD_00840 +group_1066,,hypothetical protein,5,5,1.0,1,1737,,,,108,108,108.0,EEABGJJD_00062,EEABGJJD_00062,EEABGJJD_00062,EEABGJJD_00062,EEABGJJD_00062 +group_1065,,hypothetical 
protein,5,5,1.0,1,1736,,,,111,111,111.0,EEABGJJD_01743,EEABGJJD_01743,EEABGJJD_01743,EEABGJJD_01743,EEABGJJD_01743 +group_1064,,hypothetical protein,5,5,1.0,1,1735,,,,111,111,111.0,EEABGJJD_01197,EEABGJJD_01197,EEABGJJD_01197,EEABGJJD_01197,EEABGJJD_01197 +group_1063,,hypothetical protein,5,5,1.0,1,1734,,,,114,114,114.0,EEABGJJD_00808,EEABGJJD_00808,EEABGJJD_00808,EEABGJJD_00808,EEABGJJD_00808 +group_1062,,hypothetical protein,5,5,1.0,1,1733,,,,117,117,117.0,EEABGJJD_00258,EEABGJJD_00258,EEABGJJD_00258,EEABGJJD_00258,EEABGJJD_00258 +rpmJ,rpmJ,50S ribosomal protein B,5,5,1.0,1,1732,,,,117,117,117.0,EEABGJJD_00089,EEABGJJD_00089,EEABGJJD_00089,EEABGJJD_00089,EEABGJJD_00089 +group_1061,,hypothetical protein,5,5,1.0,1,1731,,,,120,120,120.0,EEABGJJD_00971,EEABGJJD_00971,EEABGJJD_00971,EEABGJJD_00971,EEABGJJD_00971 +group_1060,,hypothetical protein,5,5,1.0,1,1730,,,,120,120,120.0,EEABGJJD_00210,EEABGJJD_00210,EEABGJJD_00210,EEABGJJD_00210,EEABGJJD_00210 +group_1059,,hypothetical protein,5,5,1.0,1,1729,,,,123,123,123.0,EEABGJJD_01377,EEABGJJD_01377,EEABGJJD_01377,EEABGJJD_01377,EEABGJJD_01377 +group_1058,,hypothetical protein,5,5,1.0,1,1728,,,,126,126,126.0,EEABGJJD_00547,EEABGJJD_00547,EEABGJJD_00547,EEABGJJD_00547,EEABGJJD_00547 +group_1057,,hypothetical protein,5,5,1.0,1,1727,,,,126,126,126.0,EEABGJJD_00009,EEABGJJD_00009,EEABGJJD_00009,EEABGJJD_00009,EEABGJJD_00009 +group_1056,,hypothetical protein,5,5,1.0,1,1726,,,,129,129,129.0,EEABGJJD_01119,EEABGJJD_01119,EEABGJJD_01119,EEABGJJD_01119,EEABGJJD_01119 +group_1055,,hypothetical protein,5,5,1.0,1,1725,,,,132,132,132.0,EEABGJJD_01239,EEABGJJD_01239,EEABGJJD_01239,EEABGJJD_01239,EEABGJJD_01239 +group_1054,,integrase,5,5,1.0,1,1724,,,,132,132,132.0,EEABGJJD_00910,EEABGJJD_00910,EEABGJJD_00910,EEABGJJD_00910,EEABGJJD_00910 +group_1053,,hypothetical protein,5,5,1.0,1,1723,,,,132,132,132.0,EEABGJJD_00213,EEABGJJD_00213,EEABGJJD_00213,EEABGJJD_00213,EEABGJJD_00213 +group_1052,,membrane 
protein,5,5,1.0,1,1722,,,,135,135,135.0,EEABGJJD_00996,EEABGJJD_00996,EEABGJJD_00996,EEABGJJD_00996,EEABGJJD_00996 +rpmH,rpmH,50S ribosomal protein L34,5,5,1.0,1,1721,,,,135,135,135.0,EEABGJJD_00235,EEABGJJD_00235,EEABGJJD_00235,EEABGJJD_00235,EEABGJJD_00235 +group_1051,,hypothetical protein,5,5,1.0,1,1720,,,,138,138,138.0,EEABGJJD_00546,EEABGJJD_00546,EEABGJJD_00546,EEABGJJD_00546,EEABGJJD_00546 +group_1050,,hypothetical protein,5,5,1.0,1,1719,,,,141,141,141.0,EEABGJJD_01395,EEABGJJD_01395,EEABGJJD_01395,EEABGJJD_01395,EEABGJJD_01395 +srtA,srtA,lantibiotic precursor,5,5,1.0,1,1718,,,,141,141,141.0,EEABGJJD_00904,EEABGJJD_00904,EEABGJJD_00904,EEABGJJD_00904,EEABGJJD_00904 +vlg,vlg,regulatory protein,5,5,1.0,1,1717,,,,141,141,141.0,EEABGJJD_00192,EEABGJJD_00192,EEABGJJD_00192,EEABGJJD_00192,EEABGJJD_00192 +group_1049,,hypothetical protein,5,5,1.0,1,1716,,,,144,144,144.0,EEABGJJD_01478,EEABGJJD_01478,EEABGJJD_01478,EEABGJJD_01478,EEABGJJD_01478 +group_1048,,teichoic acid D-Ala incorporation-associated protein DltX,5,5,1.0,1,1715,,,,144,144,144.0,EEABGJJD_01103,EEABGJJD_01103,EEABGJJD_01103,EEABGJJD_01103,EEABGJJD_01103 +group_1047,,hypothetical protein,5,5,1.0,1,1714,,,,147,147,147.0,EEABGJJD_01740,EEABGJJD_01740,EEABGJJD_01740,EEABGJJD_01740,EEABGJJD_01740 +salA,salA,lantibiotic precursor,5,5,1.0,1,1713,,,,147,147,147.0,EEABGJJD_01598,EEABGJJD_01598,EEABGJJD_01598,EEABGJJD_01598,EEABGJJD_01598 +group_1046,,acetoin dehydrogenase,5,5,1.0,1,1712,,,,147,147,147.0,EEABGJJD_00535,EEABGJJD_00535,EEABGJJD_00535,EEABGJJD_00535,EEABGJJD_00535 +rpmG,rpmG,50S ribosomal protein L33,5,5,1.0,1,1711,,,,150,150,150.0,EEABGJJD_01785,EEABGJJD_01785,EEABGJJD_01785,EEABGJJD_01785,EEABGJJD_01785 +group_1045,,IS256-like element ISLgar5 family transposase,5,5,1.0,1,1710,,,,150,150,150.0,EEABGJJD_00408,EEABGJJD_00408,EEABGJJD_00408,EEABGJJD_00408,EEABGJJD_00408 +group_1044,,50S ribosomal protein 
L33,5,5,1.0,1,1709,,,,153,153,153.0,EEABGJJD_01702,EEABGJJD_01702,EEABGJJD_01702,EEABGJJD_01702,EEABGJJD_01702 +group_1043,,hypothetical protein,5,5,1.0,1,1708,,,,153,153,153.0,EEABGJJD_01107,EEABGJJD_01107,EEABGJJD_01107,EEABGJJD_01107,EEABGJJD_01107 +group_1042,,hypothetical protein,5,5,1.0,1,1707,,,,156,156,156.0,EEABGJJD_01628,EEABGJJD_01628,EEABGJJD_01628,EEABGJJD_01628,EEABGJJD_01628 +group_1041,,hypothetical protein,5,5,1.0,1,1706,,,,156,156,156.0,EEABGJJD_00486,EEABGJJD_00486,EEABGJJD_00486,EEABGJJD_00486,EEABGJJD_00486 +group_1040,,XRE family transcriptional regulator,5,5,1.0,1,1705,,,,159,159,159.0,EEABGJJD_01709,EEABGJJD_01709,EEABGJJD_01709,EEABGJJD_01709,EEABGJJD_01709 +group_1039,,hypothetical protein,5,5,1.0,1,1704,,,,159,159,159.0,EEABGJJD_01117,EEABGJJD_01117,EEABGJJD_01117,EEABGJJD_01117,EEABGJJD_01117 +group_1038,,NAD(P)-dependent oxidoreductase,5,5,1.0,1,1703,,,,159,159,159.0,EEABGJJD_00947,EEABGJJD_00947,EEABGJJD_00947,EEABGJJD_00947,EEABGJJD_00947 +phiMGAS50051_2,phiMGAS5005.1_2,phage protein,5,5,1.0,1,1702,,,,159,159,159.0,EEABGJJD_00784,EEABGJJD_00784,EEABGJJD_00784,EEABGJJD_00784,EEABGJJD_00784 +group_1037,,terminase,5,5,1.0,1,1701,,,,159,159,159.0,EEABGJJD_00463,EEABGJJD_00463,EEABGJJD_00463,EEABGJJD_00463,EEABGJJD_00463 +phiMGAS50053_3,phiMGAS5005.3_3,phage protein,5,5,1.0,1,1700,,,,162,162,162.0,EEABGJJD_00836,EEABGJJD_00836,EEABGJJD_00836,EEABGJJD_00836,EEABGJJD_00836 +sagA,sagA,streptolysin S associated protein,5,5,1.0,1,1699,,,,162,162,162.0,EEABGJJD_00611,EEABGJJD_00611,EEABGJJD_00611,EEABGJJD_00611,EEABGJJD_00611 +group_1036,,hypothetical protein,5,5,1.0,1,1698,,,,165,165,165.0,EEABGJJD_00849,EEABGJJD_00849,EEABGJJD_00849,EEABGJJD_00849,EEABGJJD_00849 +group_1035,,bacteriocin,5,5,1.0,1,1697,,,,165,165,165.0,EEABGJJD_00410,EEABGJJD_00410,EEABGJJD_00410,EEABGJJD_00410,EEABGJJD_00410 +group_1034,,hypothetical protein,5,5,1.0,1,1696,,,,168,168,168.0,EEABGJJD_00175,EEABGJJD_00175,EEABGJJD_00175,EEABGJJD_00175,EEABGJJD_00175 
+group_1033,,hypothetical protein,5,5,1.0,1,1695,,,,171,171,171.0,EEABGJJD_01688,EEABGJJD_01688,EEABGJJD_01688,EEABGJJD_01688,EEABGJJD_01688 +group_1032,,hypothetical protein,5,5,1.0,1,1694,,,,171,171,171.0,EEABGJJD_00806,EEABGJJD_00806,EEABGJJD_00806,EEABGJJD_00806,EEABGJJD_00806 +group_1031,,hypothetical protein,5,5,1.0,1,1693,,,,171,171,171.0,EEABGJJD_00732,EEABGJJD_00732,EEABGJJD_00732,EEABGJJD_00732,EEABGJJD_00732 +rpsN,rpsN,30S ribosomal protein S14,5,5,1.0,1,1692,,,,171,171,171.0,EEABGJJD_00079,EEABGJJD_00079,EEABGJJD_00079,EEABGJJD_00079,EEABGJJD_00079 +group_1030,,hypothetical protein,5,5,1.0,1,1691,,,,174,174,174.0,EEABGJJD_01768,EEABGJJD_01768,EEABGJJD_01768,EEABGJJD_01768,EEABGJJD_01768 +group_1029,,hypothetical protein,5,5,1.0,1,1690,,,,174,174,174.0,EEABGJJD_01767,EEABGJJD_01767,EEABGJJD_01767,EEABGJJD_01767,EEABGJJD_01767 +group_1028,,preprotein translocase subunit SecE,5,5,1.0,1,1689,,,,177,177,177.0,EEABGJJD_01701,EEABGJJD_01701,EEABGJJD_01701,EEABGJJD_01701,EEABGJJD_01701 +group_1027,,hypothetical protein,5,5,1.0,1,1688,,,,177,177,177.0,EEABGJJD_00765,EEABGJJD_00765,EEABGJJD_00765,EEABGJJD_00765,EEABGJJD_00765 +group_1026,,nucleoside-diphosphate kinase,5,5,1.0,1,1687,,,,177,177,177.0,EEABGJJD_00714,EEABGJJD_00714,EEABGJJD_00714,EEABGJJD_00714,EEABGJJD_00714 +phiMGAS50052_33,phiMGAS5005.2_33,phage protein,5,5,1.0,1,1686,,,,180,180,180.0,EEABGJJD_01219,EEABGJJD_01219,EEABGJJD_01219,EEABGJJD_01219,EEABGJJD_01219 +group_1025,,hypothetical protein,5,5,1.0,1,1685,,,,180,180,180.0,EEABGJJD_00816,EEABGJJD_00816,EEABGJJD_00816,EEABGJJD_00816,EEABGJJD_00816 +rpmF,rpmF,50S ribosomal protein L32,5,5,1.0,1,1684,,,,183,183,183.0,EEABGJJD_01784,EEABGJJD_01784,EEABGJJD_01784,EEABGJJD_01784,EEABGJJD_01784 +phiMGAS50052_13,phiMGAS5005.2_13,phage protein,5,5,1.0,1,1683,,,,183,183,183.0,EEABGJJD_01193,EEABGJJD_01193,EEABGJJD_01193,EEABGJJD_01193,EEABGJJD_01193 +group_1024,,class IIb bacteriocin lactobin A/cerein 7B 
family,5,5,1.0,1,1682,,,,183,183,183.0,EEABGJJD_00407,EEABGJJD_00407,EEABGJJD_00407,EEABGJJD_00407,EEABGJJD_00407 +rpmD,rpmD,50S ribosomal protein L30,5,5,1.0,1,1681,,,,183,183,183.0,EEABGJJD_00084,EEABGJJD_00084,EEABGJJD_00084,EEABGJJD_00084,EEABGJJD_00084 +group_1023,,hypothetical protein,5,5,1.0,1,1680,,,,186,186,186.0,EEABGJJD_01317,EEABGJJD_01317,EEABGJJD_01317,EEABGJJD_01317,EEABGJJD_01317 +phiMGAS50052_47,phiMGAS5005.2_47,phage protein,5,5,1.0,1,1679,,,,186,186,186.0,EEABGJJD_01237,EEABGJJD_01237,EEABGJJD_01237,EEABGJJD_01237,EEABGJJD_01237 +group_1022,,putative 4-oxalocrotonate tautomerase,5,5,1.0,1,1678,,,,186,186,186.0,EEABGJJD_00955,EEABGJJD_00955,EEABGJJD_00955,EEABGJJD_00955,EEABGJJD_00955 +phiMGAS50053_5,phiMGAS5005.3_5,phage protein,5,5,1.0,1,1677,,,,186,186,186.0,EEABGJJD_00839,EEABGJJD_00839,EEABGJJD_00839,EEABGJJD_00839,EEABGJJD_00839 +phiMGAS50052_6,phiMGAS5005.2_6,phage protein,5,5,1.0,1,1676,,,,186,186,186.0,EEABGJJD_00792,EEABGJJD_00792,EEABGJJD_00792,EEABGJJD_00792,EEABGJJD_00792 +rpmB,rpmB,50S ribosomal protein L28,5,5,1.0,1,1675,,,,189,189,189.0,EEABGJJD_01569,EEABGJJD_01569,EEABGJJD_01569,EEABGJJD_01569,EEABGJJD_01569 +group_1021,,hypothetical protein,5,5,1.0,1,1674,,,,189,189,189.0,EEABGJJD_01063,EEABGJJD_01063,EEABGJJD_01063,EEABGJJD_01063,EEABGJJD_01063 +group_1020,,hypothetical protein,5,5,1.0,1,1673,,,,189,189,189.0,EEABGJJD_00634,EEABGJJD_00634,EEABGJJD_00634,EEABGJJD_00634,EEABGJJD_00634 +group_1019,,hypothetical protein,5,5,1.0,1,1672,,,,192,192,192.0,EEABGJJD_01762,EEABGJJD_01762,EEABGJJD_01762,EEABGJJD_01762,EEABGJJD_01762 +group_1018,,hypothetical protein,5,5,1.0,1,1671,,,,192,192,192.0,EEABGJJD_01705,EEABGJJD_01705,EEABGJJD_01705,EEABGJJD_01705,EEABGJJD_01705 +group_1017,,hypothetical protein,5,5,1.0,1,1670,,,,192,192,192.0,EEABGJJD_01590,EEABGJJD_01590,EEABGJJD_01590,EEABGJJD_01590,EEABGJJD_01590 +group_1016,,hypothetical 
protein,5,5,1.0,1,1669,,,,195,195,195.0,EEABGJJD_00900,EEABGJJD_00900,EEABGJJD_00900,EEABGJJD_00900,EEABGJJD_00900 +group_1015,,hypothetical protein,5,5,1.0,1,1668,,,,195,195,195.0,EEABGJJD_00803,EEABGJJD_00803,EEABGJJD_00803,EEABGJJD_00803,EEABGJJD_00803 +group_1014,,hypothetical protein,5,5,1.0,1,1667,,,,195,195,195.0,EEABGJJD_00412,EEABGJJD_00412,EEABGJJD_00412,EEABGJJD_00412,EEABGJJD_00412 +group_1013,,nucleoside-diphosphate kinase,5,5,1.0,1,1666,,,,198,198,198.0,EEABGJJD_00876,EEABGJJD_00876,EEABGJJD_00876,EEABGJJD_00876,EEABGJJD_00876 +rpmI,rpmI,50S ribosomal protein L35,5,5,1.0,1,1665,,,,198,198,198.0,EEABGJJD_00668,EEABGJJD_00668,EEABGJJD_00668,EEABGJJD_00668,EEABGJJD_00668 +atpE,atpE,H+-transporting ATP synthase chain C,5,5,1.0,1,1664,,,,198,198,198.0,EEABGJJD_00624,EEABGJJD_00624,EEABGJJD_00624,EEABGJJD_00624,EEABGJJD_00624 +group_1012,,hypothetical protein,5,5,1.0,1,1663,,,,198,198,198.0,EEABGJJD_00003,EEABGJJD_00003,EEABGJJD_00003,EEABGJJD_00003,EEABGJJD_00003 +group_1011,,CsbD family protein,5,5,1.0,1,1662,,,,201,201,201.0,EEABGJJD_01666,EEABGJJD_01666,EEABGJJD_01666,EEABGJJD_01666,EEABGJJD_01666 +group_1010,,hypothetical protein,5,5,1.0,1,1661,,,,201,201,201.0,EEABGJJD_01230,EEABGJJD_01230,EEABGJJD_01230,EEABGJJD_01230,EEABGJJD_01230 +phiMGAS50052_28,phiMGAS5005.2_28,phage protein,5,5,1.0,1,1660,,,,201,201,201.0,EEABGJJD_01214,EEABGJJD_01214,EEABGJJD_01214,EEABGJJD_01214,EEABGJJD_01214 +group_1009,,CsbD family protein,5,5,1.0,1,1659,,,,201,201,201.0,EEABGJJD_01061,EEABGJJD_01061,EEABGJJD_01061,EEABGJJD_01061,EEABGJJD_01061 +group_1008,,histidine--tRNA ligase,5,5,1.0,1,1658,,,,201,201,201.0,EEABGJJD_00473,EEABGJJD_00473,EEABGJJD_00473,EEABGJJD_00473,EEABGJJD_00473 +group_1007,,hypothetical protein,5,5,1.0,1,1657,,,,201,201,201.0,EEABGJJD_00411,EEABGJJD_00411,EEABGJJD_00411,EEABGJJD_00411,EEABGJJD_00411 +group_1006,,hypothetical protein,5,5,1.0,1,1656,,,,201,201,201.0,EEABGJJD_00174,EEABGJJD_00174,EEABGJJD_00174,EEABGJJD_00174,EEABGJJD_00174 
+copZ,copZ,putative copper chaperone - copper transport operon,5,5,1.0,1,1655,,,,204,204,204.0,EEABGJJD_01430,EEABGJJD_01430,EEABGJJD_01430,EEABGJJD_01430,EEABGJJD_01430 +group_1005,,hypothetical protein,5,5,1.0,1,1654,,,,207,207,207.0,EEABGJJD_01775,EEABGJJD_01775,EEABGJJD_01775,EEABGJJD_01775,EEABGJJD_01775 +group_1004,,helix-turn-helix transcriptional regulator,5,5,1.0,1,1653,,,,207,207,207.0,EEABGJJD_01756,EEABGJJD_01756,EEABGJJD_01756,EEABGJJD_01756,EEABGJJD_01756 +group_1003,,putative transcription regulator,5,5,1.0,1,1652,,,,207,207,207.0,EEABGJJD_01611,EEABGJJD_01611,EEABGJJD_01611,EEABGJJD_01611,EEABGJJD_01611 +phiMGAS50052_44,phiMGAS5005.2_44,phage protein,5,5,1.0,1,1651,,,,207,207,207.0,EEABGJJD_01233,EEABGJJD_01233,EEABGJJD_01233,EEABGJJD_01233,EEABGJJD_01233 +phiMGAS50052_9,phiMGAS5005.2_9,phage protein,5,5,1.0,1,1650,,,,207,207,207.0,EEABGJJD_00796,EEABGJJD_00796,EEABGJJD_00796,EEABGJJD_00796,EEABGJJD_00796 +group_1002,,helix-turn-helix domain-containing protein,5,5,1.0,1,1649,,,,207,207,207.0,EEABGJJD_00787,EEABGJJD_00787,EEABGJJD_00787,EEABGJJD_00787,EEABGJJD_00787 +rpsU,rpsU,30S ribosomal protein S21,5,5,1.0,1,1648,,,,207,207,207.0,EEABGJJD_00646,EEABGJJD_00646,EEABGJJD_00646,EEABGJJD_00646,EEABGJJD_00646 +rpmC,rpmC,50S ribosomal protein L29,5,5,1.0,1,1647,,,,207,207,207.0,EEABGJJD_00074,EEABGJJD_00074,EEABGJJD_00074,EEABGJJD_00074,EEABGJJD_00074 +group_1001,,cold-shock protein,5,5,1.0,1,1646,,,,210,210,210.0,EEABGJJD_01714,EEABGJJD_01714,EEABGJJD_01714,EEABGJJD_01714,EEABGJJD_01714 +group_1000,,transcriptional regulator,5,5,1.0,1,1645,,,,210,210,210.0,EEABGJJD_00458,EEABGJJD_00458,EEABGJJD_00458,EEABGJJD_00458,EEABGJJD_00458 +group_999,,hypothetical protein,5,5,1.0,1,1644,,,,210,210,210.0,EEABGJJD_00195,EEABGJJD_00195,EEABGJJD_00195,EEABGJJD_00195,EEABGJJD_00195 +phiSF3703_2,phiSF370.3_2,putative Cro-like repressor protein - phage associated,5,5,1.0,1,1643,,,,213,213,213.0,EEABGJJD_01240,EEABGJJD_01240,EEABGJJD_01240,EEABGJJD_01240,EEABGJJD_01240 
+group_998,,hypothetical protein,5,5,1.0,1,1642,,,,213,213,213.0,EEABGJJD_00496,EEABGJJD_00496,EEABGJJD_00496,EEABGJJD_00496,EEABGJJD_00496 +xseB,xseB,exodeoxyribonuclease VII (small subunit),5,5,1.0,1,1641,,,,216,216,216.0,EEABGJJD_01253,EEABGJJD_01253,EEABGJJD_01253,EEABGJJD_01253,EEABGJJD_01253 +group_997,,YozE family protein,5,5,1.0,1,1640,,,,216,216,216.0,EEABGJJD_00396,EEABGJJD_00396,EEABGJJD_00396,EEABGJJD_00396,EEABGJJD_00396 +group_996,,transposase,5,5,1.0,1,1639,,,,216,216,216.0,EEABGJJD_00196,EEABGJJD_00196,EEABGJJD_00196,EEABGJJD_00196,EEABGJJD_00196 +nrdH,nrdH,putative glutaredoxin,5,5,1.0,1,1638,,,,219,219,219.0,EEABGJJD_01151,EEABGJJD_01151,EEABGJJD_01151,EEABGJJD_01151,EEABGJJD_01151 +group_995,,putative repressor protein - phage associated,5,5,1.0,1,1637,,,,219,219,219.0,EEABGJJD_00909,EEABGJJD_00909,EEABGJJD_00909,EEABGJJD_00909,EEABGJJD_00909 +group_994,,transposase,5,5,1.0,1,1636,,,,219,219,219.0,EEABGJJD_00712,EEABGJJD_00712,EEABGJJD_00712,EEABGJJD_00712,EEABGJJD_00712 +group_993,,hypothetical protein,5,5,1.0,1,1635,,,,219,219,219.0,EEABGJJD_00557,EEABGJJD_00557,EEABGJJD_00557,EEABGJJD_00557,EEABGJJD_00557 +infA,infA,putative translation initiation factor IF-1,5,5,1.0,1,1634,,,,219,219,219.0,EEABGJJD_00088,EEABGJJD_00088,EEABGJJD_00088,EEABGJJD_00088,EEABGJJD_00088 +group_992,,hypothetical protein,5,5,1.0,1,1633,,,,222,222,222.0,EEABGJJD_01761,EEABGJJD_01761,EEABGJJD_01761,EEABGJJD_01761,EEABGJJD_01761 +group_991,,XRE family transcriptional regulator,5,5,1.0,1,1632,,,,222,222,222.0,EEABGJJD_01544,EEABGJJD_01544,EEABGJJD_01544,EEABGJJD_01544,EEABGJJD_01544 +group_990,,hypothetical protein,5,5,1.0,1,1631,,,,222,222,222.0,EEABGJJD_01279,EEABGJJD_01279,EEABGJJD_01279,EEABGJJD_01279,EEABGJJD_01279 +group_989,,putative transcriptional regulator protein,5,5,1.0,1,1630,,,,222,222,222.0,EEABGJJD_01159,EEABGJJD_01159,EEABGJJD_01159,EEABGJJD_01159,EEABGJJD_01159 
+group_988,,transposase,5,5,1.0,1,1629,,,,222,222,222.0,EEABGJJD_01156,EEABGJJD_01156,EEABGJJD_01156,EEABGJJD_01156,EEABGJJD_01156 +group_987,,hypothetical protein,5,5,1.0,1,1628,,,,222,222,222.0,EEABGJJD_00912,EEABGJJD_00912,EEABGJJD_00912,EEABGJJD_00912,EEABGJJD_00912 +group_986,,hypothetical protein,5,5,1.0,1,1627,,,,222,222,222.0,EEABGJJD_00911,EEABGJJD_00911,EEABGJJD_00911,EEABGJJD_00911,EEABGJJD_00911 +group_985,,hypothetical protein,5,5,1.0,1,1626,,,,222,222,222.0,EEABGJJD_00559,EEABGJJD_00559,EEABGJJD_00559,EEABGJJD_00559,EEABGJJD_00559 +acpP,acpP,putative acyl carrier protein,5,5,1.0,1,1625,,,,225,225,225.0,EEABGJJD_01466,EEABGJJD_01466,EEABGJJD_01466,EEABGJJD_01466,EEABGJJD_01466 +group_984,,hypothetical protein,5,5,1.0,1,1624,,,,225,225,225.0,EEABGJJD_01387,EEABGJJD_01387,EEABGJJD_01387,EEABGJJD_01387,EEABGJJD_01387 +phiMGAS50052_34,phiMGAS5005.2_34,phage protein,5,5,1.0,1,1623,,,,225,225,225.0,EEABGJJD_01220,EEABGJJD_01220,EEABGJJD_01220,EEABGJJD_01220,EEABGJJD_01220 +group_983,,hypothetical protein,5,5,1.0,1,1622,,,,225,225,225.0,EEABGJJD_00461,EEABGJJD_00461,EEABGJJD_00461,EEABGJJD_00461,EEABGJJD_00461 +group_982,,hypothetical protein,5,5,1.0,1,1621,,,,228,228,228.0,EEABGJJD_01172,EEABGJJD_01172,EEABGJJD_01172,EEABGJJD_01172,EEABGJJD_01172 +group_981,,hypothetical protein,5,5,1.0,1,1620,,,,228,228,228.0,EEABGJJD_00786,EEABGJJD_00786,EEABGJJD_00786,EEABGJJD_00786,EEABGJJD_00786 +group_980,,hypothetical protein,5,5,1.0,1,1619,,,,228,228,228.0,EEABGJJD_00568,EEABGJJD_00568,EEABGJJD_00568,EEABGJJD_00568,EEABGJJD_00568 +group_979,,hypothetical protein,5,5,1.0,1,1618,,,,228,228,228.0,EEABGJJD_00548,EEABGJJD_00548,EEABGJJD_00548,EEABGJJD_00548,EEABGJJD_00548 +group_978,,transposase,5,5,1.0,1,1617,,,,228,228,228.0,EEABGJJD_00143,EEABGJJD_00143,EEABGJJD_00143,EEABGJJD_00143,EEABGJJD_00143 +group_977,,4-diphosphocytidyl-2C-methyl-D-erythritol kinase,5,5,1.0,1,1616,,,,228,228,228.0,EEABGJJD_00108,EEABGJJD_00108,EEABGJJD_00108,EEABGJJD_00108,EEABGJJD_00108 
+group_976,,hypothetical protein,5,5,1.0,1,1615,,,,231,231,231.0,EEABGJJD_01619,EEABGJJD_01619,EEABGJJD_01619,EEABGJJD_01619,EEABGJJD_01619 +group_975,,hypothetical protein,5,5,1.0,1,1614,,,,231,231,231.0,EEABGJJD_01559,EEABGJJD_01559,EEABGJJD_01559,EEABGJJD_01559,EEABGJJD_01559 +group_974,,hypothetical protein,5,5,1.0,1,1613,,,,231,231,231.0,EEABGJJD_01260,EEABGJJD_01260,EEABGJJD_01260,EEABGJJD_01260,EEABGJJD_01260 +group_973,,hypothetical protein,5,5,1.0,1,1612,,,,231,231,231.0,EEABGJJD_00756,EEABGJJD_00756,EEABGJJD_00756,EEABGJJD_00756,EEABGJJD_00756 +phiSF3701_2,phiSF370.1_2,putative Cro-like protein phage associated,5,5,1.0,1,1611,,,,231,231,231.0,EEABGJJD_00545,EEABGJJD_00545,EEABGJJD_00545,EEABGJJD_00545,EEABGJJD_00545 +rpsT,rpsT,30S ribosomal protein S20,5,5,1.0,1,1610,,,,234,234,234.0,EEABGJJD_01037,EEABGJJD_01037,EEABGJJD_01037,EEABGJJD_01037,EEABGJJD_01037 +group_972,,hypothetical protein,5,5,1.0,1,1609,,,,237,237,237.0,EEABGJJD_01684,EEABGJJD_01684,EEABGJJD_01684,EEABGJJD_01684,EEABGJJD_01684 +phiMGAS50052_25,phiMGAS5005.2_25,phage protein,5,5,1.0,1,1608,,,,237,237,237.0,EEABGJJD_01211,EEABGJJD_01211,EEABGJJD_01211,EEABGJJD_01211,EEABGJJD_01211 +phiMGAS50052_5,phiMGAS5005.2_5,phage protein,5,5,1.0,1,1607,,,,237,237,237.0,EEABGJJD_00791,EEABGJJD_00791,EEABGJJD_00791,EEABGJJD_00791,EEABGJJD_00791 +group_971,,preprotein translocase subunit SecG,5,5,1.0,1,1606,,,,237,237,237.0,EEABGJJD_00422,EEABGJJD_00422,EEABGJJD_00422,EEABGJJD_00422,EEABGJJD_00422 +group_970,,hypothetical protein,5,5,1.0,1,1605,,,,237,237,237.0,EEABGJJD_00374,EEABGJJD_00374,EEABGJJD_00374,EEABGJJD_00374,EEABGJJD_00374 +group_969,,transcriptional regulator,5,5,1.0,1,1604,,,,240,240,240.0,EEABGJJD_01734,EEABGJJD_01734,EEABGJJD_01734,EEABGJJD_01734,EEABGJJD_01734 +rpsR,rpsR,30S ribosomal protein S18,5,5,1.0,1,1603,,,,240,240,240.0,EEABGJJD_01520,EEABGJJD_01520,EEABGJJD_01520,EEABGJJD_01520,EEABGJJD_01520 +group_968,,GlsB/YeaQ/YmgE family stress response membrane 
protein,5,5,1.0,1,1602,,,,240,240,240.0,EEABGJJD_01477,EEABGJJD_01477,EEABGJJD_01477,EEABGJJD_01477,EEABGJJD_01477 +dltC,dltC,putative D-alanyl carrier protein,5,5,1.0,1,1601,,,,240,240,240.0,EEABGJJD_01100,EEABGJJD_01100,EEABGJJD_01100,EEABGJJD_01100,EEABGJJD_01100 +group_967,,KH domain-containing protein,5,5,1.0,1,1600,,,,240,240,240.0,EEABGJJD_00697,EEABGJJD_00697,EEABGJJD_00697,EEABGJJD_00697,EEABGJJD_00697 +group_966,,hypothetical protein,5,5,1.0,1,1599,,,,240,240,240.0,EEABGJJD_00353,EEABGJJD_00353,EEABGJJD_00353,EEABGJJD_00353,EEABGJJD_00353 +group_965,,YneF family protein,5,5,1.0,1,1598,,,,243,243,243.0,EEABGJJD_00323,EEABGJJD_00323,EEABGJJD_00323,EEABGJJD_00323,EEABGJJD_00323 +group_964,,acyl carrier protein,5,5,1.0,1,1597,,,,243,243,243.0,EEABGJJD_00042,EEABGJJD_00042,EEABGJJD_00042,EEABGJJD_00042,EEABGJJD_00042 +group_963,,hypothetical protein,5,5,1.0,1,1596,,,,246,246,246.0,EEABGJJD_00804,EEABGJJD_00804,EEABGJJD_00804,EEABGJJD_00804,EEABGJJD_00804 +group_962,,hypothetical protein,5,5,1.0,1,1595,,,,246,246,246.0,EEABGJJD_00406,EEABGJJD_00406,EEABGJJD_00406,EEABGJJD_00406,EEABGJJD_00406 +group_961,,hypothetical protein,5,5,1.0,1,1594,,,,249,249,249.0,EEABGJJD_01171,EEABGJJD_01171,EEABGJJD_01171,EEABGJJD_01171,EEABGJJD_01171 +group_960,,GlsB/YeaQ/YmgE family stress response membrane protein,5,5,1.0,1,1593,,,,249,249,249.0,EEABGJJD_01065,EEABGJJD_01065,EEABGJJD_01065,EEABGJJD_01065,EEABGJJD_01065 +group_959,,hypothetical protein,5,5,1.0,1,1592,,,,249,249,249.0,EEABGJJD_00818,EEABGJJD_00818,EEABGJJD_00818,EEABGJJD_00818,EEABGJJD_00818 +group_958,,hypothetical protein (mga-associated),5,5,1.0,1,1591,,,,252,252,252.0,EEABGJJD_01675,EEABGJJD_01675,EEABGJJD_01675,EEABGJJD_01675,EEABGJJD_01675 +group_957,,hypothetical protein,5,5,1.0,1,1590,,,,252,252,252.0,EEABGJJD_01275,EEABGJJD_01275,EEABGJJD_01275,EEABGJJD_01275,EEABGJJD_01275 +group_956,,membrane 
protein,5,5,1.0,1,1589,,,,252,252,252.0,EEABGJJD_00380,EEABGJJD_00380,EEABGJJD_00380,EEABGJJD_00380,EEABGJJD_00380 +group_955,,hypothetical protein,5,5,1.0,1,1588,,,,252,252,252.0,EEABGJJD_00369,EEABGJJD_00369,EEABGJJD_00369,EEABGJJD_00369,EEABGJJD_00369 +group_954,,hypothetical protein,5,5,1.0,1,1587,,,,255,255,255.0,EEABGJJD_01314,EEABGJJD_01314,EEABGJJD_01314,EEABGJJD_01314,EEABGJJD_01314 +group_953,,YggT family protein,5,5,1.0,1,1586,,,,255,255,255.0,EEABGJJD_01267,EEABGJJD_01267,EEABGJJD_01267,EEABGJJD_01267,EEABGJJD_01267 +group_952,,glycosyltransferase family 9 protein,5,5,1.0,1,1585,,,,255,255,255.0,EEABGJJD_00405,EEABGJJD_00405,EEABGJJD_00405,EEABGJJD_00405,EEABGJJD_00405 +group_951,,hypothetical protein,5,5,1.0,1,1584,,,,258,258,258.0,EEABGJJD_01409,EEABGJJD_01409,EEABGJJD_01409,EEABGJJD_01409,EEABGJJD_01409 +group_950,,membrane protein,5,5,1.0,1,1583,,,,258,258,258.0,EEABGJJD_00632,EEABGJJD_00632,EEABGJJD_00632,EEABGJJD_00632,EEABGJJD_00632 +rpmE,rpmE,50S ribosomal protein L31 type B,5,5,1.0,1,1582,,,,261,261,261.0,EEABGJJD_00595,EEABGJJD_00595,EEABGJJD_00595,EEABGJJD_00595,EEABGJJD_00595 +rpsQ,rpsQ,30S ribosomal protein S17,5,5,1.0,1,1581,,,,261,261,261.0,EEABGJJD_00075,EEABGJJD_00075,EEABGJJD_00075,EEABGJJD_00075,EEABGJJD_00075 +group_949,,tagatose-6-phosphate kinase,5,5,1.0,1,1580,,,,264,264,264.0,EEABGJJD_01423,EEABGJJD_01423,EEABGJJD_01423,EEABGJJD_01423,EEABGJJD_01423 +phiMGAS50052_22,phiMGAS5005.2_22,phage protein,5,5,1.0,1,1579,,,,264,264,264.0,EEABGJJD_01208,EEABGJJD_01208,EEABGJJD_01208,EEABGJJD_01208,EEABGJJD_01208 +ptsH,ptsH,putative phosphotransferase system phosphohistidine-containing protein,5,5,1.0,1,1578,,,,264,264,264.0,EEABGJJD_01150,EEABGJJD_01150,EEABGJJD_01150,EEABGJJD_01150,EEABGJJD_01150 +group_948,,hypothetical protein,5,5,1.0,1,1577,,,,264,264,264.0,EEABGJJD_01144,EEABGJJD_01144,EEABGJJD_01144,EEABGJJD_01144,EEABGJJD_01144 +group_947,,membrane protein insertion efficiency factor 
YidD,5,5,1.0,1,1576,,,,264,264,264.0,EEABGJJD_00332,EEABGJJD_00332,EEABGJJD_00332,EEABGJJD_00332,EEABGJJD_00332 +group_946,,hypothetical protein,5,5,1.0,1,1575,,,,267,267,267.0,EEABGJJD_01660,EEABGJJD_01660,EEABGJJD_01660,EEABGJJD_01660,EEABGJJD_01660 +group_945,,bacteriocin immunity protein,5,5,1.0,1,1574,,,,267,267,267.0,EEABGJJD_01400,EEABGJJD_01400,EEABGJJD_01400,EEABGJJD_01400,EEABGJJD_01400 +phiMGAS50052_32,phiMGAS5005.2_32,phage protein,5,5,1.0,1,1573,,,,267,267,267.0,EEABGJJD_01218,EEABGJJD_01218,EEABGJJD_01218,EEABGJJD_01218,EEABGJJD_01218 +group_944,,hypothetical protein,5,5,1.0,1,1572,,,,267,267,267.0,EEABGJJD_00823,EEABGJJD_00823,EEABGJJD_00823,EEABGJJD_00823,EEABGJJD_00823 +phiMGAS50052_3,phiMGAS5005.2_3,phage protein,5,5,1.0,1,1571,,,,267,267,267.0,EEABGJJD_00789,EEABGJJD_00789,EEABGJJD_00789,EEABGJJD_00789,EEABGJJD_00789 +phiMGAS50051_1,phiMGAS5005.1_1,phage protein,5,5,1.0,1,1570,,,,267,267,267.0,EEABGJJD_00569,EEABGJJD_00569,EEABGJJD_00569,EEABGJJD_00569,EEABGJJD_00569 +group_943,,phage antirepressor protein,5,5,1.0,1,1569,,,,270,270,270.0,EEABGJJD_01759,EEABGJJD_01759,EEABGJJD_01759,EEABGJJD_01759,EEABGJJD_01759 +group_942,,IreB family regulatory phosphoprotein,5,5,1.0,1,1568,,,,270,270,270.0,EEABGJJD_01746,EEABGJJD_01746,EEABGJJD_01746,EEABGJJD_01746,EEABGJJD_01746 +rpsO,rpsO,30S ribosomal protein S15,5,5,1.0,1,1567,,,,270,270,270.0,EEABGJJD_01627,EEABGJJD_01627,EEABGJJD_01627,EEABGJJD_01627,EEABGJJD_01627 +rpsN2,rpsN2,30S ribosomal protein S14,5,5,1.0,1,1566,,,,270,270,270.0,EEABGJJD_01555,EEABGJJD_01555,EEABGJJD_01555,EEABGJJD_01555,EEABGJJD_01555 +phiMGAS50052_40,phiMGAS5005.2_40,phage protein,5,5,1.0,1,1565,,,,270,270,270.0,EEABGJJD_01226,EEABGJJD_01226,EEABGJJD_01226,EEABGJJD_01226,EEABGJJD_01226 +phiMGAS50053_2,phiMGAS5005.3_2,phage protein,5,5,1.0,1,1564,,,,270,270,270.0,EEABGJJD_00817,EEABGJJD_00817,EEABGJJD_00817,EEABGJJD_00817,EEABGJJD_00817 +group_941,,quorum-sensing system protein 
StcA,5,5,1.0,1,1563,,,,270,270,270.0,EEABGJJD_00416,EEABGJJD_00416,EEABGJJD_00416,EEABGJJD_00416,EEABGJJD_00416 +group_940,,hypothetical protein,5,5,1.0,1,1562,,,,273,273,273.0,EEABGJJD_01801,EEABGJJD_01801,EEABGJJD_01801,EEABGJJD_01801,EEABGJJD_01801 +group_939,,MerR family transcriptional regulator,5,5,1.0,1,1561,,,,273,273,273.0,EEABGJJD_01764,EEABGJJD_01764,EEABGJJD_01764,EEABGJJD_01764,EEABGJJD_01764 +rpsP,rpsP,30S ribosomal protein S16,5,5,1.0,1,1560,,,,273,273,273.0,EEABGJJD_00696,EEABGJJD_00696,EEABGJJD_00696,EEABGJJD_00696,EEABGJJD_00696 +phiMGAS50053_1,phiMGAS5005.3_1,phage protein,5,5,1.0,1,1559,,,,273,273,273.0,EEABGJJD_00560,EEABGJJD_00560,EEABGJJD_00560,EEABGJJD_00560,EEABGJJD_00560 +group_938,,PspC domain-containing protein,5,5,1.0,1,1558,,,,273,273,273.0,EEABGJJD_00488,EEABGJJD_00488,EEABGJJD_00488,EEABGJJD_00488,EEABGJJD_00488 +group_937,,hypothetical protein,5,5,1.0,1,1557,,,,273,273,273.0,EEABGJJD_00466,EEABGJJD_00466,EEABGJJD_00466,EEABGJJD_00466,EEABGJJD_00466 +group_936,,RNA-binding S4 domain-containing protein,5,5,1.0,1,1556,,,,273,273,273.0,EEABGJJD_00007,EEABGJJD_00007,EEABGJJD_00007,EEABGJJD_00007,EEABGJJD_00007 +hlpA,hlpA,histone-like DNA-binding protein,5,5,1.0,1,1555,,,,276,276,276.0,EEABGJJD_01244,EEABGJJD_01244,EEABGJJD_01244,EEABGJJD_01244,EEABGJJD_01244 +group_935,,transposase,5,5,1.0,1,1554,,,,276,276,276.0,EEABGJJD_00476,EEABGJJD_00476,EEABGJJD_00476,EEABGJJD_00476,EEABGJJD_00476 +group_934,,hypothetical protein,5,5,1.0,1,1553,,,,276,276,276.0,EEABGJJD_00350,EEABGJJD_00350,EEABGJJD_00350,EEABGJJD_00350,EEABGJJD_00350 +group_933,,GIY-YIG nuclease family protein,5,5,1.0,1,1552,,,,279,279,279.0,EEABGJJD_01179,EEABGJJD_01179,EEABGJJD_01179,EEABGJJD_01179,EEABGJJD_01179 +group_932,,hypothetical protein,5,5,1.0,1,1551,,,,279,279,279.0,EEABGJJD_01049,EEABGJJD_01049,EEABGJJD_01049,EEABGJJD_01049,EEABGJJD_01049 +group_931,,putative 
acylphosphatase,5,5,1.0,1,1550,,,,279,279,279.0,EEABGJJD_00317,EEABGJJD_00317,EEABGJJD_00317,EEABGJJD_00317,EEABGJJD_00317 +group_930,,PTS ascorbate transporter subunit IIB,5,5,1.0,1,1549,,,,279,279,279.0,EEABGJJD_00180,EEABGJJD_00180,EEABGJJD_00180,EEABGJJD_00180,EEABGJJD_00180 +rpsS,rpsS,30S ribosomal protein S19,5,5,1.0,1,1548,,,,279,279,279.0,EEABGJJD_00070,EEABGJJD_00070,EEABGJJD_00070,EEABGJJD_00070,EEABGJJD_00070 +group_929,,IS30 family transposase,5,5,1.0,1,1547,,,,282,282,282.0,EEABGJJD_01831,EEABGJJD_01831,EEABGJJD_01831,EEABGJJD_01831,EEABGJJD_01831 +group_928,,helix-turn-helix transcriptional regulator,5,5,1.0,1,1546,,,,282,282,282.0,EEABGJJD_01791,EEABGJJD_01791,EEABGJJD_01791,EEABGJJD_01791,EEABGJJD_01791 +group_927,,ABC transporter,5,5,1.0,1,1545,,,,285,285,285.0,EEABGJJD_01682,EEABGJJD_01682,EEABGJJD_01682,EEABGJJD_01682,EEABGJJD_01682 +group_926,,PTS maltose transporter subunit IIBC,5,5,1.0,1,1544,,,,285,285,285.0,EEABGJJD_01625,EEABGJJD_01625,EEABGJJD_01625,EEABGJJD_01625,EEABGJJD_01625 +group_925,,chorismate mutase,5,5,1.0,1,1543,,,,285,285,285.0,EEABGJJD_00598,EEABGJJD_00598,EEABGJJD_00598,EEABGJJD_00598,EEABGJJD_00598 +group_924,,hypothetical protein,5,5,1.0,1,1542,,,,285,285,285.0,EEABGJJD_00493,EEABGJJD_00493,EEABGJJD_00493,EEABGJJD_00493,EEABGJJD_00493 +group_923,,competence protein ComG,5,5,1.0,1,1541,,,,285,285,285.0,EEABGJJD_00122,EEABGJJD_00122,EEABGJJD_00122,EEABGJJD_00122,EEABGJJD_00122 +group_922,,XRE family transcriptional regulator,5,5,1.0,1,1540,,,,288,288,288.0,EEABGJJD_01607,EEABGJJD_01607,EEABGJJD_01607,EEABGJJD_01607,EEABGJJD_01607 +groES,groES,heat shock protein - cochaperonin,5,5,1.0,1,1539,,,,291,291,291.0,EEABGJJD_01711,EEABGJJD_01711,EEABGJJD_01711,EEABGJJD_01711,EEABGJJD_01711 +rpsF,rpsF,30S ribosomal protein S6,5,5,1.0,1,1538,,,,291,291,291.0,EEABGJJD_01522,EEABGJJD_01522,EEABGJJD_01522,EEABGJJD_01522,EEABGJJD_01522 +group_921,,hypothetical 
protein,5,5,1.0,1,1537,,,,294,294,294.0,EEABGJJD_01523,EEABGJJD_01523,EEABGJJD_01523,EEABGJJD_01523,EEABGJJD_01523 +group_920,,CRISPR-associated endonuclease Cas2,5,5,1.0,1,1536,,,,294,294,294.0,EEABGJJD_01304,EEABGJJD_01304,EEABGJJD_01304,EEABGJJD_01304,EEABGJJD_01304 +group_919,,IS3 family transposase,5,5,1.0,1,1535,,,,294,294,294.0,EEABGJJD_01155,EEABGJJD_01155,EEABGJJD_01155,EEABGJJD_01155,EEABGJJD_01155 +rpmA,rpmA,50S ribosomal protein L27,5,5,1.0,1,1534,,,,294,294,294.0,EEABGJJD_00683,EEABGJJD_00683,EEABGJJD_00683,EEABGJJD_00683,EEABGJJD_00683 +group_918,,IS3 family transposase,5,5,1.0,1,1533,,,,294,294,294.0,EEABGJJD_00197,EEABGJJD_00197,EEABGJJD_00197,EEABGJJD_00197,EEABGJJD_00197 +group_917,,membrane protein,5,5,1.0,1,1532,,,,294,294,294.0,EEABGJJD_00129,EEABGJJD_00129,EEABGJJD_00129,EEABGJJD_00129,EEABGJJD_00129 +group_916,,conversed hypothetical protein,5,5,1.0,1,1531,,,,297,297,297.0,EEABGJJD_01683,EEABGJJD_01683,EEABGJJD_01683,EEABGJJD_01683,EEABGJJD_01683 +group_915,,hypothetical protein,5,5,1.0,1,1530,,,,297,297,297.0,EEABGJJD_01437,EEABGJJD_01437,EEABGJJD_01437,EEABGJJD_01437,EEABGJJD_01437 +phiMGAS50053_4,phiMGAS5005.3_4,phage protein,5,5,1.0,1,1529,,,,297,297,297.0,EEABGJJD_00838,EEABGJJD_00838,EEABGJJD_00838,EEABGJJD_00838,EEABGJJD_00838 +rplW,rplW,50S ribosomal protein L23,5,5,1.0,1,1528,,,,297,297,297.0,EEABGJJD_00068,EEABGJJD_00068,EEABGJJD_00068,EEABGJJD_00068,EEABGJJD_00068 +group_914,,YbaB/EbfC family nucleoid-associated protein,5,5,1.0,1,1527,,,,300,300,300.0,EEABGJJD_01546,EEABGJJD_01546,EEABGJJD_01546,EEABGJJD_01546,EEABGJJD_01546 +gatC,gatC,putative Glu-tRNA Gln amidotransferase subunit C,5,5,1.0,1,1526,,,,303,303,303.0,EEABGJJD_01481,EEABGJJD_01481,EEABGJJD_01481,EEABGJJD_01481,EEABGJJD_01481 +group_913,,putative ribosomal protein,5,5,1.0,1,1525,,,,303,303,303.0,EEABGJJD_01436,EEABGJJD_01436,EEABGJJD_01436,EEABGJJD_01436,EEABGJJD_01436 +group_912,,hypothetical 
protein,5,5,1.0,1,1524,,,,303,303,303.0,EEABGJJD_01263,EEABGJJD_01263,EEABGJJD_01263,EEABGJJD_01263,EEABGJJD_01263 +group_911,,hypothetical protein,5,5,1.0,1,1523,,,,303,303,303.0,EEABGJJD_00761,EEABGJJD_00761,EEABGJJD_00761,EEABGJJD_00761,EEABGJJD_00761 +group_910,,type II toxin-antitoxin system RelB/DinJ family antitoxin,5,5,1.0,1,1522,,,,303,303,303.0,EEABGJJD_00464,EEABGJJD_00464,EEABGJJD_00464,EEABGJJD_00464,EEABGJJD_00464 +group_909,,hypothetical protein,5,5,1.0,1,1521,,,,306,306,306.0,EEABGJJD_01744,EEABGJJD_01744,EEABGJJD_01744,EEABGJJD_01744,EEABGJJD_01744 +group_908,,streptopain,5,5,1.0,1,1520,,,,306,306,306.0,EEABGJJD_01686,EEABGJJD_01686,EEABGJJD_01686,EEABGJJD_01686,EEABGJJD_01686 +group_907,,putative PTS system enzyme IIB component,5,5,1.0,1,1519,,,,306,306,306.0,EEABGJJD_01427,EEABGJJD_01427,EEABGJJD_01427,EEABGJJD_01427,EEABGJJD_01427 +group_906,,putative acetate kinase,5,5,1.0,1,1518,,,,306,306,306.0,EEABGJJD_01320,EEABGJJD_01320,EEABGJJD_01320,EEABGJJD_01320,EEABGJJD_01320 +group_905,,type II toxin-antitoxin system RelE/ParE family toxin,5,5,1.0,1,1517,,,,306,306,306.0,EEABGJJD_00465,EEABGJJD_00465,EEABGJJD_00465,EEABGJJD_00465,EEABGJJD_00465 +rplX,rplX,50S ribosomal protein L24,5,5,1.0,1,1516,,,,306,306,306.0,EEABGJJD_00077,EEABGJJD_00077,EEABGJJD_00077,EEABGJJD_00077,EEABGJJD_00077 +group_904,,bacteriocin immunity protein,5,5,1.0,1,1515,,,,309,309,309.0,EEABGJJD_01778,EEABGJJD_01778,EEABGJJD_01778,EEABGJJD_01778,EEABGJJD_01778 +group_903,,putative PTS system enzyme IIB,5,5,1.0,1,1514,,,,309,309,309.0,EEABGJJD_01696,EEABGJJD_01696,EEABGJJD_01696,EEABGJJD_01696,EEABGJJD_01696 +group_902,,hypothetical protei,5,5,1.0,1,1513,,,,309,309,309.0,EEABGJJD_01530,EEABGJJD_01530,EEABGJJD_01530,EEABGJJD_01530,EEABGJJD_01530 +citD,citD,citrate lyase acyl carrier protein,5,5,1.0,1,1512,,,,309,309,309.0,EEABGJJD_00997,EEABGJJD_00997,EEABGJJD_00997,EEABGJJD_00997,EEABGJJD_00997 +group_901,,hypothetical 
protein,5,5,1.0,1,1511,,,,309,309,309.0,EEABGJJD_00825,EEABGJJD_00825,EEABGJJD_00825,EEABGJJD_00825,EEABGJJD_00825 +group_900,,ribosome assembly RNA-binding protein YhbY,5,5,1.0,1,1510,,,,309,309,309.0,EEABGJJD_00285,EEABGJJD_00285,EEABGJJD_00285,EEABGJJD_00285,EEABGJJD_00285 +rpsJ,rpsJ,30S ribosomal protein S10,5,5,1.0,1,1509,,,,309,309,309.0,EEABGJJD_00065,EEABGJJD_00065,EEABGJJD_00065,EEABGJJD_00065,EEABGJJD_00065 +xis,xis,putative excisionase,5,5,1.0,1,1508,,,,312,312,312.0,EEABGJJD_01238,EEABGJJD_01238,EEABGJJD_01238,EEABGJJD_01238,EEABGJJD_01238 +group_899,,putative PTS system enzyme II,5,5,1.0,1,1507,,,,312,312,312.0,EEABGJJD_01112,EEABGJJD_01112,EEABGJJD_01112,EEABGJJD_01112,EEABGJJD_01112 +group_898,,CRISPR-associated endonuclease Cas2,5,5,1.0,1,1506,,,,312,312,312.0,EEABGJJD_00874,EEABGJJD_00874,EEABGJJD_00874,EEABGJJD_00874,EEABGJJD_00874 +group_897,,hypothetical protein,5,5,1.0,1,1505,,,,312,312,312.0,EEABGJJD_00204,EEABGJJD_00204,EEABGJJD_00204,EEABGJJD_00204,EEABGJJD_00204 +trx,trx,putative thioredoxin,5,5,1.0,1,1504,,,,315,315,315.0,EEABGJJD_01526,EEABGJJD_01526,EEABGJJD_01526,EEABGJJD_01526,EEABGJJD_01526 +group_896,,membrane protein,5,5,1.0,1,1503,,,,315,315,315.0,EEABGJJD_00987,EEABGJJD_00987,EEABGJJD_00987,EEABGJJD_00987,EEABGJJD_00987 +rplU,rplU,50S ribosomal protein L21,5,5,1.0,1,1502,,,,315,315,315.0,EEABGJJD_00681,EEABGJJD_00681,EEABGJJD_00681,EEABGJJD_00681,EEABGJJD_00681 +group_895,,hypothetical protein,5,5,1.0,1,1501,,,,315,315,315.0,EEABGJJD_00468,EEABGJJD_00468,EEABGJJD_00468,EEABGJJD_00468,EEABGJJD_00468 +group_894,,phosphohydrolase,5,5,1.0,1,1500,,,,318,318,318.0,EEABGJJD_01806,EEABGJJD_01806,EEABGJJD_01806,EEABGJJD_01806,EEABGJJD_01806 +lacF,lacF,putative PTS system lactose-specific component IIA,5,5,1.0,1,1499,,,,318,318,318.0,EEABGJJD_01601,EEABGJJD_01601,EEABGJJD_01601,EEABGJJD_01601,EEABGJJD_01601 +group_893,,DNA-directed RNA polymerase subunit 
omega,5,5,1.0,1,1498,,,,318,318,318.0,EEABGJJD_01365,EEABGJJD_01365,EEABGJJD_01365,EEABGJJD_01365,EEABGJJD_01365 +group_892,,thioredoxin,5,5,1.0,1,1497,,,,318,318,318.0,EEABGJJD_00130,EEABGJJD_00130,EEABGJJD_00130,EEABGJJD_00130,EEABGJJD_00130 +group_891,,putative PTS system enzyme III,5,5,1.0,1,1496,,,,321,321,321.0,EEABGJJD_01697,EEABGJJD_01697,EEABGJJD_01697,EEABGJJD_01697,EEABGJJD_01697 +group_890,,hypothetical protein,5,5,1.0,1,1495,,,,321,321,321.0,EEABGJJD_00371,EEABGJJD_00371,EEABGJJD_00371,EEABGJJD_00371,EEABGJJD_00371 +group_889,,V-type ATP synthase subunit F,5,5,1.0,1,1494,,,,321,321,321.0,EEABGJJD_00161,EEABGJJD_00161,EEABGJJD_00161,EEABGJJD_00161,EEABGJJD_00161 +group_888,,hypothetical protein,5,5,1.0,1,1493,,,,321,321,321.0,EEABGJJD_00156,EEABGJJD_00156,EEABGJJD_00156,EEABGJJD_00156,EEABGJJD_00156 +ftsL,ftsL,putative cell division protein,5,5,1.0,1,1492,,,,324,324,324.0,EEABGJJD_01393,EEABGJJD_01393,EEABGJJD_01393,EEABGJJD_01393,EEABGJJD_01393 +group_887,,hypothetical protein,5,5,1.0,1,1491,,,,324,324,324.0,EEABGJJD_00993,EEABGJJD_00993,EEABGJJD_00993,EEABGJJD_00993,EEABGJJD_00993 +group_886,,putative methyl transferase,5,5,1.0,1,1490,,,,324,324,324.0,EEABGJJD_00899,EEABGJJD_00899,EEABGJJD_00899,EEABGJJD_00899,EEABGJJD_00899 +group_885,,DNA replication initiation control protein YabA,5,5,1.0,1,1489,,,,324,324,324.0,EEABGJJD_00354,EEABGJJD_00354,EEABGJJD_00354,EEABGJJD_00354,EEABGJJD_00354 +group_884,,PadR family transcriptional regulator,5,5,1.0,1,1488,,,,327,327,327.0,EEABGJJD_01794,EEABGJJD_01794,EEABGJJD_01794,EEABGJJD_01794,EEABGJJD_01794 +group_883,,cell division regulator GpsB,5,5,1.0,1,1487,,,,327,327,327.0,EEABGJJD_01378,EEABGJJD_01378,EEABGJJD_01378,EEABGJJD_01378,EEABGJJD_01378 +group_882,,hypothetical protein,5,5,1.0,1,1486,,,,327,327,327.0,EEABGJJD_00854,EEABGJJD_00854,EEABGJJD_00854,EEABGJJD_00854,EEABGJJD_00854 +group_881,,ribosomal-processing cysteine protease 
Prp,5,5,1.0,1,1485,,,,327,327,327.0,EEABGJJD_00682,EEABGJJD_00682,EEABGJJD_00682,EEABGJJD_00682,EEABGJJD_00682 +group_880,,hypothetical protein,5,5,1.0,1,1484,,,,327,327,327.0,EEABGJJD_00176,EEABGJJD_00176,EEABGJJD_00176,EEABGJJD_00176,EEABGJJD_00176 +group_879,,competence protein ComG,5,5,1.0,1,1483,,,,327,327,327.0,EEABGJJD_00124,EEABGJJD_00124,EEABGJJD_00124,EEABGJJD_00124,EEABGJJD_00124 +comYC,comYC,putative competence protein,5,5,1.0,1,1482,,,,327,327,327.0,EEABGJJD_00120,EEABGJJD_00120,EEABGJJD_00120,EEABGJJD_00120,EEABGJJD_00120 +group_878,,DNA-binding protein,5,5,1.0,1,1481,,,,330,330,330.0,EEABGJJD_01763,EEABGJJD_01763,EEABGJJD_01763,EEABGJJD_01763,EEABGJJD_01763 +group_877,,hypothetical protein,5,5,1.0,1,1480,,,,330,330,330.0,EEABGJJD_00830,EEABGJJD_00830,EEABGJJD_00830,EEABGJJD_00830,EEABGJJD_00830 +phiMGAS50052_10,phiMGAS5005.2_10,phage protein,5,5,1.0,1,1479,,,,330,330,330.0,EEABGJJD_00797,EEABGJJD_00797,EEABGJJD_00797,EEABGJJD_00797,EEABGJJD_00797 +phiMGAS50052_43,phiMGAS5005.2_43,phage protein,5,5,1.0,1,1478,,,,333,333,333.0,EEABGJJD_01232,EEABGJJD_01232,EEABGJJD_01232,EEABGJJD_01232,EEABGJJD_01232 +group_876,,putative glycine cleavage system H protein,5,5,1.0,1,1477,,,,333,333,333.0,EEABGJJD_01022,EEABGJJD_01022,EEABGJJD_01022,EEABGJJD_01022,EEABGJJD_01022 +group_875,,type II toxin-antitoxin system RelE/ParE family toxin,5,5,1.0,1,1476,,,,336,336,336.0,EEABGJJD_01608,EEABGJJD_01608,EEABGJJD_01608,EEABGJJD_01608,EEABGJJD_01608 +phiMGAS50052_24,phiMGAS5005.2_24,phage protein,5,5,1.0,1,1475,,,,336,336,336.0,EEABGJJD_01210,EEABGJJD_01210,EEABGJJD_01210,EEABGJJD_01210,EEABGJJD_01210 +group_874,,putative PTS enzyme III,5,5,1.0,1,1474,,,,336,336,336.0,EEABGJJD_01111,EEABGJJD_01111,EEABGJJD_01111,EEABGJJD_01111,EEABGJJD_01111 +group_873,,alkylphosphonate utilization protein,5,5,1.0,1,1473,,,,336,336,336.0,EEABGJJD_01073,EEABGJJD_01073,EEABGJJD_01073,EEABGJJD_01073,EEABGJJD_01073 +cadC,cadC,putative cadmium efflux system 
accessory,5,5,1.0,1,1472,,,,339,339,339.0,EEABGJJD_01787,EEABGJJD_01787,EEABGJJD_01787,EEABGJJD_01787,EEABGJJD_01787 +group_872,,cupin domain-containing protein,5,5,1.0,1,1471,,,,339,339,339.0,EEABGJJD_01321,EEABGJJD_01321,EEABGJJD_01321,EEABGJJD_01321,EEABGJJD_01321 +phiMGAS50052_26,phiMGAS5005.2_26,phage protein,5,5,1.0,1,1470,,,,339,339,339.0,EEABGJJD_01212,EEABGJJD_01212,EEABGJJD_01212,EEABGJJD_01212,EEABGJJD_01212 +phiMGAS50052_11,phiMGAS5005.2_11,phage protein,5,5,1.0,1,1469,,,,339,339,339.0,EEABGJJD_00801,EEABGJJD_00801,EEABGJJD_00801,EEABGJJD_00801,EEABGJJD_00801 +group_871,,metal-sulfur cluster assembly factor,5,5,1.0,1,1468,,,,339,339,339.0,EEABGJJD_00650,EEABGJJD_00650,EEABGJJD_00650,EEABGJJD_00650,EEABGJJD_00650 +group_870,,Fic family protein,5,5,1.0,1,1467,,,,339,339,339.0,EEABGJJD_00469,EEABGJJD_00469,EEABGJJD_00469,EEABGJJD_00469,EEABGJJD_00469 +group_869,,hypothetical protein,5,5,1.0,1,1466,,,,342,342,342.0,EEABGJJD_01760,EEABGJJD_01760,EEABGJJD_01760,EEABGJJD_01760,EEABGJJD_01760 +phiMGAS50052_42,phiMGAS5005.2_42,phage protein,5,5,1.0,1,1465,,,,342,342,342.0,EEABGJJD_01228,EEABGJJD_01228,EEABGJJD_01228,EEABGJJD_01228,EEABGJJD_01228 +ylxM,ylxM,putative DNA-binding protein,5,5,1.0,1,1464,,,,342,342,342.0,EEABGJJD_01007,EEABGJJD_01007,EEABGJJD_01007,EEABGJJD_01007,EEABGJJD_01007 +group_868,,YlbF/YmcA family competence regulator,5,5,1.0,1,1463,,,,342,342,342.0,EEABGJJD_00674,EEABGJJD_00674,EEABGJJD_00674,EEABGJJD_00674,EEABGJJD_00674 +group_867,,hypothetical protein,5,5,1.0,1,1462,,,,342,342,342.0,EEABGJJD_00660,EEABGJJD_00660,EEABGJJD_00660,EEABGJJD_00660,EEABGJJD_00660 +group_866,,hypothetical protein,5,5,1.0,1,1461,,,,342,342,342.0,EEABGJJD_00382,EEABGJJD_00382,EEABGJJD_00382,EEABGJJD_00382,EEABGJJD_00382 +group_865,,putative protein,5,5,1.0,1,1460,,,,345,345,345.0,EEABGJJD_01408,EEABGJJD_01408,EEABGJJD_01408,EEABGJJD_01408,EEABGJJD_01408 +rplV,rplV,50S ribosomal protein 
L22,5,5,1.0,1,1459,,,,345,345,345.0,EEABGJJD_00071,EEABGJJD_00071,EEABGJJD_00071,EEABGJJD_00071,EEABGJJD_00071 +group_864,,thioredoxin,5,5,1.0,1,1458,,,,348,348,348.0,EEABGJJD_01638,EEABGJJD_01638,EEABGJJD_01638,EEABGJJD_01638,EEABGJJD_01638 +group_863,,cysteine desulfurase,5,5,1.0,1,1457,,,,348,348,348.0,EEABGJJD_00938,EEABGJJD_00938,EEABGJJD_00938,EEABGJJD_00938,EEABGJJD_00938 +group_862,,YkgJ family cysteine cluster protein,5,5,1.0,1,1456,,,,348,348,348.0,EEABGJJD_00888,EEABGJJD_00888,EEABGJJD_00888,EEABGJJD_00888,EEABGJJD_00888 +rplS,rplS,50S ribosomal protein L19,5,5,1.0,1,1455,,,,348,348,348.0,EEABGJJD_00600,EEABGJJD_00600,EEABGJJD_00600,EEABGJJD_00600,EEABGJJD_00600 +rbfA,rbfA,putative ribosome binding factor A,5,5,1.0,1,1454,,,,351,351,351.0,EEABGJJD_01434,EEABGJJD_01434,EEABGJJD_01434,EEABGJJD_01434,EEABGJJD_01434 +group_861,,acetyl-CoA carboxylase biotin carboxyl carrier protein subunit,5,5,1.0,1,1453,,,,351,351,351.0,EEABGJJD_00988,EEABGJJD_00988,EEABGJJD_00988,EEABGJJD_00988,EEABGJJD_00988 +group_860,,IS66 family insertion sequence hypothetical protein,5,5,1.0,1,1452,,,,351,351,351.0,EEABGJJD_00144,EEABGJJD_00144,EEABGJJD_00144,EEABGJJD_00144,EEABGJJD_00144 +group_859,,phage protein,5,5,1.0,1,1451,,,,354,354,354.0,EEABGJJD_00829,EEABGJJD_00829,EEABGJJD_00829,EEABGJJD_00829,EEABGJJD_00829 +group_858,,phage protein,5,5,1.0,1,1450,,,,354,354,354.0,EEABGJJD_00824,EEABGJJD_00824,EEABGJJD_00824,EEABGJJD_00824,EEABGJJD_00824 +group_857,,hypothetical protein,5,5,1.0,1,1449,,,,354,354,354.0,EEABGJJD_00389,EEABGJJD_00389,EEABGJJD_00389,EEABGJJD_00389,EEABGJJD_00389 +acpS,acpS,putative holo-(acyl carrier protein) synthase,5,5,1.0,1,1448,,,,357,357,357.0,EEABGJJD_01503,EEABGJJD_01503,EEABGJJD_01503,EEABGJJD_01503,EEABGJJD_01503 +group_856,,chemotaxis protein,5,5,1.0,1,1447,,,,357,357,357.0,EEABGJJD_01446,EEABGJJD_01446,EEABGJJD_01446,EEABGJJD_01446,EEABGJJD_01446 +phiMGAS50052_37,phiMGAS5005.2_37,phage 
protein,5,5,1.0,1,1446,,,,357,357,357.0,EEABGJJD_01223,EEABGJJD_01223,EEABGJJD_01223,EEABGJJD_01223,EEABGJJD_01223 +group_855,,ribonucleoside-triphosphate reductase,5,5,1.0,1,1445,,,,357,357,357.0,EEABGJJD_01016,EEABGJJD_01016,EEABGJJD_01016,EEABGJJD_01016,EEABGJJD_01016 +group_854,,phage capsid protein,5,5,1.0,1,1444,,,,357,357,357.0,EEABGJJD_00574,EEABGJJD_00574,EEABGJJD_00574,EEABGJJD_00574,EEABGJJD_00574 +group_853,,arsenate reductase family protein,5,5,1.0,1,1443,,,,357,357,357.0,EEABGJJD_00358,EEABGJJD_00358,EEABGJJD_00358,EEABGJJD_00358,EEABGJJD_00358 +group_852,,NAD glycohydrolase inhibitor,5,5,1.0,1,1442,,,,357,357,357.0,EEABGJJD_00171,EEABGJJD_00171,EEABGJJD_00171,EEABGJJD_00171,EEABGJJD_00171 +group_851,,50S ribosomal protein L18,5,5,1.0,1,1441,,,,357,357,357.0,EEABGJJD_00082,EEABGJJD_00082,EEABGJJD_00082,EEABGJJD_00082,EEABGJJD_00082 +group_850,,hypothetical protein,5,5,1.0,1,1440,,,,360,360,360.0,EEABGJJD_01455,EEABGJJD_01455,EEABGJJD_01455,EEABGJJD_01455,EEABGJJD_01455 +folQ,folQ,dihydroneopterin aldolase,5,5,1.0,1,1439,,,,360,360,360.0,EEABGJJD_00919,EEABGJJD_00919,EEABGJJD_00919,EEABGJJD_00919,EEABGJJD_00919 +group_849,,hypothetical protein,5,5,1.0,1,1438,,,,360,360,360.0,EEABGJJD_00800,EEABGJJD_00800,EEABGJJD_00800,EEABGJJD_00800,EEABGJJD_00800 +rplT,rplT,50S ribosomal protein L20,5,5,1.0,1,1437,,,,360,360,360.0,EEABGJJD_00669,EEABGJJD_00669,EEABGJJD_00669,EEABGJJD_00669,EEABGJJD_00669 +group_848,,minor capsid protein,5,5,1.0,1,1436,,,,360,360,360.0,EEABGJJD_00573,EEABGJJD_00573,EEABGJJD_00573,EEABGJJD_00573,EEABGJJD_00573 +rnpA,rnpA,putative ribonuclease P protein component,5,5,1.0,1,1435,,,,360,360,360.0,EEABGJJD_00232,EEABGJJD_00232,EEABGJJD_00232,EEABGJJD_00232,EEABGJJD_00232 +group_847,,hypothetical protein,5,5,1.0,1,1434,,,,363,363,363.0,EEABGJJD_01771,EEABGJJD_01771,EEABGJJD_01771,EEABGJJD_01771,EEABGJJD_01771 +group_846,,ABC transporter ATP-binding 
protein,5,5,1.0,1,1433,,,,363,363,363.0,EEABGJJD_01596,EEABGJJD_01596,EEABGJJD_01596,EEABGJJD_01596,EEABGJJD_01596 +group_845,,phage portal protein,5,5,1.0,1,1432,,,,363,363,363.0,EEABGJJD_00813,EEABGJJD_00813,EEABGJJD_00813,EEABGJJD_00813,EEABGJJD_00813 +mscL,mscL,large conductance mechanosensitive channel protein MscL,5,5,1.0,1,1431,,,,363,363,363.0,EEABGJJD_00647,EEABGJJD_00647,EEABGJJD_00647,EEABGJJD_00647,EEABGJJD_00647 +yajC,yajC,preprotein translocase subunit YajC,5,5,1.0,1,1430,,,,366,366,366.0,EEABGJJD_01637,EEABGJJD_01637,EEABGJJD_01637,EEABGJJD_01637,EEABGJJD_01637 +asp,asp,putative alkaline-shock protein,5,5,1.0,1,1429,,,,366,366,366.0,EEABGJJD_01568,EEABGJJD_01568,EEABGJJD_01568,EEABGJJD_01568,EEABGJJD_01568 +group_844,,diaminopimelate epimerase,5,5,1.0,1,1428,,,,366,366,366.0,EEABGJJD_01123,EEABGJJD_01123,EEABGJJD_01123,EEABGJJD_01123,EEABGJJD_01123 +rplL,rplL,50S ribosomal protein L7/L12,5,5,1.0,1,1427,,,,366,366,366.0,EEABGJJD_00897,EEABGJJD_00897,EEABGJJD_00897,EEABGJJD_00897,EEABGJJD_00897 +group_843,,HK97 gp10 family phage protein,5,5,1.0,1,1426,,,,366,366,366.0,EEABGJJD_00826,EEABGJJD_00826,EEABGJJD_00826,EEABGJJD_00826,EEABGJJD_00826 +group_842,,hypothetical protein,5,5,1.0,1,1425,,,,366,366,366.0,EEABGJJD_00543,EEABGJJD_00543,EEABGJJD_00543,EEABGJJD_00543,EEABGJJD_00543 +group_841,,RidA family protein,5,5,1.0,1,1424,,,,366,366,366.0,EEABGJJD_00154,EEABGJJD_00154,EEABGJJD_00154,EEABGJJD_00154,EEABGJJD_00154 +group_840,,putative DNA binding protein,5,5,1.0,1,1423,,,,366,366,366.0,EEABGJJD_00117,EEABGJJD_00117,EEABGJJD_00117,EEABGJJD_00117,EEABGJJD_00117 +group_839,,30S ribosomal protein S13,5,5,1.0,1,1422,,,,366,366,366.0,EEABGJJD_00090,EEABGJJD_00090,EEABGJJD_00090,EEABGJJD_00090,EEABGJJD_00090 +group_838,,hypothetical protein,5,5,1.0,1,1421,,,,369,369,369.0,EEABGJJD_00972,EEABGJJD_00972,EEABGJJD_00972,EEABGJJD_00972,EEABGJJD_00972 +group_837,,IFN-response binding factor 
1,5,5,1.0,1,1420,,,,369,369,369.0,EEABGJJD_00898,EEABGJJD_00898,EEABGJJD_00898,EEABGJJD_00898,EEABGJJD_00898 +group_836,,50S ribosomal protein L14,5,5,1.0,1,1419,,,,369,369,369.0,EEABGJJD_00076,EEABGJJD_00076,EEABGJJD_00076,EEABGJJD_00076,EEABGJJD_00076 +group_835,,putative transcriptional regulator,5,5,1.0,1,1418,,,,372,372,372.0,EEABGJJD_01562,EEABGJJD_01562,EEABGJJD_01562,EEABGJJD_01562,EEABGJJD_01562 +phiMGAS50052_21,phiMGAS5005.2_21,phage protein,5,5,1.0,1,1417,,,,372,372,372.0,EEABGJJD_01207,EEABGJJD_01207,EEABGJJD_01207,EEABGJJD_01207,EEABGJJD_01207 +group_834,,putative transcriptional regulator (GntR family),5,5,1.0,1,1416,,,,372,372,372.0,EEABGJJD_01079,EEABGJJD_01079,EEABGJJD_01079,EEABGJJD_01079,EEABGJJD_01079 +divIC,divIC,putative cell division protein (DivIC),5,5,1.0,1,1415,,,,372,372,372.0,EEABGJJD_00008,EEABGJJD_00008,EEABGJJD_00008,EEABGJJD_00008,EEABGJJD_00008 +group_833,,putative repressor - phage associated,5,5,1.0,1,1414,,,,375,375,375.0,EEABGJJD_01545,EEABGJJD_01545,EEABGJJD_01545,EEABGJJD_01545,EEABGJJD_01545 +group_832,,putative polyribonucleotide nucleotidyltransferase (general stress protein),5,5,1.0,1,1413,,,,378,378,378.0,EEABGJJD_01355,EEABGJJD_01355,EEABGJJD_01355,EEABGJJD_01355,EEABGJJD_01355 +group_831,,OsmC family peroxiredoxin,5,5,1.0,1,1412,,,,378,378,378.0,EEABGJJD_00638,EEABGJJD_00638,EEABGJJD_00638,EEABGJJD_00638,EEABGJJD_00638 +gloA,gloA,putative lactoylglutathione lyase,5,5,1.0,1,1411,,,,378,378,378.0,EEABGJJD_00430,EEABGJJD_00430,EEABGJJD_00430,EEABGJJD_00430,EEABGJJD_00430 +group_830,,ribosome silencing factor,5,5,1.0,1,1410,,,,378,378,378.0,EEABGJJD_00288,EEABGJJD_00288,EEABGJJD_00288,EEABGJJD_00288,EEABGJJD_00288 +group_829,,RidA family protein,5,5,1.0,1,1409,,,,381,381,381.0,EEABGJJD_01704,EEABGJJD_01704,EEABGJJD_01704,EEABGJJD_01704,EEABGJJD_01704 +phiSF3702_6,phiSF370.2_6,putative structural protein - phage 
associated,5,5,1.0,1,1408,,,,381,381,381.0,EEABGJJD_00821,EEABGJJD_00821,EEABGJJD_00821,EEABGJJD_00821,EEABGJJD_00821 +group_828,,hypothetical protein,5,5,1.0,1,1407,,,,381,381,381.0,EEABGJJD_00715,EEABGJJD_00715,EEABGJJD_00715,EEABGJJD_00715,EEABGJJD_00715 +group_827,,ArpU family transcriptional regulator,5,5,1.0,1,1406,,,,384,384,384.0,EEABGJJD_01772,EEABGJJD_01772,EEABGJJD_01772,EEABGJJD_01772,EEABGJJD_01772 +group_826,,tyrosine-type recombinase/integrase,5,5,1.0,1,1405,,,,384,384,384.0,EEABGJJD_00913,EEABGJJD_00913,EEABGJJD_00913,EEABGJJD_00913,EEABGJJD_00913 +group_825,,HIT family protein,5,5,1.0,1,1404,,,,384,384,384.0,EEABGJJD_00112,EEABGJJD_00112,EEABGJJD_00112,EEABGJJD_00112,EEABGJJD_00112 +rpsK,rpsK,30S ribosomal protein S11,5,5,1.0,1,1403,,,,384,384,384.0,EEABGJJD_00091,EEABGJJD_00091,EEABGJJD_00091,EEABGJJD_00091,EEABGJJD_00091 +group_824,,hypothetical protein,5,5,1.0,1,1402,,,,387,387,387.0,EEABGJJD_00639,EEABGJJD_00639,EEABGJJD_00639,EEABGJJD_00639,EEABGJJD_00639 +group_823,,hypothetical protein,5,5,1.0,1,1401,,,,387,387,387.0,EEABGJJD_00515,EEABGJJD_00515,EEABGJJD_00515,EEABGJJD_00515,EEABGJJD_00515 +group_822,,hypothetical protein,5,5,1.0,1,1400,,,,387,387,387.0,EEABGJJD_00504,EEABGJJD_00504,EEABGJJD_00504,EEABGJJD_00504,EEABGJJD_00504 +group_821,,50S ribosomal protein L17,5,5,1.0,1,1399,,,,387,387,387.0,EEABGJJD_00093,EEABGJJD_00093,EEABGJJD_00093,EEABGJJD_00093,EEABGJJD_00093 +group_820,,Asp23/Gls24 family envelope stress response protein,5,5,1.0,1,1398,,,,390,390,390.0,EEABGJJD_01513,EEABGJJD_01513,EEABGJJD_01513,EEABGJJD_01513,EEABGJJD_01513 +group_819,,rhodanese-like domain-containing protein,5,5,1.0,1,1397,,,,390,390,390.0,EEABGJJD_01277,EEABGJJD_01277,EEABGJJD_01277,EEABGJJD_01277,EEABGJJD_01277 +cdd,cdd,putative cytidine deaminase,5,5,1.0,1,1396,,,,390,390,390.0,EEABGJJD_01034,EEABGJJD_01034,EEABGJJD_01034,EEABGJJD_01034,EEABGJJD_01034 +group_818,,phage capsid 
protein,5,5,1.0,1,1395,,,,390,390,390.0,EEABGJJD_00827,EEABGJJD_00827,EEABGJJD_00827,EEABGJJD_00827,EEABGJJD_00827 +rpsI,rpsI,ribosomal protein S9,5,5,1.0,1,1394,,,,393,393,393.0,EEABGJJD_01609,EEABGJJD_01609,EEABGJJD_01609,EEABGJJD_01609,EEABGJJD_01609 +group_817,,membrane protein,5,5,1.0,1,1393,,,,393,393,393.0,EEABGJJD_00356,EEABGJJD_00356,EEABGJJD_00356,EEABGJJD_00356,EEABGJJD_00356 +group_816,,helix-hairpin-helix domain-containing protein,5,5,1.0,1,1392,,,,396,396,396.0,EEABGJJD_01313,EEABGJJD_01313,EEABGJJD_01313,EEABGJJD_01313,EEABGJJD_01313 +group_815,,hypothetical protein,5,5,1.0,1,1391,,,,396,396,396.0,EEABGJJD_00558,EEABGJJD_00558,EEABGJJD_00558,EEABGJJD_00558,EEABGJJD_00558 +group_814,,LysR family transcriptional regulator,5,5,1.0,1,1390,,,,396,396,396.0,EEABGJJD_00148,EEABGJJD_00148,EEABGJJD_00148,EEABGJJD_00148,EEABGJJD_00148 +group_813,,single-stranded DNA-binding protein,5,5,1.0,1,1389,,,,396,396,396.0,EEABGJJD_00132,EEABGJJD_00132,EEABGJJD_00132,EEABGJJD_00132,EEABGJJD_00132 +group_812,,transcriptional regulator Spx,5,5,1.0,1,1388,,,,399,399,399.0,EEABGJJD_01747,EEABGJJD_01747,EEABGJJD_01747,EEABGJJD_01747,EEABGJJD_01747 +group_811,,acetyl-CoA carboxylase biotin carboxyl carrier protein subunit,5,5,1.0,1,1387,,,,399,399,399.0,EEABGJJD_00994,EEABGJJD_00994,EEABGJJD_00994,EEABGJJD_00994,EEABGJJD_00994 +phiSF3701_4,phiSF370.1_4,putative minor capsid protein phage associated,5,5,1.0,1,1386,,,,399,399,399.0,EEABGJJD_00575,EEABGJJD_00575,EEABGJJD_00575,EEABGJJD_00575,EEABGJJD_00575 +rpsH,rpsH,30S ribosomal protein S8,5,5,1.0,1,1385,,,,399,399,399.0,EEABGJJD_00080,EEABGJJD_00080,EEABGJJD_00080,EEABGJJD_00080,EEABGJJD_00080 +group_810,,PaaI family thioesterase,5,5,1.0,1,1384,,,,402,402,402.0,EEABGJJD_01126,EEABGJJD_01126,EEABGJJD_01126,EEABGJJD_01126,EEABGJJD_01126 +group_809,,pyridoxamine 5'-phosphate oxidase,5,5,1.0,1,1383,,,,402,402,402.0,EEABGJJD_00699,EEABGJJD_00699,EEABGJJD_00699,EEABGJJD_00699,EEABGJJD_00699 +group_808,,membrane 
protein,5,5,1.0,1,1382,,,,402,402,402.0,EEABGJJD_00059,EEABGJJD_00059,EEABGJJD_00059,EEABGJJD_00059,EEABGJJD_00059 +group_807,,regulatory protein Spx,5,5,1.0,1,1381,,,,405,405,405.0,EEABGJJD_01050,EEABGJJD_01050,EEABGJJD_01050,EEABGJJD_01050,EEABGJJD_01050 +group_806,,hypothetical protein,5,5,1.0,1,1380,,,,408,408,408.0,EEABGJJD_00491,EEABGJJD_00491,EEABGJJD_00491,EEABGJJD_00491,EEABGJJD_00491 +dgk,dgk,putative diacylglycerol kinase,5,5,1.0,1,1379,,,,408,408,408.0,EEABGJJD_00402,EEABGJJD_00402,EEABGJJD_00402,EEABGJJD_00402,EEABGJJD_00402 +fms,fms,putative polypeptide deformylase,5,5,1.0,1,1378,,,,411,411,411.0,EEABGJJD_00719,EEABGJJD_00719,EEABGJJD_00719,EEABGJJD_00719,EEABGJJD_00719 +yaaA,yaaA,S4 domain-containing protein YaaA,5,5,1.0,1,1377,,,,414,414,414.0,EEABGJJD_01821,EEABGJJD_01821,EEABGJJD_01821,EEABGJJD_01821,EEABGJJD_01821 +group_805,,cysteine--tRNA ligase,5,5,1.0,1,1376,,,,414,414,414.0,EEABGJJD_01617,EEABGJJD_01617,EEABGJJD_01617,EEABGJJD_01617,EEABGJJD_01617 +group_804,,VOC family protein,5,5,1.0,1,1375,,,,414,414,414.0,EEABGJJD_00970,EEABGJJD_00970,EEABGJJD_00970,EEABGJJD_00970,EEABGJJD_00970 +rpsL,rpsL,30S ribosomal protein S12,5,5,1.0,1,1374,,,,414,414,414.0,EEABGJJD_00254,EEABGJJD_00254,EEABGJJD_00254,EEABGJJD_00254,EEABGJJD_00254 +rplP,rplP,50S ribosomal protein L16,5,5,1.0,1,1373,,,,414,414,414.0,EEABGJJD_00073,EEABGJJD_00073,EEABGJJD_00073,EEABGJJD_00073,EEABGJJD_00073 +atpC,atpC,putative proton-translocating ATPase epsiron subunit,5,5,1.0,1,1372,,,,417,417,417.0,EEABGJJD_00631,EEABGJJD_00631,EEABGJJD_00631,EEABGJJD_00631,EEABGJJD_00631 +group_803,,hypothetical protein,5,5,1.0,1,1371,,,,417,417,417.0,EEABGJJD_00572,EEABGJJD_00572,EEABGJJD_00572,EEABGJJD_00572,EEABGJJD_00572 +group_802,,Holliday junction resolvase RuvX,5,5,1.0,1,1370,,,,420,420,420.0,EEABGJJD_01745,EEABGJJD_01745,EEABGJJD_01745,EEABGJJD_01745,EEABGJJD_01745 +hit,hit,putative cell-cycle regulation histidine triad (HIT) 
protein,5,5,1.0,1,1369,,,,420,420,420.0,EEABGJJD_01445,EEABGJJD_01445,EEABGJJD_01445,EEABGJJD_01445,EEABGJJD_01445 +phiMGAS50052_38,phiMGAS5005.2_38,phage protein,5,5,1.0,1,1368,,,,420,420,420.0,EEABGJJD_01224,EEABGJJD_01224,EEABGJJD_01224,EEABGJJD_01224,EEABGJJD_01224 +group_801,,glutathione S-transferase,5,5,1.0,1,1367,,,,420,420,420.0,EEABGJJD_00763,EEABGJJD_00763,EEABGJJD_00763,EEABGJJD_00763,EEABGJJD_00763 +group_800,,LysR family transcriptional regulator,5,5,1.0,1,1366,,,,420,420,420.0,EEABGJJD_00149,EEABGJJD_00149,EEABGJJD_00149,EEABGJJD_00149,EEABGJJD_00149 +group_799,,beta-hydroxyacyl-ACP dehydratase,5,5,1.0,1,1365,,,,423,423,423.0,EEABGJJD_01460,EEABGJJD_01460,EEABGJJD_01460,EEABGJJD_01460,EEABGJJD_01460 +phiMGAS50052_27,phiMGAS5005.2_27,phage protein,5,5,1.0,1,1364,,,,423,423,423.0,EEABGJJD_01213,EEABGJJD_01213,EEABGJJD_01213,EEABGJJD_01213,EEABGJJD_01213 +group_798,,hypothetical protein,5,5,1.0,1,1363,,,,423,423,423.0,EEABGJJD_00551,EEABGJJD_00551,EEABGJJD_00551,EEABGJJD_00551,EEABGJJD_00551 +group_797,,IS256-like element ISLgar5 family transposase,5,5,1.0,1,1362,,,,423,423,423.0,EEABGJJD_00409,EEABGJJD_00409,EEABGJJD_00409,EEABGJJD_00409,EEABGJJD_00409 +lacA1,lacA.1,putative galactose-6-phosphate isomerase,5,5,1.0,1,1361,,,,426,426,426.0,EEABGJJD_01425,EEABGJJD_01425,EEABGJJD_01425,EEABGJJD_01425,EEABGJJD_01425 +group_796,,PTS mannose transporter subunit IIA,5,5,1.0,1,1360,,,,426,426,426.0,EEABGJJD_00881,EEABGJJD_00881,EEABGJJD_00881,EEABGJJD_00881,EEABGJJD_00881 +rplK,rplK,50S ribosomal protein L11,5,5,1.0,1,1359,,,,426,426,426.0,EEABGJJD_00390,EEABGJJD_00390,EEABGJJD_00390,EEABGJJD_00390,EEABGJJD_00390 +group_795,,MarR family transcriptional regulator,5,5,1.0,1,1358,,,,429,429,429.0,EEABGJJD_01631,EEABGJJD_01631,EEABGJJD_01631,EEABGJJD_01631,EEABGJJD_01631 +lacA2,lacA.2,galactosidase acetyltransferase,5,5,1.0,1,1357,,,,429,429,429.0,EEABGJJD_01605,EEABGJJD_01605,EEABGJJD_01605,EEABGJJD_01605,EEABGJJD_01605 
+group_794,,N-acetyltransferase,5,5,1.0,1,1356,,,,429,429,429.0,EEABGJJD_01293,EEABGJJD_01293,EEABGJJD_01293,EEABGJJD_01293,EEABGJJD_01293 +group_793,,YtxH domain-containing protein,5,5,1.0,1,1355,,,,429,429,429.0,EEABGJJD_00492,EEABGJJD_00492,EEABGJJD_00492,EEABGJJD_00492,EEABGJJD_00492 +group_792,,hypothetical protein,5,5,1.0,1,1354,,,,429,429,429.0,EEABGJJD_00173,EEABGJJD_00173,EEABGJJD_00173,EEABGJJD_00173,EEABGJJD_00173 +group_791,,putative competence protein,5,5,1.0,1,1353,,,,429,429,429.0,EEABGJJD_00121,EEABGJJD_00121,EEABGJJD_00121,EEABGJJD_00121,EEABGJJD_00121 +phiMGAS50052_17,phiMGAS5005.2_17,phage protein,5,5,1.0,1,1352,,,,432,432,432.0,EEABGJJD_01201,EEABGJJD_01201,EEABGJJD_01201,EEABGJJD_01201,EEABGJJD_01201 +group_790,,hypothetical protein,5,5,1.0,1,1351,,,,432,432,432.0,EEABGJJD_00584,EEABGJJD_00584,EEABGJJD_00584,EEABGJJD_00584,EEABGJJD_00584 +group_789,,putative transcriptional regulator (MarR family),5,5,1.0,1,1350,,,,435,435,435.0,EEABGJJD_01468,EEABGJJD_01468,EEABGJJD_01468,EEABGJJD_01468,EEABGJJD_01468 +group_788,,CopY/TcrY family copper transport repressor,5,5,1.0,1,1349,,,,435,435,435.0,EEABGJJD_01432,EEABGJJD_01432,EEABGJJD_01432,EEABGJJD_01432,EEABGJJD_01432 +group_787,,membrane protein,5,5,1.0,1,1348,,,,435,435,435.0,EEABGJJD_01195,EEABGJJD_01195,EEABGJJD_01195,EEABGJJD_01195,EEABGJJD_01195 +group_786,,hypothetical protein,5,5,1.0,1,1347,,,,435,435,435.0,EEABGJJD_00577,EEABGJJD_00577,EEABGJJD_00577,EEABGJJD_00577,EEABGJJD_00577 +group_785,,hypothetical protein,5,5,1.0,1,1346,,,,435,435,435.0,EEABGJJD_00562,EEABGJJD_00562,EEABGJJD_00562,EEABGJJD_00562,EEABGJJD_00562 +comYD,comYD,putative competence protein,5,5,1.0,1,1345,,,,435,435,435.0,EEABGJJD_00123,EEABGJJD_00123,EEABGJJD_00123,EEABGJJD_00123,EEABGJJD_00123 +ahrC,ahrC,putative arginine repressor,5,5,1.0,1,1344,,,,438,438,438.0,EEABGJJD_01776,EEABGJJD_01776,EEABGJJD_01776,EEABGJJD_01776,EEABGJJD_01776 +group_784,,peptide-methionine (R)-S-oxide 
reductase,5,5,1.0,1,1343,,,,438,438,438.0,EEABGJJD_00879,EEABGJJD_00879,EEABGJJD_00879,EEABGJJD_00879,EEABGJJD_00879 +agaF,agaF,putative PTS dependent N-acetyl-galactosamine- and galactosamine IIA component,5,5,1.0,1,1342,,,,438,438,438.0,EEABGJJD_00523,EEABGJJD_00523,EEABGJJD_00523,EEABGJJD_00523,EEABGJJD_00523 +group_783,,putative protein-tyrosine phosphatase,5,5,1.0,1,1341,,,,438,438,438.0,EEABGJJD_00058,EEABGJJD_00058,EEABGJJD_00058,EEABGJJD_00058,EEABGJJD_00058 +group_782,,pai1 protein (theoretical repressor),5,5,1.0,1,1340,,,,441,441,441.0,EEABGJJD_01657,EEABGJJD_01657,EEABGJJD_01657,EEABGJJD_01657,EEABGJJD_01657 +group_781,,hypothetical protein,5,5,1.0,1,1339,,,,441,441,441.0,EEABGJJD_00975,EEABGJJD_00975,EEABGJJD_00975,EEABGJJD_00975,EEABGJJD_00975 +group_780,,acyl-ACP thioesterase,5,5,1.0,1,1338,,,,441,441,441.0,EEABGJJD_00869,EEABGJJD_00869,EEABGJJD_00869,EEABGJJD_00869,EEABGJJD_00869 +phiNCTC81981_4,phiNCTC8198.1_4,ArpU family transcriptional regulator,5,5,1.0,1,1337,,,,441,441,441.0,EEABGJJD_00807,EEABGJJD_00807,EEABGJJD_00807,EEABGJJD_00807,EEABGJJD_00807 +arsC,arsC,putative arsenate reductase,5,5,1.0,1,1336,,,,441,441,441.0,EEABGJJD_00739,EEABGJJD_00739,EEABGJJD_00739,EEABGJJD_00739,EEABGJJD_00739 +group_779,,membrane protein,5,5,1.0,1,1335,,,,441,441,441.0,EEABGJJD_00677,EEABGJJD_00677,EEABGJJD_00677,EEABGJJD_00677,EEABGJJD_00677 +rplO,rplO,50S ribosomal protein L15,5,5,1.0,1,1334,,,,441,441,441.0,EEABGJJD_00085,EEABGJJD_00085,EEABGJJD_00085,EEABGJJD_00085,EEABGJJD_00085 +group_778,,D-aminoacyl-tRNA deacylase,5,5,1.0,1,1333,,,,444,444,444.0,EEABGJJD_01644,EEABGJJD_01644,EEABGJJD_01644,EEABGJJD_01644,EEABGJJD_01644 +group_777,,transcriptional regulator,5,5,1.0,1,1332,,,,444,444,444.0,EEABGJJD_00109,EEABGJJD_00109,EEABGJJD_00109,EEABGJJD_00109,EEABGJJD_00109 +rplM,rplM,50S ribosomal protein L13,5,5,1.0,1,1331,,,,447,447,447.0,EEABGJJD_01610,EEABGJJD_01610,EEABGJJD_01610,EEABGJJD_01610,EEABGJJD_01610 +group_776,,histidine 
kinase,5,5,1.0,1,1330,,,,447,447,447.0,EEABGJJD_00415,EEABGJJD_00415,EEABGJJD_00415,EEABGJJD_00415,EEABGJJD_00415 +group_775,,putative dUTPase phage associated,5,5,1.0,1,1329,,,,447,447,447.0,EEABGJJD_00224,EEABGJJD_00224,EEABGJJD_00224,EEABGJJD_00224,EEABGJJD_00224 +group_774,,hypothetical protein,5,5,1.0,1,1328,,,,450,450,450.0,EEABGJJD_01110,EEABGJJD_01110,EEABGJJD_01110,EEABGJJD_01110,EEABGJJD_01110 +group_773,,putative flavodoxin,5,5,1.0,1,1327,,,,450,450,450.0,EEABGJJD_00597,EEABGJJD_00597,EEABGJJD_00597,EEABGJJD_00597,EEABGJJD_00597 +group_772,,MarR family transcriptional regulator,5,5,1.0,1,1326,,,,450,450,450.0,EEABGJJD_00220,EEABGJJD_00220,EEABGJJD_00220,EEABGJJD_00220,EEABGJJD_00220 +rplI,rplI,50S ribosomal protein L9,5,5,1.0,1,1325,,,,453,453,453.0,EEABGJJD_01803,EEABGJJD_01803,EEABGJJD_01803,EEABGJJD_01803,EEABGJJD_01803 +nusB,nusB,putative transcriptional terminator,5,5,1.0,1,1324,,,,453,453,453.0,EEABGJJD_01512,EEABGJJD_01512,EEABGJJD_01512,EEABGJJD_01512,EEABGJJD_01512 +group_771,,universal stress protein,5,5,1.0,1,1323,,,,453,453,453.0,EEABGJJD_01485,EEABGJJD_01485,EEABGJJD_01485,EEABGJJD_01485,EEABGJJD_01485 +group_770,,N-acetyltransferase,5,5,1.0,1,1322,,,,453,453,453.0,EEABGJJD_00960,EEABGJJD_00960,EEABGJJD_00960,EEABGJJD_00960,EEABGJJD_00960 +group_769,,RofA family transcriptional regulator,5,5,1.0,1,1321,,,,453,453,453.0,EEABGJJD_00608,EEABGJJD_00608,EEABGJJD_00608,EEABGJJD_00608,EEABGJJD_00608 +group_768,,putative portal protein - phage associated,5,5,1.0,1,1320,,,,453,453,453.0,EEABGJJD_00467,EEABGJJD_00467,EEABGJJD_00467,EEABGJJD_00467,EEABGJJD_00467 +rimI,rimI,ribosomal-protein-alanine N-acetyltransferase,5,5,1.0,1,1319,,,,456,456,456.0,EEABGJJD_01557,EEABGJJD_01557,EEABGJJD_01557,EEABGJJD_01557,EEABGJJD_01557 +mutT,mutT,putative mutator protein,5,5,1.0,1,1318,,,,456,456,456.0,EEABGJJD_01262,EEABGJJD_01262,EEABGJJD_01262,EEABGJJD_01262,EEABGJJD_01262 +group_767,,CoA-binding 
protein,5,5,1.0,1,1317,,,,456,456,456.0,EEABGJJD_00190,EEABGJJD_00190,EEABGJJD_00190,EEABGJJD_00190,EEABGJJD_00190 +lsp,lsp,putative prolipoprotein signal peptidase,5,5,1.0,1,1316,,,,459,459,459.0,EEABGJJD_00685,EEABGJJD_00685,EEABGJJD_00685,EEABGJJD_00685,EEABGJJD_00685 +group_766,,hypothetical protein,5,5,1.0,1,1315,,,,459,459,459.0,EEABGJJD_00605,EEABGJJD_00605,EEABGJJD_00605,EEABGJJD_00605,EEABGJJD_00605 +ctsR,ctsR,putative transcriptional regulator,5,5,1.0,1,1314,,,,462,462,462.0,EEABGJJD_01713,EEABGJJD_01713,EEABGJJD_01713,EEABGJJD_01713,EEABGJJD_01713 +comEB,comEB,putative late competence protein required for DNA binding protein,5,5,1.0,1,1313,,,,462,462,462.0,EEABGJJD_01515,EEABGJJD_01515,EEABGJJD_01515,EEABGJJD_01515,EEABGJJD_01515 +group_765,,tRNA (adenosine(37)-N6)-threonylcarbamoyltransferase complex ATPase subunit type 1 TsaE,5,5,1.0,1,1312,,,,462,462,462.0,EEABGJJD_01449,EEABGJJD_01449,EEABGJJD_01449,EEABGJJD_01449,EEABGJJD_01449 +phiMGAS50052_30,phiMGAS5005.2_30,phage protein,5,5,1.0,1,1311,,,,462,462,462.0,EEABGJJD_01216,EEABGJJD_01216,EEABGJJD_01216,EEABGJJD_01216,EEABGJJD_01216 +group_764,,hypothetical protein,5,5,1.0,1,1310,,,,462,462,462.0,EEABGJJD_00810,EEABGJJD_00810,EEABGJJD_00810,EEABGJJD_00810,EEABGJJD_00810 +group_763,,hypothetical protein,5,5,1.0,1,1309,,,,462,462,462.0,EEABGJJD_00805,EEABGJJD_00805,EEABGJJD_00805,EEABGJJD_00805,EEABGJJD_00805 +group_762,,CBS domain-containing protein,5,5,1.0,1,1308,,,,462,462,462.0,EEABGJJD_00327,EEABGJJD_00327,EEABGJJD_00327,EEABGJJD_00327,EEABGJJD_00327 +group_761,,SsrA-binding protein,5,5,1.0,1,1307,,,,468,468,468.0,EEABGJJD_00424,EEABGJJD_00424,EEABGJJD_00424,EEABGJJD_00424,EEABGJJD_00424 +spf,spf,Ferric transport regulator protein,5,5,1.0,1,1306,,,,468,468,468.0,EEABGJJD_00191,EEABGJJD_00191,EEABGJJD_00191,EEABGJJD_00191,EEABGJJD_00191 +group_760,,hypothetical protein,5,5,1.0,1,1305,,,,471,471,471.0,EEABGJJD_01652,EEABGJJD_01652,EEABGJJD_01652,EEABGJJD_01652,EEABGJJD_01652 +group_759,,membrane 
protein,5,5,1.0,1,1304,,,,471,471,471.0,EEABGJJD_01578,EEABGJJD_01578,EEABGJJD_01578,EEABGJJD_01578,EEABGJJD_01578 +group_758,,putative repressor protein,5,5,1.0,1,1303,,,,471,471,471.0,EEABGJJD_01250,EEABGJJD_01250,EEABGJJD_01250,EEABGJJD_01250,EEABGJJD_01250 +phiSF3701_5,phiSF370.1_5,putative major tail shaft protein phage associated,5,5,1.0,1,1302,,,,471,471,471.0,EEABGJJD_00576,EEABGJJD_00576,EEABGJJD_00576,EEABGJJD_00576,EEABGJJD_00576 +rpsG,rpsG,30S ribosomal protein S7,5,5,1.0,1,1301,,,,471,471,471.0,EEABGJJD_00255,EEABGJJD_00255,EEABGJJD_00255,EEABGJJD_00255,EEABGJJD_00255 +group_757,,putative PTS system enzyme IIA component,5,5,1.0,1,1300,,,,474,474,474.0,EEABGJJD_01428,EEABGJJD_01428,EEABGJJD_01428,EEABGJJD_01428,EEABGJJD_01428 +ahrC2,ahrC.2,putative arginine repressor,5,5,1.0,1,1299,,,,474,474,474.0,EEABGJJD_01296,EEABGJJD_01296,EEABGJJD_01296,EEABGJJD_01296,EEABGJJD_01296 +mutX,mutX,putative 78-dihydro-8-oxoguanine-triphosphatase,5,5,1.0,1,1298,,,,477,477,477.0,EEABGJJD_00844,EEABGJJD_00844,EEABGJJD_00844,EEABGJJD_00844,EEABGJJD_00844 +group_756,,LysM domain-containing protein,5,5,1.0,1,1297,,,,477,477,477.0,EEABGJJD_00665,EEABGJJD_00665,EEABGJJD_00665,EEABGJJD_00665,EEABGJJD_00665 +group_755,,hypothetical protein,5,5,1.0,1,1296,,,,477,477,477.0,EEABGJJD_00564,EEABGJJD_00564,EEABGJJD_00564,EEABGJJD_00564,EEABGJJD_00564 +group_754,,DNA mismatch repair protein MutT,5,5,1.0,1,1295,,,,477,477,477.0,EEABGJJD_00404,EEABGJJD_00404,EEABGJJD_00404,EEABGJJD_00404,EEABGJJD_00404 +group_753,,23S rRNA (pseudouridine(1915)-N(3))-methyltransferase RlmH,5,5,1.0,1,1294,,,,480,480,480.0,EEABGJJD_01833,EEABGJJD_01833,EEABGJJD_01833,EEABGJJD_01833,EEABGJJD_01833 +group_752,,glutathione peroxidase,5,5,1.0,1,1293,,,,480,480,480.0,EEABGJJD_00505,EEABGJJD_00505,EEABGJJD_00505,EEABGJJD_00505,EEABGJJD_00505 +group_751,,putative transcription regulator,5,5,1.0,1,1292,,,,480,480,480.0,EEABGJJD_00500,EEABGJJD_00500,EEABGJJD_00500,EEABGJJD_00500,EEABGJJD_00500 +group_750,,NifU-like 
protein,5,5,1.0,1,1291,,,,480,480,480.0,EEABGJJD_00268,EEABGJJD_00268,EEABGJJD_00268,EEABGJJD_00268,EEABGJJD_00268 +ntpK,ntpK,V-type Na+ -ATPase subunit K,5,5,1.0,1,1290,,,,480,480,480.0,EEABGJJD_00158,EEABGJJD_00158,EEABGJJD_00158,EEABGJJD_00158,EEABGJJD_00158 +group_749,,ribonucleotide reductase assembly protein NrdI,5,5,1.0,1,1289,,,,483,483,483.0,EEABGJJD_01647,EEABGJJD_01647,EEABGJJD_01647,EEABGJJD_01647,EEABGJJD_01647 +group_748,,membrane protein,5,5,1.0,1,1288,,,,483,483,483.0,EEABGJJD_01534,EEABGJJD_01534,EEABGJJD_01534,EEABGJJD_01534,EEABGJJD_01534 +luxS,luxS,autoinducer-2 production protein,5,5,1.0,1,1287,,,,483,483,483.0,EEABGJJD_01374,EEABGJJD_01374,EEABGJJD_01374,EEABGJJD_01374,EEABGJJD_01374 +group_747,,hypothetical protein,5,5,1.0,1,1286,,,,483,483,483.0,EEABGJJD_01003,EEABGJJD_01003,EEABGJJD_01003,EEABGJJD_01003,EEABGJJD_01003 +group_746,,transcriptional regulator,5,5,1.0,1,1285,,,,483,483,483.0,EEABGJJD_00980,EEABGJJD_00980,EEABGJJD_00980,EEABGJJD_00980,EEABGJJD_00980 +phiSF3702_3,phiSF370.2_3,putative terminase small subunit - phage associated,5,5,1.0,1,1284,,,,483,483,483.0,EEABGJJD_00811,EEABGJJD_00811,EEABGJJD_00811,EEABGJJD_00811,EEABGJJD_00811 +greA,greA,putative transcription elongation factor,5,5,1.0,1,1283,,,,483,483,483.0,EEABGJJD_00315,EEABGJJD_00315,EEABGJJD_00315,EEABGJJD_00315,EEABGJJD_00315 +group_745,,putative arylalkylamine n-acetyltransferase,5,5,1.0,1,1282,,,,483,483,483.0,EEABGJJD_00313,EEABGJJD_00313,EEABGJJD_00313,EEABGJJD_00313,EEABGJJD_00313 +group_744,,PTS ascorbate transporter subunit IIA,5,5,1.0,1,1281,,,,486,486,486.0,EEABGJJD_00181,EEABGJJD_00181,EEABGJJD_00181,EEABGJJD_00181,EEABGJJD_00181 +group_743,,hypothetical protein,5,5,1.0,1,1280,,,,489,489,489.0,EEABGJJD_01770,EEABGJJD_01770,EEABGJJD_01770,EEABGJJD_01770,EEABGJJD_01770 +group_742,,Asp23/Gls24 family envelope stress response protein,5,5,1.0,1,1279,,,,489,489,489.0,EEABGJJD_01060,EEABGJJD_01060,EEABGJJD_01060,EEABGJJD_01060,EEABGJJD_01060 +group_741,,putative 
pore-forming peptide,5,5,1.0,1,1278,,,,489,489,489.0,EEABGJJD_00664,EEABGJJD_00664,EEABGJJD_00664,EEABGJJD_00664,EEABGJJD_00664 +group_740,,PTS N-acetylgalactosamine transporter subunit IIB,5,5,1.0,1,1277,,,,489,489,489.0,EEABGJJD_00521,EEABGJJD_00521,EEABGJJD_00521,EEABGJJD_00521,EEABGJJD_00521 +nrdI,nrdI,putative ribonucleotide reductase (NrdI protein),5,5,1.0,1,1276,,,,489,489,489.0,EEABGJJD_00365,EEABGJJD_00365,EEABGJJD_00365,EEABGJJD_00365,EEABGJJD_00365 +group_739,,putative single strand binding protein - phage associated,5,5,1.0,1,1275,,,,492,492,492.0,EEABGJJD_01521,EEABGJJD_01521,EEABGJJD_01521,EEABGJJD_01521,EEABGJJD_01521 +kdtB,kdtB,putative 3-deoxy-D-manno-octulosonic-acid transferase,5,5,1.0,1,1274,,,,492,492,492.0,EEABGJJD_01286,EEABGJJD_01286,EEABGJJD_01286,EEABGJJD_01286,EEABGJJD_01286 +aroK,aroK,putative shikimate kinase,5,5,1.0,1,1273,,,,492,492,492.0,EEABGJJD_01130,EEABGJJD_01130,EEABGJJD_01130,EEABGJJD_01130,EEABGJJD_01130 +group_738,,putative phosphotransferase system (PTS) enzyme II component B,5,5,1.0,1,1272,,,,492,492,492.0,EEABGJJD_00882,EEABGJJD_00882,EEABGJJD_00882,EEABGJJD_00882,EEABGJJD_00882 +atpF,atpF,putative proton-translocating ATPase subunit b,5,5,1.0,1,1271,,,,495,495,495.0,EEABGJJD_00626,EEABGJJD_00626,EEABGJJD_00626,EEABGJJD_00626,EEABGJJD_00626 +group_737,,transcriptional regulator NrdR,5,5,1.0,1,1270,,,,495,495,495.0,EEABGJJD_00306,EEABGJJD_00306,EEABGJJD_00306,EEABGJJD_00306,EEABGJJD_00306 +rpsE,rpsE,30S ribosomal protein S5,5,5,1.0,1,1269,,,,495,495,495.0,EEABGJJD_00083,EEABGJJD_00083,EEABGJJD_00083,EEABGJJD_00083,EEABGJJD_00083 +group_736,,GAF domain-containing protein,5,5,1.0,1,1268,,,,498,498,498.0,EEABGJJD_01142,EEABGJJD_01142,EEABGJJD_01142,EEABGJJD_01142,EEABGJJD_01142 +group_735,,nicotinamide riboside transporter PnuC,5,5,1.0,1,1267,,,,498,498,498.0,EEABGJJD_01116,EEABGJJD_01116,EEABGJJD_01116,EEABGJJD_01116,EEABGJJD_01116 +dyr,dyr,putative dihydrofolate 
reductase,5,5,1.0,1,1266,,,,498,498,498.0,EEABGJJD_00731,EEABGJJD_00731,EEABGJJD_00731,EEABGJJD_00731,EEABGJJD_00731 +group_734,,endoribonuclease YbeY,5,5,1.0,1,1265,,,,498,498,498.0,EEABGJJD_00401,EEABGJJD_00401,EEABGJJD_00401,EEABGJJD_00401,EEABGJJD_00401 +group_733,,carbonic anhydrase,5,5,1.0,1,1264,,,,498,498,498.0,EEABGJJD_00226,EEABGJJD_00226,EEABGJJD_00226,EEABGJJD_00226,EEABGJJD_00226 +group_732,,PAP2 family protein,5,5,1.0,1,1263,,,,501,501,501.0,EEABGJJD_01527,EEABGJJD_01527,EEABGJJD_01527,EEABGJJD_01527,EEABGJJD_01527 +group_731,,acetyl-CoA carboxylase biotin carboxyl carrier protein,5,5,1.0,1,1262,,,,501,501,501.0,EEABGJJD_01461,EEABGJJD_01461,EEABGJJD_01461,EEABGJJD_01461,EEABGJJD_01461 +group_730,,2-amino-4-hydroxy-6- hydroxymethyldihydropteridine diphosphokinase,5,5,1.0,1,1261,,,,501,501,501.0,EEABGJJD_00920,EEABGJJD_00920,EEABGJJD_00920,EEABGJJD_00920,EEABGJJD_00920 +rplJ,rplJ,50S ribosomal protein L10,5,5,1.0,1,1260,,,,501,501,501.0,EEABGJJD_00896,EEABGJJD_00896,EEABGJJD_00896,EEABGJJD_00896,EEABGJJD_00896 +group_729,,queuosine transporter QueT,5,5,1.0,1,1259,,,,501,501,501.0,EEABGJJD_00621,EEABGJJD_00621,EEABGJJD_00621,EEABGJJD_00621,EEABGJJD_00621 +group_728,,haloacid dehalogenase,5,5,1.0,1,1258,,,,501,501,501.0,EEABGJJD_00321,EEABGJJD_00321,EEABGJJD_00321,EEABGJJD_00321,EEABGJJD_00321 +group_727,,membrane protein,5,5,1.0,1,1257,,,,504,504,504.0,EEABGJJD_00503,EEABGJJD_00503,EEABGJJD_00503,EEABGJJD_00503,EEABGJJD_00503 +flaR,flaR,topology modulator,5,5,1.0,1,1256,,,,507,507,507.0,EEABGJJD_01658,EEABGJJD_01658,EEABGJJD_01658,EEABGJJD_01658,EEABGJJD_01658 +group_726,,membrane protein,5,5,1.0,1,1255,,,,507,507,507.0,EEABGJJD_01612,EEABGJJD_01612,EEABGJJD_01612,EEABGJJD_01612,EEABGJJD_01612 +group_725,,hypothetical protein,5,5,1.0,1,1254,,,,510,510,510.0,EEABGJJD_01769,EEABGJJD_01769,EEABGJJD_01769,EEABGJJD_01769,EEABGJJD_01769 +group_724,,GNAT family 
acetyltransferase,5,5,1.0,1,1253,,,,510,510,510.0,EEABGJJD_01738,EEABGJJD_01738,EEABGJJD_01738,EEABGJJD_01738,EEABGJJD_01738 +group_723,,protein low temperature requirement C,5,5,1.0,1,1252,,,,510,510,510.0,EEABGJJD_01691,EEABGJJD_01691,EEABGJJD_01691,EEABGJJD_01691,EEABGJJD_01691 +msrA2,msrA.2,putative peptide methionine sulfoxide reductase,5,5,1.0,1,1251,,,,510,510,510.0,EEABGJJD_00395,EEABGJJD_00395,EEABGJJD_00395,EEABGJJD_00395,EEABGJJD_00395 +phiMGAS50052_41,phiMGAS5005.2_41,phage protein,5,5,1.0,1,1250,,,,513,513,513.0,EEABGJJD_01227,EEABGJJD_01227,EEABGJJD_01227,EEABGJJD_01227,EEABGJJD_01227 +group_722,,phosphorylase,5,5,1.0,1,1249,,,,513,513,513.0,EEABGJJD_01127,EEABGJJD_01127,EEABGJJD_01127,EEABGJJD_01127,EEABGJJD_01127 +phiMGAS50052_12,phiMGAS5005.2_12,phage protein,5,5,1.0,1,1248,,,,513,513,513.0,EEABGJJD_00802,EEABGJJD_00802,EEABGJJD_00802,EEABGJJD_00802,EEABGJJD_00802 +lacB2,lacB.2,putative galactose-6-phosphate isomerase (B subunit),5,5,1.0,1,1247,,,,516,516,516.0,EEABGJJD_01604,EEABGJJD_01604,EEABGJJD_01604,EEABGJJD_01604,EEABGJJD_01604 +lacB1,lacB.1,putative galactose-6-phosphate isomerase,5,5,1.0,1,1246,,,,516,516,516.0,EEABGJJD_01424,EEABGJJD_01424,EEABGJJD_01424,EEABGJJD_01424,EEABGJJD_01424 +group_721,,hypothetical protein,5,5,1.0,1,1245,,,,516,516,516.0,EEABGJJD_01379,EEABGJJD_01379,EEABGJJD_01379,EEABGJJD_01379,EEABGJJD_01379 +group_720,,tRNA-specific adenosine deaminase,5,5,1.0,1,1244,,,,516,516,516.0,EEABGJJD_00207,EEABGJJD_00207,EEABGJJD_00207,EEABGJJD_00207,EEABGJJD_00207 +group_719,,NYN domain-containing protein,5,5,1.0,1,1243,,,,519,519,519.0,EEABGJJD_01614,EEABGJJD_01614,EEABGJJD_01614,EEABGJJD_01614,EEABGJJD_01614 +group_718,,restriction endonuclease,5,5,1.0,1,1242,,,,519,519,519.0,EEABGJJD_01242,EEABGJJD_01242,EEABGJJD_01242,EEABGJJD_01242,EEABGJJD_01242 +apt,apt,putative adenine phosphoribosyltransferase,5,5,1.0,1,1241,,,,519,519,519.0,EEABGJJD_00772,EEABGJJD_00772,EEABGJJD_00772,EEABGJJD_00772,EEABGJJD_00772 +group_717,,putative 16S 
rRNA processing protein,5,5,1.0,1,1240,,,,519,519,519.0,EEABGJJD_00702,EEABGJJD_00702,EEABGJJD_00702,EEABGJJD_00702,EEABGJJD_00702 +group_716,,putative transcriptional regulator,5,5,1.0,1,1239,,,,522,522,522.0,EEABGJJD_01189,EEABGJJD_01189,EEABGJJD_01189,EEABGJJD_01189,EEABGJJD_01189 +group_715,,TetR/AcrR family transcriptional regulator,5,5,1.0,1,1238,,,,522,522,522.0,EEABGJJD_00701,EEABGJJD_00701,EEABGJJD_00701,EEABGJJD_00701,EEABGJJD_00701 +pyrR,pyrR,putative pyrimidine regulatory protein,5,5,1.0,1,1237,,,,522,522,522.0,EEABGJJD_00687,EEABGJJD_00687,EEABGJJD_00687,EEABGJJD_00687,EEABGJJD_00687 +group_714,,metallophosphoesterase,5,5,1.0,1,1236,,,,522,522,522.0,EEABGJJD_00326,EEABGJJD_00326,EEABGJJD_00326,EEABGJJD_00326,EEABGJJD_00326 +group_713,,N-acetyltransferase,5,5,1.0,1,1235,,,,525,525,525.0,EEABGJJD_01448,EEABGJJD_01448,EEABGJJD_01448,EEABGJJD_01448,EEABGJJD_01448 +dpr,dpr,putative peroxide resistance protein,5,5,1.0,1,1234,,,,528,528,528.0,EEABGJJD_01280,EEABGJJD_01280,EEABGJJD_01280,EEABGJJD_01280,EEABGJJD_01280 +group_712,,YqeG family HAD IIIA-type phosphatase,5,5,1.0,1,1233,,,,528,528,528.0,EEABGJJD_00283,EEABGJJD_00283,EEABGJJD_00283,EEABGJJD_00283,EEABGJJD_00283 +group_711,,hypothetical protein,5,5,1.0,1,1232,,,,531,531,531.0,EEABGJJD_01549,EEABGJJD_01549,EEABGJJD_01549,EEABGJJD_01549,EEABGJJD_01549 +infC,infC,putative translation initiation factor 3 (IF3),5,5,1.0,1,1231,,,,531,531,531.0,EEABGJJD_00667,EEABGJJD_00667,EEABGJJD_00667,EEABGJJD_00667,EEABGJJD_00667 +group_710,,nuclease,5,5,1.0,1,1230,,,,531,531,531.0,EEABGJJD_00194,EEABGJJD_00194,EEABGJJD_00194,EEABGJJD_00194,EEABGJJD_00194 +group_709,,TetR/AcrR family transcriptional regulator,5,5,1.0,1,1229,,,,534,534,534.0,EEABGJJD_01798,EEABGJJD_01798,EEABGJJD_01798,EEABGJJD_01798,EEABGJJD_01798 +grpE,grpE,putative Hsp-70 cofactor,5,5,1.0,1,1228,,,,534,534,534.0,EEABGJJD_01472,EEABGJJD_01472,EEABGJJD_01472,EEABGJJD_01472,EEABGJJD_01472 +group_708,,isoprenylcysteine carboxyl 
methyltransferase,5,5,1.0,1,1227,,,,534,534,534.0,EEABGJJD_01412,EEABGJJD_01412,EEABGJJD_01412,EEABGJJD_01412,EEABGJJD_01412 +group_707,,hypothetical protein,5,5,1.0,1,1226,,,,534,534,534.0,EEABGJJD_01340,EEABGJJD_01340,EEABGJJD_01340,EEABGJJD_01340,EEABGJJD_01340 +group_706,,hypothetical protein,5,5,1.0,1,1225,,,,534,534,534.0,EEABGJJD_00820,EEABGJJD_00820,EEABGJJD_00820,EEABGJJD_00820,EEABGJJD_00820 +group_705,,hypothetical protein,5,5,1.0,1,1224,,,,534,534,534.0,EEABGJJD_00303,EEABGJJD_00303,EEABGJJD_00303,EEABGJJD_00303,EEABGJJD_00303 +group_704,,ribosome maturation factor RimP,5,5,1.0,1,1223,,,,537,537,537.0,EEABGJJD_01439,EEABGJJD_01439,EEABGJJD_01439,EEABGJJD_01439,EEABGJJD_01439 +group_703,,hypothetical protein,5,5,1.0,1,1222,,,,537,537,537.0,EEABGJJD_00744,EEABGJJD_00744,EEABGJJD_00744,EEABGJJD_00744,EEABGJJD_00744 +atpH,atpH,putative proton-translocating ATPase delta subunit,5,5,1.0,1,1221,,,,537,537,537.0,EEABGJJD_00627,EEABGJJD_00627,EEABGJJD_00627,EEABGJJD_00627,EEABGJJD_00627 +group_702,,hypothetical protein,5,5,1.0,1,1220,,,,537,537,537.0,EEABGJJD_00368,EEABGJJD_00368,EEABGJJD_00368,EEABGJJD_00368,EEABGJJD_00368 +rplF,rplF,50S ribosomal protein L6,5,5,1.0,1,1219,,,,537,537,537.0,EEABGJJD_00081,EEABGJJD_00081,EEABGJJD_00081,EEABGJJD_00081,EEABGJJD_00081 +group_701,,16S rRNA (guanine(966)-N(2))-methyltransferase RsmD,5,5,1.0,1,1218,,,,540,540,540.0,EEABGJJD_01287,EEABGJJD_01287,EEABGJJD_01287,EEABGJJD_01287,EEABGJJD_01287 +group_700,,Asp23/Gls24 family envelope stress response protein,5,5,1.0,1,1217,,,,540,540,540.0,EEABGJJD_01062,EEABGJJD_01062,EEABGJJD_01062,EEABGJJD_01062,EEABGJJD_01062 +endA,endA,putative competence associated membrane nuclease,5,5,1.0,1,1216,,,,540,540,540.0,EEABGJJD_00635,EEABGJJD_00635,EEABGJJD_00635,EEABGJJD_00635,EEABGJJD_00635 +group_699,,putative biotin synthase,5,5,1.0,1,1215,,,,540,540,540.0,EEABGJJD_00205,EEABGJJD_00205,EEABGJJD_00205,EEABGJJD_00205,EEABGJJD_00205 +nusG,nusG,putative transcription antitermination 
factor,5,5,1.0,1,1214,,,,540,540,540.0,EEABGJJD_00169,EEABGJJD_00169,EEABGJJD_00169,EEABGJJD_00169,EEABGJJD_00169 +pgsA,pgsA,phosphatidylglycerophosphate synthase,5,5,1.0,1,1213,,,,543,543,543.0,EEABGJJD_01814,EEABGJJD_01814,EEABGJJD_01814,EEABGJJD_01814,EEABGJJD_01814 +group_698,,FMN reductase,5,5,1.0,1,1212,,,,543,543,543.0,EEABGJJD_01630,EEABGJJD_01630,EEABGJJD_01630,EEABGJJD_01630,EEABGJJD_01630 +niaX,niaX,Niacin transporter NiaX,5,5,1.0,1,1211,,,,543,543,543.0,EEABGJJD_01188,EEABGJJD_01188,EEABGJJD_01188,EEABGJJD_01188,EEABGJJD_01188 +group_697,,putative acetyl transferase,5,5,1.0,1,1210,,,,543,543,543.0,EEABGJJD_01135,EEABGJJD_01135,EEABGJJD_01135,EEABGJJD_01135,EEABGJJD_01135 +group_696,,TetR/AcrR family transcriptional regulator,5,5,1.0,1,1209,,,,543,543,543.0,EEABGJJD_01058,EEABGJJD_01058,EEABGJJD_01058,EEABGJJD_01058,EEABGJJD_01058 +rplE,rplE,50S ribosomal protein L5,5,5,1.0,1,1208,,,,543,543,543.0,EEABGJJD_00078,EEABGJJD_00078,EEABGJJD_00078,EEABGJJD_00078,EEABGJJD_00078 +hpt,hpt,hypoxanthine phosphoribosyltransferase,5,5,1.0,1,1207,,,,543,543,543.0,EEABGJJD_00012,EEABGJJD_00012,EEABGJJD_00012,EEABGJJD_00012,EEABGJJD_00012 +group_695,,colicin V production protein,5,5,1.0,1,1206,,,,546,546,546.0,EEABGJJD_01529,EEABGJJD_01529,EEABGJJD_01529,EEABGJJD_01529,EEABGJJD_01529 +dfp,dfp,putative DNA/pantothenate metabolism flavoprotein,5,5,1.0,1,1205,,,,546,546,546.0,EEABGJJD_01027,EEABGJJD_01027,EEABGJJD_01027,EEABGJJD_01027,EEABGJJD_01027 +group_694,,PTS transporter subunit IIC,5,5,1.0,1,1204,,,,546,546,546.0,EEABGJJD_00981,EEABGJJD_00981,EEABGJJD_00981,EEABGJJD_00981,EEABGJJD_00981 +group_693,,hypothetical protein,5,5,1.0,1,1203,,,,549,549,549.0,EEABGJJD_01773,EEABGJJD_01773,EEABGJJD_01773,EEABGJJD_01773,EEABGJJD_01773 +group_692,,hypothetical protein,5,5,1.0,1,1202,,,,549,549,549.0,EEABGJJD_01350,EEABGJJD_01350,EEABGJJD_01350,EEABGJJD_01350,EEABGJJD_01350 +group_691,,putative rRNA 
methylase,5,5,1.0,1,1201,,,,549,549,549.0,EEABGJJD_00333,EEABGJJD_00333,EEABGJJD_00333,EEABGJJD_00333,EEABGJJD_00333 +group_690,,aromatic acid exporter family protein,5,5,1.0,1,1200,,,,552,552,552.0,EEABGJJD_01563,EEABGJJD_01563,EEABGJJD_01563,EEABGJJD_01563,EEABGJJD_01563 +group_689,,segregation/condensation protein B,5,5,1.0,1,1199,,,,552,552,552.0,EEABGJJD_00330,EEABGJJD_00330,EEABGJJD_00330,EEABGJJD_00330,EEABGJJD_00330 +group_688,,putative pyrazinamidase/nicotinamidase,5,5,1.0,1,1198,,,,555,555,555.0,EEABGJJD_01482,EEABGJJD_01482,EEABGJJD_01482,EEABGJJD_01482,EEABGJJD_01482 +group_687,,NUDIX hydrolase,5,5,1.0,1,1197,,,,555,555,555.0,EEABGJJD_00379,EEABGJJD_00379,EEABGJJD_00379,EEABGJJD_00379,EEABGJJD_00379 +group_686,,methyltransferase domain-containing protein,5,5,1.0,1,1196,,,,555,555,555.0,EEABGJJD_00337,EEABGJJD_00337,EEABGJJD_00337,EEABGJJD_00337,EEABGJJD_00337 +purN,purN,Phosphoribosylglycinamide formyltransferase,5,5,1.0,1,1195,,,,555,555,555.0,EEABGJJD_00047,EEABGJJD_00047,EEABGJJD_00047,EEABGJJD_00047,EEABGJJD_00047 +efp,efp,putative translation elongation factor EF-P,5,5,1.0,1,1194,,,,558,558,558.0,EEABGJJD_01514,EEABGJJD_01514,EEABGJJD_01514,EEABGJJD_01514,EEABGJJD_01514 +group_685,,putative transcription regulator,5,5,1.0,1,1193,,,,558,558,558.0,EEABGJJD_01418,EEABGJJD_01418,EEABGJJD_01418,EEABGJJD_01418,EEABGJJD_01418 +group_684,,PTS sugar transporter subunit IIC,5,5,1.0,1,1192,,,,558,558,558.0,EEABGJJD_01109,EEABGJJD_01109,EEABGJJD_01109,EEABGJJD_01109,EEABGJJD_01109 +sipC,sipC,putative signal peptidase I,5,5,1.0,1,1191,,,,558,558,558.0,EEABGJJD_01075,EEABGJJD_01075,EEABGJJD_01075,EEABGJJD_01075,EEABGJJD_01075 +group_683,,membrane protein,5,5,1.0,1,1190,,,,558,558,558.0,EEABGJJD_01013,EEABGJJD_01013,EEABGJJD_01013,EEABGJJD_01013,EEABGJJD_01013 +rrf,rrf,putative ribosome recycling factor,5,5,1.0,1,1189,,,,558,558,558.0,EEABGJJD_00393,EEABGJJD_00393,EEABGJJD_00393,EEABGJJD_00393,EEABGJJD_00393 +lemA,lemA,putative cytoplasmic membrane 
protein,5,5,1.0,1,1188,,,,558,558,558.0,EEABGJJD_00301,EEABGJJD_00301,EEABGJJD_00301,EEABGJJD_00301,EEABGJJD_00301 +lepB,lepB,signal peptidase I,5,5,1.0,1,1187,,,,558,558,558.0,EEABGJJD_00138,EEABGJJD_00138,EEABGJJD_00138,EEABGJJD_00138,EEABGJJD_00138 +group_682,,DNA-3-methyladenine glycosylase I,5,5,1.0,1,1186,,,,561,561,561.0,EEABGJJD_01750,EEABGJJD_01750,EEABGJJD_01750,EEABGJJD_01750,EEABGJJD_01750 +ahpC,ahpC,putative alkyl hydroperoxidase,5,5,1.0,1,1185,,,,561,561,561.0,EEABGJJD_01716,EEABGJJD_01716,EEABGJJD_01716,EEABGJJD_01716,EEABGJJD_01716 +group_681,,TIGR01440 family protein,5,5,1.0,1,1184,,,,561,561,561.0,EEABGJJD_01577,EEABGJJD_01577,EEABGJJD_01577,EEABGJJD_01577,EEABGJJD_01577 +group_680,,ATPase AAA,5,5,1.0,1,1183,,,,564,564,564.0,EEABGJJD_01312,EEABGJJD_01312,EEABGJJD_01312,EEABGJJD_01312,EEABGJJD_01312 +group_679,,energy-coupled thiamine transporter ThiT,5,5,1.0,1,1182,,,,564,564,564.0,EEABGJJD_00501,EEABGJJD_00501,EEABGJJD_00501,EEABGJJD_00501,EEABGJJD_00501 +group_678,,ECF transporter S component,5,5,1.0,1,1181,,,,564,564,564.0,EEABGJJD_00334,EEABGJJD_00334,EEABGJJD_00334,EEABGJJD_00334,EEABGJJD_00334 +trpG,trpG,anthranilate synthase component II,5,5,1.0,1,1180,,,,567,567,567.0,EEABGJJD_01654,EEABGJJD_01654,EEABGJJD_01654,EEABGJJD_01654,EEABGJJD_01654 +group_677,,acetyltransferase,5,5,1.0,1,1179,,,,567,567,567.0,EEABGJJD_00889,EEABGJJD_00889,EEABGJJD_00889,EEABGJJD_00889,EEABGJJD_00889 +group_676,,putative N-acetyl-muramidase,5,5,1.0,1,1178,,,,570,570,570.0,EEABGJJD_01474,EEABGJJD_01474,EEABGJJD_01474,EEABGJJD_01474,EEABGJJD_01474 +group_675,,ECF transporter S component,5,5,1.0,1,1177,,,,570,570,570.0,EEABGJJD_01028,EEABGJJD_01028,EEABGJJD_01028,EEABGJJD_01028,EEABGJJD_01028 +tdk2,tdk2,putative thymidine kinase,5,5,1.0,1,1176,,,,570,570,570.0,EEABGJJD_00956,EEABGJJD_00956,EEABGJJD_00956,EEABGJJD_00956,EEABGJJD_00956 +group_674,,DNA gyrase subunit 
B,5,5,1.0,1,1175,,,,570,570,570.0,EEABGJJD_00602,EEABGJJD_00602,EEABGJJD_00602,EEABGJJD_00602,EEABGJJD_00602 +group_673,,ribonuclease M5,5,5,1.0,1,1174,,,,570,570,570.0,EEABGJJD_00245,EEABGJJD_00245,EEABGJJD_00245,EEABGJJD_00245,EEABGJJD_00245 +pth,pth,putative peptidyl-tRNA hydrolase,5,5,1.0,1,1173,,,,570,570,570.0,EEABGJJD_00005,EEABGJJD_00005,EEABGJJD_00005,EEABGJJD_00005,EEABGJJD_00005 +group_672,,DNA-directed RNA polymerase subunit delta,5,5,1.0,1,1172,,,,576,576,576.0,EEABGJJD_01574,EEABGJJD_01574,EEABGJJD_01574,EEABGJJD_01574,EEABGJJD_01574 +group_671,,uracil-DNA glycosylase,5,5,1.0,1,1171,,,,576,576,576.0,EEABGJJD_00400,EEABGJJD_00400,EEABGJJD_00400,EEABGJJD_00400,EEABGJJD_00400 +group_670,,apo-citrate lyase phosphoribosyl-dephospho-CoA transferase,5,5,1.0,1,1170,,,,579,579,579.0,EEABGJJD_01000,EEABGJJD_01000,EEABGJJD_01000,EEABGJJD_01000,EEABGJJD_01000 +group_669,,CYTH domain-containing protein,5,5,1.0,1,1169,,,,579,579,579.0,EEABGJJD_00941,EEABGJJD_00941,EEABGJJD_00941,EEABGJJD_00941,EEABGJJD_00941 +cysE,cysE,serine O-acetyltransferase,5,5,1.0,1,1168,,,,582,582,582.0,EEABGJJD_01620,EEABGJJD_01620,EEABGJJD_01620,EEABGJJD_01620,EEABGJJD_01620 +xpt,xpt,putative xanthine phosphoribosyltransferase,5,5,1.0,1,1167,,,,582,582,582.0,EEABGJJD_00952,EEABGJJD_00952,EEABGJJD_00952,EEABGJJD_00952,EEABGJJD_00952 +group_668,,hypothetical protein,5,5,1.0,1,1166,,,,582,582,582.0,EEABGJJD_00578,EEABGJJD_00578,EEABGJJD_00578,EEABGJJD_00578,EEABGJJD_00578 +group_667,,putative 16S pseudouridylate synthetase,5,5,1.0,1,1165,,,,585,585,585.0,EEABGJJD_01166,EEABGJJD_01166,EEABGJJD_01166,EEABGJJD_01166,EEABGJJD_01166 +group_666,,GTP cyclohydrolase I FolE,5,5,1.0,1,1164,,,,585,585,585.0,EEABGJJD_00917,EEABGJJD_00917,EEABGJJD_00917,EEABGJJD_00917,EEABGJJD_00917 +ntpE,ntpE,putative V-type Na+ -ATPase subunit E,5,5,1.0,1,1163,,,,585,585,585.0,EEABGJJD_00159,EEABGJJD_00159,EEABGJJD_00159,EEABGJJD_00159,EEABGJJD_00159 +group_665,,hypothetical 
protein,5,5,1.0,1,1162,,,,588,588,588.0,EEABGJJD_01795,EEABGJJD_01795,EEABGJJD_01795,EEABGJJD_01795,EEABGJJD_01795 +dnaQ,dnaQ,putative DNA polymerase III epsilon subunit,5,5,1.0,1,1161,,,,588,588,588.0,EEABGJJD_01548,EEABGJJD_01548,EEABGJJD_01548,EEABGJJD_01548,EEABGJJD_01548 +group_664,,helix-turn-helix transcriptional regulator,5,5,1.0,1,1160,,,,588,588,588.0,EEABGJJD_01525,EEABGJJD_01525,EEABGJJD_01525,EEABGJJD_01525,EEABGJJD_01525 +group_663,,hypothetical protein,5,5,1.0,1,1159,,,,588,588,588.0,EEABGJJD_01245,EEABGJJD_01245,EEABGJJD_01245,EEABGJJD_01245,EEABGJJD_01245 +group_662,,alkaline shock response membrane anchor protein AmaP,5,5,1.0,1,1158,,,,588,588,588.0,EEABGJJD_01064,EEABGJJD_01064,EEABGJJD_01064,EEABGJJD_01064,EEABGJJD_01064 +group_661,,peptidase S11,5,5,1.0,1,1157,,,,591,591,591.0,EEABGJJD_01170,EEABGJJD_01170,EEABGJJD_01170,EEABGJJD_01170,EEABGJJD_01170 +group_660,,threonylcarbamoyl-AMP synthase,5,5,1.0,1,1156,,,,591,591,591.0,EEABGJJD_00959,EEABGJJD_00959,EEABGJJD_00959,EEABGJJD_00959,EEABGJJD_00959 +group_659,,IS110 family transposase,5,5,1.0,1,1155,,,,591,591,591.0,EEABGJJD_00607,EEABGJJD_00607,EEABGJJD_00607,EEABGJJD_00607,EEABGJJD_00607 +clpP,clpP,putative ATP-dependent protease proteolytic subunit,5,5,1.0,1,1154,,,,591,591,591.0,EEABGJJD_00349,EEABGJJD_00349,EEABGJJD_00349,EEABGJJD_00349,EEABGJJD_00349 +spi,spi,putative signal peptidase I,5,5,1.0,1,1153,,,,594,594,594.0,EEABGJJD_01532,EEABGJJD_01532,EEABGJJD_01532,EEABGJJD_01532,EEABGJJD_01532 +group_658,,membrane protein,5,5,1.0,1,1152,,,,594,594,594.0,EEABGJJD_01493,EEABGJJD_01493,EEABGJJD_01493,EEABGJJD_01493,EEABGJJD_01493 +phiMGAS50052_23,phiMGAS5005.2_23,major tail protein,5,5,1.0,1,1151,,,,594,594,594.0,EEABGJJD_01209,EEABGJJD_01209,EEABGJJD_01209,EEABGJJD_01209,EEABGJJD_01209 +group_657,,class I SAM-dependent methyltransferase,5,5,1.0,1,1150,,,,594,594,594.0,EEABGJJD_01035,EEABGJJD_01035,EEABGJJD_01035,EEABGJJD_01035,EEABGJJD_01035 +cpsFP,cpsFP,putative 
dTDP-4-keto-6-deoxyglucose-35-epimerase,5,5,1.0,1,1149,,,,594,594,594.0,EEABGJJD_00779,EEABGJJD_00779,EEABGJJD_00779,EEABGJJD_00779,EEABGJJD_00779 +mur11,mur1.1,putative peptidoglycan hydrolase,5,5,1.0,1,1148,,,,594,594,594.0,EEABGJJD_00710,EEABGJJD_00710,EEABGJJD_00710,EEABGJJD_00710,EEABGJJD_00710 +group_656,,dephospho-CoA kinase,5,5,1.0,1,1147,,,,594,594,594.0,EEABGJJD_00419,EEABGJJD_00419,EEABGJJD_00419,EEABGJJD_00419,EEABGJJD_00419 +group_655,,HD domain-containing protein,5,5,1.0,1,1146,,,,594,594,594.0,EEABGJJD_00287,EEABGJJD_00287,EEABGJJD_00287,EEABGJJD_00287,EEABGJJD_00287 +ruvA,ruvA,putative Holiday junction DNA helicase,5,5,1.0,1,1145,,,,597,597,597.0,EEABGJJD_01751,EEABGJJD_01751,EEABGJJD_01751,EEABGJJD_01751,EEABGJJD_01751 +group_654,,HutD family protein,5,5,1.0,1,1144,,,,597,597,597.0,EEABGJJD_01723,EEABGJJD_01723,EEABGJJD_01723,EEABGJJD_01723,EEABGJJD_01723 +recR,recR,putative recombination protein,5,5,1.0,1,1143,,,,597,597,597.0,EEABGJJD_01186,EEABGJJD_01186,EEABGJJD_01186,EEABGJJD_01186,EEABGJJD_01186 +group_653,,restriction endonuclease subunit S,5,5,1.0,1,1142,,,,597,597,597.0,EEABGJJD_01055,EEABGJJD_01055,EEABGJJD_01055,EEABGJJD_01055,EEABGJJD_01055 +group_652,,Holliday junction resolvase RecU,5,5,1.0,1,1141,,,,600,600,600.0,EEABGJJD_01380,EEABGJJD_01380,EEABGJJD_01380,EEABGJJD_01380,EEABGJJD_01380 +group_651,,lysozyme family protein,5,5,1.0,1,1140,,,,600,600,600.0,EEABGJJD_00963,EEABGJJD_00963,EEABGJJD_00963,EEABGJJD_00963,EEABGJJD_00963 +group_650,,hypothetical protein,5,5,1.0,1,1139,,,,600,600,600.0,EEABGJJD_00734,EEABGJJD_00734,EEABGJJD_00734,EEABGJJD_00734,EEABGJJD_00734 +group_649,,hypothetical protein,5,5,1.0,1,1138,,,,603,603,603.0,EEABGJJD_01790,EEABGJJD_01790,EEABGJJD_01790,EEABGJJD_01790,EEABGJJD_01790 +group_648,,putative NADH dehydrogenase,5,5,1.0,1,1137,,,,603,603,603.0,EEABGJJD_00893,EEABGJJD_00893,EEABGJJD_00893,EEABGJJD_00893,EEABGJJD_00893 +group_647,,hypothetical 
protein,5,5,1.0,1,1136,,,,603,603,603.0,EEABGJJD_00311,EEABGJJD_00311,EEABGJJD_00311,EEABGJJD_00311,EEABGJJD_00311 +salR,salR,putative response regulator of salavaricin regulon,5,5,1.0,1,1135,,,,606,606,606.0,EEABGJJD_01591,EEABGJJD_01591,EEABGJJD_01591,EEABGJJD_01591,EEABGJJD_01591 +group_646,,hypothetical protein,5,5,1.0,1,1134,,,,606,606,606.0,EEABGJJD_01327,EEABGJJD_01327,EEABGJJD_01327,EEABGJJD_01327,EEABGJJD_01327 +sodA,sodA,superoxide dismutase (Fe/Mn),5,5,1.0,1,1133,,,,606,606,606.0,EEABGJJD_01173,EEABGJJD_01173,EEABGJJD_01173,EEABGJJD_01173,EEABGJJD_01173 +group_645,,hypothetical protein,5,5,1.0,1,1132,,,,609,609,609.0,EEABGJJD_00570,EEABGJJD_00570,EEABGJJD_00570,EEABGJJD_00570,EEABGJJD_00570 +rpsD,rpsD,30S ribosomal protein S4,5,5,1.0,1,1131,,,,612,612,612.0,EEABGJJD_01799,EEABGJJD_01799,EEABGJJD_01799,EEABGJJD_01799,EEABGJJD_01799 +group_644,,hypothetical protein,5,5,1.0,1,1130,,,,612,612,612.0,EEABGJJD_00837,EEABGJJD_00837,EEABGJJD_00837,EEABGJJD_00837,EEABGJJD_00837 +purE,purE,5-(carboxyamino)imidazole ribonucleotide mutase,5,5,1.0,1,1129,,,,612,612,612.0,EEABGJJD_00051,EEABGJJD_00051,EEABGJJD_00051,EEABGJJD_00051,EEABGJJD_00051 +group_643,,transglycosylase,5,5,1.0,1,1128,,,,615,615,615.0,EEABGJJD_01810,EEABGJJD_01810,EEABGJJD_01810,EEABGJJD_01810,EEABGJJD_01810 +group_642,,cadmium transporter,5,5,1.0,1,1127,,,,615,615,615.0,EEABGJJD_01786,EEABGJJD_01786,EEABGJJD_01786,EEABGJJD_01786,EEABGJJD_01786 +nrdG,nrdG,putative anaerobic ribonucleotide reductase activator,5,5,1.0,1,1126,,,,615,615,615.0,EEABGJJD_01737,EEABGJJD_01737,EEABGJJD_01737,EEABGJJD_01737,EEABGJJD_01737 +def,def,putative polypeptide deformylase,5,5,1.0,1,1125,,,,615,615,615.0,EEABGJJD_01629,EEABGJJD_01629,EEABGJJD_01629,EEABGJJD_01629,EEABGJJD_01629 +group_641,,TVP38/TMEM64 family protein,5,5,1.0,1,1124,,,,615,615,615.0,EEABGJJD_01082,EEABGJJD_01082,EEABGJJD_01082,EEABGJJD_01082,EEABGJJD_01082 
+group_640,,lipase/acylhydrolase,5,5,1.0,1,1123,,,,615,615,615.0,EEABGJJD_00933,EEABGJJD_00933,EEABGJJD_00933,EEABGJJD_00933,EEABGJJD_00933 +group_639,,KR domain-containing protein,5,5,1.0,1,1122,,,,615,615,615.0,EEABGJJD_00534,EEABGJJD_00534,EEABGJJD_00534,EEABGJJD_00534,EEABGJJD_00534 +group_638,,hypothetical protein,5,5,1.0,1,1121,,,,615,615,615.0,EEABGJJD_00373,EEABGJJD_00373,EEABGJJD_00373,EEABGJJD_00373,EEABGJJD_00373 +group_637,,membrane protein,5,5,1.0,1,1120,,,,618,618,618.0,EEABGJJD_01420,EEABGJJD_01420,EEABGJJD_01420,EEABGJJD_01420,EEABGJJD_01420 +group_636,,NAD(P)-dependent oxidoreductase,5,5,1.0,1,1119,,,,618,618,618.0,EEABGJJD_00946,EEABGJJD_00946,EEABGJJD_00946,EEABGJJD_00946,EEABGJJD_00946 +group_635,,ATP-binding cassette domain-containing protein,5,5,1.0,1,1118,,,,618,618,618.0,EEABGJJD_00905,EEABGJJD_00905,EEABGJJD_00905,EEABGJJD_00905,EEABGJJD_00905 +group_634,,hypothetical protein,5,5,1.0,1,1117,,,,621,621,621.0,EEABGJJD_01283,EEABGJJD_01283,EEABGJJD_01283,EEABGJJD_01283,EEABGJJD_01283 +group_633,,hypothetical protein,5,5,1.0,1,1116,,,,621,621,621.0,EEABGJJD_01084,EEABGJJD_01084,EEABGJJD_01084,EEABGJJD_01084,EEABGJJD_01084 +group_632,,phage repressor protein,5,5,1.0,1,1115,,,,624,624,624.0,EEABGJJD_01758,EEABGJJD_01758,EEABGJJD_01758,EEABGJJD_01758,EEABGJJD_01758 +group_631,,TlpA family protein disulfide reductase,5,5,1.0,1,1114,,,,624,624,624.0,EEABGJJD_01302,EEABGJJD_01302,EEABGJJD_01302,EEABGJJD_01302,EEABGJJD_01302 +group_630,,histidine phosphatase family protein,5,5,1.0,1,1113,,,,624,624,624.0,EEABGJJD_00499,EEABGJJD_00499,EEABGJJD_00499,EEABGJJD_00499,EEABGJJD_00499 +rplD,rplD,50S ribosomal protein L4,5,5,1.0,1,1112,,,,624,624,624.0,EEABGJJD_00067,EEABGJJD_00067,EEABGJJD_00067,EEABGJJD_00067,EEABGJJD_00067 +group_629,,putative serine cycle enzyme,5,5,1.0,1,1111,,,,627,627,627.0,EEABGJJD_01721,EEABGJJD_01721,EEABGJJD_01721,EEABGJJD_01721,EEABGJJD_01721 +dppE,dppE,ATPase 
protein,5,5,1.0,1,1110,,,,627,627,627.0,EEABGJJD_01665,EEABGJJD_01665,EEABGJJD_01665,EEABGJJD_01665,EEABGJJD_01665 +group_628,,prepilin peptidase,5,5,1.0,1,1109,,,,627,627,627.0,EEABGJJD_01281,EEABGJJD_01281,EEABGJJD_01281,EEABGJJD_01281,EEABGJJD_01281 +udk,udk,putative uridine kinase,5,5,1.0,1,1108,,,,627,627,627.0,EEABGJJD_01145,EEABGJJD_01145,EEABGJJD_01145,EEABGJJD_01145,EEABGJJD_01145 +ntpD,ntpD,putative V-type Na+ -ATPase subunit D,5,5,1.0,1,1107,,,,627,627,627.0,EEABGJJD_00164,EEABGJJD_00164,EEABGJJD_00164,EEABGJJD_00164,EEABGJJD_00164 +group_627,,hypothetical protein,5,5,1.0,1,1106,,,,627,627,627.0,EEABGJJD_00131,EEABGJJD_00131,EEABGJJD_00131,EEABGJJD_00131,EEABGJJD_00131 +rplC,rplC,50S ribosomal protein L3,5,5,1.0,1,1105,,,,627,627,627.0,EEABGJJD_00066,EEABGJJD_00066,EEABGJJD_00066,EEABGJJD_00066,EEABGJJD_00066 +group_626,,putative amino acid ABC transporter (ATP-binding protein),5,5,1.0,1,1104,,,,630,630,630.0,EEABGJJD_01071,EEABGJJD_01071,EEABGJJD_01071,EEABGJJD_01071,EEABGJJD_01071 +pyrE,pyrE,putative orotate phosphoribosyltransferase,5,5,1.0,1,1103,,,,630,630,630.0,EEABGJJD_00746,EEABGJJD_00746,EEABGJJD_00746,EEABGJJD_00746,EEABGJJD_00746 +group_625,,hypothetical protein,5,5,1.0,1,1102,,,,630,630,630.0,EEABGJJD_00555,EEABGJJD_00555,EEABGJJD_00555,EEABGJJD_00555,EEABGJJD_00555 +group_624,,CutC family protein,5,5,1.0,1,1101,,,,630,630,630.0,EEABGJJD_00357,EEABGJJD_00357,EEABGJJD_00357,EEABGJJD_00357,EEABGJJD_00357 +upp,upp,putative uracil phosphoribosyltransferase,5,5,1.0,1,1100,,,,630,630,630.0,EEABGJJD_00348,EEABGJJD_00348,EEABGJJD_00348,EEABGJJD_00348,EEABGJJD_00348 +group_623,,YigZ family protein,5,5,1.0,1,1099,,,,633,633,633.0,EEABGJJD_01353,EEABGJJD_01353,EEABGJJD_01353,EEABGJJD_01353,EEABGJJD_01353 +group_622,,TIGR01906 family membrane protein,5,5,1.0,1,1098,,,,633,633,633.0,EEABGJJD_00871,EEABGJJD_00871,EEABGJJD_00871,EEABGJJD_00871,EEABGJJD_00871 +group_621,,hypothetical 
protein,5,5,1.0,1,1097,,,,633,633,633.0,EEABGJJD_00370,EEABGJJD_00370,EEABGJJD_00370,EEABGJJD_00370,EEABGJJD_00370 +nadD,nadD,nicotinate-nicotinamide nucleotide adenylyltransferase,5,5,1.0,1,1096,,,,633,633,633.0,EEABGJJD_00286,EEABGJJD_00286,EEABGJJD_00286,EEABGJJD_00286,EEABGJJD_00286 +group_620,,thiamine diphosphokinase,5,5,1.0,1,1095,,,,633,633,633.0,EEABGJJD_00249,EEABGJJD_00249,EEABGJJD_00249,EEABGJJD_00249,EEABGJJD_00249 +group_619,,tRNA (guanosine(46)-N7)-methyltransferase TrmB,5,5,1.0,1,1094,,,,636,636,636.0,EEABGJJD_01441,EEABGJJD_01441,EEABGJJD_01441,EEABGJJD_01441,EEABGJJD_01441 +gmk,gmk,putative guanylate kinase,5,5,1.0,1,1093,,,,636,636,636.0,EEABGJJD_01366,EEABGJJD_01366,EEABGJJD_01366,EEABGJJD_01366,EEABGJJD_01366 +group_618,,cystathionine beta-lyase,5,5,1.0,1,1092,,,,636,636,636.0,EEABGJJD_00768,EEABGJJD_00768,EEABGJJD_00768,EEABGJJD_00768,EEABGJJD_00768 +group_617,,hypothetical protein,5,5,1.0,1,1091,,,,636,636,636.0,EEABGJJD_00553,EEABGJJD_00553,EEABGJJD_00553,EEABGJJD_00553,EEABGJJD_00553 +group_616,,MBL fold metallo-hydrolase,5,5,1.0,1,1090,,,,636,636,636.0,EEABGJJD_00533,EEABGJJD_00533,EEABGJJD_00533,EEABGJJD_00533,EEABGJJD_00533 +kgdA,kgdA,putative 2-dehydro-3-deoxyphosphogluconate aldolase/4-hydroxy-2-oxoglutarate aldolase,5,5,1.0,1,1089,,,,636,636,636.0,EEABGJJD_00527,EEABGJJD_00527,EEABGJJD_00527,EEABGJJD_00527,EEABGJJD_00527 +tmk,tmk,putative thymidylate kinase,5,5,1.0,1,1088,,,,636,636,636.0,EEABGJJD_00351,EEABGJJD_00351,EEABGJJD_00351,EEABGJJD_00351,EEABGJJD_00351 +adk,adk,adenylate kinase,5,5,1.0,1,1087,,,,639,639,639.0,EEABGJJD_00087,EEABGJJD_00087,EEABGJJD_00087,EEABGJJD_00087,EEABGJJD_00087 +group_615,,CoA pyrophosphatase,5,5,1.0,1,1086,,,,642,642,642.0,EEABGJJD_01793,EEABGJJD_01793,EEABGJJD_01793,EEABGJJD_01793,EEABGJJD_01793 +group_614,,putative two-component response regulator,5,5,1.0,1,1085,,,,642,642,642.0,EEABGJJD_01357,EEABGJJD_01357,EEABGJJD_01357,EEABGJJD_01357,EEABGJJD_01357 +group_613,,putative amino acid ABC transporter 
(permease protein),5,5,1.0,1,1084,,,,642,642,642.0,EEABGJJD_01072,EEABGJJD_01072,EEABGJJD_01072,EEABGJJD_01072,EEABGJJD_01072 +group_612,,putative transcriptional regulator,5,5,1.0,1,1083,,,,642,642,642.0,EEABGJJD_01059,EEABGJJD_01059,EEABGJJD_01059,EEABGJJD_01059,EEABGJJD_01059 +group_611,,hypothetical protein,5,5,1.0,1,1082,,,,642,642,642.0,EEABGJJD_00867,EEABGJJD_00867,EEABGJJD_00867,EEABGJJD_00867,EEABGJJD_00867 +group_610,,glycerol-3-phosphate acyltransferase,5,5,1.0,1,1081,,,,642,642,642.0,EEABGJJD_00752,EEABGJJD_00752,EEABGJJD_00752,EEABGJJD_00752,EEABGJJD_00752 +group_609,,sugar-phosphate isomerase,5,5,1.0,1,1080,,,,642,642,642.0,EEABGJJD_00525,EEABGJJD_00525,EEABGJJD_00525,EEABGJJD_00525,EEABGJJD_00525 +group_608,,putative deoxyguanosine kinase/deoxyadenosine kinase(I) subunit,5,5,1.0,1,1079,,,,642,642,642.0,EEABGJJD_00133,EEABGJJD_00133,EEABGJJD_00133,EEABGJJD_00133,EEABGJJD_00133 +group_607,,putative transaldolase,5,5,1.0,1,1078,,,,645,645,645.0,EEABGJJD_01402,EEABGJJD_01402,EEABGJJD_01402,EEABGJJD_01402,EEABGJJD_01402 +group_606,,transcriptional regulator,5,5,1.0,1,1077,,,,645,645,645.0,EEABGJJD_00937,EEABGJJD_00937,EEABGJJD_00937,EEABGJJD_00937,EEABGJJD_00937 +pcp,pcp,putative pyrrolidone carboxyl peptidase,5,5,1.0,1,1076,,,,648,648,648.0,EEABGJJD_00426,EEABGJJD_00426,EEABGJJD_00426,EEABGJJD_00426,EEABGJJD_00426 +group_605,,putative metal-dependent transcriptional regulator,5,5,1.0,1,1075,,,,648,648,648.0,EEABGJJD_00383,EEABGJJD_00383,EEABGJJD_00383,EEABGJJD_00383,EEABGJJD_00383 +group_604,,succinyl-CoA--3-ketoacid-CoA transferase,5,5,1.0,1,1074,,,,648,648,648.0,EEABGJJD_00152,EEABGJJD_00152,EEABGJJD_00152,EEABGJJD_00152,EEABGJJD_00152 +group_603,,LPXTG cell wall anchor domain-containing protein,5,5,1.0,1,1073,,,,648,648,648.0,EEABGJJD_00141,EEABGJJD_00141,EEABGJJD_00141,EEABGJJD_00141,EEABGJJD_00141 +atoD1,atoD.1,acetyl-CoA--acetoacetyl-CoA transferase subunit 
alpha,5,5,1.0,1,1072,,,,651,651,651.0,EEABGJJD_01370,EEABGJJD_01370,EEABGJJD_01370,EEABGJJD_01370,EEABGJJD_01370 +hlyIII,hlyIII,putative hemolysin III,5,5,1.0,1,1071,,,,651,651,651.0,EEABGJJD_00974,EEABGJJD_00974,EEABGJJD_00974,EEABGJJD_00974,EEABGJJD_00974 +group_602,,ABC transporter permease,5,5,1.0,1,1070,,,,651,651,651.0,EEABGJJD_00749,EEABGJJD_00749,EEABGJJD_00749,EEABGJJD_00749,EEABGJJD_00749 +group_601,,GTP pyrophosphokinase,5,5,1.0,1,1069,,,,651,651,651.0,EEABGJJD_00721,EEABGJJD_00721,EEABGJJD_00721,EEABGJJD_00721,EEABGJJD_00721 +group_600,,HAD family phosphatase,5,5,1.0,1,1068,,,,651,651,651.0,EEABGJJD_00528,EEABGJJD_00528,EEABGJJD_00528,EEABGJJD_00528,EEABGJJD_00528 +group_599,,putative two-component response regulator,5,5,1.0,1,1067,,,,654,654,654.0,EEABGJJD_01678,EEABGJJD_01678,EEABGJJD_01678,EEABGJJD_01678,EEABGJJD_01678 +group_598,,protein G alpha 2M-binding protein,5,5,1.0,1,1066,,,,654,654,654.0,EEABGJJD_01136,EEABGJJD_01136,EEABGJJD_01136,EEABGJJD_01136,EEABGJJD_01136 +phoU,phoU,putative phosphate uptake regulatory protein,5,5,1.0,1,1065,,,,654,654,654.0,EEABGJJD_01041,EEABGJJD_01041,EEABGJJD_01041,EEABGJJD_01041,EEABGJJD_01041 +phiSF3702_7,phiSF370.2_7,putative structural protein - phage associated,5,5,1.0,1,1064,,,,654,654,654.0,EEABGJJD_00828,EEABGJJD_00828,EEABGJJD_00828,EEABGJJD_00828,EEABGJJD_00828 +ung,ung,putative uracil DNA glycosylase,5,5,1.0,1,1063,,,,654,654,654.0,EEABGJJD_00750,EEABGJJD_00750,EEABGJJD_00750,EEABGJJD_00750,EEABGJJD_00750 +group_597,,hypothetical protein,5,5,1.0,1,1062,,,,654,654,654.0,EEABGJJD_00341,EEABGJJD_00341,EEABGJJD_00341,EEABGJJD_00341,EEABGJJD_00341 +group_596,,PAP2 family protein,5,5,1.0,1,1061,,,,654,654,654.0,EEABGJJD_00335,EEABGJJD_00335,EEABGJJD_00335,EEABGJJD_00335,EEABGJJD_00335 +rpsC,rpsC,30S ribosomal protein S3,5,5,1.0,1,1060,,,,654,654,654.0,EEABGJJD_00072,EEABGJJD_00072,EEABGJJD_00072,EEABGJJD_00072,EEABGJJD_00072 +group_595,,hypothetical 
protein,5,5,1.0,1,1059,,,,657,657,657.0,EEABGJJD_01519,EEABGJJD_01519,EEABGJJD_01519,EEABGJJD_01519,EEABGJJD_01519 +group_594,,cell division protein SepF,5,5,1.0,1,1058,,,,657,657,657.0,EEABGJJD_01268,EEABGJJD_01268,EEABGJJD_01268,EEABGJJD_01268,EEABGJJD_01268 +nth,nth,putative endonuclease III (DNA repair),5,5,1.0,1,1057,,,,657,657,657.0,EEABGJJD_00774,EEABGJJD_00774,EEABGJJD_00774,EEABGJJD_00774,EEABGJJD_00774 +atoA,atoA,putative Acetyl-CoA:acetoacetyl-CoA transferase b subunit,5,5,1.0,1,1056,,,,660,660,660.0,EEABGJJD_01371,EEABGJJD_01371,EEABGJJD_01371,EEABGJJD_01371,EEABGJJD_01371 +atoD2,atoD.2,acetyl-CoA--acetoacetyl-CoA transferase subunit alpha,5,5,1.0,1,1055,,,,660,660,660.0,EEABGJJD_00151,EEABGJJD_00151,EEABGJJD_00151,EEABGJJD_00151,EEABGJJD_00151 +comEA,comEA,putative competence protein,5,5,1.0,1,1054,,,,663,663,663.0,EEABGJJD_01176,EEABGJJD_01176,EEABGJJD_01176,EEABGJJD_01176,EEABGJJD_01176 +group_593,,type II-A CRISPR-associated protein Csn2,5,5,1.0,1,1053,,,,663,663,663.0,EEABGJJD_00875,EEABGJJD_00875,EEABGJJD_00875,EEABGJJD_00875,EEABGJJD_00875 +rpe,rpe,putative ribulose-phosphate 3-epimerase,5,5,1.0,1,1052,,,,663,663,663.0,EEABGJJD_00248,EEABGJJD_00248,EEABGJJD_00248,EEABGJJD_00248,EEABGJJD_00248 +group_592,,hypothetical protein,5,5,1.0,1,1051,,,,663,663,663.0,EEABGJJD_00240,EEABGJJD_00240,EEABGJJD_00240,EEABGJJD_00240,EEABGJJD_00240 +group_591,,putative hexulose-6-phosphate synthase,5,5,1.0,1,1050,,,,663,663,663.0,EEABGJJD_00182,EEABGJJD_00182,EEABGJJD_00182,EEABGJJD_00182,EEABGJJD_00182 +comFC,comFC,putative late competence protein,5,5,1.0,1,1049,,,,666,666,666.0,EEABGJJD_01351,EEABGJJD_01351,EEABGJJD_01351,EEABGJJD_01351,EEABGJJD_01351 +group_590,,DNA-binding protein,5,5,1.0,1,1048,,,,666,666,666.0,EEABGJJD_00983,EEABGJJD_00983,EEABGJJD_00983,EEABGJJD_00983,EEABGJJD_00983 +group_589,,putative two-component response regulator,5,5,1.0,1,1047,,,,666,666,666.0,EEABGJJD_00926,EEABGJJD_00926,EEABGJJD_00926,EEABGJJD_00926,EEABGJJD_00926 
+group_588,,hypothetical protein,5,5,1.0,1,1046,,,,666,666,666.0,EEABGJJD_00799,EEABGJJD_00799,EEABGJJD_00799,EEABGJJD_00799,EEABGJJD_00799 +group_587,,NAD(P)H-dependent oxidoreductase,5,5,1.0,1,1045,,,,666,666,666.0,EEABGJJD_00431,EEABGJJD_00431,EEABGJJD_00431,EEABGJJD_00431,EEABGJJD_00431 +group_586,,hypothetical protein,5,5,1.0,1,1044,,,,666,666,666.0,EEABGJJD_00146,EEABGJJD_00146,EEABGJJD_00146,EEABGJJD_00146,EEABGJJD_00146 +mipB,mipB,putative transaldolase-like protein,5,5,1.0,1,1043,,,,669,669,669.0,EEABGJJD_01693,EEABGJJD_01693,EEABGJJD_01693,EEABGJJD_01693,EEABGJJD_01693 +group_585,,putative two-component response regulator,5,5,1.0,1,1042,,,,669,669,669.0,EEABGJJD_00722,EEABGJJD_00722,EEABGJJD_00722,EEABGJJD_00722,EEABGJJD_00722 +group_584,,putative transcriptional repressor,5,5,1.0,1,1041,,,,669,669,669.0,EEABGJJD_00707,EEABGJJD_00707,EEABGJJD_00707,EEABGJJD_00707,EEABGJJD_00707 +group_583,,putative ABC transporter (ATP-binding protein),5,5,1.0,1,1040,,,,669,669,669.0,EEABGJJD_00641,EEABGJJD_00641,EEABGJJD_00641,EEABGJJD_00641,EEABGJJD_00641 +sdhB,sdhB,putative L-serine dehydratase beta subunit,5,5,1.0,1,1039,,,,672,672,672.0,EEABGJJD_01808,EEABGJJD_01808,EEABGJJD_01808,EEABGJJD_01808,EEABGJJD_01808 +group_582,,ATP-binding protein,5,5,1.0,1,1038,,,,672,672,672.0,EEABGJJD_01593,EEABGJJD_01593,EEABGJJD_01593,EEABGJJD_01593,EEABGJJD_01593 +deoC,deoC,2-deoxyribose-5-phosphate aldolase,5,5,1.0,1,1037,,,,672,672,672.0,EEABGJJD_01551,EEABGJJD_01551,EEABGJJD_01551,EEABGJJD_01551,EEABGJJD_01551 +group_581,,YggS family pyridoxal phosphate-dependent enzyme,5,5,1.0,1,1036,,,,672,672,672.0,EEABGJJD_01269,EEABGJJD_01269,EEABGJJD_01269,EEABGJJD_01269,EEABGJJD_01269 +group_580,,putative methyltransferase,5,5,1.0,1,1035,,,,672,672,672.0,EEABGJJD_01162,EEABGJJD_01162,EEABGJJD_01162,EEABGJJD_01162,EEABGJJD_01162 +group_579,,putative GTP pyrophosphokinase,5,5,1.0,1,1034,,,,672,672,672.0,EEABGJJD_00942,EEABGJJD_00942,EEABGJJD_00942,EEABGJJD_00942,EEABGJJD_00942 
+group_578,,CPBP family intramembrane metalloprotease,5,5,1.0,1,1033,,,,672,672,672.0,EEABGJJD_00615,EEABGJJD_00615,EEABGJJD_00615,EEABGJJD_00615,EEABGJJD_00615 +group_577,,hypothetical protein,5,5,1.0,1,1032,,,,672,672,672.0,EEABGJJD_00217,EEABGJJD_00217,EEABGJJD_00217,EEABGJJD_00217,EEABGJJD_00217 +group_576,,ABC transporter ATP-binding protein,5,5,1.0,1,1031,,,,675,675,675.0,EEABGJJD_01680,EEABGJJD_01680,EEABGJJD_01680,EEABGJJD_01680,EEABGJJD_01680 +group_575,,CRISPR-associated protein Cas4,5,5,1.0,1,1030,,,,675,675,675.0,EEABGJJD_01306,EEABGJJD_01306,EEABGJJD_01306,EEABGJJD_01306,EEABGJJD_01306 +group_574,,putative response regulator,5,5,1.0,1,1029,,,,675,675,675.0,EEABGJJD_01039,EEABGJJD_01039,EEABGJJD_01039,EEABGJJD_01039,EEABGJJD_01039 +group_573,,TrkA family potassium uptake protein,5,5,1.0,1,1028,,,,675,675,675.0,EEABGJJD_00298,EEABGJJD_00298,EEABGJJD_00298,EEABGJJD_00298,EEABGJJD_00298 +group_572,,energy-coupling factor transporter transmembrane protein EcfT,5,5,1.0,1,1027,,,,681,681,681.0,EEABGJJD_01492,EEABGJJD_01492,EEABGJJD_01492,EEABGJJD_01492,EEABGJJD_01492 +group_571,,Crp/Fnr family transcriptional regulator,5,5,1.0,1,1026,,,,681,681,681.0,EEABGJJD_01295,EEABGJJD_01295,EEABGJJD_01295,EEABGJJD_01295,EEABGJJD_01295 +radC,radC,putative DNA repair protein,5,5,1.0,1,1025,,,,681,681,681.0,EEABGJJD_00935,EEABGJJD_00935,EEABGJJD_00935,EEABGJJD_00935,EEABGJJD_00935 +cmk,cmk,putative cytidylate kinase,5,5,1.0,1,1024,,,,681,681,681.0,EEABGJJD_00666,EEABGJJD_00666,EEABGJJD_00666,EEABGJJD_00666,EEABGJJD_00666 +group_570,,DNA-binding protein,5,5,1.0,1,1023,,,,681,681,681.0,EEABGJJD_00153,EEABGJJD_00153,EEABGJJD_00153,EEABGJJD_00153,EEABGJJD_00153 +group_569,,DnaD domain protein,5,5,1.0,1,1022,,,,684,684,684.0,EEABGJJD_00773,EEABGJJD_00773,EEABGJJD_00773,EEABGJJD_00773,EEABGJJD_00773 +rpiA,rpiA,putative ribose 5-phosphate isomerase,5,5,1.0,1,1021,,,,684,684,684.0,EEABGJJD_00737,EEABGJJD_00737,EEABGJJD_00737,EEABGJJD_00737,EEABGJJD_00737 +group_568,,streptolysin 
associated protein SagF,5,5,1.0,1,1020,,,,684,684,684.0,EEABGJJD_00616,EEABGJJD_00616,EEABGJJD_00616,EEABGJJD_00616,EEABGJJD_00616 +group_567,,hypothetical protein,5,5,1.0,1,1019,,,,684,684,684.0,EEABGJJD_00428,EEABGJJD_00428,EEABGJJD_00428,EEABGJJD_00428,EEABGJJD_00428 +group_566,,class C sortase,5,5,1.0,1,1018,,,,684,684,684.0,EEABGJJD_00145,EEABGJJD_00145,EEABGJJD_00145,EEABGJJD_00145,EEABGJJD_00145 +group_565,,putative amino acid ABC transporter (permease protein),5,5,1.0,1,1017,,,,687,687,687.0,EEABGJJD_01259,EEABGJJD_01259,EEABGJJD_01259,EEABGJJD_01259,EEABGJJD_01259 +srtR,srtR,putative DNA binding regulatory protein - lantibiotic associated,5,5,1.0,1,1016,,,,687,687,687.0,EEABGJJD_00902,EEABGJJD_00902,EEABGJJD_00902,EEABGJJD_00902,EEABGJJD_00902 +group_564,,tRNA (adenine-N(1))-methyltransferase,5,5,1.0,1,1015,,,,687,687,687.0,EEABGJJD_00775,EEABGJJD_00775,EEABGJJD_00775,EEABGJJD_00775,EEABGJJD_00775 +aroD,aroD,3-dehydroquinase,5,5,1.0,1,1014,,,,687,687,687.0,EEABGJJD_00672,EEABGJJD_00672,EEABGJJD_00672,EEABGJJD_00672,EEABGJJD_00672 +group_563,,DNA-binding response regulator,5,5,1.0,1,1013,,,,687,687,687.0,EEABGJJD_00304,EEABGJJD_00304,EEABGJJD_00304,EEABGJJD_00304,EEABGJJD_00304 +group_562,,multidrug transporter,5,5,1.0,1,1012,,,,690,690,690.0,EEABGJJD_00948,EEABGJJD_00948,EEABGJJD_00948,EEABGJJD_00948,EEABGJJD_00948 +srtF,srtF,ABC transporter (ATP-binding) - lantibiotic associated,5,5,1.0,1,1011,,,,690,690,690.0,EEABGJJD_00906,EEABGJJD_00906,EEABGJJD_00906,EEABGJJD_00906,EEABGJJD_00906 +rplA,rplA,50S ribosomal protein L1,5,5,1.0,1,1010,,,,690,690,690.0,EEABGJJD_00391,EEABGJJD_00391,EEABGJJD_00391,EEABGJJD_00391,EEABGJJD_00391 +group_561,,hypothetical protein,5,5,1.0,1,1009,,,,690,690,690.0,EEABGJJD_00322,EEABGJJD_00322,EEABGJJD_00322,EEABGJJD_00322,EEABGJJD_00322 +group_560,,transporter,5,5,1.0,1,1008,,,,693,693,693.0,EEABGJJD_01359,EEABGJJD_01359,EEABGJJD_01359,EEABGJJD_01359,EEABGJJD_01359 +group_559,,phosphopantothenate--cysteine 
ligase,5,5,1.0,1,1007,,,,693,693,693.0,EEABGJJD_01026,EEABGJJD_01026,EEABGJJD_01026,EEABGJJD_01026,EEABGJJD_01026 +pyrF,pyrF,putative orotidine-5'-decarboxylase PyrF,5,5,1.0,1,1006,,,,693,693,693.0,EEABGJJD_00745,EEABGJJD_00745,EEABGJJD_00745,EEABGJJD_00745,EEABGJJD_00745 +ftsE,ftsE,putative cell-division ATP-binding protein,5,5,1.0,1,1005,,,,693,693,693.0,EEABGJJD_00531,EEABGJJD_00531,EEABGJJD_00531,EEABGJJD_00531,EEABGJJD_00531 +acpA,acpA,putative ribonuclease III,5,5,1.0,1,1004,,,,693,693,693.0,EEABGJJD_00446,EEABGJJD_00446,EEABGJJD_00446,EEABGJJD_00446,EEABGJJD_00446 +group_558,,putative ABC transporter (permease protein),5,5,1.0,1,1003,,,,693,693,693.0,EEABGJJD_00295,EEABGJJD_00295,EEABGJJD_00295,EEABGJJD_00295,EEABGJJD_00295 +phiMGAS50052_19,phiMGAS5005.2_19,phage protein,5,5,1.0,1,1002,,,,696,696,696.0,EEABGJJD_01205,EEABGJJD_01205,EEABGJJD_01205,EEABGJJD_01205,EEABGJJD_01205 +gpmA,gpmA,23-bisphosphoglycerate-dependent phosphoglycerate mutase,5,5,1.0,1,1001,,,,696,696,696.0,EEABGJJD_01190,EEABGJJD_01190,EEABGJJD_01190,EEABGJJD_01190,EEABGJJD_01190 +group_557,,TIGR02206 family membrane protein,5,5,1.0,1,1000,,,,696,696,696.0,EEABGJJD_01183,EEABGJJD_01183,EEABGJJD_01183,EEABGJJD_01183,EEABGJJD_01183 +group_556,,CPBP family intramembrane metalloprotease,5,5,1.0,1,999,,,,696,696,696.0,EEABGJJD_01157,EEABGJJD_01157,EEABGJJD_01157,EEABGJJD_01157,EEABGJJD_01157 +group_555,,putative transcriptional regulator,5,5,1.0,1,998,,,,696,696,696.0,EEABGJJD_00991,EEABGJJD_00991,EEABGJJD_00991,EEABGJJD_00991,EEABGJJD_00991 +group_554,,gamma-glutamyl-gamma-aminobutyrate hydrolase family protein,5,5,1.0,1,997,,,,696,696,696.0,EEABGJJD_00936,EEABGJJD_00936,EEABGJJD_00936,EEABGJJD_00936,EEABGJJD_00936 +srtI,srtI,protein involved in lantibiotic (srt) production,5,5,1.0,1,996,,,,696,696,696.0,EEABGJJD_00901,EEABGJJD_00901,EEABGJJD_00901,EEABGJJD_00901,EEABGJJD_00901 +group_553,,putative glycosyl 
transferase,5,5,1.0,1,995,,,,696,696,696.0,EEABGJJD_00659,EEABGJJD_00659,EEABGJJD_00659,EEABGJJD_00659,EEABGJJD_00659 +pfs,pfs,5'-methylthioadenosine/S-adenosylhomocysteine nucleosidase,5,5,1.0,1,994,,,,696,696,696.0,EEABGJJD_00381,EEABGJJD_00381,EEABGJJD_00381,EEABGJJD_00381,EEABGJJD_00381 +group_552,,putative glycoprotein endopeptidase,5,5,1.0,1,993,,,,699,699,699.0,EEABGJJD_01558,EEABGJJD_01558,EEABGJJD_01558,EEABGJJD_01558,EEABGJJD_01558 +group_551,,ABC transporter ATP-binding protein,5,5,1.0,1,992,,,,699,699,699.0,EEABGJJD_01080,EEABGJJD_01080,EEABGJJD_01080,EEABGJJD_01080,EEABGJJD_01080 +group_550,,putative transcription regulator GntR family,5,5,1.0,1,991,,,,699,699,699.0,EEABGJJD_01008,EEABGJJD_01008,EEABGJJD_01008,EEABGJJD_01008,EEABGJJD_01008 +group_549,,putative repressor protein,5,5,1.0,1,990,,,,699,699,699.0,EEABGJJD_01005,EEABGJJD_01005,EEABGJJD_01005,EEABGJJD_01005,EEABGJJD_01005 +group_548,,putative dehydrogenease / oxidoreductase,5,5,1.0,1,989,,,,699,699,699.0,EEABGJJD_00375,EEABGJJD_00375,EEABGJJD_00375,EEABGJJD_00375,EEABGJJD_00375 +speJ,speJ,putative exotoxin (superantigen),5,5,1.0,1,988,,,,699,699,699.0,EEABGJJD_00372,EEABGJJD_00372,EEABGJJD_00372,EEABGJJD_00372,EEABGJJD_00372 +group_547,,exotoxin,5,5,1.0,1,987,,,,702,702,702.0,EEABGJJD_01659,EEABGJJD_01659,EEABGJJD_01659,EEABGJJD_01659,EEABGJJD_01659 +glpF,glpF,putative glycerol uptake facilitator,5,5,1.0,1,986,,,,702,702,702.0,EEABGJJD_01405,EEABGJJD_01405,EEABGJJD_01405,EEABGJJD_01405,EEABGJJD_01405 +group_546,,ABC transporter ATP-binding protein,5,5,1.0,1,985,,,,702,702,702.0,EEABGJJD_01057,EEABGJJD_01057,EEABGJJD_01057,EEABGJJD_01057,EEABGJJD_01057 +group_545,,noncanonical pyrimidine nucleotidase YjjG family,5,5,1.0,1,984,,,,702,702,702.0,EEABGJJD_00890,EEABGJJD_00890,EEABGJJD_00890,EEABGJJD_00890,EEABGJJD_00890 +group_544,,segregation/condensation protein A,5,5,1.0,1,983,,,,702,702,702.0,EEABGJJD_00329,EEABGJJD_00329,EEABGJJD_00329,EEABGJJD_00329,EEABGJJD_00329 +group_543,,ABC transporter 
permease,5,5,1.0,1,982,,,,705,705,705.0,EEABGJJD_01488,EEABGJJD_01488,EEABGJJD_01488,EEABGJJD_01488,EEABGJJD_01488 +group_542,,glucosamine-6-phosphate deaminase,5,5,1.0,1,981,,,,705,705,705.0,EEABGJJD_01167,EEABGJJD_01167,EEABGJJD_01167,EEABGJJD_01167,EEABGJJD_01167 +group_541,,putative N-acetylmannosamine-6-P epimerase,5,5,1.0,1,980,,,,705,705,705.0,EEABGJJD_00236,EEABGJJD_00236,EEABGJJD_00236,EEABGJJD_00236,EEABGJJD_00236 +group_540,,exotoxin,5,5,1.0,1,979,,,,705,705,705.0,EEABGJJD_00209,EEABGJJD_00209,EEABGJJD_00209,EEABGJJD_00209,EEABGJJD_00209 +araD,araD,putative L-ribulose 5-phosphate 4-epimerase,5,5,1.0,1,978,,,,705,705,705.0,EEABGJJD_00184,EEABGJJD_00184,EEABGJJD_00184,EEABGJJD_00184,EEABGJJD_00184 +group_539,,putative phosphoglycerate mutase,5,5,1.0,1,977,,,,708,708,708.0,EEABGJJD_01476,EEABGJJD_01476,EEABGJJD_01476,EEABGJJD_01476,EEABGJJD_01476 +group_538,,N-acetylmuramidase,5,5,1.0,1,976,,,,708,708,708.0,EEABGJJD_00711,EEABGJJD_00711,EEABGJJD_00711,EEABGJJD_00711,EEABGJJD_00711 +speC,speC,pyrogenic exotoxin C precursor phage associated,5,5,1.0,1,975,,,,708,708,708.0,EEABGJJD_00589,EEABGJJD_00589,EEABGJJD_00589,EEABGJJD_00589,EEABGJJD_00589 +group_537,,hypothetical protein,5,5,1.0,1,974,,,,708,708,708.0,EEABGJJD_00362,EEABGJJD_00362,EEABGJJD_00362,EEABGJJD_00362,EEABGJJD_00362 +ccdA,ccdA,putative cytochrome C-type biogenesis protein,5,5,1.0,1,973,,,,711,711,711.0,EEABGJJD_01303,EEABGJJD_01303,EEABGJJD_01303,EEABGJJD_01303,EEABGJJD_01303 +speH,speH,streptococcal exotoxin H precursor,5,5,1.0,1,972,,,,711,711,711.0,EEABGJJD_00843,EEABGJJD_00843,EEABGJJD_00843,EEABGJJD_00843,EEABGJJD_00843 +group_536,,ABC transporter ATP-binding protein,5,5,1.0,1,971,,,,711,711,711.0,EEABGJJD_00693,EEABGJJD_00693,EEABGJJD_00693,EEABGJJD_00693,EEABGJJD_00693 +vicR,vicR,two-component response regulator,5,5,1.0,1,970,,,,711,711,711.0,EEABGJJD_00443,EEABGJJD_00443,EEABGJJD_00443,EEABGJJD_00443,EEABGJJD_00443 +group_535,,TIGR00266 family 
protein,5,5,1.0,1,969,,,,711,711,711.0,EEABGJJD_00227,EEABGJJD_00227,EEABGJJD_00227,EEABGJJD_00227,EEABGJJD_00227 +group_534,,putative transcriptional regulator (GntR family),5,5,1.0,1,968,,,,714,714,714.0,EEABGJJD_01733,EEABGJJD_01733,EEABGJJD_01733,EEABGJJD_01733,EEABGJJD_01733 +deoD2,deoD2,putative purine nucleoside phosphorylase,5,5,1.0,1,967,,,,714,714,714.0,EEABGJJD_00741,EEABGJJD_00741,EEABGJJD_00741,EEABGJJD_00741,EEABGJJD_00741 +group_533,,IS3 family transposase,5,5,1.0,1,966,,,,714,714,714.0,EEABGJJD_00475,EEABGJJD_00475,EEABGJJD_00475,EEABGJJD_00475,EEABGJJD_00475 +gidB,gidB,putative glucose-inhibited division protein,5,5,1.0,1,965,,,,714,714,714.0,EEABGJJD_00300,EEABGJJD_00300,EEABGJJD_00300,EEABGJJD_00300,EEABGJJD_00300 +group_532,,SrtB family sortase,5,5,1.0,1,964,,,,714,714,714.0,EEABGJJD_00140,EEABGJJD_00140,EEABGJJD_00140,EEABGJJD_00140,EEABGJJD_00140 +group_531,,hypothetical protein,5,5,1.0,1,963,,,,717,717,717.0,EEABGJJD_01297,EEABGJJD_01297,EEABGJJD_01297,EEABGJJD_01297,EEABGJJD_01297 +group_530,,hypothetical protein,5,5,1.0,1,962,,,,717,717,717.0,EEABGJJD_00984,EEABGJJD_00984,EEABGJJD_00984,EEABGJJD_00984,EEABGJJD_00984 +atpB,atpB,putative proton-translocating ATPase a subunit,5,5,1.0,1,961,,,,717,717,717.0,EEABGJJD_00625,EEABGJJD_00625,EEABGJJD_00625,EEABGJJD_00625,EEABGJJD_00625 +group_529,,phage tail protein,5,5,1.0,1,960,,,,717,717,717.0,EEABGJJD_00580,EEABGJJD_00580,EEABGJJD_00580,EEABGJJD_00580,EEABGJJD_00580 +group_528,,LD-carboxypeptidase,5,5,1.0,1,959,,,,717,717,717.0,EEABGJJD_00472,EEABGJJD_00472,EEABGJJD_00472,EEABGJJD_00472,EEABGJJD_00472 +group_527,,YebC/PmpR family DNA-binding transcriptional regulator,5,5,1.0,1,958,,,,717,717,717.0,EEABGJJD_00291,EEABGJJD_00291,EEABGJJD_00291,EEABGJJD_00291,EEABGJJD_00291 +group_526,,putative transcriptional regulatory protein,5,5,1.0,1,957,,,,720,720,720.0,EEABGJJD_01542,EEABGJJD_01542,EEABGJJD_01542,EEABGJJD_01542,EEABGJJD_01542 +group_525,,putative glutamine 
cyclotransferase,5,5,1.0,1,956,,,,720,720,720.0,EEABGJJD_00425,EEABGJJD_00425,EEABGJJD_00425,EEABGJJD_00425,EEABGJJD_00425 +adcC,adcC,ABC transporter ATP-binding protein,5,5,1.0,1,955,,,,720,720,720.0,EEABGJJD_00110,EEABGJJD_00110,EEABGJJD_00110,EEABGJJD_00110,EEABGJJD_00110 +group_524,,hypothetical protein,5,5,1.0,1,954,,,,723,723,723.0,EEABGJJD_01416,EEABGJJD_01416,EEABGJJD_01416,EEABGJJD_01416,EEABGJJD_01416 +group_523,,hypothetical protein,5,5,1.0,1,953,,,,723,723,723.0,EEABGJJD_00908,EEABGJJD_00908,EEABGJJD_00908,EEABGJJD_00908,EEABGJJD_00908 +group_522,,putative transcriptional regulator (GntR family),5,5,1.0,1,952,,,,723,723,723.0,EEABGJJD_00593,EEABGJJD_00593,EEABGJJD_00593,EEABGJJD_00593,EEABGJJD_00593 +group_521,,rRNA pseudouridine synthase,5,5,1.0,1,951,,,,723,723,723.0,EEABGJJD_00331,EEABGJJD_00331,EEABGJJD_00331,EEABGJJD_00331,EEABGJJD_00331 +group_520,,ABC transporter ATP-binding protein,5,5,1.0,1,950,,,,726,726,726.0,EEABGJJD_01444,EEABGJJD_01444,EEABGJJD_01444,EEABGJJD_01444,EEABGJJD_01444 +group_519,,metal ABC transporter ATP-binding protein,5,5,1.0,1,949,,,,726,726,726.0,EEABGJJD_00385,EEABGJJD_00385,EEABGJJD_00385,EEABGJJD_00385,EEABGJJD_00385 +group_518,,putative transaldolase,5,5,1.0,1,948,,,,729,729,729.0,EEABGJJD_01623,EEABGJJD_01623,EEABGJJD_01623,EEABGJJD_01623,EEABGJJD_01623 +group_517,,type I-C CRISPR-associated protein Cas5,5,5,1.0,1,947,,,,729,729,729.0,EEABGJJD_01309,EEABGJJD_01309,EEABGJJD_01309,EEABGJJD_01309,EEABGJJD_01309 +group_516,,ABC transporter ATP-binding protein,5,5,1.0,1,946,,,,729,729,729.0,EEABGJJD_00949,EEABGJJD_00949,EEABGJJD_00949,EEABGJJD_00949,EEABGJJD_00949 +phiSF3702_2,phiSF370.2_2,putative P1-type antirepressor - phage associated,5,5,1.0,1,945,,,,729,729,729.0,EEABGJJD_00788,EEABGJJD_00788,EEABGJJD_00788,EEABGJJD_00788,EEABGJJD_00788 +group_515,,ATPase,5,5,1.0,1,944,,,,729,729,729.0,EEABGJJD_00460,EEABGJJD_00460,EEABGJJD_00460,EEABGJJD_00460,EEABGJJD_00460 +pyrH,pyrH,putative uridylate kinase 
(UMP-kinase),5,5,1.0,1,943,,,,729,729,729.0,EEABGJJD_00392,EEABGJJD_00392,EEABGJJD_00392,EEABGJJD_00392,EEABGJJD_00392 +group_514,,peroxide stress protein YaaA,5,5,1.0,1,942,,,,732,732,732.0,EEABGJJD_01736,EEABGJJD_01736,EEABGJJD_01736,EEABGJJD_01736,EEABGJJD_01736 +rsuA,rsuA,16S rRNA pseudouridine(516) synthase,5,5,1.0,1,941,,,,732,732,732.0,EEABGJJD_01122,EEABGJJD_01122,EEABGJJD_01122,EEABGJJD_01122,EEABGJJD_01122 +group_513,,acid phosphatase/phosphotransferase,5,5,1.0,1,940,,,,732,732,732.0,EEABGJJD_00931,EEABGJJD_00931,EEABGJJD_00931,EEABGJJD_00931,EEABGJJD_00931 +trmD,trmD,putative tRNA (guanine-N1)-methyltransferase,5,5,1.0,1,939,,,,732,732,732.0,EEABGJJD_00703,EEABGJJD_00703,EEABGJJD_00703,EEABGJJD_00703,EEABGJJD_00703 +fabG,fabG,putative beta-ketoacyl-ACP reductase,5,5,1.0,1,938,,,,735,735,735.0,EEABGJJD_01463,EEABGJJD_01463,EEABGJJD_01463,EEABGJJD_01463,EEABGJJD_01463 +group_512,,ABC transporter ATP-binding protein,5,5,1.0,1,937,,,,735,735,735.0,EEABGJJD_01399,EEABGJJD_01399,EEABGJJD_01399,EEABGJJD_01399,EEABGJJD_01399 +group_511,,putative amino acid ABC transporter (ATP-binding protein),5,5,1.0,1,936,,,,735,735,735.0,EEABGJJD_01258,EEABGJJD_01258,EEABGJJD_01258,EEABGJJD_01258,EEABGJJD_01258 +group_510,,hypothetical protein,5,5,1.0,1,935,,,,735,735,735.0,EEABGJJD_00552,EEABGJJD_00552,EEABGJJD_00552,EEABGJJD_00552,EEABGJJD_00552 +cofE,cofE,coenzyme F420-0:L-glutamate ligase,5,5,1.0,1,934,,,,735,735,735.0,EEABGJJD_00451,EEABGJJD_00451,EEABGJJD_00451,EEABGJJD_00451,EEABGJJD_00451 +purC,purC,putative phosphoribosylaminoimidazole - succinocarboxamide synthase (SAICAR synthetase),5,5,1.0,1,933,,,,735,735,735.0,EEABGJJD_00043,EEABGJJD_00043,EEABGJJD_00043,EEABGJJD_00043,EEABGJJD_00043 +salX,salX,ABC transporter ATP-binding protein,5,5,1.0,1,932,,,,738,738,738.0,EEABGJJD_01595,EEABGJJD_01595,EEABGJJD_01595,EEABGJJD_01595,EEABGJJD_01595 +group_509,,putative transcriptional activator regulator 
protein,5,5,1.0,1,931,,,,738,738,738.0,EEABGJJD_01547,EEABGJJD_01547,EEABGJJD_01547,EEABGJJD_01547,EEABGJJD_01547 +group_508,,putative rRNA methylase,5,5,1.0,1,930,,,,738,738,738.0,EEABGJJD_00320,EEABGJJD_00320,EEABGJJD_00320,EEABGJJD_00320,EEABGJJD_00320 +group_507,,putative response regulator,5,5,1.0,1,929,,,,738,738,738.0,EEABGJJD_00231,EEABGJJD_00231,EEABGJJD_00231,EEABGJJD_00231,EEABGJJD_00231 +pppL,pppL,putative phosphoprotein phosphatase,5,5,1.0,1,928,,,,741,741,741.0,EEABGJJD_01361,EEABGJJD_01361,EEABGJJD_01361,EEABGJJD_01361,EEABGJJD_01361 +group_506,,putative two-component response regulator,5,5,1.0,1,927,,,,741,741,741.0,EEABGJJD_01300,EEABGJJD_01300,EEABGJJD_01300,EEABGJJD_01300,EEABGJJD_01300 +group_505,,putative 1-acylglycerol-3-phosphate O-acyltransferase,5,5,1.0,1,926,,,,741,741,741.0,EEABGJJD_01177,EEABGJJD_01177,EEABGJJD_01177,EEABGJJD_01177,EEABGJJD_01177 +group_504,,ABC transporter ATP-binding protein,5,5,1.0,1,925,,,,741,741,741.0,EEABGJJD_01106,EEABGJJD_01106,EEABGJJD_01106,EEABGJJD_01106,EEABGJJD_01106 +group_503,,putative amino acid ABC transporter (ATP-binding protein),5,5,1.0,1,924,,,,741,741,741.0,EEABGJJD_00259,EEABGJJD_00259,EEABGJJD_00259,EEABGJJD_00259,EEABGJJD_00259 +group_502,,GntR family transcriptional regulator,5,5,1.0,1,923,,,,744,744,744.0,EEABGJJD_01554,EEABGJJD_01554,EEABGJJD_01554,EEABGJJD_01554,EEABGJJD_01554 +group_501,,putative DD-carboxypeptidase,5,5,1.0,1,922,,,,744,744,744.0,EEABGJJD_01475,EEABGJJD_01475,EEABGJJD_01475,EEABGJJD_01475,EEABGJJD_01475 +group_500,,putative amino acid ABC transport system (ATP-binding protein),5,5,1.0,1,921,,,,744,744,744.0,EEABGJJD_01388,EEABGJJD_01388,EEABGJJD_01388,EEABGJJD_01388,EEABGJJD_01388 +group_499,,hypothetical protein,5,5,1.0,1,920,,,,744,744,744.0,EEABGJJD_00735,EEABGJJD_00735,EEABGJJD_00735,EEABGJJD_00735,EEABGJJD_00735 +group_498,,class I SAM-dependent methyltransferase,5,5,1.0,1,919,,,,744,744,744.0,EEABGJJD_00289,EEABGJJD_00289,EEABGJJD_00289,EEABGJJD_00289,EEABGJJD_00289 
+group_497,,putative transcriptional regulator,5,5,1.0,1,918,,,,747,747,747.0,EEABGJJD_01699,EEABGJJD_01699,EEABGJJD_01699,EEABGJJD_01699,EEABGJJD_01699 +group_496,,23S rRNA (guanosine(2251)-2'-O)-methyltransferase RlmB,5,5,1.0,1,917,,,,747,747,747.0,EEABGJJD_01615,EEABGJJD_01615,EEABGJJD_01615,EEABGJJD_01615,EEABGJJD_01615 +phiSF3702_5,phiSF370.2_5,putative antirepressor - phage associated,5,5,1.0,1,916,,,,747,747,747.0,EEABGJJD_00819,EEABGJJD_00819,EEABGJJD_00819,EEABGJJD_00819,EEABGJJD_00819 +phiSF3702_1,phiSF370.2_1,putative repressor protein - phage associated,5,5,1.0,1,915,,,,747,747,747.0,EEABGJJD_00783,EEABGJJD_00783,EEABGJJD_00783,EEABGJJD_00783,EEABGJJD_00783 +group_495,,hypothetical protein (possible integrase/recombinase),5,5,1.0,1,914,,,,747,747,747.0,EEABGJJD_00328,EEABGJJD_00328,EEABGJJD_00328,EEABGJJD_00328,EEABGJJD_00328 +group_494,,16S rRNA (uracil(1498)-N(3))-methyltransferase,5,5,1.0,1,913,,,,750,750,750.0,EEABGJJD_01650,EEABGJJD_01650,EEABGJJD_01650,EEABGJJD_01650,EEABGJJD_01650 +uppS,uppS,putative undecaprenyl pyrophosphate synthetase,5,5,1.0,1,912,,,,750,750,750.0,EEABGJJD_01636,EEABGJJD_01636,EEABGJJD_01636,EEABGJJD_01636,EEABGJJD_01636 +truA,truA,putative tRNA pseudouridine synthase A,5,5,1.0,1,911,,,,750,750,750.0,EEABGJJD_01580,EEABGJJD_01580,EEABGJJD_01580,EEABGJJD_01580,EEABGJJD_01580 +group_493,,sortase,5,5,1.0,1,910,,,,750,750,750.0,EEABGJJD_00969,EEABGJJD_00969,EEABGJJD_00969,EEABGJJD_00969,EEABGJJD_00969 +group_492,,hypothetical protein,5,5,1.0,1,909,,,,750,750,750.0,EEABGJJD_00907,EEABGJJD_00907,EEABGJJD_00907,EEABGJJD_00907,EEABGJJD_00907 +group_491,,DNA translocase FtsK,5,5,1.0,1,908,,,,753,753,753.0,EEABGJJD_01789,EEABGJJD_01789,EEABGJJD_01789,EEABGJJD_01789,EEABGJJD_01789 +group_490,,SseB family protein,5,5,1.0,1,907,,,,753,753,753.0,EEABGJJD_01621,EEABGJJD_01621,EEABGJJD_01621,EEABGJJD_01621,EEABGJJD_01621 +group_489,,putative C3-degrading 
proteinase,5,5,1.0,1,906,,,,753,753,753.0,EEABGJJD_01538,EEABGJJD_01538,EEABGJJD_01538,EEABGJJD_01538,EEABGJJD_01538 +group_488,,putative chorismate mutase,5,5,1.0,1,905,,,,753,753,753.0,EEABGJJD_01318,EEABGJJD_01318,EEABGJJD_01318,EEABGJJD_01318,EEABGJJD_01318 +group_487,,ADP-ribosyltransferase,5,5,1.0,1,904,,,,753,753,753.0,EEABGJJD_00367,EEABGJJD_00367,EEABGJJD_00367,EEABGJJD_00367,EEABGJJD_00367 +group_486,,class I SAM-dependent methyltransferase,5,5,1.0,1,903,,,,756,756,756.0,EEABGJJD_01322,EEABGJJD_01322,EEABGJJD_01322,EEABGJJD_01322,EEABGJJD_01322 +phiSF3703_3,phiSF370.3_3,putative repressor - phage associated,5,5,1.0,1,902,,,,756,756,756.0,EEABGJJD_01241,EEABGJJD_01241,EEABGJJD_01241,EEABGJJD_01241,EEABGJJD_01241 +group_485,,DNA repair protein RecO,5,5,1.0,1,901,,,,756,756,756.0,EEABGJJD_00040,EEABGJJD_00040,EEABGJJD_00040,EEABGJJD_00040,EEABGJJD_00040 +divIVAS,divIVAS,cell-division initiation protein (septum placement),5,5,1.0,1,900,,,,759,759,759.0,EEABGJJD_01265,EEABGJJD_01265,EEABGJJD_01265,EEABGJJD_01265,EEABGJJD_01265 +pstB,pstB,phosphate ABC transporter (ATP-binding protein),5,5,1.0,1,899,,,,759,759,759.0,EEABGJJD_01042,EEABGJJD_01042,EEABGJJD_01042,EEABGJJD_01042,EEABGJJD_01042 +group_484,,ABC transporter ATP-binding protein,5,5,1.0,1,898,,,,759,759,759.0,EEABGJJD_00851,EEABGJJD_00851,EEABGJJD_00851,EEABGJJD_00851,EEABGJJD_00851 +mf2,mf2,putative DNase,5,5,1.0,1,897,,,,759,759,759.0,EEABGJJD_00590,EEABGJJD_00590,EEABGJJD_00590,EEABGJJD_00590,EEABGJJD_00590 +tpi,tpi,putative triosephosphate isomerase,5,5,1.0,1,896,,,,759,759,759.0,EEABGJJD_00510,EEABGJJD_00510,EEABGJJD_00510,EEABGJJD_00510,EEABGJJD_00510 +group_483,,hypothetical protein,5,5,1.0,1,895,,,,762,762,762.0,EEABGJJD_01158,EEABGJJD_01158,EEABGJJD_01158,EEABGJJD_01158,EEABGJJD_01158 +phiSF3701_1,phiSF370.1_1,putative cI-like repressor phage associated,5,5,1.0,1,894,,,,762,762,762.0,EEABGJJD_00544,EEABGJJD_00544,EEABGJJD_00544,EEABGJJD_00544,EEABGJJD_00544 +mecA,mecA,adaptor protein 
MecA,5,5,1.0,1,893,,,,762,762,762.0,EEABGJJD_00263,EEABGJJD_00263,EEABGJJD_00263,EEABGJJD_00263,EEABGJJD_00263 +thiD,thiD,putative phosphomethylpyrimidine kinase,5,5,1.0,1,892,,,,765,765,765.0,EEABGJJD_01579,EEABGJJD_01579,EEABGJJD_01579,EEABGJJD_01579,EEABGJJD_01579 +group_482,,TIGR01457 family HAD-type hydrolase,5,5,1.0,1,891,,,,765,765,765.0,EEABGJJD_00870,EEABGJJD_00870,EEABGJJD_00870,EEABGJJD_00870,EEABGJJD_00870 +group_481,,putative oxidoreductase,5,5,1.0,1,890,,,,765,765,765.0,EEABGJJD_00770,EEABGJJD_00770,EEABGJJD_00770,EEABGJJD_00770,EEABGJJD_00770 +group_480,,hypothetical protein,5,5,1.0,1,889,,,,768,768,768.0,EEABGJJD_01757,EEABGJJD_01757,EEABGJJD_01757,EEABGJJD_01757,EEABGJJD_01757 +phiSF3704_1,phiSF370.4_1,putative repressor protein - phage associated,5,5,1.0,1,888,,,,768,768,768.0,EEABGJJD_01755,EEABGJJD_01755,EEABGJJD_01755,EEABGJJD_01755,EEABGJJD_01755 +rpsB,rpsB,30S ribosomal protein S2,5,5,1.0,1,887,,,,768,768,768.0,EEABGJJD_01728,EEABGJJD_01728,EEABGJJD_01728,EEABGJJD_01728,EEABGJJD_01728 +group_479,,epoxyqueuosine reductase QueH,5,5,1.0,1,886,,,,768,768,768.0,EEABGJJD_00223,EEABGJJD_00223,EEABGJJD_00223,EEABGJJD_00223,EEABGJJD_00223 +accA,accA,putative acetyl-CoA carboxylase alpha subunit,5,5,1.0,1,885,,,,771,771,771.0,EEABGJJD_01457,EEABGJJD_01457,EEABGJJD_01457,EEABGJJD_01457,EEABGJJD_01457 +lacR1,lacR.1,putative lactose phosphotransferase system repressor protein,5,5,1.0,1,884,,,,771,771,771.0,EEABGJJD_01429,EEABGJJD_01429,EEABGJJD_01429,EEABGJJD_01429,EEABGJJD_01429 +group_478,,ABC transporter ATP-binding protein,5,5,1.0,1,883,,,,771,771,771.0,EEABGJJD_00265,EEABGJJD_00265,EEABGJJD_00265,EEABGJJD_00265,EEABGJJD_00265 +proC,proC,putative pyrroline carboxylate reductase,5,5,1.0,1,882,,,,771,771,771.0,EEABGJJD_00127,EEABGJJD_00127,EEABGJJD_00127,EEABGJJD_00127,EEABGJJD_00127 +group_477,,putative pyruvate formate-lyase activating enzyme,5,5,1.0,1,881,,,,774,774,774.0,EEABGJJD_01700,EEABGJJD_01700,EEABGJJD_01700,EEABGJJD_01700,EEABGJJD_01700 
+lacR2,lacR.2,putative lactose phosphotransferase system repressor protein,5,5,1.0,1,880,,,,774,774,774.0,EEABGJJD_01606,EEABGJJD_01606,EEABGJJD_01606,EEABGJJD_01606,EEABGJJD_01606 +cfa,cfa,CAMP factor,5,5,1.0,1,879,,,,774,774,774.0,EEABGJJD_01069,EEABGJJD_01069,EEABGJJD_01069,EEABGJJD_01069,EEABGJJD_01069 +spoJ,spoJ,putative chromosome segregation protein,5,5,1.0,1,878,,,,777,777,777.0,EEABGJJD_01835,EEABGJJD_01835,EEABGJJD_01835,EEABGJJD_01835,EEABGJJD_01835 +group_476,,regulatory protein RecX,5,5,1.0,1,877,,,,777,777,777.0,EEABGJJD_01339,EEABGJJD_01339,EEABGJJD_01339,EEABGJJD_01339,EEABGJJD_01339 +group_475,,methyltransferase,5,5,1.0,1,876,,,,777,777,777.0,EEABGJJD_01178,EEABGJJD_01178,EEABGJJD_01178,EEABGJJD_01178,EEABGJJD_01178 +potC,potC,putative spermidine/putrescine ABC transporter (permease protein),5,5,1.0,1,875,,,,777,777,777.0,EEABGJJD_00924,EEABGJJD_00924,EEABGJJD_00924,EEABGJJD_00924,EEABGJJD_00924 +udp,udp,putative uridine phosphorylase,5,5,1.0,1,874,,,,780,780,780.0,EEABGJJD_01553,EEABGJJD_01553,EEABGJJD_01553,EEABGJJD_01553,EEABGJJD_01553 +group_474,,putative oxidoreductase,5,5,1.0,1,873,,,,780,780,780.0,EEABGJJD_01372,EEABGJJD_01372,EEABGJJD_01372,EEABGJJD_01372,EEABGJJD_01372 +group_473,,formate/nitrite transporter,5,5,1.0,1,872,,,,780,780,780.0,EEABGJJD_01187,EEABGJJD_01187,EEABGJJD_01187,EEABGJJD_01187,EEABGJJD_01187 +speI,speI,streptococcal exotoxin I,5,5,1.0,1,871,,,,780,780,780.0,EEABGJJD_00842,EEABGJJD_00842,EEABGJJD_00842,EEABGJJD_00842,EEABGJJD_00842 +group_472,,phage tail protein,5,5,1.0,1,870,,,,780,780,780.0,EEABGJJD_00832,EEABGJJD_00832,EEABGJJD_00832,EEABGJJD_00832,EEABGJJD_00832 +group_471,,prolipoprotein diacylglyceryl transferase,5,5,1.0,1,869,,,,780,780,780.0,EEABGJJD_00490,EEABGJJD_00490,EEABGJJD_00490,EEABGJJD_00490,EEABGJJD_00490 +group_470,,metal-dependent hydrolase,5,5,1.0,1,868,,,,780,780,780.0,EEABGJJD_00206,EEABGJJD_00206,EEABGJJD_00206,EEABGJJD_00206,EEABGJJD_00206 +codY,codY,putative transcriptional pleiotropic 
repressor,5,5,1.0,1,867,,,,783,783,783.0,EEABGJJD_01483,EEABGJJD_01483,EEABGJJD_01483,EEABGJJD_01483,EEABGJJD_01483 +agaW,agaW,putative PTS dependent N-acetyl-galactosamine-IIC component,5,5,1.0,1,866,,,,783,783,783.0,EEABGJJD_00520,EEABGJJD_00520,EEABGJJD_00520,EEABGJJD_00520,EEABGJJD_00520 +fhuA,fhuA,putative ferrichrome ABC transporter (ATP-binding protein),5,5,1.0,1,865,,,,783,783,783.0,EEABGJJD_00345,EEABGJJD_00345,EEABGJJD_00345,EEABGJJD_00345,EEABGJJD_00345 +group_469,,ABC transporter permease,5,5,1.0,1,864,,,,786,786,786.0,EEABGJJD_01081,EEABGJJD_01081,EEABGJJD_01081,EEABGJJD_01081,EEABGJJD_01081 +estA,estA,putative tributyrin esterase,5,5,1.0,1,863,,,,786,786,786.0,EEABGJJD_00853,EEABGJJD_00853,EEABGJJD_00853,EEABGJJD_00853,EEABGJJD_00853 +group_468,,multidrug ABC transporter permease,5,5,1.0,1,862,,,,786,786,786.0,EEABGJJD_00439,EEABGJJD_00439,EEABGJJD_00439,EEABGJJD_00439,EEABGJJD_00439 +group_467,,putative myo-inositol-1(or 4)-monophosphatase,5,5,1.0,1,861,,,,789,789,789.0,EEABGJJD_01048,EEABGJJD_01048,EEABGJJD_01048,EEABGJJD_01048,EEABGJJD_01048 +group_466,,putative two-component response regulator,5,5,1.0,1,860,,,,789,789,789.0,EEABGJJD_00886,EEABGJJD_00886,EEABGJJD_00886,EEABGJJD_00886,EEABGJJD_00886 +group_465,,Nif3-like dinuclear metal center hexameric protein,5,5,1.0,1,859,,,,789,789,789.0,EEABGJJD_00776,EEABGJJD_00776,EEABGJJD_00776,EEABGJJD_00776,EEABGJJD_00776 +group_464,,histidine protein kinase,5,5,1.0,1,858,,,,789,789,789.0,EEABGJJD_00742,EEABGJJD_00742,EEABGJJD_00742,EEABGJJD_00742,EEABGJJD_00742 +phaB,phaB,putative enoyl CoA hydratase,5,5,1.0,1,857,,,,792,792,792.0,EEABGJJD_01469,EEABGJJD_01469,EEABGJJD_01469,EEABGJJD_01469,EEABGJJD_01469 +group_463,,aminoglycoside phosphotransferase,5,5,1.0,1,856,,,,792,792,792.0,EEABGJJD_01442,EEABGJJD_01442,EEABGJJD_01442,EEABGJJD_01442,EEABGJJD_01442 +group_462,,RNA-binding protein,5,5,1.0,1,855,,,,792,792,792.0,EEABGJJD_01266,EEABGJJD_01266,EEABGJJD_01266,EEABGJJD_01266,EEABGJJD_01266 
+rnh,rnh,putative ribonuclease HII,5,5,1.0,1,854,,,,792,792,792.0,EEABGJJD_00977,EEABGJJD_00977,EEABGJJD_00977,EEABGJJD_00977,EEABGJJD_00977 +group_461,,glutamine amidotransferase,5,5,1.0,1,853,,,,792,792,792.0,EEABGJJD_00862,EEABGJJD_00862,EEABGJJD_00862,EEABGJJD_00862,EEABGJJD_00862 +bet,bet,phage recombination protein Bet,5,5,1.0,1,852,,,,792,792,792.0,EEABGJJD_00798,EEABGJJD_00798,EEABGJJD_00798,EEABGJJD_00798,EEABGJJD_00798 +group_460,,TatD family deoxyribonuclease,5,5,1.0,1,851,,,,792,792,792.0,EEABGJJD_00244,EEABGJJD_00244,EEABGJJD_00244,EEABGJJD_00244,EEABGJJD_00244 +cdsA,cdsA,putative phosphatidate cytidylyltransferase (CDP-diglyceride synthase),5,5,1.0,1,850,,,,795,795,795.0,EEABGJJD_01635,EEABGJJD_01635,EEABGJJD_01635,EEABGJJD_01635,EEABGJJD_01635 +potB,potB,putative spermidine / putrescine ABC transporter (permease protein),5,5,1.0,1,849,,,,795,795,795.0,EEABGJJD_00923,EEABGJJD_00923,EEABGJJD_00923,EEABGJJD_00923,EEABGJJD_00923 +idnO,idnO,putative 5-keto-D-gluconate 5-reductase,5,5,1.0,1,848,,,,795,795,795.0,EEABGJJD_00524,EEABGJJD_00524,EEABGJJD_00524,EEABGJJD_00524,EEABGJJD_00524 +glr,glr,putative glutamate racemase,5,5,1.0,1,847,,,,795,795,795.0,EEABGJJD_00324,EEABGJJD_00324,EEABGJJD_00324,EEABGJJD_00324,EEABGJJD_00324 +group_459,,hypothetical protein,5,5,1.0,1,846,,,,798,798,798.0,EEABGJJD_01616,EEABGJJD_01616,EEABGJJD_01616,EEABGJJD_01616,EEABGJJD_01616 +phiSF3703_1,phiSF370.3_1,hypothetical protein phage associated,5,5,1.0,1,845,,,,798,798,798.0,EEABGJJD_01229,EEABGJJD_01229,EEABGJJD_01229,EEABGJJD_01229,EEABGJJD_01229 +group_458,,Cof-type HAD-IIB family hydrolase,5,5,1.0,1,844,,,,798,798,798.0,EEABGJJD_00477,EEABGJJD_00477,EEABGJJD_00477,EEABGJJD_00477,EEABGJJD_00477 +group_457,,energy-coupling factor transporter transmembrane protein EcfT,5,5,1.0,1,843,,,,801,801,801.0,EEABGJJD_01811,EEABGJJD_01811,EEABGJJD_01811,EEABGJJD_01811,EEABGJJD_01811 +sda3,sda3,streptodornase 
Sda3,5,5,1.0,1,842,,,,801,801,801.0,EEABGJJD_01194,EEABGJJD_01194,EEABGJJD_01194,EEABGJJD_01194,EEABGJJD_01194 +group_456,,putative transcription regulator (LacI family),5,5,1.0,1,841,,,,801,801,801.0,EEABGJJD_01091,EEABGJJD_01091,EEABGJJD_01091,EEABGJJD_01091,EEABGJJD_01091 +folP,folP,dihydropteroate synthase,5,5,1.0,1,840,,,,801,801,801.0,EEABGJJD_00918,EEABGJJD_00918,EEABGJJD_00918,EEABGJJD_00918,EEABGJJD_00918 +dppD,dppD,ATPase protein,5,5,1.0,1,839,,,,804,804,804.0,EEABGJJD_01664,EEABGJJD_01664,EEABGJJD_01664,EEABGJJD_01664,EEABGJJD_01664 +group_455,,amino acid ABC transporter permease,5,5,1.0,1,838,,,,804,804,804.0,EEABGJJD_01389,EEABGJJD_01389,EEABGJJD_01389,EEABGJJD_01389,EEABGJJD_01389 +pstB2,pstB2,putative phosphate ABC transporter (ATP-binding protein),5,5,1.0,1,837,,,,804,804,804.0,EEABGJJD_01043,EEABGJJD_01043,EEABGJJD_01043,EEABGJJD_01043,EEABGJJD_01043 +group_454,,glycosyl transferase family 9,5,5,1.0,1,836,,,,804,804,804.0,EEABGJJD_00654,EEABGJJD_00654,EEABGJJD_00654,EEABGJJD_00654,EEABGJJD_00654 +group_453,,IS3 family transposase,5,5,1.0,1,835,,,,807,807,807.0,EEABGJJD_01121,EEABGJJD_01121,EEABGJJD_01121,EEABGJJD_01121,EEABGJJD_01121 +phiMGAS50052_4,phiMGAS5005.2_4,phage protein,5,5,1.0,1,834,,,,807,807,807.0,EEABGJJD_00790,EEABGJJD_00790,EEABGJJD_00790,EEABGJJD_00790,EEABGJJD_00790 +group_452,,putative cyclophilin-type protein,5,5,1.0,1,833,,,,807,807,807.0,EEABGJJD_00387,EEABGJJD_00387,EEABGJJD_00387,EEABGJJD_00387,EEABGJJD_00387 +group_451,,putative transposase IS861,5,5,1.0,1,832,,,,807,807,807.0,EEABGJJD_00200,EEABGJJD_00200,EEABGJJD_00200,EEABGJJD_00200,EEABGJJD_00200 +group_450,,PTS mannose/fructose/sorbose transporter subunit IIC,5,5,1.0,1,831,,,,810,810,810.0,EEABGJJD_01453,EEABGJJD_01453,EEABGJJD_01453,EEABGJJD_01453,EEABGJJD_01453 +group_449,,PrsW family intramembrane metalloprotease,5,5,1.0,1,830,,,,810,810,810.0,EEABGJJD_01143,EEABGJJD_01143,EEABGJJD_01143,EEABGJJD_01143,EEABGJJD_01143 +group_448,,PTS 
mannose/fructose/sorbose/N-acetylgalactosamine transporter subunit IIC,5,5,1.0,1,829,,,,810,810,810.0,EEABGJJD_00883,EEABGJJD_00883,EEABGJJD_00883,EEABGJJD_00883,EEABGJJD_00883 +group_447,,MazF family toxin-antitoxin system,5,5,1.0,1,828,,,,810,810,810.0,EEABGJJD_00782,EEABGJJD_00782,EEABGJJD_00782,EEABGJJD_00782,EEABGJJD_00782 +punA,punA,putative purine nucleoside phosphorylase,5,5,1.0,1,827,,,,810,810,810.0,EEABGJJD_00740,EEABGJJD_00740,EEABGJJD_00740,EEABGJJD_00740,EEABGJJD_00740 +group_446,,sugar phosphate phosphatase,5,5,1.0,1,826,,,,810,810,810.0,EEABGJJD_00513,EEABGJJD_00513,EEABGJJD_00513,EEABGJJD_00513,EEABGJJD_00513 +group_445,,MBL fold metallo-hydrolase,5,5,1.0,1,825,,,,810,810,810.0,EEABGJJD_00445,EEABGJJD_00445,EEABGJJD_00445,EEABGJJD_00445,EEABGJJD_00445 +group_444,,Membrane protein insertase YidC 1,5,5,1.0,1,824,,,,810,810,810.0,EEABGJJD_00233,EEABGJJD_00233,EEABGJJD_00233,EEABGJJD_00233,EEABGJJD_00233 +group_443,,Cof-type HAD-IIB family hydrolase,5,5,1.0,1,823,,,,813,813,813.0,EEABGJJD_01451,EEABGJJD_01451,EEABGJJD_01451,EEABGJJD_01451,EEABGJJD_01451 +group_442,,protein-ADP-ribose hydrolase,5,5,1.0,1,822,,,,813,813,813.0,EEABGJJD_01021,EEABGJJD_01021,EEABGJJD_01021,EEABGJJD_01021,EEABGJJD_01021 +group_441,,ABC transporter substrate-binding protein,5,5,1.0,1,821,,,,813,813,813.0,EEABGJJD_00645,EEABGJJD_00645,EEABGJJD_00645,EEABGJJD_00645,EEABGJJD_00645 +group_440,,streptodornase,5,5,1.0,1,820,,,,816,816,816.0,EEABGJJD_01690,EEABGJJD_01690,EEABGJJD_01690,EEABGJJD_01690,EEABGJJD_01690 +group_439,,PTS sugar transporter subunit IIC,5,5,1.0,1,819,,,,816,816,816.0,EEABGJJD_01108,EEABGJJD_01108,EEABGJJD_01108,EEABGJJD_01108,EEABGJJD_01108 +group_438,,maltodextrose utilization protein MalA,5,5,1.0,1,818,,,,816,816,816.0,EEABGJJD_01092,EEABGJJD_01092,EEABGJJD_01092,EEABGJJD_01092,EEABGJJD_01092 +group_437,,TIGR03943 family protein,5,5,1.0,1,817,,,,816,816,816.0,EEABGJJD_00484,EEABGJJD_00484,EEABGJJD_00484,EEABGJJD_00484,EEABGJJD_00484 +adcB,adcB,ABC 
transporter permease,5,5,1.0,1,816,,,,816,816,816.0,EEABGJJD_00111,EEABGJJD_00111,EEABGJJD_00111,EEABGJJD_00111,EEABGJJD_00111 +group_436,,endonuclease/exonuclease/phosphatase,5,5,1.0,1,815,,,,819,819,819.0,EEABGJJD_01648,EEABGJJD_01648,EEABGJJD_01648,EEABGJJD_01648,EEABGJJD_01648 +phiNCTC81984_4,phiNCTC8198.4_4,N4-gp56 family major capsid protein,5,5,1.0,1,814,,,,819,819,819.0,EEABGJJD_00571,EEABGJJD_00571,EEABGJJD_00571,EEABGJJD_00571,EEABGJJD_00571 +group_435,,antibiotic ABC transporter permease,5,5,1.0,1,813,,,,819,819,819.0,EEABGJJD_00438,EEABGJJD_00438,EEABGJJD_00438,EEABGJJD_00438,EEABGJJD_00438 +dppC,dppC,transmembrane transport protein,5,5,1.0,1,812,,,,822,822,822.0,EEABGJJD_01663,EEABGJJD_01663,EEABGJJD_01663,EEABGJJD_01663,EEABGJJD_01663 +proB,proB,putative gamma-glutamyl kinase,5,5,1.0,1,811,,,,822,822,822.0,EEABGJJD_01397,EEABGJJD_01397,EEABGJJD_01397,EEABGJJD_01397,EEABGJJD_01397 +agaD,agaD,putative PTS dependent galactosamine IID component,5,5,1.0,1,810,,,,822,822,822.0,EEABGJJD_00519,EEABGJJD_00519,EEABGJJD_00519,EEABGJJD_00519,EEABGJJD_00519 +group_434,,Cof-type HAD-IIB family hydrolase,5,5,1.0,1,809,,,,825,825,825.0,EEABGJJD_01419,EEABGJJD_01419,EEABGJJD_01419,EEABGJJD_01419,EEABGJJD_01419 +nadE,nadE,putative NAD+ synthase,5,5,1.0,1,808,,,,825,825,825.0,EEABGJJD_01383,EEABGJJD_01383,EEABGJJD_01383,EEABGJJD_01383,EEABGJJD_01383 +group_433,,Cof-type HAD-IIB family hydrolase,5,5,1.0,1,807,,,,825,825,825.0,EEABGJJD_00478,EEABGJJD_00478,EEABGJJD_00478,EEABGJJD_00478,EEABGJJD_00478 +group_432,,asparagine ligase A,5,5,1.0,1,806,,,,825,825,825.0,EEABGJJD_00470,EEABGJJD_00470,EEABGJJD_00470,EEABGJJD_00470,EEABGJJD_00470 +hlyA1,hlyA1,putative hemolysin,5,5,1.0,1,805,,,,828,828,828.0,EEABGJJD_01251,EEABGJJD_01251,EEABGJJD_01251,EEABGJJD_01251,EEABGJJD_01251 +group_431,,putative phospotransferase system (PTS) enzyme II component D,5,5,1.0,1,804,,,,828,828,828.0,EEABGJJD_00884,EEABGJJD_00884,EEABGJJD_00884,EEABGJJD_00884,EEABGJJD_00884 +fpg,fpg,putative 
formamidopyrimidine-DNA glycosylase,5,5,1.0,1,803,,,,828,828,828.0,EEABGJJD_00418,EEABGJJD_00418,EEABGJJD_00418,EEABGJJD_00418,EEABGJJD_00418 +exoA,exoA,putative 3'-exo-deoxyribonuclease,5,5,1.0,1,802,,,,828,828,828.0,EEABGJJD_00359,EEABGJJD_00359,EEABGJJD_00359,EEABGJJD_00359,EEABGJJD_00359 +group_430,,MurR/RpiR family transcriptional regulator,5,5,1.0,1,801,,,,831,831,831.0,EEABGJJD_00243,EEABGJJD_00243,EEABGJJD_00243,EEABGJJD_00243,EEABGJJD_00243 +group_429,,putative sugar transport protein (permease),5,5,1.0,1,800,,,,831,831,831.0,EEABGJJD_00239,EEABGJJD_00239,EEABGJJD_00239,EEABGJJD_00239,EEABGJJD_00239 +rplB,rplB,50S ribosomal protein L2,5,5,1.0,1,799,,,,834,834,834.0,EEABGJJD_00069,EEABGJJD_00069,EEABGJJD_00069,EEABGJJD_00069,EEABGJJD_00069 +group_428,,ABC transporter ATP-binding protein,5,5,1.0,1,798,,,,837,837,837.0,EEABGJJD_01496,EEABGJJD_01496,EEABGJJD_01496,EEABGJJD_01496,EEABGJJD_01496 +malG,malG,putative maltose/maltodextrin ABC transport system (permease),5,5,1.0,1,797,,,,837,837,837.0,EEABGJJD_01090,EEABGJJD_01090,EEABGJJD_01090,EEABGJJD_01090,EEABGJJD_01090 +group_427,,amino acid ABC transporter substrate-binding protein,5,5,1.0,1,796,,,,837,837,837.0,EEABGJJD_01070,EEABGJJD_01070,EEABGJJD_01070,EEABGJJD_01070,EEABGJJD_01070 +group_426,,lipoate--protein ligase family protein,5,5,1.0,1,795,,,,837,837,837.0,EEABGJJD_01025,EEABGJJD_01025,EEABGJJD_01025,EEABGJJD_01025,EEABGJJD_01025 +group_425,,DNA-protecting protein DprA,5,5,1.0,1,794,,,,837,837,837.0,EEABGJJD_00978,EEABGJJD_00978,EEABGJJD_00978,EEABGJJD_00978,EEABGJJD_00978 +group_424,,NAD(+) kinase,5,5,1.0,1,793,,,,837,837,837.0,EEABGJJD_00943,EEABGJJD_00943,EEABGJJD_00943,EEABGJJD_00943,EEABGJJD_00943 +group_423,,sugar phosphate isomerase/epimerase,5,5,1.0,1,792,,,,837,837,837.0,EEABGJJD_00450,EEABGJJD_00450,EEABGJJD_00450,EEABGJJD_00450,EEABGJJD_00450 +group_422,,ABC transporter (ATP-binding 
protein),5,5,1.0,1,791,,,,840,840,840.0,EEABGJJD_01813,EEABGJJD_01813,EEABGJJD_01813,EEABGJJD_01813,EEABGJJD_01813 +group_421,,putative reductase / dehydrogenase,5,5,1.0,1,790,,,,840,840,840.0,EEABGJJD_01413,EEABGJJD_01413,EEABGJJD_01413,EEABGJJD_01413,EEABGJJD_01413 +group_420,,DegV family protein,5,5,1.0,1,789,,,,840,840,840.0,EEABGJJD_01247,EEABGJJD_01247,EEABGJJD_01247,EEABGJJD_01247,EEABGJJD_01247 +hemK,hemK,putative protoporphyrinogen oxidase,5,5,1.0,1,788,,,,840,840,840.0,EEABGJJD_00958,EEABGJJD_00958,EEABGJJD_00958,EEABGJJD_00958,EEABGJJD_00958 +group_419,,hypothetical protein,5,5,1.0,1,787,,,,840,840,840.0,EEABGJJD_00785,EEABGJJD_00785,EEABGJJD_00785,EEABGJJD_00785,EEABGJJD_00785 +thyA,thyA,putative thymidylate synthase,5,5,1.0,1,786,,,,840,840,840.0,EEABGJJD_00730,EEABGJJD_00730,EEABGJJD_00730,EEABGJJD_00730,EEABGJJD_00730 +bacA,bacA,putative undecaprenol kinase (bacitracin resistance protein),5,5,1.0,1,785,,,,840,840,840.0,EEABGJJD_00262,EEABGJJD_00262,EEABGJJD_00262,EEABGJJD_00262,EEABGJJD_00262 +purR,purR,putative purine operon repressor,5,5,1.0,1,784,,,,840,840,840.0,EEABGJJD_00252,EEABGJJD_00252,EEABGJJD_00252,EEABGJJD_00252,EEABGJJD_00252 +group_418,,putative ABC transporter (ATP-binding protein),5,5,1.0,1,783,,,,843,843,843.0,EEABGJJD_01812,EEABGJJD_01812,EEABGJJD_01812,EEABGJJD_01812,EEABGJJD_01812 +group_417,,hypothetical protein,5,5,1.0,1,782,,,,843,843,843.0,EEABGJJD_01792,EEABGJJD_01792,EEABGJJD_01792,EEABGJJD_01792,EEABGJJD_01792 +ropB,ropB,putative transcription regulator,5,5,1.0,1,781,,,,843,843,843.0,EEABGJJD_01689,EEABGJJD_01689,EEABGJJD_01689,EEABGJJD_01689,EEABGJJD_01689 +group_416,,fatty acid-binding protein DegV,5,5,1.0,1,780,,,,843,843,843.0,EEABGJJD_01417,EEABGJJD_01417,EEABGJJD_01417,EEABGJJD_01417,EEABGJJD_01417 +group_415,,transcription antiterminator BglG,5,5,1.0,1,779,,,,843,843,843.0,EEABGJJD_00481,EEABGJJD_00481,EEABGJJD_00481,EEABGJJD_00481,EEABGJJD_00481 +group_414,,ABC transporter substrate-binding 
protein,5,5,1.0,1,778,,,,843,843,843.0,EEABGJJD_00292,EEABGJJD_00292,EEABGJJD_00292,EEABGJJD_00292,EEABGJJD_00292 +group_413,,mechanosensitive ion channel family protein,5,5,1.0,1,777,,,,846,846,846.0,EEABGJJD_01576,EEABGJJD_01576,EEABGJJD_01576,EEABGJJD_01576,EEABGJJD_01576 +group_412,,hypothetical protein,5,5,1.0,1,776,,,,846,846,846.0,EEABGJJD_00459,EEABGJJD_00459,EEABGJJD_00459,EEABGJJD_00459,EEABGJJD_00459 +group_411,,ABC transporter substrate-binding protein,5,5,1.0,1,775,,,,846,846,846.0,EEABGJJD_00293,EEABGJJD_00293,EEABGJJD_00293,EEABGJJD_00293,EEABGJJD_00293 +lrp,lrp,leucine-rich protein,5,5,1.0,1,774,,,,849,849,849.0,EEABGJJD_01642,EEABGJJD_01642,EEABGJJD_01642,EEABGJJD_01642,EEABGJJD_01642 +glpF2,glpF.2,putative glycerol uptake facilitator protein,5,5,1.0,1,773,,,,849,849,849.0,EEABGJJD_01540,EEABGJJD_01540,EEABGJJD_01540,EEABGJJD_01540,EEABGJJD_01540 +group_410,,type I-C CRISPR-associated protein Cas7/Csd2,5,5,1.0,1,772,,,,849,849,849.0,EEABGJJD_01307,EEABGJJD_01307,EEABGJJD_01307,EEABGJJD_01307,EEABGJJD_01307 +group_409,,ribosome biogenesis GTPase YlqF,5,5,1.0,1,771,,,,849,849,849.0,EEABGJJD_00976,EEABGJJD_00976,EEABGJJD_00976,EEABGJJD_00976,EEABGJJD_00976 +group_408,,fatty acid-binding protein DegV,5,5,1.0,1,770,,,,849,849,849.0,EEABGJJD_00716,EEABGJJD_00716,EEABGJJD_00716,EEABGJJD_00716,EEABGJJD_00716 +group_407,,N-acetylmuramoyl-L-alanine amidase,5,5,1.0,1,769,,,,849,849,849.0,EEABGJJD_00502,EEABGJJD_00502,EEABGJJD_00502,EEABGJJD_00502,EEABGJJD_00502 +group_406,,TIGR00159 family protein,5,5,1.0,1,768,,,,852,852,852.0,EEABGJJD_00864,EEABGJJD_00864,EEABGJJD_00864,EEABGJJD_00864,EEABGJJD_00864 +lppC,lppC,putative acid phosphatase,5,5,1.0,1,767,,,,855,855,855.0,EEABGJJD_01565,EEABGJJD_01565,EEABGJJD_01565,EEABGJJD_01565,EEABGJJD_01565 +folD,folD,putative bifunctional methylenetetrahydrofolate dehydrogenase / methenyltetrahydrofolate cyclohydrolase,5,5,1.0,1,766,,,,855,855,855.0,EEABGJJD_01255,EEABGJJD_01255,EEABGJJD_01255,EEABGJJD_01255,EEABGJJD_01255 
+group_405,,dTDP-4-dehydrorhamnose reductase,5,5,1.0,1,765,,,,855,855,855.0,EEABGJJD_00651,EEABGJJD_00651,EEABGJJD_00651,EEABGJJD_00651,EEABGJJD_00651 +group_404,,metal ABC transporter permease,5,5,1.0,1,764,,,,855,855,855.0,EEABGJJD_00386,EEABGJJD_00386,EEABGJJD_00386,EEABGJJD_00386,EEABGJJD_00386 +phiSF3704_2,phiSF370.4_2,putative replication protein,5,5,1.0,1,763,,,,858,858,858.0,EEABGJJD_01765,EEABGJJD_01765,EEABGJJD_01765,EEABGJJD_01765,EEABGJJD_01765 +group_403,,glyoxalase,5,5,1.0,1,762,,,,858,858,858.0,EEABGJJD_01735,EEABGJJD_01735,EEABGJJD_01735,EEABGJJD_01735,EEABGJJD_01735 +group_402,,S1 RNA-binding protein,5,5,1.0,1,761,,,,858,858,858.0,EEABGJJD_00394,EEABGJJD_00394,EEABGJJD_00394,EEABGJJD_00394,EEABGJJD_00394 +group_401,,DegV family protein,5,5,1.0,1,760,,,,861,861,861.0,EEABGJJD_01613,EEABGJJD_01613,EEABGJJD_01613,EEABGJJD_01613,EEABGJJD_01613 +group_400,,lipase,5,5,1.0,1,759,,,,861,861,861.0,EEABGJJD_01246,EEABGJJD_01246,EEABGJJD_01246,EEABGJJD_01246,EEABGJJD_01246 +map,map,putative methionine aminopeptidase,5,5,1.0,1,758,,,,861,861,861.0,EEABGJJD_01133,EEABGJJD_01133,EEABGJJD_01133,EEABGJJD_01133,EEABGJJD_01133 +malD,malD,putative maltodextrin transport system permease,5,5,1.0,1,757,,,,861,861,861.0,EEABGJJD_01093,EEABGJJD_01093,EEABGJJD_01093,EEABGJJD_01093,EEABGJJD_01093 +group_399,,sugar transporter,5,5,1.0,1,756,,,,864,864,864.0,EEABGJJD_01823,EEABGJJD_01823,EEABGJJD_01823,EEABGJJD_01823,EEABGJJD_01823 +group_398,,putative ABC transporter (binding protein),5,5,1.0,1,755,,,,864,864,864.0,EEABGJJD_00748,EEABGJJD_00748,EEABGJJD_00748,EEABGJJD_00748,EEABGJJD_00748 +group_397,,rRNA (cytidine-2'-O-)-methyltransferase,5,5,1.0,1,754,,,,864,864,864.0,EEABGJJD_00355,EEABGJJD_00355,EEABGJJD_00355,EEABGJJD_00355,EEABGJJD_00355 +pflC,pflC,putative pyruvate-formate lyase activating enzyme,5,5,1.0,1,753,,,,864,864,864.0,EEABGJJD_00339,EEABGJJD_00339,EEABGJJD_00339,EEABGJJD_00339,EEABGJJD_00339 +group_396,,hypothetical 
protein,5,5,1.0,1,752,,,,864,864,864.0,EEABGJJD_00202,EEABGJJD_00202,EEABGJJD_00202,EEABGJJD_00202,EEABGJJD_00202 +group_395,,putative hexulose-6-phosphate isomerase,5,5,1.0,1,751,,,,864,864,864.0,EEABGJJD_00183,EEABGJJD_00183,EEABGJJD_00183,EEABGJJD_00183,EEABGJJD_00183 +accD,accD,acetyl-CoA carboxylase carboxyl transferase subunit beta,5,5,1.0,1,750,,,,867,867,867.0,EEABGJJD_01458,EEABGJJD_01458,EEABGJJD_01458,EEABGJJD_01458,EEABGJJD_01458 +group_394,,phosphate-binding protein,5,5,1.0,1,749,,,,867,867,867.0,EEABGJJD_01046,EEABGJJD_01046,EEABGJJD_01046,EEABGJJD_01046,EEABGJJD_01046 +pdxK,pdxK,putative pyridoxal kinase,5,5,1.0,1,748,,,,867,867,867.0,EEABGJJD_01014,EEABGJJD_01014,EEABGJJD_01014,EEABGJJD_01014,EEABGJJD_01014 +mutR,mutR,putative positive transcriptional regulator,5,5,1.0,1,747,,,,867,867,867.0,EEABGJJD_00417,EEABGJJD_00417,EEABGJJD_00417,EEABGJJD_00417,EEABGJJD_00417 +group_393,,type II CRISPR-associated endonuclease Cas1,5,5,1.0,1,746,,,,870,870,870.0,EEABGJJD_00873,EEABGJJD_00873,EEABGJJD_00873,EEABGJJD_00873,EEABGJJD_00873 +group_392,,putative ABC transport protein (permease),5,5,1.0,1,745,,,,870,870,870.0,EEABGJJD_00850,EEABGJJD_00850,EEABGJJD_00850,EEABGJJD_00850,EEABGJJD_00850 +cpsFO,cpsFO,glucose-1-phosphate thymidyl transferase,5,5,1.0,1,744,,,,870,870,870.0,EEABGJJD_00778,EEABGJJD_00778,EEABGJJD_00778,EEABGJJD_00778,EEABGJJD_00778 +group_391,,YitT family protein,5,5,1.0,1,743,,,,873,873,873.0,EEABGJJD_01826,EEABGJJD_01826,EEABGJJD_01826,EEABGJJD_01826,EEABGJJD_01826 +sdhA,sdhA,putative L-serine dehydratase alpha subunit,5,5,1.0,1,742,,,,873,873,873.0,EEABGJJD_01809,EEABGJJD_01809,EEABGJJD_01809,EEABGJJD_01809,EEABGJJD_01809 +group_390,,YitT family protein,5,5,1.0,1,741,,,,873,873,873.0,EEABGJJD_01779,EEABGJJD_01779,EEABGJJD_01779,EEABGJJD_01779,EEABGJJD_01779 +fps,fps,putative geranyltranstransferase (farnesyl diphosphate synthase),5,5,1.0,1,740,,,,873,873,873.0,EEABGJJD_01252,EEABGJJD_01252,EEABGJJD_01252,EEABGJJD_01252,EEABGJJD_01252 
+aroE2,aroE.2,putative shikimate 5-dehydrogenase,5,5,1.0,1,739,,,,873,873,873.0,EEABGJJD_00449,EEABGJJD_00449,EEABGJJD_00449,EEABGJJD_00449,EEABGJJD_00449 +group_389,,putative positive regulator,5,5,1.0,1,738,,,,873,873,873.0,EEABGJJD_00448,EEABGJJD_00448,EEABGJJD_00448,EEABGJJD_00448,EEABGJJD_00448 +group_388,,GTPase A,5,5,1.0,1,737,,,,873,873,873.0,EEABGJJD_00247,EEABGJJD_00247,EEABGJJD_00247,EEABGJJD_00247,EEABGJJD_00247 +rsmA,rsmA,16S rRNA (adenine(1518)-N(6)/adenine(1519)-N(6))- dimethyltransferase RsmA,5,5,1.0,1,736,,,,873,873,873.0,EEABGJJD_00246,EEABGJJD_00246,EEABGJJD_00246,EEABGJJD_00246,EEABGJJD_00246 +nadC,nadC,putative nicotinate-nucleotide pyrophosphorylase,5,5,1.0,1,735,,,,873,873,873.0,EEABGJJD_00199,EEABGJJD_00199,EEABGJJD_00199,EEABGJJD_00199,EEABGJJD_00199 +group_387,,putative heat shock protein HSP33,5,5,1.0,1,734,,,,873,873,873.0,EEABGJJD_00135,EEABGJJD_00135,EEABGJJD_00135,EEABGJJD_00135,EEABGJJD_00135 +group_386,,heme-binding protein,5,5,1.0,1,733,,,,876,876,876.0,EEABGJJD_01499,EEABGJJD_01499,EEABGJJD_01499,EEABGJJD_01499,EEABGJJD_01499 +czcD,czcD,putative cation-efflux system membrane protein,5,5,1.0,1,732,,,,876,876,876.0,EEABGJJD_00700,EEABGJJD_00700,EEABGJJD_00700,EEABGJJD_00700,EEABGJJD_00700 +atpG,atpG,putative proton-translocating ATPase gamma subunit,5,5,1.0,1,731,,,,876,876,876.0,EEABGJJD_00629,EEABGJJD_00629,EEABGJJD_00629,EEABGJJD_00629,EEABGJJD_00629 +group_385,,putative DNA polymerase III delta' subunit,5,5,1.0,1,730,,,,876,876,876.0,EEABGJJD_00352,EEABGJJD_00352,EEABGJJD_00352,EEABGJJD_00352,EEABGJJD_00352 +group_384,,YitT family protein,5,5,1.0,1,729,,,,879,879,879.0,EEABGJJD_01780,EEABGJJD_01780,EEABGJJD_01780,EEABGJJD_01780,EEABGJJD_01780 +group_383,,putative transcriptional regulator,5,5,1.0,1,728,,,,879,879,879.0,EEABGJJD_01331,EEABGJJD_01331,EEABGJJD_01331,EEABGJJD_01331,EEABGJJD_01331 +aroE,aroE,putative shikimate 
5-dehydrogenase,5,5,1.0,1,727,,,,879,879,879.0,EEABGJJD_01323,EEABGJJD_01323,EEABGJJD_01323,EEABGJJD_01323,EEABGJJD_01323 +group_382,,membrane protein,5,5,1.0,1,726,,,,879,879,879.0,EEABGJJD_00166,EEABGJJD_00166,EEABGJJD_00166,EEABGJJD_00166,EEABGJJD_00166 +fba,fba,putative fructose-bisphosphate aldolase,5,5,1.0,1,725,,,,882,882,882.0,EEABGJJD_01570,EEABGJJD_01570,EEABGJJD_01570,EEABGJJD_01570,EEABGJJD_01570 +group_381,,deacetylase SIR2,5,5,1.0,1,724,,,,882,882,882.0,EEABGJJD_01020,EEABGJJD_01020,EEABGJJD_01020,EEABGJJD_01020,EEABGJJD_01020 +group_380,,putative collagen-like protein,5,5,1.0,1,723,,,,882,882,882.0,EEABGJJD_00878,EEABGJJD_00878,EEABGJJD_00878,EEABGJJD_00878,EEABGJJD_00878 +group_379,,fructokinase,5,5,1.0,1,722,,,,885,885,885.0,EEABGJJD_01507,EEABGJJD_01507,EEABGJJD_01507,EEABGJJD_01507,EEABGJJD_01507 +group_378,,heme ABC transporter substrate-binding protein IsdE,5,5,1.0,1,721,,,,885,885,885.0,EEABGJJD_01498,EEABGJJD_01498,EEABGJJD_01498,EEABGJJD_01498,EEABGJJD_01498 +group_377,,tRNA pseudouridine(55) synthase TruB,5,5,1.0,1,720,,,,885,885,885.0,EEABGJJD_01052,EEABGJJD_01052,EEABGJJD_01052,EEABGJJD_01052,EEABGJJD_01052 +citG,citG,CitG-like protein,5,5,1.0,1,719,,,,885,885,885.0,EEABGJJD_00990,EEABGJJD_00990,EEABGJJD_00990,EEABGJJD_00990,EEABGJJD_00990 +group_376,,hypothetical protein,5,5,1.0,1,718,,,,885,885,885.0,EEABGJJD_00462,EEABGJJD_00462,EEABGJJD_00462,EEABGJJD_00462,EEABGJJD_00462 +pstC,pstC,putative phosphate ABC transporter (permease protein),5,5,1.0,1,717,,,,888,888,888.0,EEABGJJD_01044,EEABGJJD_01044,EEABGJJD_01044,EEABGJJD_01044,EEABGJJD_01044 +citE,citE,putative citrate lyase beta subunit,5,5,1.0,1,716,,,,888,888,888.0,EEABGJJD_00998,EEABGJJD_00998,EEABGJJD_00998,EEABGJJD_00998,EEABGJJD_00998 +murB,murB,putative UDP-N-acetylenolpyruvoylglucosamine reductase,5,5,1.0,1,715,,,,888,888,888.0,EEABGJJD_00921,EEABGJJD_00921,EEABGJJD_00921,EEABGJJD_00921,EEABGJJD_00921 +group_375,,transcriptional 
regulator,5,5,1.0,1,714,,,,888,888,888.0,EEABGJJD_00609,EEABGJJD_00609,EEABGJJD_00609,EEABGJJD_00609,EEABGJJD_00609 +group_374,,ATP-binding protein,5,5,1.0,1,713,,,,891,891,891.0,EEABGJJD_01592,EEABGJJD_01592,EEABGJJD_01592,EEABGJJD_01592,EEABGJJD_01592 +group_373,,eukaryotic hypersensitive-induced response-like protein,5,5,1.0,1,712,,,,891,891,891.0,EEABGJJD_01566,EEABGJJD_01566,EEABGJJD_01566,EEABGJJD_01566,EEABGJJD_01566 +group_372,,RluA family pseudouridine synthase,5,5,1.0,1,711,,,,891,891,891.0,EEABGJJD_00686,EEABGJJD_00686,EEABGJJD_00686,EEABGJJD_00686,EEABGJJD_00686 +group_371,,RNase adaptor protein RapZ,5,5,1.0,1,710,,,,891,891,891.0,EEABGJJD_00539,EEABGJJD_00539,EEABGJJD_00539,EEABGJJD_00539,EEABGJJD_00539 +mvaK1,mvaK1,mevalonate kinase,5,5,1.0,1,709,,,,894,894,894.0,EEABGJJD_00724,EEABGJJD_00724,EEABGJJD_00724,EEABGJJD_00724,EEABGJJD_00724 +group_370,,sulfite exporter TauE/SafE domain protein,5,5,1.0,1,708,,,,897,897,897.0,EEABGJJD_01539,EEABGJJD_01539,EEABGJJD_01539,EEABGJJD_01539,EEABGJJD_01539 +group_369,,LysR family transcriptional regulator,5,5,1.0,1,707,,,,897,897,897.0,EEABGJJD_01368,EEABGJJD_01368,EEABGJJD_01368,EEABGJJD_01368,EEABGJJD_01368 +era,era,GTP-binding protein (GTPase),5,5,1.0,1,706,,,,897,897,897.0,EEABGJJD_00403,EEABGJJD_00403,EEABGJJD_00403,EEABGJJD_00403,EEABGJJD_00403 +htpX,htpX,putative heat shock protein,5,5,1.0,1,705,,,,897,897,897.0,EEABGJJD_00302,EEABGJJD_00302,EEABGJJD_00302,EEABGJJD_00302,EEABGJJD_00302 +group_368,,putative formiminotransferase cyclodeaminase,5,5,1.0,1,704,,,,900,900,900.0,EEABGJJD_01720,EEABGJJD_01720,EEABGJJD_01720,EEABGJJD_01720,EEABGJJD_01720 +miaA,miaA,putative tRNA isopentenylpyrophosphate transferase,5,5,1.0,1,703,,,,900,900,900.0,EEABGJJD_00766,EEABGJJD_00766,EEABGJJD_00766,EEABGJJD_00766,EEABGJJD_00766 +group_367,,histidine--tRNA ligase,5,5,1.0,1,702,,,,900,900,900.0,EEABGJJD_00474,EEABGJJD_00474,EEABGJJD_00474,EEABGJJD_00474,EEABGJJD_00474 +hasC2,hasC.2,putative UDP-glucose 
pyrophosphorylase,5,5,1.0,1,701,,,,900,900,900.0,EEABGJJD_00218,EEABGJJD_00218,EEABGJJD_00218,EEABGJJD_00218,EEABGJJD_00218 +group_366,,putative ribonuclease HIII,5,5,1.0,1,700,,,,903,903,903.0,EEABGJJD_01531,EEABGJJD_01531,EEABGJJD_01531,EEABGJJD_01531,EEABGJJD_01531 +group_365,,ABC transporter ATP-binding protein,5,5,1.0,1,699,,,,903,903,903.0,EEABGJJD_01489,EEABGJJD_01489,EEABGJJD_01489,EEABGJJD_01489,EEABGJJD_01489 +group_364,,HAD family hydrolase,5,5,1.0,1,698,,,,903,903,903.0,EEABGJJD_00498,EEABGJJD_00498,EEABGJJD_00498,EEABGJJD_00498,EEABGJJD_00498 +group_363,,permease,5,5,1.0,1,697,,,,903,903,903.0,EEABGJJD_00485,EEABGJJD_00485,EEABGJJD_00485,EEABGJJD_00485,EEABGJJD_00485 +dnaI,dnaI,putative primosome component (helicase loader),5,5,1.0,1,696,,,,903,903,903.0,EEABGJJD_00308,EEABGJJD_00308,EEABGJJD_00308,EEABGJJD_00308,EEABGJJD_00308 +group_362,,RluA family pseudouridine synthase,5,5,1.0,1,695,,,,906,906,906.0,EEABGJJD_00944,EEABGJJD_00944,EEABGJJD_00944,EEABGJJD_00944,EEABGJJD_00944 +cpsY,cpsY,putative transcriptional regulator,5,5,1.0,1,694,,,,906,906,906.0,EEABGJJD_00743,EEABGJJD_00743,EEABGJJD_00743,EEABGJJD_00743,EEABGJJD_00743 +group_361,,C4-dicarboxylate ABC transporter,5,5,1.0,1,693,,,,909,909,909.0,EEABGJJD_00764,EEABGJJD_00764,EEABGJJD_00764,EEABGJJD_00764,EEABGJJD_00764 +group_360,,RNA pseudouridine synthase,5,5,1.0,1,692,,,,912,912,912.0,EEABGJJD_01706,EEABGJJD_01706,EEABGJJD_01706,EEABGJJD_01706,EEABGJJD_01706 +manN,manN,putative mannose-specific phosphotransferase system component IID,5,5,1.0,1,691,,,,912,912,912.0,EEABGJJD_01454,EEABGJJD_01454,EEABGJJD_01454,EEABGJJD_01454,EEABGJJD_01454 +phiMGAS50052_29,phiMGAS5005.2_29,phage structural protein,5,5,1.0,1,690,,,,912,912,912.0,EEABGJJD_01215,EEABGJJD_01215,EEABGJJD_01215,EEABGJJD_01215,EEABGJJD_01215 +group_359,,diacylglycerol kinase,5,5,1.0,1,689,,,,912,912,912.0,EEABGJJD_00973,EEABGJJD_00973,EEABGJJD_00973,EEABGJJD_00973,EEABGJJD_00973 
+fruB,fruB,1-phosphofructokinase,5,5,1.0,1,688,,,,912,912,912.0,EEABGJJD_00708,EEABGJJD_00708,EEABGJJD_00708,EEABGJJD_00708,EEABGJJD_00708 +group_358,,sporulation protein,5,5,1.0,1,687,,,,912,912,912.0,EEABGJJD_00541,EEABGJJD_00541,EEABGJJD_00541,EEABGJJD_00541,EEABGJJD_00541 +group_357,,XRE family transcriptional regulator,5,5,1.0,1,686,,,,912,912,912.0,EEABGJJD_00055,EEABGJJD_00055,EEABGJJD_00055,EEABGJJD_00055,EEABGJJD_00055 +group_356,,UTP--glucose-1-phosphate uridylyltransferase,5,5,1.0,1,685,,,,915,915,915.0,EEABGJJD_01820,EEABGJJD_01820,EEABGJJD_01820,EEABGJJD_01820,EEABGJJD_01820 +group_355,,hypothetical protein,5,5,1.0,1,684,,,,915,915,915.0,EEABGJJD_00809,EEABGJJD_00809,EEABGJJD_00809,EEABGJJD_00809,EEABGJJD_00809 +group_354,,LysR family transcriptional regulator,5,5,1.0,1,683,,,,915,915,915.0,EEABGJJD_00684,EEABGJJD_00684,EEABGJJD_00684,EEABGJJD_00684,EEABGJJD_00684 +group_353,,N-acetylneuraminate lyase,5,5,1.0,1,682,,,,915,915,915.0,EEABGJJD_00241,EEABGJJD_00241,EEABGJJD_00241,EEABGJJD_00241,EEABGJJD_00241 +group_352,,protein jag,5,5,1.0,1,681,,,,915,915,915.0,EEABGJJD_00234,EEABGJJD_00234,EEABGJJD_00234,EEABGJJD_00234,EEABGJJD_00234 +group_351,,glycine--tRNA ligase subunit alpha,5,5,1.0,1,680,,,,918,918,918.0,EEABGJJD_01411,EEABGJJD_01411,EEABGJJD_01411,EEABGJJD_01411,EEABGJJD_01411 +group_350,,thioredoxin-disulfide reductase,5,5,1.0,1,679,,,,918,918,918.0,EEABGJJD_01386,EEABGJJD_01386,EEABGJJD_01386,EEABGJJD_01386,EEABGJJD_01386 +group_349,,neutral zinc metallopeptidase,5,5,1.0,1,678,,,,918,918,918.0,EEABGJJD_00642,EEABGJJD_00642,EEABGJJD_00642,EEABGJJD_00642,EEABGJJD_00642 +group_348,,foldase,5,5,1.0,1,677,,,,921,921,921.0,EEABGJJD_01685,EEABGJJD_01685,EEABGJJD_01685,EEABGJJD_01685,EEABGJJD_01685 +lmb,lmb,putative laminin adhesion,5,5,1.0,1,676,,,,921,921,921.0,EEABGJJD_01668,EEABGJJD_01668,EEABGJJD_01668,EEABGJJD_01668,EEABGJJD_01668 +coaA,coaA,putative pantothenate 
kinase,5,5,1.0,1,675,,,,921,921,921.0,EEABGJJD_01036,EEABGJJD_01036,EEABGJJD_01036,EEABGJJD_01036,EEABGJJD_01036 +group_347,,TIGR01212 family radical SAM protein,5,5,1.0,1,674,,,,921,921,921.0,EEABGJJD_00336,EEABGJJD_00336,EEABGJJD_00336,EEABGJJD_00336,EEABGJJD_00336 +group_346,,XRE family transcriptional regulator,5,5,1.0,1,673,,,,924,924,924.0,EEABGJJD_01788,EEABGJJD_01788,EEABGJJD_01788,EEABGJJD_01788,EEABGJJD_01788 +apbA,apbA,putative 2-dehydropantoate 2-reductase,5,5,1.0,1,672,,,,924,924,924.0,EEABGJJD_00706,EEABGJJD_00706,EEABGJJD_00706,EEABGJJD_00706,EEABGJJD_00706 +group_345,,putative ABC transporter (ATP-binding protein),5,5,1.0,1,671,,,,924,924,924.0,EEABGJJD_00617,EEABGJJD_00617,EEABGJJD_00617,EEABGJJD_00617,EEABGJJD_00617 +group_344,,protein translocase component YidC,5,5,1.0,1,670,,,,924,924,924.0,EEABGJJD_00316,EEABGJJD_00316,EEABGJJD_00316,EEABGJJD_00316,EEABGJJD_00316 +oppF,oppF,oligopeptidepermease,5,5,1.0,1,669,,,,924,924,924.0,EEABGJJD_00276,EEABGJJD_00276,EEABGJJD_00276,EEABGJJD_00276,EEABGJJD_00276 +group_343,,inhibitor of complement protein,5,5,1.0,1,668,,,,927,927,927.0,EEABGJJD_01672,EEABGJJD_01672,EEABGJJD_01672,EEABGJJD_01672,EEABGJJD_01672 +group_342,,alpha/beta hydrolase,5,5,1.0,1,667,,,,927,927,927.0,EEABGJJD_01572,EEABGJJD_01572,EEABGJJD_01572,EEABGJJD_01572,EEABGJJD_01572 +group_341,,putative sugar-binding transport protein,5,5,1.0,1,666,,,,927,927,927.0,EEABGJJD_01329,EEABGJJD_01329,EEABGJJD_01329,EEABGJJD_01329,EEABGJJD_01329 +group_340,,putative protease,5,5,1.0,1,665,,,,927,927,927.0,EEABGJJD_00494,EEABGJJD_00494,EEABGJJD_00494,EEABGJJD_00494,EEABGJJD_00494 +group_339,,putative sugar transferase,5,5,1.0,1,664,,,,927,927,927.0,EEABGJJD_00429,EEABGJJD_00429,EEABGJJD_00429,EEABGJJD_00429,EEABGJJD_00429 +group_338,,hypothetical protein,5,5,1.0,1,663,,,,927,927,927.0,EEABGJJD_00427,EEABGJJD_00427,EEABGJJD_00427,EEABGJJD_00427,EEABGJJD_00427 
+oppC,oppC,oligopeptidepermease,5,5,1.0,1,662,,,,927,927,927.0,EEABGJJD_00274,EEABGJJD_00274,EEABGJJD_00274,EEABGJJD_00274,EEABGJJD_00274 +lacC2,lacC.2,putative tagatose 6-phosphate kinase,5,5,1.0,1,661,,,,930,930,930.0,EEABGJJD_01603,EEABGJJD_01603,EEABGJJD_01603,EEABGJJD_01603,EEABGJJD_01603 +group_337,,ribonuclease Z,5,5,1.0,1,660,,,,930,930,930.0,EEABGJJD_00769,EEABGJJD_00769,EEABGJJD_00769,EEABGJJD_00769,EEABGJJD_00769 +ftsX,ftsX,putative cell-division protein,5,5,1.0,1,659,,,,930,930,930.0,EEABGJJD_00532,EEABGJJD_00532,EEABGJJD_00532,EEABGJJD_00532,EEABGJJD_00532 +mreA,mreA,putative macrolide-efflux protein,5,5,1.0,1,658,,,,933,933,933.0,EEABGJJD_01051,EEABGJJD_01051,EEABGJJD_01051,EEABGJJD_01051,EEABGJJD_01051 +rgpBc,rgpBc,putative rhamnosyltransferase - possibly involved in cell wall localization and side chain formation of rhamnose-glucose polysaccharide,5,5,1.0,1,657,,,,933,933,933.0,EEABGJJD_00653,EEABGJJD_00653,EEABGJJD_00653,EEABGJJD_00653,EEABGJJD_00653 +ptsK,ptsK,Hpr kinase/phosphatase,5,5,1.0,1,656,,,,933,933,933.0,EEABGJJD_00489,EEABGJJD_00489,EEABGJJD_00489,EEABGJJD_00489,EEABGJJD_00489 +mtsA,mtsA,metal binding protein of ABC transporter (lipoprotein),5,5,1.0,1,655,,,,933,933,933.0,EEABGJJD_00384,EEABGJJD_00384,EEABGJJD_00384,EEABGJJD_00384,EEABGJJD_00384 +fhuD,fhuD,ferrichrome ABC transporter (ferrichrome-binding protein),5,5,1.0,1,654,,,,933,933,933.0,EEABGJJD_00344,EEABGJJD_00344,EEABGJJD_00344,EEABGJJD_00344,EEABGJJD_00344 +group_336,,putative oxidoreductase,5,5,1.0,1,653,,,,936,936,936.0,EEABGJJD_01739,EEABGJJD_01739,EEABGJJD_01739,EEABGJJD_01739,EEABGJJD_01739 +fmt,fmt,putative methionyl tRNA formyltransferase,5,5,1.0,1,652,,,,936,936,936.0,EEABGJJD_01363,EEABGJJD_01363,EEABGJJD_01363,EEABGJJD_01363,EEABGJJD_01363 +pyrD,pyrD,putative dihydroorotate dehydrogenase,5,5,1.0,1,651,,,,936,936,936.0,EEABGJJD_01191,EEABGJJD_01191,EEABGJJD_01191,EEABGJJD_01191,EEABGJJD_01191 +pstC2,pstC2,putative phosphate ABC transporter (permease 
protein),5,5,1.0,1,650,,,,936,936,936.0,EEABGJJD_01045,EEABGJJD_01045,EEABGJJD_01045,EEABGJJD_01045,EEABGJJD_01045 +group_335,,putative manganese-dependent inorganic pyrophosphatase (intrageneric coaggregation-relevant adhesin),5,5,1.0,1,649,,,,936,936,936.0,EEABGJJD_00340,EEABGJJD_00340,EEABGJJD_00340,EEABGJJD_00340,EEABGJJD_00340 +group_334,,putative esterase,5,5,1.0,1,648,,,,939,939,939.0,EEABGJJD_01537,EEABGJJD_01537,EEABGJJD_01537,EEABGJJD_01537,EEABGJJD_01537 +fabD,fabD,putative malonyl CoA-acyl carrier protein transacylase,5,5,1.0,1,647,,,,939,939,939.0,EEABGJJD_01464,EEABGJJD_01464,EEABGJJD_01464,EEABGJJD_01464,EEABGJJD_01464 +group_333,,putative thiamine biosynthesis lipoprotein precursor,5,5,1.0,1,646,,,,939,939,939.0,EEABGJJD_00954,EEABGJJD_00954,EEABGJJD_00954,EEABGJJD_00954,EEABGJJD_00954 +cbf,cbf,putative CMP-binding factor,5,5,1.0,1,645,,,,939,939,939.0,EEABGJJD_00251,EEABGJJD_00251,EEABGJJD_00251,EEABGJJD_00251,EEABGJJD_00251 +group_332,,putative glucose kinase,5,5,1.0,1,644,,,,939,939,939.0,EEABGJJD_00242,EEABGJJD_00242,EEABGJJD_00242,EEABGJJD_00242,EEABGJJD_00242 +group_331,,competence protein CglA,5,5,1.0,1,643,,,,939,939,939.0,EEABGJJD_00118,EEABGJJD_00118,EEABGJJD_00118,EEABGJJD_00118,EEABGJJD_00118 +rpoA,rpoA,DNA-directed RNA polymerase alpha subunit,5,5,1.0,1,642,,,,939,939,939.0,EEABGJJD_00092,EEABGJJD_00092,EEABGJJD_00092,EEABGJJD_00092,EEABGJJD_00092 +group_330,,YitT family protein,5,5,1.0,1,641,,,,942,942,942.0,EEABGJJD_01781,EEABGJJD_01781,EEABGJJD_01781,EEABGJJD_01781,EEABGJJD_01781 +cysM,cysM,putative O-acetylserine lyase,5,5,1.0,1,640,,,,942,942,942.0,EEABGJJD_01354,EEABGJJD_01354,EEABGJJD_01354,EEABGJJD_01354,EEABGJJD_01354 +group_329,,LPXTG cell wall anchor domain-containing protein,5,5,1.0,1,639,,,,942,942,942.0,EEABGJJD_01248,EEABGJJD_01248,EEABGJJD_01248,EEABGJJD_01248,EEABGJJD_01248 +birA,birA,putative biotin operon 
repressor,5,5,1.0,1,638,,,,942,942,942.0,EEABGJJD_01140,EEABGJJD_01140,EEABGJJD_01140,EEABGJJD_01140,EEABGJJD_01140 +group_328,,bifunctional oligoribonuclease/PAP phosphatase NrnA,5,5,1.0,1,637,,,,942,942,942.0,EEABGJJD_00596,EEABGJJD_00596,EEABGJJD_00596,EEABGJJD_00596,EEABGJJD_00596 +group_327,,putative divalent cation transport protein,5,5,1.0,1,636,,,,945,945,945.0,EEABGJJD_01518,EEABGJJD_01518,EEABGJJD_01518,EEABGJJD_01518,EEABGJJD_01518 +mvaD,mvaD,mevalonate pyrophosphate decarboxylase,5,5,1.0,1,635,,,,945,945,945.0,EEABGJJD_00725,EEABGJJD_00725,EEABGJJD_00725,EEABGJJD_00725,EEABGJJD_00725 +group_326,,putative sugar-binding transport protein,5,5,1.0,1,634,,,,951,951,951.0,EEABGJJD_01330,EEABGJJD_01330,EEABGJJD_01330,EEABGJJD_01330,EEABGJJD_01330 +arcC,arcC,putative carbamate kinase,5,5,1.0,1,633,,,,951,951,951.0,EEABGJJD_01289,EEABGJJD_01289,EEABGJJD_01289,EEABGJJD_01289,EEABGJJD_01289 +sagB,sagB,streptolysin S biosynthesis dehydrogenase SagB,5,5,1.0,1,632,,,,951,951,951.0,EEABGJJD_00612,EEABGJJD_00612,EEABGJJD_00612,EEABGJJD_00612,EEABGJJD_00612 +group_325,,50S ribosomal protein L11 methyltransferase,5,5,1.0,1,631,,,,954,954,954.0,EEABGJJD_01651,EEABGJJD_01651,EEABGJJD_01651,EEABGJJD_01651,EEABGJJD_01651 +pmi,pmi,putative mannose-6-phosphate isomerase,5,5,1.0,1,630,,,,954,954,954.0,EEABGJJD_01506,EEABGJJD_01506,EEABGJJD_01506,EEABGJJD_01506,EEABGJJD_01506 +group_324,,class I SAM-dependent methyltransferase,5,5,1.0,1,629,,,,954,954,954.0,EEABGJJD_00125,EEABGJJD_00125,EEABGJJD_00125,EEABGJJD_00125,EEABGJJD_00125 +group_323,,aromatic acid exporter family protein,5,5,1.0,1,628,,,,957,957,957.0,EEABGJJD_01054,EEABGJJD_01054,EEABGJJD_01054,EEABGJJD_01054,EEABGJJD_01054 +group_322,,putative sugar ABC transporter (permease protein),5,5,1.0,1,627,,,,957,957,957.0,EEABGJJD_01030,EEABGJJD_01030,EEABGJJD_01030,EEABGJJD_01030,EEABGJJD_01030 +group_321,,YbbR-like domain-containing 
protein,5,5,1.0,1,626,,,,957,957,957.0,EEABGJJD_00865,EEABGJJD_00865,EEABGJJD_00865,EEABGJJD_00865,EEABGJJD_00865 +group_320,,class 1b ribonucleoside-diphosphate reductase subunit beta,5,5,1.0,1,625,,,,960,960,960.0,EEABGJJD_01153,EEABGJJD_01153,EEABGJJD_01153,EEABGJJD_01153,EEABGJJD_01153 +group_319,,gfo/Idh/MocA family oxidoreductase,5,5,1.0,1,624,,,,960,960,960.0,EEABGJJD_00376,EEABGJJD_00376,EEABGJJD_00376,EEABGJJD_00376,EEABGJJD_00376 +group_318,,putative transcription factor,5,5,1.0,1,623,,,,963,963,963.0,EEABGJJD_01165,EEABGJJD_01165,EEABGJJD_01165,EEABGJJD_01165,EEABGJJD_01165 +group_317,,polysaccharide deacetylase family protein,5,5,1.0,1,622,,,,963,963,963.0,EEABGJJD_00915,EEABGJJD_00915,EEABGJJD_00915,EEABGJJD_00915,EEABGJJD_00915 +group_316,,sugar transport system (permease),5,5,1.0,1,621,,,,963,963,963.0,EEABGJJD_00238,EEABGJJD_00238,EEABGJJD_00238,EEABGJJD_00238,EEABGJJD_00238 +prsA2,prsA.2,putative ribose-phosphate pyrophosphokinase,5,5,1.0,1,620,,,,963,963,963.0,EEABGJJD_00039,EEABGJJD_00039,EEABGJJD_00039,EEABGJJD_00039,EEABGJJD_00039 +scrR,scrR,putative sucrose operon repressor,5,5,1.0,1,619,,,,966,966,966.0,EEABGJJD_01511,EEABGJJD_01511,EEABGJJD_01511,EEABGJJD_01511,EEABGJJD_01511 +asnB,asnB,putative L-asparaginase,5,5,1.0,1,618,,,,966,966,966.0,EEABGJJD_01487,EEABGJJD_01487,EEABGJJD_01487,EEABGJJD_01487,EEABGJJD_01487 +group_315,,YihY/virulence factor BrkB family protein,5,5,1.0,1,617,,,,966,966,966.0,EEABGJJD_01132,EEABGJJD_01132,EEABGJJD_01132,EEABGJJD_01132,EEABGJJD_01132 +acoA,acoA,putative acetoin dehydrogenase (TPP-dependent) alpha chain,5,5,1.0,1,616,,,,969,969,969.0,EEABGJJD_00856,EEABGJJD_00856,EEABGJJD_00856,EEABGJJD_00856,EEABGJJD_00856 +group_314,,serine/threonine protein kinase,5,5,1.0,1,615,,,,969,969,969.0,EEABGJJD_00454,EEABGJJD_00454,EEABGJJD_00454,EEABGJJD_00454,EEABGJJD_00454 +fabK,fabK,putative trans-2-enoyl-ACP reductase 
II,5,5,1.0,1,614,,,,972,972,972.0,EEABGJJD_01465,EEABGJJD_01465,EEABGJJD_01465,EEABGJJD_01465,EEABGJJD_01465 +glcK,glcK,glucose kinase,5,5,1.0,1,613,,,,972,972,972.0,EEABGJJD_01278,EEABGJJD_01278,EEABGJJD_01278,EEABGJJD_01278,EEABGJJD_01278 +fabH,fabH,3-oxoacyl-ACP synthase III,5,5,1.0,1,612,,,,975,975,975.0,EEABGJJD_01467,EEABGJJD_01467,EEABGJJD_01467,EEABGJJD_01467,EEABGJJD_01467 +dppB,dppB,transmembrane transport protein,5,5,1.0,1,611,,,,978,978,978.0,EEABGJJD_01662,EEABGJJD_01662,EEABGJJD_01662,EEABGJJD_01662,EEABGJJD_01662 +lacD1,lacD.1,putative tagatose 16-diphosphate aldolase,5,5,1.0,1,610,,,,978,978,978.0,EEABGJJD_01422,EEABGJJD_01422,EEABGJJD_01422,EEABGJJD_01422,EEABGJJD_01422 +group_313,,putative ribose transport operon repressor,5,5,1.0,1,609,,,,978,978,978.0,EEABGJJD_01284,EEABGJJD_01284,EEABGJJD_01284,EEABGJJD_01284,EEABGJJD_01284 +group_312,,nucleoid-associated protein,5,5,1.0,1,608,,,,978,978,978.0,EEABGJJD_00962,EEABGJJD_00962,EEABGJJD_00962,EEABGJJD_00962,EEABGJJD_00962 +group_311,,YvcK family protein,5,5,1.0,1,607,,,,978,978,978.0,EEABGJJD_00540,EEABGJJD_00540,EEABGJJD_00540,EEABGJJD_00540,EEABGJJD_00540 +group_310,,putative transcriptional regulator,5,5,1.0,1,606,,,,978,978,978.0,EEABGJJD_00134,EEABGJJD_00134,EEABGJJD_00134,EEABGJJD_00134,EEABGJJD_00134 +group_309,,putative transcriptional regulator,5,5,1.0,1,605,,,,981,981,981.0,EEABGJJD_01698,EEABGJJD_01698,EEABGJJD_01698,EEABGJJD_01698,EEABGJJD_01698 +prs,prs,putative phosphoribosyl pyrophosphate synthetase,5,5,1.0,1,604,,,,981,981,981.0,EEABGJJD_00940,EEABGJJD_00940,EEABGJJD_00940,EEABGJJD_00940,EEABGJJD_00940 +group_308,,peptide chain release factor 2,5,5,1.0,1,603,,,,981,981,981.0,EEABGJJD_00530,EEABGJJD_00530,EEABGJJD_00530,EEABGJJD_00530,EEABGJJD_00530 +lacD2,lacD.2,putative tagatose 16-diphosphate aldolase,5,5,1.0,1,602,,,,984,984,984.0,EEABGJJD_01602,EEABGJJD_01602,EEABGJJD_01602,EEABGJJD_01602,EEABGJJD_01602 +group_307,,putative 
esterase,5,5,1.0,1,601,,,,984,984,984.0,EEABGJJD_01098,EEABGJJD_01098,EEABGJJD_01098,EEABGJJD_01098,EEABGJJD_01098 +ldh,ldh,putative L-lactate dehydrogenase,5,5,1.0,1,600,,,,984,984,984.0,EEABGJJD_00967,EEABGJJD_00967,EEABGJJD_00967,EEABGJJD_00967,EEABGJJD_00967 +group_306,,putative GMP reductase,5,5,1.0,1,599,,,,984,984,984.0,EEABGJJD_00951,EEABGJJD_00951,EEABGJJD_00951,EEABGJJD_00951,EEABGJJD_00951 +hutG,hutG,putative formiminoglutamate hydrolase,5,5,1.0,1,598,,,,987,987,987.0,EEABGJJD_01726,EEABGJJD_01726,EEABGJJD_01726,EEABGJJD_01726,EEABGJJD_01726 +group_305,,alpha/beta hydrolase,5,5,1.0,1,597,,,,987,987,987.0,EEABGJJD_01433,EEABGJJD_01433,EEABGJJD_01433,EEABGJJD_01433,EEABGJJD_01433 +group_304,,rhodanese domain-containing protein,5,5,1.0,1,596,,,,987,987,987.0,EEABGJJD_00762,EEABGJJD_00762,EEABGJJD_00762,EEABGJJD_00762,EEABGJJD_00762 +pyrB,pyrB,putative aspartate transcarbamoylase,5,5,1.0,1,595,,,,987,987,987.0,EEABGJJD_00689,EEABGJJD_00689,EEABGJJD_00689,EEABGJJD_00689,EEABGJJD_00689 +group_303,,dehydrogenase,5,5,1.0,1,594,,,,987,987,987.0,EEABGJJD_00452,EEABGJJD_00452,EEABGJJD_00452,EEABGJJD_00452,EEABGJJD_00452 +group_302,,nucleoside-triphosphate diphosphatase,5,5,1.0,1,593,,,,987,987,987.0,EEABGJJD_00325,EEABGJJD_00325,EEABGJJD_00325,EEABGJJD_00325,EEABGJJD_00325 +phiNCTC81981_5,phiNCTC8198.1_5,recombinase RecT,5,5,1.0,1,592,,,,990,990,990.0,EEABGJJD_01231,EEABGJJD_01231,EEABGJJD_01231,EEABGJJD_01231,EEABGJJD_01231 +group_301,,putative lipoate-protein ligase,5,5,1.0,1,591,,,,990,990,990.0,EEABGJJD_00861,EEABGJJD_00861,EEABGJJD_00861,EEABGJJD_00861,EEABGJJD_00861 +group_300,,type 2 isopentenyl-diphosphate Delta-isomerase,5,5,1.0,1,590,,,,990,990,990.0,EEABGJJD_00727,EEABGJJD_00727,EEABGJJD_00727,EEABGJJD_00727,EEABGJJD_00727 +group_299,,glycosyl transferase 2 family protein,5,5,1.0,1,589,,,,990,990,990.0,EEABGJJD_00455,EEABGJJD_00455,EEABGJJD_00455,EEABGJJD_00455,EEABGJJD_00455 +manL,manL,mannose-specific phosphotransferase system component 
IIAB,5,5,1.0,1,588,,,,993,993,993.0,EEABGJJD_01452,EEABGJJD_01452,EEABGJJD_01452,EEABGJJD_01452,EEABGJJD_01452 +asnA,asnA,putative asparagine synthetase A,5,5,1.0,1,587,,,,993,993,993.0,EEABGJJD_01288,EEABGJJD_01288,EEABGJJD_01288,EEABGJJD_01288,EEABGJJD_01288 +ddh,ddh,putative D-specific D-2-hydroxyacid dehydrogenase,5,5,1.0,1,586,,,,993,993,993.0,EEABGJJD_00982,EEABGJJD_00982,EEABGJJD_00982,EEABGJJD_00982,EEABGJJD_00982 +group_298,,putative thioredoxin reductase,5,5,1.0,1,585,,,,993,993,993.0,EEABGJJD_00704,EEABGJJD_00704,EEABGJJD_00704,EEABGJJD_00704,EEABGJJD_00704 +group_297,,putative ABC transporter (ATP-binding protein),5,5,1.0,1,584,,,,993,993,993.0,EEABGJJD_00437,EEABGJJD_00437,EEABGJJD_00437,EEABGJJD_00437,EEABGJJD_00437 +pta,pta,putative phosphotransacetylase,5,5,1.0,1,583,,,,996,996,996.0,EEABGJJD_00945,EEABGJJD_00945,EEABGJJD_00945,EEABGJJD_00945,EEABGJJD_00945 +regR,regR,putative transcriptional regulator (LacI family),5,5,1.0,1,582,,,,996,996,996.0,EEABGJJD_00517,EEABGJJD_00517,EEABGJJD_00517,EEABGJJD_00517,EEABGJJD_00517 +group_296,,hypothetical protein,5,5,1.0,1,581,,,,996,996,996.0,EEABGJJD_00442,EEABGJJD_00442,EEABGJJD_00442,EEABGJJD_00442,EEABGJJD_00442 +group_295,,hypothetical protein,5,5,1.0,1,580,,,,996,996,996.0,EEABGJJD_00414,EEABGJJD_00414,EEABGJJD_00414,EEABGJJD_00414,EEABGJJD_00414 +group_294,,LLM class flavin-dependent oxidoreductase,5,5,1.0,1,579,,,,999,999,999.0,EEABGJJD_01023,EEABGJJD_01023,EEABGJJD_01023,EEABGJJD_01023,EEABGJJD_01023 +group_293,,peptide ABC transporter substrate-binding protein,5,5,1.0,1,578,,,,999,999,999.0,EEABGJJD_00848,EEABGJJD_00848,EEABGJJD_00848,EEABGJJD_00848,EEABGJJD_00848 +group_292,,putative sugar transferase,5,5,1.0,1,577,,,,999,999,999.0,EEABGJJD_00434,EEABGJJD_00434,EEABGJJD_00434,EEABGJJD_00434,EEABGJJD_00434 +ntpC,ntpC,putative V-type Na+ -ATPase subunit C,5,5,1.0,1,576,,,,999,999,999.0,EEABGJJD_00160,EEABGJJD_00160,EEABGJJD_00160,EEABGJJD_00160,EEABGJJD_00160 +ruvB,ruvB,putative Holliday junction DNA 
helicase subunit B,5,5,1.0,1,575,,,,999,999,999.0,EEABGJJD_00057,EEABGJJD_00057,EEABGJJD_00057,EEABGJJD_00057,EEABGJJD_00057 +acoB,acoB,putative acetoin dehydrogenase (TPP-dependent) beta chain,5,5,1.0,1,574,,,,1002,1002,1002.0,EEABGJJD_00857,EEABGJJD_00857,EEABGJJD_00857,EEABGJJD_00857,EEABGJJD_00857 +kdgK,kdgK,putative 2-keto-3-deoxygluconate kinase,5,5,1.0,1,573,,,,1002,1002,1002.0,EEABGJJD_00526,EEABGJJD_00526,EEABGJJD_00526,EEABGJJD_00526,EEABGJJD_00526 +ccpA,ccpA,catabolite control protein A,5,5,1.0,1,572,,,,1002,1002,1002.0,EEABGJJD_00433,EEABGJJD_00433,EEABGJJD_00433,EEABGJJD_00433,EEABGJJD_00433 +fhuG,fhuG,putative ferrichrome ABC transporter (permease),5,5,1.0,1,571,,,,1002,1002,1002.0,EEABGJJD_00342,EEABGJJD_00342,EEABGJJD_00342,EEABGJJD_00342,EEABGJJD_00342 +group_291,,putative two-component sensor histidine kinase,5,5,1.0,1,570,,,,1005,1005,1005.0,EEABGJJD_01358,EEABGJJD_01358,EEABGJJD_01358,EEABGJJD_01358,EEABGJJD_01358 +group_290,,phosphomevalonate kinase,5,5,1.0,1,569,,,,1008,1008,1008.0,EEABGJJD_00726,EEABGJJD_00726,EEABGJJD_00726,EEABGJJD_00726,EEABGJJD_00726 +rgpEc,rgpEc,putative glycosyltransferase - possibly involved in cell wall localization and side chain formation of rhamnose-glucose polysaccharide,5,5,1.0,1,568,,,,1008,1008,1008.0,EEABGJJD_00656,EEABGJJD_00656,EEABGJJD_00656,EEABGJJD_00656,EEABGJJD_00656 +plsX,plsX,putative fatty acid/phospholipid synthesis protein,5,5,1.0,1,567,,,,1008,1008,1008.0,EEABGJJD_00041,EEABGJJD_00041,EEABGJJD_00041,EEABGJJD_00041,EEABGJJD_00041 +mraY,mraY,putative undecaprenyl-phosphate-UDP-MurNAc-pentapeptide phospho-MurNAc-pentapeptide transferase,5,5,1.0,1,566,,,,1011,1011,1011.0,EEABGJJD_01391,EEABGJJD_01391,EEABGJJD_01391,EEABGJJD_01391,EEABGJJD_01391 +plr,plr,glyceraldehyde-3-phosphate dehydrogenase plasmin receptor,5,5,1.0,1,565,,,,1011,1011,1011.0,EEABGJJD_00257,EEABGJJD_00257,EEABGJJD_00257,EEABGJJD_00257,EEABGJJD_00257 +group_289,,16S rRNA (cytosine(1402)-N(4))-methyltransferase 
RsmH,5,5,1.0,1,564,,,,1014,1014,1014.0,EEABGJJD_01394,EEABGJJD_01394,EEABGJJD_01394,EEABGJJD_01394,EEABGJJD_01394 +arcB,arcB,putative ornithine transcarbamylase,5,5,1.0,1,563,,,,1014,1014,1014.0,EEABGJJD_01292,EEABGJJD_01292,EEABGJJD_01292,EEABGJJD_01292,EEABGJJD_01292 +pfk,pfk,putative 6-phosphofructokinase,5,5,1.0,1,562,,,,1014,1014,1014.0,EEABGJJD_01077,EEABGJJD_01077,EEABGJJD_01077,EEABGJJD_01077,EEABGJJD_01077 +hylP1,hylP1,hyaluronidase phage associated,5,5,1.0,1,561,,,,1014,1014,1014.0,EEABGJJD_00582,EEABGJJD_00582,EEABGJJD_00582,EEABGJJD_00582,EEABGJJD_00582 +nrdF1,nrdF.1,putative ribonucleotide reductase 2,5,5,1.0,1,560,,,,1014,1014,1014.0,EEABGJJD_00364,EEABGJJD_00364,EEABGJJD_00364,EEABGJJD_00364,EEABGJJD_00364 +gpsA,gpsA,putative NAD(P)H-dependent glycerol-3-phosphate dehydrogenase,5,5,1.0,1,559,,,,1017,1017,1017.0,EEABGJJD_00219,EEABGJJD_00219,EEABGJJD_00219,EEABGJJD_00219,EEABGJJD_00219 +adhA,adhA,putative alcohol dehydrogenase I,5,5,1.0,1,558,,,,1017,1017,1017.0,EEABGJJD_00063,EEABGJJD_00063,EEABGJJD_00063,EEABGJJD_00063,EEABGJJD_00063 +malR,malR,putative maltose operon transcriptional repressor,5,5,1.0,1,557,,,,1020,1020,1020.0,EEABGJJD_01087,EEABGJJD_01087,EEABGJJD_01087,EEABGJJD_01087,EEABGJJD_01087 +lplA,lplA,putative lipoate-protein ligase,5,5,1.0,1,556,,,,1020,1020,1020.0,EEABGJJD_01019,EEABGJJD_01019,EEABGJJD_01019,EEABGJJD_01019,EEABGJJD_01019 +group_288,,putative sulfate exporter family transporter,5,5,1.0,1,555,,,,1020,1020,1020.0,EEABGJJD_00880,EEABGJJD_00880,EEABGJJD_00880,EEABGJJD_00880,EEABGJJD_00880 +group_287,,IgG-degrading enzyme/Mac-1 IdeZ,5,5,1.0,1,554,,,,1020,1020,1020.0,EEABGJJD_00713,EEABGJJD_00713,EEABGJJD_00713,EEABGJJD_00713,EEABGJJD_00713 +group_286,,putative regulatory protein,5,5,1.0,1,553,,,,1020,1020,1020.0,EEABGJJD_00155,EEABGJJD_00155,EEABGJJD_00155,EEABGJJD_00155,EEABGJJD_00155 +trsA,trsA,putative tryptophanyl-tRNA 
synthetase,5,5,1.0,1,552,,,,1023,1023,1023.0,EEABGJJD_01825,EEABGJJD_01825,EEABGJJD_01825,EEABGJJD_01825,EEABGJJD_01825 +group_285,,putative ABC transporter (permease),5,5,1.0,1,551,,,,1023,1023,1023.0,EEABGJJD_01497,EEABGJJD_01497,EEABGJJD_01497,EEABGJJD_01497,EEABGJJD_01497 +group_284,,branched-chain amino acid aminotransferase,5,5,1.0,1,550,,,,1023,1023,1023.0,EEABGJJD_00755,EEABGJJD_00755,EEABGJJD_00755,EEABGJJD_00755,EEABGJJD_00755 +group_283,,diacylglycerol kinase,5,5,1.0,1,549,,,,1023,1023,1023.0,EEABGJJD_00623,EEABGJJD_00623,EEABGJJD_00623,EEABGJJD_00623,EEABGJJD_00623 +group_282,,pilin,5,5,1.0,1,548,,,,1023,1023,1023.0,EEABGJJD_00139,EEABGJJD_00139,EEABGJJD_00139,EEABGJJD_00139,EEABGJJD_00139 +group_281,,type II secretion system F family protein,5,5,1.0,1,547,,,,1023,1023,1023.0,EEABGJJD_00119,EEABGJJD_00119,EEABGJJD_00119,EEABGJJD_00119,EEABGJJD_00119 +group_280,,phosphoribosylformylglycinamidine cyclo-ligase,5,5,1.0,1,546,,,,1023,1023,1023.0,EEABGJJD_00046,EEABGJJD_00046,EEABGJJD_00046,EEABGJJD_00046,EEABGJJD_00046 +group_279,,helix-turn-helix domain-containing protein,5,5,1.0,1,545,,,,1026,1026,1026.0,EEABGJJD_01815,EEABGJJD_01815,EEABGJJD_01815,EEABGJJD_01815,EEABGJJD_01815 +group_278,,type I-C CRISPR-associated endonuclease Cas1,5,5,1.0,1,544,,,,1026,1026,1026.0,EEABGJJD_01305,EEABGJJD_01305,EEABGJJD_01305,EEABGJJD_01305,EEABGJJD_01305 +group_277,,putative glycoprotein endopeptidase,5,5,1.0,1,543,,,,1029,1029,1029.0,EEABGJJD_01556,EEABGJJD_01556,EEABGJJD_01556,EEABGJJD_01556,EEABGJJD_01556 +queA,queA,putative S-adenosylmethionine-tRNA ribosyltransferase-isomerase,5,5,1.0,1,542,,,,1029,1029,1029.0,EEABGJJD_01168,EEABGJJD_01168,EEABGJJD_01168,EEABGJJD_01168,EEABGJJD_01168 +group_276,,phage portal protein,5,5,1.0,1,541,,,,1029,1029,1029.0,EEABGJJD_00814,EEABGJJD_00814,EEABGJJD_00814,EEABGJJD_00814,EEABGJJD_00814 +hrcA,hrcA,putative heat shock transcription repressor 
protein,5,5,1.0,1,540,,,,1035,1035,1035.0,EEABGJJD_01473,EEABGJJD_01473,EEABGJJD_01473,EEABGJJD_01473,EEABGJJD_01473 +group_275,,ABC transporter permease,5,5,1.0,1,539,,,,1035,1035,1035.0,EEABGJJD_01443,EEABGJJD_01443,EEABGJJD_01443,EEABGJJD_01443,EEABGJJD_01443 +group_274,,PDZ domain-containing protein,5,5,1.0,1,538,,,,1038,1038,1038.0,EEABGJJD_01285,EEABGJJD_01285,EEABGJJD_01285,EEABGJJD_01285,EEABGJJD_01285 +group_273,,elongation factor Ts,5,5,1.0,1,537,,,,1041,1041,1041.0,EEABGJJD_01729,EEABGJJD_01729,EEABGJJD_01729,EEABGJJD_01729,EEABGJJD_01729 +group_272,,putative transcription regulator,5,5,1.0,1,536,,,,1041,1041,1041.0,EEABGJJD_01334,EEABGJJD_01334,EEABGJJD_01334,EEABGJJD_01334,EEABGJJD_01334 +holA,holA,DNA polymerase III delta subunit,5,5,1.0,1,535,,,,1041,1041,1041.0,EEABGJJD_01174,EEABGJJD_01174,EEABGJJD_01174,EEABGJJD_01174,EEABGJJD_01174 +cpsFQ,cpsFQ,putative dTDP-glucose-46-dehydratase,5,5,1.0,1,534,,,,1041,1041,1041.0,EEABGJJD_00780,EEABGJJD_00780,EEABGJJD_00780,EEABGJJD_00780,EEABGJJD_00780 +group_271,,BMP family ABC transporter substrate-binding protein,5,5,1.0,1,533,,,,1044,1044,1044.0,EEABGJJD_00168,EEABGJJD_00168,EEABGJJD_00168,EEABGJJD_00168,EEABGJJD_00168 +scl,scl,collagen-like surface protei,5,5,1.0,1,532,,,,1047,1047,1047.0,EEABGJJD_01646,EEABGJJD_01646,EEABGJJD_01646,EEABGJJD_01646,EEABGJJD_01646 +ddlA,ddlA,putative D-alaD-ala ligase,5,5,1.0,1,531,,,,1047,1047,1047.0,EEABGJJD_01185,EEABGJJD_01185,EEABGJJD_01185,EEABGJJD_01185,EEABGJJD_01185 +group_270,,iron ABC transporter substrate-binding protein,5,5,1.0,1,530,,,,1047,1047,1047.0,EEABGJJD_00887,EEABGJJD_00887,EEABGJJD_00887,EEABGJJD_00887,EEABGJJD_00887 +group_269,,minor capsid protein E,5,5,1.0,1,529,,,,1050,1050,1050.0,EEABGJJD_00822,EEABGJJD_00822,EEABGJJD_00822,EEABGJJD_00822,EEABGJJD_00822 +group_268,,putative lipoprotein,5,5,1.0,1,528,,,,1053,1053,1053.0,EEABGJJD_01033,EEABGJJD_01033,EEABGJJD_01033,EEABGJJD_01033,EEABGJJD_01033 +citC,citC,[citrate (pro-3S)-lyase] 
ligase,5,5,1.0,1,527,,,,1053,1053,1053.0,EEABGJJD_01002,EEABGJJD_01002,EEABGJJD_01002,EEABGJJD_01002,EEABGJJD_01002 +phoH,phoH,phosphate starvation-induced protein,5,5,1.0,1,526,,,,1053,1053,1053.0,EEABGJJD_00399,EEABGJJD_00399,EEABGJJD_00399,EEABGJJD_00399,EEABGJJD_00399 +fhuB1,fhuB.1,putative ferrichrome ABC transporter (permease),5,5,1.0,1,525,,,,1053,1053,1053.0,EEABGJJD_00343,EEABGJJD_00343,EEABGJJD_00343,EEABGJJD_00343,EEABGJJD_00343 +group_267,,putative protease maturation protein,5,5,1.0,1,524,,,,1056,1056,1056.0,EEABGJJD_01161,EEABGJJD_01161,EEABGJJD_01161,EEABGJJD_01161,EEABGJJD_01161 +group_266,,putative regulatory protein,5,5,1.0,1,523,,,,1059,1059,1059.0,EEABGJJD_00705,EEABGJJD_00705,EEABGJJD_00705,EEABGJJD_00705,EEABGJJD_00705 +sagC,sagC,streptolysin S associated ORF,5,5,1.0,1,522,,,,1059,1059,1059.0,EEABGJJD_00613,EEABGJJD_00613,EEABGJJD_00613,EEABGJJD_00613,EEABGJJD_00613 +group_265,,putative sugar ABC transporter (permease protein),5,5,1.0,1,521,,,,1065,1065,1065.0,EEABGJJD_01031,EEABGJJD_01031,EEABGJJD_01031,EEABGJJD_01031,EEABGJJD_01031 +group_264,,putative ABC transporter (ATP-binding protein)h,5,5,1.0,1,520,,,,1065,1065,1065.0,EEABGJJD_00294,EEABGJJD_00294,EEABGJJD_00294,EEABGJJD_00294,EEABGJJD_00294 +pepA,pepA,putative glutamyl-aminopeptidase,5,5,1.0,1,519,,,,1068,1068,1068.0,EEABGJJD_00128,EEABGJJD_00128,EEABGJJD_00128,EEABGJJD_00128,EEABGJJD_00128 +group_263,,putative integrase/recombinase,5,5,1.0,1,518,,,,1071,1071,1071.0,EEABGJJD_01004,EEABGJJD_01004,EEABGJJD_01004,EEABGJJD_01004,EEABGJJD_01004 +oppD,oppD,oligopeptidepermease (ATP-binding protein),5,5,1.0,1,517,,,,1071,1071,1071.0,EEABGJJD_00275,EEABGJJD_00275,EEABGJJD_00275,EEABGJJD_00275,EEABGJJD_00275 +pepP,pepP,putative aminopeptidase P; XAA-pro aminopeptidase,5,5,1.0,1,516,,,,1074,1074,1074.0,EEABGJJD_01516,EEABGJJD_01516,EEABGJJD_01516,EEABGJJD_01516,EEABGJJD_01516 +group_262,,3-dehydroquinate 
synthase,5,5,1.0,1,515,,,,1074,1074,1074.0,EEABGJJD_01319,EEABGJJD_01319,EEABGJJD_01319,EEABGJJD_01319,EEABGJJD_01319 +potD,potD,putative spermidine/putrescine ABC transporter (periplasmic transport protein),5,5,1.0,1,514,,,,1074,1074,1074.0,EEABGJJD_00925,EEABGJJD_00925,EEABGJJD_00925,EEABGJJD_00925,EEABGJJD_00925 +group_261,,ABC transporter permease,5,5,1.0,1,513,,,,1077,1077,1077.0,EEABGJJD_00640,EEABGJJD_00640,EEABGJJD_00640,EEABGJJD_00640,EEABGJJD_00640 +pheS,pheS,putative phenylalanyl-tRNA synthetase (alpha subunit),5,5,1.0,1,512,,,,1077,1077,1077.0,EEABGJJD_00636,EEABGJJD_00636,EEABGJJD_00636,EEABGJJD_00636,EEABGJJD_00636 +purK,purK,5-(carboxyamino)imidazole ribonucleotide synthase,5,5,1.0,1,511,,,,1077,1077,1077.0,EEABGJJD_00052,EEABGJJD_00052,EEABGJJD_00052,EEABGJJD_00052,EEABGJJD_00052 +group_260,,hypothetical protein,5,5,1.0,1,510,,,,1080,1080,1080.0,EEABGJJD_01796,EEABGJJD_01796,EEABGJJD_01796,EEABGJJD_01796,EEABGJJD_01796 +group_259,,MmcQ/YjbR family DNA-binding protein,5,5,1.0,1,509,,,,1080,1080,1080.0,EEABGJJD_01298,EEABGJJD_01298,EEABGJJD_01298,EEABGJJD_01298,EEABGJJD_01298 +group_258,,23S rRNA (adenine(2503)-C(2))-methyltransferase RlmN,5,5,1.0,1,508,,,,1080,1080,1080.0,EEABGJJD_01282,EEABGJJD_01282,EEABGJJD_01282,EEABGJJD_01282,EEABGJJD_01282 +prfA,prfA,putative peptide chain release factor 1,5,5,1.0,1,507,,,,1080,1080,1080.0,EEABGJJD_00957,EEABGJJD_00957,EEABGJJD_00957,EEABGJJD_00957,EEABGJJD_00957 +murG,murG,putative undecaprenyl-PP-MurNAc-pentapeptide-UDPGlcNAc GlcNAc transferase,5,5,1.0,1,506,,,,1083,1083,1083.0,EEABGJJD_01273,EEABGJJD_01273,EEABGJJD_01273,EEABGJJD_01273,EEABGJJD_01273 +carA,carA,putative carbamoyl phosphate synthetase small subunit,5,5,1.0,1,505,,,,1083,1083,1083.0,EEABGJJD_00690,EEABGJJD_00690,EEABGJJD_00690,EEABGJJD_00690,EEABGJJD_00690 +deaD2,deaD2,putative RNA helicase,5,5,1.0,1,504,,,,1086,1086,1086.0,EEABGJJD_01146,EEABGJJD_01146,EEABGJJD_01146,EEABGJJD_01146,EEABGJJD_01146 +pepQ,pepQ,putative XAA-PRO dipeptidase; X-PRO 
dipeptidase,5,5,1.0,1,503,,,,1086,1086,1086.0,EEABGJJD_00432,EEABGJJD_00432,EEABGJJD_00432,EEABGJJD_00432,EEABGJJD_00432 +gldA,gldA,putative glycerol dehydrogenase,5,5,1.0,1,502,,,,1089,1089,1089.0,EEABGJJD_01692,EEABGJJD_01692,EEABGJJD_01692,EEABGJJD_01692,EEABGJJD_01692 +group_257,,putative zinc-containing alcohol dehydrogenase,5,5,1.0,1,501,,,,1092,1092,1092.0,EEABGJJD_00930,EEABGJJD_00930,EEABGJJD_00930,EEABGJJD_00930,EEABGJJD_00930 +group_256,,ATP-binding protein,5,5,1.0,1,500,,,,1092,1092,1092.0,EEABGJJD_00550,EEABGJJD_00550,EEABGJJD_00550,EEABGJJD_00550,EEABGJJD_00550 +group_255,,L-ascorbate 6-phosphate lactonase,5,5,1.0,1,499,,,,1092,1092,1092.0,EEABGJJD_00186,EEABGJJD_00186,EEABGJJD_00186,EEABGJJD_00186,EEABGJJD_00186 +group_254,,DNA polymerase IV,5,5,1.0,1,498,,,,1095,1095,1095.0,EEABGJJD_01535,EEABGJJD_01535,EEABGJJD_01535,EEABGJJD_01535,EEABGJJD_01535 +group_253,,hypothetical protein,5,5,1.0,1,497,,,,1095,1095,1095.0,EEABGJJD_01114,EEABGJJD_01114,EEABGJJD_01114,EEABGJJD_01114,EEABGJJD_01114 +alr,alr,putative alanine racemase,5,5,1.0,1,496,,,,1101,1101,1101.0,EEABGJJD_01502,EEABGJJD_01502,EEABGJJD_01502,EEABGJJD_01502,EEABGJJD_01502 +recF,recF,RecF protein,5,5,1.0,1,495,,,,1107,1107,1107.0,EEABGJJD_01822,EEABGJJD_01822,EEABGJJD_01822,EEABGJJD_01822,EEABGJJD_01822 +msrA,msrA,putative peptide methionine sulfoxide reductase,5,5,1.0,1,494,,,,1107,1107,1107.0,EEABGJJD_01301,EEABGJJD_01301,EEABGJJD_01301,EEABGJJD_01301,EEABGJJD_01301 +group_252,,FAD-dependent oxidoreductase,5,5,1.0,1,493,,,,1107,1107,1107.0,EEABGJJD_00777,EEABGJJD_00777,EEABGJJD_00777,EEABGJJD_00777,EEABGJJD_00777 +group_251,,nucleotidyltransferase,5,5,1.0,1,492,,,,1107,1107,1107.0,EEABGJJD_00290,EEABGJJD_00290,EEABGJJD_00290,EEABGJJD_00290,EEABGJJD_00290 +rpoD,rpoD,putative RNA polymerase sigma 42 protein,5,5,1.0,1,491,,,,1110,1110,1110.0,EEABGJJD_00649,EEABGJJD_00649,EEABGJJD_00649,EEABGJJD_00649,EEABGJJD_00649 +hylP3,hylP3,hyaluronidase - phage 
associated,5,5,1.0,1,490,,,,1113,1113,1113.0,EEABGJJD_01203,EEABGJJD_01203,EEABGJJD_01203,EEABGJJD_01203,EEABGJJD_01203 +group_250,,putative GTP-binding protein,5,5,1.0,1,489,,,,1116,1116,1116.0,EEABGJJD_00004,EEABGJJD_00004,EEABGJJD_00004,EEABGJJD_00004,EEABGJJD_00004 +hylP2,hylP2,hyaluronidase - phage associated,5,5,1.0,1,488,,,,1119,1119,1119.0,EEABGJJD_00834,EEABGJJD_00834,EEABGJJD_00834,EEABGJJD_00834,EEABGJJD_00834 +group_249,,putative ABC transporter (ATP-binding protein),5,5,1.0,1,487,,,,1119,1119,1119.0,EEABGJJD_00619,EEABGJJD_00619,EEABGJJD_00619,EEABGJJD_00619,EEABGJJD_00619 +group_248,,ribosome biogenesis GTPase YqeH,5,5,1.0,1,486,,,,1119,1119,1119.0,EEABGJJD_00284,EEABGJJD_00284,EEABGJJD_00284,EEABGJJD_00284,EEABGJJD_00284 +group_247,,putative tRNA-(5-methylaminomethyl-2-thiouridylate),5,5,1.0,1,485,,,,1122,1122,1122.0,EEABGJJD_01807,EEABGJJD_01807,EEABGJJD_01807,EEABGJJD_01807,EEABGJJD_01807 +group_246,,putative decarboxylase beta subunit,5,5,1.0,1,484,,,,1122,1122,1122.0,EEABGJJD_00995,EEABGJJD_00995,EEABGJJD_00995,EEABGJJD_00995,EEABGJJD_00995 +group_245,,tRNA epoxyqueuosine(34) reductase QueG,5,5,1.0,1,483,,,,1125,1125,1125.0,EEABGJJD_00529,EEABGJJD_00529,EEABGJJD_00529,EEABGJJD_00529,EEABGJJD_00529 +group_244,,putative choline binding protein,5,5,1.0,1,482,,,,1125,1125,1125.0,EEABGJJD_00049,EEABGJJD_00049,EEABGJJD_00049,EEABGJJD_00049,EEABGJJD_00049 +group_243,,putative iron-sulfur cofactor synthesis protein,5,5,1.0,1,481,,,,1128,1128,1128.0,EEABGJJD_00939,EEABGJJD_00939,EEABGJJD_00939,EEABGJJD_00939,EEABGJJD_00939 +group_242,,AI-2E family transporter,5,5,1.0,1,480,,,,1128,1128,1128.0,EEABGJJD_00934,EEABGJJD_00934,EEABGJJD_00934,EEABGJJD_00934,EEABGJJD_00934 +sagH,sagH,ABC transporter (ATP-binding protein) - streptolysin S associated ORF,5,5,1.0,1,479,,,,1128,1128,1128.0,EEABGJJD_00618,EEABGJJD_00618,EEABGJJD_00618,EEABGJJD_00618,EEABGJJD_00618 +group_241,,putative methylmalonyl-CoA decarboxylase 
beta-subunit,5,5,1.0,1,478,,,,1131,1131,1131.0,EEABGJJD_00989,EEABGJJD_00989,EEABGJJD_00989,EEABGJJD_00989,EEABGJJD_00989 +msmK,msmK,multiple sugar-binding ABC transport system (ATP-binding protein),5,5,1.0,1,477,,,,1134,1134,1134.0,EEABGJJD_01641,EEABGJJD_01641,EEABGJJD_01641,EEABGJJD_01641,EEABGJJD_01641 +recA,recA,recombination protein,5,5,1.0,1,476,,,,1137,1137,1137.0,EEABGJJD_01748,EEABGJJD_01748,EEABGJJD_01748,EEABGJJD_01748,EEABGJJD_01748 +dnaJ,dnaJ,molecular chaperone DnaJ,5,5,1.0,1,475,,,,1137,1137,1137.0,EEABGJJD_01470,EEABGJJD_01470,EEABGJJD_01470,EEABGJJD_01470,EEABGJJD_01470 +dnaN,dnaN,beta subunit of DNA polymerase III,5,5,1.0,1,474,,,,1137,1137,1137.0,EEABGJJD_00002,EEABGJJD_00002,EEABGJJD_00002,EEABGJJD_00002,EEABGJJD_00002 +group_240,,YSIRK signal domain/LPXTG anchor domain surface protein,5,5,1.0,1,473,,,,1140,1140,1140.0,EEABGJJD_01669,EEABGJJD_01669,EEABGJJD_01669,EEABGJJD_01669,EEABGJJD_01669 +int3,int3,putative integrase - phage associated,5,5,1.0,1,472,,,,1140,1140,1140.0,EEABGJJD_00781,EEABGJJD_00781,EEABGJJD_00781,EEABGJJD_00781,EEABGJJD_00781 +group_239,,glycerate kinase,5,5,1.0,1,471,,,,1143,1143,1143.0,EEABGJJD_01586,EEABGJJD_01586,EEABGJJD_01586,EEABGJJD_01586,EEABGJJD_01586 +int2,int.2,integrase,5,5,1.0,1,470,,,,1143,1143,1143.0,EEABGJJD_01243,EEABGJJD_01243,EEABGJJD_01243,EEABGJJD_01243,EEABGJJD_01243 +group_238,,putative iron-sulfur cofactor synthesis protein,5,5,1.0,1,469,,,,1143,1143,1143.0,EEABGJJD_00678,EEABGJJD_00678,EEABGJJD_00678,EEABGJJD_00678,EEABGJJD_00678 +tgt,tgt,putative tRNA-guanine transglycosylase,5,5,1.0,1,468,,,,1143,1143,1143.0,EEABGJJD_00203,EEABGJJD_00203,EEABGJJD_00203,EEABGJJD_00203,EEABGJJD_00203 +int4,int4,putative integrase - phage associated,5,5,1.0,1,467,,,,1146,1146,1146.0,EEABGJJD_01754,EEABGJJD_01754,EEABGJJD_01754,EEABGJJD_01754,EEABGJJD_01754 +nusA,nusA,transcription termination-antitermination 
factor,5,5,1.0,1,466,,,,1146,1146,1146.0,EEABGJJD_01438,EEABGJJD_01438,EEABGJJD_01438,EEABGJJD_01438,EEABGJJD_01438 +nagA,nagA,putative N-acetylglucosamine-6-phosphate deacetylase,5,5,1.0,1,465,,,,1149,1149,1149.0,EEABGJJD_01414,EEABGJJD_01414,EEABGJJD_01414,EEABGJJD_01414,EEABGJJD_01414 +group_237,,cell division protein FtsQ/DivIB,5,5,1.0,1,464,,,,1149,1149,1149.0,EEABGJJD_01272,EEABGJJD_01272,EEABGJJD_01272,EEABGJJD_01272,EEABGJJD_01272 +group_236,,putative acetyl-CoA c-acetyltransferase,5,5,1.0,1,463,,,,1149,1149,1149.0,EEABGJJD_00440,EEABGJJD_00440,EEABGJJD_00440,EEABGJJD_00440,EEABGJJD_00440 +group_235,,A/G-specific adenine glycosylase,5,5,1.0,1,462,,,,1155,1155,1155.0,EEABGJJD_01524,EEABGJJD_01524,EEABGJJD_01524,EEABGJJD_01524,EEABGJJD_01524 +group_234,,class I SAM-dependent RNA methyltransferase,5,5,1.0,1,461,,,,1155,1155,1155.0,EEABGJJD_01376,EEABGJJD_01376,EEABGJJD_01376,EEABGJJD_01376,EEABGJJD_01376 +potA,potA,putative spermidine / putrescine ABC transporter (ATP-binding protein),5,5,1.0,1,460,,,,1155,1155,1155.0,EEABGJJD_00922,EEABGJJD_00922,EEABGJJD_00922,EEABGJJD_00922,EEABGJJD_00922 +group_233,,hypothetical protein,5,5,1.0,1,459,,,,1155,1155,1155.0,EEABGJJD_00652,EEABGJJD_00652,EEABGJJD_00652,EEABGJJD_00652,EEABGJJD_00652 +group_232,,putative nucleotide sugar dehydrogenase,5,5,1.0,1,458,,,,1158,1158,1158.0,EEABGJJD_00456,EEABGJJD_00456,EEABGJJD_00456,EEABGJJD_00456,EEABGJJD_00456 +group_231,,class I SAM-dependent rRNA methyltransferase,5,5,1.0,1,457,,,,1164,1164,1164.0,EEABGJJD_00671,EEABGJJD_00671,EEABGJJD_00671,EEABGJJD_00671,EEABGJJD_00671 +group_230,,putative malic enzyme ((S)-malate:NAD+ oxidoreductase (decarboxylating)),5,5,1.0,1,456,,,,1167,1167,1167.0,EEABGJJD_00929,EEABGJJD_00929,EEABGJJD_00929,EEABGJJD_00929,EEABGJJD_00929 +aroF,aroF,putative chorismate synthase,5,5,1.0,1,455,,,,1167,1167,1167.0,EEABGJJD_00673,EEABGJJD_00673,EEABGJJD_00673,EEABGJJD_00673,EEABGJJD_00673 +group_229,,hypothetical 
protein,5,5,1.0,1,454,,,,1167,1167,1167.0,EEABGJJD_00563,EEABGJJD_00563,EEABGJJD_00563,EEABGJJD_00563,EEABGJJD_00563 +group_228,,NAD(P)/FAD-dependent oxidoreductase,5,5,1.0,1,453,,,,1170,1170,1170.0,EEABGJJD_01550,EEABGJJD_01550,EEABGJJD_01550,EEABGJJD_01550,EEABGJJD_01550 +group_227,,LysM peptidoglycan-binding domain-containing protein,5,5,1.0,1,452,,,,1170,1170,1170.0,EEABGJJD_00397,EEABGJJD_00397,EEABGJJD_00397,EEABGJJD_00397,EEABGJJD_00397 +rgpG,rgpG,possibly involved in regulation of genetic competence,5,5,1.0,1,451,,,,1170,1170,1170.0,EEABGJJD_00264,EEABGJJD_00264,EEABGJJD_00264,EEABGJJD_00264,EEABGJJD_00264 +group_226,,chromosome replication initiation protein,5,5,1.0,1,450,,,,1173,1173,1173.0,EEABGJJD_00307,EEABGJJD_00307,EEABGJJD_00307,EEABGJJD_00307,EEABGJJD_00307 +mvaS2,mvaS.2,putative 3-hydroxy-3-methylglutaryl-coenzyme A synthase (HMG-CoA synthase),5,5,1.0,1,449,,,,1176,1176,1176.0,EEABGJJD_00729,EEABGJJD_00729,EEABGJJD_00729,EEABGJJD_00729,EEABGJJD_00729 +group_225,,AI-2E family transporter,5,5,1.0,1,448,,,,1182,1182,1182.0,EEABGJJD_00845,EEABGJJD_00845,EEABGJJD_00845,EEABGJJD_00845,EEABGJJD_00845 +lctO,lctO,putative lactate oxidase,5,5,1.0,1,447,,,,1182,1182,1182.0,EEABGJJD_00360,EEABGJJD_00360,EEABGJJD_00360,EEABGJJD_00360,EEABGJJD_00360 +group_224,,D-alanyl-D-alanine carboxypeptidase,5,5,1.0,1,446,,,,1182,1182,1182.0,EEABGJJD_00270,EEABGJJD_00270,EEABGJJD_00270,EEABGJJD_00270,EEABGJJD_00270 +group_223,,putative acetyl-CoA acetyltransferase,5,5,1.0,1,445,,,,1185,1185,1185.0,EEABGJJD_00150,EEABGJJD_00150,EEABGJJD_00150,EEABGJJD_00150,EEABGJJD_00150 +atoB,atoB,putative acetyl-CoA:acetyltransferase,5,5,1.0,1,444,,,,1188,1188,1188.0,EEABGJJD_01369,EEABGJJD_01369,EEABGJJD_01369,EEABGJJD_01369,EEABGJJD_01369 +group_222,,hypothetical protein,5,5,1.0,1,443,,,,1188,1188,1188.0,EEABGJJD_01169,EEABGJJD_01169,EEABGJJD_01169,EEABGJJD_01169,EEABGJJD_01169 +group_221,,aspartate 
aminotransferase,5,5,1.0,1,442,,,,1194,1194,1194.0,EEABGJJD_00537,EEABGJJD_00537,EEABGJJD_00537,EEABGJJD_00537,EEABGJJD_00537 +group_220,,putative multi-drug resistance efflux pump,5,5,1.0,1,441,,,,1194,1194,1194.0,EEABGJJD_00421,EEABGJJD_00421,EEABGJJD_00421,EEABGJJD_00421,EEABGJJD_00421 +group_219,,peptidase C10,5,5,1.0,1,440,,,,1197,1197,1197.0,EEABGJJD_01687,EEABGJJD_01687,EEABGJJD_01687,EEABGJJD_01687,EEABGJJD_01687 +pgk,pgk,putative phosphoglycerate kinase,5,5,1.0,1,439,,,,1197,1197,1197.0,EEABGJJD_01564,EEABGJJD_01564,EEABGJJD_01564,EEABGJJD_01564,EEABGJJD_01564 +norA,norA,putative antibiotic resistance protein NorA,5,5,1.0,1,438,,,,1197,1197,1197.0,EEABGJJD_01541,EEABGJJD_01541,EEABGJJD_01541,EEABGJJD_01541,EEABGJJD_01541 +group_218,,putative oxalate:formate antiporter,5,5,1.0,1,437,,,,1197,1197,1197.0,EEABGJJD_01163,EEABGJJD_01163,EEABGJJD_01163,EEABGJJD_01163,EEABGJJD_01163 +metK,metK,S-adenosylmethionine synthetase,5,5,1.0,1,436,,,,1197,1197,1197.0,EEABGJJD_01138,EEABGJJD_01138,EEABGJJD_01138,EEABGJJD_01138,EEABGJJD_01138 +hemN,hemN,putative coproporphyrinogen III oxidase,5,5,1.0,1,435,,,,1197,1197,1197.0,EEABGJJD_00868,EEABGJJD_00868,EEABGJJD_00868,EEABGJJD_00868,EEABGJJD_00868 +tufA,tufA,putative translation elongation factor EF-Tu,5,5,1.0,1,434,,,,1197,1197,1197.0,EEABGJJD_00509,EEABGJJD_00509,EEABGJJD_00509,EEABGJJD_00509,EEABGJJD_00509 +opuAA,opuAA,putative glycine betaine/proline ABC transporter (ATP-binding protein),5,5,1.0,1,433,,,,1197,1197,1197.0,EEABGJJD_00187,EEABGJJD_00187,EEABGJJD_00187,EEABGJJD_00187,EEABGJJD_00187 +ackA,ackA,acetate kinase,5,5,1.0,1,432,,,,1197,1197,1197.0,EEABGJJD_00126,EEABGJJD_00126,EEABGJJD_00126,EEABGJJD_00126,EEABGJJD_00126 +group_217,,CHAP domain-containing protein,5,5,1.0,1,431,,,,1197,1197,1197.0,EEABGJJD_00038,EEABGJJD_00038,EEABGJJD_00038,EEABGJJD_00038,EEABGJJD_00038 +hsdS,hsdS,putative type I site-specific 
deoxyribonuclease,5,5,1.0,1,430,,,,1200,1200,1200.0,EEABGJJD_01588,EEABGJJD_01588,EEABGJJD_01588,EEABGJJD_01588,EEABGJJD_01588 +group_216,,putative trimethylamine dehydrogenase,5,5,1.0,1,429,,,,1200,1200,1200.0,EEABGJJD_01024,EEABGJJD_01024,EEABGJJD_01024,EEABGJJD_01024,EEABGJJD_01024 +agaS,agaS,putative tagatose-6-phosphate aldose/ketose isomerase,5,5,1.0,1,428,,,,1200,1200,1200.0,EEABGJJD_00594,EEABGJJD_00594,EEABGJJD_00594,EEABGJJD_00594,EEABGJJD_00594 +ugl,ugl,putative unsaturated glucuronyl hydrolase,5,5,1.0,1,427,,,,1200,1200,1200.0,EEABGJJD_00522,EEABGJJD_00522,EEABGJJD_00522,EEABGJJD_00522,EEABGJJD_00522 +group_215,,putative S-adenosylmethionine synthetase,5,5,1.0,1,426,,,,1200,1200,1200.0,EEABGJJD_00453,EEABGJJD_00453,EEABGJJD_00453,EEABGJJD_00453,EEABGJJD_00453 +nupC,nupC,putative nucleoside transporter,5,5,1.0,1,425,,,,1203,1203,1203.0,EEABGJJD_01552,EEABGJJD_01552,EEABGJJD_01552,EEABGJJD_01552,EEABGJJD_01552 +group_214,,hypothetical protein,5,5,1.0,1,424,,,,1203,1203,1203.0,EEABGJJD_00588,EEABGJJD_00588,EEABGJJD_00588,EEABGJJD_00588,EEABGJJD_00588 +group_213,,transposase - IS1562,5,5,1.0,1,423,,,,1206,1206,1206.0,EEABGJJD_01671,EEABGJJD_01671,EEABGJJD_01671,EEABGJJD_01671,EEABGJJD_01671 +phiMGAS50052_14,phiMGAS5005.2_14,phage-associated cell wall hydrolase,5,5,1.0,1,422,,,,1206,1206,1206.0,EEABGJJD_01196,EEABGJJD_01196,EEABGJJD_01196,EEABGJJD_01196,EEABGJJD_01196 +group_212,,D-alanyl-D-alanine carboxypeptidase,5,5,1.0,1,421,,,,1206,1206,1206.0,EEABGJJD_00914,EEABGJJD_00914,EEABGJJD_00914,EEABGJJD_00914,EEABGJJD_00914 +group_211,,30S ribosomal protein S1,5,5,1.0,1,420,,,,1206,1206,1206.0,EEABGJJD_00759,EEABGJJD_00759,EEABGJJD_00759,EEABGJJD_00759,EEABGJJD_00759 +rgpDc,rgpDc,ABC-transporter (ATP-binding protein) - possibly involved in cell wall localization and side chain formation of rhamnose-glucose polysaccharide,5,5,1.0,1,419,,,,1206,1206,1206.0,EEABGJJD_00655,EEABGJJD_00655,EEABGJJD_00655,EEABGJJD_00655,EEABGJJD_00655 +hasB,hasB,UDP-glucose 
6-dehydrogenase,5,5,1.0,1,418,,,,1209,1209,1209.0,EEABGJJD_01819,EEABGJJD_01819,EEABGJJD_01819,EEABGJJD_01819,EEABGJJD_01819 +group_210,,CCA-adding enzyme,5,5,1.0,1,417,,,,1209,1209,1209.0,EEABGJJD_00717,EEABGJJD_00717,EEABGJJD_00717,EEABGJJD_00717,EEABGJJD_00717 +group_209,,putative efflux protein,5,5,1.0,1,416,,,,1209,1209,1209.0,EEABGJJD_00457,EEABGJJD_00457,EEABGJJD_00457,EEABGJJD_00457,EEABGJJD_00457 +deoB,deoB,putative phosphopentomutase,5,5,1.0,1,415,,,,1212,1212,1212.0,EEABGJJD_00738,EEABGJJD_00738,EEABGJJD_00738,EEABGJJD_00738,EEABGJJD_00738 +phiNCTC81984_1,phiNCTC8198.4_1,PBSX family phage terminase large subunit,5,5,1.0,1,414,,,,1212,1212,1212.0,EEABGJJD_00565,EEABGJJD_00565,EEABGJJD_00565,EEABGJJD_00565,EEABGJJD_00565 +thiI,thiI,putative thiamine biosynthesis protein,5,5,1.0,1,413,,,,1215,1215,1215.0,EEABGJJD_00679,EEABGJJD_00679,EEABGJJD_00679,EEABGJJD_00679,EEABGJJD_00679 +group_208,,putative sodium/dicarboxylate symporter,5,5,1.0,1,412,,,,1215,1215,1215.0,EEABGJJD_00297,EEABGJJD_00297,EEABGJJD_00297,EEABGJJD_00297,EEABGJJD_00297 +group_207,,putative ABC transporter (ATP-binding protein),5,5,1.0,1,411,,,,1218,1218,1218.0,EEABGJJD_01679,EEABGJJD_01679,EEABGJJD_01679,EEABGJJD_01679,EEABGJJD_01679 +lmrP,lmrP,putative integral membrane protein,5,5,1.0,1,410,,,,1221,1221,1221.0,EEABGJJD_01752,EEABGJJD_01752,EEABGJJD_01752,EEABGJJD_01752,EEABGJJD_01752 +group_206,,MFS transporter,5,5,1.0,1,409,,,,1221,1221,1221.0,EEABGJJD_01124,EEABGJJD_01124,EEABGJJD_01124,EEABGJJD_01124,EEABGJJD_01124 +group_205,,ABC transporter permease,5,5,1.0,1,408,,,,1221,1221,1221.0,EEABGJJD_00694,EEABGJJD_00694,EEABGJJD_00694,EEABGJJD_00694,EEABGJJD_00694 +group_204,,putative protein involved in cytokinesis contains TGc (transglutaminase/protease-like) domain,5,5,1.0,1,407,,,,1221,1221,1221.0,EEABGJJD_00208,EEABGJJD_00208,EEABGJJD_00208,EEABGJJD_00208,EEABGJJD_00208 +htrA,htrA,putative serine 
protease,5,5,1.0,1,406,,,,1224,1224,1224.0,EEABGJJD_01834,EEABGJJD_01834,EEABGJJD_01834,EEABGJJD_01834,EEABGJJD_01834 +group_203,,aminoacyltransferase,5,5,1.0,1,405,,,,1224,1224,1224.0,EEABGJJD_01011,EEABGJJD_01011,EEABGJJD_01011,EEABGJJD_01011,EEABGJJD_01011 +pepT,pepT,putative tripeptidase,5,5,1.0,1,404,,,,1224,1224,1224.0,EEABGJJD_00663,EEABGJJD_00663,EEABGJJD_00663,EEABGJJD_00663,EEABGJJD_00663 +murM,murM,putative peptidoglycan branched peptide synthesis protein serine/alanine adding enzyme,5,5,1.0,1,403,,,,1227,1227,1227.0,EEABGJJD_00512,EEABGJJD_00512,EEABGJJD_00512,EEABGJJD_00512,EEABGJJD_00512 +group_202,,putative aminotransferase,5,5,1.0,1,402,,,,1227,1227,1227.0,EEABGJJD_00267,EEABGJJD_00267,EEABGJJD_00267,EEABGJJD_00267,EEABGJJD_00267 +clpX,clpX,putative ATP-dependent Clp protease subunit X,5,5,1.0,1,401,,,,1230,1230,1230.0,EEABGJJD_00733,EEABGJJD_00733,EEABGJJD_00733,EEABGJJD_00733,EEABGJJD_00733 +fabF,fabF,putative beta-ketoacyl-ACP synthase II,5,5,1.0,1,400,,,,1233,1233,1233.0,EEABGJJD_01462,EEABGJJD_01462,EEABGJJD_01462,EEABGJJD_01462,EEABGJJD_01462 +group_201,,putative two-component sensory transduction histidine kinase,5,5,1.0,1,399,,,,1233,1233,1233.0,EEABGJJD_00723,EEABGJJD_00723,EEABGJJD_00723,EEABGJJD_00723,EEABGJJD_00723 +dacA,dacA,penicillin-binding protein (D-alanyl-D-alanine carboxypeptidase),5,5,1.0,1,398,,,,1233,1233,1233.0,EEABGJJD_00271,EEABGJJD_00271,EEABGJJD_00271,EEABGJJD_00271,EEABGJJD_00271 +group_200,,PTS ascorbate transporter subunit IIC,5,5,1.0,1,397,,,,1236,1236,1236.0,EEABGJJD_01624,EEABGJJD_01624,EEABGJJD_01624,EEABGJJD_01624,EEABGJJD_01624 +sagP,sagP,streptococcal antitumor protein,5,5,1.0,1,396,,,,1236,1236,1236.0,EEABGJJD_01294,EEABGJJD_01294,EEABGJJD_01294,EEABGJJD_01294,EEABGJJD_01294 +group_199,,cation transporter,5,5,1.0,1,395,,,,1236,1236,1236.0,EEABGJJD_01068,EEABGJJD_01068,EEABGJJD_01068,EEABGJJD_01068,EEABGJJD_01068 +murN,murN,putative peptidoglycan branched peptide synthesis protein alanine adding 
enzyme,5,5,1.0,1,394,,,,1236,1236,1236.0,EEABGJJD_00511,EEABGJJD_00511,EEABGJJD_00511,EEABGJJD_00511,EEABGJJD_00511 +group_198,,putative aminotransferase,5,5,1.0,1,393,,,,1239,1239,1239.0,EEABGJJD_01484,EEABGJJD_01484,EEABGJJD_01484,EEABGJJD_01484,EEABGJJD_01484 +group_197,,putative GTP-binding protein,5,5,1.0,1,392,,,,1239,1239,1239.0,EEABGJJD_00767,EEABGJJD_00767,EEABGJJD_00767,EEABGJJD_00767,EEABGJJD_00767 +group_196,,insulinase family protein,5,5,1.0,1,391,,,,1245,1245,1245.0,EEABGJJD_01817,EEABGJJD_01817,EEABGJJD_01817,EEABGJJD_01817,EEABGJJD_01817 +group_195,,putative maltose/maltodextrin-binding protein,5,5,1.0,1,390,,,,1248,1248,1248.0,EEABGJJD_01088,EEABGJJD_01088,EEABGJJD_01088,EEABGJJD_01088,EEABGJJD_01088 +group_194,,tetratricopeptide repeat protein,5,5,1.0,1,389,,,,1248,1248,1248.0,EEABGJJD_00846,EEABGJJD_00846,EEABGJJD_00846,EEABGJJD_00846,EEABGJJD_00846 +group_193,,ATP-grasp domain-containing protein,5,5,1.0,1,388,,,,1248,1248,1248.0,EEABGJJD_00471,EEABGJJD_00471,EEABGJJD_00471,EEABGJJD_00471,EEABGJJD_00471 +group_192,,long-chain fatty acid--CoA ligase,5,5,1.0,1,387,,,,1248,1248,1248.0,EEABGJJD_00441,EEABGJJD_00441,EEABGJJD_00441,EEABGJJD_00441,EEABGJJD_00441 +proA,proA,putative gamma-glutamyl phosphate reductase,5,5,1.0,1,386,,,,1251,1251,1251.0,EEABGJJD_01396,EEABGJJD_01396,EEABGJJD_01396,EEABGJJD_01396,EEABGJJD_01396 +dltD,dltD,putative extramembranal protein,5,5,1.0,1,385,,,,1251,1251,1251.0,EEABGJJD_01099,EEABGJJD_01099,EEABGJJD_01099,EEABGJJD_01099,EEABGJJD_01099 +dltB,dltB,putative integral membrane protein,5,5,1.0,1,384,,,,1257,1257,1257.0,EEABGJJD_01101,EEABGJJD_01101,EEABGJJD_01101,EEABGJJD_01101,EEABGJJD_01101 +glyA,glyA,putative serine hydroxymethyltransferase,5,5,1.0,1,383,,,,1257,1257,1257.0,EEABGJJD_00961,EEABGJJD_00961,EEABGJJD_00961,EEABGJJD_00961,EEABGJJD_00961 +group_191,,tyrosine--tRNA ligase,5,5,1.0,1,382,,,,1257,1257,1257.0,EEABGJJD_00113,EEABGJJD_00113,EEABGJJD_00113,EEABGJJD_00113,EEABGJJD_00113 +group_190,,hyaluronan 
synthase,5,5,1.0,1,381,,,,1260,1260,1260.0,EEABGJJD_01818,EEABGJJD_01818,EEABGJJD_01818,EEABGJJD_01818,EEABGJJD_01818 +group_189,,may be involved in production of a peptide sex pheromone,5,5,1.0,1,380,,,,1260,1260,1260.0,EEABGJJD_01634,EEABGJJD_01634,EEABGJJD_01634,EEABGJJD_01634,EEABGJJD_01634 +murZ,murZ,putative UDP-N-acetylglucosamine 1-carboxyvinyltransferase,5,5,1.0,1,379,,,,1260,1260,1260.0,EEABGJJD_01137,EEABGJJD_01137,EEABGJJD_01137,EEABGJJD_01137,EEABGJJD_01137 +group_188,,extracellular solute-binding protein,5,5,1.0,1,378,,,,1260,1260,1260.0,EEABGJJD_01097,EEABGJJD_01097,EEABGJJD_01097,EEABGJJD_01097,EEABGJJD_01097 +pyrP,pyrP,putative uracil permease,5,5,1.0,1,377,,,,1260,1260,1260.0,EEABGJJD_00688,EEABGJJD_00688,EEABGJJD_00688,EEABGJJD_00688,EEABGJJD_00688 +group_187,,putative transposase,5,5,1.0,1,376,,,,1263,1263,1263.0,EEABGJJD_00599,EEABGJJD_00599,EEABGJJD_00599,EEABGJJD_00599,EEABGJJD_00599 +group_186,,Fe-S cluster assembly protein SufD,5,5,1.0,1,375,,,,1263,1263,1263.0,EEABGJJD_00266,EEABGJJD_00266,EEABGJJD_00266,EEABGJJD_00266,EEABGJJD_00266 +group_185,,putative toxic anion resistance protein,5,5,1.0,1,374,,,,1263,1263,1263.0,EEABGJJD_00165,EEABGJJD_00165,EEABGJJD_00165,EEABGJJD_00165,EEABGJJD_00165 +hutI,hutI,putative imidazolonepropionase (imidazolone-5-propionate hydrolase),5,5,1.0,1,373,,,,1266,1266,1266.0,EEABGJJD_01718,EEABGJJD_01718,EEABGJJD_01718,EEABGJJD_01718,EEABGJJD_01718 +purD,purD,phosphoribosylamine-glycine ligase,5,5,1.0,1,372,,,,1266,1266,1266.0,EEABGJJD_00050,EEABGJJD_00050,EEABGJJD_00050,EEABGJJD_00050,EEABGJJD_00050 +group_184,,putative ATP-binding cassette transporter-like protein,5,5,1.0,1,371,,,,1269,1269,1269.0,EEABGJJD_01681,EEABGJJD_01681,EEABGJJD_01681,EEABGJJD_01681,EEABGJJD_01681 +group_183,,replication-associated recombination protein A,5,5,1.0,1,370,,,,1269,1269,1269.0,EEABGJJD_01655,EEABGJJD_01655,EEABGJJD_01655,EEABGJJD_01655,EEABGJJD_01655 +phiMGAS50052_36,phiMGAS5005.2_36,phage 
protein,5,5,1.0,1,369,,,,1269,1269,1269.0,EEABGJJD_01222,EEABGJJD_01222,EEABGJJD_01222,EEABGJJD_01222,EEABGJJD_01222 +group_182,,PLP-dependent aminotransferase family protein,5,5,1.0,1,368,,,,1269,1269,1269.0,EEABGJJD_01015,EEABGJJD_01015,EEABGJJD_01015,EEABGJJD_01015,EEABGJJD_01015 +pyrC,pyrC,putative dihydroorotase,5,5,1.0,1,367,,,,1269,1269,1269.0,EEABGJJD_00751,EEABGJJD_00751,EEABGJJD_00751,EEABGJJD_00751,EEABGJJD_00751 +group_181,,bifunctional folylpolyglutamate synthase/dihydrofolate synthase,5,5,1.0,1,366,,,,1269,1269,1269.0,EEABGJJD_00676,EEABGJJD_00676,EEABGJJD_00676,EEABGJJD_00676,EEABGJJD_00676 +cinA,cinA,putative competence-damage protein,5,5,1.0,1,365,,,,1272,1272,1272.0,EEABGJJD_01749,EEABGJJD_01749,EEABGJJD_01749,EEABGJJD_01749,EEABGJJD_01749 +group_180,,ABC transporter substrate-binding protein,5,5,1.0,1,364,,,,1272,1272,1272.0,EEABGJJD_00692,EEABGJJD_00692,EEABGJJD_00692,EEABGJJD_00692,EEABGJJD_00692 +group_179,,putative UDP-N-acetylglucosamine 1-carboxyvinyltransferase,5,5,1.0,1,363,,,,1272,1272,1272.0,EEABGJJD_00633,EEABGJJD_00633,EEABGJJD_00633,EEABGJJD_00633,EEABGJJD_00633 +group_178,,DNA recombination protein RmuC,5,5,1.0,1,362,,,,1272,1272,1272.0,EEABGJJD_00250,EEABGJJD_00250,EEABGJJD_00250,EEABGJJD_00250,EEABGJJD_00250 +group_177,,LytR family transcriptional regulator,5,5,1.0,1,361,,,,1275,1275,1275.0,EEABGJJD_01447,EEABGJJD_01447,EEABGJJD_01447,EEABGJJD_01447,EEABGJJD_01447 +group_176,,putative deacetylase,5,5,1.0,1,360,,,,1275,1275,1275.0,EEABGJJD_01147,EEABGJJD_01147,EEABGJJD_01147,EEABGJJD_01147,EEABGJJD_01147 +ftsW,ftsW,putative cell division protein,5,5,1.0,1,359,,,,1275,1275,1275.0,EEABGJJD_00508,EEABGJJD_00508,EEABGJJD_00508,EEABGJJD_00508,EEABGJJD_00508 +serS,serS,putative seryl-tRNA synthetase,5,5,1.0,1,358,,,,1278,1278,1278.0,EEABGJJD_01456,EEABGJJD_01456,EEABGJJD_01456,EEABGJJD_01456,EEABGJJD_01456 +folC1,folC.1,putative folyl-polyglutamate 
synthetase,5,5,1.0,1,357,,,,1278,1278,1278.0,EEABGJJD_00916,EEABGJJD_00916,EEABGJJD_00916,EEABGJJD_00916,EEABGJJD_00916 +mvaS1,mvaS.1,putative 3-hydroxy-3-methylglutaryl-coenzyme A,5,5,1.0,1,356,,,,1278,1278,1278.0,EEABGJJD_00728,EEABGJJD_00728,EEABGJJD_00728,EEABGJJD_00728,EEABGJJD_00728 +group_175,,putative protease,5,5,1.0,1,355,,,,1278,1278,1278.0,EEABGJJD_00495,EEABGJJD_00495,EEABGJJD_00495,EEABGJJD_00495,EEABGJJD_00495 +hisS,hisS,putative histidine-tRNA ligase,5,5,1.0,1,354,,,,1281,1281,1281.0,EEABGJJD_01783,EEABGJJD_01783,EEABGJJD_01783,EEABGJJD_01783,EEABGJJD_01783 +ropA,ropA,transcription regulator - (trigger factor (prolyl isomerase)),5,5,1.0,1,353,,,,1284,1284,1284.0,EEABGJJD_01575,EEABGJJD_01575,EEABGJJD_01575,EEABGJJD_01575,EEABGJJD_01575 +group_174,,CBS domain-containing protein,5,5,1.0,1,352,,,,1284,1284,1284.0,EEABGJJD_01134,EEABGJJD_01134,EEABGJJD_01134,EEABGJJD_01134,EEABGJJD_01134 +aroA,aroA,putative 3-phosphoshikimate 1-carboxyvinyltransferase,5,5,1.0,1,351,,,,1284,1284,1284.0,EEABGJJD_01131,EEABGJJD_01131,EEABGJJD_01131,EEABGJJD_01131,EEABGJJD_01131 +group_173,,putative purine permease,5,5,1.0,1,350,,,,1284,1284,1284.0,EEABGJJD_00953,EEABGJJD_00953,EEABGJJD_00953,EEABGJJD_00953,EEABGJJD_00953 +group_172,,putative histidine kinase,5,5,1.0,1,349,,,,1284,1284,1284.0,EEABGJJD_00230,EEABGJJD_00230,EEABGJJD_00230,EEABGJJD_00230,EEABGJJD_00230 +group_171,,metal-independent alpha-mannosidase,5,5,1.0,1,348,,,,1287,1287,1287.0,EEABGJJD_01335,EEABGJJD_01335,EEABGJJD_01335,EEABGJJD_01335,EEABGJJD_01335 +group_170,,lipopolysaccharide biosynthesis protein,5,5,1.0,1,347,,,,1287,1287,1287.0,EEABGJJD_00661,EEABGJJD_00661,EEABGJJD_00661,EEABGJJD_00661,EEABGJJD_00661 +group_169,,putative cell-cycle protein,5,5,1.0,1,346,,,,1287,1287,1287.0,EEABGJJD_00011,EEABGJJD_00011,EEABGJJD_00011,EEABGJJD_00011,EEABGJJD_00011 +group_168,,serine hydrolase,5,5,1.0,1,345,,,,1287,1287,1287.0,EEABGJJD_00010,EEABGJJD_00010,EEABGJJD_00010,EEABGJJD_00010,EEABGJJD_00010 
+group_167,,insulinase family protein,5,5,1.0,1,344,,,,1290,1290,1290.0,EEABGJJD_01816,EEABGJJD_01816,EEABGJJD_01816,EEABGJJD_01816,EEABGJJD_01816 +phiSF3702_4,phiSF370.2_4,putative terminase large subunit - phage associated,5,5,1.0,1,343,,,,1290,1290,1290.0,EEABGJJD_00812,EEABGJJD_00812,EEABGJJD_00812,EEABGJJD_00812,EEABGJJD_00812 +group_166,,metallophosphatase,5,5,1.0,1,342,,,,1293,1293,1293.0,EEABGJJD_00680,EEABGJJD_00680,EEABGJJD_00680,EEABGJJD_00680,EEABGJJD_00680 +purA,purA,putative adenylosuccinate synthetase,5,5,1.0,1,341,,,,1293,1293,1293.0,EEABGJJD_00167,EEABGJJD_00167,EEABGJJD_00167,EEABGJJD_00167,EEABGJJD_00167 +group_165,,MATE family efflux transporter,5,5,1.0,1,340,,,,1293,1293,1293.0,EEABGJJD_00064,EEABGJJD_00064,EEABGJJD_00064,EEABGJJD_00064,EEABGJJD_00064 +purB,purB,adenylosuccinate lyase,5,5,1.0,1,339,,,,1293,1293,1293.0,EEABGJJD_00054,EEABGJJD_00054,EEABGJJD_00054,EEABGJJD_00054,EEABGJJD_00054 +group_164,,HD domain-containing protein,5,5,1.0,1,338,,,,1302,1302,1302.0,EEABGJJD_00514,EEABGJJD_00514,EEABGJJD_00514,EEABGJJD_00514,EEABGJJD_00514 +group_163,,putative PTS system enzyme IIC component,5,5,1.0,1,337,,,,1305,1305,1305.0,EEABGJJD_01695,EEABGJJD_01695,EEABGJJD_01695,EEABGJJD_01695,EEABGJJD_01695 +secY,secY,putative preprotein translocase,5,5,1.0,1,336,,,,1305,1305,1305.0,EEABGJJD_00086,EEABGJJD_00086,EEABGJJD_00086,EEABGJJD_00086,EEABGJJD_00086 +malC,malC,maltodextrin transport system permease,5,5,1.0,1,335,,,,1308,1308,1308.0,EEABGJJD_01094,EEABGJJD_01094,EEABGJJD_01094,EEABGJJD_01094,EEABGJJD_01094 +group_162,,phosphopyruvate hydratase,5,5,1.0,1,334,,,,1308,1308,1308.0,EEABGJJD_00606,EEABGJJD_00606,EEABGJJD_00606,EEABGJJD_00606,EEABGJJD_00606 +group_161,,putative nucleolar protein,5,5,1.0,1,333,,,,1311,1311,1311.0,EEABGJJD_01047,EEABGJJD_01047,EEABGJJD_01047,EEABGJJD_01047,EEABGJJD_01047 +group_160,,putative histidine kinase protein,5,5,1.0,1,332,,,,1311,1311,1311.0,EEABGJJD_01038,EEABGJJD_01038,EEABGJJD_01038,EEABGJJD_01038,EEABGJJD_01038 
+pgdA,pgdA,putative phosphoglycerate dehydrogenase,5,5,1.0,1,331,,,,1311,1311,1311.0,EEABGJJD_00309,EEABGJJD_00309,EEABGJJD_00309,EEABGJJD_00309,EEABGJJD_00309 +group_159,,C4-dicarboxylate ABC transporter,5,5,1.0,1,330,,,,1311,1311,1311.0,EEABGJJD_00198,EEABGJJD_00198,EEABGJJD_00198,EEABGJJD_00198,EEABGJJD_00198 +group_158,,chloride channel protein,5,5,1.0,1,329,,,,1314,1314,1314.0,EEABGJJD_01154,EEABGJJD_01154,EEABGJJD_01154,EEABGJJD_01154,EEABGJJD_01154 +obgE,obgE,GTPase ObgE,5,5,1.0,1,328,,,,1314,1314,1314.0,EEABGJJD_01118,EEABGJJD_01118,EEABGJJD_01118,EEABGJJD_01118,EEABGJJD_01118 +ftsZ,ftsZ,putative cell division protein,5,5,1.0,1,327,,,,1320,1320,1320.0,EEABGJJD_01270,EEABGJJD_01270,EEABGJJD_01270,EEABGJJD_01270,EEABGJJD_01270 +group_157,,chromosome segregation protein SMC,5,5,1.0,1,326,,,,1320,1320,1320.0,EEABGJJD_00549,EEABGJJD_00549,EEABGJJD_00549,EEABGJJD_00549,EEABGJJD_00549 +group_156,,putative sugar transporter sugar binding lipoprotein,5,5,1.0,1,325,,,,1320,1320,1320.0,EEABGJJD_00237,EEABGJJD_00237,EEABGJJD_00237,EEABGJJD_00237,EEABGJJD_00237 +ska,ska,streptokinase A precursor,5,5,1.0,1,324,,,,1323,1323,1323.0,EEABGJJD_01643,EEABGJJD_01643,EEABGJJD_01643,EEABGJJD_01643,EEABGJJD_01643 +sunL,sunL,16S rRNA (cytosine(967)-C(5))-methyltransferase,5,5,1.0,1,323,,,,1323,1323,1323.0,EEABGJJD_01362,EEABGJJD_01362,EEABGJJD_01362,EEABGJJD_01362,EEABGJJD_01362 +group_155,,putative amino acid symporter,5,5,1.0,1,322,,,,1323,1323,1323.0,EEABGJJD_01067,EEABGJJD_01067,EEABGJJD_01067,EEABGJJD_01067,EEABGJJD_01067 +group_154,,transposase,5,5,1.0,1,321,,,,1323,1323,1323.0,EEABGJJD_00142,EEABGJJD_00142,EEABGJJD_00142,EEABGJJD_00142,EEABGJJD_00142 +comFA,comFA,putative late competence protein required for DNA uptake,5,5,1.0,1,320,,,,1326,1326,1326.0,EEABGJJD_01352,EEABGJJD_01352,EEABGJJD_01352,EEABGJJD_01352,EEABGJJD_01352 +murC,murC,putative UDP-N-acetyl muramate-alanine 
ligase,5,5,1.0,1,319,,,,1329,1329,1329.0,EEABGJJD_00312,EEABGJJD_00312,EEABGJJD_00312,EEABGJJD_00312,EEABGJJD_00312 +group_153,,GntP family permease,5,5,1.0,1,318,,,,1332,1332,1332.0,EEABGJJD_01373,EEABGJJD_01373,EEABGJJD_01373,EEABGJJD_01373,EEABGJJD_01373 +group_152,,putative Xaa-His dipeptidase,5,5,1.0,1,317,,,,1332,1332,1332.0,EEABGJJD_01290,EEABGJJD_01290,EEABGJJD_01290,EEABGJJD_01290,EEABGJJD_01290 +malP,malP,putative L-malate permease,5,5,1.0,1,316,,,,1332,1332,1332.0,EEABGJJD_00928,EEABGJJD_00928,EEABGJJD_00928,EEABGJJD_00928,EEABGJJD_00928 +lys,lys,putative lysin - phage associated,5,5,1.0,1,315,,,,1335,1335,1335.0,EEABGJJD_00841,EEABGJJD_00841,EEABGJJD_00841,EEABGJJD_00841,EEABGJJD_00841 +group_151,,putative glucosyl transferase,5,5,1.0,1,314,,,,1335,1335,1335.0,EEABGJJD_00435,EEABGJJD_00435,EEABGJJD_00435,EEABGJJD_00435,EEABGJJD_00435 +hlyX,hlyX,putative hemolysin,5,5,1.0,1,313,,,,1335,1335,1335.0,EEABGJJD_00338,EEABGJJD_00338,EEABGJJD_00338,EEABGJJD_00338,EEABGJJD_00338 +group_150,,phosphoadenosine phosphosulfate reductase,5,5,1.0,1,312,,,,1335,1335,1335.0,EEABGJJD_00193,EEABGJJD_00193,EEABGJJD_00193,EEABGJJD_00193,EEABGJJD_00193 +pepC,pepC,putative cysteine aminopeptidase C,5,5,1.0,1,311,,,,1338,1338,1338.0,EEABGJJD_01382,EEABGJJD_01382,EEABGJJD_01382,EEABGJJD_01382,EEABGJJD_01382 +group_149,,UDP-N-acetylmuramyl peptide synthase,5,5,1.0,1,310,,,,1338,1338,1338.0,EEABGJJD_00863,EEABGJJD_00863,EEABGJJD_00863,EEABGJJD_00863,EEABGJJD_00863 +group_148,,putative glycerol-3-phosphate transporter,5,5,1.0,1,309,,,,1338,1338,1338.0,EEABGJJD_00377,EEABGJJD_00377,EEABGJJD_00377,EEABGJJD_00377,EEABGJJD_00377 +xseA,xseA,putative exodeoxyribonuclease VII (large subunit),5,5,1.0,1,308,,,,1341,1341,1341.0,EEABGJJD_01254,EEABGJJD_01254,EEABGJJD_01254,EEABGJJD_01254,EEABGJJD_01254 +group_147,,hypothetical protein,5,5,1.0,1,307,,,,1341,1341,1341.0,EEABGJJD_01053,EEABGJJD_01053,EEABGJJD_01053,EEABGJJD_01053,EEABGJJD_01053 +group_146,,APC family 
permease,5,5,1.0,1,306,,,,1344,1344,1344.0,EEABGJJD_01724,EEABGJJD_01724,EEABGJJD_01724,EEABGJJD_01724,EEABGJJD_01724 +cysS,cysS,putative cysteinyl-tRNA synthetase,5,5,1.0,1,305,,,,1344,1344,1344.0,EEABGJJD_01618,EEABGJJD_01618,EEABGJJD_01618,EEABGJJD_01618,EEABGJJD_01618 +group_145,,putative ATP-dependent RNA helicase,5,5,1.0,1,304,,,,1344,1344,1344.0,EEABGJJD_01390,EEABGJJD_01390,EEABGJJD_01390,EEABGJJD_01390,EEABGJJD_01390 +group_144,,N-acetylglucosamine-1-phosphate uridyltransferase,5,5,1.0,1,303,,,,1344,1344,1344.0,EEABGJJD_00170,EEABGJJD_00170,EEABGJJD_00170,EEABGJJD_00170,EEABGJJD_00170 +glnA,glnA,putative glutamine synthetase,5,5,1.0,1,302,,,,1347,1347,1347.0,EEABGJJD_01561,EEABGJJD_01561,EEABGJJD_01561,EEABGJJD_01561,EEABGJJD_01561 +group_143,,putative NADH peroxidase,5,5,1.0,1,301,,,,1347,1347,1347.0,EEABGJJD_01404,EEABGJJD_01404,EEABGJJD_01404,EEABGJJD_01404,EEABGJJD_01404 +gid,gid,putative glucose-inhibited division protein,5,5,1.0,1,300,,,,1347,1347,1347.0,EEABGJJD_00985,EEABGJJD_00985,EEABGJJD_00985,EEABGJJD_00985,EEABGJJD_00985 +srtK,srtK,putative histidine kinase - lantibiotic associated,5,5,1.0,1,299,,,,1347,1347,1347.0,EEABGJJD_00903,EEABGJJD_00903,EEABGJJD_00903,EEABGJJD_00903,EEABGJJD_00903 +asnS,asnS,putative asparaginyl-tRNA synthetase,5,5,1.0,1,298,,,,1347,1347,1347.0,EEABGJJD_00538,EEABGJJD_00538,EEABGJJD_00538,EEABGJJD_00538,EEABGJJD_00538 +group_142,,putative histidine kinase possibly involved in competence,5,5,1.0,1,297,,,,1347,1347,1347.0,EEABGJJD_00229,EEABGJJD_00229,EEABGJJD_00229,EEABGJJD_00229,EEABGJJD_00229 +pgi,pgi,glucose-6-phosphate isomerase,5,5,1.0,1,296,,,,1350,1350,1350.0,EEABGJJD_00211,EEABGJJD_00211,EEABGJJD_00211,EEABGJJD_00211,EEABGJJD_00211 +gor,gor,putative glutathione reductase (GR),5,5,1.0,1,295,,,,1353,1353,1353.0,EEABGJJD_00675,EEABGJJD_00675,EEABGJJD_00675,EEABGJJD_00675,EEABGJJD_00675 +group_141,,cell wall metabolism sensor histidine kinase 
VicK,5,5,1.0,1,294,,,,1353,1353,1353.0,EEABGJJD_00444,EEABGJJD_00444,EEABGJJD_00444,EEABGJJD_00444,EEABGJJD_00444 +group_140,,putative RNA methyltransferase,5,5,1.0,1,293,,,,1356,1356,1356.0,EEABGJJD_01338,EEABGJJD_01338,EEABGJJD_01338,EEABGJJD_01338,EEABGJJD_01338 +group_139,,putative two-component responsible histidine kinase,5,5,1.0,1,292,,,,1356,1356,1356.0,EEABGJJD_01337,EEABGJJD_01337,EEABGJJD_01337,EEABGJJD_01337,EEABGJJD_01337 +rlmD,rlmD,23S rRNA (uracil(1939)-C(5))-methyltransferase RlmD,5,5,1.0,1,291,,,,1356,1356,1356.0,EEABGJJD_01128,EEABGJJD_01128,EEABGJJD_01128,EEABGJJD_01128,EEABGJJD_01128 +group_138,,putative phospho-sugar mutase,5,5,1.0,1,290,,,,1356,1356,1356.0,EEABGJJD_00866,EEABGJJD_00866,EEABGJJD_00866,EEABGJJD_00866,EEABGJJD_00866 +dnaA,dnaA,Chromosomal initiator protein,5,5,1.0,1,289,,,,1356,1356,1356.0,EEABGJJD_00001,EEABGJJD_00001,EEABGJJD_00001,EEABGJJD_00001,EEABGJJD_00001 +murD,murD,putative UDP-N-acetylmuramoylalanine-D-glutamate ligase,5,5,1.0,1,288,,,,1359,1359,1359.0,EEABGJJD_01274,EEABGJJD_01274,EEABGJJD_01274,EEABGJJD_01274,EEABGJJD_01274 +group_137,,streptolysin associated protein SagD,5,5,1.0,1,287,,,,1359,1359,1359.0,EEABGJJD_00614,EEABGJJD_00614,EEABGJJD_00614,EEABGJJD_00614,EEABGJJD_00614 +holB,holB,DNA polymerase III delta prime subunit,5,5,1.0,1,286,,,,1362,1362,1362.0,EEABGJJD_01802,EEABGJJD_01802,EEABGJJD_01802,EEABGJJD_01802,EEABGJJD_01802 +malF,malF,putative maltose/maltodextrin ABC transport system (permease),5,5,1.0,1,285,,,,1362,1362,1362.0,EEABGJJD_01089,EEABGJJD_01089,EEABGJJD_01089,EEABGJJD_01089,EEABGJJD_01089 +group_136,,DNA repair protein RadA,5,5,1.0,1,284,,,,1362,1362,1362.0,EEABGJJD_00225,EEABGJJD_00225,EEABGJJD_00225,EEABGJJD_00225,EEABGJJD_00225 +accC,accC,acetyl-CoA carboxylase biotin carboxylase subunit,5,5,1.0,1,283,,,,1365,1365,1365.0,EEABGJJD_01459,EEABGJJD_01459,EEABGJJD_01459,EEABGJJD_01459,EEABGJJD_01459 +ftsA,ftsA,cell division 
protein,5,5,1.0,1,282,,,,1365,1365,1365.0,EEABGJJD_01271,EEABGJJD_01271,EEABGJJD_01271,EEABGJJD_01271,EEABGJJD_01271 +group_135,,putative drug resistance protein,5,5,1.0,1,281,,,,1368,1368,1368.0,EEABGJJD_00480,EEABGJJD_00480,EEABGJJD_00480,EEABGJJD_00480,EEABGJJD_00480 +nox,nox,NADH Oxidase,5,5,1.0,1,280,,,,1371,1371,1371.0,EEABGJJD_00966,EEABGJJD_00966,EEABGJJD_00966,EEABGJJD_00966,EEABGJJD_00966 +group_134,,branched-chain amino acid transport system II carrier protein,5,5,1.0,1,279,,,,1371,1371,1371.0,EEABGJJD_00296,EEABGJJD_00296,EEABGJJD_00296,EEABGJJD_00296,EEABGJJD_00296 +thdF,thdF,putative thiophene degradation protein F,5,5,1.0,1,278,,,,1377,1377,1377.0,EEABGJJD_00895,EEABGJJD_00895,EEABGJJD_00895,EEABGJJD_00895,EEABGJJD_00895 +aapA,aapA,putative amino acid permease,5,5,1.0,1,277,,,,1380,1380,1380.0,EEABGJJD_01385,EEABGJJD_01385,EEABGJJD_01385,EEABGJJD_01385,EEABGJJD_01385 +group_133,,putative transcarboxylase subunit,5,5,1.0,1,276,,,,1383,1383,1383.0,EEABGJJD_00986,EEABGJJD_00986,EEABGJJD_00986,EEABGJJD_00986,EEABGJJD_00986 +gcaD,gcaD,putative UDP-N-acetylglucosamine pyrophosphorylase,5,5,1.0,1,275,,,,1383,1383,1383.0,EEABGJJD_00378,EEABGJJD_00378,EEABGJJD_00378,EEABGJJD_00378,EEABGJJD_00378 +group_132,,putative histidine kinase,5,5,1.0,1,274,,,,1389,1389,1389.0,EEABGJJD_01677,EEABGJJD_01677,EEABGJJD_01677,EEABGJJD_01677,EEABGJJD_01677 +group_131,,Cof-type HAD-IIB family hydrolase,5,5,1.0,1,273,,,,1389,1389,1389.0,EEABGJJD_01486,EEABGJJD_01486,EEABGJJD_01486,EEABGJJD_01486,EEABGJJD_01486 +metB,metB,putative cystathionine beta-lyase,5,5,1.0,1,272,,,,1389,1389,1389.0,EEABGJJD_00177,EEABGJJD_00177,EEABGJJD_00177,EEABGJJD_00177,EEABGJJD_00177 +group_130,,UDP-N-acetylmuramoyl-tripeptide--D-alanyl-D- alanine ligase,5,5,1.0,1,271,,,,1392,1392,1392.0,EEABGJJD_01184,EEABGJJD_01184,EEABGJJD_01184,EEABGJJD_01184,EEABGJJD_01184 +ntpJ,ntpJ,putative V-type Na+ -ATPase subunit 
J,5,5,1.0,1,270,,,,1392,1392,1392.0,EEABGJJD_00299,EEABGJJD_00299,EEABGJJD_00299,EEABGJJD_00299,EEABGJJD_00299 +group_129,,putative ABC transporter (ATP-binding protein),5,5,1.0,1,269,,,,1395,1395,1395.0,EEABGJJD_01491,EEABGJJD_01491,EEABGJJD_01491,EEABGJJD_01491,EEABGJJD_01491 +oadA,oadA,putative oxaloacetate decarboxylase alpha chain,5,5,1.0,1,268,,,,1395,1395,1395.0,EEABGJJD_01001,EEABGJJD_01001,EEABGJJD_01001,EEABGJJD_01001,EEABGJJD_01001 +gabD,gabD,putative succinic semialdehyde dehydrogenase,5,5,1.0,1,267,,,,1398,1398,1398.0,EEABGJJD_00891,EEABGJJD_00891,EEABGJJD_00891,EEABGJJD_00891,EEABGJJD_00891 +group_128,,Cof-type HAD-IIB family hydrolase,5,5,1.0,1,266,,,,1401,1401,1401.0,EEABGJJD_01356,EEABGJJD_01356,EEABGJJD_01356,EEABGJJD_01356,EEABGJJD_01356 +bglA2,bglA.2,putative beta-glucosidase,5,5,1.0,1,265,,,,1401,1401,1401.0,EEABGJJD_01115,EEABGJJD_01115,EEABGJJD_01115,EEABGJJD_01115,EEABGJJD_01115 +group_127,,putative Mg2+/citrate complex transporter,5,5,1.0,1,264,,,,1404,1404,1404.0,EEABGJJD_00992,EEABGJJD_00992,EEABGJJD_00992,EEABGJJD_00992,EEABGJJD_00992 +lacG,lacG,putative phospho-beta-D-galactosidase,5,5,1.0,1,263,,,,1407,1407,1407.0,EEABGJJD_01599,EEABGJJD_01599,EEABGJJD_01599,EEABGJJD_01599,EEABGJJD_01599 +atpD,atpD,F0F1 ATP synthase subunit beta,5,5,1.0,1,262,,,,1407,1407,1407.0,EEABGJJD_00630,EEABGJJD_00630,EEABGJJD_00630,EEABGJJD_00630,EEABGJJD_00630 +atoE,atoE,putative short-chain fatty acids transporter,5,5,1.0,1,261,,,,1407,1407,1407.0,EEABGJJD_00147,EEABGJJD_00147,EEABGJJD_00147,EEABGJJD_00147,EEABGJJD_00147 +group_126,,deoxyribodipyrimidine photo-lyase,5,5,1.0,1,260,,,,1410,1410,1410.0,EEABGJJD_01257,EEABGJJD_01257,EEABGJJD_01257,EEABGJJD_01257,EEABGJJD_01257 +pepV,pepV,dipeptidase PepV,5,5,1.0,1,259,,,,1410,1410,1410.0,EEABGJJD_00894,EEABGJJD_00894,EEABGJJD_00894,EEABGJJD_00894,EEABGJJD_00894 +group_125,,dihydrolipoamide 
acetyltransferase,5,5,1.0,1,258,,,,1410,1410,1410.0,EEABGJJD_00858,EEABGJJD_00858,EEABGJJD_00858,EEABGJJD_00858,EEABGJJD_00858 +phiMGAS50052_31,phiMGAS5005.2_31,phage terminase,5,5,1.0,1,257,,,,1416,1416,1416.0,EEABGJJD_01217,EEABGJJD_01217,EEABGJJD_01217,EEABGJJD_01217,EEABGJJD_01217 +phiNCTC81981_1,phiNCTC8198.1_1,recombinase family protein,5,5,1.0,1,256,,,,1416,1416,1416.0,EEABGJJD_00542,EEABGJJD_00542,EEABGJJD_00542,EEABGJJD_00542,EEABGJJD_00542 +ntpB,ntpB,putative V-type Na+ -ATPase subunit B,5,5,1.0,1,255,,,,1416,1416,1416.0,EEABGJJD_00163,EEABGJJD_00163,EEABGJJD_00163,EEABGJJD_00163,EEABGJJD_00163 +pepD,pepD,putative dipeptidase,5,5,1.0,1,254,,,,1419,1419,1419.0,EEABGJJD_00591,EEABGJJD_00591,EEABGJJD_00591,EEABGJJD_00591,EEABGJJD_00591 +group_124,,Fe-S cluster assembly protein SufB,5,5,1.0,1,253,,,,1419,1419,1419.0,EEABGJJD_00269,EEABGJJD_00269,EEABGJJD_00269,EEABGJJD_00269,EEABGJJD_00269 +group_123,,glycoside hydrolase family 1 protein,5,5,1.0,1,252,,,,1425,1425,1425.0,EEABGJJD_00483,EEABGJJD_00483,EEABGJJD_00483,EEABGJJD_00483,EEABGJJD_00483 +gapN,gapN,putative NADP-dependent glyceraldehyde-3-phosphate dehydrogenase,5,5,1.0,1,251,,,,1428,1428,1428.0,EEABGJJD_01148,EEABGJJD_01148,EEABGJJD_01148,EEABGJJD_01148,EEABGJJD_01148 +group_122,,hypothetical protein,5,5,1.0,1,250,,,,1434,1434,1434.0,EEABGJJD_01009,EEABGJJD_01009,EEABGJJD_01009,EEABGJJD_01009,EEABGJJD_01009 +group_121,,PTS ascorbate transporter subunit IIC,5,5,1.0,1,249,,,,1434,1434,1434.0,EEABGJJD_00179,EEABGJJD_00179,EEABGJJD_00179,EEABGJJD_00179,EEABGJJD_00179 +scrB,scrB,putative sucrose-6-phosphate hydrolase,5,5,1.0,1,248,,,,1440,1440,1440.0,EEABGJJD_01510,EEABGJJD_01510,EEABGJJD_01510,EEABGJJD_01510,EEABGJJD_01510 +gatB,gatB,putative Glu-tRNAGln amidotransferase subunit B,5,5,1.0,1,247,,,,1440,1440,1440.0,EEABGJJD_01479,EEABGJJD_01479,EEABGJJD_01479,EEABGJJD_01479,EEABGJJD_01479 +group_120,,putative 
beta-glucosidase,5,5,1.0,1,246,,,,1443,1443,1443.0,EEABGJJD_01332,EEABGJJD_01332,EEABGJJD_01332,EEABGJJD_01332,EEABGJJD_01332 +group_119,,putative ABC transporter substrate binding lipoprotein,5,5,1.0,1,245,,,,1446,1446,1446.0,EEABGJJD_01328,EEABGJJD_01328,EEABGJJD_01328,EEABGJJD_01328,EEABGJJD_01328 +murE,murE,putative UDP-N-acetylmuramoylalanyl-D-glutamyl-26-diaminopimelate ligase,5,5,1.0,1,244,,,,1446,1446,1446.0,EEABGJJD_00346,EEABGJJD_00346,EEABGJJD_00346,EEABGJJD_00346,EEABGJJD_00346 +group_118,,glutamate--tRNA ligase,5,5,1.0,1,243,,,,1446,1446,1446.0,EEABGJJD_00228,EEABGJJD_00228,EEABGJJD_00228,EEABGJJD_00228,EEABGJJD_00228 +group_117,,PTS glucose transporter subunit IIC,5,5,1.0,1,242,,,,1452,1452,1452.0,EEABGJJD_01426,EEABGJJD_01426,EEABGJJD_01426,EEABGJJD_01426,EEABGJJD_01426 +psr,psr,putative PBP 5 synthesis repressor,5,5,1.0,1,241,,,,1452,1452,1452.0,EEABGJJD_01129,EEABGJJD_01129,EEABGJJD_01129,EEABGJJD_01129,EEABGJJD_01129 +emm1,emm1,M protein type 1,5,5,1.0,1,240,,,,1455,1455,1455.0,EEABGJJD_01673,EEABGJJD_01673,EEABGJJD_01673,EEABGJJD_01673,EEABGJJD_01673 +group_116,,nicotinate phosphoribosyltransferase,5,5,1.0,1,239,,,,1455,1455,1455.0,EEABGJJD_01384,EEABGJJD_01384,EEABGJJD_01384,EEABGJJD_01384,EEABGJJD_01384 +amiC,amiC,putative amidase,5,5,1.0,1,238,,,,1455,1455,1455.0,EEABGJJD_00747,EEABGJJD_00747,EEABGJJD_00747,EEABGJJD_00747,EEABGJJD_00747 +purF,purF,putative phosphoribosylpyrophosphate amidotransferase,5,5,1.0,1,237,,,,1455,1455,1455.0,EEABGJJD_00045,EEABGJJD_00045,EEABGJJD_00045,EEABGJJD_00045,EEABGJJD_00045 +group_115,,NCS2 family permease,5,5,1.0,1,236,,,,1461,1461,1461.0,EEABGJJD_01450,EEABGJJD_01450,EEABGJJD_01450,EEABGJJD_01450,EEABGJJD_01450 +group_114,,amino acid permease,5,5,1.0,1,235,,,,1461,1461,1461.0,EEABGJJD_00014,EEABGJJD_00014,EEABGJJD_00014,EEABGJJD_00014,EEABGJJD_00014 +gatA,gatA,putative Glutamyl-tRNA Gln amidotransferase subunit 
A,5,5,1.0,1,234,,,,1467,1467,1467.0,EEABGJJD_01480,EEABGJJD_01480,EEABGJJD_01480,EEABGJJD_01480,EEABGJJD_01480 +group_113,,membrane protein,5,5,1.0,1,233,,,,1470,1470,1470.0,EEABGJJD_01375,EEABGJJD_01375,EEABGJJD_01375,EEABGJJD_01375,EEABGJJD_01375 +guaB,guaB,inosine monophosphate dehydrogenase,5,5,1.0,1,232,,,,1482,1482,1482.0,EEABGJJD_01824,EEABGJJD_01824,EEABGJJD_01824,EEABGJJD_01824,EEABGJJD_01824 +group_112,,putative two-component sensor response regulator,5,5,1.0,1,231,,,,1485,1485,1485.0,EEABGJJD_01325,EEABGJJD_01325,EEABGJJD_01325,EEABGJJD_01325,EEABGJJD_01325 +group_111,,hypothetical protein,5,5,1.0,1,230,,,,1485,1485,1485.0,EEABGJJD_01125,EEABGJJD_01125,EEABGJJD_01125,EEABGJJD_01125,EEABGJJD_01125 +group_110,,YfcC family protein,5,5,1.0,1,229,,,,1494,1494,1494.0,EEABGJJD_01291,EEABGJJD_01291,EEABGJJD_01291,EEABGJJD_01291,EEABGJJD_01291 +phiMGAS50052_35,phiMGAS5005.2_35,phage protein,5,5,1.0,1,228,,,,1494,1494,1494.0,EEABGJJD_01221,EEABGJJD_01221,EEABGJJD_01221,EEABGJJD_01221,EEABGJJD_01221 +malM,malM,4-alpha-glucanotransferase,5,5,1.0,1,227,,,,1494,1494,1494.0,EEABGJJD_01086,EEABGJJD_01086,EEABGJJD_01086,EEABGJJD_01086,EEABGJJD_01086 +phiNCTC81984_3,phiNCTC8198.4_3,phage capsid protein,5,5,1.0,1,226,,,,1494,1494,1494.0,EEABGJJD_00567,EEABGJJD_00567,EEABGJJD_00567,EEABGJJD_00567,EEABGJJD_00567 +lysS,lysS,lysine--tRNA ligase,5,5,1.0,1,225,,,,1494,1494,1494.0,EEABGJJD_00497,EEABGJJD_00497,EEABGJJD_00497,EEABGJJD_00497,EEABGJJD_00497 +rofA,rofA,regulatory protein,5,5,1.0,1,224,,,,1494,1494,1494.0,EEABGJJD_00136,EEABGJJD_00136,EEABGJJD_00136,EEABGJJD_00136,EEABGJJD_00136 +group_109,,dipeptidase,5,5,1.0,1,223,,,,1497,1497,1497.0,EEABGJJD_01708,EEABGJJD_01708,EEABGJJD_01708,EEABGJJD_01708,EEABGJJD_01708 +group_108,,hypothetical protein,5,5,1.0,1,222,,,,1497,1497,1497.0,EEABGJJD_00662,EEABGJJD_00662,EEABGJJD_00662,EEABGJJD_00662,EEABGJJD_00662 +group_107,,putative transcriptional regulatory 
protein,5,5,1.0,1,221,,,,1500,1500,1500.0,EEABGJJD_01403,EEABGJJD_01403,EEABGJJD_01403,EEABGJJD_01403,EEABGJJD_01403 +pyk,pyk,putative pyruvate kinase,5,5,1.0,1,220,,,,1503,1503,1503.0,EEABGJJD_01076,EEABGJJD_01076,EEABGJJD_01076,EEABGJJD_01076,EEABGJJD_01076 +phiNCTC81984_2,phiNCTC8198.4_2,phage portal protein,5,5,1.0,1,219,,,,1503,1503,1503.0,EEABGJJD_00566,EEABGJJD_00566,EEABGJJD_00566,EEABGJJD_00566,EEABGJJD_00566 +group_106,,hypothetical protein,5,5,1.0,1,218,,,,1503,1503,1503.0,EEABGJJD_00420,EEABGJJD_00420,EEABGJJD_00420,EEABGJJD_00420,EEABGJJD_00420 +csrR,csrR,putative sensory transduction histidine kinase,5,5,1.0,1,217,,,,1503,1503,1503.0,EEABGJJD_00305,EEABGJJD_00305,EEABGJJD_00305,EEABGJJD_00305,EEABGJJD_00305 +oppB,oppB,oligopeptidepermease,5,5,1.0,1,216,,,,1503,1503,1503.0,EEABGJJD_00273,EEABGJJD_00273,EEABGJJD_00273,EEABGJJD_00273,EEABGJJD_00273 +atpA,atpA,putative proton-translocating ATPase alpha subunit,5,5,1.0,1,215,,,,1509,1509,1509.0,EEABGJJD_00628,EEABGJJD_00628,EEABGJJD_00628,EEABGJJD_00628,EEABGJJD_00628 +group_105,,putative regulatory protein - RofA related,5,5,1.0,1,214,,,,1509,1509,1509.0,EEABGJJD_00212,EEABGJJD_00212,EEABGJJD_00212,EEABGJJD_00212,EEABGJJD_00212 +isp2,isp2,immunogenic secreted precursor-like protein,5,5,1.0,1,213,,,,1512,1512,1512.0,EEABGJJD_01501,EEABGJJD_01501,EEABGJJD_01501,EEABGJJD_01501,EEABGJJD_01501 +glpK,glpK,putative glycerol kinase,5,5,1.0,1,212,,,,1527,1527,1527.0,EEABGJJD_01407,EEABGJJD_01407,EEABGJJD_01407,EEABGJJD_01407,EEABGJJD_01407 +group_104,,putative ABC transporter (binding protein),5,5,1.0,1,211,,,,1527,1527,1527.0,EEABGJJD_00950,EEABGJJD_00950,EEABGJJD_00950,EEABGJJD_00950,EEABGJJD_00950 +nox1,nox1,putative NADH oxidase/alkyl hydroperoxidase reductase,5,5,1.0,1,210,,,,1533,1533,1533.0,EEABGJJD_01717,EEABGJJD_01717,EEABGJJD_01717,EEABGJJD_01717,EEABGJJD_01717 +group_103,,putative sugar ABC transporter (ATP-binding 
protein),5,5,1.0,1,209,,,,1533,1533,1533.0,EEABGJJD_01032,EEABGJJD_01032,EEABGJJD_01032,EEABGJJD_01032,EEABGJJD_01032 +citF,citF,putative citrate lyase alpha subunit,5,5,1.0,1,208,,,,1533,1533,1533.0,EEABGJJD_00999,EEABGJJD_00999,EEABGJJD_00999,EEABGJJD_00999,EEABGJJD_00999 +group_102,,ClC family H(+)/Cl(-) exchange transporter,5,5,1.0,1,207,,,,1533,1533,1533.0,EEABGJJD_00932,EEABGJJD_00932,EEABGJJD_00932,EEABGJJD_00932,EEABGJJD_00932 +dltA,dltA,putative D-alanine-D-alanyl carrier protein ligase,5,5,1.0,1,206,,,,1539,1539,1539.0,EEABGJJD_01102,EEABGJJD_01102,EEABGJJD_01102,EEABGJJD_01102,EEABGJJD_01102 +hutH,hutH,putative histidine ammonia-lyase,5,5,1.0,1,205,,,,1542,1542,1542.0,EEABGJJD_01725,EEABGJJD_01725,EEABGJJD_01725,EEABGJJD_01725,EEABGJJD_01725 +group_101,,putative two-component sensor histidine kinase,5,5,1.0,1,204,,,,1542,1542,1542.0,EEABGJJD_00927,EEABGJJD_00927,EEABGJJD_00927,EEABGJJD_00927,EEABGJJD_00927 +prfC,prfC,putative peptide-chain-release factor 3,5,5,1.0,1,203,,,,1545,1545,1545.0,EEABGJJD_01182,EEABGJJD_01182,EEABGJJD_01182,EEABGJJD_01182,EEABGJJD_01182 +group_100,,ABC transporter ATP-binding protein,5,5,1.0,1,202,,,,1545,1545,1545.0,EEABGJJD_01012,EEABGJJD_01012,EEABGJJD_01012,EEABGJJD_01012,EEABGJJD_01012 +adcA,adcA,putative adhesion protein,5,5,1.0,1,201,,,,1548,1548,1548.0,EEABGJJD_00592,EEABGJJD_00592,EEABGJJD_00592,EEABGJJD_00592,EEABGJJD_00592 +group_99,,bifunctional phosphoribosylaminoimidazolecarboxamide formyltransferase/inosine monophosphate cyclohydrolase,5,5,1.0,1,200,,,,1548,1548,1548.0,EEABGJJD_00048,EEABGJJD_00048,EEABGJJD_00048,EEABGJJD_00048,EEABGJJD_00048 +ftsY,ftsY,putative signal recognition particle (docking protein),5,5,1.0,1,199,,,,1551,1551,1551.0,EEABGJJD_00479,EEABGJJD_00479,EEABGJJD_00479,EEABGJJD_00479,EEABGJJD_00479 +group_98,,membrane protein,5,5,1.0,1,198,,,,1560,1560,1560.0,EEABGJJD_01742,EEABGJJD_01742,EEABGJJD_01742,EEABGJJD_01742,EEABGJJD_01742 +guaA,guaA,glutamine-hydrolyzing GMP 
synthase,5,5,1.0,1,197,,,,1563,1563,1563.0,EEABGJJD_01010,EEABGJJD_01010,EEABGJJD_01010,EEABGJJD_01010,EEABGJJD_01010 +group_97,,signal recognition particle protein,5,5,1.0,1,196,,,,1563,1563,1563.0,EEABGJJD_01006,EEABGJJD_01006,EEABGJJD_01006,EEABGJJD_01006,EEABGJJD_01006 +group_96,,putative aminodeoxychorismate lyase,5,5,1.0,1,195,,,,1569,1569,1569.0,EEABGJJD_00314,EEABGJJD_00314,EEABGJJD_00314,EEABGJJD_00314,EEABGJJD_00314 +group_95,,putative glutamine-binding periplasmic protein,5,5,1.0,1,194,,,,1569,1569,1569.0,EEABGJJD_00260,EEABGJJD_00260,EEABGJJD_00260,EEABGJJD_00260,EEABGJJD_00260 +group_94,,putative cardiolipin synthetase,5,5,1.0,1,193,,,,1578,1578,1578.0,EEABGJJD_01017,EEABGJJD_01017,EEABGJJD_01017,EEABGJJD_01017,EEABGJJD_01017 +group_93,,type I restriction-modification system subunit M,5,5,1.0,1,192,,,,1581,1581,1581.0,EEABGJJD_01589,EEABGJJD_01589,EEABGJJD_01589,EEABGJJD_01589,EEABGJJD_01589 +phiSF3701_3,phiSF370.1_3,putative DEAD box family helicase phage associated,5,5,1.0,1,191,,,,1584,1584,1584.0,EEABGJJD_00554,EEABGJJD_00554,EEABGJJD_00554,EEABGJJD_00554,EEABGJJD_00554 +group_92,,M protein trans-acting positive regulator,5,5,1.0,1,190,,,,1590,1590,1590.0,EEABGJJD_01674,EEABGJJD_01674,EEABGJJD_01674,EEABGJJD_01674,EEABGJJD_01674 +pyrG,pyrG,putative CTP synthetase,5,5,1.0,1,189,,,,1605,1605,1605.0,EEABGJJD_01573,EEABGJJD_01573,EEABGJJD_01573,EEABGJJD_01573,EEABGJJD_01573 +group_91,,ribonuclease Y,5,5,1.0,1,188,,,,1608,1608,1608.0,EEABGJJD_01367,EEABGJJD_01367,EEABGJJD_01367,EEABGJJD_01367,EEABGJJD_01367 +group_90,,alpha-glucosidase,5,5,1.0,1,187,,,,1614,1614,1614.0,EEABGJJD_01640,EEABGJJD_01640,EEABGJJD_01640,EEABGJJD_01640,EEABGJJD_01640 +group_89,,ABC-F family ATPase,5,5,1.0,1,186,,,,1620,1620,1620.0,EEABGJJD_01827,EEABGJJD_01827,EEABGJJD_01827,EEABGJJD_01827,EEABGJJD_01827 +deaD,deaD,putative ATP-dependent RNA helicase,5,5,1.0,1,185,,,,1620,1620,1620.0,EEABGJJD_01181,EEABGJJD_01181,EEABGJJD_01181,EEABGJJD_01181,EEABGJJD_01181 +group_88,,CHAP 
domain-containing protein,5,5,1.0,1,184,,,,1626,1626,1626.0,EEABGJJD_01676,EEABGJJD_01676,EEABGJJD_01676,EEABGJJD_01676,EEABGJJD_01676 +salB,salB,putative salivaricin A modification enzyme; amino acid dehydration,5,5,1.0,1,183,,,,1626,1626,1626.0,EEABGJJD_01597,EEABGJJD_01597,EEABGJJD_01597,EEABGJJD_01597,EEABGJJD_01597 +group_87,,phage head morphogenesis protein,5,5,1.0,1,182,,,,1626,1626,1626.0,EEABGJJD_00815,EEABGJJD_00815,EEABGJJD_00815,EEABGJJD_00815,EEABGJJD_00815 +dexS,dexS,putative dextran glucosidase,5,5,1.0,1,181,,,,1629,1629,1629.0,EEABGJJD_01731,EEABGJJD_01731,EEABGJJD_01731,EEABGJJD_01731,EEABGJJD_01731 +dppA,dppA,surface lipoprotein,5,5,1.0,1,180,,,,1629,1629,1629.0,EEABGJJD_01661,EEABGJJD_01661,EEABGJJD_01661,EEABGJJD_01661,EEABGJJD_01661 +group_86,,ABC transporter permease,5,5,1.0,1,179,,,,1629,1629,1629.0,EEABGJJD_01398,EEABGJJD_01398,EEABGJJD_01398,EEABGJJD_01398,EEABGJJD_01398 +groEL,groEL,heat shock protein (chaperonin),5,5,1.0,1,178,,,,1632,1632,1632.0,EEABGJJD_01710,EEABGJJD_01710,EEABGJJD_01710,EEABGJJD_01710,EEABGJJD_01710 +group_85,,Na/Pi cotransporter family protein,5,5,1.0,1,177,,,,1632,1632,1632.0,EEABGJJD_01415,EEABGJJD_01415,EEABGJJD_01415,EEABGJJD_01415,EEABGJJD_01415 +group_84,,polysaccharide biosynthesis protein,5,5,1.0,1,176,,,,1635,1635,1635.0,EEABGJJD_00347,EEABGJJD_00347,EEABGJJD_00347,EEABGJJD_00347,EEABGJJD_00347 +group_83,,hypothetical protein,5,5,1.0,1,175,,,,1641,1641,1641.0,EEABGJJD_01707,EEABGJJD_01707,EEABGJJD_01707,EEABGJJD_01707,EEABGJJD_01707 +group_82,,AIPR family protein,5,5,1.0,1,174,,,,1644,1644,1644.0,EEABGJJD_00053,EEABGJJD_00053,EEABGJJD_00053,EEABGJJD_00053,EEABGJJD_00053 +group_81,,putative transcriptional regulator,5,5,1.0,1,173,,,,1647,1647,1647.0,EEABGJJD_00185,EEABGJJD_00185,EEABGJJD_00185,EEABGJJD_00185,EEABGJJD_00185 +group_80,,putative two-component sensor histidine kinase,5,5,1.0,1,172,,,,1650,1650,1650.0,EEABGJJD_00885,EEABGJJD_00885,EEABGJJD_00885,EEABGJJD_00885,EEABGJJD_00885 +fbp,fbp,putative 
fibronectin-binding protein-like protein A,5,5,1.0,1,171,,,,1653,1653,1653.0,EEABGJJD_00847,EEABGJJD_00847,EEABGJJD_00847,EEABGJJD_00847,EEABGJJD_00847 +recN,recN,putative DNA repair and genetic recombination protein,5,5,1.0,1,170,,,,1662,1662,1662.0,EEABGJJD_01249,EEABGJJD_01249,EEABGJJD_01249,EEABGJJD_01249,EEABGJJD_01249 +group_79,,ribonuclease J 2,5,5,1.0,1,169,,,,1662,1662,1662.0,EEABGJJD_00852,EEABGJJD_00852,EEABGJJD_00852,EEABGJJD_00852,EEABGJJD_00852 +group_78,,DAK2 domain-containing protein,5,5,1.0,1,168,,,,1665,1665,1665.0,EEABGJJD_01567,EEABGJJD_01567,EEABGJJD_01567,EEABGJJD_01567,EEABGJJD_01567 +group_77,,putative ABC transporter (ATP-binding protein),5,5,1.0,1,167,,,,1671,1671,1671.0,EEABGJJD_01494,EEABGJJD_01494,EEABGJJD_01494,EEABGJJD_01494,EEABGJJD_01494 +dnaX,dnaX,DNA polymerase III subunits gamma / tau,5,5,1.0,1,166,,,,1671,1671,1671.0,EEABGJJD_01141,EEABGJJD_01141,EEABGJJD_01141,EEABGJJD_01141,EEABGJJD_01141 +fhs1,fhs.1,putative formate-tetrahydrofolate ligase,5,5,1.0,1,165,,,,1671,1671,1671.0,EEABGJJD_01018,EEABGJJD_01018,EEABGJJD_01018,EEABGJJD_01018,EEABGJJD_01018 +fhs2,fhs.2,putative formate-tetrahydrofolate ligase,5,5,1.0,1,164,,,,1674,1674,1674.0,EEABGJJD_01722,EEABGJJD_01722,EEABGJJD_01722,EEABGJJD_01722,EEABGJJD_01722 +group_76,,ribonuclease J,5,5,1.0,1,163,,,,1683,1683,1683.0,EEABGJJD_01560,EEABGJJD_01560,EEABGJJD_01560,EEABGJJD_01560,EEABGJJD_01560 +phiSF3704_3,phiSF370.4_3,putative DNA primase - phage associated,5,5,1.0,1,162,,,,1689,1689,1689.0,EEABGJJD_01766,EEABGJJD_01766,EEABGJJD_01766,EEABGJJD_01766,EEABGJJD_01766 +group_75,,arginine--tRNA ligase,5,5,1.0,1,161,,,,1692,1692,1692.0,EEABGJJD_01777,EEABGJJD_01777,EEABGJJD_01777,EEABGJJD_01777,EEABGJJD_01777 +group_74,,putative hyaluronidase,5,5,1.0,1,160,,,,1695,1695,1695.0,EEABGJJD_01333,EEABGJJD_01333,EEABGJJD_01333,EEABGJJD_01333,EEABGJJD_01333 +group_73,,putative 
phosphomannomutase,5,5,1.0,1,159,,,,1695,1695,1695.0,EEABGJJD_01256,EEABGJJD_01256,EEABGJJD_01256,EEABGJJD_01256,EEABGJJD_01256 +group_72,,PTS lactose transporter subunit IIBC,5,5,1.0,1,158,,,,1698,1698,1698.0,EEABGJJD_01600,EEABGJJD_01600,EEABGJJD_01600,EEABGJJD_01600,EEABGJJD_01600 +amyB,amyB,putative cyclomaltodextrinase,5,5,1.0,1,157,,,,1704,1704,1704.0,EEABGJJD_01096,EEABGJJD_01096,EEABGJJD_01096,EEABGJJD_01096,EEABGJJD_01096 +group_71,,ABC transporter ATP-binding protein,5,5,1.0,1,156,,,,1707,1707,1707.0,EEABGJJD_00221,EEABGJJD_00221,EEABGJJD_00221,EEABGJJD_00221,EEABGJJD_00221 +group_70,,streptolysin O,5,5,1.0,1,155,,,,1716,1716,1716.0,EEABGJJD_00172,EEABGJJD_00172,EEABGJJD_00172,EEABGJJD_00172,EEABGJJD_00172 +pgmA,pgmA,putative phosphoglucomutase,5,5,1.0,1,154,,,,1719,1719,1719.0,EEABGJJD_01029,EEABGJJD_01029,EEABGJJD_01029,EEABGJJD_01029,EEABGJJD_01029 +group_69,,sensor histidine kinase,5,5,1.0,1,153,,,,1725,1725,1725.0,EEABGJJD_01326,EEABGJJD_01326,EEABGJJD_01326,EEABGJJD_01326,EEABGJJD_01326 +group_68,,ABC transporter ATP-binding protein,5,5,1.0,1,152,,,,1725,1725,1725.0,EEABGJJD_00964,EEABGJJD_00964,EEABGJJD_00964,EEABGJJD_00964,EEABGJJD_00964 +ezrA,ezrA,septation ring formation regulator EzrA,5,5,1.0,1,151,,,,1725,1725,1725.0,EEABGJJD_00604,EEABGJJD_00604,EEABGJJD_00604,EEABGJJD_00604,EEABGJJD_00604 +group_67,,putative ABC transporter (ATP-binding protein),5,5,1.0,1,150,,,,1728,1728,1728.0,EEABGJJD_00965,EEABGJJD_00965,EEABGJJD_00965,EEABGJJD_00965,EEABGJJD_00965 +group_66,,glycine/betaine ABC transporter permease,5,5,1.0,1,149,,,,1728,1728,1728.0,EEABGJJD_00188,EEABGJJD_00188,EEABGJJD_00188,EEABGJJD_00188,EEABGJJD_00188 +group_65,,putative two-component sensor kinase,5,5,1.0,1,148,,,,1734,1734,1734.0,EEABGJJD_01299,EEABGJJD_01299,EEABGJJD_01299,EEABGJJD_01299,EEABGJJD_01299 +pstI,pstI,putative phosphoenolpyruvate:sugar phosphotransferase system enzyme 
I,5,5,1.0,1,147,,,,1734,1734,1734.0,EEABGJJD_01149,EEABGJJD_01149,EEABGJJD_01149,EEABGJJD_01149,EEABGJJD_01149 +group_64,,putative glycerophosphodiester phosphodiesterase,5,5,1.0,1,146,,,,1734,1734,1734.0,EEABGJJD_00695,EEABGJJD_00695,EEABGJJD_00695,EEABGJJD_00695,EEABGJJD_00695 +group_63,,hypothetical protein,5,5,1.0,1,145,,,,1746,1746,1746.0,EEABGJJD_00657,EEABGJJD_00657,EEABGJJD_00657,EEABGJJD_00657,EEABGJJD_00657 +aspS,aspS,putative aspartyl-tRNA synthetase,5,5,1.0,1,144,,,,1749,1749,1749.0,EEABGJJD_01782,EEABGJJD_01782,EEABGJJD_01782,EEABGJJD_01782,EEABGJJD_01782 +pabB,pabB,putative para-aminobenzoate synthetase,5,5,1.0,1,143,,,,1758,1758,1758.0,EEABGJJD_01653,EEABGJJD_01653,EEABGJJD_01653,EEABGJJD_01653,EEABGJJD_01653 +group_62,,putative ABC transporter (ATP-binding protein),5,5,1.0,1,142,,,,1764,1764,1764.0,EEABGJJD_01495,EEABGJJD_01495,EEABGJJD_01495,EEABGJJD_01495,EEABGJJD_01495 +lpdA,lpdA,dihydrolipoyl dehydrogenase,5,5,1.0,1,141,,,,1764,1764,1764.0,EEABGJJD_00859,EEABGJJD_00859,EEABGJJD_00859,EEABGJJD_00859,EEABGJJD_00859 +group_61,,67 kDa Myosin-crossreactive streptococcal antigen,5,5,1.0,1,140,,,,1773,1773,1773.0,EEABGJJD_00398,EEABGJJD_00398,EEABGJJD_00398,EEABGJJD_00398,EEABGJJD_00398 +ntpA,ntpA,putative V-type Na+ -ATPase alpha subunit,5,5,1.0,1,139,,,,1776,1776,1776.0,EEABGJJD_00162,EEABGJJD_00162,EEABGJJD_00162,EEABGJJD_00162,EEABGJJD_00162 +group_60,,putative acyltransferase,5,5,1.0,1,138,,,,1776,1776,1776.0,EEABGJJD_00060,EEABGJJD_00060,EEABGJJD_00060,EEABGJJD_00060,EEABGJJD_00060 +phiMGAS50052_18,phiMGAS5005.2_18,phage infection protein,5,5,1.0,1,137,,,,1785,1785,1785.0,EEABGJJD_01202,EEABGJJD_01202,EEABGJJD_01202,EEABGJJD_01202,EEABGJJD_01202 +group_59,,ABC transporter ATP-binding protein,5,5,1.0,1,136,,,,1785,1785,1785.0,EEABGJJD_00222,EEABGJJD_00222,EEABGJJD_00222,EEABGJJD_00222,EEABGJJD_00222 +uvrC,uvrC,excinuclease ABC (subunit C),5,5,1.0,1,135,,,,1797,1797,1797.0,EEABGJJD_00892,EEABGJJD_00892,EEABGJJD_00892,EEABGJJD_00892,EEABGJJD_00892 
+pepF,pepF,putative oligopeptidase,5,5,1.0,1,134,,,,1800,1800,1800.0,EEABGJJD_00506,EEABGJJD_00506,EEABGJJD_00506,EEABGJJD_00506,EEABGJJD_00506 +pepB,pepB,putative oligopeptidase,5,5,1.0,1,133,,,,1806,1806,1806.0,EEABGJJD_01164,EEABGJJD_01164,EEABGJJD_01164,EEABGJJD_01164,EEABGJJD_01164 +glmS,glmS,putative L-glutamine-D-fructose-6-phosphate amidotransferase,5,5,1.0,1,132,,,,1815,1815,1815.0,EEABGJJD_01074,EEABGJJD_01074,EEABGJJD_01074,EEABGJJD_01074,EEABGJJD_01074 +dnaG,dnaG,putative DNA primase,5,5,1.0,1,131,,,,1815,1815,1815.0,EEABGJJD_00648,EEABGJJD_00648,EEABGJJD_00648,EEABGJJD_00648,EEABGJJD_00648 +dnaK,dnaK,heat shock protein 70,5,5,1.0,1,130,,,,1827,1827,1827.0,EEABGJJD_01471,EEABGJJD_01471,EEABGJJD_01471,EEABGJJD_01471,EEABGJJD_01471 +lepA,lepA,elongation factor 4,5,5,1.0,1,129,,,,1833,1833,1833.0,EEABGJJD_00877,EEABGJJD_00877,EEABGJJD_00877,EEABGJJD_00877,EEABGJJD_00877 +glpO,glpO,putative alpha-glycerophosphate oxidase,5,5,1.0,1,128,,,,1839,1839,1839.0,EEABGJJD_01406,EEABGJJD_01406,EEABGJJD_01406,EEABGJJD_01406,EEABGJJD_01406 +typA,typA,translational GTPase TypA,5,5,1.0,1,127,,,,1842,1842,1842.0,EEABGJJD_01276,EEABGJJD_01276,EEABGJJD_01276,EEABGJJD_01276,EEABGJJD_01276 +proS,proS,putative prolyl-tRNA synthetase,5,5,1.0,1,126,,,,1857,1857,1857.0,EEABGJJD_01633,EEABGJJD_01633,EEABGJJD_01633,EEABGJJD_01633,EEABGJJD_01633 +group_58,,putative heavy metal-transporting ATPase,5,5,1.0,1,125,,,,1863,1863,1863.0,EEABGJJD_01192,EEABGJJD_01192,EEABGJJD_01192,EEABGJJD_01192,EEABGJJD_01192 +group_57,,beta-glucoside permease IIABC component,5,5,1.0,1,124,,,,1863,1863,1863.0,EEABGJJD_00482,EEABGJJD_00482,EEABGJJD_00482,EEABGJJD_00482,EEABGJJD_00482 +group_56,,ABC transporter ATP-binding protein,5,5,1.0,1,123,,,,1878,1878,1878.0,EEABGJJD_00718,EEABGJJD_00718,EEABGJJD_00718,EEABGJJD_00718,EEABGJJD_00718 +group_55,,PTS beta-glucoside transporter subunit IIBCA,5,5,1.0,1,122,,,,1884,1884,1884.0,EEABGJJD_01509,EEABGJJD_01509,EEABGJJD_01509,EEABGJJD_01509,EEABGJJD_01509 
+group_54,,phage hyaluronidase,5,5,1.0,1,121,,,,1887,1887,1887.0,EEABGJJD_00583,EEABGJJD_00583,EEABGJJD_00583,EEABGJJD_00583,EEABGJJD_00583 +pepO,pepO,putative endopeptidase O,5,5,1.0,1,120,,,,1896,1896,1896.0,EEABGJJD_01730,EEABGJJD_01730,EEABGJJD_01730,EEABGJJD_01730,EEABGJJD_01730 +group_53,,type I-C CRISPR-associated protein Cas8c/Csd1,5,5,1.0,1,119,,,,1896,1896,1896.0,EEABGJJD_01308,EEABGJJD_01308,EEABGJJD_01308,EEABGJJD_01308,EEABGJJD_01308 +gidA,gidA,glucose inhibited division protein,5,5,1.0,1,118,,,,1899,1899,1899.0,EEABGJJD_01805,EEABGJJD_01805,EEABGJJD_01805,EEABGJJD_01805,EEABGJJD_01805 +group_52,,serine/threonine protein kinase,5,5,1.0,1,117,,,,1899,1899,1899.0,EEABGJJD_01360,EEABGJJD_01360,EEABGJJD_01360,EEABGJJD_01360,EEABGJJD_01360 +group_51,,hypothetical protein,5,5,1.0,1,116,,,,1899,1899,1899.0,EEABGJJD_00261,EEABGJJD_00261,EEABGJJD_00261,EEABGJJD_00261,EEABGJJD_00261 +group_50,,ABC transporter permease,5,5,1.0,1,115,,,,1908,1908,1908.0,EEABGJJD_01594,EEABGJJD_01594,EEABGJJD_01594,EEABGJJD_01594,EEABGJJD_01594 +group_49,,ABC transporter ATP-binding protein,5,5,1.0,1,114,,,,1908,1908,1908.0,EEABGJJD_00855,EEABGJJD_00855,EEABGJJD_00855,EEABGJJD_00855,EEABGJJD_00855 +group_48,,oligohyaluronate lyase,5,5,1.0,1,113,,,,1908,1908,1908.0,EEABGJJD_00518,EEABGJJD_00518,EEABGJJD_00518,EEABGJJD_00518,EEABGJJD_00518 +group_47,,hyaluronidase,5,5,1.0,1,112,,,,1911,1911,1911.0,EEABGJJD_00835,EEABGJJD_00835,EEABGJJD_00835,EEABGJJD_00835,EEABGJJD_00835 +thrS,thrS,putative threonyl-tRNA synthetase 1,5,5,1.0,1,111,,,,1944,1944,1944.0,EEABGJJD_00436,EEABGJJD_00436,EEABGJJD_00436,EEABGJJD_00436,EEABGJJD_00436 +group_46,,putative fructose-specific enzyme II PTS system BC component,5,5,1.0,1,110,,,,1947,1947,1947.0,EEABGJJD_00709,EEABGJJD_00709,EEABGJJD_00709,EEABGJJD_00709,EEABGJJD_00709 +parE,parE,putative DNA topoisomerase IV (subunit B),5,5,1.0,1,109,,,,1950,1950,1950.0,EEABGJJD_00753,EEABGJJD_00753,EEABGJJD_00753,EEABGJJD_00753,EEABGJJD_00753 +gyrB,gyrB,DNA 
topoisomerase (ATP-hydrolyzing) subunit B,5,5,1.0,1,108,,,,1953,1953,1953.0,EEABGJJD_00603,EEABGJJD_00603,EEABGJJD_00603,EEABGJJD_00603,EEABGJJD_00603 +group_45,,peptidase,5,5,1.0,1,107,,,,1959,1959,1959.0,EEABGJJD_01204,EEABGJJD_01204,EEABGJJD_01204,EEABGJJD_01204,EEABGJJD_01204 +lig,lig,putative DNA ligase,5,5,1.0,1,106,,,,1959,1959,1959.0,EEABGJJD_00622,EEABGJJD_00622,EEABGJJD_00622,EEABGJJD_00622,EEABGJJD_00622 +group_44,,peptide ABC transporter substrate-binding protein,5,5,1.0,1,105,,,,1971,1971,1971.0,EEABGJJD_00272,EEABGJJD_00272,EEABGJJD_00272,EEABGJJD_00272,EEABGJJD_00272 +group_43,,DHH family phosphoesterase,5,5,1.0,1,104,,,,1977,1977,1977.0,EEABGJJD_01804,EEABGJJD_01804,EEABGJJD_01804,EEABGJJD_01804,EEABGJJD_01804 +ftsH,ftsH,putative cell division protein,5,5,1.0,1,103,,,,1980,1980,1980.0,EEABGJJD_00013,EEABGJJD_00013,EEABGJJD_00013,EEABGJJD_00013,EEABGJJD_00013 +mutL,mutL,putative DNA mismatch repair protein,5,5,1.0,1,102,,,,1983,1983,1983.0,EEABGJJD_01753,EEABGJJD_01753,EEABGJJD_01753,EEABGJJD_01753,EEABGJJD_01753 +tkt,tkt,putative transketolase,5,5,1.0,1,101,,,,1986,1986,1986.0,EEABGJJD_01401,EEABGJJD_01401,EEABGJJD_01401,EEABGJJD_01401,EEABGJJD_01401 +uvrB,uvrB,putative excinuclease ABC (subunit B),5,5,1.0,1,100,,,,1992,1992,1992.0,EEABGJJD_01104,EEABGJJD_01104,EEABGJJD_01104,EEABGJJD_01104,EEABGJJD_01104 +group_42,,putative transcriptional antiterminator (BglG family),5,5,1.0,1,99,,,,1995,1995,1995.0,EEABGJJD_01113,EEABGJJD_01113,EEABGJJD_01113,EEABGJJD_01113,EEABGJJD_01113 +metS,metS,putative methionyl-tRNA synthetase,5,5,1.0,1,98,,,,1998,1998,1998.0,EEABGJJD_00363,EEABGJJD_00363,EEABGJJD_00363,EEABGJJD_00363,EEABGJJD_00363 +group_41,,putative cation (K+) transport protein,5,5,1.0,1,97,,,,2001,2001,2001.0,EEABGJJD_01180,EEABGJJD_01180,EEABGJJD_01180,EEABGJJD_01180,EEABGJJD_01180 +group_40,,ATP synthase subunit I,5,5,1.0,1,96,,,,2001,2001,2001.0,EEABGJJD_00157,EEABGJJD_00157,EEABGJJD_00157,EEABGJJD_00157,EEABGJJD_00157 
+group_39,,5'-nucleotidase,5,5,1.0,1,95,,,,2013,2013,2013.0,EEABGJJD_00720,EEABGJJD_00720,EEABGJJD_00720,EEABGJJD_00720,EEABGJJD_00720 +recG,recG,ATP-dependent DNA helicase RecG,5,5,1.0,1,94,,,,2016,2016,2016.0,EEABGJJD_01490,EEABGJJD_01490,EEABGJJD_01490,EEABGJJD_01490,EEABGJJD_01490 +group_38,,putative PTS system enzyme II,5,5,1.0,1,93,,,,2025,2025,2025.0,EEABGJJD_01732,EEABGJJD_01732,EEABGJJD_01732,EEABGJJD_01732,EEABGJJD_01732 +hutU,hutU,putative urocanate hydratase,5,5,1.0,1,92,,,,2031,2031,2031.0,EEABGJJD_01719,EEABGJJD_01719,EEABGJJD_01719,EEABGJJD_01719,EEABGJJD_01719 +glyS,glyS,putative glycyl-tRNA synthetase (beta subunit),5,5,1.0,1,91,,,,2040,2040,2040.0,EEABGJJD_01410,EEABGJJD_01410,EEABGJJD_01410,EEABGJJD_01410,EEABGJJD_01410 +group_37,,hypothetical protein,5,5,1.0,1,90,,,,2049,2049,2049.0,EEABGJJD_00833,EEABGJJD_00833,EEABGJJD_00833,EEABGJJD_00833,EEABGJJD_00833 +group_36,,PRD domain-containing protein,5,5,1.0,1,89,,,,2061,2061,2061.0,EEABGJJD_01626,EEABGJJD_01626,EEABGJJD_01626,EEABGJJD_01626,EEABGJJD_01626 +fus,fus,translation elongation factor G EF-G,5,5,1.0,1,88,,,,2079,2079,2079.0,EEABGJJD_00256,EEABGJJD_00256,EEABGJJD_00256,EEABGJJD_00256,EEABGJJD_00256 +clpL,clpL,putative ATP-dependent Clp proteinase (ATP-binding subunit),5,5,1.0,1,87,,,,2100,2100,2100.0,EEABGJJD_00736,EEABGJJD_00736,EEABGJJD_00736,EEABGJJD_00736,EEABGJJD_00736 +topA,topA,putative DNA topoisomerase I,5,5,1.0,1,86,,,,2130,2130,2130.0,EEABGJJD_00979,EEABGJJD_00979,EEABGJJD_00979,EEABGJJD_00979,EEABGJJD_00979 +pnpA,pnpA,putative polynucleotide phosphorylase alpha chain,5,5,1.0,1,85,,,,2133,2133,2133.0,EEABGJJD_01622,EEABGJJD_01622,EEABGJJD_01622,EEABGJJD_01622,EEABGJJD_01622 +group_35,,RNA-binding transcriptional accessory protein,5,5,1.0,1,84,,,,2133,2133,2133.0,EEABGJJD_00487,EEABGJJD_00487,EEABGJJD_00487,EEABGJJD_00487,EEABGJJD_00487 +amyA,amyA,putative cyclomaltodextrin 
glucanotransferase,5,5,1.0,1,83,,,,2136,2136,2136.0,EEABGJJD_01095,EEABGJJD_01095,EEABGJJD_01095,EEABGJJD_01095,EEABGJJD_01095 +phiNCTC81981_2,phiNCTC8198.1_2,peptidase,5,5,1.0,1,82,,,,2145,2145,2145.0,EEABGJJD_00581,EEABGJJD_00581,EEABGJJD_00581,EEABGJJD_00581,EEABGJJD_00581 +nrdE2,nrdE.2,putative ribonucleotide reductase alpha-chain,5,5,1.0,1,81,,,,2160,2160,2160.0,EEABGJJD_01152,EEABGJJD_01152,EEABGJJD_01152,EEABGJJD_01152,EEABGJJD_01152 +group_34,,penicillin-binding protein,5,5,1.0,1,80,,,,2166,2166,2166.0,EEABGJJD_01381,EEABGJJD_01381,EEABGJJD_01381,EEABGJJD_01381,EEABGJJD_01381 +group_33,,amino acid ABC transporter permease,5,5,1.0,1,79,,,,2175,2175,2175.0,EEABGJJD_01105,EEABGJJD_01105,EEABGJJD_01105,EEABGJJD_01105,EEABGJJD_01105 +nrdE,nrdE,class 1b ribonucleoside-diphosphate reductase subunit alpha,5,5,1.0,1,78,,,,2181,2181,2181.0,EEABGJJD_00366,EEABGJJD_00366,EEABGJJD_00366,EEABGJJD_00366,EEABGJJD_00366 +group_32,,putative PTS system enzyme II A component,5,5,1.0,1,77,,,,2187,2187,2187.0,EEABGJJD_01649,EEABGJJD_01649,EEABGJJD_01649,EEABGJJD_01649,EEABGJJD_01649 +nrdD,nrdD,putative anaerobic ribonucleoside-triphosphate reductase,5,5,1.0,1,76,,,,2199,2199,2199.0,EEABGJJD_01741,EEABGJJD_01741,EEABGJJD_01741,EEABGJJD_01741,EEABGJJD_01741 +recJ,recJ,putative single-strand DNA-specific exonuclease,5,5,1.0,1,75,,,,2211,2211,2211.0,EEABGJJD_00771,EEABGJJD_00771,EEABGJJD_00771,EEABGJJD_00771,EEABGJJD_00771 +group_31,,LTA synthase family protein,5,5,1.0,1,74,,,,2211,2211,2211.0,EEABGJJD_00670,EEABGJJD_00670,EEABGJJD_00670,EEABGJJD_00670,EEABGJJD_00670 +relA,relA,(p)ppGpp synthetase,5,5,1.0,1,73,,,,2220,2220,2220.0,EEABGJJD_01645,EEABGJJD_01645,EEABGJJD_01645,EEABGJJD_01645,EEABGJJD_01645 +group_30,,copper-exporting ATPase,5,5,1.0,1,72,,,,2232,2232,2232.0,EEABGJJD_01431,EEABGJJD_01431,EEABGJJD_01431,EEABGJJD_01431,EEABGJJD_01431 +comEC,comEC,putative competence 
protein,5,5,1.0,1,71,,,,2244,2244,2244.0,EEABGJJD_01175,EEABGJJD_01175,EEABGJJD_01175,EEABGJJD_01175,EEABGJJD_01175 +pbpX,pbpX,putative penicillin binding protein 2X,5,5,1.0,1,70,,,,2256,2256,2256.0,EEABGJJD_01392,EEABGJJD_01392,EEABGJJD_01392,EEABGJJD_01392,EEABGJJD_01392 +group_29,,maltose phosphorylase,5,5,1.0,1,69,,,,2265,2265,2265.0,EEABGJJD_01085,EEABGJJD_01085,EEABGJJD_01085,EEABGJJD_01085,EEABGJJD_01085 +group_28,,YhgE/Pip domain-containing protein,5,5,1.0,1,68,,,,2274,2274,2274.0,EEABGJJD_01797,EEABGJJD_01797,EEABGJJD_01797,EEABGJJD_01797,EEABGJJD_01797 +group_27,,DNA primase,5,5,1.0,1,67,,,,2274,2274,2274.0,EEABGJJD_00556,EEABGJJD_00556,EEABGJJD_00556,EEABGJJD_00556,EEABGJJD_00556 +group_26,,pilus ancillary protein 1,5,5,1.0,1,66,,,,2274,2274,2274.0,EEABGJJD_00137,EEABGJJD_00137,EEABGJJD_00137,EEABGJJD_00137,EEABGJJD_00137 +pepXP,pepXP,putative X-Pro dipeptidyl-peptidase IV,5,5,1.0,1,65,,,,2283,2283,2283.0,EEABGJJD_01543,EEABGJJD_01543,EEABGJJD_01543,EEABGJJD_01543,EEABGJJD_01543 +clpE,clpE,putative ATP-dependent protease,5,5,1.0,1,64,,,,2283,2283,2283.0,EEABGJJD_01261,EEABGJJD_01261,EEABGJJD_01261,EEABGJJD_01261,EEABGJJD_01261 +group_25,,penicillin-binding protein,5,5,1.0,1,63,,,,2301,2301,2301.0,EEABGJJD_00114,EEABGJJD_00114,EEABGJJD_00114,EEABGJJD_00114,EEABGJJD_00114 +pcrA,pcrA,putative ATP-dependent DNA helicase,5,5,1.0,1,62,,,,2319,2319,2319.0,EEABGJJD_01066,EEABGJJD_01066,EEABGJJD_01066,EEABGJJD_01066,EEABGJJD_01066 +pfl,pfl,putative pyruvate formate-lyase,5,5,1.0,1,61,,,,2328,2328,2328.0,EEABGJJD_01536,EEABGJJD_01536,EEABGJJD_01536,EEABGJJD_01536,EEABGJJD_01536 +group_24,,putative exoribonuclease R,5,5,1.0,1,60,,,,2334,2334,2334.0,EEABGJJD_00423,EEABGJJD_00423,EEABGJJD_00423,EEABGJJD_00423,EEABGJJD_00423 +pbp2A,pbp2A,penicillin-binding protein 2a,5,5,1.0,1,59,,,,2337,2337,2337.0,EEABGJJD_01703,EEABGJJD_01703,EEABGJJD_01703,EEABGJJD_01703,EEABGJJD_01703 +mutS2,mutS2,putative DNA mismatch repair 
protein,5,5,1.0,1,58,,,,2340,2340,2340.0,EEABGJJD_01528,EEABGJJD_01528,EEABGJJD_01528,EEABGJJD_01528,EEABGJJD_01528 +phiMGAS50052_20,phiMGAS5005.2_20,phage protein,5,5,1.0,1,57,,,,2358,2358,2358.0,EEABGJJD_01206,EEABGJJD_01206,EEABGJJD_01206,EEABGJJD_01206,EEABGJJD_01206 +group_23,,internalin,5,5,1.0,1,56,,,,2379,2379,2379.0,EEABGJJD_01139,EEABGJJD_01139,EEABGJJD_01139,EEABGJJD_01139,EEABGJJD_01139 +priA,priA,putative primosomal replication factor Y,5,5,1.0,1,55,,,,2385,2385,2385.0,EEABGJJD_01364,EEABGJJD_01364,EEABGJJD_01364,EEABGJJD_01364,EEABGJJD_01364 +group_22,,CRISPR-associated helicase/endonuclease Cas3,5,5,1.0,1,54,,,,2403,2403,2403.0,EEABGJJD_01310,EEABGJJD_01310,EEABGJJD_01310,EEABGJJD_01310,EEABGJJD_01310 +pheT,pheT,phenylalanyl-tRNA synthetase (beta subunit),5,5,1.0,1,53,,,,2406,2406,2406.0,EEABGJJD_00637,EEABGJJD_00637,EEABGJJD_00637,EEABGJJD_00637,EEABGJJD_00637 +group_21,,DNA translocase FtsK,5,5,1.0,1,52,,,,2406,2406,2406.0,EEABGJJD_00388,EEABGJJD_00388,EEABGJJD_00388,EEABGJJD_00388,EEABGJJD_00388 +pflD,pflD,putative pyruvate formate-lyase 2,5,5,1.0,1,51,,,,2418,2418,2418.0,EEABGJJD_01694,EEABGJJD_01694,EEABGJJD_01694,EEABGJJD_01694,EEABGJJD_01694 +group_20,,hyaluronate lyase,5,5,1.0,1,50,,,,2418,2418,2418.0,EEABGJJD_00860,EEABGJJD_00860,EEABGJJD_00860,EEABGJJD_00860,EEABGJJD_00860 +parC,parC,putative DNA topoisomerase IV (subunit C),5,5,1.0,1,49,,,,2424,2424,2424.0,EEABGJJD_00754,EEABGJJD_00754,EEABGJJD_00754,EEABGJJD_00754,EEABGJJD_00754 +group_19,,ATP-dependent Clp protease ATP-binding subunit,5,5,1.0,1,48,,,,2445,2445,2445.0,EEABGJJD_01712,EEABGJJD_01712,EEABGJJD_01712,EEABGJJD_01712,EEABGJJD_01712 +group_18,,ATP-dependent RecD-like DNA helicase,5,5,1.0,1,47,,,,2454,2454,2454.0,EEABGJJD_01533,EEABGJJD_01533,EEABGJJD_01533,EEABGJJD_01533,EEABGJJD_01533 +group_17,,phosphoglycerol transferase,5,5,1.0,1,46,,,,2475,2475,2475.0,EEABGJJD_00658,EEABGJJD_00658,EEABGJJD_00658,EEABGJJD_00658,EEABGJJD_00658 +group_16,,pneumococcal-type histidine triad 
protein,5,5,1.0,1,45,,,,2478,2478,2478.0,EEABGJJD_01667,EEABGJJD_01667,EEABGJJD_01667,EEABGJJD_01667,EEABGJJD_01667 +gyrA,gyrA,DNA gyrase A subunit,5,5,1.0,1,44,,,,2487,2487,2487.0,EEABGJJD_00968,EEABGJJD_00968,EEABGJJD_00968,EEABGJJD_00968,EEABGJJD_00968 +dinG,dinG,putative ATP-dependent DNA helicase,5,5,1.0,1,43,,,,2502,2502,2502.0,EEABGJJD_00536,EEABGJJD_00536,EEABGJJD_00536,EEABGJJD_00536,EEABGJJD_00536 +leuS,leuS,putative leucyl-tRNA synthetase,5,5,1.0,1,42,,,,2502,2502,2502.0,EEABGJJD_00178,EEABGJJD_00178,EEABGJJD_00178,EEABGJJD_00178,EEABGJJD_00178 +secA,secA,preprotein translocase subunit SecA,5,5,1.0,1,41,,,,2520,2520,2520.0,EEABGJJD_01504,EEABGJJD_01504,EEABGJJD_01504,EEABGJJD_01504,EEABGJJD_01504 +pepN,pepN,putative lysyl-aminopeptidase; aminopeptidase N,5,5,1.0,1,40,,,,2538,2538,2538.0,EEABGJJD_01040,EEABGJJD_01040,EEABGJJD_01040,EEABGJJD_01040,EEABGJJD_01040 +mutS,mutS,putative DNA mismatch repair protein,5,5,1.0,1,39,,,,2556,2556,2556.0,EEABGJJD_01774,EEABGJJD_01774,EEABGJJD_01774,EEABGJJD_01774,EEABGJJD_01774 +group_15,,ABC transporter permease,5,5,1.0,1,38,,,,2577,2577,2577.0,EEABGJJD_01828,EEABGJJD_01828,EEABGJJD_01828,EEABGJJD_01828,EEABGJJD_01828 +alaS,alaS,putative alanyl-tRNA synthetase,5,5,1.0,1,37,,,,2619,2619,2619.0,EEABGJJD_01160,EEABGJJD_01160,EEABGJJD_01160,EEABGJJD_01160,EEABGJJD_01160 +prgA,prgA,putative surface exclusion protein,5,5,1.0,1,36,,,,2622,2622,2622.0,EEABGJJD_00253,EEABGJJD_00253,EEABGJJD_00253,EEABGJJD_00253,EEABGJJD_00253 +group_14,,FtsX-like permease family protein,5,5,1.0,1,35,,,,2637,2637,2637.0,EEABGJJD_01056,EEABGJJD_01056,EEABGJJD_01056,EEABGJJD_01056,EEABGJJD_01056 +polA,polA,DNA-directed DNA polymerase I,5,5,1.0,1,34,,,,2643,2643,2643.0,EEABGJJD_00189,EEABGJJD_00189,EEABGJJD_00189,EEABGJJD_00189,EEABGJJD_00189 +group_13,,bifunctional acetaldehyde-CoA/alcohol dehydrogenase,5,5,1.0,1,33,,,,2643,2643,2643.0,EEABGJJD_00061,EEABGJJD_00061,EEABGJJD_00061,EEABGJJD_00061,EEABGJJD_00061 +group_12,,valine--tRNA 
ligase,5,5,1.0,1,32,,,,2649,2649,2649.0,EEABGJJD_01311,EEABGJJD_01311,EEABGJJD_01311,EEABGJJD_01311,EEABGJJD_01311 +pacL,pacL,putative calcium-transporting ATPase,5,5,1.0,1,31,,,,2682,2682,2682.0,EEABGJJD_00516,EEABGJJD_00516,EEABGJJD_00516,EEABGJJD_00516,EEABGJJD_00516 +group_11,,alpha-mannosidase,5,5,1.0,1,30,,,,2706,2706,2706.0,EEABGJJD_01336,EEABGJJD_01336,EEABGJJD_01336,EEABGJJD_01336,EEABGJJD_01336 +group_10,,nuclease,5,5,1.0,1,29,,,,2733,2733,2733.0,EEABGJJD_00620,EEABGJJD_00620,EEABGJJD_00620,EEABGJJD_00620,EEABGJJD_00620 +ppc,ppc,putative phosphoenolpyruvate carboxylase,5,5,1.0,1,28,,,,2799,2799,2799.0,EEABGJJD_00507,EEABGJJD_00507,EEABGJJD_00507,EEABGJJD_00507,EEABGJJD_00507 +ileS,ileS,putative isoleucyl-tRNA synthetase,5,5,1.0,1,27,,,,2802,2802,2802.0,EEABGJJD_01264,EEABGJJD_01264,EEABGJJD_01264,EEABGJJD_01264,EEABGJJD_01264 +uvrA,uvrA,excinuclease ABC subunit UvrA,5,5,1.0,1,26,,,,2829,2829,2829.0,EEABGJJD_01517,EEABGJJD_01517,EEABGJJD_01517,EEABGJJD_01517,EEABGJJD_01517 +infB,infB,putative initiation factor 2,5,5,1.0,1,25,,,,2862,2862,2862.0,EEABGJJD_01435,EEABGJJD_01435,EEABGJJD_01435,EEABGJJD_01435,EEABGJJD_01435 +group_9,,type I restriction endonuclease subunit R,5,5,1.0,1,24,,,,2979,2979,2979.0,EEABGJJD_01587,EEABGJJD_01587,EEABGJJD_01587,EEABGJJD_01587,EEABGJJD_01587 +group_8,,endo-beta-N-acetylglucosaminidase,5,5,1.0,1,23,,,,2988,2988,2988.0,EEABGJJD_01508,EEABGJJD_01508,EEABGJJD_01508,EEABGJJD_01508,EEABGJJD_01508 +group_7,,cell surface protein,5,5,1.0,1,22,,,,3027,3027,3027.0,EEABGJJD_00698,EEABGJJD_00698,EEABGJJD_00698,EEABGJJD_00698,EEABGJJD_00698 +group_6,,putative regulatory protein,5,5,1.0,1,21,,,,3075,3075,3075.0,EEABGJJD_01727,EEABGJJD_01727,EEABGJJD_01727,EEABGJJD_01727,EEABGJJD_01727 +snf,snf,putative SNF helicase,5,5,1.0,1,20,,,,3096,3096,3096.0,EEABGJJD_00310,EEABGJJD_00310,EEABGJJD_00310,EEABGJJD_00310,EEABGJJD_00310 +dnaE,dnaE,putative DNA polymerase III (alpha 
subunit),5,5,1.0,1,19,,,,3111,3111,3111.0,EEABGJJD_01078,EEABGJJD_01078,EEABGJJD_01078,EEABGJJD_01078,EEABGJJD_01078 +carB,carB,putative carbamoylphosphate synthetase,5,5,1.0,1,18,,,,3177,3177,3177.0,EEABGJJD_00691,EEABGJJD_00691,EEABGJJD_00691,EEABGJJD_00691,EEABGJJD_00691 +rexB,rexB,putative ATP-dependent exonuclease subunit B,5,5,1.0,1,17,,,,3216,3216,3216.0,EEABGJJD_00643,EEABGJJD_00643,EEABGJJD_00643,EEABGJJD_00643,EEABGJJD_00643 +phiNCTC81984_5,phiNCTC8198.4_5,tape measure protein,5,5,1.0,1,16,,,,3261,3261,3261.0,EEABGJJD_00579,EEABGJJD_00579,EEABGJJD_00579,EEABGJJD_00579,EEABGJJD_00579 +group_5,,putative beta-galactosidase,5,5,1.0,1,15,,,,3417,3417,3417.0,EEABGJJD_01324,EEABGJJD_01324,EEABGJJD_01324,EEABGJJD_01324,EEABGJJD_01324 +pulA,pulA,putative pullulanase,5,5,1.0,1,14,,,,3498,3498,3498.0,EEABGJJD_01639,EEABGJJD_01639,EEABGJJD_01639,EEABGJJD_01639,EEABGJJD_01639 +trcF,trcF,putative transcription-repair coupling factor,5,5,1.0,1,13,,,,3504,3504,3504.0,EEABGJJD_00006,EEABGJJD_00006,EEABGJJD_00006,EEABGJJD_00006,EEABGJJD_00006 +smc,smc,putative chromosome segregation SMC protein,5,5,1.0,1,12,,,,3540,3540,3540.0,EEABGJJD_00447,EEABGJJD_00447,EEABGJJD_00447,EEABGJJD_00447,EEABGJJD_00447 +group_4,,S8 family serine peptidase,5,5,1.0,1,11,,,,3546,3546,3546.0,EEABGJJD_01670,EEABGJJD_01670,EEABGJJD_01670,EEABGJJD_01670,EEABGJJD_01670 +rpoB,rpoB,putative DNA-dependent RNA polymerase subunit beta,5,5,1.0,1,10,,,,3567,3567,3567.0,EEABGJJD_00115,EEABGJJD_00115,EEABGJJD_00115,EEABGJJD_00115,EEABGJJD_00115 +addA,addA,helicase-exonuclease AddAB subunit AddA,5,5,1.0,1,9,,,,3633,3633,3633.0,EEABGJJD_00644,EEABGJJD_00644,EEABGJJD_00644,EEABGJJD_00644,EEABGJJD_00644 +phiSF3702_8,phiSF370.2_8,putative minor tail protein - phage associated,5,5,1.0,1,8,,,,3636,3636,3636.0,EEABGJJD_00831,EEABGJJD_00831,EEABGJJD_00831,EEABGJJD_00831,EEABGJJD_00831 +rpoC,rpoC,DNA-dependent RNA polymerase B' 
subunit,5,5,1.0,1,7,,,,3642,3642,3642.0,EEABGJJD_00116,EEABGJJD_00116,EEABGJJD_00116,EEABGJJD_00116,EEABGJJD_00116 +group_3,,putative phosphoribosylformylglycinamidine synthase II,5,5,1.0,1,6,,,,3774,3774,3774.0,EEABGJJD_00044,EEABGJJD_00044,EEABGJJD_00044,EEABGJJD_00044,EEABGJJD_00044 +group_2,,amino acid ABC transporter substrate-binding protein,5,5,1.0,1,5,,,,3828,3828,3828.0,EEABGJJD_01500,EEABGJJD_01500,EEABGJJD_01500,EEABGJJD_01500,EEABGJJD_01500 +group_1,,type II CRISPR RNA-guided endonuclease Cas9,5,5,1.0,1,4,,,,4107,4107,4107.0,EEABGJJD_00872,EEABGJJD_00872,EEABGJJD_00872,EEABGJJD_00872,EEABGJJD_00872 +polC,polC,DNA polymerase III (alpha subunit),5,5,1.0,1,3,,,,4398,4398,4398.0,EEABGJJD_01632,EEABGJJD_01632,EEABGJJD_01632,EEABGJJD_01632,EEABGJJD_01632 +group_0,,peptidase S8,5,5,1.0,1,2,,,,4944,4944,4944.0,EEABGJJD_00361,EEABGJJD_00361,EEABGJJD_00361,EEABGJJD_00361,EEABGJJD_00361 +epf,epf,putative extracellular matrix binding protein,5,5,1.0,1,1,,,,6180,6180,6180.0,EEABGJJD_00610,EEABGJJD_00610,EEABGJJD_00610,EEABGJJD_00610,EEABGJJD_00610 From e5ee7626e4d7855f1da12171223e9f692aa8135d Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Tue, 11 Jan 2022 12:59:11 +1100 Subject: [PATCH 071/135] Add in correct name of program in expected commanline help print --- functional_tests/test_data/no_input.expected | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/functional_tests/test_data/no_input.expected b/functional_tests/test_data/no_input.expected index 251840c..f991755 100644 --- a/functional_tests/test_data/no_input.expected +++ b/functional_tests/test_data/no_input.expected @@ -1,4 +1,4 @@ -usage: __main__.py -ig file.gff [file.gff ...] -ip path/to/pan_genome +usage: Corekaburra -ig file.gff [file.gff ...] 
-ip path/to/pan_genome [-cg complete_genomes.txt] [-a] [-cc 1.0] [-lc 0.05] [-o path/to/output] [-p OUTPUT_PREFIX] [-d] [-c int] [-l | -q] [-h] From 532b71168b0c438886a310ad0b72a0cf1bebb1dd Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Tue, 25 Jan 2022 11:27:40 +1100 Subject: [PATCH 072/135] Add in handling of refound genes in fragmented genes. Improve README with a conda environment installation guide. Add version print to commandline --- Corekaburra/__main__.py | 20 +-- Corekaburra/commandline_interface.py | 14 +- Corekaburra/correct_gffs.py | 8 +- Corekaburra/parse_gene_presence_absence.py | 62 ++++++-- README.md | 20 +-- functional_tests/test_data/no_input.expected | 12 +- unit_tests/Corekaburra_test.py | 149 ++++++++++++++++-- .../Corrected_gffs/place_holder | 0 .../Silas_the_Salmonella_w_refound.gff | 11 ++ ...ne_presence_absence_w_refound_fragment.csv | 8 + 10 files changed, 247 insertions(+), 57 deletions(-) create mode 100644 unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/Corrected_gffs/place_holder create mode 100644 unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/Silas_the_Salmonella_w_refound.gff create mode 100644 unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/gene_presence_absence_w_refound_fragment.csv diff --git a/Corekaburra/__main__.py b/Corekaburra/__main__.py index d0af37c..26668ab 100644 --- a/Corekaburra/__main__.py +++ b/Corekaburra/__main__.py @@ -146,7 +146,7 @@ def main(): inital_check_time_start = time.time() # get arguments from the commandline - args = get_commandline_arguments(sys.argv[1:]) + args = get_commandline_arguments(sys.argv[1:], PROGRAM_VERSION) # Construct output folder try: @@ -195,21 +195,23 @@ def main(): ## Read in gene presence absence file time_start_read_files = time.time() + # Prepair folder for reannotated genes and examine if any are already present + if source_program == "Panaroo" and args.annotate: + gene_data_dict, corrected_dir, 
args.input_gffs = prepair_for_reannotation(gene_data_path, args.output_path, + args.input_gffs, logger) + else: + gene_data_dict = None + corrected_dir = None + # TODO - ATM the column with presence of gene in genomes is used to define what is core and not. Is it better to use the number of input gffs instead? # - There are upsides to the current. You can use the same genome to find segments for two different populations with in the dataset using the same reference of core-genes # - Making it depend on the input is not viable for comparing runs, even within the same pan-genome, when using different sets of gff files. # TODO - Some day it would be awesome to be able to provide a clustering/population structure which could divide genes into the 13 definitions outlined by Horesh et al. [DOI: 10.1099/mgen.0.000670] core_dict, low_freq_dict, acc_gene_dict = read_gene_presence_absence(input_pres_abs_file_path, args.core_cutoff, args.low_cutoff, source_program, - args.input_gffs, tmp_folder_path, logger) + args.input_gffs, tmp_folder_path, + gene_data_dict, corrected_dir, logger) - # Prepair folder for reannotated genes and examine if any are already present - if source_program == "Panaroo" and args.annotate: - gene_data_dict, corrected_dir, args.input_gffs = prepair_for_reannotation(gene_data_path, args.output_path, - args.input_gffs, logger) - else: - gene_data_dict = None - corrected_dir = None time_end_read_files = time.time() time_start_passing_gffs = time.time() diff --git a/Corekaburra/commandline_interface.py b/Corekaburra/commandline_interface.py index 72614e6..62cbb40 100644 --- a/Corekaburra/commandline_interface.py +++ b/Corekaburra/commandline_interface.py @@ -4,15 +4,16 @@ EXIT_COMMAND_LINE_ERROR = 2 -def get_commandline_arguments(args): +def get_commandline_arguments(args, version): """ Function that takes the input given to the commandline and passes it. 
will check for no input and '-help' :param args: List of input arguments given to the commandline + :param version: Version of Corekaburra :return: matched argument object for passing in main function. """ # Set up parser - parser = argparse.ArgumentParser(description='Welcome to Corekaburra!' + parser = argparse.ArgumentParser(description='Welcome to Corekaburra! ' 'An extension to pan-genome analyses that summarise genomic regions ' 'between core genes and segments of neighbouring core genes using ' 'gene synteny from a set of input genomes and a pan-genome folder.', @@ -124,7 +125,10 @@ def get_commandline_arguments(args): action='help', help='Show help function') - + rem_args.add_argument('-v', + '--version', + action='version', + version=f'Corekaburra {version}') # Check if any thing is given as input otherwise warn and print help if len(args) < 1: @@ -137,3 +141,7 @@ def get_commandline_arguments(args): args = parser.parse_args(args) return args + + +if __name__ == '__main__': + get_commandline_arguments([], 666) \ No newline at end of file diff --git a/Corekaburra/correct_gffs.py b/Corekaburra/correct_gffs.py index dd6e5c7..40ad9ef 100644 --- a/Corekaburra/correct_gffs.py +++ b/Corekaburra/correct_gffs.py @@ -33,7 +33,7 @@ def read_gene_data(gene_data_file): # Check if refound gene if 'refound' in line[2]: - # Try to add the refound gene to the gene_data dict as a second key, value being the DNA sequence, + # Try to add the refound gene to the gene_data dict as a second key, value being the DNA sequence, name, and function in that order. 
# if the first key (genome) is not found in gene_data dict, # then construct dict for the genome and add the gene try: @@ -316,9 +316,3 @@ def annotate_refound_genes(gff_name, gene_data_dict, tmp_folder_path, corrected_ if __name__ == '__main__': pass - # _, _, attribute_dict = read_gene_presence_absence('/Users/mjespersen/OneDrive - The University of Melbourne/Phd/Parts/Recombination_hotspots/Code/Between_core_variation/data_for_unit_tests/test_pan_genome/50_refseq_pan_split_paralogs/gene_presence_absence_roary.csv', - # 1, 0.05) - # - # correct_gffs(['/Users/mjespersen/OneDrive - The University of Melbourne/Phd/Parts/Recombination_hotspots/Code/Between_core_variation/data_for_unit_tests/test_pan_genome/50_refseq_genomes/GCA_008694005.gff'], '/Users/mjespersen/OneDrive - The University of Melbourne/Phd/Parts/Recombination_hotspots/Code/Between_core_variation/data_for_unit_tests/test_pan_genome/50_refseq_pan_split_paralogs/gene_data.csv', - # "/Users/mjespersen/OneDrive - The University of Melbourne/Phd/Parts/Recombination_hotspots/Code/Between_core_variation/data_for_unit_tests", attribute_dict) - # # genome_dict = extract_genome_fasta('/Users/mjespersen/OneDrive - The University of Melbourne/Phd/Parts/Recombination_hotspots/Code/Between_core_variation/data_for_unit_tests/test_pan_genome/50_refseq_genomes/GCA_000006785.gff') diff --git a/Corekaburra/parse_gene_presence_absence.py b/Corekaburra/parse_gene_presence_absence.py index d680ad1..b1f1eba 100644 --- a/Corekaburra/parse_gene_presence_absence.py +++ b/Corekaburra/parse_gene_presence_absence.py @@ -3,6 +3,12 @@ from math import ceil, floor import gffutils +try: + from Corekaburra.correct_gffs import annotate_refound_genes +except ModuleNotFoundError: + from correct_gffs import annotate_refound_genes + + def add_gene_to_dict(main_dict, gene, pan_gene_name, genome): """ @@ -22,7 +28,7 @@ def add_gene_to_dict(main_dict, gene, pan_gene_name, genome): return main_dict -def check_fragmented_gene(fragment_info, 
input_gffs, tmp_folder_path): +def check_fragmented_gene(fragment_info, input_gffs, tmp_folder_path, gene_data_dict, corrected_dir, logger): """ Function that check for that placement of fragmented gene parts, to determine if they are neighbouring or have some genomic feature between them :param fragment_info: List of genes that are found to be fragmented, one composite of fragments for each index @@ -30,7 +36,20 @@ def check_fragmented_gene(fragment_info, input_gffs, tmp_folder_path): :param tmp_folder_path: A file-path to the temporary folder of the Corekaburra run :return: A List of booleans indicating if a fragments has nothing in between fragments (True) or not (False) """ - return_list = [] + # Check if any refound genes are in fragments to be checked, if then reannotate the genes before checking: + refound_genes = [[i, gene_gff] for i, gene_gff in enumerate(fragment_info) if 'refound' in gene_gff[0]] + if refound_genes: + for i, gene_gff in refound_genes: + # TODO Check if corrected genome is already made if then skip and just correct genome to look in. 
+ gene, gff = gene_gff + gff_name = [gff_name for gff_name in input_gffs + if gff in [os.path.basename(gff_name), + os.path.basename(gff_name).rsplit('.', 1)[0], + os.path.basename(gff_name).rsplit('.', 1)[0].rsplit('.', 1)[0]]][0] + + fragment_info[i][1] = annotate_refound_genes(gff_name, gene_data_dict, tmp_folder_path, corrected_dir, logger) + + fragments_close = [] for fragment in fragment_info: # split the two fragments fragment_pieces = fragment[0].split(';') @@ -39,21 +58,27 @@ def check_fragmented_gene(fragment_info, input_gffs, tmp_folder_path): genome = fragment[1] # Get the gff and its path - try: - gff_file = [file for file in input_gffs if f'{genome}.gff' in file][0] - except IndexError: - raise NotImplementedError(f'No gff match was found when searching fragments for genome: {genome}') + if '.gff' not in genome: + try: + gff_file = [file for file in input_gffs if f'{genome}.gff' in file][0] + db_name = os.path.join(tmp_folder_path, f'{genome}_db') + except IndexError: + raise NotImplementedError(f'No gff match was found when searching fragments for genome: {genome}') + else: + gff_file = genome + db_name = f"{os.path.basename(genome)}_db" + db_name = os.path.join(tmp_folder_path, db_name) # Construct gff database to be searched - db_name = os.path.join(tmp_folder_path, f'{genome}_db') if not os.path.isfile(db_name): - gffutils.create_db(gff_file, db_name, force_gff=True) + gffutils.create_db(gff_file, db_name, force_gff=True, id_spec=['old_locus_tag', 'ID']) # Attach database gff_database = gffutils.FeatureDB(db_name) # Check that all fragments are on the same contig. 
first_fragment_contig = gff_database[fragment_pieces[0]][0] + frag_same_contig = all([first_fragment_contig == gff_database[fragment][0] for fragment in fragment_pieces]) if frag_same_contig: # Get all coordinates @@ -70,23 +95,29 @@ def check_fragmented_gene(fragment_info, input_gffs, tmp_folder_path): # Find all features that are completely within the region region_features = gff_database.region(region=region, completely_within=True) + # Find if some pieces are refound and change old_locus_tag to ID + refound_pieces = [[i, fragment_piece] for i, fragment_piece in enumerate(fragment_pieces) if 'refound' in fragment_piece] + if refound_pieces: + for piece in refound_pieces: + fragment_pieces[i] = gff_database[piece[1]]['ID'][0] + # find all genes that are not part of the fragmented gene region_locus_tags = set([feature[8]['locus_tag'][0] for feature in region_features]) excess_genes = region_locus_tags.difference(fragment_pieces) # check the number of excess genes, if any then False to being core if len(excess_genes) > 0: - return_list.append(False) + fragments_close.append(False) else: - return_list.append(True) + fragments_close.append(True) else: - return_list.append(False) + fragments_close.append(False) - return return_list + return fragments_close # TODO - find out what the non-closed file problem is here! Can be seen when running unit-tests. -def read_gene_presence_absence(pres_abs_file, core_gene_presence, low_freq_gene, source_program, input_gffs, tmp_folder_path, logger): +def read_gene_presence_absence(pres_abs_file, core_gene_presence, low_freq_gene, source_program, input_gffs, tmp_folder_path, gene_data_dict, corrected_dir, logger): """ Function that pass a Roary style gene presence/absence file. 
:param pres_abs_file: File path to the gene presence/absence file identified @@ -166,10 +197,11 @@ def read_gene_presence_absence(pres_abs_file, core_gene_presence, low_freq_gene, fragment_info = [[genes, gff] for genes, gff in zip(line[14:], gff_file_names[14:]) if ';' in genes] # Check that each annotation is neighboring the other annotation. - return_list = check_fragmented_gene(fragment_info, input_gffs, tmp_folder_path) # TODO - If a core gene is found to be made up of fragments not places close enough (With something in between) should this then not be subtracted from the core gene count? - How would this be handled if there is a gff that is not given as input? + fragments_close = check_fragmented_gene(fragment_info, input_gffs, tmp_folder_path, gene_data_dict, + corrected_dir, logger) # TODO - If a core gene is found to be made up of fragments not places close enough (With something in between) should this then not be subtracted from the core gene count? - How would this be handled if there is a gff that is not given as input? # Check if gene was found to be a core gene - if all(return_list): + if all(fragments_close): # Add the gene to the annotation dict for genome in core_gene_dict: # Get the annoations for a specific genome diff --git a/README.md b/README.md index 720d766..e08541d 100644 --- a/README.md +++ b/README.md @@ -7,22 +7,24 @@ and distance between core genes. Information from neighboring core genes is furt gene clusters throughout the pan-genome that appear in all genomes given as input. Corekaburra is compatible with outputs from standard pan-genome pipelines: [Roary](academic.oup.com/bioinformatics/article/31/22/3691/240757) and [Panaroo](genomebiology.biomedcentral.com/articles/10.1186/s13059-020-02090-4). -# When to use +# Why and When to use Corekaburra Corekaburra fits into the existing frameworks of bioinformatics pipelines for pan-genomes. It does not reinvent a new pan-genome pipeline, but leverages the existing ones. 
Because of this, Corekaburra is build to be a natural extension to the analysis of pan-genomes by summarising information and inferring relationships in the pan-genome otherwise not easily accessible via pan-genome graphs. Other tools provide similar outputs or information, but in their own standalone pan-genome analysis framework or pipeline. Such frameworks/pipelines are [PPanGGolin](https://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1007732) and [Panakeia](https://www.biorxiv.org/content/biorxiv/early/2021/03/02/2021.03.02.433540.full.pdf). By building on top of existing tools Corekaburra frees users from potentially cross referencing beteween pan-genomes, which in itself is a challenging task. Corekaburra's workflow also allows it to be extended to any pan-genome tool, with an output similar to the gene_presence_absence.csv produced by Roary, making Corekaburra versatile for future implementations. -# Why use Corekaburra? - - # Installation -Corekaburra can be installed via pip and conda. A Docker container is also available. +Corekaburra is writen in Python 3.9, and can be installed via pip and conda. A Docker container is also available. 
## pip -```Comming soon``` +```pip install corekaburra``` + +## building a Conda environment from scratch +```conda create -n Corekaburra python==3.9``` +```conda activate Corekaburra``` +```pip install corekaburra``` -## Conda +## Conda install ```Comming``` ## Docker -See the Wiki for more information (*** Link to wiki's Docker page ***)[] +See the (Wiki for more information)[https://github.com/milnus/Corekaburra/wiki/Docker.md] # Help ``` @@ -114,7 +116,7 @@ A folder containing Gff files that have been corrected by annotating the genes r **Notice this will duplicate your Gff files, meaning that ```-a``` or ```-d``` arguments should be used to avoid this, when dealing with memory issues or large datasets** # For more info -For more into on Corekaburra, its workings, inputs, outputs and more see the (wiki)[*** Wiki link ***] +For more into on Corekaburra, its workings, inputs, outputs and more see the (wiki)[https://github.com/milnus/Corekaburra/wiki] # Bug reporting and feature requests diff --git a/functional_tests/test_data/no_input.expected b/functional_tests/test_data/no_input.expected index f991755..bbc0960 100644 --- a/functional_tests/test_data/no_input.expected +++ b/functional_tests/test_data/no_input.expected @@ -1,9 +1,10 @@ -usage: Corekaburra -ig file.gff [file.gff ...] -ip path/to/pan_genome - [-cg complete_genomes.txt] [-a] [-cc 1.0] [-lc 0.05] - [-o path/to/output] [-p OUTPUT_PREFIX] [-d] [-c int] - [-l | -q] [-h] +usage: Corekaburra -ig file.gff [file.gff ...] -ip + path/to/pan_genome [-cg complete_genomes.txt] + [-a] [-cc 1.0] [-lc 0.05] [-o path/to/output] + [-p OUTPUT_PREFIX] [-d] [-c int] [-l | -q] + [-h] [-v] -Welcome to Corekaburra!An extension to pan-genome analyses that summarise +Welcome to Corekaburra! An extension to pan-genome analyses that summarise genomic regions between core genes and segments of neighbouring core genes using gene synteny from a set of input genomes and a pan-genome folder. 
@@ -44,3 +45,4 @@ Other arguments: -l, --log Record program progress in for debugging purpose -q, --quiet Only print warnings -h, --help Show help function + -v, --version show program's version number and exit diff --git a/unit_tests/Corekaburra_test.py b/unit_tests/Corekaburra_test.py index 2b073ea..15860ca 100644 --- a/unit_tests/Corekaburra_test.py +++ b/unit_tests/Corekaburra_test.py @@ -259,6 +259,10 @@ class TestCheckingFragmentedGenes(unittest.TestCase): """ Test of the function that examines the placement of a potential core gene's placement, if it is fragmented in at least one genome. """ + @classmethod + def setUpClass(cls): + cls.logger = logging.getLogger('test_logger.log') + cls.logger.setLevel(logging.INFO) def tearDown(self): """ Class to remove created database files of gff files in tmp-folder""" @@ -281,10 +285,13 @@ def test_fragmented_gene_true(self): 'TestParsingGenePresenceAbsenceFile/Silas_the_Legionella.gff', 'TestParsingGenePresenceAbsenceFile/Lilly_the_Shigella.gff'] tmp_folder_path = 'test_tmp_folder' + gene_data_file = {} + corrected_dir = '' expected_return = [True] - return_bool = parse_gene_presence_absence.check_fragmented_gene(fragments_info, input_gffs, tmp_folder_path) + return_bool = parse_gene_presence_absence.check_fragmented_gene(fragments_info, input_gffs, tmp_folder_path, + gene_data_file, corrected_dir, self.logger) self.assertEqual(expected_return, return_bool) @@ -302,10 +309,13 @@ def test_fragmented_gene_fasle(self): 'TestParsingGenePresenceAbsenceFile/Silas_the_Legionella.gff', 'TestParsingGenePresenceAbsenceFile/Lilly_the_Shigella.gff'] tmp_folder_path = 'test_tmp_folder' + gene_data_file = {} + corrected_dir = '' expected_return = [False] - return_bool = parse_gene_presence_absence.check_fragmented_gene(fragments_info, input_gffs, tmp_folder_path) + return_bool = parse_gene_presence_absence.check_fragmented_gene(fragments_info, input_gffs, tmp_folder_path, + gene_data_file, corrected_dir, self.logger) 
self.assertEqual(expected_return, return_bool) @@ -324,10 +334,13 @@ def test_fragmented_gene_mutiple_genes_fasle(self): 'TestParsingGenePresenceAbsenceFile/Silas_the_Legionella.gff', 'TestParsingGenePresenceAbsenceFile/Lilly_the_Shigella.gff'] tmp_folder_path = 'test_tmp_folder' + gene_data_file = {} + corrected_dir = '' expected_return = [True, False] - return_bool = parse_gene_presence_absence.check_fragmented_gene(fragment_info, input_gffs, tmp_folder_path) + return_bool = parse_gene_presence_absence.check_fragmented_gene(fragment_info, input_gffs, tmp_folder_path, + gene_data_file, corrected_dir, self.logger) self.assertEqual(expected_return, return_bool) @@ -340,10 +353,13 @@ def test_fragments_on_separate_contigs(self): 'TestCheckingFragmentedGenes/Silas_the_Legionella.gff', 'TestCheckingFragmentedGenes/Lilly_the_Shigella.gff'] tmp_folder_path = 'test_tmp_folder' + gene_data_file = {} + corrected_dir = '' expected_return = [False, False] - return_bool = parse_gene_presence_absence.check_fragmented_gene(fragments_info, input_gffs, tmp_folder_path) + return_bool = parse_gene_presence_absence.check_fragmented_gene(fragments_info, input_gffs, tmp_folder_path, + gene_data_file, corrected_dir, self.logger) self.assertEqual(expected_return, return_bool) @@ -357,6 +373,20 @@ def setUpClass(cls): cls.logger = logging.getLogger('test_logger.log') cls.logger.setLevel(logging.INFO) + def tearDown(self): + try: + os.remove('TestParsingGenePresenceAbsenceFile/Silas_the_Salmonella_w_refound_db') + except FileNotFoundError: + pass + + try: + for file in os.listdir('TestParsingGenePresenceAbsenceFile/Corrected_gffs/'): + if '.gff' in file: + print(file) + os.remove(os.path.join('TestParsingGenePresenceAbsenceFile/Corrected_gffs/', file)) + except FileNotFoundError: + pass + def test_parsing_w_100_presence(self): file_name = 'TestParsingGenePresenceAbsenceFile/gene_presence_absence_roary.csv' core_gene_presence = 1 @@ -373,6 +403,8 @@ def test_parsing_w_100_presence(self): 
'TestParsingGenePresenceAbsenceFile/Silas_the_Legionella.gff', 'TestParsingGenePresenceAbsenceFile/Lilly_the_Shigella.gff'] tmp_folder_path = 'TestParsingGenePresenceAbsenceFile/' + gene_data_file = {} + corrected_dir = '' expected_core_gene_dict = {'Silas_the_Salmonella': {'Silas_the_Salmonella_tag-1-1': "A", 'Silas_the_Salmonella_tag-1-2.1': "B", @@ -441,7 +473,7 @@ def test_parsing_w_100_presence(self): parse_gene_presence_absence.read_gene_presence_absence( file_name, core_gene_presence, low_freq_gene, source_program, - input_gffs, tmp_folder_path, self.logger) + input_gffs, tmp_folder_path, gene_data_file, corrected_dir, self.logger) self.assertEqual(expected_core_gene_dict, core_gene_dict) self.assertEqual(expected_low_freq_gene_dict, low_freq_gene_dict) @@ -463,7 +495,8 @@ def test_parsing_w_100_presence_roary(self): 'TestParsingGenePresenceAbsenceFile/Silas_the_Legionella.gff', 'TestParsingGenePresenceAbsenceFile/Lilly_the_Shigella.gff'] tmp_folder_path = 'TestParsingGenePresenceAbsenceFile/' - + gene_data_file = {} + corrected_dir = '' core_gene_dict, low_freq_gene_dict, \ @@ -471,7 +504,7 @@ def test_parsing_w_100_presence_roary(self): parse_gene_presence_absence.read_gene_presence_absence( file_name, core_gene_presence, low_freq_gene, source_program, - input_gffs, tmp_folder_path, self.logger) + input_gffs, tmp_folder_path, gene_data_file, corrected_dir, self.logger) expected_core_gene_dict = {'Silas_the_Salmonella': {'Silas_the_Salmonella_tag-1-1': "A", 'Silas_the_Salmonella_tag-1-2.1': "B", @@ -555,13 +588,15 @@ def test_parsing_w_90_presence(self): 'TestParsingGenePresenceAbsenceFile/Silas_the_Legionella.gff', 'TestParsingGenePresenceAbsenceFile/Lilly_the_Shigella.gff'] tmp_folder_path = 'TestParsingGenePresenceAbsenceFile/' + gene_data_file = {} + corrected_dir = '' core_gene_dict, low_freq_gene_dict, \ acc_gene_dict = \ parse_gene_presence_absence.read_gene_presence_absence( file_name, core_gene_presence, low_freq_gene, source_program, - input_gffs, 
tmp_folder_path, self.logger) + input_gffs, tmp_folder_path, gene_data_file, corrected_dir, self.logger) expected_core_gene_dict = {'Silas_the_Salmonella': {'Silas_the_Salmonella_tag-1-1': "A", 'Silas_the_Salmonella_tag-1-2.1': "B", @@ -645,13 +680,15 @@ def test_parsing_w_90_presence_roary(self): 'TestParsingGenePresenceAbsenceFile/Silas_the_Legionella.gff', 'TestParsingGenePresenceAbsenceFile/Lilly_the_Shigella.gff'] tmp_folder_path = 'TestParsingGenePresenceAbsenceFile/' + gene_data_file = {} + corrected_dir = '' core_gene_dict, low_freq_gene_dict, \ acc_gene_dict = \ parse_gene_presence_absence.read_gene_presence_absence( file_name, core_gene_presence, low_freq_gene, source_program, - input_gffs, tmp_folder_path, self.logger) + input_gffs, tmp_folder_path, gene_data_file, corrected_dir, self.logger) expected_core_gene_dict = {'Silas_the_Salmonella': {'Silas_the_Salmonella_tag-1-1': "A", 'Silas_the_Salmonella_tag-1-2.1': "B", @@ -719,6 +756,100 @@ def test_parsing_w_90_presence_roary(self): self.assertEqual(expected_low_freq_gene_dict, low_freq_gene_dict) self.assertEqual(expected_acc_gene_dict, acc_gene_dict) + def test_parsign_fragmented_gene_w_refound_component(self): + file_name = 'TestParsingGenePresenceAbsenceFile/gene_presence_absence_w_refound_fragment.csv' + core_gene_presence = 0.9 + low_freq_gene = 0.1 + source_program = 'Panaroo' + input_gffs = ['TestParsingGenePresenceAbsenceFile/Christina_the_Streptococcus.gff', + 'TestParsingGenePresenceAbsenceFile/Ajwa_the_Shigella.gff', + 'TestParsingGenePresenceAbsenceFile/Ajwa_the_Legionella.gff', + 'TestParsingGenePresenceAbsenceFile/Silas_the_Salmonella_w_refound.gff', + 'TestParsingGenePresenceAbsenceFile/Cari_the_Listeria.gff', + 'TestParsingGenePresenceAbsenceFile/Aman_the_Streptococcus.gff', + 'TestParsingGenePresenceAbsenceFile/Zion_the_Streptococcus.gff', + 'TestParsingGenePresenceAbsenceFile/Dina_the_Shigella.gff', + 'TestParsingGenePresenceAbsenceFile/Silas_the_Legionella.gff', + 
'TestParsingGenePresenceAbsenceFile/Lilly_the_Shigella.gff'] + tmp_folder_path = 'TestParsingGenePresenceAbsenceFile/' + gene_data_file = {'Silas_the_Salmonella_w_refound': {'0_refound_0': ['CTCTTCCGATCTAATCAAGATTGAGAGGAATTGCTGTTTTTATTGGCAAGACAATTTTACTCTTCCGATCTAATCAAGATTGAGAGGAATT', 'gene_name', 'gene_function']}} + corrected_dir ='TestParsingGenePresenceAbsenceFile/Corrected_gffs' + + core_gene_dict, low_freq_gene_dict, \ + acc_gene_dict = \ + parse_gene_presence_absence.read_gene_presence_absence( + file_name, core_gene_presence, + low_freq_gene, source_program, + input_gffs, tmp_folder_path, gene_data_file, corrected_dir, self.logger) + + expected_core_gene_dict = {'Silas_the_Salmonella_w_refound': {'Silas_the_Salmonella_tag-1-1': "A", + '0_refound_0': "B", + 'Silas_the_Salmonella_tag-1-2.2': "B", + 'Silas_the_Salmonella_tag-1-3': 'C', + 'Silas_the_Salmonella_tag-1-4.1': 'D', + 'Silas_the_Salmonella_tag-1-4.2': 'D', }, + 'Christina_the_Streptococcus': {'Christina_the_Streptococcus_tag-2-1': "A", + 'Christina_the_Streptococcus_tag-2-2': "B", + 'Christina_the_Streptococcus_tag-2-3': "C", + 'Christina_the_Streptococcus_tag-2-4': "D"}, + 'Ajwa_the_Shigella': {'Ajwa_the_Shigella_tag-3-1': "A", + 'Ajwa_the_Shigella_tag-3-2': "B", + "Ajwa_the_Shigella_tag-3-3": "C", + "Ajwa_the_Shigella_tag-3-4": "D"}, + 'Ajwa_the_Legionella': {'Ajwa_the_Legionella_tag-4-1': "A", + 'Ajwa_the_Legionella_tag-4-2': "B", + 'Ajwa_the_Legionella_tag-4-3': "C", + 'Ajwa_the_Legionella_tag-4-4': "D"}, + 'Cari_the_Listeria': {"Cari_the_Listeria_tag-5-3": "C", + "Cari_the_Listeria_tag-5-4": "D", + 'Cari_the_Listeria_tag-5-1': "A", + 'Cari_the_Listeria_tag-5-2': "B"}, + 'Aman_the_Streptococcus': {'Aman_the_Streptococcus_tag-6-1': "A", + 'Aman_the_Streptococcus_tag-6-2': "B", + "Aman_the_Streptococcus_tag-6-3": "C", + "Aman_the_Streptococcus_tag-6-4": "D"}, + 'Zion_the_Streptococcus': {"Zion_the_Streptococcus_tag-7-3": "C", + "Zion_the_Streptococcus_tag-7-4": "D", + 
'Zion_the_Streptococcus_tag-7-1': "A", + 'Zion_the_Streptococcus_tag-7-2': "B"}, + 'Dina_the_Shigella': {"Dina_the_Shigella_tag-8-3": "C", + "Dina_the_Shigella_tag-8-4": "D", + 'Dina_the_Shigella_tag-8-1': "A", + 'Dina_the_Shigella_tag-8-2': "B"}, + 'Silas_the_Legionella': {"Silas_the_Legionella_tag-9-3": "C", + "Silas_the_Legionella_tag-9-4": "D", + 'Silas_the_Legionella_tag-9-1': "A", + 'Silas_the_Legionella_tag-9-2': "B"}, + 'Lilly_the_Shigella': {'Lilly_the_Shigella_tag-10-1': "A", + 'Lilly_the_Shigella_tag-10-2': "B"}} + + expected_low_freq_gene_dict = {'Silas_the_Salmonella_w_refound': {'Silas_the_Salmonella_tag_2': "G"}, + 'Christina_the_Streptococcus': {}, + 'Ajwa_the_Shigella': {}, + 'Ajwa_the_Legionella': {}, + 'Cari_the_Listeria': {}, + 'Aman_the_Streptococcus': {}, + 'Zion_the_Streptococcus': {}, + 'Dina_the_Shigella': {}, + 'Silas_the_Legionella': {}, + 'Lilly_the_Shigella': {'Lilly_the_Shigella_tag-10-6': "F"}} + + expected_acc_gene_dict = {'Silas_the_Salmonella_w_refound': {'Silas_the_Salmonella_tag-1-5.1': 'E', + 'Silas_the_Salmonella_tag-1-5.2': 'E'}, + 'Christina_the_Streptococcus': {'Christina_the_Streptococcus_tag-2-5': "E"}, + 'Ajwa_the_Shigella': {"Ajwa_the_Shigella_tag-3-5": "E"}, + 'Ajwa_the_Legionella': {'Ajwa_the_Legionella_tag-4-5': "E"}, + 'Cari_the_Listeria': {"Cari_the_Listeria_tag-5-5": "E"}, + 'Aman_the_Streptococcus': {"Aman_the_Streptococcus_tag-6-5": "E"}, + 'Zion_the_Streptococcus': {"Zion_the_Streptococcus_tag-7-5": "E"}, + 'Dina_the_Shigella': {"Dina_the_Shigella_tag-8-5": "E"}, + 'Silas_the_Legionella': {"Silas_the_Legionella_tag-9-5": "E"}, + 'Lilly_the_Shigella': {'Lilly_the_Shigella_tag-10-5': "E"}} + + self.assertEqual(expected_core_gene_dict, core_gene_dict) + self.assertEqual(expected_low_freq_gene_dict, low_freq_gene_dict) + self.assertEqual(expected_acc_gene_dict, acc_gene_dict) + class TestReadGeneData(unittest.TestCase): """ Function to test the passing of gene_data.csv file from Panaroo """ diff --git 
a/unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/Corrected_gffs/place_holder b/unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/Corrected_gffs/place_holder new file mode 100644 index 0000000..e69de29 diff --git a/unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/Silas_the_Salmonella_w_refound.gff b/unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/Silas_the_Salmonella_w_refound.gff new file mode 100644 index 0000000..a2d074f --- /dev/null +++ b/unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/Silas_the_Salmonella_w_refound.gff @@ -0,0 +1,11 @@ +contig_1 . CDS 1 90 . . . ID=Silas_the_Salmonella_tag-1-1;locus_tag=Silas_the_Salmonella_tag-1-1 +contig_1 . CDS 100 190 . . . ID=Silas_the_Salmonella_tag-1-2.2;locus_tag=Silas_the_Salmonella_tag-1-2.2 +contig_1 . CDS 300 390 . . . ID=Silas_the_Salmonella_tag-1-3;locus_tag=Silas_the_Salmonella_tag-1-3 +contig_1 . CDS 400 490 . . . ID=Silas_the_Salmonella_tag-1-4.1;locus_tag=Silas_the_Salmonella_tag-1-4.1 +contig_1 . CDS 500 590 . . . ID=Silas_the_Salmonella_tag-1-4.2;locus_tag=Silas_the_Salmonella_tag-1-4.2 +contig_1 . CDS 600 690 . . . ID=Silas_the_Salmonella_tag-1-5.1;locus_tag=Silas_the_Salmonella_tag-1-5.1 +contig_1 . CDS 700 790 . . . ID=Silas_the_Salmonella_tag_2;locus_tag=Silas_the_Salmonella_tag_2 +contig_1 . CDS 800 890 . . . 
ID=Silas_the_Salmonella_tag-1-5.2;locus_tag=Silas_the_Salmonella_tag-1-5.2 +##FASTA +>contig_1 +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCTCTTCCGATCTAATCAAGATTGAGAGGAATTGCTGTTTTTATTGGCAAGACAATTTTACTCTTCCGATCTAATCAAGATTGAGAGGAATTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN diff --git a/unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/gene_presence_absence_w_refound_fragment.csv b/unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/gene_presence_absence_w_refound_fragment.csv new file mode 100644 index 0000000..8065be5 --- /dev/null +++ b/unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/gene_presence_absence_w_refound_fragment.csv @@ -0,0 +1,8 @@ 
+Gene,Non.unique.Gene.name,Annotation,No..isolates,No..sequences,Avg.sequences.per.isolate,Genome.Fragment,Order.within.Fragment,Accessory.Fragment,Accessory.Order.with.Fragment,QC,Min.group.size.nuc,Max.group.size.nuc,Avg.group.size.nuc,Christina_the_Streptococcus,Ajwa_the_Shigella,Ajwa_the_Legionella,Cari_the_Listeria,Aman_the_Streptococcus,Zion_the_Streptococcus,Dina_the_Shigella,Silas_the_Legionella,Lilly_the_Shigella,Silas_the_Salmonella_w_refound +A,,,10,10,1,,,,,,,,,Christina_the_Streptococcus_tag-2-1,Ajwa_the_Shigella_tag-3-1,Ajwa_the_Legionella_tag-4-1,Cari_the_Listeria_tag-5-1,Aman_the_Streptococcus_tag-6-1,Zion_the_Streptococcus_tag-7-1,Dina_the_Shigella_tag-8-1,Silas_the_Legionella_tag-9-1,Lilly_the_Shigella_tag-10-1,Silas_the_Salmonella_tag-1-1 +B,,,10,11,1.2,,,,,,,,,Christina_the_Streptococcus_tag-2-2,Ajwa_the_Shigella_tag-3-2,Ajwa_the_Legionella_tag-4-2,Cari_the_Listeria_tag-5-2,Aman_the_Streptococcus_tag-6-2,Zion_the_Streptococcus_tag-7-2,Dina_the_Shigella_tag-8-2,Silas_the_Legionella_tag-9-2,Lilly_the_Shigella_tag-10-2,0_refound_0;Silas_the_Salmonella_tag-1-2.2 +C,,,9,9,1,,,,,,,,,Christina_the_Streptococcus_tag-2-3,Ajwa_the_Shigella_tag-3-3,Ajwa_the_Legionella_tag-4-3,Cari_the_Listeria_tag-5-3,Aman_the_Streptococcus_tag-6-3,Zion_the_Streptococcus_tag-7-3,Dina_the_Shigella_tag-8-3,Silas_the_Legionella_tag-9-3,,Silas_the_Salmonella_tag-1-3 +D,,,9,10,1.1,,,,,,,,,Christina_the_Streptococcus_tag-2-4,Ajwa_the_Shigella_tag-3-4,Ajwa_the_Legionella_tag-4-4,Cari_the_Listeria_tag-5-4,Aman_the_Streptococcus_tag-6-4,Zion_the_Streptococcus_tag-7-4,Dina_the_Shigella_tag-8-4,Silas_the_Legionella_tag-9-4,,Silas_the_Salmonella_tag-1-4.1;Silas_the_Salmonella_tag-1-4.2 
+E,,,10,11,1.2,,,,,,,,,Christina_the_Streptococcus_tag-2-5,Ajwa_the_Shigella_tag-3-5,Ajwa_the_Legionella_tag-4-5,Cari_the_Listeria_tag-5-5,Aman_the_Streptococcus_tag-6-5,Zion_the_Streptococcus_tag-7-5,Dina_the_Shigella_tag-8-5,Silas_the_Legionella_tag-9-5,Lilly_the_Shigella_tag-10-5,Silas_the_Salmonella_tag-1-5.1;Silas_the_Salmonella_tag-1-5.2 +F,,,1,1,1,,,,,,,,,,,,,,,,,Lilly_the_Shigella_tag-10-6, +G,,,1,1,1,,,,,,,,,,,,,,,,,,Silas_the_Salmonella_tag_2 \ No newline at end of file From ae7d71f4f31f0e4e853ea7af8d38a4245f87efc6 Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Tue, 25 Jan 2022 11:36:17 +1100 Subject: [PATCH 073/135] Change the help message from commandline --- functional_tests/test_data/no_input.expected | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/functional_tests/test_data/no_input.expected b/functional_tests/test_data/no_input.expected index bbc0960..dbbc061 100644 --- a/functional_tests/test_data/no_input.expected +++ b/functional_tests/test_data/no_input.expected @@ -1,8 +1,7 @@ -usage: Corekaburra -ig file.gff [file.gff ...] -ip - path/to/pan_genome [-cg complete_genomes.txt] - [-a] [-cc 1.0] [-lc 0.05] [-o path/to/output] - [-p OUTPUT_PREFIX] [-d] [-c int] [-l | -q] - [-h] [-v] +usage: commandline_interface.py -ig file.gff [file.gff ...] -ip path/to/pan_genome + [-cg complete_genomes.txt] [-a] [-cc 1.0] [-lc 0.05] + [-o path/to/output] [-p OUTPUT_PREFIX] [-d] [-c int] + [-l | -q] [-h] [-v] Welcome to Corekaburra! 
An extension to pan-genome analyses that summarise genomic regions between core genes and segments of neighbouring core genes From 5ad0d938c1dc0ab83b5151d499219ed54065fe0e Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Tue, 25 Jan 2022 11:38:46 +1100 Subject: [PATCH 074/135] Change the help message from commandline --- functional_tests/test_data/no_input.expected | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/functional_tests/test_data/no_input.expected b/functional_tests/test_data/no_input.expected index dbbc061..8a72a85 100644 --- a/functional_tests/test_data/no_input.expected +++ b/functional_tests/test_data/no_input.expected @@ -1,4 +1,4 @@ -usage: commandline_interface.py -ig file.gff [file.gff ...] -ip path/to/pan_genome +usage: Corekaburra -ig file.gff [file.gff ...] -ip path/to/pan_genome [-cg complete_genomes.txt] [-a] [-cc 1.0] [-lc 0.05] [-o path/to/output] [-p OUTPUT_PREFIX] [-d] [-c int] [-l | -q] [-h] [-v] From 76054dceb1bc4085637a32b00d30c90c3890cc17 Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Tue, 25 Jan 2022 11:43:16 +1100 Subject: [PATCH 075/135] Change the help message from commandline --- functional_tests/test_data/no_input.expected | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/functional_tests/test_data/no_input.expected b/functional_tests/test_data/no_input.expected index 8a72a85..7bf37b1 100644 --- a/functional_tests/test_data/no_input.expected +++ b/functional_tests/test_data/no_input.expected @@ -1,7 +1,7 @@ usage: Corekaburra -ig file.gff [file.gff ...] -ip path/to/pan_genome - [-cg complete_genomes.txt] [-a] [-cc 1.0] [-lc 0.05] - [-o path/to/output] [-p OUTPUT_PREFIX] [-d] [-c int] - [-l | -q] [-h] [-v] + [-cg complete_genomes.txt] [-a] [-cc 1.0] [-lc 0.05] + [-o path/to/output] [-p OUTPUT_PREFIX] [-d] [-c int] + [-l | -q] [-h] [-v] Welcome to Corekaburra! 
An extension to pan-genome analyses that summarise genomic regions between core genes and segments of neighbouring core genes From 3b5936d2fda4cecf9f3fda9b5a3e9c6ea3217483 Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Tue, 25 Jan 2022 11:45:44 +1100 Subject: [PATCH 076/135] Change the help message from commandline --- functional_tests/test_data/no_input.expected | 6 +++--- unit_tests/Corekaburra_test.py | 1 - 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/functional_tests/test_data/no_input.expected b/functional_tests/test_data/no_input.expected index 7bf37b1..b0ccabb 100644 --- a/functional_tests/test_data/no_input.expected +++ b/functional_tests/test_data/no_input.expected @@ -1,7 +1,7 @@ usage: Corekaburra -ig file.gff [file.gff ...] -ip path/to/pan_genome - [-cg complete_genomes.txt] [-a] [-cc 1.0] [-lc 0.05] - [-o path/to/output] [-p OUTPUT_PREFIX] [-d] [-c int] - [-l | -q] [-h] [-v] + [-cg complete_genomes.txt] [-a] [-cc 1.0] [-lc 0.05] + [-o path/to/output] [-p OUTPUT_PREFIX] [-d] [-c int] + [-l | -q] [-h] [-v] Welcome to Corekaburra! 
An extension to pan-genome analyses that summarise genomic regions between core genes and segments of neighbouring core genes diff --git a/unit_tests/Corekaburra_test.py b/unit_tests/Corekaburra_test.py index 15860ca..b112a64 100644 --- a/unit_tests/Corekaburra_test.py +++ b/unit_tests/Corekaburra_test.py @@ -382,7 +382,6 @@ def tearDown(self): try: for file in os.listdir('TestParsingGenePresenceAbsenceFile/Corrected_gffs/'): if '.gff' in file: - print(file) os.remove(os.path.join('TestParsingGenePresenceAbsenceFile/Corrected_gffs/', file)) except FileNotFoundError: pass From 860f95f6519654149d90c27413646c81702c1c30 Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Tue, 25 Jan 2022 11:47:50 +1100 Subject: [PATCH 077/135] Change the help message from commandline --- functional_tests/test_data/no_input.expected | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/functional_tests/test_data/no_input.expected b/functional_tests/test_data/no_input.expected index b0ccabb..f14ac07 100644 --- a/functional_tests/test_data/no_input.expected +++ b/functional_tests/test_data/no_input.expected @@ -1,7 +1,7 @@ usage: Corekaburra -ig file.gff [file.gff ...] -ip path/to/pan_genome - [-cg complete_genomes.txt] [-a] [-cc 1.0] [-lc 0.05] - [-o path/to/output] [-p OUTPUT_PREFIX] [-d] [-c int] - [-l | -q] [-h] [-v] + [-cg complete_genomes.txt] [-a] [-cc 1.0] [-lc 0.05] + [-o path/to/output] [-p OUTPUT_PREFIX] [-d] [-c int] + [-l | -q] [-h] [-v] Welcome to Corekaburra! 
An extension to pan-genome analyses that summarise genomic regions between core genes and segments of neighbouring core genes From 5d5816e4c951a7ab6821314c9c4867bb9a5e9402 Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Tue, 25 Jan 2022 16:26:51 +1100 Subject: [PATCH 078/135] Add handling of core gene graphs that form multiple components can be handled, also if not complete (linear) --- Corekaburra/consesus_core_genome.py | 53 ++++++--- functional_tests/Corekaburra-test.sh | 21 ++++ ...e_core_accessory_gene_content.tsv.expected | 1 + .../core_pair_summary.csv.expected | 20 ++++ .../core_segments.csv.expected | 14 +++ .../low_frequency_gene_placement.tsv.expected | 46 ++++++++ .../no_accessory_core_segments.csv.expected | 14 +++ .../gene_presence_absence.csv | 14 +++ ...e_core_accessory_gene_content.tsv.expected | 1 + .../core_pair_summary.csv.expected | 18 +++ .../core_segments.csv.expected | 14 +++ .../low_frequency_gene_placement.tsv.expected | 40 +++++++ .../no_accessory_core_segments.csv.expected | 14 +++ .../complete_genome_double_chrom_2_larger.gff | 19 +++ .../complete_genome_double_chrom_3_larger.gff | 19 +++ .../complete_genome_double_chrom_larger.gff | 19 +++ ...complete_larger_double_chr_genome_list.txt | 3 + unit_tests/Corekaburra_test.py | 108 ++++++++++++++++-- 18 files changed, 415 insertions(+), 23 deletions(-) create mode 100644 functional_tests/test_data/Multi_component_graph_expected/core_core_accessory_gene_content.tsv.expected create mode 100644 functional_tests/test_data/Multi_component_graph_expected/core_pair_summary.csv.expected create mode 100644 functional_tests/test_data/Multi_component_graph_expected/core_segments.csv.expected create mode 100644 functional_tests/test_data/Multi_component_graph_expected/low_frequency_gene_placement.tsv.expected create mode 100644 functional_tests/test_data/Multi_component_graph_expected/no_accessory_core_segments.csv.expected create mode 100644 
functional_tests/test_data/Multiple_component_graph/gene_presence_absence.csv create mode 100644 functional_tests/test_data/Multiple_component_graph_complete_expected/core_core_accessory_gene_content.tsv.expected create mode 100644 functional_tests/test_data/Multiple_component_graph_complete_expected/core_pair_summary.csv.expected create mode 100644 functional_tests/test_data/Multiple_component_graph_complete_expected/core_segments.csv.expected create mode 100644 functional_tests/test_data/Multiple_component_graph_complete_expected/low_frequency_gene_placement.tsv.expected create mode 100644 functional_tests/test_data/Multiple_component_graph_complete_expected/no_accessory_core_segments.csv.expected create mode 100644 functional_tests/test_data/complete_genome_double_chrom_2_larger.gff create mode 100644 functional_tests/test_data/complete_genome_double_chrom_3_larger.gff create mode 100644 functional_tests/test_data/complete_genome_double_chrom_larger.gff create mode 100644 functional_tests/test_data/complete_larger_double_chr_genome_list.txt diff --git a/Corekaburra/consesus_core_genome.py b/Corekaburra/consesus_core_genome.py index 1326abe..1ec8fbb 100644 --- a/Corekaburra/consesus_core_genome.py +++ b/Corekaburra/consesus_core_genome.py @@ -115,27 +115,40 @@ def identify_no_accessory_segments(double_edge_segements, combined_acc_gene_coun return sub_segment_dict -def identify_segments(core_graph, num_gffs, core_gene_dict): +def identify_segments(core_graph, num_gffs, core_gene_dict, num_core_graph_components): """ Function to identify stretches of core genes between core genes neighbouring multiple different genes :param core_graph: Graph over core genes with weights being the number of connections between the genes :param num_gffs: Number of gffs inputted + :param core_gene_dict: Dict with keys being genomes, each genome is a dict with keys being genes and values the mapped pan-genome gene cluster. 
+ :return: Dict over stretches of core genes found in the core gene graph. """ + # TODO - Describe missing parameters in docstring + + # TODO - Fix Ouli's problem where the core gene graph may split into two seperat pieces, and also handle double chromosome. + # - Add a chek if the core gene graph is a single component of multiple. Handle components separately. - Write test then program + # - This likely require a change to the all-vs-all search of multi edge core gene search, by adding a try and expect statement maybe, or just handle each component separately. # Identify all nodes that contain more than two degrees. multi_edge_nodes = [node for node, connections in core_graph.degree if connections > 2] + # Check if multiple components in core graph, if then find single edge core_genes + if num_core_graph_components > 1: + singe_edge_nodes = [node for node, connections in core_graph.degree if connections == 1] + else: + singe_edge_nodes = [] # Check if any node have multiple edges, if not then return. - if len(multi_edge_nodes) == 0: + if len(multi_edge_nodes+singe_edge_nodes) == 0: return None # Dict to hold connections between >2 edge nodes connect_dict = {} # for all nodes with >2 degrees themself, identify neighbouring nodes with >2 degrees - for node in multi_edge_nodes: - connect_dict[node] = [neighbor for neighbor in core_graph.neighbors(node) if neighbor in multi_edge_nodes] + for node in multi_edge_nodes+singe_edge_nodes: + connect_dict[node] = [neighbor for neighbor in core_graph.neighbors(node) + if neighbor in multi_edge_nodes or neighbor in singe_edge_nodes] # Turn the weight into a 'distance' or number of times not found together. 
for edge in core_graph.edges(data=True): @@ -147,8 +160,8 @@ def identify_segments(core_graph, num_gffs, core_gene_dict): # Go through all source and taget nodes, # see if a path can be found where all nodes between them have only two degrees - for source_node in multi_edge_nodes: - for target_node in multi_edge_nodes: + for source_node in multi_edge_nodes+singe_edge_nodes: + for target_node in multi_edge_nodes+singe_edge_nodes: if target_node != source_node: # Get path (segment) from source to target segment = nx.shortest_path(core_graph, source_node, target_node, weight='weight', method='dijkstra') # bellman-ford or dijkstra @@ -157,7 +170,7 @@ def identify_segments(core_graph, num_gffs, core_gene_dict): segment_length = len(segment) # Get length of segment with multi nodes removed - two_degree_segment_length = len([node for node in segment if node not in multi_edge_nodes]) + two_degree_segment_length = len([node for node in segment if node not in multi_edge_nodes+singe_edge_nodes]) # Check if no node between the source and target has more than two edges, # if then move to record the segment/path @@ -183,9 +196,9 @@ def identify_segments(core_graph, num_gffs, core_gene_dict): f"Path from one node to another ({source_target_name}) was found, but did not match previously found path!") # Calculate the expected number of paths - total_edges_from_multi_edge_nodes = sum([connections for _, connections in core_graph.degree if connections > 2]) - num_edges_between_multi_edge_nodes = sum([len(connect_dict[key]) for key in connect_dict]) - expected_segment_number = int((total_edges_from_multi_edge_nodes / 2) - (num_edges_between_multi_edge_nodes / 2)) + len(multi_edge_connect_adjust) + total_edges_from_non_two_edge_core_genes = sum([connections for _, connections in core_graph.degree if connections > 2 or connections < 2]) + num_edges_between_non_two_edge_core_genes = sum([len(connect_dict[key]) for key in connect_dict]) + expected_segment_number = 
int((total_edges_from_non_two_edge_core_genes / 2) - (num_edges_between_non_two_edge_core_genes / 2)) + len(multi_edge_connect_adjust) # Check if less than the number of expected paths has been found, # if then try to identify missing paths @@ -285,6 +298,7 @@ def determine_genome_segments(core_neighbour_pairs, combined_acc_gene_count, num :param combined_acc_gene_count: Number of accessory and low-frequency genes detected between core gene pairs :param num_gffs: Number of inputted gff files # TODO - Should this be the minimum number determined by the input cut-off for core genes :param logger: Program logger + # TODO - Add parameters :return double_edge_segements: :return no_acc_segments: @@ -294,11 +308,22 @@ def determine_genome_segments(core_neighbour_pairs, combined_acc_gene_count, num # Construct a graph from core gene neighbours core_graph = construct_core_graph(core_neighbour_pairs) + num_core_graph_components = nx.number_connected_components(core_graph) + + logger.debug(f'Identified: {num_core_graph_components} components in core genome graph') + + double_edge_segements = {} + # Identify all segments in components of core graph + for component in nx.connected_components(core_graph): + logger.debug(f'Searching component related to: {component}') - # Find segments in the genome between core genes with multiple neighbors - double_edge_segements = identify_segments(core_graph, num_gffs, core_gene_dict) + component_graph = core_graph.subgraph(component).copy() + return_segments = identify_segments(component_graph, num_gffs, core_gene_dict, num_core_graph_components) + if return_segments is not None: + double_edge_segements = double_edge_segements | return_segments - if double_edge_segements is not None: + # if double_edge_segements is not None: + if double_edge_segements: logger.debug(f'A total of {len(double_edge_segements)} core genes were identified to have multiple neighbours.') logger.debug(f'Genes with multiple neighbours: {double_edge_segements}') @@ 
-316,4 +341,4 @@ def determine_genome_segments(core_neighbour_pairs, combined_acc_gene_count, num if __name__ == '__main__': - pass \ No newline at end of file + pass diff --git a/functional_tests/Corekaburra-test.sh b/functional_tests/Corekaburra-test.sh index 5b464ce..3ed12f8 100755 --- a/functional_tests/Corekaburra-test.sh +++ b/functional_tests/Corekaburra-test.sh @@ -249,6 +249,27 @@ rm -r test_out_folder # TODO - Test that segmnets can be identified with a core-cutoff that is less than all genomes. +# TODO - Test that segmnets can be identified on two 'chromosomes'/contigs that are linear and not circular. +call_new_test "Test when core graph forms multiple components - not forming a single 'chromosome' - non circular input gffs" +Corekaburra -ip Multiple_component_graph/ -ig complete_genome_double_chrom_larger.gff complete_genome_double_chrom_2_larger.gff complete_genome_double_chrom_3_larger.gff -o test_out_folder/ > /dev/null 2>&1 +test_output_file test_out_folder/core_core_accessory_gene_content.tsv Multi_component_graph_expected/core_core_accessory_gene_content.tsv.expected +test_output_file test_out_folder/low_frequency_gene_placement.tsv Multi_component_graph_expected/low_frequency_gene_placement.tsv.expected +test_output_file test_out_folder/core_pair_summary.csv Multi_component_graph_expected/core_pair_summary.csv.expected +test_output_file test_out_folder/core_segments.csv Multi_component_graph_expected/core_segments.csv.expected +test_output_file test_out_folder/no_accessory_core_segments.csv Multi_component_graph_expected/no_accessory_core_segments.csv.expected +rm -r test_out_folder + + +# TODO Test the above but with complete genomes +call_new_test "Test when core graph forms multiple components - not forming a single 'chromosome' - circular input gffs" +Corekaburra -ip Multiple_component_graph/ -ig complete_genome_double_chrom_larger.gff complete_genome_double_chrom_2_larger.gff complete_genome_double_chrom_3_larger.gff -o test_out_folder/ -cg 
complete_larger_double_chr_genome_list.txt > /dev/null 2>&1 +test_output_file test_out_folder/core_core_accessory_gene_content.tsv Multiple_component_graph_complete_expected/core_core_accessory_gene_content.tsv.expected +test_output_file test_out_folder/low_frequency_gene_placement.tsv Multiple_component_graph_complete_expected/low_frequency_gene_placement.tsv.expected +test_output_file test_out_folder/core_pair_summary.csv Multiple_component_graph_complete_expected/core_pair_summary.csv.expected +test_output_file test_out_folder/core_segments.csv Multiple_component_graph_complete_expected/core_segments.csv.expected +test_output_file test_out_folder/no_accessory_core_segments.csv Multiple_component_graph_complete_expected/no_accessory_core_segments.csv.expected +rm -r test_out_folder + call_new_test "Test with decreased core-gene cutoff" Corekaburra -ig complete_genome_single_chrom.gff complete_genome_single_chrom_2.gff genome_single_chrom_larger.gff -ip Change_cutoffs -o test_out_folder -cc 0.9 > /dev/null 2>&1 test_output_file test_out_folder/core_core_accessory_gene_content.tsv core_90_cutoff_expected/core_core_accessory_gene_content.tsv.expected diff --git a/functional_tests/test_data/Multi_component_graph_expected/core_core_accessory_gene_content.tsv.expected b/functional_tests/test_data/Multi_component_graph_expected/core_core_accessory_gene_content.tsv.expected new file mode 100644 index 0000000..fee984c --- /dev/null +++ b/functional_tests/test_data/Multi_component_graph_expected/core_core_accessory_gene_content.tsv.expected @@ -0,0 +1 @@ +Gff Core_gene_1 Core_gene_2 gene type diff --git a/functional_tests/test_data/Multi_component_graph_expected/core_pair_summary.csv.expected b/functional_tests/test_data/Multi_component_graph_expected/core_pair_summary.csv.expected new file mode 100644 index 0000000..71a4959 --- /dev/null +++ b/functional_tests/test_data/Multi_component_graph_expected/core_pair_summary.csv.expected @@ -0,0 +1,20 @@ 
+Core_pair,n,occurrence_core_1,occurrence_core_2,co_occurrence,min_dist,max_dist,mean_dist,median_dist,min_acc,max_acc,mean_acc,median_acc +A-B,2,3,3,3,9,9,9.0,9.0,0,0,0.0,0.0 +A-C,1,3,3,3,9,9,9.0,9.0,0,0,0.0,0.0 +A-I,3,3,3,3,0,0,0.0,0.0,0,0,0.0,0.0 +B-C,3,3,3,3,9,9,9.0,9.0,0,0,0.0,0.0 +B-D,1,3,3,3,3,3,3.0,3.0,0,0,0.0,0.0 +C-D,2,3,3,3,3,3,3.0,3.0,0,0,0.0,0.0 +D-J,3,3,3,3,-3,1,-0.7,0.0,0,0,0.0,0.0 +E-F,2,3,3,3,9,9,9.0,9.0,0,0,0.0,0.0 +E-G,1,3,3,3,9,9,9.0,9.0,0,0,0.0,0.0 +E-K,3,3,3,3,0,0,0.0,0.0,0,0,0.0,0.0 +F-G,3,3,3,3,9,9,9.0,9.0,0,0,0.0,0.0 +F-H,1,3,3,3,0,0,0.0,0.0,0,0,0.0,0.0 +G-H,2,3,3,3,0,0,0.0,0.0,0,0,0.0,0.0 +H-L,3,3,3,3,0,0,0.0,0.0,0,0,0.0,0.0 +I-Sequence_break,3,3,0,0,0,0,0.0,0.0,0,0,0.0,0.0 +J-Sequence_break,3,3,0,0,1,3,1.7,1.0,0,0,0.0,0.0 +K-Sequence_break,3,3,0,0,0,0,0.0,0.0,0,0,0.0,0.0 +L-M,3,3,3,3,-700,0,-233.3,0.0,0,0,0.0,0.0 +M-Sequence_break,3,3,0,0,2,698,234.0,2.0,0,0,0.0,0.0 diff --git a/functional_tests/test_data/Multi_component_graph_expected/core_segments.csv.expected b/functional_tests/test_data/Multi_component_graph_expected/core_segments.csv.expected new file mode 100644 index 0000000..f82eee9 --- /dev/null +++ b/functional_tests/test_data/Multi_component_graph_expected/core_segments.csv.expected @@ -0,0 +1,14 @@ +Segment_name,Segment_position,Core_gene +A-I,1,A +A-I,2,I +B-C,1,C +B-C,2,B +D-J,1,D +D-J,2,J +E-K,1,E +E-K,2,K +F-G,1,F +F-G,2,G +H-M,1,H +H-M,2,L +H-M,3,M diff --git a/functional_tests/test_data/Multi_component_graph_expected/low_frequency_gene_placement.tsv.expected b/functional_tests/test_data/Multi_component_graph_expected/low_frequency_gene_placement.tsv.expected new file mode 100644 index 0000000..5b32628 --- /dev/null +++ b/functional_tests/test_data/Multi_component_graph_expected/low_frequency_gene_placement.tsv.expected @@ -0,0 +1,46 @@ +Gff Core_gene_1 Core_gene_2 Core_region_size Core_region_accessory_count +complete_genome_double_chrom_3_larger A B 9 0 +complete_genome_double_chrom_larger A B 9 0 
+complete_genome_double_chrom_2_larger A C 9 0 +complete_genome_double_chrom_2_larger A I 0 0 +complete_genome_double_chrom_3_larger A I 0 0 +complete_genome_double_chrom_larger A I 0 0 +complete_genome_double_chrom_2_larger B C 9 0 +complete_genome_double_chrom_3_larger B C 9 0 +complete_genome_double_chrom_larger B C 9 0 +complete_genome_double_chrom_2_larger B D 3 0 +complete_genome_double_chrom_3_larger C D 3 0 +complete_genome_double_chrom_larger C D 3 0 +complete_genome_double_chrom_2_larger D J 1 0 +complete_genome_double_chrom_3_larger D J 0 0 +complete_genome_double_chrom_larger D J -3 0 +complete_genome_double_chrom_3_larger E F 9 0 +complete_genome_double_chrom_larger E F 9 0 +complete_genome_double_chrom_2_larger E G 9 0 +complete_genome_double_chrom_2_larger E K 0 0 +complete_genome_double_chrom_3_larger E K 0 0 +complete_genome_double_chrom_larger E K 0 0 +complete_genome_double_chrom_2_larger F G 9 0 +complete_genome_double_chrom_3_larger F G 9 0 +complete_genome_double_chrom_larger F G 9 0 +complete_genome_double_chrom_2_larger F H 0 0 +complete_genome_double_chrom_3_larger G H 0 0 +complete_genome_double_chrom_larger G H 0 0 +complete_genome_double_chrom_2_larger H L 0 0 +complete_genome_double_chrom_3_larger H L 0 0 +complete_genome_double_chrom_larger H L 0 0 +complete_genome_double_chrom_2_larger I Sequence_break 0 0 +complete_genome_double_chrom_3_larger I Sequence_break 0 0 +complete_genome_double_chrom_larger I Sequence_break 0 0 +complete_genome_double_chrom_2_larger J Sequence_break 1 0 +complete_genome_double_chrom_3_larger J Sequence_break 3 0 +complete_genome_double_chrom_larger J Sequence_break 1 0 +complete_genome_double_chrom_2_larger K Sequence_break 0 0 +complete_genome_double_chrom_3_larger K Sequence_break 0 0 +complete_genome_double_chrom_larger K Sequence_break 0 0 +complete_genome_double_chrom_2_larger L M 0 0 +complete_genome_double_chrom_3_larger L M -700 0 +complete_genome_double_chrom_larger L M 0 0 
+complete_genome_double_chrom_2_larger M Sequence_break 2 0 +complete_genome_double_chrom_3_larger M Sequence_break 698 0 +complete_genome_double_chrom_larger M Sequence_break 2 0 diff --git a/functional_tests/test_data/Multi_component_graph_expected/no_accessory_core_segments.csv.expected b/functional_tests/test_data/Multi_component_graph_expected/no_accessory_core_segments.csv.expected new file mode 100644 index 0000000..5652fbe --- /dev/null +++ b/functional_tests/test_data/Multi_component_graph_expected/no_accessory_core_segments.csv.expected @@ -0,0 +1,14 @@ +Parent_segment_name,Sub_segment_name,Parent_segment_position,Sub_segment_position,Core_gene +A-I,A-I,1,1,A +A-I,A-I,1,2,I +B-C,C-B,1,1,C +B-C,C-B,1,2,B +D-J,D-J,1,1,D +D-J,D-J,1,2,J +E-K,E-K,1,1,E +E-K,E-K,1,2,K +F-G,F-G,1,1,F +F-G,F-G,1,2,G +H-M,H-M,1,1,H +H-M,H-M,1,2,L +H-M,H-M,1,3,M diff --git a/functional_tests/test_data/Multiple_component_graph/gene_presence_absence.csv b/functional_tests/test_data/Multiple_component_graph/gene_presence_absence.csv new file mode 100644 index 0000000..4c9ca7f --- /dev/null +++ b/functional_tests/test_data/Multiple_component_graph/gene_presence_absence.csv @@ -0,0 +1,14 @@ +","","","","","","","","","","","","","","complete_genome_double_chrom_larger","complete_genome_double_chrom_2_larger","complete_genome_double_chrom_3_larger" +"A","","","3","3","1","","","","","","","","","dub_chrom_A","dub_chrom_2_A","dub_chrom_A" +"B","","","3","3","1","","","","","","","","","dub_chrom_B","dub_chrom_2_B","dub_chrom_B" +"C","","","3","3","1","","","","","","","","","dub_chrom_C","dub_chrom_2_C","dub_chrom_C" +"D","","","3","3","1","","","","","","","","","dub_chrom_D","dub_chrom_2_D","dub_chrom_D" +"E","","","3","3","1","","","","","","","","","dub_chrom_E","dub_chrom_2_E","dub_chrom_E" +"F","","","3","3","1","","","","","","","","","dub_chrom_F","dub_chrom_2_F","dub_chrom_F" +"G","","","3","3","1","","","","","","","","","dub_chrom_G","dub_chrom_2_G","dub_chrom_G" 
+"H","","","3","3","1","","","","","","","","","dub_chrom_H","dub_chrom_2_H","dub_chrom_H" +"I","","","3","3","1","","","","","","","","","dub_chrom_I","dub_chrom_2_I","dub_chrom_I" +"J","","","3","3","1","","","","","","","","","dub_chrom_J","dub_chrom_2_J","dub_chrom_J" +"K","","","3","3","1","","","","","","","","","dub_chrom_K","dub_chrom_2_K","dub_chrom_K" +"L","","","3","3","1","","","","","","","","","dub_chrom_L","dub_chrom_2_L","dub_chrom_L" +"M","","","3","3","1","","","","","","","","","dub_chrom_M","dub_chrom_2_M","dub_chrom_M" \ No newline at end of file diff --git a/functional_tests/test_data/Multiple_component_graph_complete_expected/core_core_accessory_gene_content.tsv.expected b/functional_tests/test_data/Multiple_component_graph_complete_expected/core_core_accessory_gene_content.tsv.expected new file mode 100644 index 0000000..fee984c --- /dev/null +++ b/functional_tests/test_data/Multiple_component_graph_complete_expected/core_core_accessory_gene_content.tsv.expected @@ -0,0 +1 @@ +Gff Core_gene_1 Core_gene_2 gene type diff --git a/functional_tests/test_data/Multiple_component_graph_complete_expected/core_pair_summary.csv.expected b/functional_tests/test_data/Multiple_component_graph_complete_expected/core_pair_summary.csv.expected new file mode 100644 index 0000000..66b9420 --- /dev/null +++ b/functional_tests/test_data/Multiple_component_graph_complete_expected/core_pair_summary.csv.expected @@ -0,0 +1,18 @@ +Core_pair,n,occurrence_core_1,occurrence_core_2,co_occurrence,min_dist,max_dist,mean_dist,median_dist,min_acc,max_acc,mean_acc,median_acc +A-B,2,3,3,3,9,9,9.0,9.0,0,0,0.0,0.0 +A-C,1,3,3,3,9,9,9.0,9.0,0,0,0.0,0.0 +A-I,3,3,3,3,0,0,0.0,0.0,0,0,0.0,0.0 +B-C,3,3,3,3,9,9,9.0,9.0,0,0,0.0,0.0 +B-D,1,3,3,3,3,3,3.0,3.0,0,0,0.0,0.0 +C-D,2,3,3,3,3,3,3.0,3.0,0,0,0.0,0.0 +D-J,3,3,3,3,-3,1,-0.7,0.0,0,0,0.0,0.0 +E-F,2,3,3,3,9,9,9.0,9.0,0,0,0.0,0.0 +E-G,1,3,3,3,9,9,9.0,9.0,0,0,0.0,0.0 +E-K,3,3,3,3,0,0,0.0,0.0,0,0,0.0,0.0 
+F-G,3,3,3,3,9,9,9.0,9.0,0,0,0.0,0.0 +F-H,1,3,3,3,0,0,0.0,0.0,0,0,0.0,0.0 +G-H,2,3,3,3,0,0,0.0,0.0,0,0,0.0,0.0 +H-L,3,3,3,3,0,0,0.0,0.0,0,0,0.0,0.0 +I-J,3,3,3,3,1,3,1.7,1.0,0,0,0.0,0.0 +K-M,3,3,3,3,-698,2,-231.3,2.0,0,0,0.0,0.0 +L-M,3,3,3,3,-700,0,-233.3,0.0,0,0,0.0,0.0 diff --git a/functional_tests/test_data/Multiple_component_graph_complete_expected/core_segments.csv.expected b/functional_tests/test_data/Multiple_component_graph_complete_expected/core_segments.csv.expected new file mode 100644 index 0000000..6450104 --- /dev/null +++ b/functional_tests/test_data/Multiple_component_graph_complete_expected/core_segments.csv.expected @@ -0,0 +1,14 @@ +Segment_name,Segment_position,Core_gene +A-D,1,A +A-D,2,I +A-D,3,J +A-D,4,D +B-C,1,B +B-C,2,C +E-H,1,E +E-H,2,K +E-H,3,M +E-H,4,L +E-H,5,H +F-G,1,F +F-G,2,G diff --git a/functional_tests/test_data/Multiple_component_graph_complete_expected/low_frequency_gene_placement.tsv.expected b/functional_tests/test_data/Multiple_component_graph_complete_expected/low_frequency_gene_placement.tsv.expected new file mode 100644 index 0000000..5d41d78 --- /dev/null +++ b/functional_tests/test_data/Multiple_component_graph_complete_expected/low_frequency_gene_placement.tsv.expected @@ -0,0 +1,40 @@ +Gff Core_gene_1 Core_gene_2 Core_region_size Core_region_accessory_count +complete_genome_double_chrom_3_larger A B 9 0 +complete_genome_double_chrom_larger A B 9 0 +complete_genome_double_chrom_2_larger A C 9 0 +complete_genome_double_chrom_2_larger A I 0 0 +complete_genome_double_chrom_3_larger A I 0 0 +complete_genome_double_chrom_larger A I 0 0 +complete_genome_double_chrom_2_larger B C 9 0 +complete_genome_double_chrom_3_larger B C 9 0 +complete_genome_double_chrom_larger B C 9 0 +complete_genome_double_chrom_2_larger B D 3 0 +complete_genome_double_chrom_3_larger C D 3 0 +complete_genome_double_chrom_larger C D 3 0 +complete_genome_double_chrom_2_larger D J 1 0 +complete_genome_double_chrom_3_larger D J 0 0 
+complete_genome_double_chrom_larger D J -3 0 +complete_genome_double_chrom_3_larger E F 9 0 +complete_genome_double_chrom_larger E F 9 0 +complete_genome_double_chrom_2_larger E G 9 0 +complete_genome_double_chrom_2_larger E K 0 0 +complete_genome_double_chrom_3_larger E K 0 0 +complete_genome_double_chrom_larger E K 0 0 +complete_genome_double_chrom_2_larger F G 9 0 +complete_genome_double_chrom_3_larger F G 9 0 +complete_genome_double_chrom_larger F G 9 0 +complete_genome_double_chrom_2_larger F H 0 0 +complete_genome_double_chrom_3_larger G H 0 0 +complete_genome_double_chrom_larger G H 0 0 +complete_genome_double_chrom_2_larger H L 0 0 +complete_genome_double_chrom_3_larger H L 0 0 +complete_genome_double_chrom_larger H L 0 0 +complete_genome_double_chrom_2_larger I J 1 0 +complete_genome_double_chrom_3_larger I J 3 0 +complete_genome_double_chrom_larger I J 1 0 +complete_genome_double_chrom_2_larger K M 2 0 +complete_genome_double_chrom_3_larger K M -698 0 +complete_genome_double_chrom_larger K M 2 0 +complete_genome_double_chrom_2_larger L M 0 0 +complete_genome_double_chrom_3_larger L M -700 0 +complete_genome_double_chrom_larger L M 0 0 diff --git a/functional_tests/test_data/Multiple_component_graph_complete_expected/no_accessory_core_segments.csv.expected b/functional_tests/test_data/Multiple_component_graph_complete_expected/no_accessory_core_segments.csv.expected new file mode 100644 index 0000000..2ba270c --- /dev/null +++ b/functional_tests/test_data/Multiple_component_graph_complete_expected/no_accessory_core_segments.csv.expected @@ -0,0 +1,14 @@ +Parent_segment_name,Sub_segment_name,Parent_segment_position,Sub_segment_position,Core_gene +A-D,A-D,1,1,A +A-D,A-D,1,2,I +A-D,A-D,1,3,J +A-D,A-D,1,4,D +B-C,B-C,1,1,B +B-C,B-C,1,2,C +E-H,E-H,1,1,E +E-H,E-H,1,2,K +E-H,E-H,1,3,M +E-H,E-H,1,4,L +E-H,E-H,1,5,H +F-G,F-G,1,1,F +F-G,F-G,1,2,G diff --git a/functional_tests/test_data/complete_genome_double_chrom_2_larger.gff 
b/functional_tests/test_data/complete_genome_double_chrom_2_larger.gff new file mode 100644 index 0000000..e8d9bcf --- /dev/null +++ b/functional_tests/test_data/complete_genome_double_chrom_2_larger.gff @@ -0,0 +1,19 @@ +##gff-version3 +contig_1 . CDS 1 2 . . . ID=dub_chrom_2_I;Other_info +contig_1 . CDS 3 90 . . . ID=dub_chrom_2_A;Other_info +contig_1 . CDS 100 190 . . . ID=dub_chrom_2_C;Other_info +contig_1 . CDS 200 290 . . . ID=dub_chrom_2_B;Other_info +contig_1 . CDS 294 295 . . . ID=dub_chrom_2_D;Other_info +contig_1 . CDS 297 299 . . . ID=dub_chrom_2_J;Other_info +contig_2 . CDS 1 2 . . . ID=dub_chrom_2_K;Other_info +contig_2 . CDS 3 90 . . . ID=dub_chrom_2_E;Other_info +contig_2 . CDS 100 190 . . . ID=dub_chrom_2_G;Other_info +contig_2 . CDS 200 290 . . . ID=dub_chrom_2_F;Other_info +contig_2 . CDS 291 294 . . . ID=dub_chrom_2_H;Other_info +contig_2 . CDS 295 296 . . . ID=dub_chrom_2_L;Other_info +contig_2 . CDS 297 298 . . . ID=dub_chrom_2_M;Other_info +##FASTA +>contig_1 +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +>contig_2 +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN \ No newline at end of file diff --git a/functional_tests/test_data/complete_genome_double_chrom_3_larger.gff b/functional_tests/test_data/complete_genome_double_chrom_3_larger.gff new file mode 100644 index 0000000..9a0423f --- /dev/null +++ b/functional_tests/test_data/complete_genome_double_chrom_3_larger.gff @@ -0,0 +1,19 @@ +##gff-version3 +contig_1 . CDS 1 2 . . . 
ID=dub_chrom_I;Other_info +contig_1 . CDS 3 90 . . . ID=dub_chrom_A;Other_info +contig_1 . CDS 100 190 . . . ID=dub_chrom_B;Other_info +contig_1 . CDS 200 290 . . . ID=dub_chrom_C;Other_info +contig_1 . CDS 294 295 . . . ID=dub_chrom_D;Other_info +contig_1 . CDS 296 297 . . . ID=dub_chrom_J;Other_info +contig_2 . CDS 1 2 . . . ID=dub_chrom_K;Other_info +contig_2 . CDS 3 90 . . . ID=dub_chrom_E;Other_info +contig_2 . CDS 100 190 . . . ID=dub_chrom_F;Other_info +contig_2 . CDS 200 290 . . . ID=dub_chrom_G;Other_info +contig_2 . CDS 291 294 . . . ID=dub_chrom_H;Other_info +contig_2 . CDS 295 996 . . . ID=dub_chrom_L;Other_info +contig_2 . CDS 297 998 . . . ID=dub_chrom_M;Other_info +##FASTA +>contig_1 +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +>contig_2 +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN \ No newline at end of file diff --git a/functional_tests/test_data/complete_genome_double_chrom_larger.gff b/functional_tests/test_data/complete_genome_double_chrom_larger.gff new file mode 100644 index 0000000..f26c08d --- /dev/null +++ b/functional_tests/test_data/complete_genome_double_chrom_larger.gff @@ -0,0 +1,19 @@ +##gff-version3 +contig_1 . CDS 1 2 . . . ID=dub_chrom_I;Other_info +contig_1 . CDS 3 90 . . . ID=dub_chrom_A;Other_info +contig_1 . CDS 100 190 . . . ID=dub_chrom_B;Other_info +contig_1 . CDS 200 290 . . . ID=dub_chrom_C;Other_info +contig_1 . CDS 294 299 . . . ID=dub_chrom_D;Other_info +contig_1 . CDS 297 299 . . . 
ID=dub_chrom_J;Other_info +contig_2 . CDS 1 2 . . . ID=dub_chrom_K;Other_info +contig_2 . CDS 3 90 . . . ID=dub_chrom_E;Other_info +contig_2 . CDS 100 190 . . . ID=dub_chrom_F;Other_info +contig_2 . CDS 200 290 . . . ID=dub_chrom_G;Other_info +contig_2 . CDS 291 294 . . . ID=dub_chrom_H;Other_info +contig_2 . CDS 295 296 . . . ID=dub_chrom_L;Other_info +contig_2 . CDS 297 298 . . . ID=dub_chrom_M;Other_info +##FASTA +>contig_1 +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +>contig_2 +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN \ No newline at end of file diff --git a/functional_tests/test_data/complete_larger_double_chr_genome_list.txt b/functional_tests/test_data/complete_larger_double_chr_genome_list.txt new file mode 100644 index 0000000..f291a78 --- /dev/null +++ b/functional_tests/test_data/complete_larger_double_chr_genome_list.txt @@ -0,0 +1,3 @@ +complete_genome_double_chrom_2_larger.gff +complete_genome_double_chrom_3_larger.gff +complete_genome_double_chrom_larger \ No newline at end of file diff --git a/unit_tests/Corekaburra_test.py b/unit_tests/Corekaburra_test.py index b112a64..5846e37 100644 --- a/unit_tests/Corekaburra_test.py +++ b/unit_tests/Corekaburra_test.py @@ -9,6 +9,7 @@ import os from shutil import copyfile import logging +from networkx import number_connected_components, connected_components # pylint: disable=no-name-in-module # import Corekaburra functions @@ -3321,8 +3322,9 @@ def 
test_double_edge_segment_identification_all_2_degree_input(self): 'pan_cluster_6--pan_cluster_1': 10} core_graph = consesus_core_genome.construct_core_graph(core_neighbour_pairs) + num_components = number_connected_components(core_graph) - return_1 = consesus_core_genome.identify_segments(core_graph, 10, {}) + return_1 = consesus_core_genome.identify_segments(core_graph, 10, {}, num_components) self.assertEqual(None, return_1) @@ -3350,8 +3352,9 @@ def test_double_edge_segment_identification_two_segments(self): 'genome_10': {'tag_1': 'pan_cluster_1', 'tag_2': 'pan_cluster_4', 'tag_3': 'pan_cluster_2', 'tag_4': 'pan_cluster_5'},} core_graph = consesus_core_genome.construct_core_graph(core_neighbour_pairs) + num_components = number_connected_components(core_graph) - double_edge_segements = consesus_core_genome.identify_segments(core_graph, 10, core_gene_dict) + double_edge_segements = consesus_core_genome.identify_segments(core_graph, 10, core_gene_dict, num_components) self.assertEqual(expected_segments, double_edge_segements) @@ -3375,8 +3378,9 @@ def test_double_edge_segment_identification_four_segments(self): 'pan_cluster_1--pan_cluster_10': 10} core_graph = consesus_core_genome.construct_core_graph(core_neighbour_pairs) + num_components = number_connected_components(core_graph) - double_edge_segements = consesus_core_genome.identify_segments(core_graph, 10, {}) + double_edge_segements = consesus_core_genome.identify_segments(core_graph, 10, {}, num_components) self.assertEqual(expected_segments, double_edge_segements) @@ -3402,7 +3406,9 @@ def test_double_edge_segment_identification_segments_node_w_four_degrees(self): 'pan_cluster_6--pan_cluster_1': 9} core_graph = consesus_core_genome.construct_core_graph(core_neighbour_pairs) - double_edge_segements = consesus_core_genome.identify_segments(core_graph, 10, {}) + num_components = number_connected_components(core_graph) + + double_edge_segements = consesus_core_genome.identify_segments(core_graph, 10, {}, 
num_components) self.assertEqual(expected_segments, double_edge_segements) @@ -3427,7 +3433,9 @@ def test_double_edge_segment_identification_segments_node_w_challenging_paths(se 'genome_5': {'tag_4': 'pan_cluster_D', 'tag_3': 'pan_cluster_C', 'tag_2': 'pan_cluster_B', 'tag_1': 'pan_cluster_A', }} core_graph = consesus_core_genome.construct_core_graph(core_neighbour_pairs) - double_edge_segements = consesus_core_genome.identify_segments(core_graph, 5, core_gene_dict) + num_components = number_connected_components(core_graph) + + double_edge_segements = consesus_core_genome.identify_segments(core_graph, 5, core_gene_dict, num_components) self.assertEqual(expected_segments, double_edge_segements) @@ -3456,7 +3464,9 @@ def test_double_edge_segment_identification_segments_node_w_challenging_paths_2( 'genome_8': {'tag_5': 'pan_cluster_E', 'tag_4': 'pan_cluster_D', 'tag_3': 'pan_cluster_C', 'tag_2': 'pan_cluster_B', 'tag_1': 'pan_cluster_A', }} core_graph = consesus_core_genome.construct_core_graph(core_neighbour_pairs) - double_edge_segements = consesus_core_genome.identify_segments(core_graph, 8, core_gene_dict) + num_components = number_connected_components(core_graph) + + double_edge_segements = consesus_core_genome.identify_segments(core_graph, 8, core_gene_dict, num_components) self.assertEqual(expected_segments, double_edge_segements) @@ -3487,7 +3497,9 @@ def test_double_edge_segment_identification_segments_node_w_all_challenging_path 'genome_5': {'tag_5': 'pan_cluster_K', 'tag_4': 'pan_cluster_L', 'tag_3': 'pan_cluster_A', 'tag_2': 'pan_cluster_B', 'tag_1': 'pan_cluster_A', 'tag_6': 'pan_cluster_C', 'tag_7': 'pan_cluster_D'}} core_graph = consesus_core_genome.construct_core_graph(core_neighbour_pairs) - double_edge_segements = consesus_core_genome.identify_segments(core_graph, 5, core_gene_dict) + num_components = number_connected_components(core_graph) + + double_edge_segements = consesus_core_genome.identify_segments(core_graph, 5, core_gene_dict, 
num_components) self.assertEqual(expected_segments, double_edge_segements) @@ -3511,7 +3523,9 @@ def test_double_edge_segment_identification_segments_node_w_less_than_all_presen } core_graph = consesus_core_genome.construct_core_graph(core_neighbour_pairs) - double_edge_segements = consesus_core_genome.identify_segments(core_graph, 10, {}) + num_components = number_connected_components(core_graph) + + double_edge_segements = consesus_core_genome.identify_segments(core_graph, 10, {}, num_components) self.assertEqual(expected_segments, double_edge_segements) @@ -3538,10 +3552,86 @@ def test_double_edge_segment_identification_segments_node_w_two_gene_segment(sel 'genome_3': {'gene_1': 'pan_cluster_A', 'gene_2': 'pan_cluster_B', 'gene_3': 'pan_cluster_E', 'gene_4': 'pan_cluster_G', 'gene_5': 'pan_cluster_D', 'gene_7': 'pan_cluster_H'}} core_graph = consesus_core_genome.construct_core_graph(core_neighbour_pairs) - double_edge_segements = consesus_core_genome.identify_segments(core_graph, 3, core_gene_dict) + num_components = number_connected_components(core_graph) + + double_edge_segements = consesus_core_genome.identify_segments(core_graph, 3, core_gene_dict, num_components) self.assertEqual(expected_segments, double_edge_segements) + def test_multiple_component_core_graph(self): + expected_segments = {'pan_cluster_A--pan_cluster_I': ['pan_cluster_A', 'pan_cluster_I'], + 'pan_cluster_B--pan_cluster_C': ['pan_cluster_C', 'pan_cluster_B'], + 'pan_cluster_D--pan_cluster_J': ['pan_cluster_D', 'pan_cluster_J'], + 'pan_cluster_E--pan_cluster_K': ['pan_cluster_E', 'pan_cluster_K'], + 'pan_cluster_F--pan_cluster_G': ['pan_cluster_G', 'pan_cluster_F'], + 'pan_cluster_H--pan_cluster_M': ['pan_cluster_H', 'pan_cluster_L', 'pan_cluster_M'], + 'pan_cluster_Q--pan_cluster_O': ['pan_cluster_Q', 'pan_cluster_P', 'pan_cluster_O']} + + core_neighbour_pairs = {'pan_cluster_A--pan_cluster_B': 1, + 'pan_cluster_A--pan_cluster_C': 1, + 'pan_cluster_A--pan_cluster_I': 2, + 
'pan_cluster_B--pan_cluster_C': 2, + 'pan_cluster_B--pan_cluster_D': 1, + 'pan_cluster_C--pan_cluster_D': 1, + 'pan_cluster_D--pan_cluster_J': 2, + 'pan_cluster_E--pan_cluster_F': 1, + 'pan_cluster_E--pan_cluster_G': 1, + 'pan_cluster_E--pan_cluster_K': 2, + 'pan_cluster_F--pan_cluster_G': 2, + 'pan_cluster_F--pan_cluster_H': 1, + 'pan_cluster_G--pan_cluster_H': 1, + 'pan_cluster_H--pan_cluster_L': 2, + 'pan_cluster_L--pan_cluster_M': 2, + 'pan_cluster_O--pan_cluster_P': 2, + 'pan_cluster_P--pan_cluster_Q': 2, + } + + core_gene_dict = {'genome_1': {'tag_1': 'pan_cluster_A', 'tag_2': 'pan_cluster_B', 'tag_3': 'pan_cluster_C', + 'tag_4': 'pan_cluster_D', 'tag_5': 'pan_cluster_E', 'tag_6': 'pan_cluster_F', + 'tag_7': 'pan_cluster_G', 'tag_8': 'pan_cluster_H', 'tag_9': 'pan_cluster_I', + 'tag_10': 'pan_cluster_J', 'tag_11': 'pan_cluster_K', 'tag_12': 'pan_cluster_L', + 'tag_13': 'pan_cluster_M', 'tag_14': 'pan_cluster_O', 'tag_15': 'pan_cluster_P', + 'tag_16': 'pan_cluster_Q'}, + 'genome_2': {'tag_1': 'pan_cluster_A', 'tag_2': 'pan_cluster_B', 'tag_3': 'pan_cluster_C', + 'tag_4': 'pan_cluster_D', 'tag_5': 'pan_cluster_E', 'tag_6': 'pan_cluster_F', + 'tag_7': 'pan_cluster_G', 'tag_8': 'pan_cluster_H', 'tag_9': 'pan_cluster_I', + 'tag_10': 'pan_cluster_J', 'tag_11': 'pan_cluster_K', 'tag_12': 'pan_cluster_L', + 'tag_13': 'pan_cluster_M', 'tag_14': 'pan_cluster_O', 'tag_15': 'pan_cluster_P', + 'tag_16': 'pan_cluster_Q'}} + + core_graph = consesus_core_genome.construct_core_graph(core_neighbour_pairs) + num_components = number_connected_components(core_graph) + + double_edge_segements = {} + for component in connected_components(core_graph): + component_graph = core_graph.subgraph(component).copy() + double_edge_segements = double_edge_segements | consesus_core_genome.identify_segments(component_graph, 2, + core_gene_dict, + num_components) + + # comparisons = [True for x in double_edge_segements + # if + # (x in expected_segments and + # (expected_segments[x] == 
double_edge_segements[x] or expected_segments[x][::-1] == double_edge_segements[x])) + # or + # (f"{x.split('--')[1]}'--'{x.split('--')[0]}" in expected_segments and + # (expected_segments[x] == double_edge_segements[f"{x.split('--')[1]}'--'{x.split('--')[0]}"] or expected_segments[x][::-1] == double_edge_segements[f"{x.split('--')[1]}'--'{x.split('--')[0]}"])) + # ] + key_forward = [x for x in double_edge_segements if x in expected_segments] + key_reverse = [f"{x.split('--')[1]}--{x.split('--')[0]}" for x in double_edge_segements if f"{x.split('--')[1]}--{x.split('--')[0]}" in expected_segments] + expected_key_match = key_forward+key_reverse + + # Test if the number of expected segments were returned + self.assertEqual(len(expected_key_match), len(expected_segments)) + + comparisons = [True for returned_key, expected_key in zip(double_edge_segements, expected_key_match) + if double_edge_segements[returned_key] == expected_segments[expected_key] + or + double_edge_segements[returned_key] == expected_segments[expected_key][::-1]] + + # Test of all returned segments look as expected + self.assertTrue(all(comparisons)) + # TODO - Chat to Andrew about this function how it works and how we can test it more - possibly just run some things to see if it breaks From 42ee8517817080ef78e3d8c44b5c1933c1edc372 Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Tue, 25 Jan 2022 16:44:57 +1100 Subject: [PATCH 079/135] Add in sorting of segmenets, so they are sorted from 'lowest' to 'highest' and the segments is oriented by the sorting of the name --- Corekaburra/output_writer_functions.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/Corekaburra/output_writer_functions.py b/Corekaburra/output_writer_functions.py index 9d8117a..644d398 100644 --- a/Corekaburra/output_writer_functions.py +++ b/Corekaburra/output_writer_functions.py @@ -103,7 +103,7 @@ def segment_writer(segments, out_path, prefix): :param 
prefix: Prefix for any output files :return: Nothing """ - + # TODO - Maybe include presence of core genes in segment output? # Generate file name out_file_name = 'core_segments.csv' if prefix is not None: @@ -119,8 +119,21 @@ def segment_writer(segments, out_path, prefix): # Write remaining rows: for key in sorted(segments.keys()): + + # Examine if key pair is ordered + split_key = sorted(key.split('--')) + if key != f"{split_key[0]}--{split_key[1]}": + sorted_key = f"{split_key[0]}-{split_key[1]}" + else: + sorted_key = key.replace('--', '-') + + # Examine if segment follows ordered key + if sorted_key.split('-')[0] != segments[key][0]: + segments[key] = segments[key][::-1] + + # Write segment for index, gene in enumerate(segments[key]): - info = [key.replace('--', '-'), index+1, gene] + info = [sorted_key, index+1, gene] writer.writerow(info) From 923c7f5cc01970965db8abe67f86179ae565f73e Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Tue, 25 Jan 2022 16:56:13 +1100 Subject: [PATCH 080/135] Change check in main for presence of core gene segments add in changes to expecte output of segments for functional tests after implementation of sorting --- Corekaburra/__main__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Corekaburra/__main__.py b/Corekaburra/__main__.py index 26668ab..888eb11 100644 --- a/Corekaburra/__main__.py +++ b/Corekaburra/__main__.py @@ -261,7 +261,7 @@ def main(): time_end_passing_gffs = time.time() time_start_segments_search = time.time() - time_start = time.time() + time_start = time.time() # TODO - This seems like a lonely start timer? 
# Count number of unique accessory genes inserted into a core-core region across the genomes acc_region_count = {key: len(set(core_neighbour_low_freq[key])) for key in core_neighbour_low_freq} # Count number of unique low frequency genes inserted into a core-core region across the genomes @@ -290,7 +290,7 @@ def main(): logger.debug("Summary output") summary_info_writer(master_summary_info, args.output_path, args.output_prefix) - if double_edge_segements is not None: + if double_edge_segements: logger.debug("Segment output") segment_writer(double_edge_segements, args.output_path, args.output_prefix) From f43cb5842c4c828dd54b8f06ce3d249e925d8c93 Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Tue, 25 Jan 2022 16:56:37 +1100 Subject: [PATCH 081/135] Change check in main for presence of core gene segments add in changes to expecte output of segments for functional tests after implementation of sorting --- .../Multi_component_graph_expected/core_segments.csv.expected | 4 ++-- .../no_accessory_core_segments.csv.expected | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/functional_tests/test_data/Multi_component_graph_expected/core_segments.csv.expected b/functional_tests/test_data/Multi_component_graph_expected/core_segments.csv.expected index f82eee9..db78a6b 100644 --- a/functional_tests/test_data/Multi_component_graph_expected/core_segments.csv.expected +++ b/functional_tests/test_data/Multi_component_graph_expected/core_segments.csv.expected @@ -1,8 +1,8 @@ Segment_name,Segment_position,Core_gene A-I,1,A A-I,2,I -B-C,1,C -B-C,2,B +B-C,1,B +B-C,2,C D-J,1,D D-J,2,J E-K,1,E diff --git a/functional_tests/test_data/Multi_component_graph_expected/no_accessory_core_segments.csv.expected b/functional_tests/test_data/Multi_component_graph_expected/no_accessory_core_segments.csv.expected index 5652fbe..89c1ba2 100644 --- 
a/functional_tests/test_data/Multi_component_graph_expected/no_accessory_core_segments.csv.expected +++ b/functional_tests/test_data/Multi_component_graph_expected/no_accessory_core_segments.csv.expected @@ -1,8 +1,8 @@ Parent_segment_name,Sub_segment_name,Parent_segment_position,Sub_segment_position,Core_gene A-I,A-I,1,1,A A-I,A-I,1,2,I -B-C,C-B,1,1,C -B-C,C-B,1,2,B +B-C,C-B,1,1,B +B-C,C-B,1,2,C D-J,D-J,1,1,D D-J,D-J,1,2,J E-K,E-K,1,1,E From 4e00c75b22608865f2fa67568ba4f4cb95e70e43 Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Tue, 25 Jan 2022 17:01:35 +1100 Subject: [PATCH 082/135] Small changes to functional test result files and add in STDOUT to void in functional test --- functional_tests/Corekaburra-test.sh | 4 ++-- .../no_accessory_core_segments.csv.expected | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/functional_tests/Corekaburra-test.sh b/functional_tests/Corekaburra-test.sh index 3ed12f8..0ff335e 100755 --- a/functional_tests/Corekaburra-test.sh +++ b/functional_tests/Corekaburra-test.sh @@ -349,7 +349,7 @@ rm -r test_out_folder call_new_test "Test with a core-less contig draft" -Corekaburra -ig complete_genome_double_chrom_2.gff complete_genome_double_chrom.gff -ip Coreless_contig_run/ -o test_out_folder/ +Corekaburra -ig complete_genome_double_chrom_2.gff complete_genome_double_chrom.gff -ip Coreless_contig_run/ -o test_out_folder/ > /dev/null 2>&1 test_output_file test_out_folder/core_core_accessory_gene_content.tsv coreless_contig_draft_expected/core_core_accessory_gene_content.tsv.expected test_output_file test_out_folder/low_frequency_gene_placement.tsv coreless_contig_draft_expected/low_frequency_gene_placement.tsv.expected test_output_file test_out_folder/core_pair_summary.csv coreless_contig_draft_expected/core_pair_summary.csv.expected @@ -357,7 +357,7 @@ test_output_file test_out_folder/coreless_contig_accessory_gene_content.tsv core rm -r test_out_folder call_new_test "Test 
with a core-less contig complete" -Corekaburra -ig complete_genome_double_chrom_2.gff complete_genome_double_chrom.gff -ip Coreless_contig_run/ -o test_out_folder/ -cg Complete_double_chromosomes.txt +Corekaburra -ig complete_genome_double_chrom_2.gff complete_genome_double_chrom.gff -ip Coreless_contig_run/ -o test_out_folder/ -cg Complete_double_chromosomes.txt > /dev/null 2>&1 test_output_file test_out_folder/core_core_accessory_gene_content.tsv Coreless_contig_complete_expected/core_core_accessory_gene_content.tsv.expected test_output_file test_out_folder/low_frequency_gene_placement.tsv Coreless_contig_complete_expected/low_frequency_gene_placement.tsv.expected test_output_file test_out_folder/core_pair_summary.csv Coreless_contig_complete_expected/core_pair_summary.csv.expected diff --git a/functional_tests/test_data/Multi_component_graph_expected/no_accessory_core_segments.csv.expected b/functional_tests/test_data/Multi_component_graph_expected/no_accessory_core_segments.csv.expected index 89c1ba2..0e13db4 100644 --- a/functional_tests/test_data/Multi_component_graph_expected/no_accessory_core_segments.csv.expected +++ b/functional_tests/test_data/Multi_component_graph_expected/no_accessory_core_segments.csv.expected @@ -1,8 +1,8 @@ Parent_segment_name,Sub_segment_name,Parent_segment_position,Sub_segment_position,Core_gene A-I,A-I,1,1,A A-I,A-I,1,2,I -B-C,C-B,1,1,B -B-C,C-B,1,2,C +B-C,B-C,1,1,B +B-C,B-C,1,2,C D-J,D-J,1,1,D D-J,D-J,1,2,J E-K,E-K,1,1,E From dce416f22b157de5bf64cf07cb171fb977e6b673 Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Tue, 25 Jan 2022 17:11:29 +1100 Subject: [PATCH 083/135] Add in ordering of non-accessory segments and test for it --- Corekaburra/output_writer_functions.py | 14 +++++++++++++- unit_tests/Corekaburra_test.py | 17 +++++++++-------- 2 files changed, 22 insertions(+), 9 deletions(-) diff --git a/Corekaburra/output_writer_functions.py b/Corekaburra/output_writer_functions.py index 
644d398..90e9d84 100644 --- a/Corekaburra/output_writer_functions.py +++ b/Corekaburra/output_writer_functions.py @@ -162,10 +162,22 @@ def no_acc_segment_writer(no_acc_segments, out_path, prefix): # Write remaining rows: for key in sorted(no_acc_segments.keys()): + + # Examine if key pair is ordered + split_key = sorted(key.split('--')) + if key != f"{split_key[0]}--{split_key[1]}": + sorted_key = f"{split_key[0]}-{split_key[1]}" + else: + sorted_key = key.replace('--', '-') + + # Examine if segment follows ordered key, if not reverse the element + if sorted_key.split('-')[0] != no_acc_segments[key][0][0]: + no_acc_segments[key] = [sub_seg[::-1] for sub_seg in no_acc_segments[key]][::-1] + for sub_index, subsegment in enumerate(no_acc_segments[key]): sub_name = f'{subsegment[0]}-{subsegment[-1]}' for index, gene in enumerate(subsegment): - info = [key.replace('--', '-'), sub_name, sub_index + 1, index + 1, gene] + info = [sorted_key, sub_name, sub_index + 1, index + 1, gene] writer.writerow(info) diff --git a/unit_tests/Corekaburra_test.py b/unit_tests/Corekaburra_test.py index 5846e37..313b5fb 100644 --- a/unit_tests/Corekaburra_test.py +++ b/unit_tests/Corekaburra_test.py @@ -3836,12 +3836,12 @@ def test_segment_writer(self): input_segments = {'pan_cluster_2--pan_cluster_4': ['pan_cluster_2', 'pan_cluster_3', 'pan_cluster_4'], - 'pan_cluster_2--pan_cluster_6': ['pan_cluster_2', + 'pan_cluster_2--pan_cluster_6': ['pan_cluster_6', 'pan_cluster_1', - 'pan_cluster_6'], - 'pan_cluster_4--pan_cluster_6': ['pan_cluster_4', + 'pan_cluster_2'], + 'pan_cluster_6--pan_cluster_4': ['pan_cluster_6', 'pan_cluster_5', - 'pan_cluster_6']} + 'pan_cluster_4']} out_path = 'TestWritingOutputFunction' prefix = 'test' @@ -3858,12 +3858,13 @@ def test_no_acc_segment_writer(self): input_segments = {'pan_cluster_2--pan_cluster_4': [['pan_cluster_2'], ['pan_cluster_3', 'pan_cluster_4']], - 'pan_cluster_2--pan_cluster_6': [['pan_cluster_2'], + 'pan_cluster_6--pan_cluster_2': 
[['pan_cluster_2'], ['pan_cluster_1'], ['pan_cluster_6']], - 'pan_cluster_4--pan_cluster_6': [['pan_cluster_4', - 'pan_cluster_5'], - ['pan_cluster_6']]} + 'pan_cluster_6--pan_cluster_4': [['pan_cluster_6'], + ['pan_cluster_5', + 'pan_cluster_4'] + ]} out_path = 'TestWritingOutputFunction' prefix = 'test' From 8398d553e0b890839adbbecb263dfea9939cb47b Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Wed, 26 Jan 2022 13:31:35 +1100 Subject: [PATCH 084/135] Add in handling of refound genes in both fragments and to see if gffs have been previously corrected. Add in functional tests for multiple things regarding refound genes and resumption of run --- Corekaburra/correct_gffs.py | 9 ++- Corekaburra/gff_parser.py | 28 ++++----- Corekaburra/parse_gene_presence_absence.py | 48 +++++++++++----- functional_tests/Corekaburra-test.sh | 57 ++++++++++++++++--- ...e_core_accessory_gene_content.tsv.expected | 16 ++++++ .../core_pair_summary.csv.expected | 3 + .../low_frequency_gene_placement.tsv.expected | 9 +++ ...ingle_chrom_larger_refound_2_corrected.gff | 21 +++++++ ..._single_chrom_larger_refound_corrected.gff | 21 +++++++ ...e_core_accessory_gene_content.tsv.expected | 7 +++ .../core_pair_summary.csv.expected | 7 +++ .../gene_data.csv | 3 + .../gene_presence_absence_roary.csv | 8 +++ .../low_frequency_gene_placement.tsv.expected | 16 ++++++ ...ingle_chrom_larger_refound_2_corrected.gff | 21 +++++++ ..._single_chrom_larger_refound_corrected.gff | 21 +++++++ ...e_core_accessory_gene_content.tsv.expected | 6 ++ .../core_pair_summary.csv.expected | 10 ++++ .../Resume_refound_gene/gene_data.csv | 3 + .../gene_presence_absence_roary.csv | 8 +++ .../low_frequency_gene_placement.tsv.expected | 19 +++++++ ...om_larger_refound_2_corrected.gff.expected | 21 +++++++ ..._single_chrom_larger_refound_corrected.gff | 21 +++++++ ...e_core_accessory_gene_content.tsv.expected | 7 +++ .../core_pair_summary.csv.expected | 7 +++ 
.../Resume_refound_run_fragment/gene_data.csv | 3 + .../gene_presence_absence_roary.csv | 8 +++ .../low_frequency_gene_placement.tsv.expected | 16 ++++++ .../gene_data.csv | 1 + .../gene_presence_absence_roary.csv | 8 +++ ...hrom_larger_refound_corrected.gff.expected | 21 +++++++ ...e_core_accessory_gene_content.tsv.expected | 11 ++++ .../core_pair_summary.csv.expected | 4 ++ .../low_frequency_gene_placement.tsv.expected | 13 +++++ .../genome_single_chrom_larger_refound.gff | 10 ++++ .../genome_single_chrom_larger_refound_2.gff | 10 ++++ 36 files changed, 468 insertions(+), 34 deletions(-) create mode 100644 functional_tests/test_data/Fragmented_core_gene_break_run_expected/core_core_accessory_gene_content.tsv.expected create mode 100644 functional_tests/test_data/Fragmented_core_gene_break_run_expected/core_pair_summary.csv.expected create mode 100644 functional_tests/test_data/Fragmented_core_gene_break_run_expected/low_frequency_gene_placement.tsv.expected create mode 100644 functional_tests/test_data/Resume_all_found_gene_refound_run_fragment/Corrected_gff_files/genome_single_chrom_larger_refound_2_corrected.gff create mode 100644 functional_tests/test_data/Resume_all_found_gene_refound_run_fragment/Corrected_gff_files/genome_single_chrom_larger_refound_corrected.gff create mode 100644 functional_tests/test_data/Resume_all_found_gene_refound_run_fragment/core_core_accessory_gene_content.tsv.expected create mode 100644 functional_tests/test_data/Resume_all_found_gene_refound_run_fragment/core_pair_summary.csv.expected create mode 100644 functional_tests/test_data/Resume_all_found_gene_refound_run_fragment/gene_data.csv create mode 100644 functional_tests/test_data/Resume_all_found_gene_refound_run_fragment/gene_presence_absence_roary.csv create mode 100644 functional_tests/test_data/Resume_all_found_gene_refound_run_fragment/low_frequency_gene_placement.tsv.expected create mode 100644 
functional_tests/test_data/Resume_refound_gene/Corrected_gff_files/genome_single_chrom_larger_refound_2_corrected.gff create mode 100644 functional_tests/test_data/Resume_refound_gene/Corrected_gff_files/genome_single_chrom_larger_refound_corrected.gff create mode 100644 functional_tests/test_data/Resume_refound_gene/core_core_accessory_gene_content.tsv.expected create mode 100644 functional_tests/test_data/Resume_refound_gene/core_pair_summary.csv.expected create mode 100644 functional_tests/test_data/Resume_refound_gene/gene_data.csv create mode 100644 functional_tests/test_data/Resume_refound_gene/gene_presence_absence_roary.csv create mode 100644 functional_tests/test_data/Resume_refound_gene/low_frequency_gene_placement.tsv.expected create mode 100644 functional_tests/test_data/Resume_refound_run_fragment/Corrected_gff_files/genome_single_chrom_larger_refound_2_corrected.gff.expected create mode 100644 functional_tests/test_data/Resume_refound_run_fragment/Corrected_gff_files/genome_single_chrom_larger_refound_corrected.gff create mode 100644 functional_tests/test_data/Resume_refound_run_fragment/core_core_accessory_gene_content.tsv.expected create mode 100644 functional_tests/test_data/Resume_refound_run_fragment/core_pair_summary.csv.expected create mode 100644 functional_tests/test_data/Resume_refound_run_fragment/gene_data.csv create mode 100644 functional_tests/test_data/Resume_refound_run_fragment/gene_presence_absence_roary.csv create mode 100644 functional_tests/test_data/Resume_refound_run_fragment/low_frequency_gene_placement.tsv.expected create mode 100644 functional_tests/test_data/fragmented_refound_core_gene/gene_data.csv create mode 100644 functional_tests/test_data/fragmented_refound_core_gene/gene_presence_absence_roary.csv create mode 100644 functional_tests/test_data/fragmented_refound_core_gene_expected/Corrected_gff_files/genome_single_chrom_larger_refound_corrected.gff.expected create mode 100644 
functional_tests/test_data/fragmented_refound_core_gene_expected/core_core_accessory_gene_content.tsv.expected create mode 100644 functional_tests/test_data/fragmented_refound_core_gene_expected/core_pair_summary.csv.expected create mode 100644 functional_tests/test_data/fragmented_refound_core_gene_expected/low_frequency_gene_placement.tsv.expected create mode 100644 functional_tests/test_data/genome_single_chrom_larger_refound.gff create mode 100644 functional_tests/test_data/genome_single_chrom_larger_refound_2.gff diff --git a/Corekaburra/correct_gffs.py b/Corekaburra/correct_gffs.py index 40ad9ef..a802c9d 100644 --- a/Corekaburra/correct_gffs.py +++ b/Corekaburra/correct_gffs.py @@ -30,6 +30,7 @@ def read_gene_data(gene_data_file): for line in gene_data.readlines(): # Split read line at commas line = line.split(',') + # TODO - Scaffold (contig) name can be found in second position of a gene_data.csv line. This could possibly be used to speed things up so that the entire set of contigs isn't required for search. 
# Check if refound gene if 'refound' in line[2]: @@ -69,6 +70,9 @@ def prepair_for_reannotation(gene_data_path, output_folder, gffs, logger): try: os.mkdir(corrected_gff_out_dir) except FileExistsError: + # Get path for input + input_path = os.path.split(gffs[0])[0] + corrected_folder_content = os.listdir(corrected_gff_out_dir) gff_names = [os.path.basename(gff) for gff in gffs] @@ -76,9 +80,12 @@ def prepair_for_reannotation(gene_data_path, output_folder, gffs, logger): corrected_files = [file for file in corrected_folder_content if f'{file.split("_corrected")[0]}.gff' in gff_names] + corrected_files_w_path = [os.path.join(corrected_gff_out_dir, file) for file in corrected_files] + if len(corrected_files) > 0: gffs = [file for file in gff_names if f'{file.replace(".gff", "")}_corrected.gff' not in corrected_files] - gffs = gffs + corrected_files + gffs = [os.path.join(input_path, gff) for gff in gffs] + gffs = gffs + corrected_files_w_path return gene_data_dict, corrected_gff_out_dir, gffs diff --git a/Corekaburra/gff_parser.py b/Corekaburra/gff_parser.py index 83079e5..3946acd 100644 --- a/Corekaburra/gff_parser.py +++ b/Corekaburra/gff_parser.py @@ -99,7 +99,7 @@ def record_core_core_region(core_genes, gff_name, gff_line, contig_end, previous # Check that a line from gff is provided and previous gene is not a sequence break if gff_line is not None and previous_core_gene_id != "Sequence_break": - # Check if core gene is fragmented + # Check if core gene is fragmented, if then change coordinates to the last part of the fragment. 
if core_genes[gff_name][previous_core_gene_id] == core_genes[gff_name][gff_line[8]]: previous_core_gene_id = gff_line[8] previous_core_gene_end_coor = int(gff_line[4]) @@ -109,17 +109,19 @@ def record_core_core_region(core_genes, gff_name, gff_line, contig_end, previous core_gene_pair_distance, accessory_gene_content, low_freq_gene_content, core_gene_pairs, master_info) # Set core cluster names - # If no line from gff is given there is a sequence break, + # If no line from gff is given there is a sequence-break, # if it is given then set current cluster and try to find previous if not found it is a sequence break if gff_line is not None: current_core_gene_cluster = core_genes[gff_name][gff_line[8]] try: previous_core_gene_cluster = core_genes[gff_name][previous_core_gene_id] + core_gene_neighbours = sorted([previous_core_gene_cluster, current_core_gene_cluster]) # Catch is previous gene was a sequence break. except KeyError: previous_core_gene_cluster = previous_core_gene_id + core_gene_neighbours = [previous_core_gene_cluster, current_core_gene_cluster] - core_gene_neighbours = sorted([previous_core_gene_cluster, current_core_gene_cluster]) + # core_gene_neighbours = sorted([previous_core_gene_cluster, current_core_gene_cluster]) else: current_core_gene_cluster = "Sequence_break" @@ -448,7 +450,7 @@ def segment_gff_content(gff_generator, core_genes, low_freq_genes, gff_path, acc # Set that first core gene has been observed first_core_gene = False - # Check if first gene on new contig is a core gene, if the record it. + # Check if first gene on new contig is a core gene, if then record it. 
elif line[8] in core_genes[gff_name]: previous_core_gene_id = "Sequence_break" @@ -541,15 +543,15 @@ def segment_gff_content(gff_generator, core_genes, low_freq_genes, gff_path, acc accessory_gene_content, low_freq_gene_content, core_gene_pairs, master_info) = record_core_core_region(core_genes, gff_name, None, contig_sizes[previous_contig], - previous_core_gene_id, - previous_core_gene_end_coor, - acc_genes_in_region, - low_freq_genes_in_region, - core_gene_pair_distance, - accessory_gene_content, - low_freq_gene_content, - core_gene_pairs, - master_info) + previous_core_gene_id, + previous_core_gene_end_coor, + acc_genes_in_region, + low_freq_genes_in_region, + core_gene_pair_distance, + accessory_gene_content, + low_freq_gene_content, + core_gene_pairs, + master_info) else: # Add a core-less contig if there has been accessory genes: coreless_contigs = record_coreless_contig(coreless_contigs, acc_genes_in_region, low_freq_genes_in_region, gff_name, line[0]) diff --git a/Corekaburra/parse_gene_presence_absence.py b/Corekaburra/parse_gene_presence_absence.py index b1f1eba..e8dc131 100644 --- a/Corekaburra/parse_gene_presence_absence.py +++ b/Corekaburra/parse_gene_presence_absence.py @@ -2,12 +2,17 @@ import csv from math import ceil, floor import gffutils +EXIT_GFF_REANNOTATION_ERROR = 3 try: from Corekaburra.correct_gffs import annotate_refound_genes except ModuleNotFoundError: from correct_gffs import annotate_refound_genes +try: + from Corekaburra.exit_with_error import exit_with_error +except ModuleNotFoundError: + from exit_with_error import exit_with_error def add_gene_to_dict(main_dict, gene, pan_gene_name, genome): @@ -37,17 +42,35 @@ def check_fragmented_gene(fragment_info, input_gffs, tmp_folder_path, gene_data_ :return: A List of booleans indicating if a fragments has nothing in between fragments (True) or not (False) """ # Check if any refound genes are in fragments to be checked, if then reannotate the genes before checking: - refound_genes = [[i, 
gene_gff] for i, gene_gff in enumerate(fragment_info) if 'refound' in gene_gff[0]] - if refound_genes: - for i, gene_gff in refound_genes: - # TODO Check if corrected genome is already made if then skip and just correct genome to look in. + refound_fregments = [[i, gene_gff] for i, gene_gff in enumerate(fragment_info) if 'refound' in gene_gff[0]] + if refound_fregments: + for i, gene_gff in refound_fregments: gene, gff = gene_gff - gff_name = [gff_name for gff_name in input_gffs - if gff in [os.path.basename(gff_name), - os.path.basename(gff_name).rsplit('.', 1)[0], - os.path.basename(gff_name).rsplit('.', 1)[0].rsplit('.', 1)[0]]][0] + gff_name = None + + try: + gff_name = [gff_name for gff_name in input_gffs + if f"{gff}_corrected" in [os.path.basename(gff_name), + os.path.basename(gff_name).rsplit('.', 1)[0], + os.path.basename(gff_name).rsplit('.', 1)[0].rsplit('.', 1)[0]]][0] + print('HERE') + except IndexError: + pass + + if gff_name is None: + try: + gff_name = [gff_name for gff_name in input_gffs + if gff in [os.path.basename(gff_name), + os.path.basename(gff_name).rsplit('.', 1)[0], + os.path.basename(gff_name).rsplit('.', 1)[0].rsplit('.', 1)[0]]][0] + except IndexError: + exit_with_error(EXIT_GFF_REANNOTATION_ERROR, + f'A problem occurred when trying to find a file for reannotation, when passing the ' + f'gene_presence_absence_roary.csv! 
GFF: {gff}, Gene: {gene}') - fragment_info[i][1] = annotate_refound_genes(gff_name, gene_data_dict, tmp_folder_path, corrected_dir, logger) + gff_name = annotate_refound_genes(gff_name, gene_data_dict, tmp_folder_path, corrected_dir, logger) + + fragment_info[i][1] = gff_name fragments_close = [] for fragment in fragment_info: @@ -93,13 +116,13 @@ def check_fragmented_gene(fragment_info, input_gffs, tmp_folder_path, gene_data_ region = (first_fragment_contig, min_frag_coor, max_frag_coor) # Find all features that are completely within the region - region_features = gff_database.region(region=region, completely_within=True) + region_features = gff_database.region(region=region, completely_within=True, featuretype=['ID']) # Find if some pieces are refound and change old_locus_tag to ID refound_pieces = [[i, fragment_piece] for i, fragment_piece in enumerate(fragment_pieces) if 'refound' in fragment_piece] if refound_pieces: - for piece in refound_pieces: - fragment_pieces[i] = gff_database[piece[1]]['ID'][0] + for i, piece in refound_pieces: + fragment_pieces[i] = gff_database[piece]['ID'][0] # find all genes that are not part of the fragmented gene region_locus_tags = set([feature[8]['locus_tag'][0] for feature in region_features]) @@ -199,7 +222,6 @@ def read_gene_presence_absence(pres_abs_file, core_gene_presence, low_freq_gene, # Check that each annotation is neighboring the other annotation. fragments_close = check_fragmented_gene(fragment_info, input_gffs, tmp_folder_path, gene_data_dict, corrected_dir, logger) # TODO - If a core gene is found to be made up of fragments not places close enough (With something in between) should this then not be subtracted from the core gene count? - How would this be handled if there is a gff that is not given as input? 
- # Check if gene was found to be a core gene if all(fragments_close): # Add the gene to the annotation dict diff --git a/functional_tests/Corekaburra-test.sh b/functional_tests/Corekaburra-test.sh index 0ff335e..40fc5c6 100755 --- a/functional_tests/Corekaburra-test.sh +++ b/functional_tests/Corekaburra-test.sh @@ -249,7 +249,6 @@ rm -r test_out_folder # TODO - Test that segmnets can be identified with a core-cutoff that is less than all genomes. -# TODO - Test that segmnets can be identified on two 'chromosomes'/contigs that are linear and not circular. call_new_test "Test when core graph forms multiple components - not forming a single 'chromosome' - non circular input gffs" Corekaburra -ip Multiple_component_graph/ -ig complete_genome_double_chrom_larger.gff complete_genome_double_chrom_2_larger.gff complete_genome_double_chrom_3_larger.gff -o test_out_folder/ > /dev/null 2>&1 test_output_file test_out_folder/core_core_accessory_gene_content.tsv Multi_component_graph_expected/core_core_accessory_gene_content.tsv.expected @@ -259,8 +258,6 @@ test_output_file test_out_folder/core_segments.csv Multi_component_graph_expecte test_output_file test_out_folder/no_accessory_core_segments.csv Multi_component_graph_expected/no_accessory_core_segments.csv.expected rm -r test_out_folder - -# TODO Test the above but with complete genomes call_new_test "Test when core graph forms multiple components - not forming a single 'chromosome' - circular input gffs" Corekaburra -ip Multiple_component_graph/ -ig complete_genome_double_chrom_larger.gff complete_genome_double_chrom_2_larger.gff complete_genome_double_chrom_3_larger.gff -o test_out_folder/ -cg complete_larger_double_chr_genome_list.txt > /dev/null 2>&1 test_output_file test_out_folder/core_core_accessory_gene_content.tsv Multiple_component_graph_complete_expected/core_core_accessory_gene_content.tsv.expected @@ -336,12 +333,24 @@ test_output_file test_out_folder/core_pair_summary.csv Fragmented_core_run_expec rm -r 
test_out_folder # TODO Test a fragmented core gene not accepted as core -#Corekaburra -ig complete_genome_single_chrom.gff genome_single_chrom_larger.gff genome_single_chrom_larger_rearrange.gff complete_genome_single_chrom_2.gff -ip Fragmented_core_gene_break_run/ -o test_out_folder/ -# TODO - run the test check results and transfer to expected folder -#rm -r test_out_folder +call_new_test "Test a fragmented core gene not accepted as core" +Corekaburra -ig complete_genome_single_chrom.gff genome_single_chrom_larger.gff genome_single_chrom_larger_rearrange.gff complete_genome_single_chrom_2.gff -ip Fragmented_core_gene_break_run/ -o test_out_folder/ > /dev/null 2>&1 +test_output_file test_out_folder/core_core_accessory_gene_content.tsv Fragmented_core_gene_break_run_expected/core_core_accessory_gene_content.tsv.expected +test_output_file test_out_folder/low_frequency_gene_placement.tsv Fragmented_core_gene_break_run_expected/low_frequency_gene_placement.tsv.expected +test_output_file test_out_folder/core_pair_summary.csv Fragmented_core_gene_break_run_expected/core_pair_summary.csv.expected +rm -r test_out_folder + +# TODO Test with part of fragmented gene being a refound gene +call_new_test "Test with part of fragmented gene being a refound gene" +Corekaburra -ig complete_genome_single_chrom.gff genome_single_chrom_larger_refound.gff genome_single_chrom_larger_rearrange.gff complete_genome_single_chrom_2.gff -ip fragmented_refound_core_gene/ -o test_out_folder/ > /dev/null 2>&1 +test_output_file test_out_folder/core_core_accessory_gene_content.tsv fragmented_refound_core_gene_expected/core_core_accessory_gene_content.tsv.expected +test_output_file test_out_folder/low_frequency_gene_placement.tsv fragmented_refound_core_gene_expected/low_frequency_gene_placement.tsv.expected +test_output_file test_out_folder/core_pair_summary.csv fragmented_refound_core_gene_expected/core_pair_summary.csv.expected +test_output_file 
test_out_folder/Corrected_gff_files/genome_single_chrom_larger_refound_corrected.gff fragmented_refound_core_gene_expected/Corrected_gff_files/genome_single_chrom_larger_refound_corrected.gff.expected +rm -r test_out_folder call_new_test "Test for accessory genes being fragmented" -Corekaburra -ig complete_genome_single_chrom.gff genome_single_chrom_larger.gff genome_single_chrom_larger_rearrange.gff complete_genome_single_chrom_2.gff -ip Fragmented_accessory_gene_run/ -o test_out_folder/ > /dev/null 2>&1 +Corekaburra -ig complete_genome_single_chrom.gff genome_single_chrom_larger.gff genome_single_chrom_larger_rearrange.gff complete_genome_single_chrom_2.gff -ip Fragmented_accessory_gene_run/ -o test_out_folder/ > /dev/null 2>&1 test_output_file test_out_folder/core_core_accessory_gene_content.tsv Fragmented_accessory_gene_run_expected/core_core_accessory_gene_content.tsv.expected test_output_file test_out_folder/low_frequency_gene_placement.tsv Fragmented_accessory_gene_run_expected/low_frequency_gene_placement.tsv.expected test_output_file test_out_folder/core_pair_summary.csv Fragmented_accessory_gene_run_expected/core_pair_summary.csv.expected @@ -364,6 +373,40 @@ test_output_file test_out_folder/core_pair_summary.csv Coreless_contig_complete_ test_output_file test_out_folder/coreless_contig_accessory_gene_content.tsv Coreless_contig_complete_expected/coreless_contig_accessory_gene_content.tsv.expected rm -r test_out_folder +# TODO - Test with a genome that have been corrected and one that have not - with fragmented refound gene (Resume run) +call_new_test "Test with a genome that have been corrected and one that have not (Resume run)" +Corekaburra -ig genome_single_chrom_larger_refound_2.gff genome_single_chrom_larger_refound.gff genome_single_chrom_larger_rearrange.gff complete_genome_single_chrom_2.gff -ip Resume_refound_run_fragment/ -o Resume_refound_run_fragment/ > /dev/null 2>&1 +test_output_file 
Resume_refound_run_fragment/core_core_accessory_gene_content.tsv fragmented_refound_core_gene_expected/core_core_accessory_gene_content.tsv.expected +test_output_file Resume_refound_run_fragment/low_frequency_gene_placement.tsv fragmented_refound_core_gene_expected/low_frequency_gene_placement.tsv.expected +test_output_file Resume_refound_run_fragment/core_pair_summary.csv fragmented_refound_core_gene_expected/core_pair_summary.csv.expected +test_output_file Resume_refound_run_fragment/Corrected_gff_files/genome_single_chrom_larger_refound_2_corrected.gff fragmented_refound_core_gene_expected/Corrected_gff_files/genome_single_chrom_larger_refound_2_corrected.gff.expected +rm Resume_refound_run_fragment/Corrected_gff_files/genome_single_chrom_larger_refound_2_corrected.gff +rm Resume_refound_run_fragment/low_frequency_gene_placement.tsv +rm Resume_refound_run_fragment/core_core_accessory_gene_content.tsv +rm Resume_refound_run_fragment/core_pair_summary.csv +rm Resume_refound_run_fragment/Corekaburra.log + +# TODO!! 
- Test with all genomes that have been corrected (Resume run) +call_new_test "Test with all genomes that have been corrected (Resume run)" +Corekaburra -ig genome_single_chrom_larger_refound_2.gff genome_single_chrom_larger_refound.gff genome_single_chrom_larger_rearrange.gff complete_genome_single_chrom_2.gff -ip Resume_all_found_gene_refound_run_fragment/ -o Resume_all_found_gene_refound_run_fragment/ > /dev/null 2>&1 +test_output_file Resume_all_found_gene_refound_run_fragment/core_core_accessory_gene_content.tsv fragmented_refound_core_gene_expected/core_core_accessory_gene_content.tsv.expected +test_output_file Resume_all_found_gene_refound_run_fragment/low_frequency_gene_placement.tsv fragmented_refound_core_gene_expected/low_frequency_gene_placement.tsv.expected +test_output_file Resume_all_found_gene_refound_run_fragment/core_pair_summary.csv fragmented_refound_core_gene_expected/core_pair_summary.csv.expected +rm Resume_all_found_gene_refound_run_fragment/low_frequency_gene_placement.tsv +rm Resume_all_found_gene_refound_run_fragment/core_core_accessory_gene_content.tsv +rm Resume_all_found_gene_refound_run_fragment/core_pair_summary.csv +rm Resume_all_found_gene_refound_run_fragment/Corekaburra.log + +# TODO - Test recognition of corrected gff files in output folder (Resume run) +call_new_test "Test recognition of corrected gff files in output folder (Resume run)" +Corekaburra -ig genome_single_chrom_larger_refound_2.gff genome_single_chrom_larger_refound.gff genome_single_chrom_larger_rearrange.gff complete_genome_single_chrom_2.gff -ip Resume_refound_gene/ -o Resume_refound_gene/ > /dev/null 2>&1 +test_output_file Resume_refound_gene/core_core_accessory_gene_content.tsv fragmented_refound_core_gene_expected/core_core_accessory_gene_content.tsv.expected +test_output_file Resume_refound_gene/low_frequency_gene_placement.tsv fragmented_refound_core_gene_expected/low_frequency_gene_placement.tsv.expected +test_output_file 
Resume_refound_gene/core_pair_summary.csv fragmented_refound_core_gene_expected/core_pair_summary.csv.expected +rm Resume_refound_gene/low_frequency_gene_placement.tsv +rm Resume_refound_gene/core_core_accessory_gene_content.tsv +rm Resume_refound_gene/core_pair_summary.csv +rm Resume_refound_gene/Corekaburra.log # 3. End of testing - check if any errors occurrred if [ "$num_errors" -gt 0 ]; then diff --git a/functional_tests/test_data/Fragmented_core_gene_break_run_expected/core_core_accessory_gene_content.tsv.expected b/functional_tests/test_data/Fragmented_core_gene_break_run_expected/core_core_accessory_gene_content.tsv.expected new file mode 100644 index 0000000..122abf3 --- /dev/null +++ b/functional_tests/test_data/Fragmented_core_gene_break_run_expected/core_core_accessory_gene_content.tsv.expected @@ -0,0 +1,16 @@ +Gff Core_gene_1 Core_gene_2 gene type +genome_single_chrom_larger C Sequence_break A intermediate_frequency +genome_single_chrom_larger C Sequence_break E intermediate_frequency +genome_single_chrom_larger C Sequence_break F intermediate_frequency +genome_single_chrom_larger C Sequence_break G low_frequency +genome_single_chrom_larger_rearrange C Sequence_break B intermediate_frequency +genome_single_chrom_larger_rearrange C Sequence_break E intermediate_frequency +genome_single_chrom_larger_rearrange C Sequence_break F intermediate_frequency +complete_genome_single_chrom Sequence_break C A intermediate_frequency +complete_genome_single_chrom Sequence_break C B intermediate_frequency +complete_genome_single_chrom_2 Sequence_break C A intermediate_frequency +complete_genome_single_chrom_2 Sequence_break C E intermediate_frequency +genome_single_chrom_larger Sequence_break C A intermediate_frequency +genome_single_chrom_larger Sequence_break C B intermediate_frequency +genome_single_chrom_larger_rearrange Sequence_break C A intermediate_frequency +genome_single_chrom_larger_rearrange Sequence_break C D low_frequency diff --git 
a/functional_tests/test_data/Fragmented_core_gene_break_run_expected/core_pair_summary.csv.expected b/functional_tests/test_data/Fragmented_core_gene_break_run_expected/core_pair_summary.csv.expected new file mode 100644 index 0000000..0462fbd --- /dev/null +++ b/functional_tests/test_data/Fragmented_core_gene_break_run_expected/core_pair_summary.csv.expected @@ -0,0 +1,3 @@ +Core_pair,n,occurrence_core_1,occurrence_core_2,co_occurrence,min_dist,max_dist,mean_dist,median_dist,min_acc,max_acc,mean_acc,median_acc +C-Sequence_break,4,4,0,0,10,310,160.0,160.0,0,4,1.8,1.5 +Sequence_break-C,4,0,4,0,199,199,199.0,199.0,2,2,2.0,2.0 diff --git a/functional_tests/test_data/Fragmented_core_gene_break_run_expected/low_frequency_gene_placement.tsv.expected b/functional_tests/test_data/Fragmented_core_gene_break_run_expected/low_frequency_gene_placement.tsv.expected new file mode 100644 index 0000000..092dab2 --- /dev/null +++ b/functional_tests/test_data/Fragmented_core_gene_break_run_expected/low_frequency_gene_placement.tsv.expected @@ -0,0 +1,9 @@ +Gff Core_gene_1 Core_gene_2 Core_region_size Core_region_accessory_count +complete_genome_single_chrom C Sequence_break 10 0 +complete_genome_single_chrom_2 C Sequence_break 10 0 +genome_single_chrom_larger C Sequence_break 310 4 +genome_single_chrom_larger_rearrange C Sequence_break 310 3 +complete_genome_single_chrom Sequence_break C 199 2 +complete_genome_single_chrom_2 Sequence_break C 199 2 +genome_single_chrom_larger Sequence_break C 199 2 +genome_single_chrom_larger_rearrange Sequence_break C 199 2 diff --git a/functional_tests/test_data/Resume_all_found_gene_refound_run_fragment/Corrected_gff_files/genome_single_chrom_larger_refound_2_corrected.gff b/functional_tests/test_data/Resume_all_found_gene_refound_run_fragment/Corrected_gff_files/genome_single_chrom_larger_refound_2_corrected.gff new file mode 100644 index 0000000..60239c7 --- /dev/null +++ 
b/functional_tests/test_data/Resume_all_found_gene_refound_run_fragment/Corrected_gff_files/genome_single_chrom_larger_refound_2_corrected.gff @@ -0,0 +1,21 @@ +##gff-version3 +contig_1 . CDS 1 90 . . . ID=tag_0001;Other_info;locus_tag=tag_0001 +contig_1 Panaroo CDS 100 190 . + 0 ID=tag_0008;locus_tag=tag_0008;old_locus_tag=1_refound_1;name=Gene_name_1;annotation=Gene_function_1 +contig_1 . CDS 200 290 . . . ID=tag_0003;Other_info;locus_tag=tag_0003 +contig_1 . CDS 300 390 . . . ID=tag_0004;Other_info;locus_tag=tag_0004 +contig_1 . CDS 400 490 . . . ID=tag_0005;Other_info;locus_tag=tag_0005 +contig_1 . CDS 500 590 . . . ID=tag_0006;Other_info;locus_tag=tag_0006 +contig_1 . CDS 591 592 . . . ID=tag_0007;Other_info;locus_tag=tag_0007 +##FASTA +>contig_1 +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCTCTTCCGATCTAATCAAGAT +TGAGAGGAATTGCTGTTTTTATTGGCAAGACAATTTTACTCTTCCGATCTAATCAAGATT +GAGAGGAATTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + diff --git a/functional_tests/test_data/Resume_all_found_gene_refound_run_fragment/Corrected_gff_files/genome_single_chrom_larger_refound_corrected.gff b/functional_tests/test_data/Resume_all_found_gene_refound_run_fragment/Corrected_gff_files/genome_single_chrom_larger_refound_corrected.gff new file mode 100644 index 0000000..eb81519 --- /dev/null +++ b/functional_tests/test_data/Resume_all_found_gene_refound_run_fragment/Corrected_gff_files/genome_single_chrom_larger_refound_corrected.gff @@ -0,0 +1,21 @@ +##gff-version3 +contig_1 . CDS 1 90 . . . 
ID=tag_0001;Other_info;locus_tag=tag_0001 +contig_1 Panaroo CDS 100 190 . + 0 ID=tag_0008;locus_tag=tag_0008;old_locus_tag=0_refound_0;name=Gene_name;annotation=Gene_function +contig_1 . CDS 200 290 . . . ID=tag_0003;Other_info;locus_tag=tag_0003 +contig_1 . CDS 300 390 . . . ID=tag_0004;Other_info;locus_tag=tag_0004 +contig_1 . CDS 400 490 . . . ID=tag_0005;Other_info;locus_tag=tag_0005 +contig_1 . CDS 500 590 . . . ID=tag_0006;Other_info;locus_tag=tag_0006 +contig_1 . CDS 591 592 . . . ID=tag_0007;Other_info;locus_tag=tag_0007 +##FASTA +>contig_1 +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCTCTTCCGATCTAATCAAGAT +TGAGAGGAATTGCTGTTTTTATTGGCAAGACAATTTTACTCTTCCGATCTAATCAAGATT +GAGAGGAATTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + diff --git a/functional_tests/test_data/Resume_all_found_gene_refound_run_fragment/core_core_accessory_gene_content.tsv.expected b/functional_tests/test_data/Resume_all_found_gene_refound_run_fragment/core_core_accessory_gene_content.tsv.expected new file mode 100644 index 0000000..9502287 --- /dev/null +++ b/functional_tests/test_data/Resume_all_found_gene_refound_run_fragment/core_core_accessory_gene_content.tsv.expected @@ -0,0 +1,7 @@ +Gff Core_gene_1 Core_gene_2 gene type +genome_single_chrom_larger_rearrange A C D intermediate_frequency +genome_single_chrom_larger_rearrange C E B intermediate_frequency +genome_single_chrom_larger_refound C E D intermediate_frequency +genome_single_chrom_larger_rearrange E Sequence_break F intermediate_frequency +genome_single_chrom_larger_refound E Sequence_break F 
intermediate_frequency +genome_single_chrom_larger_refound E Sequence_break G intermediate_frequency diff --git a/functional_tests/test_data/Resume_all_found_gene_refound_run_fragment/core_pair_summary.csv.expected b/functional_tests/test_data/Resume_all_found_gene_refound_run_fragment/core_pair_summary.csv.expected new file mode 100644 index 0000000..137b761 --- /dev/null +++ b/functional_tests/test_data/Resume_all_found_gene_refound_run_fragment/core_pair_summary.csv.expected @@ -0,0 +1,7 @@ +Core_pair,n,occurrence_core_1,occurrence_core_2,co_occurrence,min_dist,max_dist,mean_dist,median_dist,min_acc,max_acc,mean_acc,median_acc +A-C,3,4,4,4,9,109,42.3,9.0,0,1,0.3,0.0 +A-E,1,4,4,4,9,9,9.0,9.0,0,0,0.0,0.0 +C-E,3,4,4,4,9,109,75.7,109.0,0,1,0.7,1.0 +C-Sequence_break,2,4,0,0,10,310,160.0,160.0,0,0,0.0,0.0 +E-Sequence_break,2,4,0,0,110,110,110.0,110.0,1,2,1.5,1.5 +Sequence_break-A,4,0,4,0,0,0,0.0,0.0,0,0,0.0,0.0 diff --git a/functional_tests/test_data/Resume_all_found_gene_refound_run_fragment/gene_data.csv b/functional_tests/test_data/Resume_all_found_gene_refound_run_fragment/gene_data.csv new file mode 100644 index 0000000..c821e4a --- /dev/null +++ b/functional_tests/test_data/Resume_all_found_gene_refound_run_fragment/gene_data.csv @@ -0,0 +1,3 @@ +gff_file,scaffold_name,clustering_id,annotation_id,prot_sequence,dna_sequence,gene_name,description +genome_single_chrom_larger_refound,contig_1,0_refound_0,0_refound_0,SPQR,CTCTTCCGATCTAATCAAGATTGAGAGGAATTGCTGTTTTTATTGGCAAGACAATTTTACTCTTCCGATCTAATCAAGATTGAGAGGAATT,Gene_name,Gene_function +genome_single_chrom_larger_refound_2,contig_1,1_refound_1,1_refound_1,RQPS,CTCTTCCGATCTAATCAAGATTGAGAGGAATTGCTGTTTTTATTGGCAAGACAATTTTACTCTTCCGATCTAATCAAGATTGAGAGGAATT,Gene_name_1,Gene_function_1 \ No newline at end of file diff --git a/functional_tests/test_data/Resume_all_found_gene_refound_run_fragment/gene_presence_absence_roary.csv 
b/functional_tests/test_data/Resume_all_found_gene_refound_run_fragment/gene_presence_absence_roary.csv new file mode 100644 index 0000000..12775e5 --- /dev/null +++ b/functional_tests/test_data/Resume_all_found_gene_refound_run_fragment/gene_presence_absence_roary.csv @@ -0,0 +1,8 @@ +,,,,,,,,,,,,,,genome_single_chrom_larger_refound,complete_genome_single_chrom_2,genome_single_chrom_larger_refound_2,genome_single_chrom_larger_rearrange +A,,,4,6,1.5,,,,,,,,,tag_0001;0_refound_0,single_comp_2_A,tag_0001;1_refound_1,single_comp_2_A +B,,,2,2,1,,,,,,,,,,,single_comp_B,single_comp_2_B +C,,,4,4,1,,,,,,,,,tag_0003,single_comp_2_C,tag_0003,single_comp_2_C +D,,,3,3,1,,,,,,,,,tag_0004,, tag_0004,single_comp_2_D +E,,,4,4,1,,,,,,,,,tag_0005,single_comp_2_B, tag_0005,single_comp_2_E +F,,,3,3,1,,,,,,,,,tag_0006,, tag_0006,single_comp_2_F +G,,,2,2,1,,,,,,,,,tag_0007,, tag_0007, \ No newline at end of file diff --git a/functional_tests/test_data/Resume_all_found_gene_refound_run_fragment/low_frequency_gene_placement.tsv.expected b/functional_tests/test_data/Resume_all_found_gene_refound_run_fragment/low_frequency_gene_placement.tsv.expected new file mode 100644 index 0000000..08e03ea --- /dev/null +++ b/functional_tests/test_data/Resume_all_found_gene_refound_run_fragment/low_frequency_gene_placement.tsv.expected @@ -0,0 +1,16 @@ +Gff Core_gene_1 Core_gene_2 Core_region_size Core_region_accessory_count +genome_single_chrom_larger_rearrange A C 109 1 +genome_single_chrom_larger_refound A C 9 0 +genome_single_chrom_larger_refound_2 A C 9 0 +complete_genome_single_chrom_2 A E 9 0 +complete_genome_single_chrom_2 C E 9 0 +genome_single_chrom_larger_rearrange C E 109 1 +genome_single_chrom_larger_refound C E 109 1 +complete_genome_single_chrom_2 C Sequence_break 10 0 +genome_single_chrom_larger_refound_2 C Sequence_break 310 0 +genome_single_chrom_larger_rearrange E Sequence_break 110 1 +genome_single_chrom_larger_refound E Sequence_break 110 2 +complete_genome_single_chrom_2 
Sequence_break A 0 0 +genome_single_chrom_larger_rearrange Sequence_break A 0 0 +genome_single_chrom_larger_refound Sequence_break A 0 0 +genome_single_chrom_larger_refound_2 Sequence_break A 0 0 diff --git a/functional_tests/test_data/Resume_refound_gene/Corrected_gff_files/genome_single_chrom_larger_refound_2_corrected.gff b/functional_tests/test_data/Resume_refound_gene/Corrected_gff_files/genome_single_chrom_larger_refound_2_corrected.gff new file mode 100644 index 0000000..60239c7 --- /dev/null +++ b/functional_tests/test_data/Resume_refound_gene/Corrected_gff_files/genome_single_chrom_larger_refound_2_corrected.gff @@ -0,0 +1,21 @@ +##gff-version3 +contig_1 . CDS 1 90 . . . ID=tag_0001;Other_info;locus_tag=tag_0001 +contig_1 Panaroo CDS 100 190 . + 0 ID=tag_0008;locus_tag=tag_0008;old_locus_tag=1_refound_1;name=Gene_name_1;annotation=Gene_function_1 +contig_1 . CDS 200 290 . . . ID=tag_0003;Other_info;locus_tag=tag_0003 +contig_1 . CDS 300 390 . . . ID=tag_0004;Other_info;locus_tag=tag_0004 +contig_1 . CDS 400 490 . . . ID=tag_0005;Other_info;locus_tag=tag_0005 +contig_1 . CDS 500 590 . . . ID=tag_0006;Other_info;locus_tag=tag_0006 +contig_1 . CDS 591 592 . . . 
ID=tag_0007;Other_info;locus_tag=tag_0007 +##FASTA +>contig_1 +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCTCTTCCGATCTAATCAAGAT +TGAGAGGAATTGCTGTTTTTATTGGCAAGACAATTTTACTCTTCCGATCTAATCAAGATT +GAGAGGAATTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + diff --git a/functional_tests/test_data/Resume_refound_gene/Corrected_gff_files/genome_single_chrom_larger_refound_corrected.gff b/functional_tests/test_data/Resume_refound_gene/Corrected_gff_files/genome_single_chrom_larger_refound_corrected.gff new file mode 100644 index 0000000..eb81519 --- /dev/null +++ b/functional_tests/test_data/Resume_refound_gene/Corrected_gff_files/genome_single_chrom_larger_refound_corrected.gff @@ -0,0 +1,21 @@ +##gff-version3 +contig_1 . CDS 1 90 . . . ID=tag_0001;Other_info;locus_tag=tag_0001 +contig_1 Panaroo CDS 100 190 . + 0 ID=tag_0008;locus_tag=tag_0008;old_locus_tag=0_refound_0;name=Gene_name;annotation=Gene_function +contig_1 . CDS 200 290 . . . ID=tag_0003;Other_info;locus_tag=tag_0003 +contig_1 . CDS 300 390 . . . ID=tag_0004;Other_info;locus_tag=tag_0004 +contig_1 . CDS 400 490 . . . ID=tag_0005;Other_info;locus_tag=tag_0005 +contig_1 . CDS 500 590 . . . ID=tag_0006;Other_info;locus_tag=tag_0006 +contig_1 . CDS 591 592 . . . 
ID=tag_0007;Other_info;locus_tag=tag_0007 +##FASTA +>contig_1 +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCTCTTCCGATCTAATCAAGAT +TGAGAGGAATTGCTGTTTTTATTGGCAAGACAATTTTACTCTTCCGATCTAATCAAGATT +GAGAGGAATTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + diff --git a/functional_tests/test_data/Resume_refound_gene/core_core_accessory_gene_content.tsv.expected b/functional_tests/test_data/Resume_refound_gene/core_core_accessory_gene_content.tsv.expected new file mode 100644 index 0000000..974f506 --- /dev/null +++ b/functional_tests/test_data/Resume_refound_gene/core_core_accessory_gene_content.tsv.expected @@ -0,0 +1,6 @@ +Gff Core_gene_1 Core_gene_2 gene type +genome_single_chrom_larger_rearrange A C D intermediate_frequency +genome_single_chrom_larger_refound C E D intermediate_frequency +genome_single_chrom_larger_rearrange E Sequence_break F intermediate_frequency +genome_single_chrom_larger_refound E Sequence_break F intermediate_frequency +genome_single_chrom_larger_refound E Sequence_break G intermediate_frequency diff --git a/functional_tests/test_data/Resume_refound_gene/core_pair_summary.csv.expected b/functional_tests/test_data/Resume_refound_gene/core_pair_summary.csv.expected new file mode 100644 index 0000000..c8e3646 --- /dev/null +++ b/functional_tests/test_data/Resume_refound_gene/core_pair_summary.csv.expected @@ -0,0 +1,10 @@ +Core_pair,n,occurrence_core_1,occurrence_core_2,co_occurrence,min_dist,max_dist,mean_dist,median_dist,min_acc,max_acc,mean_acc,median_acc +A-B,2,4,4,4,9,9,9.0,9.0,0,0,0.0,0.0 
+A-C,1,4,4,4,109,109,109.0,109.0,1,1,1.0,1.0 +A-E,1,4,4,4,9,9,9.0,9.0,0,0,0.0,0.0 +B-C,3,4,4,4,9,9,9.0,9.0,0,0,0.0,0.0 +B-E,1,4,4,4,9,9,9.0,9.0,0,0,0.0,0.0 +C-E,2,4,4,4,9,109,59.0,59.0,0,1,0.5,0.5 +C-Sequence_break,2,4,0,0,10,310,160.0,160.0,0,0,0.0,0.0 +E-Sequence_break,2,4,0,0,110,110,110.0,110.0,1,2,1.5,1.5 +Sequence_break-A,4,0,4,0,0,0,0.0,0.0,0,0,0.0,0.0 diff --git a/functional_tests/test_data/Resume_refound_gene/gene_data.csv b/functional_tests/test_data/Resume_refound_gene/gene_data.csv new file mode 100644 index 0000000..c821e4a --- /dev/null +++ b/functional_tests/test_data/Resume_refound_gene/gene_data.csv @@ -0,0 +1,3 @@ +gff_file,scaffold_name,clustering_id,annotation_id,prot_sequence,dna_sequence,gene_name,description +genome_single_chrom_larger_refound,contig_1,0_refound_0,0_refound_0,SPQR,CTCTTCCGATCTAATCAAGATTGAGAGGAATTGCTGTTTTTATTGGCAAGACAATTTTACTCTTCCGATCTAATCAAGATTGAGAGGAATT,Gene_name,Gene_function +genome_single_chrom_larger_refound_2,contig_1,1_refound_1,1_refound_1,RQPS,CTCTTCCGATCTAATCAAGATTGAGAGGAATTGCTGTTTTTATTGGCAAGACAATTTTACTCTTCCGATCTAATCAAGATTGAGAGGAATT,Gene_name_1,Gene_function_1 \ No newline at end of file diff --git a/functional_tests/test_data/Resume_refound_gene/gene_presence_absence_roary.csv b/functional_tests/test_data/Resume_refound_gene/gene_presence_absence_roary.csv new file mode 100644 index 0000000..750d6c5 --- /dev/null +++ b/functional_tests/test_data/Resume_refound_gene/gene_presence_absence_roary.csv @@ -0,0 +1,8 @@ +,,,,,,,,,,,,,,genome_single_chrom_larger_refound,complete_genome_single_chrom_2,genome_single_chrom_larger_refound_2,genome_single_chrom_larger_rearrange +A,,,4,4,1,,,,,,,,,tag_0001,single_comp_2_A,tag_0001,single_comp_2_A +B,,,4,4,1,,,,,,,,,0_refound_0,single_comp_B,1_refound_1,single_comp_2_B +C,,,4,4,1,,,,,,,,,tag_0003,single_comp_2_C,tag_0003,single_comp_2_C +D,,,3,3,1,,,,,,,,,tag_0004,, tag_0004,single_comp_2_D +E,,,4,4,1,,,,,,,,,tag_0005,single_comp_2_B, tag_0005,single_comp_2_E 
+F,,,3,3,1,,,,,,,,,tag_0006,, tag_0006,single_comp_2_F +G,,,2,2,1,,,,,,,,,tag_0007,, tag_0007, \ No newline at end of file diff --git a/functional_tests/test_data/Resume_refound_gene/low_frequency_gene_placement.tsv.expected b/functional_tests/test_data/Resume_refound_gene/low_frequency_gene_placement.tsv.expected new file mode 100644 index 0000000..62da1fa --- /dev/null +++ b/functional_tests/test_data/Resume_refound_gene/low_frequency_gene_placement.tsv.expected @@ -0,0 +1,19 @@ +Gff Core_gene_1 Core_gene_2 Core_region_size Core_region_accessory_count +genome_single_chrom_larger_refound A B 9 0 +genome_single_chrom_larger_refound_2 A B 9 0 +genome_single_chrom_larger_rearrange A C 109 1 +complete_genome_single_chrom_2 A E 9 0 +genome_single_chrom_larger_rearrange B C 9 0 +genome_single_chrom_larger_refound B C 9 0 +genome_single_chrom_larger_refound_2 B C 9 0 +genome_single_chrom_larger_rearrange B E 9 0 +complete_genome_single_chrom_2 C E 9 0 +genome_single_chrom_larger_refound C E 109 1 +complete_genome_single_chrom_2 C Sequence_break 10 0 +genome_single_chrom_larger_refound_2 C Sequence_break 310 0 +genome_single_chrom_larger_rearrange E Sequence_break 110 1 +genome_single_chrom_larger_refound E Sequence_break 110 2 +complete_genome_single_chrom_2 Sequence_break A 0 0 +genome_single_chrom_larger_rearrange Sequence_break A 0 0 +genome_single_chrom_larger_refound Sequence_break A 0 0 +genome_single_chrom_larger_refound_2 Sequence_break A 0 0 diff --git a/functional_tests/test_data/Resume_refound_run_fragment/Corrected_gff_files/genome_single_chrom_larger_refound_2_corrected.gff.expected b/functional_tests/test_data/Resume_refound_run_fragment/Corrected_gff_files/genome_single_chrom_larger_refound_2_corrected.gff.expected new file mode 100644 index 0000000..60239c7 --- /dev/null +++ b/functional_tests/test_data/Resume_refound_run_fragment/Corrected_gff_files/genome_single_chrom_larger_refound_2_corrected.gff.expected @@ -0,0 +1,21 @@ +##gff-version3 +contig_1 . 
CDS 1 90 . . . ID=tag_0001;Other_info;locus_tag=tag_0001 +contig_1 Panaroo CDS 100 190 . + 0 ID=tag_0008;locus_tag=tag_0008;old_locus_tag=1_refound_1;name=Gene_name_1;annotation=Gene_function_1 +contig_1 . CDS 200 290 . . . ID=tag_0003;Other_info;locus_tag=tag_0003 +contig_1 . CDS 300 390 . . . ID=tag_0004;Other_info;locus_tag=tag_0004 +contig_1 . CDS 400 490 . . . ID=tag_0005;Other_info;locus_tag=tag_0005 +contig_1 . CDS 500 590 . . . ID=tag_0006;Other_info;locus_tag=tag_0006 +contig_1 . CDS 591 592 . . . ID=tag_0007;Other_info;locus_tag=tag_0007 +##FASTA +>contig_1 +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCTCTTCCGATCTAATCAAGAT +TGAGAGGAATTGCTGTTTTTATTGGCAAGACAATTTTACTCTTCCGATCTAATCAAGATT +GAGAGGAATTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + diff --git a/functional_tests/test_data/Resume_refound_run_fragment/Corrected_gff_files/genome_single_chrom_larger_refound_corrected.gff b/functional_tests/test_data/Resume_refound_run_fragment/Corrected_gff_files/genome_single_chrom_larger_refound_corrected.gff new file mode 100644 index 0000000..eb81519 --- /dev/null +++ b/functional_tests/test_data/Resume_refound_run_fragment/Corrected_gff_files/genome_single_chrom_larger_refound_corrected.gff @@ -0,0 +1,21 @@ +##gff-version3 +contig_1 . CDS 1 90 . . . ID=tag_0001;Other_info;locus_tag=tag_0001 +contig_1 Panaroo CDS 100 190 . + 0 ID=tag_0008;locus_tag=tag_0008;old_locus_tag=0_refound_0;name=Gene_name;annotation=Gene_function +contig_1 . CDS 200 290 . . . ID=tag_0003;Other_info;locus_tag=tag_0003 +contig_1 . CDS 300 390 . . . 
ID=tag_0004;Other_info;locus_tag=tag_0004 +contig_1 . CDS 400 490 . . . ID=tag_0005;Other_info;locus_tag=tag_0005 +contig_1 . CDS 500 590 . . . ID=tag_0006;Other_info;locus_tag=tag_0006 +contig_1 . CDS 591 592 . . . ID=tag_0007;Other_info;locus_tag=tag_0007 +##FASTA +>contig_1 +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCTCTTCCGATCTAATCAAGAT +TGAGAGGAATTGCTGTTTTTATTGGCAAGACAATTTTACTCTTCCGATCTAATCAAGATT +GAGAGGAATTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + diff --git a/functional_tests/test_data/Resume_refound_run_fragment/core_core_accessory_gene_content.tsv.expected b/functional_tests/test_data/Resume_refound_run_fragment/core_core_accessory_gene_content.tsv.expected new file mode 100644 index 0000000..9502287 --- /dev/null +++ b/functional_tests/test_data/Resume_refound_run_fragment/core_core_accessory_gene_content.tsv.expected @@ -0,0 +1,7 @@ +Gff Core_gene_1 Core_gene_2 gene type +genome_single_chrom_larger_rearrange A C D intermediate_frequency +genome_single_chrom_larger_rearrange C E B intermediate_frequency +genome_single_chrom_larger_refound C E D intermediate_frequency +genome_single_chrom_larger_rearrange E Sequence_break F intermediate_frequency +genome_single_chrom_larger_refound E Sequence_break F intermediate_frequency +genome_single_chrom_larger_refound E Sequence_break G intermediate_frequency diff --git a/functional_tests/test_data/Resume_refound_run_fragment/core_pair_summary.csv.expected b/functional_tests/test_data/Resume_refound_run_fragment/core_pair_summary.csv.expected new file mode 100644 index 
0000000..137b761 --- /dev/null +++ b/functional_tests/test_data/Resume_refound_run_fragment/core_pair_summary.csv.expected @@ -0,0 +1,7 @@ +Core_pair,n,occurrence_core_1,occurrence_core_2,co_occurrence,min_dist,max_dist,mean_dist,median_dist,min_acc,max_acc,mean_acc,median_acc +A-C,3,4,4,4,9,109,42.3,9.0,0,1,0.3,0.0 +A-E,1,4,4,4,9,9,9.0,9.0,0,0,0.0,0.0 +C-E,3,4,4,4,9,109,75.7,109.0,0,1,0.7,1.0 +C-Sequence_break,2,4,0,0,10,310,160.0,160.0,0,0,0.0,0.0 +E-Sequence_break,2,4,0,0,110,110,110.0,110.0,1,2,1.5,1.5 +Sequence_break-A,4,0,4,0,0,0,0.0,0.0,0,0,0.0,0.0 diff --git a/functional_tests/test_data/Resume_refound_run_fragment/gene_data.csv b/functional_tests/test_data/Resume_refound_run_fragment/gene_data.csv new file mode 100644 index 0000000..c821e4a --- /dev/null +++ b/functional_tests/test_data/Resume_refound_run_fragment/gene_data.csv @@ -0,0 +1,3 @@ +gff_file,scaffold_name,clustering_id,annotation_id,prot_sequence,dna_sequence,gene_name,description +genome_single_chrom_larger_refound,contig_1,0_refound_0,0_refound_0,SPQR,CTCTTCCGATCTAATCAAGATTGAGAGGAATTGCTGTTTTTATTGGCAAGACAATTTTACTCTTCCGATCTAATCAAGATTGAGAGGAATT,Gene_name,Gene_function +genome_single_chrom_larger_refound_2,contig_1,1_refound_1,1_refound_1,RQPS,CTCTTCCGATCTAATCAAGATTGAGAGGAATTGCTGTTTTTATTGGCAAGACAATTTTACTCTTCCGATCTAATCAAGATTGAGAGGAATT,Gene_name_1,Gene_function_1 \ No newline at end of file diff --git a/functional_tests/test_data/Resume_refound_run_fragment/gene_presence_absence_roary.csv b/functional_tests/test_data/Resume_refound_run_fragment/gene_presence_absence_roary.csv new file mode 100644 index 0000000..12775e5 --- /dev/null +++ b/functional_tests/test_data/Resume_refound_run_fragment/gene_presence_absence_roary.csv @@ -0,0 +1,8 @@ +,,,,,,,,,,,,,,genome_single_chrom_larger_refound,complete_genome_single_chrom_2,genome_single_chrom_larger_refound_2,genome_single_chrom_larger_rearrange +A,,,4,6,1.5,,,,,,,,,tag_0001;0_refound_0,single_comp_2_A,tag_0001;1_refound_1,single_comp_2_A 
+B,,,2,2,1,,,,,,,,,,,single_comp_B,single_comp_2_B +C,,,4,4,1,,,,,,,,,tag_0003,single_comp_2_C,tag_0003,single_comp_2_C +D,,,3,3,1,,,,,,,,,tag_0004,, tag_0004,single_comp_2_D +E,,,4,4,1,,,,,,,,,tag_0005,single_comp_2_B, tag_0005,single_comp_2_E +F,,,3,3,1,,,,,,,,,tag_0006,, tag_0006,single_comp_2_F +G,,,2,2,1,,,,,,,,,tag_0007,, tag_0007, \ No newline at end of file diff --git a/functional_tests/test_data/Resume_refound_run_fragment/low_frequency_gene_placement.tsv.expected b/functional_tests/test_data/Resume_refound_run_fragment/low_frequency_gene_placement.tsv.expected new file mode 100644 index 0000000..08e03ea --- /dev/null +++ b/functional_tests/test_data/Resume_refound_run_fragment/low_frequency_gene_placement.tsv.expected @@ -0,0 +1,16 @@ +Gff Core_gene_1 Core_gene_2 Core_region_size Core_region_accessory_count +genome_single_chrom_larger_rearrange A C 109 1 +genome_single_chrom_larger_refound A C 9 0 +genome_single_chrom_larger_refound_2 A C 9 0 +complete_genome_single_chrom_2 A E 9 0 +complete_genome_single_chrom_2 C E 9 0 +genome_single_chrom_larger_rearrange C E 109 1 +genome_single_chrom_larger_refound C E 109 1 +complete_genome_single_chrom_2 C Sequence_break 10 0 +genome_single_chrom_larger_refound_2 C Sequence_break 310 0 +genome_single_chrom_larger_rearrange E Sequence_break 110 1 +genome_single_chrom_larger_refound E Sequence_break 110 2 +complete_genome_single_chrom_2 Sequence_break A 0 0 +genome_single_chrom_larger_rearrange Sequence_break A 0 0 +genome_single_chrom_larger_refound Sequence_break A 0 0 +genome_single_chrom_larger_refound_2 Sequence_break A 0 0 diff --git a/functional_tests/test_data/fragmented_refound_core_gene/gene_data.csv b/functional_tests/test_data/fragmented_refound_core_gene/gene_data.csv new file mode 100644 index 0000000..1f1531f --- /dev/null +++ b/functional_tests/test_data/fragmented_refound_core_gene/gene_data.csv @@ -0,0 +1 @@ 
+genome_single_chrom_larger_refound,contig_1,0_refound_0,0_refound_0,SPQR,CTCTTCCGATCTAATCAAGATTGAGAGGAATTGCTGTTTTTATTGGCAAGACAATTTTACTCTTCCGATCTAATCAAGATTGAGAGGAATT,Gene_name,Gene_function \ No newline at end of file diff --git a/functional_tests/test_data/fragmented_refound_core_gene/gene_presence_absence_roary.csv b/functional_tests/test_data/fragmented_refound_core_gene/gene_presence_absence_roary.csv new file mode 100644 index 0000000..19b3ebb --- /dev/null +++ b/functional_tests/test_data/fragmented_refound_core_gene/gene_presence_absence_roary.csv @@ -0,0 +1,8 @@ +,,,,,,,,,,,,,,genome_single_chrom_larger_refound,complete_genome_single_chrom_2,complete_genome_single_chrom,genome_single_chrom_larger_rearrange +A,,,4,5,1.25,,,,,,,,,tag_0001;0_refound_0,single_comp_2_A,single_comp_A,single_comp_2_A +B,,,2,2,1,,,,,,,,,,,single_comp_B,single_comp_2_B +C,,,4,4,1,,,,,,,,,tag_0003,single_comp_2_C,single_comp_C,single_comp_2_C +D,,,1,1,1,,,,,,,,,tag_0004,,,single_comp_2_D +E,,,3,3,1,,,,,,,,,tag_0005,single_comp_2_B,,single_comp_2_E +F,,,2,2,1,,,,,,,,,tag_0006,,,single_comp_2_F +G,,,1,1,1,,,,,,,,,tag_0007,,, \ No newline at end of file diff --git a/functional_tests/test_data/fragmented_refound_core_gene_expected/Corrected_gff_files/genome_single_chrom_larger_refound_corrected.gff.expected b/functional_tests/test_data/fragmented_refound_core_gene_expected/Corrected_gff_files/genome_single_chrom_larger_refound_corrected.gff.expected new file mode 100644 index 0000000..eb81519 --- /dev/null +++ b/functional_tests/test_data/fragmented_refound_core_gene_expected/Corrected_gff_files/genome_single_chrom_larger_refound_corrected.gff.expected @@ -0,0 +1,21 @@ +##gff-version3 +contig_1 . CDS 1 90 . . . ID=tag_0001;Other_info;locus_tag=tag_0001 +contig_1 Panaroo CDS 100 190 . + 0 ID=tag_0008;locus_tag=tag_0008;old_locus_tag=0_refound_0;name=Gene_name;annotation=Gene_function +contig_1 . CDS 200 290 . . . ID=tag_0003;Other_info;locus_tag=tag_0003 +contig_1 . CDS 300 390 . . . 
ID=tag_0004;Other_info;locus_tag=tag_0004 +contig_1 . CDS 400 490 . . . ID=tag_0005;Other_info;locus_tag=tag_0005 +contig_1 . CDS 500 590 . . . ID=tag_0006;Other_info;locus_tag=tag_0006 +contig_1 . CDS 591 592 . . . ID=tag_0007;Other_info;locus_tag=tag_0007 +##FASTA +>contig_1 +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCTCTTCCGATCTAATCAAGAT +TGAGAGGAATTGCTGTTTTTATTGGCAAGACAATTTTACTCTTCCGATCTAATCAAGATT +GAGAGGAATTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + diff --git a/functional_tests/test_data/fragmented_refound_core_gene_expected/core_core_accessory_gene_content.tsv.expected b/functional_tests/test_data/fragmented_refound_core_gene_expected/core_core_accessory_gene_content.tsv.expected new file mode 100644 index 0000000..521eb93 --- /dev/null +++ b/functional_tests/test_data/fragmented_refound_core_gene_expected/core_core_accessory_gene_content.tsv.expected @@ -0,0 +1,11 @@ +Gff Core_gene_1 Core_gene_2 gene type +complete_genome_single_chrom A C B intermediate_frequency +complete_genome_single_chrom_2 A C E intermediate_frequency +genome_single_chrom_larger_rearrange A C D low_frequency +genome_single_chrom_larger_rearrange C Sequence_break B intermediate_frequency +genome_single_chrom_larger_rearrange C Sequence_break E intermediate_frequency +genome_single_chrom_larger_rearrange C Sequence_break F intermediate_frequency +genome_single_chrom_larger_refound C Sequence_break E intermediate_frequency +genome_single_chrom_larger_refound C Sequence_break F intermediate_frequency +genome_single_chrom_larger_refound C Sequence_break D 
low_frequency +genome_single_chrom_larger_refound C Sequence_break G low_frequency diff --git a/functional_tests/test_data/fragmented_refound_core_gene_expected/core_pair_summary.csv.expected b/functional_tests/test_data/fragmented_refound_core_gene_expected/core_pair_summary.csv.expected new file mode 100644 index 0000000..befdd38 --- /dev/null +++ b/functional_tests/test_data/fragmented_refound_core_gene_expected/core_pair_summary.csv.expected @@ -0,0 +1,4 @@ +Core_pair,n,occurrence_core_1,occurrence_core_2,co_occurrence,min_dist,max_dist,mean_dist,median_dist,min_acc,max_acc,mean_acc,median_acc +A-C,4,4,4,4,9,109,84.0,109.0,0,1,0.8,1.0 +C-Sequence_break,4,4,0,0,10,310,160.0,160.0,0,4,1.8,1.5 +Sequence_break-A,4,0,4,0,0,0,0.0,0.0,0,0,0.0,0.0 diff --git a/functional_tests/test_data/fragmented_refound_core_gene_expected/low_frequency_gene_placement.tsv.expected b/functional_tests/test_data/fragmented_refound_core_gene_expected/low_frequency_gene_placement.tsv.expected new file mode 100644 index 0000000..5a78a0a --- /dev/null +++ b/functional_tests/test_data/fragmented_refound_core_gene_expected/low_frequency_gene_placement.tsv.expected @@ -0,0 +1,13 @@ +Gff Core_gene_1 Core_gene_2 Core_region_size Core_region_accessory_count +complete_genome_single_chrom A C 109 1 +complete_genome_single_chrom_2 A C 109 1 +genome_single_chrom_larger_rearrange A C 109 1 +genome_single_chrom_larger_refound A C 9 0 +complete_genome_single_chrom C Sequence_break 10 0 +complete_genome_single_chrom_2 C Sequence_break 10 0 +genome_single_chrom_larger_rearrange C Sequence_break 310 3 +genome_single_chrom_larger_refound C Sequence_break 310 4 +complete_genome_single_chrom Sequence_break A 0 0 +complete_genome_single_chrom_2 Sequence_break A 0 0 +genome_single_chrom_larger_rearrange Sequence_break A 0 0 +genome_single_chrom_larger_refound Sequence_break A 0 0 diff --git a/functional_tests/test_data/genome_single_chrom_larger_refound.gff 
b/functional_tests/test_data/genome_single_chrom_larger_refound.gff new file mode 100644 index 0000000..7185660 --- /dev/null +++ b/functional_tests/test_data/genome_single_chrom_larger_refound.gff @@ -0,0 +1,10 @@ +##gff-version3 +contig_1 . CDS 1 90 . . . ID=tag_0001;Other_info;locus_tag=tag_0001 +contig_1 . CDS 200 290 . . . ID=tag_0003;Other_info;locus_tag=tag_0003 +contig_1 . CDS 300 390 . . . ID=tag_0004;Other_info;locus_tag=tag_0004 +contig_1 . CDS 400 490 . . . ID=tag_0005;Other_info;locus_tag=tag_0005 +contig_1 . CDS 500 590 . . . ID=tag_0006;Other_info;locus_tag=tag_0006 +contig_1 . CDS 591 592 . . . ID=tag_0007;Other_info;locus_tag=tag_0007 +##FASTA +>contig_1 +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCTCTTCCGATCTAATCAAGATTGAGAGGAATTGCTGTTTTTATTGGCAAGACAATTTTACTCTTCCGATCTAATCAAGATTGAGAGGAATTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN \ No newline at end of file diff --git a/functional_tests/test_data/genome_single_chrom_larger_refound_2.gff b/functional_tests/test_data/genome_single_chrom_larger_refound_2.gff new file mode 100644 index 0000000..7185660 --- /dev/null +++ b/functional_tests/test_data/genome_single_chrom_larger_refound_2.gff @@ -0,0 +1,10 @@ +##gff-version3 +contig_1 . CDS 1 90 . . . ID=tag_0001;Other_info;locus_tag=tag_0001 +contig_1 . CDS 200 290 . . . ID=tag_0003;Other_info;locus_tag=tag_0003 +contig_1 . CDS 300 390 . . . ID=tag_0004;Other_info;locus_tag=tag_0004 +contig_1 . CDS 400 490 . . . ID=tag_0005;Other_info;locus_tag=tag_0005 +contig_1 . CDS 500 590 . . . ID=tag_0006;Other_info;locus_tag=tag_0006 +contig_1 . 
CDS 591 592 . . . ID=tag_0007;Other_info;locus_tag=tag_0007 +##FASTA +>contig_1 +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCTCTTCCGATCTAATCAAGATTGAGAGGAATTGCTGTTTTTATTGGCAAGACAATTTTACTCTTCCGATCTAATCAAGATTGAGAGGAATTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN \ No newline at end of file From a474d744ecb74f77d14cc9d25e9df15f9223490f Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Wed, 26 Jan 2022 13:59:53 +1100 Subject: [PATCH 085/135] Add in so that non corrected genomes are returned with paths from prepair reannotation. Remove ID feature selection when looking for feature between gene fragments --- Corekaburra/correct_gffs.py | 5 +++-- Corekaburra/parse_gene_presence_absence.py | 4 +--- unit_tests/Corekaburra_test.py | 9 ++++++--- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/Corekaburra/correct_gffs.py b/Corekaburra/correct_gffs.py index a802c9d..f0d809f 100644 --- a/Corekaburra/correct_gffs.py +++ b/Corekaburra/correct_gffs.py @@ -71,7 +71,8 @@ def prepair_for_reannotation(gene_data_path, output_folder, gffs, logger): os.mkdir(corrected_gff_out_dir) except FileExistsError: # Get path for input - input_path = os.path.split(gffs[0])[0] + input_path_dict = {os.path.basename(gff): os.path.split(gff)[0] for gff in gffs} + # input_path = os.path.split(gffs[0])[0] corrected_folder_content = os.listdir(corrected_gff_out_dir) @@ -84,7 +85,7 @@ def prepair_for_reannotation(gene_data_path, output_folder, gffs, logger): if len(corrected_files) > 0: gffs = [file for file in gff_names if 
f'{file.replace(".gff", "")}_corrected.gff' not in corrected_files] - gffs = [os.path.join(input_path, gff) for gff in gffs] + gffs = [os.path.join(input_path_dict[gff], gff) for gff in gffs] gffs = gffs + corrected_files_w_path return gene_data_dict, corrected_gff_out_dir, gffs diff --git a/Corekaburra/parse_gene_presence_absence.py b/Corekaburra/parse_gene_presence_absence.py index e8dc131..4b4764b 100644 --- a/Corekaburra/parse_gene_presence_absence.py +++ b/Corekaburra/parse_gene_presence_absence.py @@ -53,7 +53,6 @@ def check_fragmented_gene(fragment_info, input_gffs, tmp_folder_path, gene_data_ if f"{gff}_corrected" in [os.path.basename(gff_name), os.path.basename(gff_name).rsplit('.', 1)[0], os.path.basename(gff_name).rsplit('.', 1)[0].rsplit('.', 1)[0]]][0] - print('HERE') except IndexError: pass @@ -116,14 +115,13 @@ def check_fragmented_gene(fragment_info, input_gffs, tmp_folder_path, gene_data_ region = (first_fragment_contig, min_frag_coor, max_frag_coor) # Find all features that are completely within the region - region_features = gff_database.region(region=region, completely_within=True, featuretype=['ID']) + region_features = gff_database.region(region=region, completely_within=True) # Find if some pieces are refound and change old_locus_tag to ID refound_pieces = [[i, fragment_piece] for i, fragment_piece in enumerate(fragment_pieces) if 'refound' in fragment_piece] if refound_pieces: for i, piece in refound_pieces: fragment_pieces[i] = gff_database[piece]['ID'][0] - # find all genes that are not part of the fragmented gene region_locus_tags = set([feature[8]['locus_tag'][0] for feature in region_features]) excess_genes = region_locus_tags.difference(fragment_pieces) diff --git a/unit_tests/Corekaburra_test.py b/unit_tests/Corekaburra_test.py index 313b5fb..0d386ff 100644 --- a/unit_tests/Corekaburra_test.py +++ b/unit_tests/Corekaburra_test.py @@ -892,13 +892,15 @@ def test_no_files_annotated(self): self.assertEqual(input_gffs, 
corrected_files_return) def test_some_files_annotated(self): - input_gffs = ['Mock_1.gff', 'Mock_2.gff'] + input_gffs = ['mock/test/path/Mock_1.gff', 'mock/test/path/Mock_2.gff', 'Mocky/mock/mock/path/Mock_3.gff'] gene_data_dict_return, corrected_gff_out_dir_return, corrected_files_return = correct_gffs.prepair_for_reannotation( 'TestPrepairForReannotation/Mock_gene_data.csv', 'TestPrepairForReannotation/Some_genomes', input_gffs, self.logger) - expected_gffs = ['Mock_2.gff', 'Mock_1_corrected.gff'] + expected_gffs = ['mock/test/path/Mock_2.gff', + 'Mocky/mock/mock/path/Mock_3.gff', + 'TestPrepairForReannotation/Some_genomes/Corrected_gff_files/Mock_1_corrected.gff'] self.assertEqual(expected_gffs, corrected_files_return) @@ -909,7 +911,8 @@ def test_all_files_annotated(self): 'TestPrepairForReannotation/All_genomes', input_gffs, self.logger) - expected_gffs = ['Mock_1_corrected.gff', 'Mock_2_corrected.gff'] + expected_gffs = ['TestPrepairForReannotation/All_genomes/Corrected_gff_files/Mock_1_corrected.gff', + 'TestPrepairForReannotation/All_genomes/Corrected_gff_files/Mock_2_corrected.gff'] self.assertEqual(expected_gffs, corrected_files_return) From eea7d76dfe4c6a5e1b0aaa21cb5ab9ea4b85d4cf Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Wed, 26 Jan 2022 14:59:02 +1100 Subject: [PATCH 086/135] Change result for funcitonal tests with new changes --- functional_tests/Corekaburra-test.sh | 20 +++++++++---------- .../core_pair_summary.csv.expected | 4 ++-- .../low_frequency_gene_placement.tsv.expected | 8 ++++---- .../core_pair_summary.csv.expected | 4 ++-- .../low_frequency_gene_placement.tsv.expected | 10 +++++----- .../core_pair_summary.csv.expected | 4 ++-- .../low_frequency_gene_placement.tsv.expected | 6 +++--- .../core_pair_summary.csv.expected | 2 +- .../low_frequency_gene_placement.tsv.expected | 6 +++--- .../core_pair_summary.csv.expected | 4 ++-- .../low_frequency_gene_placement.tsv.expected | 12 +++++------ 
.../core_pair_summary.csv.expected | 4 ++-- .../low_frequency_gene_placement.tsv.expected | 6 +++--- .../core_pair_summary.csv.expected | 4 ++-- .../low_frequency_gene_placement.tsv.expected | 4 ++-- .../core_pair_summary.csv.expected | 4 ++-- .../low_frequency_gene_placement.tsv.expected | 4 ++-- .../core_pair_summary.csv.expected | 2 +- .../low_frequency_gene_placement.tsv.expected | 6 +++--- .../core_pair_summary.csv.expected | 2 +- .../low_frequency_gene_placement.tsv.expected | 4 ++-- .../core_pair_summary.csv.expected | 4 ++-- .../low_frequency_gene_placement.tsv.expected | 6 +++--- ...e_core_accessory_gene_content.tsv.expected | 3 ++- .../core_pair_summary.csv.expected | 5 +++-- .../low_frequency_gene_placement.tsv.expected | 5 +++-- 26 files changed, 73 insertions(+), 70 deletions(-) diff --git a/functional_tests/Corekaburra-test.sh b/functional_tests/Corekaburra-test.sh index 40fc5c6..ce81743 100755 --- a/functional_tests/Corekaburra-test.sh +++ b/functional_tests/Corekaburra-test.sh @@ -376,10 +376,10 @@ rm -r test_out_folder # TODO - Test with a genome that have been corrected and one that have not - with fragmented refound gene (Resume run) call_new_test "Test with a genome that have been corrected and one that have not (Resume run)" Corekaburra -ig genome_single_chrom_larger_refound_2.gff genome_single_chrom_larger_refound.gff genome_single_chrom_larger_rearrange.gff complete_genome_single_chrom_2.gff -ip Resume_refound_run_fragment/ -o Resume_refound_run_fragment/ > /dev/null 2>&1 -test_output_file Resume_refound_run_fragment/core_core_accessory_gene_content.tsv fragmented_refound_core_gene_expected/core_core_accessory_gene_content.tsv.expected -test_output_file Resume_refound_run_fragment/low_frequency_gene_placement.tsv fragmented_refound_core_gene_expected/low_frequency_gene_placement.tsv.expected -test_output_file Resume_refound_run_fragment/core_pair_summary.csv fragmented_refound_core_gene_expected/core_pair_summary.csv.expected 
-test_output_file Resume_refound_run_fragment/Corrected_gff_files/genome_single_chrom_larger_refound_2_corrected.gff fragmented_refound_core_gene_expected/Corrected_gff_files/genome_single_chrom_larger_refound_2_corrected.gff.expected +test_output_file Resume_refound_run_fragment/core_core_accessory_gene_content.tsv Resume_refound_run_fragment/core_core_accessory_gene_content.tsv.expected +test_output_file Resume_refound_run_fragment/low_frequency_gene_placement.tsv Resume_refound_run_fragment/low_frequency_gene_placement.tsv.expected +test_output_file Resume_refound_run_fragment/core_pair_summary.csv Resume_refound_run_fragment/core_pair_summary.csv.expected +test_output_file Resume_refound_run_fragment/Corrected_gff_files/genome_single_chrom_larger_refound_2_corrected.gff Resume_refound_run_fragment/Corrected_gff_files/genome_single_chrom_larger_refound_2_corrected.gff.expected rm Resume_refound_run_fragment/Corrected_gff_files/genome_single_chrom_larger_refound_2_corrected.gff rm Resume_refound_run_fragment/low_frequency_gene_placement.tsv rm Resume_refound_run_fragment/core_core_accessory_gene_content.tsv @@ -389,9 +389,9 @@ rm Resume_refound_run_fragment/Corekaburra.log # TODO!! 
- Test with all genomes that have been corrected (Resume run) call_new_test "Test with all genomes that have been corrected (Resume run)" Corekaburra -ig genome_single_chrom_larger_refound_2.gff genome_single_chrom_larger_refound.gff genome_single_chrom_larger_rearrange.gff complete_genome_single_chrom_2.gff -ip Resume_all_found_gene_refound_run_fragment/ -o Resume_all_found_gene_refound_run_fragment/ > /dev/null 2>&1 -test_output_file Resume_all_found_gene_refound_run_fragment/core_core_accessory_gene_content.tsv fragmented_refound_core_gene_expected/core_core_accessory_gene_content.tsv.expected -test_output_file Resume_all_found_gene_refound_run_fragment/low_frequency_gene_placement.tsv fragmented_refound_core_gene_expected/low_frequency_gene_placement.tsv.expected -test_output_file Resume_all_found_gene_refound_run_fragment/core_pair_summary.csv fragmented_refound_core_gene_expected/core_pair_summary.csv.expected +test_output_file Resume_all_found_gene_refound_run_fragment/core_core_accessory_gene_content.tsv Resume_all_found_gene_refound_run_fragment/core_core_accessory_gene_content.tsv.expected +test_output_file Resume_all_found_gene_refound_run_fragment/low_frequency_gene_placement.tsv Resume_all_found_gene_refound_run_fragment/low_frequency_gene_placement.tsv.expected +test_output_file Resume_all_found_gene_refound_run_fragment/core_pair_summary.csv Resume_all_found_gene_refound_run_fragment/core_pair_summary.csv.expected rm Resume_all_found_gene_refound_run_fragment/low_frequency_gene_placement.tsv rm Resume_all_found_gene_refound_run_fragment/core_core_accessory_gene_content.tsv rm Resume_all_found_gene_refound_run_fragment/core_pair_summary.csv @@ -400,9 +400,9 @@ rm Resume_all_found_gene_refound_run_fragment/Corekaburra.log # TODO - Test recognition of corrected gff files in output folder (Resume run) call_new_test "Test recognition of corrected gff files in output folder (Resume run)" Corekaburra -ig genome_single_chrom_larger_refound_2.gff 
genome_single_chrom_larger_refound.gff genome_single_chrom_larger_rearrange.gff complete_genome_single_chrom_2.gff -ip Resume_refound_gene/ -o Resume_refound_gene/ > /dev/null 2>&1 -test_output_file Resume_refound_gene/core_core_accessory_gene_content.tsv fragmented_refound_core_gene_expected/core_core_accessory_gene_content.tsv.expected -test_output_file Resume_refound_gene/low_frequency_gene_placement.tsv fragmented_refound_core_gene_expected/low_frequency_gene_placement.tsv.expected -test_output_file Resume_refound_gene/core_pair_summary.csv fragmented_refound_core_gene_expected/core_pair_summary.csv.expected +test_output_file Resume_refound_gene/core_core_accessory_gene_content.tsv Resume_refound_gene/core_core_accessory_gene_content.tsv.expected +test_output_file Resume_refound_gene/low_frequency_gene_placement.tsv Resume_refound_gene/low_frequency_gene_placement.tsv.expected +test_output_file Resume_refound_gene/core_pair_summary.csv Resume_refound_gene/core_pair_summary.csv.expected rm Resume_refound_gene/low_frequency_gene_placement.tsv rm Resume_refound_gene/core_core_accessory_gene_content.tsv rm Resume_refound_gene/core_pair_summary.csv diff --git a/functional_tests/test_data/Fragmented_accessory_gene_run_expected/core_pair_summary.csv.expected b/functional_tests/test_data/Fragmented_accessory_gene_run_expected/core_pair_summary.csv.expected index f4afb24..3cae9f5 100644 --- a/functional_tests/test_data/Fragmented_accessory_gene_run_expected/core_pair_summary.csv.expected +++ b/functional_tests/test_data/Fragmented_accessory_gene_run_expected/core_pair_summary.csv.expected @@ -1,4 +1,4 @@ Core_pair,n,occurrence_core_1,occurrence_core_2,co_occurrence,min_dist,max_dist,mean_dist,median_dist,min_acc,max_acc,mean_acc,median_acc A-C,4,4,4,4,109,109,109.0,109.0,1,1,1.0,1.0 -A-Sequence_break,4,4,0,0,0,0,0.0,0.0,0,0,0.0,0.0 -C-Sequence_break,4,4,0,0,10,310,160.0,160.0,0,3,1.5,1.5 +C-Sequence_break,4,4,0,0,10,310,160.0,160.0,0,3,1.5,1.5 
+Sequence_break-A,4,4,0,0,0,0,0.0,0.0,0,0,0.0,0.0 diff --git a/functional_tests/test_data/Fragmented_accessory_gene_run_expected/low_frequency_gene_placement.tsv.expected b/functional_tests/test_data/Fragmented_accessory_gene_run_expected/low_frequency_gene_placement.tsv.expected index e8df429..530a15b 100644 --- a/functional_tests/test_data/Fragmented_accessory_gene_run_expected/low_frequency_gene_placement.tsv.expected +++ b/functional_tests/test_data/Fragmented_accessory_gene_run_expected/low_frequency_gene_placement.tsv.expected @@ -3,11 +3,11 @@ complete_genome_single_chrom A C 109 1 complete_genome_single_chrom_2 A C 109 1 genome_single_chrom_larger A C 109 1 genome_single_chrom_larger_rearrange A C 109 1 -complete_genome_single_chrom A Sequence_break 0 0 -complete_genome_single_chrom_2 A Sequence_break 0 0 -genome_single_chrom_larger A Sequence_break 0 0 -genome_single_chrom_larger_rearrange A Sequence_break 0 0 complete_genome_single_chrom C Sequence_break 10 0 complete_genome_single_chrom_2 C Sequence_break 10 0 genome_single_chrom_larger C Sequence_break 310 3 genome_single_chrom_larger_rearrange C Sequence_break 310 3 +complete_genome_single_chrom Sequence_break A 0 0 +complete_genome_single_chrom_2 Sequence_break A 0 0 +genome_single_chrom_larger Sequence_break A 0 0 +genome_single_chrom_larger_rearrange Sequence_break A 0 0 diff --git a/functional_tests/test_data/Fragmented_core_run_expected/core_pair_summary.csv.expected b/functional_tests/test_data/Fragmented_core_run_expected/core_pair_summary.csv.expected index 6167deb..d5a9336 100644 --- a/functional_tests/test_data/Fragmented_core_run_expected/core_pair_summary.csv.expected +++ b/functional_tests/test_data/Fragmented_core_run_expected/core_pair_summary.csv.expected @@ -1,4 +1,4 @@ Core_pair,n,occurrence_core_1,occurrence_core_2,co_occurrence,min_dist,max_dist,mean_dist,median_dist,min_acc,max_acc,mean_acc,median_acc A-C,4,4,4,4,9,109,84.0,109.0,0,1,0.8,1.0 
-A-Sequence_break,4,4,0,0,0,0,0.0,0.0,0,0,0.0,0.0 -C-Sequence_break,4,4,0,0,10,310,160.0,160.0,0,4,1.8,1.5 +C-Sequence_break,4,4,0,0,10,310,160.0,160.0,0,4,1.8,1.5 +A-Sequence_break,4,0,4,0,0,0,0.0,0.0,0,0,0.0,0.0 diff --git a/functional_tests/test_data/Fragmented_core_run_expected/low_frequency_gene_placement.tsv.expected b/functional_tests/test_data/Fragmented_core_run_expected/low_frequency_gene_placement.tsv.expected index 5558b6d..6405fe0 100644 --- a/functional_tests/test_data/Fragmented_core_run_expected/low_frequency_gene_placement.tsv.expected +++ b/functional_tests/test_data/Fragmented_core_run_expected/low_frequency_gene_placement.tsv.expected @@ -3,11 +3,11 @@ complete_genome_single_chrom A C 109 1 complete_genome_single_chrom_2 A C 109 1 genome_single_chrom_larger_2 A C 9 0 genome_single_chrom_larger_rearrange A C 109 1 -complete_genome_single_chrom A Sequence_break 0 0 -complete_genome_single_chrom_2 A Sequence_break 0 0 -genome_single_chrom_larger_2 A Sequence_break 0 0 -genome_single_chrom_larger_rearrange A Sequence_break 0 0 complete_genome_single_chrom C Sequence_break 10 0 complete_genome_single_chrom_2 C Sequence_break 10 0 genome_single_chrom_larger_2 C Sequence_break 310 4 -genome_single_chrom_larger_rearrange C Sequence_break 310 3 +genome_single_chrom_larger_rearrange C Sequence_break 310 3 +complete_genome_single_chrom Sequence_break A 0 0 +complete_genome_single_chrom_2 Sequence_break A 0 0 +genome_single_chrom_larger_2 Sequence_break A 0 0 +genome_single_chrom_larger_rearrange Sequence_break A 0 0 diff --git a/functional_tests/test_data/Increase_low_cutoff_expected/core_pair_summary.csv.expected b/functional_tests/test_data/Increase_low_cutoff_expected/core_pair_summary.csv.expected index 551faf3..45dace7 100644 --- a/functional_tests/test_data/Increase_low_cutoff_expected/core_pair_summary.csv.expected +++ b/functional_tests/test_data/Increase_low_cutoff_expected/core_pair_summary.csv.expected @@ -1,4 +1,4 @@ 
Core_pair,n,occurrence_core_1,occurrence_core_2,co_occurrence,min_dist,max_dist,mean_dist,median_dist,min_acc,max_acc,mean_acc,median_acc A-C,3,3,3,3,109,109,109.0,109.0,1,1,1.0,1.0 -A-Sequence_break,3,3,0,0,0,0,0.0,0.0,0,0,0.0,0.0 -C-Sequence_break,3,3,0,0,10,310,110.0,10.0,0,4,1.3,0.0 +C-Sequence_break,3,3,0,0,10,310,110.0,10.0,0,4,1.3,0.0 +Sequence_break-A,3,0,3,0,0,0,0.0,0.0,0,0,0.0,0.0 diff --git a/functional_tests/test_data/Increase_low_cutoff_expected/low_frequency_gene_placement.tsv.expected b/functional_tests/test_data/Increase_low_cutoff_expected/low_frequency_gene_placement.tsv.expected index e0313a2..e88cbe7 100644 --- a/functional_tests/test_data/Increase_low_cutoff_expected/low_frequency_gene_placement.tsv.expected +++ b/functional_tests/test_data/Increase_low_cutoff_expected/low_frequency_gene_placement.tsv.expected @@ -2,9 +2,9 @@ Gff Core_gene_1 Core_gene_2 Core_region_size Core_region_accessory_count complete_genome_single_chrom A C 109 1 complete_genome_single_chrom_2 A C 109 1 genome_single_chrom_larger A C 109 1 -complete_genome_single_chrom A Sequence_break 0 0 -complete_genome_single_chrom_2 A Sequence_break 0 0 -genome_single_chrom_larger A Sequence_break 0 0 complete_genome_single_chrom C Sequence_break 10 0 complete_genome_single_chrom_2 C Sequence_break 10 0 genome_single_chrom_larger C Sequence_break 310 4 +complete_genome_single_chrom Sequence_break A 0 0 +complete_genome_single_chrom_2 Sequence_break A 0 0 +genome_single_chrom_larger Sequence_break A 0 0 diff --git a/functional_tests/test_data/Less_than_all_gffs_run_expected/core_pair_summary.csv.expected b/functional_tests/test_data/Less_than_all_gffs_run_expected/core_pair_summary.csv.expected index bac3707..b9225b3 100644 --- a/functional_tests/test_data/Less_than_all_gffs_run_expected/core_pair_summary.csv.expected +++ b/functional_tests/test_data/Less_than_all_gffs_run_expected/core_pair_summary.csv.expected @@ -1,9 +1,9 @@ 
Core_pair,n,occurrence_core_1,occurrence_core_2,co_occurrence,min_dist,max_dist,mean_dist,median_dist,min_acc,max_acc,mean_acc,median_acc A-B,2,4,3,3,9,9,9.0,9.0,0,0,0.0,0.0 A-C,1,4,4,4,109,109,109.0,109.0,1,1,1.0,1.0 -A-Sequence_break,3,4,0,0,0,0,0.0,0.0,0,0,0.0,0.0 B-C,3,3,4,3,9,9,9.0,9.0,0,0,0.0,0.0 B-E,1,3,3,2,9,9,9.0,9.0,0,0,0.0,0.0 C-E,1,4,3,3,109,109,109.0,109.0,1,1,1.0,1.0 C-Sequence_break,1,4,0,0,10,10,10.0,10.0,0,0,0.0,0.0 E-Sequence_break,2,3,0,0,110,110,110.0,110.0,1,2,1.5,1.5 +Sequence_break-A,3,0,4,0,0,0,0.0,0.0,0,0,0.0,0.0 \ No newline at end of file diff --git a/functional_tests/test_data/Less_than_all_gffs_run_expected/low_frequency_gene_placement.tsv.expected b/functional_tests/test_data/Less_than_all_gffs_run_expected/low_frequency_gene_placement.tsv.expected index 929708d..46da0ae 100644 --- a/functional_tests/test_data/Less_than_all_gffs_run_expected/low_frequency_gene_placement.tsv.expected +++ b/functional_tests/test_data/Less_than_all_gffs_run_expected/low_frequency_gene_placement.tsv.expected @@ -2,9 +2,6 @@ Gff Core_gene_1 Core_gene_2 Core_region_size Core_region_accessory_count complete_genome_single_chrom A B 9 0 genome_single_chrom_larger A B 9 0 genome_single_chrom_larger_rearrange A C 109 1 -complete_genome_single_chrom A Sequence_break 0 0 -genome_single_chrom_larger A Sequence_break 0 0 -genome_single_chrom_larger_rearrange A Sequence_break 0 0 complete_genome_single_chrom B C 9 0 genome_single_chrom_larger B C 9 0 genome_single_chrom_larger_rearrange B C 9 0 @@ -13,3 +10,6 @@ genome_single_chrom_larger C E 109 1 complete_genome_single_chrom C Sequence_break 10 0 genome_single_chrom_larger E Sequence_break 110 2 genome_single_chrom_larger_rearrange E Sequence_break 110 1 +complete_genome_single_chrom Sequence_break A 0 0 +genome_single_chrom_larger Sequence_break A 0 0 +genome_single_chrom_larger_rearrange Sequence_break A 0 0 diff --git a/functional_tests/test_data/Multi_component_graph_expected/core_pair_summary.csv.expected 
b/functional_tests/test_data/Multi_component_graph_expected/core_pair_summary.csv.expected index 71a4959..b05e67d 100644 --- a/functional_tests/test_data/Multi_component_graph_expected/core_pair_summary.csv.expected +++ b/functional_tests/test_data/Multi_component_graph_expected/core_pair_summary.csv.expected @@ -13,8 +13,8 @@ F-G,3,3,3,3,9,9,9.0,9.0,0,0,0.0,0.0 F-H,1,3,3,3,0,0,0.0,0.0,0,0,0.0,0.0 G-H,2,3,3,3,0,0,0.0,0.0,0,0,0.0,0.0 H-L,3,3,3,3,0,0,0.0,0.0,0,0,0.0,0.0 -I-Sequence_break,3,3,0,0,0,0,0.0,0.0,0,0,0.0,0.0 J-Sequence_break,3,3,0,0,1,3,1.7,1.0,0,0,0.0,0.0 -K-Sequence_break,3,3,0,0,0,0,0.0,0.0,0,0,0.0,0.0 L-M,3,3,3,3,-700,0,-233.3,0.0,0,0,0.0,0.0 M-Sequence_break,3,3,0,0,2,698,234.0,2.0,0,0,0.0,0.0 +Sequence_break-I,3,0,3,0,0,0,0.0,0.0,0,0,0.0,0.0 +Sequence_break-K,3,0,3,0,0,0,0.0,0.0,0,0,0.0,0.0 \ No newline at end of file diff --git a/functional_tests/test_data/Multi_component_graph_expected/low_frequency_gene_placement.tsv.expected b/functional_tests/test_data/Multi_component_graph_expected/low_frequency_gene_placement.tsv.expected index 5b32628..4ae9bca 100644 --- a/functional_tests/test_data/Multi_component_graph_expected/low_frequency_gene_placement.tsv.expected +++ b/functional_tests/test_data/Multi_component_graph_expected/low_frequency_gene_placement.tsv.expected @@ -29,18 +29,18 @@ complete_genome_double_chrom_larger G H 0 0 complete_genome_double_chrom_2_larger H L 0 0 complete_genome_double_chrom_3_larger H L 0 0 complete_genome_double_chrom_larger H L 0 0 -complete_genome_double_chrom_2_larger I Sequence_break 0 0 -complete_genome_double_chrom_3_larger I Sequence_break 0 0 -complete_genome_double_chrom_larger I Sequence_break 0 0 complete_genome_double_chrom_2_larger J Sequence_break 1 0 complete_genome_double_chrom_3_larger J Sequence_break 3 0 complete_genome_double_chrom_larger J Sequence_break 1 0 -complete_genome_double_chrom_2_larger K Sequence_break 0 0 -complete_genome_double_chrom_3_larger K Sequence_break 0 0 
-complete_genome_double_chrom_larger K Sequence_break 0 0 complete_genome_double_chrom_2_larger L M 0 0 complete_genome_double_chrom_3_larger L M -700 0 complete_genome_double_chrom_larger L M 0 0 complete_genome_double_chrom_2_larger M Sequence_break 2 0 complete_genome_double_chrom_3_larger M Sequence_break 698 0 complete_genome_double_chrom_larger M Sequence_break 2 0 +complete_genome_double_chrom_2_larger Sequence_break I 0 0 +complete_genome_double_chrom_3_larger Sequence_break I 0 0 +complete_genome_double_chrom_larger Sequence_break I 0 0 +complete_genome_double_chrom_2_larger Sequence_break K 0 0 +complete_genome_double_chrom_3_larger Sequence_break K 0 0 +complete_genome_double_chrom_larger Sequence_break K 0 0 diff --git a/functional_tests/test_data/Reannotation_sucessful_expected/core_pair_summary.csv.expected b/functional_tests/test_data/Reannotation_sucessful_expected/core_pair_summary.csv.expected index 83f14d5..413f596 100644 --- a/functional_tests/test_data/Reannotation_sucessful_expected/core_pair_summary.csv.expected +++ b/functional_tests/test_data/Reannotation_sucessful_expected/core_pair_summary.csv.expected @@ -1,4 +1,4 @@ Core_pair,n,occurrence_core_1,occurrence_core_2,co_occurrence,min_dist,max_dist,mean_dist,median_dist,min_acc,max_acc,mean_acc,median_acc A-C,4,4,4,4,109,109,109.0,109.0,1,2,1.2,1.0 -A-Sequence_break,4,4,0,0,0,0,0.0,0.0,0,0,0.0,0.0 -C-Sequence_break,4,4,0,0,10,310,160.0,160.0,1,4,2.5,2.5 +C-Sequence_break,4,4,0,0,10,310,160.0,160.0,1,4,2.5,2.5 +Sequence_break-A,4,0,4,0,0,0,0.0,0.0,0,0,0.0,0.0 diff --git a/functional_tests/test_data/Reannotation_sucessful_expected/low_frequency_gene_placement.tsv.expected b/functional_tests/test_data/Reannotation_sucessful_expected/low_frequency_gene_placement.tsv.expected index b41a429..616887b 100644 --- a/functional_tests/test_data/Reannotation_sucessful_expected/low_frequency_gene_placement.tsv.expected +++ 
b/functional_tests/test_data/Reannotation_sucessful_expected/low_frequency_gene_placement.tsv.expected @@ -3,11 +3,11 @@ complete_genome_single_chrom A C 109 1 complete_genome_single_chrom_2 A C 109 2 genome_single_chrom_larger A C 109 1 genome_single_chrom_larger_rearrange A C 109 1 -complete_genome_single_chrom A Sequence_break 0 0 -complete_genome_single_chrom_2 A Sequence_break 0 0 -genome_single_chrom_larger A Sequence_break 0 0 genome_single_chrom_larger_rearrange A Sequence_break 0 0 complete_genome_single_chrom C Sequence_break 10 1 complete_genome_single_chrom_2 C Sequence_break 10 1 genome_single_chrom_larger C Sequence_break 310 4 genome_single_chrom_larger_rearrange C Sequence_break 310 4 +complete_genome_single_chrom Sequence_break A 0 0 +complete_genome_single_chrom_2 Sequence_break A 0 0 +genome_single_chrom_larger Sequence_break A 0 0 diff --git a/functional_tests/test_data/Simple_run_expected/core_pair_summary.csv.expected b/functional_tests/test_data/Simple_run_expected/core_pair_summary.csv.expected index e48e928..78fd9f1 100644 --- a/functional_tests/test_data/Simple_run_expected/core_pair_summary.csv.expected +++ b/functional_tests/test_data/Simple_run_expected/core_pair_summary.csv.expected @@ -1,5 +1,5 @@ Core_pair,n,occurrence_core_1,occurrence_core_2,co_occurrence,min_dist,max_dist,mean_dist,median_dist,min_acc,max_acc,mean_acc,median_acc A-B,2,2,2,2,9,9,9.0,9.0,0,0,0.0,0.0 -A-Sequence_break,2,2,0,0,0,0,0.0,0.0,0,0,0.0,0.0 B-C,2,2,2,2,9,9,9.0,9.0,0,0,0.0,0.0 -C-Sequence_break,2,2,0,0,10,10,10.0,10.0,0,0,0.0,0.0 +C-Sequence_break,2,2,0,0,10,10,10.0,10.0,0,0,0.0,0.0 +Sequence_break-A,2,0,2,0,0,0,0.0,0.0,0,0,0.0,0.0 diff --git a/functional_tests/test_data/Simple_run_expected/low_frequency_gene_placement.tsv.expected b/functional_tests/test_data/Simple_run_expected/low_frequency_gene_placement.tsv.expected index 9c3b77e..1357baf 100644 --- a/functional_tests/test_data/Simple_run_expected/low_frequency_gene_placement.tsv.expected +++ 
b/functional_tests/test_data/Simple_run_expected/low_frequency_gene_placement.tsv.expected @@ -1,9 +1,9 @@ Gff Core_gene_1 Core_gene_2 Core_region_size Core_region_accessory_count complete_genome_single_chrom A B 9 0 complete_genome_single_chrom_2 A B 9 0 -complete_genome_single_chrom A Sequence_break 0 0 -complete_genome_single_chrom_2 A Sequence_break 0 0 complete_genome_single_chrom B C 9 0 complete_genome_single_chrom_2 B C 9 0 complete_genome_single_chrom C Sequence_break 10 0 complete_genome_single_chrom_2 C Sequence_break 10 0 +complete_genome_single_chrom Sequence_break A 0 0 +complete_genome_single_chrom_2 Sequence_break A 0 0 diff --git a/functional_tests/test_data/Single_comple_chromosome_expected/core_pair_summary.csv.expected b/functional_tests/test_data/Single_comple_chromosome_expected/core_pair_summary.csv.expected index 57bef6d..8b628b8 100644 --- a/functional_tests/test_data/Single_comple_chromosome_expected/core_pair_summary.csv.expected +++ b/functional_tests/test_data/Single_comple_chromosome_expected/core_pair_summary.csv.expected @@ -1,6 +1,6 @@ Core_pair,n,occurrence_core_1,occurrence_core_2,co_occurrence,min_dist,max_dist,mean_dist,median_dist,min_acc,max_acc,mean_acc,median_acc A-B,2,2,2,2,9,9,9.0,9.0,0,0,0.0,0.0 A-C,1,2,2,2,10,10,10.0,10.0,0,0,0.0,0.0 -A-Sequence_break,1,2,0,0,0,0,0.0,0.0,0,0,0.0,0.0 B-C,2,2,2,2,9,9,9.0,9.0,0,0,0.0,0.0 -C-Sequence_break,1,2,0,0,10,10,10.0,10.0,0,0,0.0,0.0 +C-Sequence_break,1,2,0,0,10,10,10.0,10.0,0,0,0.0,0.0 +Sequence_break-A,1,0,2,0,0,0,0.0,0.0,0,0,0.0,0.0 diff --git a/functional_tests/test_data/Single_comple_chromosome_expected/low_frequency_gene_placement.tsv.expected b/functional_tests/test_data/Single_comple_chromosome_expected/low_frequency_gene_placement.tsv.expected index 89bad36..18b6ce2 100644 --- a/functional_tests/test_data/Single_comple_chromosome_expected/low_frequency_gene_placement.tsv.expected +++ 
b/functional_tests/test_data/Single_comple_chromosome_expected/low_frequency_gene_placement.tsv.expected @@ -2,7 +2,7 @@ Gff Core_gene_1 Core_gene_2 Core_region_size Core_region_accessory_count complete_genome_single_chrom A B 9 0 complete_genome_single_chrom_2 A B 9 0 complete_genome_single_chrom A C 10 0 -complete_genome_single_chrom_2 A Sequence_break 0 0 complete_genome_single_chrom B C 9 0 complete_genome_single_chrom_2 B C 9 0 -complete_genome_single_chrom_2 C Sequence_break 10 0 +complete_genome_single_chrom_2 C Sequence_break 10 0 +complete_genome_single_chrom_2 Sequence_break A 0 0 diff --git a/functional_tests/test_data/core_90_cutoff_expected/core_pair_summary.csv.expected b/functional_tests/test_data/core_90_cutoff_expected/core_pair_summary.csv.expected index 89a2209..67512ab 100644 --- a/functional_tests/test_data/core_90_cutoff_expected/core_pair_summary.csv.expected +++ b/functional_tests/test_data/core_90_cutoff_expected/core_pair_summary.csv.expected @@ -1,8 +1,8 @@ Core_pair,n,occurrence_core_1,occurrence_core_2,co_occurrence,min_dist,max_dist,mean_dist,median_dist,min_acc,max_acc,mean_acc,median_acc A-B,2,3,2,2,9,9,9.0,9.0,0,0,0.0,0.0 A-E,1,3,2,2,9,9,9.0,9.0,0,0,0.0,0.0 -A-Sequence_break,3,3,0,0,0,0,0.0,0.0,0,0,0.0,0.0 B-C,2,2,3,2,9,9,9.0,9.0,0,0,0.0,0.0 C-E,2,3,2,2,9,109,59.0,59.0,0,1,0.5,0.5 C-Sequence_break,2,3,0,0,10,10,10.0,10.0,0,0,0.0,0.0 E-Sequence_break,1,2,0,0,110,110,110.0,110.0,2,2,2.0,2.0 +Sequence_break-A,3,0,3,0,0,0,0.0,0.0,0,0,0.0,0.0 \ No newline at end of file diff --git a/functional_tests/test_data/core_90_cutoff_expected/low_frequency_gene_placement.tsv.expected b/functional_tests/test_data/core_90_cutoff_expected/low_frequency_gene_placement.tsv.expected index 90a8203..89a3b74 100644 --- a/functional_tests/test_data/core_90_cutoff_expected/low_frequency_gene_placement.tsv.expected +++ b/functional_tests/test_data/core_90_cutoff_expected/low_frequency_gene_placement.tsv.expected @@ -2,9 +2,6 @@ Gff Core_gene_1 Core_gene_2 
Core_region_size Core_region_accessory_count complete_genome_single_chrom A B 9 0 genome_single_chrom_larger A B 9 0 complete_genome_single_chrom_2 A E 9 0 -complete_genome_single_chrom A Sequence_break 0 0 -complete_genome_single_chrom_2 A Sequence_break 0 0 -genome_single_chrom_larger A Sequence_break 0 0 complete_genome_single_chrom B C 9 0 genome_single_chrom_larger B C 9 0 complete_genome_single_chrom_2 C E 9 0 @@ -12,3 +9,6 @@ genome_single_chrom_larger C E 109 1 complete_genome_single_chrom C Sequence_break 10 0 complete_genome_single_chrom_2 C Sequence_break 10 0 genome_single_chrom_larger E Sequence_break 110 2 +complete_genome_single_chrom Sequence_break A 0 0 +complete_genome_single_chrom_2 Sequence_break A 0 0 +genome_single_chrom_larger Sequence_break A 0 0 diff --git a/functional_tests/test_data/coreless_contig_draft_expected/core_pair_summary.csv.expected b/functional_tests/test_data/coreless_contig_draft_expected/core_pair_summary.csv.expected index e48e928..be20079 100644 --- a/functional_tests/test_data/coreless_contig_draft_expected/core_pair_summary.csv.expected +++ b/functional_tests/test_data/coreless_contig_draft_expected/core_pair_summary.csv.expected @@ -1,5 +1,5 @@ Core_pair,n,occurrence_core_1,occurrence_core_2,co_occurrence,min_dist,max_dist,mean_dist,median_dist,min_acc,max_acc,mean_acc,median_acc A-B,2,2,2,2,9,9,9.0,9.0,0,0,0.0,0.0 -A-Sequence_break,2,2,0,0,0,0,0.0,0.0,0,0,0.0,0.0 B-C,2,2,2,2,9,9,9.0,9.0,0,0,0.0,0.0 C-Sequence_break,2,2,0,0,10,10,10.0,10.0,0,0,0.0,0.0 +Sequence_break-A,2,0,2,0,0,0,0.0,0.0,0,0,0.0,0.0 diff --git a/functional_tests/test_data/coreless_contig_draft_expected/low_frequency_gene_placement.tsv.expected b/functional_tests/test_data/coreless_contig_draft_expected/low_frequency_gene_placement.tsv.expected index 0aa11ff..31959d6 100644 --- a/functional_tests/test_data/coreless_contig_draft_expected/low_frequency_gene_placement.tsv.expected +++ 
b/functional_tests/test_data/coreless_contig_draft_expected/low_frequency_gene_placement.tsv.expected @@ -1,9 +1,9 @@ Gff Core_gene_1 Core_gene_2 Core_region_size Core_region_accessory_count complete_genome_double_chrom A B 9 0 complete_genome_double_chrom_2 A B 9 0 -complete_genome_double_chrom A Sequence_break 0 0 -complete_genome_double_chrom_2 A Sequence_break 0 0 complete_genome_double_chrom B C 9 0 complete_genome_double_chrom_2 B C 9 0 complete_genome_double_chrom C Sequence_break 10 0 complete_genome_double_chrom_2 C Sequence_break 10 0 +complete_genome_double_chrom Sequence_break A 0 0 +complete_genome_double_chrom_2 Sequence_break A 0 0 \ No newline at end of file diff --git a/functional_tests/test_data/low_freq_cutoff_0_expected/core_pair_summary.csv.expected b/functional_tests/test_data/low_freq_cutoff_0_expected/core_pair_summary.csv.expected index 551faf3..45dace7 100644 --- a/functional_tests/test_data/low_freq_cutoff_0_expected/core_pair_summary.csv.expected +++ b/functional_tests/test_data/low_freq_cutoff_0_expected/core_pair_summary.csv.expected @@ -1,4 +1,4 @@ Core_pair,n,occurrence_core_1,occurrence_core_2,co_occurrence,min_dist,max_dist,mean_dist,median_dist,min_acc,max_acc,mean_acc,median_acc A-C,3,3,3,3,109,109,109.0,109.0,1,1,1.0,1.0 -A-Sequence_break,3,3,0,0,0,0,0.0,0.0,0,0,0.0,0.0 -C-Sequence_break,3,3,0,0,10,310,110.0,10.0,0,4,1.3,0.0 +C-Sequence_break,3,3,0,0,10,310,110.0,10.0,0,4,1.3,0.0 +Sequence_break-A,3,0,3,0,0,0,0.0,0.0,0,0,0.0,0.0 diff --git a/functional_tests/test_data/low_freq_cutoff_0_expected/low_frequency_gene_placement.tsv.expected b/functional_tests/test_data/low_freq_cutoff_0_expected/low_frequency_gene_placement.tsv.expected index e0313a2..e88cbe7 100644 --- a/functional_tests/test_data/low_freq_cutoff_0_expected/low_frequency_gene_placement.tsv.expected +++ b/functional_tests/test_data/low_freq_cutoff_0_expected/low_frequency_gene_placement.tsv.expected @@ -2,9 +2,9 @@ Gff Core_gene_1 Core_gene_2 Core_region_size 
Core_region_accessory_count complete_genome_single_chrom A C 109 1 complete_genome_single_chrom_2 A C 109 1 genome_single_chrom_larger A C 109 1 -complete_genome_single_chrom A Sequence_break 0 0 -complete_genome_single_chrom_2 A Sequence_break 0 0 -genome_single_chrom_larger A Sequence_break 0 0 complete_genome_single_chrom C Sequence_break 10 0 complete_genome_single_chrom_2 C Sequence_break 10 0 genome_single_chrom_larger C Sequence_break 310 4 +complete_genome_single_chrom Sequence_break A 0 0 +complete_genome_single_chrom_2 Sequence_break A 0 0 +genome_single_chrom_larger Sequence_break A 0 0 diff --git a/functional_tests/test_data/single_core_contig_draft_expected/core_core_accessory_gene_content.tsv.expected b/functional_tests/test_data/single_core_contig_draft_expected/core_core_accessory_gene_content.tsv.expected index ec996bc..bed9362 100644 --- a/functional_tests/test_data/single_core_contig_draft_expected/core_core_accessory_gene_content.tsv.expected +++ b/functional_tests/test_data/single_core_contig_draft_expected/core_core_accessory_gene_content.tsv.expected @@ -1,3 +1,4 @@ Gff Core_gene_1 Core_gene_2 gene type complete_genome_double_chrom B Sequence_break C low_frequency -complete_genome_double_chrom E Sequence_break F low_frequency +complete_genome_double_chrom E Sequence_break F low_frequency +complete_genome_double_chrom Sequence_break E D low_frequency diff --git a/functional_tests/test_data/single_core_contig_draft_expected/core_pair_summary.csv.expected b/functional_tests/test_data/single_core_contig_draft_expected/core_pair_summary.csv.expected index aacd5c6..fff00cc 100644 --- a/functional_tests/test_data/single_core_contig_draft_expected/core_pair_summary.csv.expected +++ b/functional_tests/test_data/single_core_contig_draft_expected/core_pair_summary.csv.expected @@ -1,6 +1,7 @@ Core_pair,n,occurrence_core_1,occurrence_core_2,co_occurrence,min_dist,max_dist,mean_dist,median_dist,min_acc,max_acc,mean_acc,median_acc 
A-B,2,2,2,2,9,9,9.0,9.0,0,0,0.0,0.0 -A-Sequence_break,2,2,0,0,0,0,0.0,0.0,0,0,0.0,0.0 B-E,1,2,2,2,9,9,9.0,9.0,0,0,0.0,0.0 B-Sequence_break,1,2,0,0,110,110,110.0,110.0,1,1,1.0,1.0 -E-Sequence_break,2,2,0,0,10,110,60.0,60.0,0,1,0.5,0.5 +E-Sequence_break,2,2,0,0,10,110,60.0,60.0,0,1,0.5,0.5 +Sequence_break-A,2,0,2,0,0,0,0.0,0.0,0,0,0.0,0.0 +Sequence_break-E,1,0,1,0,99,99,99.0,99.0,0,1,1.0,1.0 diff --git a/functional_tests/test_data/single_core_contig_draft_expected/low_frequency_gene_placement.tsv.expected b/functional_tests/test_data/single_core_contig_draft_expected/low_frequency_gene_placement.tsv.expected index 96337b1..3c10cbb 100644 --- a/functional_tests/test_data/single_core_contig_draft_expected/low_frequency_gene_placement.tsv.expected +++ b/functional_tests/test_data/single_core_contig_draft_expected/low_frequency_gene_placement.tsv.expected @@ -1,9 +1,10 @@ Gff Core_gene_1 Core_gene_2 Core_region_size Core_region_accessory_count complete_genome_double_chrom A B 9 0 complete_genome_single_chrom A B 9 0 -complete_genome_double_chrom A Sequence_break 0 0 -complete_genome_single_chrom A Sequence_break 0 0 complete_genome_single_chrom B E 9 0 complete_genome_double_chrom B Sequence_break 110 1 complete_genome_double_chrom E Sequence_break 110 1 complete_genome_single_chrom E Sequence_break 10 0 +complete_genome_double_chrom A Sequence_break 0 0 +complete_genome_single_chrom A Sequence_break 0 0 +complete_genome_single_chrom E Sequence_break 99 1 From f8d124499cb10bcafab9334b1f44b7a806cab8a4 Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Wed, 26 Jan 2022 15:18:20 +1100 Subject: [PATCH 087/135] Small changes to expected files for functional tests --- .../core_pair_summary.csv.expected | 2 +- .../core_pair_summary.csv.expected | 2 +- .../low_frequency_gene_placement.tsv.expected | 2 +- .../core_pair_summary.csv.expected | 2 +- .../core_pair_summary.csv.expected | 2 +- .../core_pair_summary.csv.expected | 4 ++-- 
.../core_pair_summary.csv.expected | 2 +- .../low_frequency_gene_placement.tsv.expected | 2 +- .../Simple_run_expected/core_pair_summary.csv.expected | 2 +- .../core_pair_summary.csv.expected | 2 +- .../low_frequency_gene_placement.tsv.expected | 2 +- .../core_90_cutoff_expected/core_pair_summary.csv.expected | 2 +- .../core_pair_summary.csv.expected | 2 +- .../low_frequency_gene_placement.tsv.expected | 2 +- .../low_freq_cutoff_0_expected/core_pair_summary.csv.expected | 2 +- .../core_core_accessory_gene_content.tsv.expected | 2 +- .../core_pair_summary.csv.expected | 4 ++-- 17 files changed, 19 insertions(+), 19 deletions(-) diff --git a/functional_tests/test_data/Fragmented_accessory_gene_run_expected/core_pair_summary.csv.expected b/functional_tests/test_data/Fragmented_accessory_gene_run_expected/core_pair_summary.csv.expected index 3cae9f5..fb54173 100644 --- a/functional_tests/test_data/Fragmented_accessory_gene_run_expected/core_pair_summary.csv.expected +++ b/functional_tests/test_data/Fragmented_accessory_gene_run_expected/core_pair_summary.csv.expected @@ -1,4 +1,4 @@ Core_pair,n,occurrence_core_1,occurrence_core_2,co_occurrence,min_dist,max_dist,mean_dist,median_dist,min_acc,max_acc,mean_acc,median_acc A-C,4,4,4,4,109,109,109.0,109.0,1,1,1.0,1.0 -C-Sequence_break,4,4,0,0,10,310,160.0,160.0,0,3,1.5,1.5 +C-Sequence_break,4,4,0,0,10,310,160.0,160.0,0,3,1.5,1.5 Sequence_break-A,4,4,0,0,0,0,0.0,0.0,0,0,0.0,0.0 diff --git a/functional_tests/test_data/Fragmented_core_run_expected/core_pair_summary.csv.expected b/functional_tests/test_data/Fragmented_core_run_expected/core_pair_summary.csv.expected index d5a9336..82662bf 100644 --- a/functional_tests/test_data/Fragmented_core_run_expected/core_pair_summary.csv.expected +++ b/functional_tests/test_data/Fragmented_core_run_expected/core_pair_summary.csv.expected @@ -1,4 +1,4 @@ Core_pair,n,occurrence_core_1,occurrence_core_2,co_occurrence,min_dist,max_dist,mean_dist,median_dist,min_acc,max_acc,mean_acc,median_acc 
A-C,4,4,4,4,9,109,84.0,109.0,0,1,0.8,1.0 -C-Sequence_break,4,4,0,0,10,310,160.0,160.0,0,4,1.8,1.5 +C-Sequence_break,4,4,0,0,10,310,160.0,160.0,0,4,1.8,1.5 A-Sequence_break,4,0,4,0,0,0,0.0,0.0,0,0,0.0,0.0 diff --git a/functional_tests/test_data/Fragmented_core_run_expected/low_frequency_gene_placement.tsv.expected b/functional_tests/test_data/Fragmented_core_run_expected/low_frequency_gene_placement.tsv.expected index 6405fe0..73993a4 100644 --- a/functional_tests/test_data/Fragmented_core_run_expected/low_frequency_gene_placement.tsv.expected +++ b/functional_tests/test_data/Fragmented_core_run_expected/low_frequency_gene_placement.tsv.expected @@ -6,7 +6,7 @@ genome_single_chrom_larger_rearrange A C 109 1 complete_genome_single_chrom C Sequence_break 10 0 complete_genome_single_chrom_2 C Sequence_break 10 0 genome_single_chrom_larger_2 C Sequence_break 310 4 -genome_single_chrom_larger_rearrange C Sequence_break 310 3 +genome_single_chrom_larger_rearrange C Sequence_break 310 3 complete_genome_single_chrom Sequence_break A 0 0 complete_genome_single_chrom_2 Sequence_break A 0 0 genome_single_chrom_larger_2 Sequence_break A 0 0 diff --git a/functional_tests/test_data/Increase_low_cutoff_expected/core_pair_summary.csv.expected b/functional_tests/test_data/Increase_low_cutoff_expected/core_pair_summary.csv.expected index 45dace7..e7b2b7f 100644 --- a/functional_tests/test_data/Increase_low_cutoff_expected/core_pair_summary.csv.expected +++ b/functional_tests/test_data/Increase_low_cutoff_expected/core_pair_summary.csv.expected @@ -1,4 +1,4 @@ Core_pair,n,occurrence_core_1,occurrence_core_2,co_occurrence,min_dist,max_dist,mean_dist,median_dist,min_acc,max_acc,mean_acc,median_acc A-C,3,3,3,3,109,109,109.0,109.0,1,1,1.0,1.0 -C-Sequence_break,3,3,0,0,10,310,110.0,10.0,0,4,1.3,0.0 +C-Sequence_break,3,3,0,0,10,310,110.0,10.0,0,4,1.3,0.0 Sequence_break-A,3,0,3,0,0,0,0.0,0.0,0,0,0.0,0.0 diff --git 
a/functional_tests/test_data/Less_than_all_gffs_run_expected/core_pair_summary.csv.expected b/functional_tests/test_data/Less_than_all_gffs_run_expected/core_pair_summary.csv.expected index b9225b3..fe68367 100644 --- a/functional_tests/test_data/Less_than_all_gffs_run_expected/core_pair_summary.csv.expected +++ b/functional_tests/test_data/Less_than_all_gffs_run_expected/core_pair_summary.csv.expected @@ -6,4 +6,4 @@ B-E,1,3,3,2,9,9,9.0,9.0,0,0,0.0,0.0 C-E,1,4,3,3,109,109,109.0,109.0,1,1,1.0,1.0 C-Sequence_break,1,4,0,0,10,10,10.0,10.0,0,0,0.0,0.0 E-Sequence_break,2,3,0,0,110,110,110.0,110.0,1,2,1.5,1.5 -Sequence_break-A,3,0,4,0,0,0,0.0,0.0,0,0,0.0,0.0 \ No newline at end of file +Sequence_break-A,3,0,4,0,0,0,0.0,0.0,0,0,0.0,0.0 diff --git a/functional_tests/test_data/Multi_component_graph_expected/core_pair_summary.csv.expected b/functional_tests/test_data/Multi_component_graph_expected/core_pair_summary.csv.expected index b05e67d..fea2c4f 100644 --- a/functional_tests/test_data/Multi_component_graph_expected/core_pair_summary.csv.expected +++ b/functional_tests/test_data/Multi_component_graph_expected/core_pair_summary.csv.expected @@ -16,5 +16,5 @@ H-L,3,3,3,3,0,0,0.0,0.0,0,0,0.0,0.0 J-Sequence_break,3,3,0,0,1,3,1.7,1.0,0,0,0.0,0.0 L-M,3,3,3,3,-700,0,-233.3,0.0,0,0,0.0,0.0 M-Sequence_break,3,3,0,0,2,698,234.0,2.0,0,0,0.0,0.0 -Sequence_break-I,3,0,3,0,0,0,0.0,0.0,0,0,0.0,0.0 -Sequence_break-K,3,0,3,0,0,0,0.0,0.0,0,0,0.0,0.0 \ No newline at end of file +Sequence_break-I,3,0,3,0,0,0,0.0,0.0,0,0,0.0,0.0 +Sequence_break-K,3,0,3,0,0,0,0.0,0.0,0,0,0.0,0.0 diff --git a/functional_tests/test_data/Reannotation_sucessful_expected/core_pair_summary.csv.expected b/functional_tests/test_data/Reannotation_sucessful_expected/core_pair_summary.csv.expected index 413f596..ff0d453 100644 --- a/functional_tests/test_data/Reannotation_sucessful_expected/core_pair_summary.csv.expected +++ b/functional_tests/test_data/Reannotation_sucessful_expected/core_pair_summary.csv.expected @@ 
-1,4 +1,4 @@ Core_pair,n,occurrence_core_1,occurrence_core_2,co_occurrence,min_dist,max_dist,mean_dist,median_dist,min_acc,max_acc,mean_acc,median_acc A-C,4,4,4,4,109,109,109.0,109.0,1,2,1.2,1.0 -C-Sequence_break,4,4,0,0,10,310,160.0,160.0,1,4,2.5,2.5 +C-Sequence_break,4,4,0,0,10,310,160.0,160.0,1,4,2.5,2.5 Sequence_break-A,4,0,4,0,0,0,0.0,0.0,0,0,0.0,0.0 diff --git a/functional_tests/test_data/Reannotation_sucessful_expected/low_frequency_gene_placement.tsv.expected b/functional_tests/test_data/Reannotation_sucessful_expected/low_frequency_gene_placement.tsv.expected index 616887b..5bf8eaa 100644 --- a/functional_tests/test_data/Reannotation_sucessful_expected/low_frequency_gene_placement.tsv.expected +++ b/functional_tests/test_data/Reannotation_sucessful_expected/low_frequency_gene_placement.tsv.expected @@ -3,7 +3,6 @@ complete_genome_single_chrom A C 109 1 complete_genome_single_chrom_2 A C 109 2 genome_single_chrom_larger A C 109 1 genome_single_chrom_larger_rearrange A C 109 1 -genome_single_chrom_larger_rearrange A Sequence_break 0 0 complete_genome_single_chrom C Sequence_break 10 1 complete_genome_single_chrom_2 C Sequence_break 10 1 genome_single_chrom_larger C Sequence_break 310 4 @@ -11,3 +10,4 @@ genome_single_chrom_larger_rearrange C Sequence_break 310 4 complete_genome_single_chrom Sequence_break A 0 0 complete_genome_single_chrom_2 Sequence_break A 0 0 genome_single_chrom_larger Sequence_break A 0 0 +genome_single_chrom_larger_rearrange Sequence_break A 0 0 diff --git a/functional_tests/test_data/Simple_run_expected/core_pair_summary.csv.expected b/functional_tests/test_data/Simple_run_expected/core_pair_summary.csv.expected index 78fd9f1..0ad556a 100644 --- a/functional_tests/test_data/Simple_run_expected/core_pair_summary.csv.expected +++ b/functional_tests/test_data/Simple_run_expected/core_pair_summary.csv.expected @@ -1,5 +1,5 @@ 
Core_pair,n,occurrence_core_1,occurrence_core_2,co_occurrence,min_dist,max_dist,mean_dist,median_dist,min_acc,max_acc,mean_acc,median_acc A-B,2,2,2,2,9,9,9.0,9.0,0,0,0.0,0.0 B-C,2,2,2,2,9,9,9.0,9.0,0,0,0.0,0.0 -C-Sequence_break,2,2,0,0,10,10,10.0,10.0,0,0,0.0,0.0 +C-Sequence_break,2,2,0,0,10,10,10.0,10.0,0,0,0.0,0.0 Sequence_break-A,2,0,2,0,0,0,0.0,0.0,0,0,0.0,0.0 diff --git a/functional_tests/test_data/Single_comple_chromosome_expected/core_pair_summary.csv.expected b/functional_tests/test_data/Single_comple_chromosome_expected/core_pair_summary.csv.expected index 8b628b8..318be40 100644 --- a/functional_tests/test_data/Single_comple_chromosome_expected/core_pair_summary.csv.expected +++ b/functional_tests/test_data/Single_comple_chromosome_expected/core_pair_summary.csv.expected @@ -2,5 +2,5 @@ Core_pair,n,occurrence_core_1,occurrence_core_2,co_occurrence,min_dist,max_dist, A-B,2,2,2,2,9,9,9.0,9.0,0,0,0.0,0.0 A-C,1,2,2,2,10,10,10.0,10.0,0,0,0.0,0.0 B-C,2,2,2,2,9,9,9.0,9.0,0,0,0.0,0.0 -C-Sequence_break,1,2,0,0,10,10,10.0,10.0,0,0,0.0,0.0 +C-Sequence_break,1,2,0,0,10,10,10.0,10.0,0,0,0.0,0.0 Sequence_break-A,1,0,2,0,0,0,0.0,0.0,0,0,0.0,0.0 diff --git a/functional_tests/test_data/Single_comple_chromosome_expected/low_frequency_gene_placement.tsv.expected b/functional_tests/test_data/Single_comple_chromosome_expected/low_frequency_gene_placement.tsv.expected index 18b6ce2..88d24e9 100644 --- a/functional_tests/test_data/Single_comple_chromosome_expected/low_frequency_gene_placement.tsv.expected +++ b/functional_tests/test_data/Single_comple_chromosome_expected/low_frequency_gene_placement.tsv.expected @@ -4,5 +4,5 @@ complete_genome_single_chrom_2 A B 9 0 complete_genome_single_chrom A C 10 0 complete_genome_single_chrom B C 9 0 complete_genome_single_chrom_2 B C 9 0 -complete_genome_single_chrom_2 C Sequence_break 10 0 +complete_genome_single_chrom_2 C Sequence_break 10 0 complete_genome_single_chrom_2 Sequence_break A 0 0 diff --git 
a/functional_tests/test_data/core_90_cutoff_expected/core_pair_summary.csv.expected b/functional_tests/test_data/core_90_cutoff_expected/core_pair_summary.csv.expected index 67512ab..fcbf02e 100644 --- a/functional_tests/test_data/core_90_cutoff_expected/core_pair_summary.csv.expected +++ b/functional_tests/test_data/core_90_cutoff_expected/core_pair_summary.csv.expected @@ -5,4 +5,4 @@ B-C,2,2,3,2,9,9,9.0,9.0,0,0,0.0,0.0 C-E,2,3,2,2,9,109,59.0,59.0,0,1,0.5,0.5 C-Sequence_break,2,3,0,0,10,10,10.0,10.0,0,0,0.0,0.0 E-Sequence_break,1,2,0,0,110,110,110.0,110.0,2,2,2.0,2.0 -Sequence_break-A,3,0,3,0,0,0,0.0,0.0,0,0,0.0,0.0 \ No newline at end of file +Sequence_break-A,3,0,3,0,0,0,0.0,0.0,0,0,0.0,0.0 diff --git a/functional_tests/test_data/coreless_contig_draft_expected/core_pair_summary.csv.expected b/functional_tests/test_data/coreless_contig_draft_expected/core_pair_summary.csv.expected index be20079..0ad556a 100644 --- a/functional_tests/test_data/coreless_contig_draft_expected/core_pair_summary.csv.expected +++ b/functional_tests/test_data/coreless_contig_draft_expected/core_pair_summary.csv.expected @@ -2,4 +2,4 @@ Core_pair,n,occurrence_core_1,occurrence_core_2,co_occurrence,min_dist,max_dist, A-B,2,2,2,2,9,9,9.0,9.0,0,0,0.0,0.0 B-C,2,2,2,2,9,9,9.0,9.0,0,0,0.0,0.0 C-Sequence_break,2,2,0,0,10,10,10.0,10.0,0,0,0.0,0.0 -Sequence_break-A,2,0,2,0,0,0,0.0,0.0,0,0,0.0,0.0 +Sequence_break-A,2,0,2,0,0,0,0.0,0.0,0,0,0.0,0.0 diff --git a/functional_tests/test_data/coreless_contig_draft_expected/low_frequency_gene_placement.tsv.expected b/functional_tests/test_data/coreless_contig_draft_expected/low_frequency_gene_placement.tsv.expected index 31959d6..8d781b1 100644 --- a/functional_tests/test_data/coreless_contig_draft_expected/low_frequency_gene_placement.tsv.expected +++ b/functional_tests/test_data/coreless_contig_draft_expected/low_frequency_gene_placement.tsv.expected @@ -6,4 +6,4 @@ complete_genome_double_chrom_2 B C 9 0 complete_genome_double_chrom C Sequence_break 10 
0 complete_genome_double_chrom_2 C Sequence_break 10 0 complete_genome_double_chrom Sequence_break A 0 0 -complete_genome_double_chrom_2 Sequence_break A 0 0 \ No newline at end of file +complete_genome_double_chrom_2 Sequence_break A 0 0 diff --git a/functional_tests/test_data/low_freq_cutoff_0_expected/core_pair_summary.csv.expected b/functional_tests/test_data/low_freq_cutoff_0_expected/core_pair_summary.csv.expected index 45dace7..e7b2b7f 100644 --- a/functional_tests/test_data/low_freq_cutoff_0_expected/core_pair_summary.csv.expected +++ b/functional_tests/test_data/low_freq_cutoff_0_expected/core_pair_summary.csv.expected @@ -1,4 +1,4 @@ Core_pair,n,occurrence_core_1,occurrence_core_2,co_occurrence,min_dist,max_dist,mean_dist,median_dist,min_acc,max_acc,mean_acc,median_acc A-C,3,3,3,3,109,109,109.0,109.0,1,1,1.0,1.0 -C-Sequence_break,3,3,0,0,10,310,110.0,10.0,0,4,1.3,0.0 +C-Sequence_break,3,3,0,0,10,310,110.0,10.0,0,4,1.3,0.0 Sequence_break-A,3,0,3,0,0,0,0.0,0.0,0,0,0.0,0.0 diff --git a/functional_tests/test_data/single_core_contig_draft_expected/core_core_accessory_gene_content.tsv.expected b/functional_tests/test_data/single_core_contig_draft_expected/core_core_accessory_gene_content.tsv.expected index bed9362..76d4e0a 100644 --- a/functional_tests/test_data/single_core_contig_draft_expected/core_core_accessory_gene_content.tsv.expected +++ b/functional_tests/test_data/single_core_contig_draft_expected/core_core_accessory_gene_content.tsv.expected @@ -1,4 +1,4 @@ Gff Core_gene_1 Core_gene_2 gene type complete_genome_double_chrom B Sequence_break C low_frequency -complete_genome_double_chrom E Sequence_break F low_frequency +complete_genome_double_chrom E Sequence_break F low_frequency complete_genome_double_chrom Sequence_break E D low_frequency diff --git a/functional_tests/test_data/single_core_contig_draft_expected/core_pair_summary.csv.expected b/functional_tests/test_data/single_core_contig_draft_expected/core_pair_summary.csv.expected index 
fff00cc..b9862ed 100644 --- a/functional_tests/test_data/single_core_contig_draft_expected/core_pair_summary.csv.expected +++ b/functional_tests/test_data/single_core_contig_draft_expected/core_pair_summary.csv.expected @@ -2,6 +2,6 @@ Core_pair,n,occurrence_core_1,occurrence_core_2,co_occurrence,min_dist,max_dist, A-B,2,2,2,2,9,9,9.0,9.0,0,0,0.0,0.0 B-E,1,2,2,2,9,9,9.0,9.0,0,0,0.0,0.0 B-Sequence_break,1,2,0,0,110,110,110.0,110.0,1,1,1.0,1.0 -E-Sequence_break,2,2,0,0,10,110,60.0,60.0,0,1,0.5,0.5 -Sequence_break-A,2,0,2,0,0,0,0.0,0.0,0,0,0.0,0.0 +E-Sequence_break,2,2,0,0,10,110,60.0,60.0,0,1,0.5,0.5 +Sequence_break-A,2,0,2,0,0,0,0.0,0.0,0,0,0.0,0.0 Sequence_break-E,1,0,1,0,99,99,99.0,99.0,0,1,1.0,1.0 From 192e692c234014ed732f023179cc5a6b1d24e13c Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Wed, 26 Jan 2022 15:25:12 +1100 Subject: [PATCH 088/135] More small changes to expected files for functional tests --- .../core_pair_summary.csv.expected | 2 +- .../core_pair_summary.csv.expected | 2 +- .../core_pair_summary.csv.expected | 2 +- .../low_frequency_gene_placement.tsv.expected | 6 +++--- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/functional_tests/test_data/Fragmented_accessory_gene_run_expected/core_pair_summary.csv.expected b/functional_tests/test_data/Fragmented_accessory_gene_run_expected/core_pair_summary.csv.expected index fb54173..b34d600 100644 --- a/functional_tests/test_data/Fragmented_accessory_gene_run_expected/core_pair_summary.csv.expected +++ b/functional_tests/test_data/Fragmented_accessory_gene_run_expected/core_pair_summary.csv.expected @@ -1,4 +1,4 @@ Core_pair,n,occurrence_core_1,occurrence_core_2,co_occurrence,min_dist,max_dist,mean_dist,median_dist,min_acc,max_acc,mean_acc,median_acc A-C,4,4,4,4,109,109,109.0,109.0,1,1,1.0,1.0 C-Sequence_break,4,4,0,0,10,310,160.0,160.0,0,3,1.5,1.5 -Sequence_break-A,4,4,0,0,0,0,0.0,0.0,0,0,0.0,0.0 +Sequence_break-A,4,0,4,0,0,0,0.0,0.0,0,0,0.0,0.0 diff 
--git a/functional_tests/test_data/Fragmented_core_run_expected/core_pair_summary.csv.expected b/functional_tests/test_data/Fragmented_core_run_expected/core_pair_summary.csv.expected index 82662bf..befdd38 100644 --- a/functional_tests/test_data/Fragmented_core_run_expected/core_pair_summary.csv.expected +++ b/functional_tests/test_data/Fragmented_core_run_expected/core_pair_summary.csv.expected @@ -1,4 +1,4 @@ Core_pair,n,occurrence_core_1,occurrence_core_2,co_occurrence,min_dist,max_dist,mean_dist,median_dist,min_acc,max_acc,mean_acc,median_acc A-C,4,4,4,4,9,109,84.0,109.0,0,1,0.8,1.0 C-Sequence_break,4,4,0,0,10,310,160.0,160.0,0,4,1.8,1.5 -A-Sequence_break,4,0,4,0,0,0,0.0,0.0,0,0,0.0,0.0 +Sequence_break-A,4,0,4,0,0,0,0.0,0.0,0,0,0.0,0.0 diff --git a/functional_tests/test_data/single_core_contig_draft_expected/core_pair_summary.csv.expected b/functional_tests/test_data/single_core_contig_draft_expected/core_pair_summary.csv.expected index b9862ed..ab752ec 100644 --- a/functional_tests/test_data/single_core_contig_draft_expected/core_pair_summary.csv.expected +++ b/functional_tests/test_data/single_core_contig_draft_expected/core_pair_summary.csv.expected @@ -4,4 +4,4 @@ B-E,1,2,2,2,9,9,9.0,9.0,0,0,0.0,0.0 B-Sequence_break,1,2,0,0,110,110,110.0,110.0,1,1,1.0,1.0 E-Sequence_break,2,2,0,0,10,110,60.0,60.0,0,1,0.5,0.5 Sequence_break-A,2,0,2,0,0,0,0.0,0.0,0,0,0.0,0.0 -Sequence_break-E,1,0,1,0,99,99,99.0,99.0,0,1,1.0,1.0 +Sequence_break-E,1,0,2,0,99,99,99.0,99.0,0,1,1.0,1.0 diff --git a/functional_tests/test_data/single_core_contig_draft_expected/low_frequency_gene_placement.tsv.expected b/functional_tests/test_data/single_core_contig_draft_expected/low_frequency_gene_placement.tsv.expected index 3c10cbb..e7ae4b2 100644 --- a/functional_tests/test_data/single_core_contig_draft_expected/low_frequency_gene_placement.tsv.expected +++ b/functional_tests/test_data/single_core_contig_draft_expected/low_frequency_gene_placement.tsv.expected @@ -5,6 +5,6 @@ 
complete_genome_single_chrom B E 9 0 complete_genome_double_chrom B Sequence_break 110 1 complete_genome_double_chrom E Sequence_break 110 1 complete_genome_single_chrom E Sequence_break 10 0 -complete_genome_double_chrom A Sequence_break 0 0 -complete_genome_single_chrom A Sequence_break 0 0 -complete_genome_single_chrom E Sequence_break 99 1 +complete_genome_double_chrom Sequence_break A 0 0 +complete_genome_single_chrom Sequence_break A 0 0 +complete_genome_single_chrom Sequence_break E 99 1 From 974582f0c35ad6151e94e38d9876e32b0eaeb51b Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Wed, 26 Jan 2022 15:31:08 +1100 Subject: [PATCH 089/135] Even more small changes to expected files for functional tests --- .../low_frequency_gene_placement.tsv.expected | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/functional_tests/test_data/single_core_contig_draft_expected/low_frequency_gene_placement.tsv.expected b/functional_tests/test_data/single_core_contig_draft_expected/low_frequency_gene_placement.tsv.expected index e7ae4b2..86b1099 100644 --- a/functional_tests/test_data/single_core_contig_draft_expected/low_frequency_gene_placement.tsv.expected +++ b/functional_tests/test_data/single_core_contig_draft_expected/low_frequency_gene_placement.tsv.expected @@ -7,4 +7,4 @@ complete_genome_double_chrom E Sequence_break 110 1 complete_genome_single_chrom E Sequence_break 10 0 complete_genome_double_chrom Sequence_break A 0 0 complete_genome_single_chrom Sequence_break A 0 0 -complete_genome_single_chrom Sequence_break E 99 1 +complete_genome_double_chrom Sequence_break E 99 1 From 88f82aa3a3246935f7756787673eb4f0aadfb39c Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Wed, 26 Jan 2022 15:37:26 +1100 Subject: [PATCH 090/135] Final small changes to expected files for functional tests --- .../core_pair_summary.csv.expected | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/functional_tests/test_data/single_core_contig_draft_expected/core_pair_summary.csv.expected b/functional_tests/test_data/single_core_contig_draft_expected/core_pair_summary.csv.expected index ab752ec..aa6dc0d 100644 --- a/functional_tests/test_data/single_core_contig_draft_expected/core_pair_summary.csv.expected +++ b/functional_tests/test_data/single_core_contig_draft_expected/core_pair_summary.csv.expected @@ -4,4 +4,4 @@ B-E,1,2,2,2,9,9,9.0,9.0,0,0,0.0,0.0 B-Sequence_break,1,2,0,0,110,110,110.0,110.0,1,1,1.0,1.0 E-Sequence_break,2,2,0,0,10,110,60.0,60.0,0,1,0.5,0.5 Sequence_break-A,2,0,2,0,0,0,0.0,0.0,0,0,0.0,0.0 -Sequence_break-E,1,0,2,0,99,99,99.0,99.0,0,1,1.0,1.0 +Sequence_break-E,1,0,2,0,99,99,99.0,99.0,1,1,1.0,1.0 From 7f4bf152353c42cb12e7ee9c317e928400316180 Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Wed, 26 Jan 2022 15:44:39 +1100 Subject: [PATCH 091/135] Bump version for new PyPi upload and add keywords and classifiers for PyPi --- setup.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 4513c01..1371e4f 100644 --- a/setup.py +++ b/setup.py @@ -12,7 +12,7 @@ setup( name='Corekaburra', - version='0.0.1', + version='0.0.2', author='Magnus Ganer Jespersen', author_email='magnus.ganer.j@gmail.com', packages=['Corekaburra'], @@ -25,4 +25,11 @@ description=('A prototypical bioinformatics command line tool'), long_description=(LONG_DESCRIPTION), install_requires=["biopython", "networkx", "gffutils", "numpy"], + keywords=['Genomic', 'pan-genome', 'bacteria', 'prokaryotes', 'bioinformatics'], + classifiers=[ + 'Programming Language :: Python :: 3.9', + 'License :: OSI Approved :: MIT License', + 'Intended Audience :: Science/Research', + 'Topic :: Scientific/Engineering :: Bio-Informatics', + 'Development Status :: 4 - Beta'] ) From d6c8793c332dd8dfbb943c4241918c0b3cdb2f53 Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Mon, 9 
May 2022 11:41:28 +1000 Subject: [PATCH 092/135] Move to new location locally --- .github/workflows/Test.yml | 0 .github/workflows/test_dev.yml | 0 .travis.yml | 0 Corekaburra/.gitignore | 0 Corekaburra/__init__.py | 0 Corekaburra/__main__.py | 0 Corekaburra/check_inputs.py | 0 Corekaburra/commandline_interface.py | 0 Corekaburra/consesus_core_genome.py | 0 Corekaburra/correct_gffs.py | 0 Corekaburra/exit_with_error.py | 0 Corekaburra/gff_parser.py | 0 Corekaburra/merge_dicts.py | 0 Corekaburra/output_writer_functions.py | 0 Corekaburra/parse_gene_presence_absence.py | 0 Corekaburra/read_complete_genome_file.py | 0 Corekaburra/summary_table.py | 0 Dockerfile | 0 LICENSE | 0 README.md | 0 .../gene_presence_absence.csv | 0 ...e_core_accessory_gene_content.tsv.expected | 0 .../core_pair_summary.csv.expected | 0 .../low_frequency_gene_placement.tsv.expected | 0 .../Change_cutoffs/gene_presence_absence.csv | 0 .../test_data/Complete_double_chromosomes.txt | 0 .../test_data/Complete_single_chromosome.txt | 0 ...e_core_accessory_gene_content.tsv.expected | 0 .../core_pair_summary.csv.expected | 0 ...contig_accessory_gene_content.tsv.expected | 0 .../low_frequency_gene_placement.tsv.expected | 0 .../gene_presence_absence.csv | 0 .../gene_presence_absence.csv | 0 .../gene_presence_absence.csv | 0 .../gene_presence_absence_roary.csv | 0 .../gene_presence_absence.csv | 0 ...e_core_accessory_gene_content.tsv.expected | 0 .../core_pair_summary.csv.expected | 0 .../low_frequency_gene_placement.tsv.expected | 0 .../gene_presence_absence.csv | 0 .../gene_presence_absence.csv | 0 ...e_core_accessory_gene_content.tsv.expected | 0 .../core_pair_summary.csv.expected | 0 .../low_frequency_gene_placement.tsv.expected | 0 ...e_core_accessory_gene_content.tsv.expected | 0 .../core_pair_summary.csv.expected | 0 .../low_frequency_gene_placement.tsv.expected | 0 .../gene_presence_absence.csv | 0 ...e_core_accessory_gene_content.tsv.expected | 0 .../core_pair_summary.csv.expected | 0 
.../core_segments.csv.expected | 0 .../low_frequency_gene_placement.tsv.expected | 0 .../no_accessory_core_segments.csv.expected | 0 ...e_core_accessory_gene_content.tsv.expected | 0 .../core_pair_summary.csv.expected | 0 .../core_segments.csv.expected | 0 .../low_frequency_gene_placement.tsv.expected | 0 .../no_accessory_core_segments.csv.expected | 0 .../gene_presence_absence.csv | 0 ...e_core_accessory_gene_content.tsv.expected | 0 .../core_pair_summary.csv.expected | 0 .../core_segments.csv.expected | 0 .../low_frequency_gene_placement.tsv.expected | 0 .../no_accessory_core_segments.csv.expected | 0 .../Panaroo_run/gene_presence_absence.csv | 0 .../gene_presence_absence_roary.csv | 0 .../Reannotate_run_fail/gene_data.csv | 0 .../gene_presence_absence.csv | 0 .../gene_presence_absence_roary.csv | 0 .../Reannotate_run_succes/gene_data.csv | 0 .../gene_presence_absence.csv | 0 .../gene_presence_absence_roary.csv | 0 ...nome_single_chrom_2_corrected.gff.expected | 0 ...genome_single_chrom_corrected.gff.expected | 0 ...om_larger_rearrange_corrected.gff.expected | 0 ...e_core_accessory_gene_content.tsv.expected | 0 .../core_pair_summary.csv.expected | 0 .../low_frequency_gene_placement.tsv.expected | 0 .../gene_presence_absence.csv | 0 ...e_core_accessory_gene_content.tsv.expected | 0 .../core_pair_summary.csv.expected | 0 .../core_segments.csv.expected | 0 .../low_frequency_gene_placement.tsv.expected | 0 .../no_accessory_core_segments.csv.expected | 0 .../Roray_run/gene_presence_absence.csv | 0 ...e_core_accessory_gene_content.tsv.expected | 0 .../core_pair_summary.csv.expected | 0 .../low_frequency_gene_placement.tsv.expected | 0 ...e_core_accessory_gene_content.tsv.expected | 0 .../core_pair_summary.csv.expected | 0 .../low_frequency_gene_placement.tsv.expected | 0 .../gene_presence_absence.csv | 0 .../gene_presence_absence.csv | 0 .../complete_genome_double_chrom.gff | 0 .../complete_genome_double_chrom_2.gff | 0 .../complete_genome_double_chrom_2_larger.gff | 0 
.../complete_genome_double_chrom_3_larger.gff | 0 .../complete_genome_double_chrom_larger.gff | 0 .../complete_genome_single_chrom.gff | 0 .../complete_genome_single_chrom_2.gff | 0 .../test_data/complete_genomes_file | 0 ...complete_larger_double_chr_genome_list.txt | 0 .../test_data/complete_larger_genome_list.txt | 0 ...e_core_accessory_gene_content.tsv.expected | 0 .../core_pair_summary.csv.expected | 0 .../low_frequency_gene_placement.tsv.expected | 0 ...e_core_accessory_gene_content.tsv.expected | 0 .../core_pair_summary.csv.expected | 0 ...contig_accessory_gene_content.tsv.expected | 0 .../low_frequency_gene_placement.tsv.expected | 0 .../.DS_Store | Bin 6148 -> 0 bytes ...e_core_accessory_gene_content.tsv.expected | 0 .../core_pair_summary.csv.expected | 0 .../low_frequency_gene_placement.tsv.expected | 0 functional_tests/test_data/empty_file | 0 .../test_data/genome_single_chrom_larger.gff | 0 .../genome_single_chrom_larger_2.gff | 0 .../genome_single_chrom_larger_rearrange.gff | 0 ...e_core_accessory_gene_content.tsv.expected | 0 .../core_pair_summary.csv.expected | 0 .../low_frequency_gene_placement.tsv.expected | 0 functional_tests/test_data/no_input.expected | 0 ...e_core_accessory_gene_content.tsv.expected | 0 .../core_pair_summary.csv.expected | 0 .../low_frequency_gene_placement.tsv.expected | 0 ...e_core_accessory_gene_content.tsv.expected | 0 .../core_pair_summary.csv.expected | 0 .../low_frequency_gene_placement.tsv.expected | 0 .../test_out_folder/Corekaburra.log | 40 ++++++++++++++++++ .../core_core_accessory_gene_content.tsv | 1 + .../test_out_folder/core_pair_summary.csv | 4 ++ .../test_out_folder/core_segments.csv | 1 + ...coreless_contig_accessory_gene_content.tsv | 3 ++ .../low_frequency_gene_placement.tsv | 7 +++ .../no_accessory_core_segments.csv | 1 + requirements-dev.txt | 0 setup.py | 0 unit_tests/Corekaburra_test.py | 0 .../reannotate_gff.gff | 0 .../Lilly_the_Shigella.gff | 0 .../Silas_the_Legionella.gff | 0 
.../Silas_the_Salmonella.gff | 0 .../Zion_the_Streptococcus.gff | 0 .../TestExitWithError/.DS_Store | Bin 6148 -> 0 bytes .../TestExitWithError/tmp_folder/test_file | 0 .../TestExtractGenomeFasta/Mock_gff.gff | 0 .../multi_contig_unwrapped.txt | 0 .../multi_contig_wrapped.txt | 0 .../single_contig_unwrapped.txt | 0 .../single_contig_wrapped.txt | 0 .../gene_presence_absence_roary.csv | 0 .../Mock_panaroo/gene_presence_absence.csv | 0 .../gene_presence_absence_roary.csv | 0 .../Mock_roary/gene_presence_absence.csv | 0 .../Mock_unknwon/place_holder_file | 0 .../complete_genomes_file.txt | 0 .../Ajwa_the_Legionella.gff | 0 .../Ajwa_the_Shigella.gff | 0 .../Aman_the_Streptococcus.gff | 0 .../Cari_the_Listeria.gff | 0 .../Christina_the_Streptococcus.gff | 0 .../Corrected_gffs/place_holder | 0 .../Dina_the_Shigella.gff | 0 .../Lilly_the_Shigella.gff | 0 .../Silas_the_Legionella.gff | 0 .../Silas_the_Salmonella.gff | 0 .../Silas_the_Salmonella_w_refound.gff | 0 .../Zion_the_Streptococcus.gff | 0 .../gene_presence_absence.csv | 0 .../gene_presence_absence_roary.csv | 0 ...ne_presence_absence_w_refound_fragment.csv | 0 .../Silas_the_Salmonella.gff | 0 .../Silas_the_Salmonella_corrected.gff | 0 .../Corrected_gff_files/Mock_1_corrected.gff | 0 .../Corrected_gff_files/Mock_2_corrected.gff | 0 .../Mock_gene_data.csv | 0 .../Corrected_gff_files/Mock_1_corrected.gff | 0 .../absent/place_holder_file | 0 .../present/gene_data.csv | 0 .../gene_presence_absence_roary.csv | 0 .../TestReadGeneData/Mock_gene_data.csv | 0 .../test_double_chromosome.gff | 0 .../test_single_chromosome.gff | 0 .../test_triple_chromosome.gff | 0 .../core_segments.txt | 0 .../gene_content.txt | 0 .../TestWritingOutputFunction/low_freq.txt | 0 .../no_acc_segments.txt | 0 .../no_core_contigs.txt | 0 .../summary_table.txt | 0 .../test_tmp_folder/tmp_file_in_tmp_folder | 0 191 files changed, 57 insertions(+) mode change 100644 => 100755 .github/workflows/Test.yml mode change 100644 => 100755 
.github/workflows/test_dev.yml mode change 100644 => 100755 .travis.yml mode change 100644 => 100755 Corekaburra/.gitignore mode change 100644 => 100755 Corekaburra/__init__.py mode change 100644 => 100755 Corekaburra/__main__.py mode change 100644 => 100755 Corekaburra/check_inputs.py mode change 100644 => 100755 Corekaburra/commandline_interface.py mode change 100644 => 100755 Corekaburra/consesus_core_genome.py mode change 100644 => 100755 Corekaburra/correct_gffs.py mode change 100644 => 100755 Corekaburra/exit_with_error.py mode change 100644 => 100755 Corekaburra/gff_parser.py mode change 100644 => 100755 Corekaburra/merge_dicts.py mode change 100644 => 100755 Corekaburra/output_writer_functions.py mode change 100644 => 100755 Corekaburra/parse_gene_presence_absence.py mode change 100644 => 100755 Corekaburra/read_complete_genome_file.py mode change 100644 => 100755 Corekaburra/summary_table.py mode change 100644 => 100755 Dockerfile mode change 100644 => 100755 LICENSE mode change 100644 => 100755 README.md mode change 100644 => 100755 functional_tests/test_data/Accessory_chrom_run/gene_presence_absence.csv mode change 100644 => 100755 functional_tests/test_data/Accessory_chrom_run_expected/core_core_accessory_gene_content.tsv.expected mode change 100644 => 100755 functional_tests/test_data/Accessory_chrom_run_expected/core_pair_summary.csv.expected mode change 100644 => 100755 functional_tests/test_data/Accessory_chrom_run_expected/low_frequency_gene_placement.tsv.expected mode change 100644 => 100755 functional_tests/test_data/Change_cutoffs/gene_presence_absence.csv mode change 100644 => 100755 functional_tests/test_data/Complete_double_chromosomes.txt mode change 100644 => 100755 functional_tests/test_data/Complete_single_chromosome.txt mode change 100644 => 100755 functional_tests/test_data/Coreless_contig_complete_expected/core_core_accessory_gene_content.tsv.expected mode change 100644 => 100755 
functional_tests/test_data/Coreless_contig_complete_expected/core_pair_summary.csv.expected mode change 100644 => 100755 functional_tests/test_data/Coreless_contig_complete_expected/coreless_contig_accessory_gene_content.tsv.expected mode change 100644 => 100755 functional_tests/test_data/Coreless_contig_complete_expected/low_frequency_gene_placement.tsv.expected mode change 100644 => 100755 functional_tests/test_data/Coreless_contig_run/gene_presence_absence.csv mode change 100644 => 100755 functional_tests/test_data/Crash_gff_folder/gene_presence_absence.csv mode change 100644 => 100755 functional_tests/test_data/Crash_panaroo_folder/gene_presence_absence.csv mode change 100644 => 100755 functional_tests/test_data/Crash_panaroo_folder/gene_presence_absence_roary.csv mode change 100644 => 100755 functional_tests/test_data/Fragmented_accessory_gene_run/gene_presence_absence.csv mode change 100644 => 100755 functional_tests/test_data/Fragmented_accessory_gene_run_expected/core_core_accessory_gene_content.tsv.expected mode change 100644 => 100755 functional_tests/test_data/Fragmented_accessory_gene_run_expected/core_pair_summary.csv.expected mode change 100644 => 100755 functional_tests/test_data/Fragmented_accessory_gene_run_expected/low_frequency_gene_placement.tsv.expected mode change 100644 => 100755 functional_tests/test_data/Fragmented_core_gene_break_run/gene_presence_absence.csv mode change 100644 => 100755 functional_tests/test_data/Fragmented_core_gene_run/gene_presence_absence.csv mode change 100644 => 100755 functional_tests/test_data/Fragmented_core_run_expected/core_core_accessory_gene_content.tsv.expected mode change 100644 => 100755 functional_tests/test_data/Fragmented_core_run_expected/core_pair_summary.csv.expected mode change 100644 => 100755 functional_tests/test_data/Fragmented_core_run_expected/low_frequency_gene_placement.tsv.expected mode change 100644 => 100755 
functional_tests/test_data/Increase_low_cutoff_expected/core_core_accessory_gene_content.tsv.expected mode change 100644 => 100755 functional_tests/test_data/Increase_low_cutoff_expected/core_pair_summary.csv.expected mode change 100644 => 100755 functional_tests/test_data/Increase_low_cutoff_expected/low_frequency_gene_placement.tsv.expected mode change 100644 => 100755 functional_tests/test_data/Less_than_all_gffs/gene_presence_absence.csv mode change 100644 => 100755 functional_tests/test_data/Less_than_all_gffs_run_expected/core_core_accessory_gene_content.tsv.expected mode change 100644 => 100755 functional_tests/test_data/Less_than_all_gffs_run_expected/core_pair_summary.csv.expected mode change 100644 => 100755 functional_tests/test_data/Less_than_all_gffs_run_expected/core_segments.csv.expected mode change 100644 => 100755 functional_tests/test_data/Less_than_all_gffs_run_expected/low_frequency_gene_placement.tsv.expected mode change 100644 => 100755 functional_tests/test_data/Less_than_all_gffs_run_expected/no_accessory_core_segments.csv.expected mode change 100644 => 100755 functional_tests/test_data/Multi_component_graph_expected/core_core_accessory_gene_content.tsv.expected mode change 100644 => 100755 functional_tests/test_data/Multi_component_graph_expected/core_pair_summary.csv.expected mode change 100644 => 100755 functional_tests/test_data/Multi_component_graph_expected/core_segments.csv.expected mode change 100644 => 100755 functional_tests/test_data/Multi_component_graph_expected/low_frequency_gene_placement.tsv.expected mode change 100644 => 100755 functional_tests/test_data/Multi_component_graph_expected/no_accessory_core_segments.csv.expected mode change 100644 => 100755 functional_tests/test_data/Multiple_component_graph/gene_presence_absence.csv mode change 100644 => 100755 functional_tests/test_data/Multiple_component_graph_complete_expected/core_core_accessory_gene_content.tsv.expected mode change 100644 => 100755 
functional_tests/test_data/Multiple_component_graph_complete_expected/core_pair_summary.csv.expected mode change 100644 => 100755 functional_tests/test_data/Multiple_component_graph_complete_expected/core_segments.csv.expected mode change 100644 => 100755 functional_tests/test_data/Multiple_component_graph_complete_expected/low_frequency_gene_placement.tsv.expected mode change 100644 => 100755 functional_tests/test_data/Multiple_component_graph_complete_expected/no_accessory_core_segments.csv.expected mode change 100644 => 100755 functional_tests/test_data/Panaroo_run/gene_presence_absence.csv mode change 100644 => 100755 functional_tests/test_data/Panaroo_run/gene_presence_absence_roary.csv mode change 100644 => 100755 functional_tests/test_data/Reannotate_run_fail/gene_data.csv mode change 100644 => 100755 functional_tests/test_data/Reannotate_run_fail/gene_presence_absence.csv mode change 100644 => 100755 functional_tests/test_data/Reannotate_run_fail/gene_presence_absence_roary.csv mode change 100644 => 100755 functional_tests/test_data/Reannotate_run_succes/gene_data.csv mode change 100644 => 100755 functional_tests/test_data/Reannotate_run_succes/gene_presence_absence.csv mode change 100644 => 100755 functional_tests/test_data/Reannotate_run_succes/gene_presence_absence_roary.csv mode change 100644 => 100755 functional_tests/test_data/Reannotation_sucessful_expected/Corrected_gff_files/complete_genome_single_chrom_2_corrected.gff.expected mode change 100644 => 100755 functional_tests/test_data/Reannotation_sucessful_expected/Corrected_gff_files/complete_genome_single_chrom_corrected.gff.expected mode change 100644 => 100755 functional_tests/test_data/Reannotation_sucessful_expected/Corrected_gff_files/genome_single_chrom_larger_rearrange_corrected.gff.expected mode change 100644 => 100755 functional_tests/test_data/Reannotation_sucessful_expected/core_core_accessory_gene_content.tsv.expected mode change 100644 => 100755 
functional_tests/test_data/Reannotation_sucessful_expected/core_pair_summary.csv.expected mode change 100644 => 100755 functional_tests/test_data/Reannotation_sucessful_expected/low_frequency_gene_placement.tsv.expected mode change 100644 => 100755 functional_tests/test_data/Rearrangement_run/gene_presence_absence.csv mode change 100644 => 100755 functional_tests/test_data/Rearrangement_run_expected/core_core_accessory_gene_content.tsv.expected mode change 100644 => 100755 functional_tests/test_data/Rearrangement_run_expected/core_pair_summary.csv.expected mode change 100644 => 100755 functional_tests/test_data/Rearrangement_run_expected/core_segments.csv.expected mode change 100644 => 100755 functional_tests/test_data/Rearrangement_run_expected/low_frequency_gene_placement.tsv.expected mode change 100644 => 100755 functional_tests/test_data/Rearrangement_run_expected/no_accessory_core_segments.csv.expected mode change 100644 => 100755 functional_tests/test_data/Roray_run/gene_presence_absence.csv mode change 100644 => 100755 functional_tests/test_data/Simple_run_expected/core_core_accessory_gene_content.tsv.expected mode change 100644 => 100755 functional_tests/test_data/Simple_run_expected/core_pair_summary.csv.expected mode change 100644 => 100755 functional_tests/test_data/Simple_run_expected/low_frequency_gene_placement.tsv.expected mode change 100644 => 100755 functional_tests/test_data/Single_comple_chromosome_expected/core_core_accessory_gene_content.tsv.expected mode change 100644 => 100755 functional_tests/test_data/Single_comple_chromosome_expected/core_pair_summary.csv.expected mode change 100644 => 100755 functional_tests/test_data/Single_comple_chromosome_expected/low_frequency_gene_placement.tsv.expected mode change 100644 => 100755 functional_tests/test_data/Single_core_contig/gene_presence_absence.csv mode change 100644 => 100755 functional_tests/test_data/complete_double_chromoosme_run/gene_presence_absence.csv mode change 100644 => 100755 
functional_tests/test_data/complete_genome_double_chrom.gff mode change 100644 => 100755 functional_tests/test_data/complete_genome_double_chrom_2.gff mode change 100644 => 100755 functional_tests/test_data/complete_genome_double_chrom_2_larger.gff mode change 100644 => 100755 functional_tests/test_data/complete_genome_double_chrom_3_larger.gff mode change 100644 => 100755 functional_tests/test_data/complete_genome_double_chrom_larger.gff mode change 100644 => 100755 functional_tests/test_data/complete_genome_single_chrom.gff mode change 100644 => 100755 functional_tests/test_data/complete_genome_single_chrom_2.gff mode change 100644 => 100755 functional_tests/test_data/complete_genomes_file mode change 100644 => 100755 functional_tests/test_data/complete_larger_double_chr_genome_list.txt mode change 100644 => 100755 functional_tests/test_data/complete_larger_genome_list.txt mode change 100644 => 100755 functional_tests/test_data/core_90_cutoff_expected/core_core_accessory_gene_content.tsv.expected mode change 100644 => 100755 functional_tests/test_data/core_90_cutoff_expected/core_pair_summary.csv.expected mode change 100644 => 100755 functional_tests/test_data/core_90_cutoff_expected/low_frequency_gene_placement.tsv.expected mode change 100644 => 100755 functional_tests/test_data/coreless_contig_draft_expected/core_core_accessory_gene_content.tsv.expected mode change 100644 => 100755 functional_tests/test_data/coreless_contig_draft_expected/core_pair_summary.csv.expected mode change 100644 => 100755 functional_tests/test_data/coreless_contig_draft_expected/coreless_contig_accessory_gene_content.tsv.expected mode change 100644 => 100755 functional_tests/test_data/coreless_contig_draft_expected/low_frequency_gene_placement.tsv.expected delete mode 100644 functional_tests/test_data/double_comple_chromosome_expected/.DS_Store mode change 100644 => 100755 functional_tests/test_data/double_comple_chromosome_expected/core_core_accessory_gene_content.tsv.expected mode 
change 100644 => 100755 functional_tests/test_data/double_comple_chromosome_expected/core_pair_summary.csv.expected mode change 100644 => 100755 functional_tests/test_data/double_comple_chromosome_expected/low_frequency_gene_placement.tsv.expected mode change 100644 => 100755 functional_tests/test_data/empty_file mode change 100644 => 100755 functional_tests/test_data/genome_single_chrom_larger.gff mode change 100644 => 100755 functional_tests/test_data/genome_single_chrom_larger_2.gff mode change 100644 => 100755 functional_tests/test_data/genome_single_chrom_larger_rearrange.gff mode change 100644 => 100755 functional_tests/test_data/low_freq_cutoff_0_expected/core_core_accessory_gene_content.tsv.expected mode change 100644 => 100755 functional_tests/test_data/low_freq_cutoff_0_expected/core_pair_summary.csv.expected mode change 100644 => 100755 functional_tests/test_data/low_freq_cutoff_0_expected/low_frequency_gene_placement.tsv.expected mode change 100644 => 100755 functional_tests/test_data/no_input.expected mode change 100644 => 100755 functional_tests/test_data/single_core_contig_complete_expected/core_core_accessory_gene_content.tsv.expected mode change 100644 => 100755 functional_tests/test_data/single_core_contig_complete_expected/core_pair_summary.csv.expected mode change 100644 => 100755 functional_tests/test_data/single_core_contig_complete_expected/low_frequency_gene_placement.tsv.expected mode change 100644 => 100755 functional_tests/test_data/single_core_contig_draft_expected/core_core_accessory_gene_content.tsv.expected mode change 100644 => 100755 functional_tests/test_data/single_core_contig_draft_expected/core_pair_summary.csv.expected mode change 100644 => 100755 functional_tests/test_data/single_core_contig_draft_expected/low_frequency_gene_placement.tsv.expected create mode 100755 functional_tests/test_out_folder/Corekaburra.log create mode 100755 functional_tests/test_out_folder/core_core_accessory_gene_content.tsv create mode 100755 
functional_tests/test_out_folder/core_pair_summary.csv create mode 100755 functional_tests/test_out_folder/core_segments.csv create mode 100755 functional_tests/test_out_folder/coreless_contig_accessory_gene_content.tsv create mode 100755 functional_tests/test_out_folder/low_frequency_gene_placement.tsv create mode 100755 functional_tests/test_out_folder/no_accessory_core_segments.csv mode change 100644 => 100755 requirements-dev.txt mode change 100644 => 100755 setup.py mode change 100644 => 100755 unit_tests/Corekaburra_test.py mode change 100644 => 100755 unit_tests/unit_test_data/TestAnnotateRefoundGenomes/reannotate_gff.gff mode change 100644 => 100755 unit_tests/unit_test_data/TestCheckingFragmentedGenes/Lilly_the_Shigella.gff mode change 100644 => 100755 unit_tests/unit_test_data/TestCheckingFragmentedGenes/Silas_the_Legionella.gff mode change 100644 => 100755 unit_tests/unit_test_data/TestCheckingFragmentedGenes/Silas_the_Salmonella.gff mode change 100644 => 100755 unit_tests/unit_test_data/TestCheckingFragmentedGenes/Zion_the_Streptococcus.gff delete mode 100644 unit_tests/unit_test_data/TestExitWithError/.DS_Store mode change 100644 => 100755 unit_tests/unit_test_data/TestExitWithError/tmp_folder/test_file mode change 100644 => 100755 unit_tests/unit_test_data/TestExtractGenomeFasta/Mock_gff.gff mode change 100644 => 100755 unit_tests/unit_test_data/TestGetContigLenth/multi_contig_unwrapped.txt mode change 100644 => 100755 unit_tests/unit_test_data/TestGetContigLenth/multi_contig_wrapped.txt mode change 100644 => 100755 unit_tests/unit_test_data/TestGetContigLenth/single_contig_unwrapped.txt mode change 100644 => 100755 unit_tests/unit_test_data/TestGetContigLenth/single_contig_wrapped.txt mode change 100644 => 100755 unit_tests/unit_test_data/TestPangenomeSourceProgram/Mock_minimal_panaroo/gene_presence_absence_roary.csv mode change 100644 => 100755 unit_tests/unit_test_data/TestPangenomeSourceProgram/Mock_panaroo/gene_presence_absence.csv mode change 
100644 => 100755 unit_tests/unit_test_data/TestPangenomeSourceProgram/Mock_panaroo/gene_presence_absence_roary.csv mode change 100644 => 100755 unit_tests/unit_test_data/TestPangenomeSourceProgram/Mock_roary/gene_presence_absence.csv mode change 100644 => 100755 unit_tests/unit_test_data/TestPangenomeSourceProgram/Mock_unknwon/place_holder_file mode change 100644 => 100755 unit_tests/unit_test_data/TestParsingCompleteGenomes/complete_genomes_file.txt mode change 100644 => 100755 unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/Ajwa_the_Legionella.gff mode change 100644 => 100755 unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/Ajwa_the_Shigella.gff mode change 100644 => 100755 unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/Aman_the_Streptococcus.gff mode change 100644 => 100755 unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/Cari_the_Listeria.gff mode change 100644 => 100755 unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/Christina_the_Streptococcus.gff mode change 100644 => 100755 unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/Corrected_gffs/place_holder mode change 100644 => 100755 unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/Dina_the_Shigella.gff mode change 100644 => 100755 unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/Lilly_the_Shigella.gff mode change 100644 => 100755 unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/Silas_the_Legionella.gff mode change 100644 => 100755 unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/Silas_the_Salmonella.gff mode change 100644 => 100755 unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/Silas_the_Salmonella_w_refound.gff mode change 100644 => 100755 unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/Zion_the_Streptococcus.gff mode change 100644 => 100755 unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/gene_presence_absence.csv mode change 100644 => 100755 
unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/gene_presence_absence_roary.csv mode change 100644 => 100755 unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/gene_presence_absence_w_refound_fragment.csv mode change 100644 => 100755 unit_tests/unit_test_data/TestParsingGffFile/Silas_the_Salmonella.gff mode change 100644 => 100755 unit_tests/unit_test_data/TestParsingGffFile/Silas_the_Salmonella_corrected.gff mode change 100644 => 100755 unit_tests/unit_test_data/TestPrepairForReannotation/All_genomes/Corrected_gff_files/Mock_1_corrected.gff mode change 100644 => 100755 unit_tests/unit_test_data/TestPrepairForReannotation/All_genomes/Corrected_gff_files/Mock_2_corrected.gff mode change 100644 => 100755 unit_tests/unit_test_data/TestPrepairForReannotation/Mock_gene_data.csv mode change 100644 => 100755 unit_tests/unit_test_data/TestPrepairForReannotation/Some_genomes/Corrected_gff_files/Mock_1_corrected.gff mode change 100644 => 100755 unit_tests/unit_test_data/TestPresenceOfGenedataFile/absent/place_holder_file mode change 100644 => 100755 unit_tests/unit_test_data/TestPresenceOfGenedataFile/present/gene_data.csv mode change 100644 => 100755 unit_tests/unit_test_data/TestPresenceOfGffsInPresAbsFile/gene_presence_absence_roary.csv mode change 100644 => 100755 unit_tests/unit_test_data/TestReadGeneData/Mock_gene_data.csv mode change 100644 => 100755 unit_tests/unit_test_data/TestSegmentingMockGffs/test_double_chromosome.gff mode change 100644 => 100755 unit_tests/unit_test_data/TestSegmentingMockGffs/test_single_chromosome.gff mode change 100644 => 100755 unit_tests/unit_test_data/TestSegmentingMockGffs/test_triple_chromosome.gff mode change 100644 => 100755 unit_tests/unit_test_data/TestWritingOutputFunction/core_segments.txt mode change 100644 => 100755 unit_tests/unit_test_data/TestWritingOutputFunction/gene_content.txt mode change 100644 => 100755 unit_tests/unit_test_data/TestWritingOutputFunction/low_freq.txt mode change 100644 => 100755 
unit_tests/unit_test_data/TestWritingOutputFunction/no_acc_segments.txt mode change 100644 => 100755 unit_tests/unit_test_data/TestWritingOutputFunction/no_core_contigs.txt mode change 100644 => 100755 unit_tests/unit_test_data/TestWritingOutputFunction/summary_table.txt mode change 100644 => 100755 unit_tests/unit_test_data/test_tmp_folder/tmp_file_in_tmp_folder diff --git a/.github/workflows/Test.yml b/.github/workflows/Test.yml old mode 100644 new mode 100755 diff --git a/.github/workflows/test_dev.yml b/.github/workflows/test_dev.yml old mode 100644 new mode 100755 diff --git a/.travis.yml b/.travis.yml old mode 100644 new mode 100755 diff --git a/Corekaburra/.gitignore b/Corekaburra/.gitignore old mode 100644 new mode 100755 diff --git a/Corekaburra/__init__.py b/Corekaburra/__init__.py old mode 100644 new mode 100755 diff --git a/Corekaburra/__main__.py b/Corekaburra/__main__.py old mode 100644 new mode 100755 diff --git a/Corekaburra/check_inputs.py b/Corekaburra/check_inputs.py old mode 100644 new mode 100755 diff --git a/Corekaburra/commandline_interface.py b/Corekaburra/commandline_interface.py old mode 100644 new mode 100755 diff --git a/Corekaburra/consesus_core_genome.py b/Corekaburra/consesus_core_genome.py old mode 100644 new mode 100755 diff --git a/Corekaburra/correct_gffs.py b/Corekaburra/correct_gffs.py old mode 100644 new mode 100755 diff --git a/Corekaburra/exit_with_error.py b/Corekaburra/exit_with_error.py old mode 100644 new mode 100755 diff --git a/Corekaburra/gff_parser.py b/Corekaburra/gff_parser.py old mode 100644 new mode 100755 diff --git a/Corekaburra/merge_dicts.py b/Corekaburra/merge_dicts.py old mode 100644 new mode 100755 diff --git a/Corekaburra/output_writer_functions.py b/Corekaburra/output_writer_functions.py old mode 100644 new mode 100755 diff --git a/Corekaburra/parse_gene_presence_absence.py b/Corekaburra/parse_gene_presence_absence.py old mode 100644 new mode 100755 diff --git a/Corekaburra/read_complete_genome_file.py 
b/Corekaburra/read_complete_genome_file.py old mode 100644 new mode 100755 diff --git a/Corekaburra/summary_table.py b/Corekaburra/summary_table.py old mode 100644 new mode 100755 diff --git a/Dockerfile b/Dockerfile old mode 100644 new mode 100755 diff --git a/LICENSE b/LICENSE old mode 100644 new mode 100755 diff --git a/README.md b/README.md old mode 100644 new mode 100755 diff --git a/functional_tests/test_data/Accessory_chrom_run/gene_presence_absence.csv b/functional_tests/test_data/Accessory_chrom_run/gene_presence_absence.csv old mode 100644 new mode 100755 diff --git a/functional_tests/test_data/Accessory_chrom_run_expected/core_core_accessory_gene_content.tsv.expected b/functional_tests/test_data/Accessory_chrom_run_expected/core_core_accessory_gene_content.tsv.expected old mode 100644 new mode 100755 diff --git a/functional_tests/test_data/Accessory_chrom_run_expected/core_pair_summary.csv.expected b/functional_tests/test_data/Accessory_chrom_run_expected/core_pair_summary.csv.expected old mode 100644 new mode 100755 diff --git a/functional_tests/test_data/Accessory_chrom_run_expected/low_frequency_gene_placement.tsv.expected b/functional_tests/test_data/Accessory_chrom_run_expected/low_frequency_gene_placement.tsv.expected old mode 100644 new mode 100755 diff --git a/functional_tests/test_data/Change_cutoffs/gene_presence_absence.csv b/functional_tests/test_data/Change_cutoffs/gene_presence_absence.csv old mode 100644 new mode 100755 diff --git a/functional_tests/test_data/Complete_double_chromosomes.txt b/functional_tests/test_data/Complete_double_chromosomes.txt old mode 100644 new mode 100755 diff --git a/functional_tests/test_data/Complete_single_chromosome.txt b/functional_tests/test_data/Complete_single_chromosome.txt old mode 100644 new mode 100755 diff --git a/functional_tests/test_data/Coreless_contig_complete_expected/core_core_accessory_gene_content.tsv.expected 
b/functional_tests/test_data/Coreless_contig_complete_expected/core_core_accessory_gene_content.tsv.expected old mode 100644 new mode 100755 diff --git a/functional_tests/test_data/Coreless_contig_complete_expected/core_pair_summary.csv.expected b/functional_tests/test_data/Coreless_contig_complete_expected/core_pair_summary.csv.expected old mode 100644 new mode 100755 diff --git a/functional_tests/test_data/Coreless_contig_complete_expected/coreless_contig_accessory_gene_content.tsv.expected b/functional_tests/test_data/Coreless_contig_complete_expected/coreless_contig_accessory_gene_content.tsv.expected old mode 100644 new mode 100755 diff --git a/functional_tests/test_data/Coreless_contig_complete_expected/low_frequency_gene_placement.tsv.expected b/functional_tests/test_data/Coreless_contig_complete_expected/low_frequency_gene_placement.tsv.expected old mode 100644 new mode 100755 diff --git a/functional_tests/test_data/Coreless_contig_run/gene_presence_absence.csv b/functional_tests/test_data/Coreless_contig_run/gene_presence_absence.csv old mode 100644 new mode 100755 diff --git a/functional_tests/test_data/Crash_gff_folder/gene_presence_absence.csv b/functional_tests/test_data/Crash_gff_folder/gene_presence_absence.csv old mode 100644 new mode 100755 diff --git a/functional_tests/test_data/Crash_panaroo_folder/gene_presence_absence.csv b/functional_tests/test_data/Crash_panaroo_folder/gene_presence_absence.csv old mode 100644 new mode 100755 diff --git a/functional_tests/test_data/Crash_panaroo_folder/gene_presence_absence_roary.csv b/functional_tests/test_data/Crash_panaroo_folder/gene_presence_absence_roary.csv old mode 100644 new mode 100755 diff --git a/functional_tests/test_data/Fragmented_accessory_gene_run/gene_presence_absence.csv b/functional_tests/test_data/Fragmented_accessory_gene_run/gene_presence_absence.csv old mode 100644 new mode 100755 diff --git 
a/functional_tests/test_data/Fragmented_accessory_gene_run_expected/core_core_accessory_gene_content.tsv.expected b/functional_tests/test_data/Fragmented_accessory_gene_run_expected/core_core_accessory_gene_content.tsv.expected old mode 100644 new mode 100755 diff --git a/functional_tests/test_data/Fragmented_accessory_gene_run_expected/core_pair_summary.csv.expected b/functional_tests/test_data/Fragmented_accessory_gene_run_expected/core_pair_summary.csv.expected old mode 100644 new mode 100755 diff --git a/functional_tests/test_data/Fragmented_accessory_gene_run_expected/low_frequency_gene_placement.tsv.expected b/functional_tests/test_data/Fragmented_accessory_gene_run_expected/low_frequency_gene_placement.tsv.expected old mode 100644 new mode 100755 diff --git a/functional_tests/test_data/Fragmented_core_gene_break_run/gene_presence_absence.csv b/functional_tests/test_data/Fragmented_core_gene_break_run/gene_presence_absence.csv old mode 100644 new mode 100755 diff --git a/functional_tests/test_data/Fragmented_core_gene_run/gene_presence_absence.csv b/functional_tests/test_data/Fragmented_core_gene_run/gene_presence_absence.csv old mode 100644 new mode 100755 diff --git a/functional_tests/test_data/Fragmented_core_run_expected/core_core_accessory_gene_content.tsv.expected b/functional_tests/test_data/Fragmented_core_run_expected/core_core_accessory_gene_content.tsv.expected old mode 100644 new mode 100755 diff --git a/functional_tests/test_data/Fragmented_core_run_expected/core_pair_summary.csv.expected b/functional_tests/test_data/Fragmented_core_run_expected/core_pair_summary.csv.expected old mode 100644 new mode 100755 diff --git a/functional_tests/test_data/Fragmented_core_run_expected/low_frequency_gene_placement.tsv.expected b/functional_tests/test_data/Fragmented_core_run_expected/low_frequency_gene_placement.tsv.expected old mode 100644 new mode 100755 diff --git 
a/functional_tests/test_data/Increase_low_cutoff_expected/core_core_accessory_gene_content.tsv.expected b/functional_tests/test_data/Increase_low_cutoff_expected/core_core_accessory_gene_content.tsv.expected old mode 100644 new mode 100755 diff --git a/functional_tests/test_data/Increase_low_cutoff_expected/core_pair_summary.csv.expected b/functional_tests/test_data/Increase_low_cutoff_expected/core_pair_summary.csv.expected old mode 100644 new mode 100755 diff --git a/functional_tests/test_data/Increase_low_cutoff_expected/low_frequency_gene_placement.tsv.expected b/functional_tests/test_data/Increase_low_cutoff_expected/low_frequency_gene_placement.tsv.expected old mode 100644 new mode 100755 diff --git a/functional_tests/test_data/Less_than_all_gffs/gene_presence_absence.csv b/functional_tests/test_data/Less_than_all_gffs/gene_presence_absence.csv old mode 100644 new mode 100755 diff --git a/functional_tests/test_data/Less_than_all_gffs_run_expected/core_core_accessory_gene_content.tsv.expected b/functional_tests/test_data/Less_than_all_gffs_run_expected/core_core_accessory_gene_content.tsv.expected old mode 100644 new mode 100755 diff --git a/functional_tests/test_data/Less_than_all_gffs_run_expected/core_pair_summary.csv.expected b/functional_tests/test_data/Less_than_all_gffs_run_expected/core_pair_summary.csv.expected old mode 100644 new mode 100755 diff --git a/functional_tests/test_data/Less_than_all_gffs_run_expected/core_segments.csv.expected b/functional_tests/test_data/Less_than_all_gffs_run_expected/core_segments.csv.expected old mode 100644 new mode 100755 diff --git a/functional_tests/test_data/Less_than_all_gffs_run_expected/low_frequency_gene_placement.tsv.expected b/functional_tests/test_data/Less_than_all_gffs_run_expected/low_frequency_gene_placement.tsv.expected old mode 100644 new mode 100755 diff --git a/functional_tests/test_data/Less_than_all_gffs_run_expected/no_accessory_core_segments.csv.expected 
b/functional_tests/test_data/Less_than_all_gffs_run_expected/no_accessory_core_segments.csv.expected old mode 100644 new mode 100755 diff --git a/functional_tests/test_data/Multi_component_graph_expected/core_core_accessory_gene_content.tsv.expected b/functional_tests/test_data/Multi_component_graph_expected/core_core_accessory_gene_content.tsv.expected old mode 100644 new mode 100755 diff --git a/functional_tests/test_data/Multi_component_graph_expected/core_pair_summary.csv.expected b/functional_tests/test_data/Multi_component_graph_expected/core_pair_summary.csv.expected old mode 100644 new mode 100755 diff --git a/functional_tests/test_data/Multi_component_graph_expected/core_segments.csv.expected b/functional_tests/test_data/Multi_component_graph_expected/core_segments.csv.expected old mode 100644 new mode 100755 diff --git a/functional_tests/test_data/Multi_component_graph_expected/low_frequency_gene_placement.tsv.expected b/functional_tests/test_data/Multi_component_graph_expected/low_frequency_gene_placement.tsv.expected old mode 100644 new mode 100755 diff --git a/functional_tests/test_data/Multi_component_graph_expected/no_accessory_core_segments.csv.expected b/functional_tests/test_data/Multi_component_graph_expected/no_accessory_core_segments.csv.expected old mode 100644 new mode 100755 diff --git a/functional_tests/test_data/Multiple_component_graph/gene_presence_absence.csv b/functional_tests/test_data/Multiple_component_graph/gene_presence_absence.csv old mode 100644 new mode 100755 diff --git a/functional_tests/test_data/Multiple_component_graph_complete_expected/core_core_accessory_gene_content.tsv.expected b/functional_tests/test_data/Multiple_component_graph_complete_expected/core_core_accessory_gene_content.tsv.expected old mode 100644 new mode 100755 diff --git a/functional_tests/test_data/Multiple_component_graph_complete_expected/core_pair_summary.csv.expected 
b/functional_tests/test_data/Multiple_component_graph_complete_expected/core_pair_summary.csv.expected old mode 100644 new mode 100755 diff --git a/functional_tests/test_data/Multiple_component_graph_complete_expected/core_segments.csv.expected b/functional_tests/test_data/Multiple_component_graph_complete_expected/core_segments.csv.expected old mode 100644 new mode 100755 diff --git a/functional_tests/test_data/Multiple_component_graph_complete_expected/low_frequency_gene_placement.tsv.expected b/functional_tests/test_data/Multiple_component_graph_complete_expected/low_frequency_gene_placement.tsv.expected old mode 100644 new mode 100755 diff --git a/functional_tests/test_data/Multiple_component_graph_complete_expected/no_accessory_core_segments.csv.expected b/functional_tests/test_data/Multiple_component_graph_complete_expected/no_accessory_core_segments.csv.expected old mode 100644 new mode 100755 diff --git a/functional_tests/test_data/Panaroo_run/gene_presence_absence.csv b/functional_tests/test_data/Panaroo_run/gene_presence_absence.csv old mode 100644 new mode 100755 diff --git a/functional_tests/test_data/Panaroo_run/gene_presence_absence_roary.csv b/functional_tests/test_data/Panaroo_run/gene_presence_absence_roary.csv old mode 100644 new mode 100755 diff --git a/functional_tests/test_data/Reannotate_run_fail/gene_data.csv b/functional_tests/test_data/Reannotate_run_fail/gene_data.csv old mode 100644 new mode 100755 diff --git a/functional_tests/test_data/Reannotate_run_fail/gene_presence_absence.csv b/functional_tests/test_data/Reannotate_run_fail/gene_presence_absence.csv old mode 100644 new mode 100755 diff --git a/functional_tests/test_data/Reannotate_run_fail/gene_presence_absence_roary.csv b/functional_tests/test_data/Reannotate_run_fail/gene_presence_absence_roary.csv old mode 100644 new mode 100755 diff --git a/functional_tests/test_data/Reannotate_run_succes/gene_data.csv b/functional_tests/test_data/Reannotate_run_succes/gene_data.csv old mode 
100644 new mode 100755 diff --git a/functional_tests/test_data/Reannotate_run_succes/gene_presence_absence.csv b/functional_tests/test_data/Reannotate_run_succes/gene_presence_absence.csv old mode 100644 new mode 100755 diff --git a/functional_tests/test_data/Reannotate_run_succes/gene_presence_absence_roary.csv b/functional_tests/test_data/Reannotate_run_succes/gene_presence_absence_roary.csv old mode 100644 new mode 100755 diff --git a/functional_tests/test_data/Reannotation_sucessful_expected/Corrected_gff_files/complete_genome_single_chrom_2_corrected.gff.expected b/functional_tests/test_data/Reannotation_sucessful_expected/Corrected_gff_files/complete_genome_single_chrom_2_corrected.gff.expected old mode 100644 new mode 100755 diff --git a/functional_tests/test_data/Reannotation_sucessful_expected/Corrected_gff_files/complete_genome_single_chrom_corrected.gff.expected b/functional_tests/test_data/Reannotation_sucessful_expected/Corrected_gff_files/complete_genome_single_chrom_corrected.gff.expected old mode 100644 new mode 100755 diff --git a/functional_tests/test_data/Reannotation_sucessful_expected/Corrected_gff_files/genome_single_chrom_larger_rearrange_corrected.gff.expected b/functional_tests/test_data/Reannotation_sucessful_expected/Corrected_gff_files/genome_single_chrom_larger_rearrange_corrected.gff.expected old mode 100644 new mode 100755 diff --git a/functional_tests/test_data/Reannotation_sucessful_expected/core_core_accessory_gene_content.tsv.expected b/functional_tests/test_data/Reannotation_sucessful_expected/core_core_accessory_gene_content.tsv.expected old mode 100644 new mode 100755 diff --git a/functional_tests/test_data/Reannotation_sucessful_expected/core_pair_summary.csv.expected b/functional_tests/test_data/Reannotation_sucessful_expected/core_pair_summary.csv.expected old mode 100644 new mode 100755 diff --git a/functional_tests/test_data/Reannotation_sucessful_expected/low_frequency_gene_placement.tsv.expected 
b/functional_tests/test_data/Reannotation_sucessful_expected/low_frequency_gene_placement.tsv.expected old mode 100644 new mode 100755 diff --git a/functional_tests/test_data/Rearrangement_run/gene_presence_absence.csv b/functional_tests/test_data/Rearrangement_run/gene_presence_absence.csv old mode 100644 new mode 100755 diff --git a/functional_tests/test_data/Rearrangement_run_expected/core_core_accessory_gene_content.tsv.expected b/functional_tests/test_data/Rearrangement_run_expected/core_core_accessory_gene_content.tsv.expected old mode 100644 new mode 100755 diff --git a/functional_tests/test_data/Rearrangement_run_expected/core_pair_summary.csv.expected b/functional_tests/test_data/Rearrangement_run_expected/core_pair_summary.csv.expected old mode 100644 new mode 100755 diff --git a/functional_tests/test_data/Rearrangement_run_expected/core_segments.csv.expected b/functional_tests/test_data/Rearrangement_run_expected/core_segments.csv.expected old mode 100644 new mode 100755 diff --git a/functional_tests/test_data/Rearrangement_run_expected/low_frequency_gene_placement.tsv.expected b/functional_tests/test_data/Rearrangement_run_expected/low_frequency_gene_placement.tsv.expected old mode 100644 new mode 100755 diff --git a/functional_tests/test_data/Rearrangement_run_expected/no_accessory_core_segments.csv.expected b/functional_tests/test_data/Rearrangement_run_expected/no_accessory_core_segments.csv.expected old mode 100644 new mode 100755 diff --git a/functional_tests/test_data/Roray_run/gene_presence_absence.csv b/functional_tests/test_data/Roray_run/gene_presence_absence.csv old mode 100644 new mode 100755 diff --git a/functional_tests/test_data/Simple_run_expected/core_core_accessory_gene_content.tsv.expected b/functional_tests/test_data/Simple_run_expected/core_core_accessory_gene_content.tsv.expected old mode 100644 new mode 100755 diff --git a/functional_tests/test_data/Simple_run_expected/core_pair_summary.csv.expected 
b/functional_tests/test_data/Simple_run_expected/core_pair_summary.csv.expected old mode 100644 new mode 100755 diff --git a/functional_tests/test_data/Simple_run_expected/low_frequency_gene_placement.tsv.expected b/functional_tests/test_data/Simple_run_expected/low_frequency_gene_placement.tsv.expected old mode 100644 new mode 100755 diff --git a/functional_tests/test_data/Single_comple_chromosome_expected/core_core_accessory_gene_content.tsv.expected b/functional_tests/test_data/Single_comple_chromosome_expected/core_core_accessory_gene_content.tsv.expected old mode 100644 new mode 100755 diff --git a/functional_tests/test_data/Single_comple_chromosome_expected/core_pair_summary.csv.expected b/functional_tests/test_data/Single_comple_chromosome_expected/core_pair_summary.csv.expected old mode 100644 new mode 100755 diff --git a/functional_tests/test_data/Single_comple_chromosome_expected/low_frequency_gene_placement.tsv.expected b/functional_tests/test_data/Single_comple_chromosome_expected/low_frequency_gene_placement.tsv.expected old mode 100644 new mode 100755 diff --git a/functional_tests/test_data/Single_core_contig/gene_presence_absence.csv b/functional_tests/test_data/Single_core_contig/gene_presence_absence.csv old mode 100644 new mode 100755 diff --git a/functional_tests/test_data/complete_double_chromoosme_run/gene_presence_absence.csv b/functional_tests/test_data/complete_double_chromoosme_run/gene_presence_absence.csv old mode 100644 new mode 100755 diff --git a/functional_tests/test_data/complete_genome_double_chrom.gff b/functional_tests/test_data/complete_genome_double_chrom.gff old mode 100644 new mode 100755 diff --git a/functional_tests/test_data/complete_genome_double_chrom_2.gff b/functional_tests/test_data/complete_genome_double_chrom_2.gff old mode 100644 new mode 100755 diff --git a/functional_tests/test_data/complete_genome_double_chrom_2_larger.gff b/functional_tests/test_data/complete_genome_double_chrom_2_larger.gff old mode 100644 new 
mode 100755 diff --git a/functional_tests/test_data/complete_genome_double_chrom_3_larger.gff b/functional_tests/test_data/complete_genome_double_chrom_3_larger.gff old mode 100644 new mode 100755 diff --git a/functional_tests/test_data/complete_genome_double_chrom_larger.gff b/functional_tests/test_data/complete_genome_double_chrom_larger.gff old mode 100644 new mode 100755 diff --git a/functional_tests/test_data/complete_genome_single_chrom.gff b/functional_tests/test_data/complete_genome_single_chrom.gff old mode 100644 new mode 100755 diff --git a/functional_tests/test_data/complete_genome_single_chrom_2.gff b/functional_tests/test_data/complete_genome_single_chrom_2.gff old mode 100644 new mode 100755 diff --git a/functional_tests/test_data/complete_genomes_file b/functional_tests/test_data/complete_genomes_file old mode 100644 new mode 100755 diff --git a/functional_tests/test_data/complete_larger_double_chr_genome_list.txt b/functional_tests/test_data/complete_larger_double_chr_genome_list.txt old mode 100644 new mode 100755 diff --git a/functional_tests/test_data/complete_larger_genome_list.txt b/functional_tests/test_data/complete_larger_genome_list.txt old mode 100644 new mode 100755 diff --git a/functional_tests/test_data/core_90_cutoff_expected/core_core_accessory_gene_content.tsv.expected b/functional_tests/test_data/core_90_cutoff_expected/core_core_accessory_gene_content.tsv.expected old mode 100644 new mode 100755 diff --git a/functional_tests/test_data/core_90_cutoff_expected/core_pair_summary.csv.expected b/functional_tests/test_data/core_90_cutoff_expected/core_pair_summary.csv.expected old mode 100644 new mode 100755 diff --git a/functional_tests/test_data/core_90_cutoff_expected/low_frequency_gene_placement.tsv.expected b/functional_tests/test_data/core_90_cutoff_expected/low_frequency_gene_placement.tsv.expected old mode 100644 new mode 100755 diff --git 
a/functional_tests/test_data/coreless_contig_draft_expected/core_core_accessory_gene_content.tsv.expected b/functional_tests/test_data/coreless_contig_draft_expected/core_core_accessory_gene_content.tsv.expected old mode 100644 new mode 100755 diff --git a/functional_tests/test_data/coreless_contig_draft_expected/core_pair_summary.csv.expected b/functional_tests/test_data/coreless_contig_draft_expected/core_pair_summary.csv.expected old mode 100644 new mode 100755 diff --git a/functional_tests/test_data/coreless_contig_draft_expected/coreless_contig_accessory_gene_content.tsv.expected b/functional_tests/test_data/coreless_contig_draft_expected/coreless_contig_accessory_gene_content.tsv.expected old mode 100644 new mode 100755 diff --git a/functional_tests/test_data/coreless_contig_draft_expected/low_frequency_gene_placement.tsv.expected b/functional_tests/test_data/coreless_contig_draft_expected/low_frequency_gene_placement.tsv.expected old mode 100644 new mode 100755 diff --git a/functional_tests/test_data/double_comple_chromosome_expected/.DS_Store b/functional_tests/test_data/double_comple_chromosome_expected/.DS_Store deleted file mode 100644 index 5008ddfcf53c02e82d7eee2e57c38e5672ef89f6..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeH~Jr2S!425mzP>H1@V-^m;4Wg<&0T*E43hX&L&p$$qDprKhvt+--jT7}7np#A3 zem<@ulZcFPQ@L2!n>{z**++&mCkOWA81W14cNZlEfg7;MkzE(HCqgga^y>{tEnwC%0;vJ&^%eQ zLs35+`xjp>T0> zvftSAvGb(Z9wOq+W49(+5mAFC$f8t4rmLnSb6x_u+&K(M6z3Eg><%FdAk>SRkyS zKn-OpF<8SfAIvWrW2L2fXyll7a1|Ma2>%r&AT^rD@&_u*9ivoc@djw!0=g4U>sy&Dfzi5~hC5z0L QaG*Z~3L)M(1HZt)JJ$I!9{>OV diff --git a/unit_tests/unit_test_data/TestExitWithError/tmp_folder/test_file b/unit_tests/unit_test_data/TestExitWithError/tmp_folder/test_file old mode 100644 new mode 100755 diff --git a/unit_tests/unit_test_data/TestExtractGenomeFasta/Mock_gff.gff b/unit_tests/unit_test_data/TestExtractGenomeFasta/Mock_gff.gff old mode 100644 new mode 100755 diff --git 
a/unit_tests/unit_test_data/TestGetContigLenth/multi_contig_unwrapped.txt b/unit_tests/unit_test_data/TestGetContigLenth/multi_contig_unwrapped.txt old mode 100644 new mode 100755 diff --git a/unit_tests/unit_test_data/TestGetContigLenth/multi_contig_wrapped.txt b/unit_tests/unit_test_data/TestGetContigLenth/multi_contig_wrapped.txt old mode 100644 new mode 100755 diff --git a/unit_tests/unit_test_data/TestGetContigLenth/single_contig_unwrapped.txt b/unit_tests/unit_test_data/TestGetContigLenth/single_contig_unwrapped.txt old mode 100644 new mode 100755 diff --git a/unit_tests/unit_test_data/TestGetContigLenth/single_contig_wrapped.txt b/unit_tests/unit_test_data/TestGetContigLenth/single_contig_wrapped.txt old mode 100644 new mode 100755 diff --git a/unit_tests/unit_test_data/TestPangenomeSourceProgram/Mock_minimal_panaroo/gene_presence_absence_roary.csv b/unit_tests/unit_test_data/TestPangenomeSourceProgram/Mock_minimal_panaroo/gene_presence_absence_roary.csv old mode 100644 new mode 100755 diff --git a/unit_tests/unit_test_data/TestPangenomeSourceProgram/Mock_panaroo/gene_presence_absence.csv b/unit_tests/unit_test_data/TestPangenomeSourceProgram/Mock_panaroo/gene_presence_absence.csv old mode 100644 new mode 100755 diff --git a/unit_tests/unit_test_data/TestPangenomeSourceProgram/Mock_panaroo/gene_presence_absence_roary.csv b/unit_tests/unit_test_data/TestPangenomeSourceProgram/Mock_panaroo/gene_presence_absence_roary.csv old mode 100644 new mode 100755 diff --git a/unit_tests/unit_test_data/TestPangenomeSourceProgram/Mock_roary/gene_presence_absence.csv b/unit_tests/unit_test_data/TestPangenomeSourceProgram/Mock_roary/gene_presence_absence.csv old mode 100644 new mode 100755 diff --git a/unit_tests/unit_test_data/TestPangenomeSourceProgram/Mock_unknwon/place_holder_file b/unit_tests/unit_test_data/TestPangenomeSourceProgram/Mock_unknwon/place_holder_file old mode 100644 new mode 100755 diff --git 
a/unit_tests/unit_test_data/TestParsingCompleteGenomes/complete_genomes_file.txt b/unit_tests/unit_test_data/TestParsingCompleteGenomes/complete_genomes_file.txt old mode 100644 new mode 100755 diff --git a/unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/Ajwa_the_Legionella.gff b/unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/Ajwa_the_Legionella.gff old mode 100644 new mode 100755 diff --git a/unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/Ajwa_the_Shigella.gff b/unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/Ajwa_the_Shigella.gff old mode 100644 new mode 100755 diff --git a/unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/Aman_the_Streptococcus.gff b/unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/Aman_the_Streptococcus.gff old mode 100644 new mode 100755 diff --git a/unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/Cari_the_Listeria.gff b/unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/Cari_the_Listeria.gff old mode 100644 new mode 100755 diff --git a/unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/Christina_the_Streptococcus.gff b/unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/Christina_the_Streptococcus.gff old mode 100644 new mode 100755 diff --git a/unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/Corrected_gffs/place_holder b/unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/Corrected_gffs/place_holder old mode 100644 new mode 100755 diff --git a/unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/Dina_the_Shigella.gff b/unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/Dina_the_Shigella.gff old mode 100644 new mode 100755 diff --git a/unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/Lilly_the_Shigella.gff b/unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/Lilly_the_Shigella.gff old mode 100644 new mode 100755 diff --git 
a/unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/Silas_the_Legionella.gff b/unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/Silas_the_Legionella.gff old mode 100644 new mode 100755 diff --git a/unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/Silas_the_Salmonella.gff b/unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/Silas_the_Salmonella.gff old mode 100644 new mode 100755 diff --git a/unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/Silas_the_Salmonella_w_refound.gff b/unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/Silas_the_Salmonella_w_refound.gff old mode 100644 new mode 100755 diff --git a/unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/Zion_the_Streptococcus.gff b/unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/Zion_the_Streptococcus.gff old mode 100644 new mode 100755 diff --git a/unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/gene_presence_absence.csv b/unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/gene_presence_absence.csv old mode 100644 new mode 100755 diff --git a/unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/gene_presence_absence_roary.csv b/unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/gene_presence_absence_roary.csv old mode 100644 new mode 100755 diff --git a/unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/gene_presence_absence_w_refound_fragment.csv b/unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/gene_presence_absence_w_refound_fragment.csv old mode 100644 new mode 100755 diff --git a/unit_tests/unit_test_data/TestParsingGffFile/Silas_the_Salmonella.gff b/unit_tests/unit_test_data/TestParsingGffFile/Silas_the_Salmonella.gff old mode 100644 new mode 100755 diff --git a/unit_tests/unit_test_data/TestParsingGffFile/Silas_the_Salmonella_corrected.gff b/unit_tests/unit_test_data/TestParsingGffFile/Silas_the_Salmonella_corrected.gff old mode 100644 new mode 100755 
diff --git a/unit_tests/unit_test_data/TestPrepairForReannotation/All_genomes/Corrected_gff_files/Mock_1_corrected.gff b/unit_tests/unit_test_data/TestPrepairForReannotation/All_genomes/Corrected_gff_files/Mock_1_corrected.gff old mode 100644 new mode 100755 diff --git a/unit_tests/unit_test_data/TestPrepairForReannotation/All_genomes/Corrected_gff_files/Mock_2_corrected.gff b/unit_tests/unit_test_data/TestPrepairForReannotation/All_genomes/Corrected_gff_files/Mock_2_corrected.gff old mode 100644 new mode 100755 diff --git a/unit_tests/unit_test_data/TestPrepairForReannotation/Mock_gene_data.csv b/unit_tests/unit_test_data/TestPrepairForReannotation/Mock_gene_data.csv old mode 100644 new mode 100755 diff --git a/unit_tests/unit_test_data/TestPrepairForReannotation/Some_genomes/Corrected_gff_files/Mock_1_corrected.gff b/unit_tests/unit_test_data/TestPrepairForReannotation/Some_genomes/Corrected_gff_files/Mock_1_corrected.gff old mode 100644 new mode 100755 diff --git a/unit_tests/unit_test_data/TestPresenceOfGenedataFile/absent/place_holder_file b/unit_tests/unit_test_data/TestPresenceOfGenedataFile/absent/place_holder_file old mode 100644 new mode 100755 diff --git a/unit_tests/unit_test_data/TestPresenceOfGenedataFile/present/gene_data.csv b/unit_tests/unit_test_data/TestPresenceOfGenedataFile/present/gene_data.csv old mode 100644 new mode 100755 diff --git a/unit_tests/unit_test_data/TestPresenceOfGffsInPresAbsFile/gene_presence_absence_roary.csv b/unit_tests/unit_test_data/TestPresenceOfGffsInPresAbsFile/gene_presence_absence_roary.csv old mode 100644 new mode 100755 diff --git a/unit_tests/unit_test_data/TestReadGeneData/Mock_gene_data.csv b/unit_tests/unit_test_data/TestReadGeneData/Mock_gene_data.csv old mode 100644 new mode 100755 diff --git a/unit_tests/unit_test_data/TestSegmentingMockGffs/test_double_chromosome.gff b/unit_tests/unit_test_data/TestSegmentingMockGffs/test_double_chromosome.gff old mode 100644 new mode 100755 diff --git 
a/unit_tests/unit_test_data/TestSegmentingMockGffs/test_single_chromosome.gff b/unit_tests/unit_test_data/TestSegmentingMockGffs/test_single_chromosome.gff old mode 100644 new mode 100755 diff --git a/unit_tests/unit_test_data/TestSegmentingMockGffs/test_triple_chromosome.gff b/unit_tests/unit_test_data/TestSegmentingMockGffs/test_triple_chromosome.gff old mode 100644 new mode 100755 diff --git a/unit_tests/unit_test_data/TestWritingOutputFunction/core_segments.txt b/unit_tests/unit_test_data/TestWritingOutputFunction/core_segments.txt old mode 100644 new mode 100755 diff --git a/unit_tests/unit_test_data/TestWritingOutputFunction/gene_content.txt b/unit_tests/unit_test_data/TestWritingOutputFunction/gene_content.txt old mode 100644 new mode 100755 diff --git a/unit_tests/unit_test_data/TestWritingOutputFunction/low_freq.txt b/unit_tests/unit_test_data/TestWritingOutputFunction/low_freq.txt old mode 100644 new mode 100755 diff --git a/unit_tests/unit_test_data/TestWritingOutputFunction/no_acc_segments.txt b/unit_tests/unit_test_data/TestWritingOutputFunction/no_acc_segments.txt old mode 100644 new mode 100755 diff --git a/unit_tests/unit_test_data/TestWritingOutputFunction/no_core_contigs.txt b/unit_tests/unit_test_data/TestWritingOutputFunction/no_core_contigs.txt old mode 100644 new mode 100755 diff --git a/unit_tests/unit_test_data/TestWritingOutputFunction/summary_table.txt b/unit_tests/unit_test_data/TestWritingOutputFunction/summary_table.txt old mode 100644 new mode 100755 diff --git a/unit_tests/unit_test_data/test_tmp_folder/tmp_file_in_tmp_folder b/unit_tests/unit_test_data/test_tmp_folder/tmp_file_in_tmp_folder old mode 100644 new mode 100755 From 30a4115e0e360d66fce1f1174c3b8f03f602eea8 Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Mon, 30 May 2022 12:56:15 +1000 Subject: [PATCH 093/135] Add in function to print the core gene graph produced during segment search, if segment search is performed --- 
Corekaburra/__main__.py | 17 +++++++++-------- Corekaburra/consesus_core_genome.py | 5 +++-- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/Corekaburra/__main__.py b/Corekaburra/__main__.py index 888eb11..1311ee5 100755 --- a/Corekaburra/__main__.py +++ b/Corekaburra/__main__.py @@ -17,6 +17,7 @@ import logging import time import concurrent.futures +from networkx import write_gml try: from Corekaburra.commandline_interface import get_commandline_arguments @@ -196,6 +197,7 @@ def main(): ## Read in gene presence absence file time_start_read_files = time.time() # Prepair folder for reannotated genes and examine if any are already present + # TODO - likely not required after new implementations in Panaroo. if source_program == "Panaroo" and args.annotate: gene_data_dict, corrected_dir, args.input_gffs = prepair_for_reannotation(gene_data_path, args.output_path, args.input_gffs, logger) @@ -203,10 +205,8 @@ def main(): gene_data_dict = None corrected_dir = None - # TODO - ATM the column with presence of gene in genomes is used to define what is core and not. Is it better to use the number of input gffs instead? - # - There are upsides to the current. You can use the same genome to find segments for two different populations with in the dataset using the same reference of core-genes - # - Making it depend on the input is not viable for comparing runs, even within the same pan-genome, when using different sets of gff files. # TODO - Some day it would be awesome to be able to provide a clustering/population structure which could divide genes into the 13 definitions outlined by Horesh et al. 
[DOI: 10.1099/mgen.0.000670] + # TODO - Add in so that the user can give a list of genes that they wish to use as 'core genes' core_dict, low_freq_dict, acc_gene_dict = read_gene_presence_absence(input_pres_abs_file_path, args.core_cutoff, args.low_cutoff, source_program, args.input_gffs, tmp_folder_path, @@ -261,7 +261,6 @@ def main(): time_end_passing_gffs = time.time() time_start_segments_search = time.time() - time_start = time.time() # TODO - This seems like a lonely start timer? # Count number of unique accessory genes inserted into a core-core region across the genomes acc_region_count = {key: len(set(core_neighbour_low_freq[key])) for key in core_neighbour_low_freq} # Count number of unique low frequency genes inserted into a core-core region across the genomes @@ -271,7 +270,7 @@ def main(): # Combine the accessory and low frequency counts: combined_acc_gene_count = {key: low_frew_region_count[key] + acc_region_count[key] for key in low_frew_region_count} - double_edge_segements, no_acc_segments = determine_genome_segments(core_neighbour_pairs, combined_acc_gene_count, + double_edge_segements, no_acc_segments, core_graph = determine_genome_segments(core_neighbour_pairs, combined_acc_gene_count, len(args.input_gffs), core_dict, logger) time_end_segments_search = time.time() @@ -296,10 +295,12 @@ def main(): logger.debug("No Accessory segment output") no_acc_segment_writer(no_acc_segments, args.output_path, args.output_prefix) - # TODO - Possibly output core gene graph. with segment annotations in colour - possibly info on edges using weight for conenctions and other atributes for acc content.? - # TODO - Print summary number of genes and names for non-core contigs - # TODO - Should we print a low-freq, placement? 
+ logger.debug("Writing core gene graph") + graph_name = f'{args.output_prefix}_core_gene_graph.gml' if args.output_prefix is not None else 'core_gene_graph.gml' + write_gml(core_graph, path=os.path.join(args.output_path, graph_name)) + + # TODO - Make this work! if len(non_core_contig_info)> 0: logger.debug("Non-core contig output") non_core_contig_writer(non_core_contig_info, args.output_path, args.output_prefix) diff --git a/Corekaburra/consesus_core_genome.py b/Corekaburra/consesus_core_genome.py index 1ec8fbb..3ea2178 100755 --- a/Corekaburra/consesus_core_genome.py +++ b/Corekaburra/consesus_core_genome.py @@ -164,7 +164,7 @@ def identify_segments(core_graph, num_gffs, core_gene_dict, num_core_graph_compo for target_node in multi_edge_nodes+singe_edge_nodes: if target_node != source_node: # Get path (segment) from source to target - segment = nx.shortest_path(core_graph, source_node, target_node, weight='weight', method='dijkstra') # bellman-ford or dijkstra + segment = nx.shortest_path(core_graph, source_node, target_node, weight='weight', method='dijkstra') # Get length of path segment_length = len(segment) @@ -181,6 +181,7 @@ def identify_segments(core_graph, num_gffs, core_gene_dict, num_core_graph_compo if num_gffs - core_graph[segment[0]][segment[1]]['weight'] < gene_co_occurrences: continue else: + # Check if segment has been added in opposite direction, if not they add it to be further examined if all([x != segment[::-1] for x in multi_edge_connect_adjust]): multi_edge_connect_adjust.append(segment) # Construct name for path @@ -337,7 +338,7 @@ def determine_genome_segments(core_neighbour_pairs, combined_acc_gene_count, num logger.debug(f'No segments can be identified in given pan-genome\n') no_acc_segments = None - return double_edge_segements, no_acc_segments + return double_edge_segements, no_acc_segments, core_graph if __name__ == '__main__': From 6e5fb4909ed22d1864ac8d18808f6297c19bd3cf Mon Sep 17 00:00:00 2001 From: milnus 
<44769523+milnus@users.noreply.github.com> Date: Mon, 30 May 2022 14:25:47 +1000 Subject: [PATCH 094/135] Add in function to read gff files that are passed as gzipped files --- Corekaburra/gff_parser.py | 59 +++++++++++------- unit_tests/Corekaburra_test.py | 28 ++++++++- .../Silas_the_Salmonella.gff | 1 + .../Silas_the_Salmonella.gff.gz | Bin 0 -> 247 bytes 4 files changed, 64 insertions(+), 24 deletions(-) create mode 100755 unit_tests/unit_test_data/TestParsingGffFile/Silas_the_Salmonella.gff.gz diff --git a/Corekaburra/gff_parser.py b/Corekaburra/gff_parser.py index 83079e5..eb118b2 100755 --- a/Corekaburra/gff_parser.py +++ b/Corekaburra/gff_parser.py @@ -1,4 +1,5 @@ import os +import gzip try: from Corekaburra.correct_gffs import annotate_refound_genes @@ -8,31 +9,44 @@ def parse_gff(input_file): """ - Read a gff file and return it as a generator object that return all line containing CDS + Try to read a GFF file as gzipped then normal + pass it to the parser and return it as a generator object that return all line containing CDS. + Break whenever the fasta sequence is reached. 
:param input_file: File-path to a given gff file to be processed - :return: Generator object returning CDS from a gff file + :return: line from generator object returning CDS from a gff file """ - with open(input_file, 'r') as gff_file: - for line in gff_file: - if "##FASTA" in line: - break - if "#" not in line and 'CDS' in line: - # Strip line for newline and split columns into list - line = line.strip() - line = line.split("\t") - - # See if refound gene or Prokka annotated and isolate ID in gene_presence_absence.csv accordingly - if "old_locus_tag=" in line[8]: - gene_id = line[8][line[8].find('old_locus_tag'):] - if ';' in gene_id: - gene_id = gene_id[:gene_id.find(';')] - else: - gene_id = line[8][line[8].find('ID'):line[8].find(';')] - # Remove equal sign from id and add as identifier for the returned gff line - gene_id = gene_id[gene_id.find('=') + 1:] - line[8] = gene_id - yield line + # Open input file as if it was gzipped + open_file = gzip.open(input_file, 'rt') + try: + # Test if gzipped by reading line + open_file.readline() + except (OSError, gzip.BadGzipFile): + # Open inout as if normal + open_file = open(input_file, 'r') + + for line in open_file: + if "##FASTA" in line: + # FASTA found - close file and end loop + open_file.close() + break + if "#" not in line and 'CDS' in line: + # Strip line for newline and split columns into list + line = line.strip() + line = line.split("\t") + + # See if refound gene or Prokka annotated and isolate ID in gene_presence_absence.csv accordingly + if "old_locus_tag=" in line[8]: + gene_id = line[8][line[8].find('old_locus_tag'):] + if ';' in gene_id: + gene_id = gene_id[:gene_id.find(';')] + else: + gene_id = line[8][line[8].find('ID'):line[8].find(';')] + + # Remove equal sign from id and add as identifier for the returned gff line + gene_id = gene_id[gene_id.find('=') + 1:] + line[8] = gene_id + yield line def get_contig_lengths(input_file): @@ -591,6 +605,7 @@ def segment_genome_content(input_gff_file, 
core_genes, low_freq_genes, acc_gene_ input_gff_file = annotate_refound_genes(input_gff_file, gene_data_dict, tmp_folder_path, corrected_dir, logger) gff_generator = parse_gff(input_gff_file) + return_data = segment_gff_content(gff_generator=gff_generator, gff_path=input_gff_file, core_genes=core_genes, diff --git a/unit_tests/Corekaburra_test.py b/unit_tests/Corekaburra_test.py index 313b5fb..8997382 100755 --- a/unit_tests/Corekaburra_test.py +++ b/unit_tests/Corekaburra_test.py @@ -1152,7 +1152,29 @@ def test_gff_generator_generation_not_corrected(self): ['contig_1', '.', 'CDS', '700', '790', '.', '.', '.', 'Silas_the_Salmonella_tag-1.7'], ['contig_1', '.', 'CDS', '800', '890', '.', '.', '.', "Silas_the_Salmonella_tag-1-5.2"]] - return_generator = gff_parser.parse_gff(input_gff_file) + return_generator = [] + for line in gff_parser.parse_gff(input_gff_file): + return_generator += [line] + + for expected, generated in zip(expected_output, return_generator): + self.assertEqual(expected, generated) + + def test_gff_generator_generation_gzipped_input(self): + input_gff_file = 'TestParsingGffFile/Silas_the_Salmonella.gff.gz' + + expected_output = [['contig_1', '.', 'CDS', '1', '90', '.', '.', '.', 'Silas_the_Salmonella_tag-1-1'], + ['contig_1', '.', 'CDS', '100', '190', '.', '.', '.', 'Silas_the_Salmonella_tag-1-2.1'], + ['contig_1', '.', 'CDS', '200', '290', '.', '.', '.', 'Silas_the_Salmonella_tag-1-2.2'], + ['contig_1', '.', 'CDS', '300', '390', '.', '.', '.', 'Silas_the_Salmonella_tag-1-3'], + ['contig_1', '.', 'CDS', '400', '490', '.', '.', '.', 'Silas_the_Salmonella_tag-1-4.1'], + ['contig_1', '.', 'CDS', '500', '590', '.', '.', '.', 'Silas_the_Salmonella_tag-1-4.2'], + ['contig_1', '.', 'CDS', '600', '690', '.', '.', '.', 'Silas_the_Salmonella_tag-1-5.1'], + ['contig_1', '.', 'CDS', '700', '790', '.', '.', '.', 'Silas_the_Salmonella_tag-1.7'], + ['contig_1', '.', 'CDS', '800', '890', '.', '.', '.', "Silas_the_Salmonella_tag-1-5.2"]] + + return_generator = [] + 
for line in gff_parser.parse_gff(input_gff_file): + return_generator += [line] for expected, generated in zip(expected_output, return_generator): self.assertEqual(expected, generated) @@ -1171,7 +1193,9 @@ def test_gff_generator_generation_corrected_gff(self): ['contig_1', '.', 'CDS', '800', '890', '.', '.', '.', "Silas_the_Salmonella_tag-1-5.2"], ['contig_1', 'Panaroo', 'CDS', '900', '1000', '.', '+', '0', 'refound_gene_1']] - return_generator = gff_parser.parse_gff(input_gff_file) + return_generator = [] + for line in gff_parser.parse_gff(input_gff_file): + return_generator += [line] for expected, generated in zip(expected_output, return_generator): self.assertEqual(expected, generated) diff --git a/unit_tests/unit_test_data/TestParsingGffFile/Silas_the_Salmonella.gff b/unit_tests/unit_test_data/TestParsingGffFile/Silas_the_Salmonella.gff index 9a3ece9..75fc126 100755 --- a/unit_tests/unit_test_data/TestParsingGffFile/Silas_the_Salmonella.gff +++ b/unit_tests/unit_test_data/TestParsingGffFile/Silas_the_Salmonella.gff @@ -1,3 +1,4 @@ +##gff-version 3 contig_1 . CDS 1 90 . . . ID=Silas_the_Salmonella_tag-1-1;locus_tag=Silas_the_Salmonella_tag-1-1 contig_1 . CDS 100 190 . . . ID=Silas_the_Salmonella_tag-1-2.1;locus_tag=Silas_the_Salmonella_tag-1-2.1 contig_1 . CDS 200 290 . . . 
ID=Silas_the_Salmonella_tag-1-2.2;locus_tag=Silas_the_Salmonella_tag-1-2.2 diff --git a/unit_tests/unit_test_data/TestParsingGffFile/Silas_the_Salmonella.gff.gz b/unit_tests/unit_test_data/TestParsingGffFile/Silas_the_Salmonella.gff.gz new file mode 100755 index 0000000000000000000000000000000000000000..98733114b4b8668ebf24f446ec65d510f7f8582b GIT binary patch literal 247 zcmVH8W&rKcO$viB5C!0M za*9IM2J@q_loq5?=%$-=FCuD%RH53Qzas?~2Fe^DFPj7gKIVn7u5F`F^P2i`*eP0% z!`yeJAe@{FO9a`+a9z%|INIJ-Q#n7H(pK&{4vlkFIaggIB5`nI{hpTB-!Tt}V9b{+P?<@r;MLG#~pTytC xfMakDHz Date: Mon, 30 May 2022 14:28:59 +1000 Subject: [PATCH 095/135] Make so that logger is initiated in a single function not two --- Corekaburra/__main__.py | 18 +++--------------- 1 file changed, 3 insertions(+), 15 deletions(-) diff --git a/Corekaburra/__main__.py b/Corekaburra/__main__.py index 1311ee5..6a0db4b 100755 --- a/Corekaburra/__main__.py +++ b/Corekaburra/__main__.py @@ -118,18 +118,8 @@ def init_logging(debug_log, quiet, out_path): # Log command-line argument and debug line for Corekaburra start file_logger.info(f"command line: {' '.join(sys.argv)}") - return file_logger - - -def stream_logging(file_logger): - """ - Function adding in stream logging following initial logging - :param file_logger: Logger object - :return: Logger object with added stream logging - """ stream_handler = logging.StreamHandler() stream_handler.setLevel(logging.INFO) - file_logger.addHandler(stream_handler) file_logger.info('\n----------------------Processing started----------------------\n') @@ -156,15 +146,11 @@ def main(): pass # Run initialisation of logger: - logger = init_logging(args.log, args.quiet, args.output_path) # TODO - if not dependency check is done then it should be possible to add the stream logger following the logging of the command line in the initial logging function. 
- logger = stream_logging(logger) + logger = init_logging(args.log, args.quiet, args.output_path) # Check that low-frequency cutoff and core cutoff are as expected check_cutoffs(args.low_cutoff, args.core_cutoff, logger) - # TODO - Make Corekaburra take gzipped inputs - # TODO - Add so that a single gff file can only be given as input once and not multiple times? - # Check the presence of provided complete genomes among input GFFs if args.comp_genomes is not None: comp_genomes = parse_complete_genome_file(args.comp_genomes, args.input_gffs, logger) @@ -175,11 +161,13 @@ def main(): source_program, input_pres_abs_file_path = define_pangenome_program(args.input_pan, logger) # Check if gene_data file is present if Panaroo input is given an gffs should be annotated + # TODO Likely not needed anymore with new implementations in Panaroo if args.annotate and source_program == 'Panaroo': gene_data_path = check_gene_data(args.input_pan, logger) else: gene_data_path = None + # Check that all GFF files given can be found in the pan-genome check_gff_in_pan(args.input_gffs, input_pres_abs_file_path, logger) # Construct temporary folder: From 540a7f0d7dcefe8156faaa309d72d5a9e56a5ebb Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Mon, 30 May 2022 14:41:07 +1000 Subject: [PATCH 096/135] Add in a pythonic way of handling temporary folder --- Corekaburra/__main__.py | 18 +++---------- Corekaburra/exit_with_error.py | 15 +---------- unit_tests/Corekaburra_test.py | 26 ------------------- .../TestExitWithError/tmp_folder/test_file | 0 4 files changed, 4 insertions(+), 55 deletions(-) delete mode 100755 unit_tests/unit_test_data/TestExitWithError/tmp_folder/test_file diff --git a/Corekaburra/__main__.py b/Corekaburra/__main__.py index 6a0db4b..08c5758 100755 --- a/Corekaburra/__main__.py +++ b/Corekaburra/__main__.py @@ -15,6 +15,7 @@ import os import logging +import tempfile import time import concurrent.futures from networkx import write_gml @@ 
-171,13 +172,7 @@ def main(): check_gff_in_pan(args.input_gffs, input_pres_abs_file_path, logger) # Construct temporary folder: - # TODO - check that the temporary folder does not exist and that the user does not have a folder with same name already. (Maybe use a time stamp for the start to make it unique.) - tmp_folder_path = os.path.join(args.output_path, 'Corekaburra_tmp') - try: - os.mkdir(tmp_folder_path) - except FileExistsError: - for file in os.listdir(tmp_folder_path): - os.remove(file) + tmp_folder_path = tempfile.TemporaryDirectory() logger.info('Initial checks successful\n') inital_check_time_end = time.time() @@ -289,7 +284,7 @@ def main(): write_gml(core_graph, path=os.path.join(args.output_path, graph_name)) # TODO - Make this work! - if len(non_core_contig_info)> 0: + if len(non_core_contig_info) > 0: logger.debug("Non-core contig output") non_core_contig_writer(non_core_contig_info, args.output_path, args.output_prefix) @@ -310,12 +305,5 @@ def main(): logger.debug(f"Searching for segments time: {segment_search_time}s") - # Remove temporary database holding gff databases - if os.path.isdir(tmp_folder_path): - os.rmdir(tmp_folder_path) - if args.discard_gffs: - os.rmdir(os.path.join(args.output_path, 'Corrected_gff_files')) - - if __name__ == '__main__': main() diff --git a/Corekaburra/exit_with_error.py b/Corekaburra/exit_with_error.py index 8403942..4ce3bc6 100755 --- a/Corekaburra/exit_with_error.py +++ b/Corekaburra/exit_with_error.py @@ -4,29 +4,16 @@ from logging import getLogger -def exit_with_error(message, exit_status, logger, tmp_folder=None): +def exit_with_error(message, exit_status, logger): """ Print an error message to stderr, prefixed by the program name and 'ERROR'. Then exit program with supplied exit status. :param message: Message to give the user upon exit :param exit_status: Status returned as exit status - :param tmp_folder: Temporary folder for Corekaburra to be deleted under some circumstances. 
:param logger: Logger for program :return: None """ - # Delete tmp files and folder - try: - if tmp_folder is not None: - tmp_files = os.listdir(tmp_folder) - for file in tmp_files: - os.remove(os.path.join(tmp_folder, file)) - os.rmdir(tmp_folder) - else: - pass - except FileNotFoundError: - pass - logger.error(message) print(f"Corekaburra ERROR: {message}, exiting", file=sys.stderr) sys.exit(exit_status) diff --git a/unit_tests/Corekaburra_test.py b/unit_tests/Corekaburra_test.py index 8997382..6f369c1 100755 --- a/unit_tests/Corekaburra_test.py +++ b/unit_tests/Corekaburra_test.py @@ -30,32 +30,6 @@ except FileNotFoundError: os.chdir('unit_test_data/') - -class TestExitWithError(unittest.TestCase): - """ Test for the function carrying out a nice exit """ - @classmethod - def setUpClass(cls): - cls.logger = logging.getLogger('test_logger.log') - cls.logger.setLevel(logging.INFO) - - def test_exit_w_tmp_folder_deletion(self): - ''' Test the exit function is able to remove the temporary folder ''' - - # copy the placeholder tmp folder to replace it afterwards - tmp_folder = 'TestExitWithError/tmp_folder' - tmp_folder_copy = 'TestExitWithError/tmp_folder_copy' - os.mkdir(tmp_folder_copy) - - tmp_files = os.listdir(tmp_folder) - for file in tmp_files: - copyfile(os.path.join(tmp_folder, file), os.path.join(tmp_folder_copy, file)) - - with self.assertRaises(SystemExit): - exit_with_error.exit_with_error(exit_status=2, message='test msg', logger=self.logger, tmp_folder=tmp_folder) - - os.rename(tmp_folder_copy, tmp_folder) - - class TestCutOffViolations(unittest.TestCase): """ Test for the function that examines the cutoffs given for core and low-frequency genes""" @classmethod diff --git a/unit_tests/unit_test_data/TestExitWithError/tmp_folder/test_file b/unit_tests/unit_test_data/TestExitWithError/tmp_folder/test_file deleted file mode 100755 index e69de29..0000000 From 216c91e49a5b1939dc97596e57c470f040a6a26b Mon Sep 17 00:00:00 2001 From: milnus 
<44769523+milnus@users.noreply.github.com> Date: Mon, 30 May 2022 15:18:03 +1000 Subject: [PATCH 097/135] Add in multi processing of core gene graph segments --- Corekaburra/__main__.py | 2 +- Corekaburra/consesus_core_genome.py | 34 ++++++++++++++++------------- 2 files changed, 20 insertions(+), 16 deletions(-) diff --git a/Corekaburra/__main__.py b/Corekaburra/__main__.py index 08c5758..ddaa0c0 100755 --- a/Corekaburra/__main__.py +++ b/Corekaburra/__main__.py @@ -254,7 +254,7 @@ def main(): combined_acc_gene_count = {key: low_frew_region_count[key] + acc_region_count[key] for key in low_frew_region_count} double_edge_segements, no_acc_segments, core_graph = determine_genome_segments(core_neighbour_pairs, combined_acc_gene_count, - len(args.input_gffs), core_dict, logger) + len(args.input_gffs), core_dict, args.cpu, logger) time_end_segments_search = time.time() diff --git a/Corekaburra/consesus_core_genome.py b/Corekaburra/consesus_core_genome.py index 3ea2178..6b410b8 100755 --- a/Corekaburra/consesus_core_genome.py +++ b/Corekaburra/consesus_core_genome.py @@ -1,4 +1,5 @@ import networkx as nx +import concurrent.futures try: from Corekaburra.exit_with_error import exit_with_error except ModuleNotFoundError: @@ -115,7 +116,7 @@ def identify_no_accessory_segments(double_edge_segements, combined_acc_gene_coun return sub_segment_dict -def identify_segments(core_graph, num_gffs, core_gene_dict, num_core_graph_components): +def identify_segments(core_graph, num_gffs, core_gene_dict, num_core_graph_components, logger): """ Function to identify stretches of core genes between core genes neighbouring multiple different genes :param core_graph: Graph over core genes with weights being the number of connections between the genes @@ -126,10 +127,7 @@ def identify_segments(core_graph, num_gffs, core_gene_dict, num_core_graph_compo """ # TODO - Describe missing parameters in docstring - # TODO - Fix Ouli's problem where the core gene graph may split into two seperat 
pieces, and also handle double chromosome. - # - Add a chek if the core gene graph is a single component of multiple. Handle components separately. - Write test then program - # - This likely require a change to the all-vs-all search of multi edge core gene search, by adding a try and expect statement maybe, or just handle each component separately. - + # TODO - Handle multiple chromosomes # Identify all nodes that contain more than two degrees. multi_edge_nodes = [node for node, connections in core_graph.degree if connections > 2] # Check if multiple components in core graph, if then find single edge core_genes @@ -194,7 +192,7 @@ def identify_segments(core_graph, num_gffs, core_gene_dict, num_core_graph_compo else: if double_edge_segements[source_target_name] != segment[::-1]: exit_with_error(EXIT_SEGMENT_IDENTIFICATION_ERROR, - f"Path from one node to another ({source_target_name}) was found, but did not match previously found path!") + f"Path from one node to another ({source_target_name}) was found, but did not match previously found path!", logger) # Calculate the expected number of paths total_edges_from_non_two_edge_core_genes = sum([connections for _, connections in core_graph.degree if connections > 2 or connections < 2]) @@ -291,13 +289,13 @@ def identify_segments(core_graph, num_gffs, core_gene_dict, num_core_graph_compo return double_edge_segements -def determine_genome_segments(core_neighbour_pairs, combined_acc_gene_count, num_gffs, core_gene_dict, logger): +def determine_genome_segments(core_neighbour_pairs, combined_acc_gene_count, num_gffs, core_gene_dict, max_cpus, logger): """ Function to be called from main that collects the functions for determining core segments in pan-genome :param core_neighbour_pairs: Dict of the number of times core pairs have been detected :param combined_acc_gene_count: Number of accessory and low-frequency genes detected between core gene pairs - :param num_gffs: Number of inputted gff files # TODO - Should this be the 
minimum number determined by the input cut-off for core genes + :param num_gffs: Number of inputted gff files :param logger: Program logger # TODO - Add parameters @@ -315,13 +313,19 @@ def determine_genome_segments(core_neighbour_pairs, combined_acc_gene_count, num double_edge_segements = {} # Identify all segments in components of core graph - for component in nx.connected_components(core_graph): - logger.debug(f'Searching component related to: {component}') - - component_graph = core_graph.subgraph(component).copy() - return_segments = identify_segments(component_graph, num_gffs, core_gene_dict, num_core_graph_components) - if return_segments is not None: - double_edge_segements = double_edge_segements | return_segments + #for component in nx.connected_components(core_graph): + + logger.debug(f'Searching components of core gene graph') + with concurrent.futures.ProcessPoolExecutor(max_workers=max_cpus) as executor: + return_object = [executor.submit(identify_segments, + core_graph.subgraph(component).copy(), num_gffs, + core_gene_dict, num_core_graph_components, logger) + for component in nx.connected_components(core_graph)] + # identify_segments(core_graph.subgraph(component).copy(), num_gffs, core_gene_dict, num_core_graph_components, logger) + for output in concurrent.futures.as_completed(return_object): + return_segments = output.result() + if return_segments is not None: + double_edge_segements = double_edge_segements | return_segments # if double_edge_segements is not None: if double_edge_segements: From 49bd1ee642ca0f6e2703848955daa7faac2d15cc Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Mon, 30 May 2022 15:42:50 +1000 Subject: [PATCH 098/135] Remove code to be transfered, large file and remove TODOs from multiple files --- Code_to_transfer/__main__.py | 224 ---- .../construct_multi_fasta_genome.py | 106 -- Code_to_transfer/plots.py | 11 - Code_to_transfer/read_complete_genome_file.py | 25 - 
Code_to_transfer/requirements-dev.txt | 4 - Code_to_transfer/test_coredial.py | 1034 ----------------- Code_to_transfer/time_calculator.py | 19 - Corekaburra.cwl | 26 - Corekaburra/gff_parser.py | 2 +- Corekaburra/output_writer_functions.py | 1 - Corekaburra/parse_gene_presence_absence.py | 12 +- Corekaburra/summary_table.py | 4 +- unit_tests/Corekaburra_test.py | 28 +- 13 files changed, 29 insertions(+), 1467 deletions(-) delete mode 100644 Code_to_transfer/__main__.py delete mode 100644 Code_to_transfer/construct_multi_fasta_genome.py delete mode 100644 Code_to_transfer/plots.py delete mode 100644 Code_to_transfer/read_complete_genome_file.py delete mode 100644 Code_to_transfer/requirements-dev.txt delete mode 100644 Code_to_transfer/test_coredial.py delete mode 100644 Code_to_transfer/time_calculator.py delete mode 100644 Corekaburra.cwl diff --git a/Code_to_transfer/__main__.py b/Code_to_transfer/__main__.py deleted file mode 100644 index 982d724..0000000 --- a/Code_to_transfer/__main__.py +++ /dev/null @@ -1,224 +0,0 @@ -# General function -from parse_gene_presence_absence import read_gene_presence_absence -from gff_parser import segment_genome_content -from correct_gffs import correct_gffs -from merge_dicts import merge_dicts_counts, merge_dicts_lists, merge_first_genes -from output_writer_functions import master_info_writer, summary_info_writer, segment_writer, no_acc_segment_writer, \ - write_consensus_core_gene_synteny, \ - write_core_gene_coverage, \ - write_alternative_core_gene_counts, \ - write_core_gene_types -from consesus_core_genome import determine_genome_segments, determine_core_gene_consesus, identify_rearrangements, \ - characterise_rearrangements, core_pair_matrix -from check_inputs import define_input_source, check_gene_data, check_gff_files, check_gene_alignments, check_gff_in_pan -from time_calculator import time_calculator -from commandline_interface import get_commandline_arguments -from read_complete_genome_file import 
parse_complete_genome_file -from summary_table import calculate_n_create_summaries -from construct_multi_fasta_genome import construct_consensus_alignment -# from plots import consesus_genome_coverage -import concurrent.futures -from os import listdir, mkdir, rmdir -from os.path import join, isdir -import time -import sys - - -def main(): - total_time_start = time.time() - # get arguments from the commandline - args = get_commandline_arguments(sys.argv[1:]) - - - # Check presence of gff files. # TODO - is this necessary? - if check_gff_files(args.input_gffs): - print("All .gff files were found!") - - - # Parse complete genome file and check that genomes are present - if args.comp_genomes is not None: - comp_genomes = parse_complete_genome_file(args.comp_genomes, args.input_gffs) - else: - comp_genomes = None - - # Check input - if not args.quiet: - print("\n----Checking presence of input files in pan genome folder----\n") - - # Check if Panaroo or Roary input folder is given - source_program, input_pres_abs_file_path = define_input_source(args.input_pan) - - # Check if gene_data file is present if Panaroo input is given an gffs should be annotated - if args.annotate and source_program is not 'Rorary': - gene_data_path = check_gene_data(args.input_pan) - - if not args.quiet: - print(f"Pan genome determined to come from {source_program}") - print("All files found, let's move on!\n") - print("--------------------------------------------------------------\n") - - # TODO - Make the program work with less than all files in the pangenome. Just make sure that all gff files supplied can be found in the pan genome. 
This will make is possible to look at hotspots and segments in different lineages - check_gff_in_pan(args.input_gffs, input_pres_abs_file_path) - - # Construct output folder - try: - mkdir(args.output_path) - if not args.quiet: - print("Output folder constructed") - except FileExistsError: - if not args.quiet: - print("Output folder exists") - - # Construct temporary folder: - # TODO - check that the temporary folder does not exist and that the user does not have a folder with same name already. (Maybe use a time stamp for the start to make it unique.) - temp_folder_path = join(args.output_path, 'genome_corer_tmp') - mkdir(temp_folder_path) - - ## Read in gene presence absence file - time_start = time.time() - # TODO - Add the user specified thresholds for core and low frequency genes. - core_dict, low_freq_dict, acc_gene_dict, attribute_dict = read_gene_presence_absence(input_pres_abs_file_path, - 1, 0.05, source_program, - args.input_gffs, temp_folder_path) - if not args.quiet: - time_calculator(time_start, time.time(), "reading in gene presence/absence file") - - # If source program is Panaroo, see if alignments folder is available, and core genes are contained in it. - # if source_program == 'Panaroo': - # alignment_folder = check_gene_alignments(args.input_pan, core_dict) - - # Add in the refound genes into the gff files and print the corrected GFF files. - if source_program == "Panaroo" and args.annotate: - time_start = time.time() - print(f"\n----------Adding in refound annotations for gff files---------") - - corrected_folder = correct_gffs(args.input_gffs, gene_data_path, args.output_path, attribute_dict, temp_folder_path) - - args.input_gffs = [join(corrected_folder, file) for file in listdir(corrected_folder) if '.gff' in file] - if not args.quiet: - time_calculator(time_start, time.time(), "add refound annotations to gff files") - - - # Loop over all gffs and extract info from each of them. 
- time_start = time.time() - # Initialise dictionaries to contain results from all gff files - core_neighbour_pairs = {} - core_neighbour_distance = {} - core_neighbour_accessory_count = {} - core_neighbour_low_freq = {} - master_info_total = {} - non_core_contig_info = {} - merged_start_gene_clusters = [] - merged_second_gene_clusters = [] - - with concurrent.futures.ProcessPoolExecutor(max_workers=15) as executor: - print(f"\n------Start core region identification of given gff files-----") - print(f'{len(args.input_gffs)} GFF files to process') - - results = [executor.submit(segment_genome_content, gff, core_dict, low_freq_dict, acc_gene_dict, i, comp_genomes) - for i, gff in enumerate(args.input_gffs)] - - for output in concurrent.futures.as_completed(results): - # Split the outputs - core_pairs, distance, acc_count, \ - low_freq, master_info_return, \ - core_less_contigs_return, start_gene_cluster = output.result() - - # Merge results into single/master dictionaries - core_neighbour_pairs = merge_dicts_counts(core_neighbour_pairs, core_pairs) - core_neighbour_distance = merge_dicts_lists(core_neighbour_distance, distance) - core_neighbour_accessory_count = merge_dicts_lists(core_neighbour_accessory_count, acc_count) - core_neighbour_low_freq = merge_dicts_lists(core_neighbour_low_freq, low_freq) - master_info_total.update(master_info_return) - non_core_contig_info.update(core_less_contigs_return) - merged_start_gene_clusters, merged_second_gene_clusters = merge_first_genes(start_gene_cluster, - merged_start_gene_clusters, - merged_second_gene_clusters, - core_pairs[0]) - time_calculator(time_start, time.time(), "searching gff files for core genes") - - - # TODO - Try to identify variable regions - # * Check if there are any sequence breaks in the dataset - # * if sequence breaks are present, examine which core genes could substitute sequence breaks - # If sequence break has single substitute and variance is observed in length or accessory content then flag as 
variable - # If sequence break has multipl substitutes then flag as variable anyway, as this eludes to rearrangements - - - ### FUNCTION ### - # Determine the most common core gene synteny. - # time_start = time.time() - # Find the core gene synteny and possible core genes with alternative neighbours - # consensus_core_genome, \ - # possible_rearrangement_genes, \ - # core_path_coverage = determine_core_gene_consesus(core_neighbour_pairs, - # merged_start_gene_clusters, - # merged_second_gene_clusters, args.output_path) - - ### Determine segments of core genome ### - print(f"\n--------------Identifying segments in pan genome--------------") - time_start = time.time() - # Count number of unique accessory genes inserted into a core-core region across the genomes - acc_region_count = {key: len(set(core_neighbour_low_freq[key])) for key in core_neighbour_low_freq} - # Count number of unique low frequency genes inserted into a core-core region across the genomes - low_frew_region_count = {key: len(set(core_neighbour_accessory_count[key])) for key in core_neighbour_accessory_count} - - # Combine the accessory and low frequency counts: - combined_acc_gene_count = {key: low_frew_region_count[key] + acc_region_count[key] for key in low_frew_region_count} - - double_edge_segements, no_acc_segments = determine_genome_segments(core_neighbour_pairs, combined_acc_gene_count, len(args.input_gffs)) - - time_calculator(time_start, time.time(), "identifying segments in pan genome") - # Assign core-gene synteny types: - # Identify alternative connections and their occurrence - # alt_core_pairs, alt_core_pair_count, \ - # core_genome_types, alt_core_comp_types = identify_rearrangements(consensus_core_genome, - # possible_rearrangement_genes, - # master_info_total, - # args.input_gffs) - - # rearrangement_predictions = characterise_rearrangements(alt_core_pairs, consensus_core_genome) - - # time_calculator(time_start, time.time(), "determining best core gene synteny") - ### DO 
CALCULATIONS ### - # TODO mean number length between core genes - # for neighbours in core_neighbour_distance: - # print(mean(core_neighbour_distance[neighbours])) - # print(std(core_neighbour_distance[neighbours])) - - ####################### - # TODO - make function that relates every core gene to a given reference genomes' locus_tags, if given such a reference. - On request from Andrew - - - # Make a Summary table like the one produced in R. - master_summary_info = calculate_n_create_summaries(master_info_total) - - - - ### WRITE OUTPUTS ### - print(f"\n-----------------------Printing outputs-----------------------") - # Write master information to output file - time_start = time.time() - master_info_writer(master_info_total, args.output_path, args.output_prefix, args.quiet) - summary_info_writer(master_summary_info, args.output_path, args.output_prefix, args.quiet) - # if return_of_segments is not None - # TODO - Contruct output for segments - parent column. - segment_writer(double_edge_segements, args.output_path, args.output_prefix, args.quiet) - no_acc_segment_writer(no_acc_segments, args.output_path, args.output_prefix, args.quiet) - # print(non_core_contig_info) TODO - Print core less contigs. - # TODO print a list of accessory genes that have not been related to any region? - # TODO - Possibly output core gene graph. with segment annotations? - - time_calculator(time_start, time.time(), "writing output files") - - # Finish up running - time_calculator(total_time_start, time.time(), "running the entire program") - - # Remove temporary database holding gff databases - # TODO - Implement a nice crash function where the temporary folder is removed not to cause unessecary frustration for the user when trying to rerun the program. 
- print(isdir(temp_folder_path)) - if isdir(temp_folder_path): - rmdir(temp_folder_path) - -if __name__ == "__main__": - main() diff --git a/Code_to_transfer/construct_multi_fasta_genome.py b/Code_to_transfer/construct_multi_fasta_genome.py deleted file mode 100644 index 3f8cf55..0000000 --- a/Code_to_transfer/construct_multi_fasta_genome.py +++ /dev/null @@ -1,106 +0,0 @@ -import os -from numpy import mean, median - - -def construct_consensus_alignment(consesus_genome_synteny, alignment_folder, core_neighbour_distance): - con_gen_aln = {} - - # Add in the first gene and the genomes to the alignment dict - with open(os.path.join(alignment_folder, f'{consesus_genome_synteny[0]}.aln.fas'), 'r') as alignment: - for line in alignment.readlines(): - if '>' in line: - genome_name = line.split(';')[0] - genome_name = genome_name.split('>')[1] - con_gen_aln[genome_name] = [] - else: - con_gen_aln[genome_name].append(line.strip()) - - # Add in the intergenic sequence to next gene - # # Get a random key to get an alignments - # rand_genome = list(con_gen_aln.keys())[0] - - # Get length to match 60 character fasta lines. - # miss_length, full_lines, remaining_chars = adjust_character(con_gen_aln, rand_genome, core_neighbour_distance, consesus_genome_synteny=consesus_genome_synteny) - alignment.close() - - # TODO: Construct the output file - # with open('consensus_core_gene_alignment', 'w') as XXX: - - # TODO: Constrict a dict that keeps track of the gene, its coordinates and its strand. - # Output in tsv format. 
- - # Loop through the consensus core genome using enumerate - for i, gene in enumerate(consesus_genome_synteny[1:]): - # add N as intergenetic distance - core_gene_pair = sorted([consesus_genome_synteny[i-1], consesus_genome_synteny[i]]) - intergen_dist = int(mean(core_neighbour_distance[f'{core_gene_pair[0]}--{core_gene_pair[1]}'])) - - # Check if the distance is negative and change it to 0 - if intergen_dist < 0: - intergen_dist = 0 - - for genome in con_gen_aln.keys(): - con_gen_aln[genome].append("N" * intergen_dist) - - # Add next gene - with open(os.path.join(alignment_folder, f'{gene}.aln.fas'), 'r') as alignment: - for line in alignment.readlines(): - if '>' in line: - genome_name = line.split(';')[0] - genome_name = genome_name.split('>')[1] - else: - con_gen_aln[genome_name].append(line.strip()) - - # Add last intergenic sequence - if i == len(consesus_genome_synteny)-2: - # add N as intergenetic distance - core_gene_pair = sorted([consesus_genome_synteny[0], consesus_genome_synteny[-1]]) - intergen_dist = int(mean(core_neighbour_distance[f'{core_gene_pair[0]}--{core_gene_pair[1]}'])) - # print(core_neighbour_distance[f'{core_gene_pair[0]}--{core_gene_pair[1]}']) - # print(intergen_dist) - - # Check if the distance is negative and change it to 0 - if intergen_dist < 0: - intergen_dist = 0 - - for genome in con_gen_aln.keys(): - con_gen_aln[genome].append("N" * intergen_dist) - - # Write the alignment to file - # Open the file - with open('consensus_core_gene_alignment.aln.fas', 'w') as con_aln_file: - # Loop through genomes in dict - for genome in con_gen_aln: - cur_genome_seq = con_gen_aln[genome].copy() - # Writer header line - con_aln_file.write(f'>{genome}\n') - # take a line - cur_line = cur_genome_seq.pop(0) - # While there are more lines in the current genome, add them in. 
- while len(cur_genome_seq) != 0 or len(cur_line) >= 60: - # if line current line >= 60 characters, write line else add next set of the sequence - if len(cur_line) >= 60: - cur_line, add_string = cur_line[60:], cur_line[0:60] - con_aln_file.write(add_string + '\n') - else: - # pop next line in genome and add to current line - cur_line = cur_line + cur_genome_seq.pop(0) - con_aln_file.close() - - #print(sum([len(x) for x in con_gen_aln[list(con_gen_aln.keys())[0]]])) - # print(con_gen_aln[list(con_gen_aln.keys())[0]]) - - - # Check if i is equal to number of genes in the consensus genome - # if then add the last distance (from last to first gene) - - # Following may be a seperate function to keep it DRY with the above of adding the last to first segment. - # Take the current gene and the next - sort and paste together using -- to get key to distance dict - - # Find the mean/median or other distance between core genes. - - # Append the number of N's to the file that the distance dictates - - -## TODO - Panaroo seems to use only the fragment with the least gaps "-" if two fragments of a single gene is found. -# * can we combine some of them based on genome placement? 
\ No newline at end of file diff --git a/Code_to_transfer/plots.py b/Code_to_transfer/plots.py deleted file mode 100644 index 4b47bc2..0000000 --- a/Code_to_transfer/plots.py +++ /dev/null @@ -1,11 +0,0 @@ -import numpy as np -import seaborn as sns -import matplotlib.pyplot as plt -import pandas as pd - - -def consesus_genome_coverage(consensus_genome, coverage): - df = pd.DataFrame.from_records(coverage) - df['0'].astype(str) + df['1'] - - #print(df) \ No newline at end of file diff --git a/Code_to_transfer/read_complete_genome_file.py b/Code_to_transfer/read_complete_genome_file.py deleted file mode 100644 index 276e18c..0000000 --- a/Code_to_transfer/read_complete_genome_file.py +++ /dev/null @@ -1,25 +0,0 @@ -import os - - -def parse_complete_genome_file(complete_genome_file, gff_files): - # Read the file and all lines (complete genomes given) - with open(complete_genome_file, 'r') as genome_file: - complete_genomes = genome_file.readlines() - complete_genomes = [name.strip().replace('.gff', '') for name in complete_genomes] - complete_genomes = [os.path.basename(name) for name in complete_genomes] - - # Take input gffs and remove path to the file - gffs = [os.path.basename(gff).replace('.gff', '') for gff in gff_files] - - # check that all complete genomes are in the input gffs - complete_genome_status = all(complete_genome in gffs for complete_genome in complete_genomes) - - # If the complete genomes are found, return. Else try to remove the extension of input files and compare - if complete_genome_status: - return complete_genomes - - NotImplementedError('Reading the file of complete genomes encountered an error that has not been expected!\n' - 'Please report this and give the file you passed. 
Cheers!') - -if __name__ == '__main__': - pass diff --git a/Code_to_transfer/requirements-dev.txt b/Code_to_transfer/requirements-dev.txt deleted file mode 100644 index 00fdb32..0000000 --- a/Code_to_transfer/requirements-dev.txt +++ /dev/null @@ -1,4 +0,0 @@ -# Dependencies, with versions explicitly declared for development and testing -biopython==1.66 -# Packages that are not dependencies but used for building or testing -pylint diff --git a/Code_to_transfer/test_coredial.py b/Code_to_transfer/test_coredial.py deleted file mode 100644 index 443cab9..0000000 --- a/Code_to_transfer/test_coredial.py +++ /dev/null @@ -1,1034 +0,0 @@ -import unittest -import warnings -from hypothesis import given -import hypothesis.strategies as st -from gff_parser import get_genome_size_from_gff, segment_genome_content, get_contig_lengths, record_core_core_region, connect_first_n_last_gene_on_contig -from parse_gene_presence_absence import read_gene_presence_absence -from merge_dicts import merge_dicts_lists, merge_dicts_counts -from consesus_core_genome import characterise_rearrangements -from check_inputs import define_input_source, check_gene_data, check_gene_alignments, check_gff_in_pan -from correct_gffs import read_gene_data, extract_genome_fasta -from read_complete_genome_file import parse_complete_genome_file -from consesus_core_genome import construct_core_graph, identify_segments, identify_no_accessory_segments -from random import randint, choices -import os -import glob -from numpy import arange, ceil -import networkx as nx - - -class TestPresenceAbsenceParser(unittest.TestCase): - - def test_parser_core_genes(self): - core_genes, accessory_genes, _, _ = read_gene_presence_absence("/Users/mjespersen/OneDrive - The University of Melbourne/Phd/Parts/Recombination_hotspots/Code/Between_core_variation/data_for_unit_tests/complete/mock_gene_presence_absence.csv", - 1, 0.05, verbose=False) - keys = [key for key in core_genes.keys()] - self.assertEqual(len(core_genes[keys[1]]), 
10) - - def test_parser_low_freq_genes(self): - core_genes, accessory_genes, _, _ = read_gene_presence_absence( - "/Users/mjespersen/OneDrive - The University of Melbourne/Phd/Parts/Recombination_hotspots/Code/Between_core_variation/data_for_unit_tests/complete/mock_gene_presence_absence.csv", - 1, 0.05, verbose=False) - - low_frew_genes = 0 - for key in accessory_genes.keys(): - low_frew_genes += len(accessory_genes[key]) - self.assertEqual(low_frew_genes, 5) - - -class TestInputGiven(unittest.TestCase): - # Test pairing of all files in pan genome - def test_input_gff_pres_abs_full_pairing(self): - input_pres_abs = '/Users/mjespersen/OneDrive - The University of Melbourne/Phd/Parts/Recombination_hotspots/Code/Between_core_variation/data_for_unit_tests/complete/mock_pan_folder/gene_presence_absence_roary.csv' - input_file_list = ['Silas_the_Salmonella.gff', 'Christina_the_Streptococcus.gff', 'Ajwa_the_Shigella.gff', - 'Ajwa_the_Legionella.gff', 'Cari_the_Listeria.gff', 'Aman_the_Streptococcus.gff', - 'Zion_the_Streptococcus.gff', 'Dina_the_Shigella.gff', 'Silas_the_Legionella.gff', - 'Lilly_the_Shigella.gff', 'Chantal_the_Listeria.gff', 'Cari_the_Shigella.gff', - 'Cari_the_Legionella.gff', 'Aman_the_Shigella.gff', 'Ajwa_the_Streptococcus.gff', - 'Aman_the_Legionella.gff', 'Zayan_the_Shigella.gff', 'Chantal_the_Salmonella.gff', - 'Silas_the_Shigella.gff', 'Zayan_the_Legionella.gff'] - - return_bool = check_gff_in_pan(input_file_list, input_pres_abs) - - self.assertEqual(return_bool, True) - - # Test pairing of some files in pan genome - Warning - def test_input_gff_pres_abs_partial_pairing_catch_warning(self): - input_pres_abs = '/Users/mjespersen/OneDrive - The University of Melbourne/Phd/Parts/Recombination_hotspots/Code/Between_core_variation/data_for_unit_tests/complete/mock_pan_folder/gene_presence_absence_roary.csv' - input_file_list = ['Silas_the_Salmonella.gff', 'Christina_the_Streptococcus.gff', 'Ajwa_the_Shigella.gff'] - - with self.assertWarns(Warning): - 
check_gff_in_pan(input_file_list, input_pres_abs) - - # Test pairing of some files in pan genome - def test_input_gff_pres_abs_partial_pairing(self): - input_pres_abs = '/Users/mjespersen/OneDrive - The University of Melbourne/Phd/Parts/Recombination_hotspots/Code/Between_core_variation/data_for_unit_tests/complete/mock_pan_folder/gene_presence_absence_roary.csv' - input_file_list = ['Silas_the_Salmonella.gff', 'Christina_the_Streptococcus.gff', 'Ajwa_the_Shigella.gff'] - - return_bool = check_gff_in_pan(input_file_list, input_pres_abs) - - self.assertEqual(return_bool, True) - - # Test when given a file not in pan genome among others that are in the pan genome - def test_input_gff_pres_abs_file_not_in_pan(self): - input_pres_abs = '/Users/mjespersen/OneDrive - The University of Melbourne/Phd/Parts/Recombination_hotspots/Code/Between_core_variation/data_for_unit_tests/complete/mock_pan_folder/gene_presence_absence_roary.csv' - input_file_list = ['Cappuccino.gff', 'Flat_white.gff', 'Doubble_espresso.gff'] - - with self.assertRaises(FileNotFoundError): - check_gff_in_pan(input_file_list, input_pres_abs) - - def test_input_gff_pres_abs_some_file_not_in_pan(self): - input_pres_abs = '/Users/mjespersen/OneDrive - The University of Melbourne/Phd/Parts/Recombination_hotspots/Code/Between_core_variation/data_for_unit_tests/complete/mock_pan_folder/gene_presence_absence_roary.csv' - input_file_list = ['Cappuccino.gff', 'Silas_the_Salmonella.gff', 'Christina_the_Streptococcus.gff'] - - with self.assertRaises(FileNotFoundError): - check_gff_in_pan(input_file_list, input_pres_abs) - - -class TestGffparser(unittest.TestCase): - - def test_get_genome_size_from_gff(self): - genome_length = get_genome_size_from_gff( - "/Users/mjespersen/OneDrive - The University of Melbourne/Phd/Parts/Recombination_hotspots/Code/Between_core_variation/data_for_unit_tests/test_locating_inputs/GCA_000006785.gff") - - print(genome_length) - self.assertEqual(1852433, genome_length) - - def 
test_segmentation_core_gene_number_all_complete(self): - core_genes, accessory_genes, acc_gene_dict, _ = read_gene_presence_absence( - "/Users/mjespersen/OneDrive - The University of Melbourne/Phd/Parts/Recombination_hotspots/Code/Between_core_variation/data_for_unit_tests/complete/mock_gene_presence_absence.csv", - 1, 0.05, verbose=False) - - pre_path = "/Users/mjespersen/OneDrive - The University of Melbourne/Phd/Parts/Recombination_hotspots/Code/Between_core_variation/data_for_unit_tests/complete/Single_contig/" - gff_files = glob.glob(pre_path+"*.gff") - - core_result_dict = {} - - for file in gff_files: - core_gene_pairs, core_gene_pair_distance, accessory_gene_content, low_freq_gene_content, master_info, _, _ = segment_genome_content(low_freq_genes=accessory_genes, core_genes=core_genes, input_file=os.path.join(pre_path,file), i=1, acc_gene_dict=acc_gene_dict) - core_result_dict = merge_dicts_counts(core_result_dict, core_gene_pairs) - - self.assertEqual(len(core_result_dict.keys()), 10) - - def test_segmentation_distance_number(self): - core_genes, accessory_genes, acc_gene_dict, _ = read_gene_presence_absence( - "/Users/mjespersen/OneDrive - The University of Melbourne/Phd/Parts/Recombination_hotspots/Code/Between_core_variation/data_for_unit_tests/complete/mock_gene_presence_absence.csv", - 1, 0.05, verbose=False) - - pre_path = "/Users/mjespersen/OneDrive - The University of Melbourne/Phd/Parts/Recombination_hotspots/Code/Between_core_variation/data_for_unit_tests/complete/Single_contig/" - gff_files = glob.glob(pre_path+"*.gff") - - distance_result_dict = {} - - for file in gff_files: - core_gene_pairs, core_gene_pair_distance, accessory_gene_content, low_freq_gene_content, master_info, _, _ = segment_genome_content(low_freq_genes=accessory_genes, core_genes=core_genes, input_file=os.path.join(pre_path,file), acc_gene_dict=acc_gene_dict, i=1) - distance_result_dict = merge_dicts_lists(distance_result_dict, core_gene_pair_distance) - - 
self.assertEqual(len(distance_result_dict.keys()), 10) - - - def test_segmentation_accessory_gene_number(self): - core_genes, accessory_genes, acc_gene_dict, _ = read_gene_presence_absence( - "/Users/mjespersen/OneDrive - The University of Melbourne/Phd/Parts/Recombination_hotspots/Code/Between_core_variation/data_for_unit_tests/complete/mock_gene_presence_absence.csv", - 1, 0.05, verbose=False) - - pre_path = "/Users/mjespersen/OneDrive - The University of Melbourne/Phd/Parts/Recombination_hotspots/Code/Between_core_variation/data_for_unit_tests/complete/Single_contig/" - gff_files = glob.glob(pre_path+"*.gff") - - accessory_result_dict = {} - - for file in gff_files: - core_gene_pairs, core_gene_pair_distance, accessory_gene_content, low_freq_gene_content, master_info, _, _ = segment_genome_content(low_freq_genes=accessory_genes, core_genes=core_genes, input_file=os.path.join(pre_path,file), acc_gene_dict=acc_gene_dict, i=1) - accessory_result_dict = merge_dicts_lists(accessory_result_dict, accessory_gene_content) - accessory_gene_amount = [] - for i, key in enumerate(accessory_result_dict.keys()): - if i == 10: - self.assertListEqual([7,6,7,8,7,5,7,9,7,7,6,7,7,6,8,3,7,8,4,5], accessory_result_dict[key]) - - -# TODO - constuct test that tests load a single gff file - -class TestLocatingInput_files(unittest.TestCase): - - def test_locate_core_gene_alignments(self): - input_folder = '/Users/mjespersen/OneDrive - The University of Melbourne/Phd/Parts/Recombination_hotspots/Code/Between_core_variation/data_for_unit_tests/test_locating_inputs/test_pan_alignment_complete' - core_dict, _, _, _ = read_gene_presence_absence(os.path.join(input_folder, 'gene_presence_absence_roary.csv'), - 1, 0.01, verbose=False) - - return_value = check_gene_alignments(input_folder, core_dict) - self.assertEqual(os.path.join(input_folder, 'aligned_gene_sequences'), return_value) - - - incomplete_input_folder = '/Users/mjespersen/OneDrive - The University of 
Melbourne/Phd/Parts/Recombination_hotspots/Code/Between_core_variation/data_for_unit_tests/test_locating_inputs/test_pan_alignment_incomplete_alignments' - with self.assertRaises(FileNotFoundError): - check_gene_alignments(incomplete_input_folder, core_dict) - - -# TODO - constuct test that loads a mock gene_prensence_absence file - def test_loading_gene_pres_abs(self): - # TODO - Construct mock gene_presence_absence_file - # TODO - Load Panaroo file - # Load mock Panaroo gene presence absence file - read_core, read_low_freq, read_acc, _ = read_gene_presence_absence("/Users/mjespersen/OneDrive - The University of Melbourne/Phd/Parts/Recombination_hotspots/Code/Between_core_variation/data_for_unit_tests/test_gene_pres_abs_parser/read_mock_gene_presence_absence.csv", - 1, 0.05, verbose=False) - - # Test number of keys to see if all genomes are read in - self.assertEqual(len(read_core), 5) - - # Test accessory genes - # Extract acc genes found - ace_genes = [gene for x in read_acc.keys() for y in read_acc[x].keys() for gene in read_acc[x][y]] - self.assertEqual(len(set(ace_genes)), 9) - - ## Test if the expected number of core genes are found - for i in arange(0.8, 1, 0.1): - read_core, _, _, _ = read_gene_presence_absence( - "/Users/mjespersen/OneDrive - The University of Melbourne/Phd/Parts/Recombination_hotspots/Code/Between_core_variation/data_for_unit_tests/test_gene_pres_abs_parser/read_mock_gene_presence_absence.csv", - i, 0.05, verbose=False) - - # Extract core genes found - core_genes = [gene for x in read_core.keys() for y in read_core[x].keys() for gene in read_core[x][y]] - - # Test depending on percent presence - if i == 1: - self.assertEqual(len(set(core_genes)), 3) - else: - self.assertEqual(len(set(core_genes)), 5) - - - # Test low frequency genes - for i in arange(0.1, 0.8, 0.1): - _, read_low_freq, _, _ = read_gene_presence_absence( - "/Users/mjespersen/OneDrive - The University of 
Melbourne/Phd/Parts/Recombination_hotspots/Code/Between_core_variation/data_for_unit_tests/test_gene_pres_abs_parser/read_mock_gene_presence_absence.csv", - 1, i, verbose=False) - - low_genes = [gene for x in read_low_freq.keys() for y in read_low_freq[x].keys() for gene in read_low_freq[x][y]] - - low_gene_presence = ceil(5*i) - - # Test depending on percent presence - if low_gene_presence == 1: - self.assertEqual(len(set(low_genes)), 4) - elif low_gene_presence == 2: - self.assertEqual(len(set(low_genes)), 9) - elif low_gene_presence == 2: - self.assertEqual(len(set(low_genes)), 11) - elif low_gene_presence == 2: - self.assertEqual(len(set(low_genes)), 13) - - - # TODO - Load Roary file - - # TODO - - - # def test_rearrangement_predictions(self): - # consus_genome = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J'] - # alternative_core_pairs = {'D--G': 1, 'E--H': 1} - # - # characterise_rearrangements(consus_genome, alternative_core_pairs) - - - def test_soruce_identification(self): - # Identify Roary file - path = "/Users/mjespersen/OneDrive - The University of Melbourne/Phd/Parts/Recombination_hotspots/Code/Between_core_variation/data_for_unit_tests/Test_source_identification/Roray_gene_presence_absence" - source, file_path = define_input_source(path) - self.assertEqual(source, "Roary") - self.assertEqual(file_path, - "/Users/mjespersen/OneDrive - The University of Melbourne/Phd/Parts/Recombination_hotspots/Code/Between_core_variation/data_for_unit_tests/Test_source_identification/Roray_gene_presence_absence/gene_presence_absence.csv") - - # Identify Panaroo file - path = '/Users/mjespersen/OneDrive - The University of Melbourne/Phd/Parts/Recombination_hotspots/Code/Between_core_variation/data_for_unit_tests/Test_source_identification/Panaroo_gene_presence_absence' - source, file_path = define_input_source(path) - self.assertEqual(source, "Panaroo") - self.assertNotEqual(file_path, - '/Users/mjespersen/OneDrive - The University of 
Melbourne/Phd/Parts/Recombination_hotspots/Code/Between_core_variation/data_for_unit_tests/Test_source_identification/Panaroo_gene_presence_absence/gene_presence_absence.csv') - self.assertEqual(file_path, - '/Users/mjespersen/OneDrive - The University of Melbourne/Phd/Parts/Recombination_hotspots/Code/Between_core_variation/data_for_unit_tests/Test_source_identification/Panaroo_gene_presence_absence/gene_presence_absence_roary.csv') - - def test_locate_gene_data(self): - check_return = check_gene_data('/Users/mjespersen/OneDrive - The University of Melbourne/Phd/Parts/Recombination_hotspots/Code/Between_core_variation/data_for_unit_tests/test_pan_genome/same_genome_pan_split') - - self.assertEqual(os.path.isfile(check_return), True) - - with self.assertRaises(FileNotFoundError): - check_gene_data('/Users/mjespersen/OneDrive - The University of Melbourne/Phd/Parts/Recombination_hotspots/Code/Between_core_variation/data_for_unit_tests/test_pan_genome') - - def test_reading_gene_data_csv(self): - gene_data = read_gene_data('/Users/mjespersen/OneDrive - The University of Melbourne/Phd/Parts/Recombination_hotspots/Code/Between_core_variation/data_for_unit_tests/test_gene_data_reading/test_gene_data.csv') - - expected_dict = { - 'GCA_900636485': {'49_refound_1369': 'ATGACGATAGATGAAGCGTTGCAAAATTTACGTGATAACTTTAATAAAATAATGAATGTCCTAAAAAACGATTGGAAAGCACTATTGTTTCTTGCAATCACAATATTTGGGATGATGGTAACCGTGTCGTATTTTAGCTATCGCGACGCACGACAATATTACGAGTCGCAAATCACAGGACTACGTACACAGCTAAGCAGGACACAAAAGCAGCTTAAACGTGCTAGCGAAGATAGAGCTAGACAGACAAAGCGGATTGCGGAACTTACGCACAACGGAGGGTAG'}, - 'GCA_001019635': {'8_refound_250': 'ATGGAACCAAAATTACATCGGCAACTGCGTCAAAAATATGACGACGCTGAAAAACAATATCTTGAAAAGTTTGGAGAAGACTCGCTTGATAGAGTATTTTTTTGGGAGCCAGACGTTTACTTTGATGAGTGGAAAAAGGTTCTACCAGATGCAACACTGGAATTAAACAAAGCTATTAATAGCGGGGTGGCGATTGATCCAGATCCAGAAAACGCAATATATTAA', - '8_refound_251': 
'ATGAAAAGCTTTTTAAATTTAGTCAAACAAAAGTTGTTTAAACCAGGTCTAAAAAAACTCGTAAAGCTTCACAACTCCCAGAACGTTAATATATGCTTATATATCAACGATTGGAACTAATTTATGGTTCGCACCATGGTTTTTGTGGAAGGATCAAAAGTTGTCCTGAAAATTTCTCTTAACCGTGTTTAA'} - } - - self.assertEqual(expected_dict, gene_data) - - def test_reading_fasta_from_gff(self): - single_genome_dict, _, _ = extract_genome_fasta('/Users/mjespersen/OneDrive - The University of Melbourne/Phd/Parts/Recombination_hotspots/Code/Between_core_variation/data_for_unit_tests/Sample_gff_files/single_contig_mock_for_reading_fasta_in_gff.gff') - self.assertEqual(len(single_genome_dict), 1) - self.assertEqual(len(single_genome_dict[list(single_genome_dict.keys())[0]]), 540) - - multi_genome_dict, _, _ = extract_genome_fasta('/Users/mjespersen/OneDrive - The University of Melbourne/Phd/Parts/Recombination_hotspots/Code/Between_core_variation/data_for_unit_tests/Sample_gff_files/multi_contig_mock_for_reading_fasta_in_gff.gff') - self.assertEqual(len(multi_genome_dict), 2) - self.assertEqual(len(multi_genome_dict[list(multi_genome_dict.keys())[0]]), 540) - self.assertEqual(len(multi_genome_dict[list(multi_genome_dict.keys())[1]]), 300) - - def test_finding_largest_locus_tag(self): - _, locus_tag, _ = extract_genome_fasta('/Users/mjespersen/OneDrive - The University of Melbourne/Phd/Parts/Recombination_hotspots/Code/Between_core_variation/data_for_unit_tests/Sample_gff_files/single_contig_mock_for_reading_fasta_in_gff.gff') - - self.assertEqual(locus_tag, 'MONDJAPC_01960') - - -class TestGffParsing(unittest.TestCase): - def test_contig_length(self): - contig_dir = get_contig_lengths('/Users/mjespersen/OneDrive - The University of Melbourne/Phd/Parts/Recombination_hotspots/Code/Between_core_variation/data_for_unit_tests/test_length_of_contigs/contigs_length_test_pass.txt') - - contig_lengths = [contig_dir[contig] for contig in contig_dir.keys()] - - true_contig_length = [100, 10, 3, 5000] - - self.assertEqual(contig_lengths, true_contig_length) - - def 
test_contig_name_identification(self): - contig_dir = get_contig_lengths( - '/Users/mjespersen/OneDrive - The University of Melbourne/Phd/Parts/Recombination_hotspots/Code/Between_core_variation/data_for_unit_tests/test_length_of_contigs/contigs_length_test_pass.txt') - - true_contig_length = ['contig_1', 'test_contig_2', '3', '4'] - - self.assertEqual(true_contig_length, list(contig_dir.keys())) - - def test_duplicate_contig_name(self): - with self.assertRaises(ValueError): - get_contig_lengths('/Users/mjespersen/OneDrive - The University of Melbourne/Phd/Parts/Recombination_hotspots/Code/Between_core_variation/data_for_unit_tests/test_length_of_contigs/test_duplicate_contig_names.txt') - - -class TestCompleteGenomeParser(unittest.TestCase): - def test_parsing_file_no_extension(self): - expected_names = ['complete_genome1', 'comp.genome', '2'] - - complete_genomes = parse_complete_genome_file( - '/Users/mjespersen/OneDrive - The University of Melbourne/Phd/Parts/Recombination_hotspots/Code/Between_core_variation/data_for_unit_tests/test_parsing_complete_genomes/complete_genomes.txt', - expected_names) - - self.assertEqual(complete_genomes, expected_names) - - def test_parsing_file_with_input_extension(self): - expected_names = ['complete_genome1', 'comp.genome', '2'] - input_gffs = ['complete_genome1.gff', 'comp.genome.gff', '2.gff'] - - complete_genomes = parse_complete_genome_file( - '/Users/mjespersen/OneDrive - The University of Melbourne/Phd/Parts/Recombination_hotspots/Code/Between_core_variation/data_for_unit_tests/test_parsing_complete_genomes/complete_genomes_w_extensions.txt', - input_gffs) - - self.assertEqual(complete_genomes, expected_names) - - def test_parsing_file_with_extension(self): - expected_names = ['complete_genome1', 'comp.genome', '2'] - input_gffs = ['complete_genome1.gff', 'comp.genome.gff', '2.gff'] - - complete_genomes = parse_complete_genome_file( - '/Users/mjespersen/OneDrive - The University of 
Melbourne/Phd/Parts/Recombination_hotspots/Code/Between_core_variation/data_for_unit_tests/test_parsing_complete_genomes/complete_genomes.txt', - input_gffs) - - self.assertEqual(complete_genomes, expected_names) - - def test_parsing_file_with_path(self): - expected_names = ['complete_genome1', 'comp.genome', '2'] - input_gffs = ['/test/path/complete_genome1', '/all/the/paths/comp.genome', '/no/more/paths/2'] - - complete_genomes = parse_complete_genome_file( - '/Users/mjespersen/OneDrive - The University of Melbourne/Phd/Parts/Recombination_hotspots/Code/Between_core_variation/data_for_unit_tests/test_parsing_complete_genomes/complete_genomes.txt', - input_gffs) - - self.assertEqual(complete_genomes, expected_names) - - def test_parsing_file_with_path_and_extension(self): - expected_names = ['complete_genome1', 'comp.genome', '2'] - input_gffs = ['/test/path/complete_genome1.gff', '/all/the/paths/comp.genome.gff', '/no/more/paths/2.gff'] - - complete_genomes = parse_complete_genome_file( - '/Users/mjespersen/OneDrive - The University of Melbourne/Phd/Parts/Recombination_hotspots/Code/Between_core_variation/data_for_unit_tests/test_parsing_complete_genomes/complete_genomes.txt', - input_gffs) - - self.assertEqual(complete_genomes, expected_names) - -# class TestCoreGeneSyntenyTypes(unittest.TestCase): -# def test_assign_core_synteny_types(self): -# thing = [] -# -# self.assertEqual(len(set(thing)), 1) -# -# thing_90 = [] -# self.assertEqual(len(set(thing_90)), 2) - - -class TestRecordingCoreGene(unittest.TestCase): - def test_regular_core_gene_recording(self): - expected_previous_core_gene_id ='gene_1' - expected_previous_core_gene_end_coor = 200 - expected_acc_genes_in_region = [] - expected_low_freq_genes_in_region = [] - expected_core_gene_pair_distance = {'pan_cluster_1--pan_cluster_2': 99} - expected_accessory_gene_content = {'pan_cluster_1--pan_cluster_2': ['Acc_1', 'Acc_2']} - expected_low_freq_gene_content = {'pan_cluster_1--pan_cluster_2': ['low_1']} - 
expected_core_gene_pairs = ['pan_cluster_1--pan_cluster_2'] - expected_num_acc_genes_in_region = {'pan_cluster_1--pan_cluster_2': 3} - expected_master_info = {'pan_cluster_1--pan_cluster_2--genome_1': ['genome_1', 'pan_cluster_1', 'pan_cluster_2', 99, 3, ['Acc_1', 'Acc_2'], ['low_1']]} - - (previous_core_gene_id, - previous_core_gene_end_coor, - acc_genes_in_region, - low_freq_genes_in_region, - core_gene_pair_distance, - accessory_gene_content, - low_freq_gene_content, - core_gene_pairs, - num_acc_genes_in_region, - master_info) = record_core_core_region(core_genes={'genome_1': {'gene_1': 'pan_cluster_1', 'gene_2': 'pan_cluster_2'}}, - gff_name='genome_1', - gff_line=['contig_1', '', '', '100', '200', '', '', '', 'gene_1'], - contig_end=None, - previous_core_gene_id='gene_2', - previous_core_gene_end_coor=0, - acc_genes_in_region=['Acc_1', 'Acc_2'], - low_freq_genes_in_region=['low_1'], - core_gene_pair_distance={}, - accessory_gene_content={}, - low_freq_gene_content={}, - core_gene_pairs=[], - num_acc_genes_in_region={}, - master_info={}) - - self.assertEqual(expected_previous_core_gene_id, previous_core_gene_id) - self.assertEqual(expected_previous_core_gene_end_coor, previous_core_gene_end_coor) - self.assertEqual(expected_acc_genes_in_region, acc_genes_in_region) - self.assertEqual(expected_low_freq_genes_in_region, low_freq_genes_in_region) - self.assertEqual(expected_core_gene_pair_distance, core_gene_pair_distance) - self.assertEqual(expected_accessory_gene_content, accessory_gene_content) - self.assertEqual(expected_low_freq_gene_content, low_freq_gene_content) - self.assertEqual(expected_core_gene_pairs, core_gene_pairs) - # self.assertEqual(expected_num_acc_genes_in_region, num_acc_genes_in_region) - self.assertEqual(expected_master_info, master_info) - - def test_adding_last_core_at_contig_change(self): - expected_previous_core_gene_id = 'Sequence_break' - expected_previous_core_gene_end_coor = 100 - expected_acc_genes_in_region = [] - 
expected_low_freq_genes_in_region = [] - expected_core_gene_pair_distance = {'pan_cluster_2--Sequence_break': 99} - expected_accessory_gene_content = {'pan_cluster_2--Sequence_break': ['Acc_1', 'Acc_2']} - expected_low_freq_gene_content = {'pan_cluster_2--Sequence_break': ['low_1']} - expected_core_gene_pairs = ['pan_cluster_2--Sequence_break'] - expected_num_acc_genes_in_region = {'pan_cluster_2--Sequence_break': 3} - expected_master_info = { - 'pan_cluster_2--Sequence_break--genome_1': ['genome_1', 'pan_cluster_2', 'Sequence_break', 99, 3, - ['Acc_1', 'Acc_2'], ['low_1']]} - (previous_core_gene_id, - previous_core_gene_end_coor, - acc_genes_in_region, - low_freq_genes_in_region, - core_gene_pair_distance, - accessory_gene_content, - low_freq_gene_content, - core_gene_pairs, - num_acc_genes_in_region, - master_info) = record_core_core_region(core_genes={'genome_1': {'gene_1': 'pan_cluster_1', 'gene_2': 'pan_cluster_2'}}, - gff_name='genome_1', - gff_line=None, - contig_end=200, - previous_core_gene_id='gene_2', - previous_core_gene_end_coor=100, - acc_genes_in_region=['Acc_1', 'Acc_2'], - low_freq_genes_in_region=['low_1'], - core_gene_pair_distance={}, - accessory_gene_content={}, - low_freq_gene_content={}, - core_gene_pairs=[], - num_acc_genes_in_region={}, - master_info={}) - - self.assertEqual(expected_previous_core_gene_id, previous_core_gene_id) - self.assertEqual(expected_previous_core_gene_end_coor, previous_core_gene_end_coor) - self.assertEqual(expected_acc_genes_in_region, acc_genes_in_region) - self.assertEqual(expected_low_freq_genes_in_region, low_freq_genes_in_region) - self.assertEqual(expected_core_gene_pair_distance, core_gene_pair_distance) - self.assertEqual(expected_accessory_gene_content, accessory_gene_content) - self.assertEqual(expected_low_freq_gene_content, low_freq_gene_content) - self.assertEqual(expected_core_gene_pairs, core_gene_pairs) - # self.assertEqual(expected_num_acc_genes_in_region, num_acc_genes_in_region) - 
self.assertEqual(expected_master_info, master_info) - - def test_add_first_gene_on_new_contig(self): - expected_previous_core_gene_id = 'gene_1' - expected_previous_core_gene_end_coor = 200 - expected_acc_genes_in_region = [] - expected_low_freq_genes_in_region = [] - expected_core_gene_pair_distance = {'Sequence_break--pan_cluster_1': 99} - expected_accessory_gene_content = {'Sequence_break--pan_cluster_1': ['Acc_1', 'Acc_2']} - expected_low_freq_gene_content = {'Sequence_break--pan_cluster_1': ['low_1']} - expected_core_gene_pairs = ['Sequence_break--pan_cluster_1'] - expected_num_acc_genes_in_region = {'Sequence_break--pan_cluster_1': 3} - expected_master_info = { - 'Sequence_break--pan_cluster_1--genome_1': ['genome_1', 'Sequence_break', 'pan_cluster_1', 99, 3, - ['Acc_1', 'Acc_2'], ['low_1']]} - - (previous_core_gene_id, - previous_core_gene_end_coor, - acc_genes_in_region, - low_freq_genes_in_region, - core_gene_pair_distance, - accessory_gene_content, - low_freq_gene_content, - core_gene_pairs, - num_acc_genes_in_region, - master_info) = record_core_core_region(core_genes={'genome_1': {'gene_1': 'pan_cluster_1', 'gene_2': 'pan_cluster_2'}}, - gff_name='genome_1', - gff_line=['contig_1', '', '', '100', '200', '', '', '', 'gene_1'], - contig_end=0, - previous_core_gene_id='Sequence_break', - previous_core_gene_end_coor=200, - acc_genes_in_region=['Acc_1', 'Acc_2'], - low_freq_genes_in_region=['low_1'], - core_gene_pair_distance={}, - accessory_gene_content={}, - low_freq_gene_content={}, - core_gene_pairs=[], - num_acc_genes_in_region={}, - master_info={}) - - self.assertEqual(expected_previous_core_gene_id, previous_core_gene_id) - self.assertEqual(expected_previous_core_gene_end_coor, previous_core_gene_end_coor) - self.assertEqual(expected_acc_genes_in_region, acc_genes_in_region) - self.assertEqual(expected_low_freq_genes_in_region, low_freq_genes_in_region) - self.assertEqual(expected_core_gene_pair_distance, core_gene_pair_distance) - 
self.assertEqual(expected_accessory_gene_content, accessory_gene_content) - self.assertEqual(expected_low_freq_gene_content, low_freq_gene_content) - self.assertEqual(expected_core_gene_pairs, core_gene_pairs) - # self.assertEqual(expected_num_acc_genes_in_region, num_acc_genes_in_region) - self.assertEqual(expected_master_info, master_info) - - def test_adding_first_gene_of_genome_next_to_sequence_break(self): - expected_core_gene_pair_distance = {'Sequence_break--pan_cluster_1': 99} - expected_accessory_gene_content = {'Sequence_break--pan_cluster_1': []} - expected_low_freq_gene_content = {'Sequence_break--pan_cluster_1': []} - expected_core_gene_pairs = ['Sequence_break--pan_cluster_1'] - expected_num_acc_genes_in_region = {'Sequence_break--pan_cluster_1': 0} - expected_master_info = { - 'Sequence_break--pan_cluster_1--genome_1': ['genome_1', 'Sequence_break', 'pan_cluster_1', 99, 0, - [], []]} - - first_core_gene_id = 'gene_1' - first_core_gene_start_coor = 100 - - - (_, - _, - _, - _, - core_gene_pair_distance, - accessory_gene_content, - low_freq_gene_content, core_gene_pairs, - num_acc_genes_in_region, master_info) = record_core_core_region(core_genes={'genome_1': {'gene_1': 'pan_cluster_1', 'gene_2': 'pan_cluster_2'}}, - gff_name='genome_1', - gff_line=['contig_1', '', '', first_core_gene_start_coor, 100000, '', '', '', first_core_gene_id], - contig_end=0, - previous_core_gene_id='Sequence_break', - previous_core_gene_end_coor=10000000, - acc_genes_in_region=[], - low_freq_genes_in_region=[], - core_gene_pair_distance={}, - accessory_gene_content={}, - low_freq_gene_content={}, - core_gene_pairs=[], - num_acc_genes_in_region={}, - master_info={}) - - self.assertEqual(expected_core_gene_pair_distance, core_gene_pair_distance) - self.assertEqual(expected_accessory_gene_content, accessory_gene_content) - self.assertEqual(expected_low_freq_gene_content, low_freq_gene_content) - self.assertEqual(expected_core_gene_pairs, core_gene_pairs) - # 
self.assertEqual(expected_num_acc_genes_in_region, num_acc_genes_in_region) - self.assertEqual(expected_master_info, master_info) - - def test_adding_last_core_gene_next_to_sequence_break_in_incomplete_genome(self): - expected_previous_core_gene_id = 'Sequence_break' - expected_previous_core_gene_end_coor = 200 - expected_acc_genes_in_region = [] - expected_low_freq_genes_in_region = [] - expected_core_gene_pair_distance = {'pan_cluster_2--Sequence_break': 99} - expected_accessory_gene_content = {'pan_cluster_2--Sequence_break': ['Acc_1', 'Acc_2']} - expected_low_freq_gene_content = {'pan_cluster_2--Sequence_break': ['low_1']} - expected_core_gene_pairs = ['pan_cluster_2--Sequence_break'] - expected_num_acc_genes_in_region = {'pan_cluster_2--Sequence_break': 3} - expected_master_info = { - 'pan_cluster_2--Sequence_break--genome_1': ['genome_1', 'pan_cluster_2', 'Sequence_break', 99, 3, - ['Acc_1', 'Acc_2'], ['low_1']]} - - (previous_core_gene_id, - previous_core_gene_end_coor, - acc_genes_in_region, - low_freq_genes_in_region, - core_gene_pair_distance, - accessory_gene_content, - low_freq_gene_content, core_gene_pairs, - num_acc_genes_in_region, master_info) = record_core_core_region(core_genes={'genome_1': {'gene_1': 'pan_cluster_1', 'gene_2': 'pan_cluster_2'}}, - gff_name='genome_1', - gff_line=None, - contig_end=300, - previous_core_gene_id='gene_2', - previous_core_gene_end_coor=200, - acc_genes_in_region=['Acc_1', 'Acc_2'], - low_freq_genes_in_region=['low_1'], - core_gene_pair_distance={}, - accessory_gene_content={}, - low_freq_gene_content={}, - core_gene_pairs=[], - num_acc_genes_in_region={}, - master_info={}) - - - self.assertEqual(expected_previous_core_gene_id, previous_core_gene_id) - self.assertEqual(expected_previous_core_gene_end_coor, previous_core_gene_end_coor) - self.assertEqual(expected_acc_genes_in_region, acc_genes_in_region) - self.assertEqual(expected_low_freq_genes_in_region, low_freq_genes_in_region) - 
self.assertEqual(expected_core_gene_pair_distance, core_gene_pair_distance) - self.assertEqual(expected_accessory_gene_content, accessory_gene_content) - self.assertEqual(expected_low_freq_gene_content, low_freq_gene_content) - self.assertEqual(expected_core_gene_pairs, core_gene_pairs) - # self.assertEqual(expected_num_acc_genes_in_region, num_acc_genes_in_region) - self.assertEqual(expected_master_info, master_info) - - def test_adding_last_core_at_contig_and_first_of_next_in_chain(self): - expected_previous_core_gene_id = 'gene_1' - expected_previous_core_gene_end_coor = 200 - expected_acc_genes_in_region = [] - expected_low_freq_genes_in_region = [] - expected_core_gene_pair_distance = {'pan_cluster_2--Sequence_break': 99, 'Sequence_break--pan_cluster_1': 999} - expected_accessory_gene_content = {'pan_cluster_2--Sequence_break': ['Acc_1', 'Acc_2'], 'Sequence_break--pan_cluster_1': []} - expected_low_freq_gene_content = {'pan_cluster_2--Sequence_break': ['low_1'], 'Sequence_break--pan_cluster_1': []} - expected_core_gene_pairs = ['pan_cluster_2--Sequence_break', 'Sequence_break--pan_cluster_1'] - expected_num_acc_genes_in_region = {'pan_cluster_2--Sequence_break': 3, 'Sequence_break--pan_cluster_1': 0} - expected_master_info = { - 'pan_cluster_2--Sequence_break--genome_1': ['genome_1', 'pan_cluster_2', 'Sequence_break', 99, 3, - ['Acc_1', 'Acc_2'], ['low_1']], - 'Sequence_break--pan_cluster_1--genome_1': ['genome_1', 'Sequence_break', 'pan_cluster_1', 999, 0, [], []]} - - - # Add last gene before contig break - (previous_core_gene_id, - previous_core_gene_end_coor, - acc_genes_in_region, - low_freq_genes_in_region, - core_gene_pair_distance, - accessory_gene_content, - low_freq_gene_content, - core_gene_pairs, - num_acc_genes_in_region, - master_info) = record_core_core_region(core_genes={'genome_1': {'gene_1': 'pan_cluster_1', 'gene_2': 'pan_cluster_2'}}, - gff_name='genome_1', - gff_line=None, - contig_end=200, - previous_core_gene_id='gene_2', - 
previous_core_gene_end_coor=100, - acc_genes_in_region=['Acc_1', 'Acc_2'], - low_freq_genes_in_region=['low_1'], - core_gene_pair_distance={}, - accessory_gene_content={}, - low_freq_gene_content={}, - core_gene_pairs=[], - num_acc_genes_in_region={}, - master_info={}) - - - # Use info from previous gene to add the first core gene on the next contig. - (previous_core_gene_id, - previous_core_gene_end_coor, - acc_genes_in_region, - low_freq_genes_in_region, - core_gene_pair_distance, - accessory_gene_content, - low_freq_gene_content, - core_gene_pairs, - num_acc_genes_in_region, - master_info) = record_core_core_region(core_genes={'genome_1': {'gene_1': 'pan_cluster_1', 'gene_2': 'pan_cluster_2'}}, - gff_name='genome_1', - gff_line=['contig_1', '', '', '1000', '200', '', '', '', 'gene_1'], - contig_end=0, - previous_core_gene_id=previous_core_gene_id, - previous_core_gene_end_coor=previous_core_gene_end_coor, - acc_genes_in_region=acc_genes_in_region, - low_freq_genes_in_region=low_freq_genes_in_region, - core_gene_pair_distance=core_gene_pair_distance, - accessory_gene_content=accessory_gene_content, - low_freq_gene_content=low_freq_gene_content, - core_gene_pairs=core_gene_pairs, - num_acc_genes_in_region=num_acc_genes_in_region, - master_info=master_info) - - # Assess how the test went. 
- self.assertEqual(expected_previous_core_gene_id, previous_core_gene_id) - self.assertEqual(expected_previous_core_gene_end_coor, previous_core_gene_end_coor) - self.assertEqual(expected_acc_genes_in_region, acc_genes_in_region) - self.assertEqual(expected_low_freq_genes_in_region, low_freq_genes_in_region) - self.assertEqual(expected_core_gene_pair_distance, core_gene_pair_distance) - self.assertEqual(expected_accessory_gene_content, accessory_gene_content) - self.assertEqual(expected_low_freq_gene_content, low_freq_gene_content) - self.assertEqual(expected_core_gene_pairs, core_gene_pairs) - # self.assertEqual(expected_num_acc_genes_in_region, num_acc_genes_in_region) - self.assertEqual(expected_master_info, master_info) - - def test_recording_last_n_first_core_on_closed_contig(self): - expected_previous_core_gene_id = 'Complete_genome_end_fail' - expected_previous_core_gene_end_coor = 100 - expected_acc_genes_in_region = [] - expected_low_freq_genes_in_region = [] - expected_core_gene_pairs = ['pan_cluster_1--pan_cluster_2'] - expected_core_gene_pair_distance = {'pan_cluster_1--pan_cluster_2': 100} - expected_accessory_gene_content = {'pan_cluster_1--pan_cluster_2': ['Acc_2', 'Acc_3', 'Acc_1']} - expected_low_freq_gene_content = {'pan_cluster_1--pan_cluster_2': ['low_1', 'low_2']} - expected_master_info = {'pan_cluster_1--pan_cluster_2--genome_1': ['genome_1', 'pan_cluster_1', 'pan_cluster_2', 100, 5, - ['Acc_2', 'Acc_3', 'Acc_1'], ['low_1', 'low_2']]} - - (previous_core_gene_id, - previous_core_gene_end_coor, - acc_genes_in_region, - low_freq_genes_in_region, - core_gene_pairs, - core_gene_pair_distance, - accessory_gene_content, - low_freq_gene_content, - master_info) = connect_first_n_last_gene_on_contig(core_genes={'genome_1': {'gene_1': 'pan_cluster_1', 'gene_2': 'pan_cluster_2'}}, - gff_name='genome_1', - previous_core_gene_id='gene_2', - previous_core_gene_end_coor=100, - first_core_gene_gff_line=['contig_1', '', '', '0', '50', '', '', '', 'gene_1'], - 
acc_genes_in_region=['Acc_2', 'Acc_3'], - first_core_accessory_content=['Acc_1'], - low_freq_genes_in_region=['low_1'], - first_core_low_freq_genes=['low_2'], - contig_size=200, - core_gene_pairs=[], - core_gene_pair_distance={}, - accessory_gene_content={}, - low_freq_gene_content={}, - master_info={}) - - self.assertEqual(expected_previous_core_gene_id, previous_core_gene_id) - self.assertEqual(expected_previous_core_gene_end_coor, previous_core_gene_end_coor) - self.assertEqual(expected_acc_genes_in_region, acc_genes_in_region) - self.assertEqual(expected_low_freq_genes_in_region, low_freq_genes_in_region) - self.assertEqual(expected_core_gene_pairs, core_gene_pairs) - self.assertEqual(expected_core_gene_pair_distance, core_gene_pair_distance) - self.assertEqual(expected_accessory_gene_content, accessory_gene_content) - self.assertEqual(expected_low_freq_gene_content, low_freq_gene_content) - self.assertEqual(expected_master_info, master_info) - - -class TestSegmentationIdentification(unittest.TestCase): - def test_core_gene_graph_construction_circle_case(self): - expected_edges = [('pan_cluster_1', 'pan_cluster_2'), ('pan_cluster_1', 'pan_cluster_6'), ('pan_cluster_2', 'pan_cluster_3'), - ('pan_cluster_3', 'pan_cluster_4'), ('pan_cluster_4', 'pan_cluster_5'), ('pan_cluster_5', 'pan_cluster_6')] - - expected_degrees = [('pan_cluster_1', 2), ('pan_cluster_2', 2), ('pan_cluster_3', 2), ('pan_cluster_4', 2), ('pan_cluster_5', 2), ('pan_cluster_6', 2)] - - expected_edge_weights = [10, 10, 10, 10, 10, 10] - - core_neighbour_pairs = {'pan_cluster_1--pan_cluster_2': 10, - 'pan_cluster_2--pan_cluster_3': 10, - 'pan_cluster_3--pan_cluster_4': 10, - 'pan_cluster_4--pan_cluster_5': 10, - 'pan_cluster_5--pan_cluster_6': 10, - 'pan_cluster_1--pan_cluster_6': 10} - - core_graph = construct_core_graph(core_neighbour_pairs) - - # Get edge weights: - edge_weights = [core_graph.get_edge_data(edge[0], edge[1])['weight'] for edge in list(core_graph.edges)] - - # Assert outputs - 
self.assertEqual(expected_edges, list(core_graph.edges)) - self.assertEqual(expected_degrees, list(core_graph.degree)) - self.assertEqual(expected_edge_weights, edge_weights) - - def test_core_gene_graph_construction_circle_case_with_single_break(self): - expected_edges = [('pan_cluster_1', 'pan_cluster_2'), ('pan_cluster_1', 'pan_cluster_6'), ('pan_cluster_2', 'pan_cluster_3'), - ('pan_cluster_3', 'pan_cluster_4'), ('pan_cluster_4', 'pan_cluster_5'), ('pan_cluster_5', 'pan_cluster_6')] - - expected_degrees = [('pan_cluster_1', 2), ('pan_cluster_2', 2), ('pan_cluster_3', 2), ('pan_cluster_4', 2), ('pan_cluster_5', 2), ('pan_cluster_6', 2)] - - expected_edge_weights = [9, 10, 9, 10, 10, 10] - - core_neighbour_pairs = {'pan_cluster_1--pan_cluster_2': 9, - 'pan_cluster_1--Sequence_break': 1, - 'Sequence_break--pan_cluster_2': 1, - 'pan_cluster_2--pan_cluster_3': 9, - 'pan_cluster_3--pan_cluster_4': 10, - 'pan_cluster_4--pan_cluster_5': 10, - 'pan_cluster_5--pan_cluster_6': 10, - 'pan_cluster_1--pan_cluster_6': 10} - - core_graph = construct_core_graph(core_neighbour_pairs) - - # Get edge weights: - edge_weights = [core_graph.get_edge_data(edge[0], edge[1])['weight'] for edge in list(core_graph.edges)] - - # Assert outputs - self.assertEqual(expected_edges, list(core_graph.edges)) - self.assertEqual(expected_degrees, list(core_graph.degree)) - self.assertEqual(expected_edge_weights, edge_weights) - - def test_core_gene_graph_construction_three_degree_case(self): - expected_edges = [('pan_cluster_1', 'pan_cluster_2'), ('pan_cluster_1', 'pan_cluster_6'), ('pan_cluster_1', 'pan_cluster_4'), ('pan_cluster_2', 'pan_cluster_3'), - ('pan_cluster_3', 'pan_cluster_4'), ('pan_cluster_4', 'pan_cluster_5'), ('pan_cluster_5', 'pan_cluster_6')] - - expected_degrees = [('pan_cluster_1', 3), ('pan_cluster_2', 2), ('pan_cluster_3', 2), ('pan_cluster_4', 3), ('pan_cluster_5', 2), ('pan_cluster_6', 2)] - - expected_edge_weights = [10, 8, 2, 10, 8, 10, 10] - - core_neighbour_pairs = 
{'pan_cluster_1--pan_cluster_2': 10, - 'pan_cluster_2--pan_cluster_3': 10, - 'pan_cluster_3--pan_cluster_4': 8, - 'pan_cluster_4--pan_cluster_5': 10, - 'pan_cluster_5--pan_cluster_6': 10, - 'pan_cluster_1--pan_cluster_6': 8, - 'pan_cluster_1--pan_cluster_4': 2} - - core_graph = construct_core_graph(core_neighbour_pairs) - - # Get edge weights: - edge_weights = [core_graph.get_edge_data(edge[0], edge[1])['weight'] for edge in list(core_graph.edges)] - - # Assert outputs - self.assertEqual(expected_edges, list(core_graph.edges)) - self.assertEqual(expected_degrees, list(core_graph.degree)) - self.assertEqual(expected_edge_weights, edge_weights) - - def test_core_gene_graph_construction_three_degree_n_sequence_breaks_case(self): - expected_edges = [('pan_cluster_1', 'pan_cluster_2'), ('pan_cluster_1', 'pan_cluster_6'), ('pan_cluster_1', 'pan_cluster_4'), ('pan_cluster_2', 'pan_cluster_3'), - ('pan_cluster_3', 'pan_cluster_4'), ('pan_cluster_4', 'pan_cluster_5'), ('pan_cluster_5', 'pan_cluster_6')] - - expected_degrees = [('pan_cluster_1', 3), ('pan_cluster_2', 2), ('pan_cluster_3', 2), ('pan_cluster_4', 3), ('pan_cluster_5', 2), ('pan_cluster_6', 2)] - - expected_edge_weights = [10, 8, 2, 10, 8, 10, 10] - - core_neighbour_pairs = {'pan_cluster_1--pan_cluster_2': 10, - 'pan_cluster_2--pan_cluster_3': 10, - 'pan_cluster_3--pan_cluster_4': 8, - 'pan_cluster_4--pan_cluster_5': 10, - 'pan_cluster_5--pan_cluster_6': 10, - 'pan_cluster_1--pan_cluster_6': 8, - 'pan_cluster_1--pan_cluster_4': 2, - 'pan_cluster_3--Sequence_break': 2, - 'pan_cluster_6--Sequence_break': 2} - - core_graph = construct_core_graph(core_neighbour_pairs) - - # Get edge weights: - edge_weights = [core_graph.get_edge_data(edge[0], edge[1])['weight'] for edge in list(core_graph.edges)] - - # Assert outputs - self.assertEqual(expected_edges, list(core_graph.edges)) - self.assertEqual(expected_degrees, list(core_graph.degree)) - self.assertEqual(expected_edge_weights, edge_weights) - - def 
test_double_edge_segment_identification_all_2_degree_input(self): - core_neighbour_pairs = {'pan_cluster_1--pan_cluster_2': 10, - 'pan_cluster_2--pan_cluster_3': 10, - 'pan_cluster_3--pan_cluster_4': 10, - 'pan_cluster_4--pan_cluster_5': 10, - 'pan_cluster_5--pan_cluster_6': 10, - 'pan_cluster_6--pan_cluster_1': 10} - - core_graph = construct_core_graph(core_neighbour_pairs) - - return_1, return_2, return_3 = identify_segments(core_graph, 10) - - self.assertEqual(None, return_1) - self.assertEqual(None, return_2) - self.assertEqual(None, return_3) - - - def test_double_edge_segment_identification_two_segments(self): - expected_segments = {'pan_cluster_1--pan_cluster_5': ['pan_cluster_1', 'pan_cluster_6', 'pan_cluster_5'], 'pan_cluster_2--pan_cluster_4': ['pan_cluster_2', 'pan_cluster_3', 'pan_cluster_4']} - - core_neighbour_pairs = {'pan_cluster_1--pan_cluster_2': 9, - 'pan_cluster_1--pan_cluster_4': 1, - 'pan_cluster_2--pan_cluster_3': 10, - 'pan_cluster_3--pan_cluster_4': 10, - 'pan_cluster_2--pan_cluster_5': 1, - 'pan_cluster_4--pan_cluster_5': 9, - 'pan_cluster_5--pan_cluster_6': 10, - 'pan_cluster_6--pan_cluster_1': 10} - - core_graph = construct_core_graph(core_neighbour_pairs) - - double_edge_segements, connect_dict, multi_edge_nodes = identify_segments(core_graph, 10) - - self.assertEqual(expected_segments, double_edge_segements) - - def test_double_edge_segment_identification_four_segments(self): - expected_segments = {'pan_cluster_1--pan_cluster_3': ['pan_cluster_1', 'pan_cluster_2', 'pan_cluster_3'], - 'pan_cluster_1--pan_cluster_9': ['pan_cluster_1', 'pan_cluster_10', 'pan_cluster_9'], - 'pan_cluster_3--pan_cluster_6': ['pan_cluster_6', 'pan_cluster_5', 'pan_cluster_4', 'pan_cluster_3'], - 'pan_cluster_6--pan_cluster_9': ['pan_cluster_6', 'pan_cluster_7', 'pan_cluster_8', 'pan_cluster_9']} - - core_neighbour_pairs = {'pan_cluster_1--pan_cluster_2': 9, - 'pan_cluster_1--pan_cluster_6': 1, - 'pan_cluster_2--pan_cluster_3': 10, - 
'pan_cluster_3--pan_cluster_4': 9, - 'pan_cluster_3--pan_cluster_9': 1, - 'pan_cluster_4--pan_cluster_5': 10, - 'pan_cluster_5--pan_cluster_6': 10, - 'pan_cluster_6--pan_cluster_7': 10, - 'pan_cluster_7--pan_cluster_8': 10, - 'pan_cluster_8--pan_cluster_9': 9, - 'pan_cluster_9--pan_cluster_10': 10, - 'pan_cluster_1--pan_cluster_10': 10} - - core_graph = construct_core_graph(core_neighbour_pairs) - - double_edge_segements, connect_dict, multi_edge_nodes = identify_segments(core_graph, 10) - - self.assertEqual(expected_segments, double_edge_segements) - - def test_double_edge_segment_identification_segments_node_w_four_degrees(self): - print(4) - expected_segments = {'pan_cluster_4--pan_cluster_6': ['pan_cluster_4', 'pan_cluster_5', 'pan_cluster_6']} - - core_neighbour_pairs = {'pan_cluster_1--pan_cluster_2': 9, - 'pan_cluster_2--pan_cluster_3': 9, - 'pan_cluster_2--pan_cluster_4': 1, - 'pan_cluster_2--pan_cluster_6': 1, - 'pan_cluster_3--pan_cluster_4': 9, - 'pan_cluster_4--pan_cluster_5': 10, - 'pan_cluster_5--pan_cluster_6': 10, - 'pan_cluster_6--pan_cluster_1': 9} - - core_graph = construct_core_graph(core_neighbour_pairs) - double_edge_segements, connect_dict, multi_edge_nodes = identify_segments(core_graph, 10) - - self.assertEqual(expected_segments, double_edge_segements) - - def test_double_edge_segment_identification_segments_node_w_challenging_paths(self): - print(4) - expected_segments = {'pan_cluster_A--pan_cluster_B': ['pan_cluster_A', 'pan_cluster_E', 'pan_cluster_F', 'pan_cluster_G', 'pan_cluster_B']} - - core_neighbour_pairs = {'pan_cluster_A--pan_cluster_C': 4, - 'pan_cluster_A--pan_cluster_D': 4, - 'pan_cluster_A--pan_cluster_E': 2, - 'pan_cluster_B--pan_cluster_C': 5, - 'pan_cluster_B--pan_cluster_D': 3, - 'pan_cluster_B--pan_cluster_G': 2, - 'pan_cluster_C--pan_cluster_D': 1, - 'pan_cluster_E--pan_cluster_F': 2, - 'pan_cluster_F--pan_cluster_G': 2, - } - - core_graph = construct_core_graph(core_neighbour_pairs) - print(core_graph.degree) - 
double_edge_segements, connect_dict, multi_edge_nodes = identify_segments(core_graph, 10) - - self.assertEqual(expected_segments, double_edge_segements) - - def test_double_edge_segment_identification_segments_node_w_challenging_paths_2(self): - expected_segments = {'pan_cluster_A--pan_cluster_B': ['pan_cluster_A', 'pan_cluster_F', 'pan_cluster_B'], - 'pan_cluster_B--pan_cluster_C': ['pan_cluster_B', 'pan_cluster_I', 'pan_cluster_C']} - - core_neighbour_pairs = {'pan_cluster_A--pan_cluster_D': 2, - 'pan_cluster_A--pan_cluster_E': 1, - 'pan_cluster_A--pan_cluster_F': 7, - 'pan_cluster_B--pan_cluster_F': 7, - 'pan_cluster_B--pan_cluster_I': 8, - 'pan_cluster_B--pan_cluster_D': 1, - 'pan_cluster_C--pan_cluster_E': 1, - 'pan_cluster_C--pan_cluster_D': 1, - 'pan_cluster_C--pan_cluster_I': 8, - 'pan_cluster_D--pan_cluster_E': 1 - } - - core_graph = construct_core_graph(core_neighbour_pairs) - double_edge_segements, connect_dict, multi_edge_nodes = identify_segments(core_graph, 10) - - self.assertEqual(expected_segments, double_edge_segements) - - def test_double_edge_segment_identification_segments_node_w_all_challenging_paths(self): - expected_segments = {'pan_cluster_A--pan_cluster_D': ['pan_cluster_A', 'pan_cluster_G', 'pan_cluster_F', 'pan_cluster_E', 'pan_cluster_D'], - 'pan_cluster_B--pan_cluster_C': ['pan_cluster_B', 'pan_cluster_H', 'pan_cluster_I', 'pan_cluster_J', 'pan_cluster_C']}#,} - #'pan_cluster_A--pan_cluster_C': ['pan_cluster_A', 'pan_cluster_K', 'pan_cluster_C'], - #'pan_cluster_B--pan_cluster_D': ['pan_cluster_B', 'pan_cluster_L', 'pan_cluster_D']} - - core_neighbour_pairs = {'pan_cluster_A--pan_cluster_B': 4, - 'pan_cluster_A--pan_cluster_K': 4, - 'pan_cluster_A--pan_cluster_G': 2, - 'pan_cluster_B--pan_cluster_H': 2, - 'pan_cluster_B--pan_cluster_L': 4, - 'pan_cluster_C--pan_cluster_J': 2, - 'pan_cluster_C--pan_cluster_K': 4, - 'pan_cluster_D--pan_cluster_C': 4, - 'pan_cluster_D--pan_cluster_L': 4, - 'pan_cluster_D--pan_cluster_E': 2, - 
'pan_cluster_E--pan_cluster_F': 2, - 'pan_cluster_F--pan_cluster_G': 2, - 'pan_cluster_H--pan_cluster_I': 2, - 'pan_cluster_I--pan_cluster_J': 2, - 'pan_cluster_K--pan_cluster_L': 1 - } - - - core_graph = construct_core_graph(core_neighbour_pairs) - double_edge_segements, connect_dict, multi_edge_nodes = identify_segments(core_graph, 10) - - print(double_edge_segements) - - self.assertEqual(expected_segments, double_edge_segements) - -class TestNoAccessorySegmentIdentifcation(unittest.TestCase): - def test_no_accessory_genes_in_segment(self): - expected_sub_sgments = {'pan_cluster_1~~pan_cluster_5': [['pan_cluster_1', 'pan_cluster_6', 'pan_cluster_5']], - 'pan_cluster_2~~pan_cluster_4': [['pan_cluster_2', 'pan_cluster_3', 'pan_cluster_4']]} - - double_edge_segements = {'pan_cluster_1~~pan_cluster_5': ['pan_cluster_1', 'pan_cluster_6', 'pan_cluster_5'], - 'pan_cluster_2~~pan_cluster_4': ['pan_cluster_2', 'pan_cluster_3', 'pan_cluster_4']} - combined_acc_gene_count = {'pan_cluster_1--pan_cluster_6': 0, 'pan_cluster_5--pan_cluster_6': 0, 'pan_cluster_2--pan_cluster_3': 0, 'pan_cluster_3--pan_cluster_4': 0} - - sub_segment_dict = identify_no_accessory_segments(double_edge_segements, combined_acc_gene_count) - - print(sub_segment_dict) - - self.assertEqual(sub_segment_dict, expected_sub_sgments) - - def test_accessory_genes_in_segment_first_gene_lonely(self): - expected_sub_sgments = {'pan_cluster_1~~pan_cluster_5': [['pan_cluster_1'], ['pan_cluster_6', 'pan_cluster_5']]} - - double_edge_segements = {'pan_cluster_1~~pan_cluster_5': ['pan_cluster_1', 'pan_cluster_6', 'pan_cluster_5']} - combined_acc_gene_count = {'pan_cluster_1--pan_cluster_6': 1, 'pan_cluster_5--pan_cluster_6': 0} - - sub_segment_dict = identify_no_accessory_segments(double_edge_segements, combined_acc_gene_count) - - self.assertEqual(sub_segment_dict, expected_sub_sgments) - - def test_accessory_genes_in_segment_last_gene_lonely(self): - expected_sub_sgments = {'pan_cluster_1~~pan_cluster_5': 
[['pan_cluster_1', 'pan_cluster_6'], ['pan_cluster_5']], - 'pan_cluster_2~~pan_cluster_4': [['pan_cluster_2', 'pan_cluster_3', 'pan_cluster_4']]} - - double_edge_segements = {'pan_cluster_1~~pan_cluster_5': ['pan_cluster_1', 'pan_cluster_6', 'pan_cluster_5'], - 'pan_cluster_2~~pan_cluster_4': ['pan_cluster_2', 'pan_cluster_3', 'pan_cluster_4']} - combined_acc_gene_count = {'pan_cluster_1--pan_cluster_6': 0, 'pan_cluster_5--pan_cluster_6': 1, 'pan_cluster_2--pan_cluster_3': 0, 'pan_cluster_3--pan_cluster_4': 0} - - sub_segment_dict = identify_no_accessory_segments(double_edge_segements, combined_acc_gene_count) - - self.assertEqual(sub_segment_dict, expected_sub_sgments) - - def test_accessory_genes_in_segment_middle(self): - expected_sub_sgments = {'pan_cluster_1~~pan_cluster_4': [['pan_cluster_1', 'pan_cluster_2'], ['pan_cluster_3', 'pan_cluster_4']]} - - double_edge_segements = {'pan_cluster_1~~pan_cluster_4': ['pan_cluster_1', 'pan_cluster_2', 'pan_cluster_3', 'pan_cluster_4']} - combined_acc_gene_count = {'pan_cluster_1--pan_cluster_2': 0, 'pan_cluster_2--pan_cluster_3': 1, 'pan_cluster_3--pan_cluster_4': 0} - - sub_segment_dict = identify_no_accessory_segments(double_edge_segements, combined_acc_gene_count) - - self.assertEqual(sub_segment_dict, expected_sub_sgments) - -if __name__ == '__main__': - unittest.main() diff --git a/Code_to_transfer/time_calculator.py b/Code_to_transfer/time_calculator.py deleted file mode 100644 index 96651bc..0000000 --- a/Code_to_transfer/time_calculator.py +++ /dev/null @@ -1,19 +0,0 @@ -import datetime - - -def time_calculator(start_time, end_time, task): - total_time = int(end_time - start_time) - - seconds = total_time - minutes = 0 - hours = 0 - - if total_time > 59: - minutes = total_time // 60 - seconds = total_time % 60 - - if minutes > 59: - hours = minutes // 60 - minutes = minutes % 60 - print(f"Time used on {task}: {datetime.time(hours, minutes, seconds)}\n") - 
print(f"--------------------------------------------------------------\n") diff --git a/Corekaburra.cwl b/Corekaburra.cwl deleted file mode 100644 index a98d2ef..0000000 --- a/Corekaburra.cwl +++ /dev/null @@ -1,26 +0,0 @@ -#!/usr/bin/env cwl-runner - -baseCommand: [Corekaburra] -class: CommandLineTool -cwlVersion: v1.0 -id: Corekaburra -inputs: - - doc: Input FASTA files - id: fasta_file - inputBinding: {position: 0} - type: File - - doc: Minimum length sequence to include in stats (default 0) - id: min_len - inputBinding: {prefix: --minlen} - type: long? - - doc: record program progress in LOG_FILE - id: log - inputBinding: {prefix: --log} - type: string? -outputs: - - doc: Stats file - id: stats - type: stdout -requirements: - - class: DockerRequirement - dockerPull: corekaburra diff --git a/Corekaburra/gff_parser.py b/Corekaburra/gff_parser.py index e0b8a01..1b0ce98 100755 --- a/Corekaburra/gff_parser.py +++ b/Corekaburra/gff_parser.py @@ -395,7 +395,7 @@ def segment_gff_content(gff_generator, core_genes, low_freq_genes, gff_path, acc # acc_genes_in_region.append(acc_genes[gff_name][line[8]]) try: acc_genes_in_region.append(acc_genes[gff_name][line[8]]) - except KeyError: # TODO - WHAT DOES THIS DO? - Likely search for fragment within composite, as fragments were previously storred in their composit strings. + except KeyError: gene_key = [key for key in acc_genes[gff_name].keys() if line[8] in key] if len(gene_key) > 1: acc_genes_in_region.append(acc_genes[gff_name][gene_key][0]) diff --git a/Corekaburra/output_writer_functions.py b/Corekaburra/output_writer_functions.py index 90e9d84..39ad362 100755 --- a/Corekaburra/output_writer_functions.py +++ b/Corekaburra/output_writer_functions.py @@ -103,7 +103,6 @@ def segment_writer(segments, out_path, prefix): :param prefix: Prefix for any output files :return: Nothing """ - # TODO - Maybe include presence of core genes in segment output? 
# Generate file name out_file_name = 'core_segments.csv' if prefix is not None: diff --git a/Corekaburra/parse_gene_presence_absence.py b/Corekaburra/parse_gene_presence_absence.py index 4b4764b..aeb1e4a 100755 --- a/Corekaburra/parse_gene_presence_absence.py +++ b/Corekaburra/parse_gene_presence_absence.py @@ -14,7 +14,6 @@ except ModuleNotFoundError: from exit_with_error import exit_with_error - def add_gene_to_dict(main_dict, gene, pan_gene_name, genome): """ Function to add a gene to a given dictionary @@ -25,7 +24,7 @@ def add_gene_to_dict(main_dict, gene, pan_gene_name, genome): :return: returns the dict to be used further """ if ';' in gene: - for gene_part in gene.split(';'): # TODO - NOTE! HERE BOTH GENES IN A PAIR IS ADDED as separate key/value-pairs + for gene_part in gene.split(';'): main_dict[genome][gene_part] = pan_gene_name else: main_dict[genome][gene] = pan_gene_name @@ -42,9 +41,9 @@ def check_fragmented_gene(fragment_info, input_gffs, tmp_folder_path, gene_data_ :return: A List of booleans indicating if a fragments has nothing in between fragments (True) or not (False) """ # Check if any refound genes are in fragments to be checked, if then reannotate the genes before checking: - refound_fregments = [[i, gene_gff] for i, gene_gff in enumerate(fragment_info) if 'refound' in gene_gff[0]] - if refound_fregments: - for i, gene_gff in refound_fregments: + refound_genes = [[i, gene_gff] for i, gene_gff in enumerate(fragment_info) if 'refound' in gene_gff[0]] + if refound_genes: + for i, gene_gff in refound_genes: gene, gff = gene_gff gff_name = None @@ -135,7 +134,6 @@ def check_fragmented_gene(fragment_info, input_gffs, tmp_folder_path, gene_data_ fragments_close.append(False) return fragments_close - # TODO - find out what the non-closed file problem is here! Can be seen when running unit-tests. 
def read_gene_presence_absence(pres_abs_file, core_gene_presence, low_freq_gene, source_program, input_gffs, tmp_folder_path, gene_data_dict, corrected_dir, logger): @@ -219,7 +217,7 @@ def read_gene_presence_absence(pres_abs_file, core_gene_presence, low_freq_gene, # Check that each annotation is neighboring the other annotation. fragments_close = check_fragmented_gene(fragment_info, input_gffs, tmp_folder_path, gene_data_dict, - corrected_dir, logger) # TODO - If a core gene is found to be made up of fragments not places close enough (With something in between) should this then not be subtracted from the core gene count? - How would this be handled if there is a gff that is not given as input? + corrected_dir, logger) # Check if gene was found to be a core gene if all(fragments_close): # Add the gene to the annotation dict diff --git a/Corekaburra/summary_table.py b/Corekaburra/summary_table.py index 5c4b6d9..3e4ca4e 100755 --- a/Corekaburra/summary_table.py +++ b/Corekaburra/summary_table.py @@ -58,12 +58,12 @@ def calculate_n_create_summaries(master_info, core_gene_dict): pair_occurrence, occurrence_dict[core_pair][gene_list[0]], occurrence_dict[core_pair][gene_list[1]], - occurrence_dict[core_pair]['co_occurrence'], # TODO - Add neighbour ratio? + occurrence_dict[core_pair]['co_occurrence'], min_values[0], max_values[0], round(mean_values[0], 1), round(median_values[0], 1), min_values[1], max_values[1], round(mean_values[1], 1), round(median_values[1], 1) - ] # TODO - at the moment all sequence breaks are reported as zero and gives a co-occurrence equal to zero - is this acceptable? 
+ ] return summary_dict diff --git a/unit_tests/Corekaburra_test.py b/unit_tests/Corekaburra_test.py index d380834..eb860f7 100755 --- a/unit_tests/Corekaburra_test.py +++ b/unit_tests/Corekaburra_test.py @@ -1098,9 +1098,6 @@ def test_gene_not_found(self): with self.assertRaises(SystemExit): correct_gffs.annotate_refound_genes(gff_name, gene_data_dict, tmp_folder_path, corrected_gff_out_dir, self.logger) - # TODO - Add test for annotating of second contig - - class TestExtractGenomeFasta(unittest.TestCase): def test_extract_genome_fasta(self): genome_fasta_dict_expected = {'contig} @@ -2999,8 +2996,6 @@ def test_single_fragmented_gene_on_either_side_of_core_gene(self): self.assertEqual(master_info, return_master_info) self.assertEqual(coreless_contigs, return_coreless_contigs) - def test_something(self): # TODO - What other wired and wonderfull examples can we come up with? - pass class TestMergingDicts(unittest.TestCase): @@ -3310,6 +3305,27 @@ def test_count_gene_co_occurrence_no_occurence(self): self.assertEqual(b_occurrence, individual_occurrences["B"]) +# class TestMultiProcessing(unittest.TestCase): +# @classmethod +# def setUpClass(cls): +# cls.logger = logging.getLogger('test_logger.log') +# cls.logger.setLevel(logging.INFO) +# +# def test_ttt(self): +# core_neighbour_pairs = {'pan_cluster_1--pan_cluster_2': 10, +# 'pan_cluster_2--pan_cluster_3': 10, +# 'pan_cluster_4--pan_cluster_5': 10, +# 'pan_cluster_5--pan_cluster_6': 10, +# 'pan_cluster_1--Sequence_break': 10, +# 'pan_cluster_3--Sequence_break': 10, +# 'pan_cluster_4--Sequence_break': 10, +# 'pan_cluster_6--Sequence_break': 10} +# +# combined_acc_gene_count = {'pan_cluster_1--pan_cluster_6': 0, 'pan_cluster_5--pan_cluster_6': 1, +# 'pan_cluster_2--pan_cluster_3': 0, 'pan_cluster_3--pan_cluster_4': 0} +# +# consesus_core_genome.determine_genome_segments(core_neighbour_pairs, combined_acc_gene_count, 10, {}, 1, self.logger) + class TestSegmentationIdentification(unittest.TestCase): """ Test the 
function that identifies core gene segments from a pan-genome. @@ -3633,8 +3649,6 @@ def test_multiple_component_core_graph(self): # Test of all returned segments look as expected self.assertTrue(all(comparisons)) - # TODO - Chat to Andrew about this function how it works and how we can test it more - possibly just run some things to see if it breaks - class TestNoAccessorySegmentIdentifcation(unittest.TestCase): """ From 87415e1dd08b2bb9603432b0cc3e74bdc24c13a8 Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Mon, 30 May 2022 16:14:06 +1000 Subject: [PATCH 099/135] Correct unit test script --- unit_tests/Corekaburra_test.py | 58 +--------------------------------- 1 file changed, 1 insertion(+), 57 deletions(-) diff --git a/unit_tests/Corekaburra_test.py b/unit_tests/Corekaburra_test.py index 8e483a0..b22d486 100755 --- a/unit_tests/Corekaburra_test.py +++ b/unit_tests/Corekaburra_test.py @@ -30,35 +30,6 @@ except FileNotFoundError: os.chdir('unit_test_data/') -<<<<<<< HEAD -======= - -class TestExitWithError(unittest.TestCase): - """ Test for the function carrying out a nice exit """ - @classmethod - def setUpClass(cls): - cls.logger = logging.getLogger('test_logger.log') - cls.logger.setLevel(logging.INFO) - - def test_exit_w_tmp_folder_deletion(self): - ''' Test the exit function is able to remove the temporary folder ''' - - # copy the placeholder tmp folder to replace it afterwards - tmp_folder = 'TestExitWithError/tmp_folder' - tmp_folder_copy = 'TestExitWithError/tmp_folder_copy' - os.mkdir(tmp_folder_copy) - - tmp_files = os.listdir(tmp_folder) - for file in tmp_files: - copyfile(os.path.join(tmp_folder, file), os.path.join(tmp_folder_copy, file)) - - with self.assertRaises(SystemExit): - exit_with_error.exit_with_error(exit_status=2, message='test msg', logger=self.logger, tmp_folder=tmp_folder) - - os.rename(tmp_folder_copy, tmp_folder) - - ->>>>>>> c01a8a7ac66be3a86c67c9ac6f000ffb8f69e52b class 
TestCutOffViolations(unittest.TestCase): """ Test for the function that examines the cutoffs given for core and low-frequency genes""" @classmethod @@ -1127,12 +1098,6 @@ def test_gene_not_found(self): with self.assertRaises(SystemExit): correct_gffs.annotate_refound_genes(gff_name, gene_data_dict, tmp_folder_path, corrected_gff_out_dir, self.logger) -<<<<<<< HEAD -======= - # TODO - Add test for annotating of second contig - - ->>>>>>> c01a8a7ac66be3a86c67c9ac6f000ffb8f69e52b class TestExtractGenomeFasta(unittest.TestCase): def test_extract_genome_fasta(self): genome_fasta_dict_expected = {'contig} @@ -1161,7 +1126,6 @@ def test_gff_generator_generation_not_corrected(self): ['contig_1', '.', 'CDS', '700', '790', '.', '.', '.', 'Silas_the_Salmonella_tag-1.7'], ['contig_1', '.', 'CDS', '800', '890', '.', '.', '.', "Silas_the_Salmonella_tag-1-5.2"]] -<<<<<<< HEAD return_generator = [] for line in gff_parser.parse_gff(input_gff_file): return_generator += [line] @@ -1185,9 +1149,6 @@ def test_gff_generator_generation_gzipped_input(self): return_generator = [] for line in gff_parser.parse_gff(input_gff_file): return_generator += [line] -======= - return_generator = gff_parser.parse_gff(input_gff_file) ->>>>>>> c01a8a7ac66be3a86c67c9ac6f000ffb8f69e52b for expected, generated in zip(expected_output, return_generator): self.assertEqual(expected, generated) @@ -1206,13 +1167,9 @@ def test_gff_generator_generation_corrected_gff(self): ['contig_1', '.', 'CDS', '800', '890', '.', '.', '.', "Silas_the_Salmonella_tag-1-5.2"], ['contig_1', 'Panaroo', 'CDS', '900', '1000', '.', '+', '0', 'refound_gene_1']] -<<<<<<< HEAD return_generator = [] for line in gff_parser.parse_gff(input_gff_file): return_generator += [line] -======= - return_generator = gff_parser.parse_gff(input_gff_file) ->>>>>>> c01a8a7ac66be3a86c67c9ac6f000ffb8f69e52b for expected, generated in zip(expected_output, return_generator): self.assertEqual(expected, generated) @@ -3039,12 +2996,6 @@ def 
test_single_fragmented_gene_on_either_side_of_core_gene(self): self.assertEqual(master_info, return_master_info) self.assertEqual(coreless_contigs, return_coreless_contigs) -<<<<<<< HEAD -======= - def test_something(self): # TODO - What other wired and wonderfull examples can we come up with? - pass ->>>>>>> c01a8a7ac66be3a86c67c9ac6f000ffb8f69e52b - class TestMergingDicts(unittest.TestCase): """ Functions to merge dictionaries and lists into dictionaries """ @@ -3353,7 +3304,6 @@ def test_count_gene_co_occurrence_no_occurence(self): self.assertEqual(b_occurrence, individual_occurrences["B"]) -<<<<<<< HEAD # class TestMultiProcessing(unittest.TestCase): # @classmethod # def setUpClass(cls): @@ -3375,8 +3325,7 @@ def test_count_gene_co_occurrence_no_occurence(self): # # consesus_core_genome.determine_genome_segments(core_neighbour_pairs, combined_acc_gene_count, 10, {}, 1, self.logger) -======= ->>>>>>> c01a8a7ac66be3a86c67c9ac6f000ffb8f69e52b + class TestSegmentationIdentification(unittest.TestCase): """ Test the function that identifies core gene segments from a pan-genome. 
@@ -3700,11 +3649,6 @@ def test_multiple_component_core_graph(self): # Test of all returned segments look as expected self.assertTrue(all(comparisons)) -<<<<<<< HEAD -======= - # TODO - Chat to Andrew about this function how it works and how we can test it more - possibly just run some things to see if it breaks - ->>>>>>> c01a8a7ac66be3a86c67c9ac6f000ffb8f69e52b class TestNoAccessorySegmentIdentifcation(unittest.TestCase): """ From 6e7fbd4b1efa9359f66f1fa7c55fab87081f9c7d Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Mon, 30 May 2022 16:24:13 +1000 Subject: [PATCH 100/135] Remove merge conflicts --- Corekaburra/summary_table.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/Corekaburra/summary_table.py b/Corekaburra/summary_table.py index 3b94d0e..3e4ca4e 100755 --- a/Corekaburra/summary_table.py +++ b/Corekaburra/summary_table.py @@ -58,20 +58,12 @@ def calculate_n_create_summaries(master_info, core_gene_dict): pair_occurrence, occurrence_dict[core_pair][gene_list[0]], occurrence_dict[core_pair][gene_list[1]], -<<<<<<< HEAD occurrence_dict[core_pair]['co_occurrence'], -======= - occurrence_dict[core_pair]['co_occurrence'], # TODO - Add neighbour ratio? ->>>>>>> c01a8a7ac66be3a86c67c9ac6f000ffb8f69e52b min_values[0], max_values[0], round(mean_values[0], 1), round(median_values[0], 1), min_values[1], max_values[1], round(mean_values[1], 1), round(median_values[1], 1) -<<<<<<< HEAD ] -======= - ] # TODO - at the moment all sequence breaks are reported as zero and gives a co-occurrence equal to zero - is this acceptable? 
->>>>>>> c01a8a7ac66be3a86c67c9ac6f000ffb8f69e52b return summary_dict From d2d891cd529b88e0517dc5c4bd9596dc1e22c978 Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Mon, 30 May 2022 16:33:40 +1000 Subject: [PATCH 101/135] Add in tmp_test_folder --- unit_tests/Corekaburra_test.py | 25 +++++++++++-------- .../test_tmp_folder/empty_file.txt | 0 2 files changed, 15 insertions(+), 10 deletions(-) create mode 100644 unit_tests/unit_test_data/test_tmp_folder/empty_file.txt diff --git a/unit_tests/Corekaburra_test.py b/unit_tests/Corekaburra_test.py index b22d486..299f472 100755 --- a/unit_tests/Corekaburra_test.py +++ b/unit_tests/Corekaburra_test.py @@ -3330,6 +3330,11 @@ class TestSegmentationIdentification(unittest.TestCase): """ Test the function that identifies core gene segments from a pan-genome. """ + @classmethod + def setUpClass(cls): + cls.logger = logging.getLogger('test_logger.log') + cls.logger.setLevel(logging.INFO) + def test_double_edge_segment_identification_all_2_degree_input(self): core_neighbour_pairs = {'pan_cluster_1--pan_cluster_2': 10, 'pan_cluster_2--pan_cluster_3': 10, @@ -3341,7 +3346,7 @@ def test_double_edge_segment_identification_all_2_degree_input(self): core_graph = consesus_core_genome.construct_core_graph(core_neighbour_pairs) num_components = number_connected_components(core_graph) - return_1 = consesus_core_genome.identify_segments(core_graph, 10, {}, num_components) + return_1 = consesus_core_genome.identify_segments(core_graph, 10, {}, num_components, self.logger) self.assertEqual(None, return_1) @@ -3371,7 +3376,7 @@ def test_double_edge_segment_identification_two_segments(self): core_graph = consesus_core_genome.construct_core_graph(core_neighbour_pairs) num_components = number_connected_components(core_graph) - double_edge_segements = consesus_core_genome.identify_segments(core_graph, 10, core_gene_dict, num_components) + double_edge_segements = 
consesus_core_genome.identify_segments(core_graph, 10, core_gene_dict, num_components, self.logger) self.assertEqual(expected_segments, double_edge_segements) @@ -3397,7 +3402,7 @@ def test_double_edge_segment_identification_four_segments(self): core_graph = consesus_core_genome.construct_core_graph(core_neighbour_pairs) num_components = number_connected_components(core_graph) - double_edge_segements = consesus_core_genome.identify_segments(core_graph, 10, {}, num_components) + double_edge_segements = consesus_core_genome.identify_segments(core_graph, 10, {}, num_components, self.logger) self.assertEqual(expected_segments, double_edge_segements) @@ -3425,7 +3430,7 @@ def test_double_edge_segment_identification_segments_node_w_four_degrees(self): core_graph = consesus_core_genome.construct_core_graph(core_neighbour_pairs) num_components = number_connected_components(core_graph) - double_edge_segements = consesus_core_genome.identify_segments(core_graph, 10, {}, num_components) + double_edge_segements = consesus_core_genome.identify_segments(core_graph, 10, {}, num_components, self.logger) self.assertEqual(expected_segments, double_edge_segements) @@ -3452,7 +3457,7 @@ def test_double_edge_segment_identification_segments_node_w_challenging_paths(se core_graph = consesus_core_genome.construct_core_graph(core_neighbour_pairs) num_components = number_connected_components(core_graph) - double_edge_segements = consesus_core_genome.identify_segments(core_graph, 5, core_gene_dict, num_components) + double_edge_segements = consesus_core_genome.identify_segments(core_graph, 5, core_gene_dict, num_components, self.logger) self.assertEqual(expected_segments, double_edge_segements) @@ -3483,7 +3488,7 @@ def test_double_edge_segment_identification_segments_node_w_challenging_paths_2( core_graph = consesus_core_genome.construct_core_graph(core_neighbour_pairs) num_components = number_connected_components(core_graph) - double_edge_segements = 
consesus_core_genome.identify_segments(core_graph, 8, core_gene_dict, num_components) + double_edge_segements = consesus_core_genome.identify_segments(core_graph, 8, core_gene_dict, num_components, self.logger) self.assertEqual(expected_segments, double_edge_segements) @@ -3516,7 +3521,7 @@ def test_double_edge_segment_identification_segments_node_w_all_challenging_path core_graph = consesus_core_genome.construct_core_graph(core_neighbour_pairs) num_components = number_connected_components(core_graph) - double_edge_segements = consesus_core_genome.identify_segments(core_graph, 5, core_gene_dict, num_components) + double_edge_segements = consesus_core_genome.identify_segments(core_graph, 5, core_gene_dict, num_components, self.logger) self.assertEqual(expected_segments, double_edge_segements) @@ -3542,7 +3547,7 @@ def test_double_edge_segment_identification_segments_node_w_less_than_all_presen core_graph = consesus_core_genome.construct_core_graph(core_neighbour_pairs) num_components = number_connected_components(core_graph) - double_edge_segements = consesus_core_genome.identify_segments(core_graph, 10, {}, num_components) + double_edge_segements = consesus_core_genome.identify_segments(core_graph, 10, {}, num_components, self.logger) self.assertEqual(expected_segments, double_edge_segements) @@ -3571,7 +3576,7 @@ def test_double_edge_segment_identification_segments_node_w_two_gene_segment(sel core_graph = consesus_core_genome.construct_core_graph(core_neighbour_pairs) num_components = number_connected_components(core_graph) - double_edge_segements = consesus_core_genome.identify_segments(core_graph, 3, core_gene_dict, num_components) + double_edge_segements = consesus_core_genome.identify_segments(core_graph, 3, core_gene_dict, num_components, self.logger) self.assertEqual(expected_segments, double_edge_segements) @@ -3624,7 +3629,7 @@ def test_multiple_component_core_graph(self): component_graph = core_graph.subgraph(component).copy() double_edge_segements = 
double_edge_segements | consesus_core_genome.identify_segments(component_graph, 2, core_gene_dict, - num_components) + num_components, self.logger) # comparisons = [True for x in double_edge_segements # if diff --git a/unit_tests/unit_test_data/test_tmp_folder/empty_file.txt b/unit_tests/unit_test_data/test_tmp_folder/empty_file.txt new file mode 100644 index 0000000..e69de29 From d770dd8050c7916621edc71ce1ef7ece438f8324 Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Mon, 30 May 2022 16:36:35 +1000 Subject: [PATCH 102/135] Remove test around reannotation --- unit_tests/Corekaburra_test.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/unit_tests/Corekaburra_test.py b/unit_tests/Corekaburra_test.py index 299f472..1948405 100755 --- a/unit_tests/Corekaburra_test.py +++ b/unit_tests/Corekaburra_test.py @@ -878,17 +878,17 @@ def test_some_files_annotated(self): self.assertEqual(expected_gffs, corrected_files_return) - def test_all_files_annotated(self): - input_gffs = ['Mock_1.gff', 'Mock_2.gff'] - gene_data_dict_return, corrected_gff_out_dir_return, corrected_files_return = correct_gffs.prepair_for_reannotation( - 'TestPrepairForReannotation/Mock_gene_data.csv', - 'TestPrepairForReannotation/All_genomes', - input_gffs, self.logger) - - expected_gffs = ['TestPrepairForReannotation/All_genomes/Corrected_gff_files/Mock_1_corrected.gff', - 'TestPrepairForReannotation/All_genomes/Corrected_gff_files/Mock_2_corrected.gff'] - - self.assertEqual(expected_gffs, corrected_files_return) + # def test_all_files_annotated(self): + # input_gffs = ['Mock_1.gff', 'Mock_2.gff'] + # gene_data_dict_return, corrected_gff_out_dir_return, corrected_files_return = correct_gffs.prepair_for_reannotation( + # 'TestPrepairForReannotation/Mock_gene_data.csv', + # 'TestPrepairForReannotation/All_genomes', + # input_gffs, self.logger) + # + # expected_gffs = 
['TestPrepairForReannotation/All_genomes/Corrected_gff_files/Mock_1_corrected.gff', + # 'TestPrepairForReannotation/All_genomes/Corrected_gff_files/Mock_2_corrected.gff'] + # + # self.assertEqual(expected_gffs, corrected_files_return) class TestAddGeneToGff(unittest.TestCase): From c9974193bec4362c7679a563b70a569ff130f16f Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Mon, 30 May 2022 16:42:19 +1000 Subject: [PATCH 103/135] Add in pylint mute on summary table dict being changed in loop --- Corekaburra/summary_table.py | 1 + 1 file changed, 1 insertion(+) diff --git a/Corekaburra/summary_table.py b/Corekaburra/summary_table.py index 3e4ca4e..440d975 100755 --- a/Corekaburra/summary_table.py +++ b/Corekaburra/summary_table.py @@ -1,4 +1,5 @@ import numpy as np +# pylint: disable=E4702 try: from Corekaburra.consesus_core_genome import count_gene_co_occurrence From 9d26dc5d4f65dc11d8262f0968420b32df363e3f Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Mon, 30 May 2022 16:51:41 +1000 Subject: [PATCH 104/135] Remove the clean up after reannotation --- Corekaburra/parse_gene_presence_absence.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Corekaburra/parse_gene_presence_absence.py b/Corekaburra/parse_gene_presence_absence.py index 0b6c00f..927621d 100755 --- a/Corekaburra/parse_gene_presence_absence.py +++ b/Corekaburra/parse_gene_presence_absence.py @@ -266,9 +266,9 @@ def read_gene_presence_absence(pres_abs_file, core_gene_presence, low_freq_gene, f"{acc_gene_number} intermediate accessory gene clusters were identified\n") # Remove gff databases - files_in_tmp = os.listdir(tmp_folder_path) - gff_dbs = [file for file in files_in_tmp if '_db' in file] - [os.remove(os.path.join(tmp_folder_path, db)) for db in gff_dbs] + # files_in_tmp = os.listdir(tmp_folder_path) + # gff_dbs = [file for file in files_in_tmp if '_db' in file] + # 
[os.remove(os.path.join(tmp_folder_path, db)) for db in gff_dbs] return core_gene_dict, low_freq_gene_dict, acc_gene_dict From 9a8371c089030fc33b1a4ee91a4312db26cba10b Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Mon, 30 May 2022 16:59:31 +1000 Subject: [PATCH 105/135] Use the name of the temp_dir instead of the object when passed to functions --- Corekaburra/__main__.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/Corekaburra/__main__.py b/Corekaburra/__main__.py index 55d4114..df47c65 100755 --- a/Corekaburra/__main__.py +++ b/Corekaburra/__main__.py @@ -193,7 +193,7 @@ def main(): # TODO - Add in so that the user can give a list of genes that they wish to use as 'core genes' core_dict, low_freq_dict, acc_gene_dict = read_gene_presence_absence(input_pres_abs_file_path, args.core_cutoff, args.low_cutoff, source_program, - args.input_gffs, tmp_folder_path, + args.input_gffs, tmp_folder_path.name, gene_data_dict, corrected_dir, logger) @@ -219,9 +219,8 @@ def main(): with concurrent.futures.ProcessPoolExecutor(max_workers=args.cpu) as executor: logger.info(f"------Start core region identification of given gff files-----\n") logger.info(f'{len(args.input_gffs)} GFF files to process') - results = [executor.submit(segment_genome_content, gff, core_dict, low_freq_dict, acc_gene_dict, comp_genomes, - source_program, args.annotate, gene_data_dict, corrected_dir, tmp_folder_path, args.discard_gffs, logger) + source_program, args.annotate, gene_data_dict, corrected_dir, tmp_folder_path.name, args.discard_gffs, logger) for gff in args.input_gffs] for output in concurrent.futures.as_completed(results): From 8c2b34cedd11ee8346e73fe24ad615aaebd90b8f Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Tue, 31 May 2022 17:11:51 +1000 Subject: [PATCH 106/135] Add in search for small segments between multi connected core genes where the segment is missing in few genomes --- 
Corekaburra/consesus_core_genome.py | 137 +++++++++++++++------------ functional_tests/Corekaburra-test.sh | 39 ++++++-- unit_tests/Corekaburra_test.py | 118 ++++++++++++----------- 3 files changed, 169 insertions(+), 125 deletions(-) diff --git a/Corekaburra/consesus_core_genome.py b/Corekaburra/consesus_core_genome.py index 9d680be..c8d9ec7 100755 --- a/Corekaburra/consesus_core_genome.py +++ b/Corekaburra/consesus_core_genome.py @@ -117,6 +117,56 @@ def identify_no_accessory_segments(double_edge_segements, combined_acc_gene_coun return sub_segment_dict +def search_for_path(core_graph_copy, source_node, target_node, multi_edge_nodes): + # Counter to stop loop + counter = 0 + # Identifier to see if path has been found, to stop loop + path_identified = False + while not path_identified: + counter += 1 + + # Get all shortest path between source and target. + all_shortest_paths = nx.all_shortest_paths(core_graph_copy, source_node, target_node) + + # Go through each path to see if is satisfies the criteria + try: + for index, path in enumerate(all_shortest_paths): + # Get length of path + segment_length = len(path) + + # Get length of segment with multi nodes removed + two_degree_segment_length = len([node for node in path if node not in multi_edge_nodes]) + + # Check that the path does not contain nodes with >2 degrees outside of source and target, + # if then add path, + # else then find nodes that has >2 edges and remove an edge that leads to the node, to break the path for next run through loop + if segment_length - 2 == two_degree_segment_length and two_degree_segment_length != 0: + # double_edge_segements[suspected_pair] = path + return path + + else: + # Check if path is length >2, + # if then find >2 degree nodes and remove an edge to them, + # else just remove edge found between nodes. 
+ if len(path) > 2: + multi_node_in_path = [[path[index], path[index + 1]] for index, node in enumerate(path) if + node in multi_edge_nodes and node != source_node and node != target_node] + for multi_node_pair in multi_node_in_path: + # Try to remove edge found to multi node, if already removed move on. + try: + core_graph_copy.remove_edge(*multi_node_pair) + except nx.exception.NetworkXError: + continue + else: + core_graph_copy.remove_edge(*path) + + if counter == 1000: + raise IndexError( + f"Counter reached limit in detecting a new path for pair, with name {target_node = } and {source_node = }") + except nx.NetworkXNoPath: + # No simple paths could be found for the source and target thus the while loop is terminated. + return + def identify_segments(core_graph, num_gffs, core_gene_dict, num_core_graph_components, logger): """ @@ -129,26 +179,21 @@ def identify_segments(core_graph, num_gffs, core_gene_dict, num_core_graph_compo """ # TODO - Describe missing parameters in docstring - # TODO - Handle multiple chromosomes - # Identify all nodes that contain more than two degrees. + # Identify all nodes that contain more than two degrees and only one degree. multi_edge_nodes = [node for node, connections in core_graph.degree if connections > 2] - # Check if multiple components in core graph, if then find single edge core_genes - if num_core_graph_components > 1: - singe_edge_nodes = [node for node, connections in core_graph.degree if connections == 1] - else: - singe_edge_nodes = [] + single_edge_nodes = [node for node, connections in core_graph.degree if connections == 1] # Check if any node have multiple edges, if not then return. 
- if len(multi_edge_nodes+singe_edge_nodes) == 0: + if len(multi_edge_nodes+single_edge_nodes) == 0: return None # Dict to hold connections between >2 edge nodes connect_dict = {} # for all nodes with >2 degrees themself, identify neighbouring nodes with >2 degrees - for node in multi_edge_nodes+singe_edge_nodes: + for node in multi_edge_nodes+single_edge_nodes: connect_dict[node] = [neighbor for neighbor in core_graph.neighbors(node) - if neighbor in multi_edge_nodes or neighbor in singe_edge_nodes] + if neighbor in multi_edge_nodes or neighbor in single_edge_nodes] # Turn the weight into a 'distance' or number of times not found together. for edge in core_graph.edges(data=True): @@ -160,18 +205,17 @@ def identify_segments(core_graph, num_gffs, core_gene_dict, num_core_graph_compo # Go through all source and taget nodes, # see if a path can be found where all nodes between them have only two degrees - for source_node in multi_edge_nodes+singe_edge_nodes: - for target_node in multi_edge_nodes+singe_edge_nodes: + for source_node in multi_edge_nodes+single_edge_nodes: + for target_node in multi_edge_nodes+single_edge_nodes: if target_node != source_node: # Get path (segment) from source to target segment = nx.shortest_path(core_graph, source_node, target_node, weight='weight', method='dijkstra') - # Get length of path segment_length = len(segment) # Get length of segment with multi nodes removed - two_degree_segment_length = len([node for node in segment if node not in multi_edge_nodes+singe_edge_nodes]) + two_degree_segment_length = len([node for node in segment if node not in multi_edge_nodes+single_edge_nodes]) # Check if no node between the source and target has more than two edges, # if then move to record the segment/path @@ -232,8 +276,9 @@ def identify_segments(core_graph, num_gffs, core_gene_dict, num_core_graph_compo # Go through nodes that are missing at least one path and try to identify missing paths for source_node in nodes_missing_connections: for 
target_node in nodes_missing_connections: - # Check that the current target node is not a neighbouring node or the current node itself - if target_node not in connect_dict[source_node] and target_node != source_node: + + # Check that the source and target are not the same node + if target_node != source_node: # Copy the graph to manipulate it core_graph_copy = core_graph.copy() @@ -241,54 +286,20 @@ def identify_segments(core_graph, num_gffs, core_gene_dict, num_core_graph_compo suspected_pair = sorted([source_node, target_node]) suspected_pair = f'{suspected_pair[0]}--{suspected_pair[1]}' - # Check that the pair has not been found in a previous run - if suspected_pair not in double_edge_segements: - # Counter to stop loop - counter = 0 - # Identifier to see if path has been found, to stop loop - path_identified = False - while not path_identified: - counter += 1 - - # Get all shortest path between source and target. - all_shortest_paths = nx.all_shortest_paths(core_graph_copy, source_node, target_node) + # Check that the current target node is not a neighbouring node + if target_node not in connect_dict[source_node]: + # Search for path + return_path = search_for_path(core_graph_copy, source_node, target_node, multi_edge_nodes) - # Go through each path to see if is satisfies the criteria - try: - for index, path in enumerate(all_shortest_paths): - # Get length of path - segment_length = len(path) - - # Get length of segment with multi nodes removed - two_degree_segment_length = len([node for node in path if node not in multi_edge_nodes]) - - # Check that the path does not contain nodes with >2 degrees outside of source and target, - # if then add path, - # else then find nodes that has >2 edges and remove an edge that leads to the node, to break the path for next run through loop - if segment_length - 2 == two_degree_segment_length and two_degree_segment_length != 0: - double_edge_segements[suspected_pair] = path - path_identified = True - continue - else: - # 
Check if path is length >2, - # if then find >2 degree nodes and remove an edge to them, - # else just remove edge found between nodes. - if len(path) > 2: - multi_node_in_path = [[path[index], path[index+1]] for index, node in enumerate(path) if node in multi_edge_nodes and node != source_node and node != target_node] - for multi_node_pair in multi_node_in_path: - # Try to remove edge found to multi node, if already removed move on. - try: - core_graph_copy.remove_edge(*multi_node_pair) - except nx.exception.NetworkXError: - continue - else: - core_graph_copy.remove_edge(*path) - - if counter == 1000: - raise IndexError("Counter reached limit! in detecting a new path for pair.") - except nx.NetworkXNoPath: - # No simple paths could be found for the source and target thus the while loop is terminated. - path_identified = True + else: + # Remove the link between the two core genes that are neighbours + core_graph_copy.remove_edge(*suspected_pair.split('--')) + # Search for path + return_path = search_for_path(core_graph_copy, source_node, target_node, multi_edge_nodes) + + # Check if proper path is returned and insert it + if return_path is not None: + double_edge_segements[suspected_pair] = return_path return double_edge_segements diff --git a/functional_tests/Corekaburra-test.sh b/functional_tests/Corekaburra-test.sh index ce81743..47b4f72 100755 --- a/functional_tests/Corekaburra-test.sh +++ b/functional_tests/Corekaburra-test.sh @@ -247,7 +247,39 @@ test_output_file test_out_folder/core_segments.csv Rearrangement_run_expected/co test_output_file test_out_folder/no_accessory_core_segments.csv Rearrangement_run_expected/no_accessory_core_segments.csv.expected rm -r test_out_folder -# TODO - Test that segmnets can be identified with a core-cutoff that is less than all genomes. +# TODO - Test that segmnets can be identified with a core-cutoff that is less than all genomes. 
- not done paper graph 2 - implemented remove if successfull +Corekaburra -ip Less_than_all_core_simple/ -ig complete_genome_double_chrom_extra_large.gff complete_genome_double_chrom_larger.gff complete_genome_double_chrom_3_extra_large.gff -o test_out_folder/ -cc 0.9 > /dev/null 2>&1 +test_output_file test_out_folder/core_core_accessory_gene_content.tsv Less_than_all_gffs_simple_expected/core_core_accessory_gene_content.tsv.expected +test_output_file test_out_folder/low_frequency_gene_placement.tsv Less_than_all_gffs_simple_expected/low_frequency_gene_placement.tsv.expected +test_output_file test_out_folder/core_pair_summary.csv Less_than_all_gffs_simple_expected/core_pair_summary.csv.expected +test_output_file test_out_folder/core_segments.csv Less_than_all_gffs_simple_expected/core_segments.csv.expected +test_output_file test_out_folder/no_accessory_core_segments.csv Less_than_all_gffs_simple_expected/no_accessory_core_segments.csv.expected +rm -r test_out_folder + + + +# TODO - Test that segments are produced for instances where a core gene may be left out. 
- not done paper graph 1 +Corekaburra -ip Less_than_all_core_complex/ -ig complete_genome_double_chrom_extra_large.gff complete_genome_double_chrom_2_larger.gff complete_genome_double_chrom_3_extra_large.gff -o test_out_folder/ -cc 0.9 > /dev/null 2>&1 +test_output_file test_out_folder/core_core_accessory_gene_content.tsv Less_than_all_gffs_complex_expected/core_core_accessory_gene_content.tsv.expected +test_output_file test_out_folder/low_frequency_gene_placement.tsv Less_than_all_gffs_complex_expected/low_frequency_gene_placement.tsv.expected +test_output_file test_out_folder/core_pair_summary.csv Less_than_all_gffs_complex_expected/core_pair_summary.csv.expected +test_output_file test_out_folder/core_segments.csv Less_than_all_gffs_complex_expected/core_segments.csv.expected +test_output_file test_out_folder/no_accessory_core_segments.csv Less_than_all_gffs_complex_expected/no_accessory_core_segments.csv.expected +rm -r test_out_folder + +# TODO - Do functional test with GFF where reannotated in panaroo is called 'candidate_gene' instead of CDS. + +# TODO - test gzipped input gffs + + +# TODO - Test using two complete chromosomes (complete genome) when >1 contig test. 
- implemented remove if successful +Corekaburra -ip Multiple_component_graph/ -ig complete_genome_double_chrom_larger.gff complete_genome_double_chrom_2_larger.gff complete_genome_double_chrom_3_larger.gff -o test_out_folder/ -cc 0.9 -cg complete_larger_double_chr_genome_list.txt > /dev/null 2>&1 +test_output_file test_out_folder/core_core_accessory_gene_content.tsv Multi_complete_chromosome_expected/core_core_accessory_gene_content.tsv.expected +test_output_file test_out_folder/low_frequency_gene_placement.tsv Multi_complete_chromosome_expected/low_frequency_gene_placement.tsv.expected +test_output_file test_out_folder/core_pair_summary.csv Multi_complete_chromosome_expected/core_pair_summary.csv.expected +test_output_file test_out_folder/core_segments.csv Multi_complete_chromosome_expected/core_segments.csv.expected +test_output_file test_out_folder/no_accessory_core_segments.csv Multi_complete_chromosome_expected/no_accessory_core_segments.csv.expected +rm -r test_out_folder call_new_test "Test when core graph forms multiple components - not forming a single 'chromosome' - non circular input gffs" Corekaburra -ip Multiple_component_graph/ -ig complete_genome_double_chrom_larger.gff complete_genome_double_chrom_2_larger.gff complete_genome_double_chrom_3_larger.gff -o test_out_folder/ > /dev/null 2>&1 @@ -332,7 +364,6 @@ test_output_file test_out_folder/low_frequency_gene_placement.tsv Fragmented_cor test_output_file test_out_folder/core_pair_summary.csv Fragmented_core_run_expected/core_pair_summary.csv.expected rm -r test_out_folder -# TODO Test a fragmented core gene not accepted as core call_new_test "Test a fragmented core gene not accepted as core" Corekaburra -ig complete_genome_single_chrom.gff genome_single_chrom_larger.gff genome_single_chrom_larger_rearrange.gff complete_genome_single_chrom_2.gff -ip Fragmented_core_gene_break_run/ -o test_out_folder/ > /dev/null 2>&1 test_output_file test_out_folder/core_core_accessory_gene_content.tsv 
Fragmented_core_gene_break_run_expected/core_core_accessory_gene_content.tsv.expected @@ -340,7 +371,6 @@ test_output_file test_out_folder/low_frequency_gene_placement.tsv Fragmented_cor test_output_file test_out_folder/core_pair_summary.csv Fragmented_core_gene_break_run_expected/core_pair_summary.csv.expected rm -r test_out_folder -# TODO Test with part of fragmented gene being a refound gene call_new_test "Test with part of fragmented gene being a refound gene" Corekaburra -ig complete_genome_single_chrom.gff genome_single_chrom_larger_refound.gff genome_single_chrom_larger_rearrange.gff complete_genome_single_chrom_2.gff -ip fragmented_refound_core_gene/ -o test_out_folder/ > /dev/null 2>&1 test_output_file test_out_folder/core_core_accessory_gene_content.tsv fragmented_refound_core_gene_expected/core_core_accessory_gene_content.tsv.expected @@ -373,7 +403,6 @@ test_output_file test_out_folder/core_pair_summary.csv Coreless_contig_complete_ test_output_file test_out_folder/coreless_contig_accessory_gene_content.tsv Coreless_contig_complete_expected/coreless_contig_accessory_gene_content.tsv.expected rm -r test_out_folder -# TODO - Test with a genome that have been corrected and one that have not - with fragmented refound gene (Resume run) call_new_test "Test with a genome that have been corrected and one that have not (Resume run)" Corekaburra -ig genome_single_chrom_larger_refound_2.gff genome_single_chrom_larger_refound.gff genome_single_chrom_larger_rearrange.gff complete_genome_single_chrom_2.gff -ip Resume_refound_run_fragment/ -o Resume_refound_run_fragment/ > /dev/null 2>&1 test_output_file Resume_refound_run_fragment/core_core_accessory_gene_content.tsv Resume_refound_run_fragment/core_core_accessory_gene_content.tsv.expected @@ -386,7 +415,6 @@ rm Resume_refound_run_fragment/core_core_accessory_gene_content.tsv rm Resume_refound_run_fragment/core_pair_summary.csv rm Resume_refound_run_fragment/Corekaburra.log -# TODO!! 
- Test with all genomes that have been corrected (Resume run) call_new_test "Test with all genomes that have been corrected (Resume run)" Corekaburra -ig genome_single_chrom_larger_refound_2.gff genome_single_chrom_larger_refound.gff genome_single_chrom_larger_rearrange.gff complete_genome_single_chrom_2.gff -ip Resume_all_found_gene_refound_run_fragment/ -o Resume_all_found_gene_refound_run_fragment/ > /dev/null 2>&1 test_output_file Resume_all_found_gene_refound_run_fragment/core_core_accessory_gene_content.tsv Resume_all_found_gene_refound_run_fragment/core_core_accessory_gene_content.tsv.expected @@ -397,7 +425,6 @@ rm Resume_all_found_gene_refound_run_fragment/core_core_accessory_gene_content.t rm Resume_all_found_gene_refound_run_fragment/core_pair_summary.csv rm Resume_all_found_gene_refound_run_fragment/Corekaburra.log -# TODO - Test recognition of corrected gff files in output folder (Resume run) call_new_test "Test recognition of corrected gff files in output folder (Resume run)" Corekaburra -ig genome_single_chrom_larger_refound_2.gff genome_single_chrom_larger_refound.gff genome_single_chrom_larger_rearrange.gff complete_genome_single_chrom_2.gff -ip Resume_refound_gene/ -o Resume_refound_gene/ > /dev/null 2>&1 test_output_file Resume_refound_gene/core_core_accessory_gene_content.tsv Resume_refound_gene/core_core_accessory_gene_content.tsv.expected diff --git a/unit_tests/Corekaburra_test.py b/unit_tests/Corekaburra_test.py index 1948405..52cf9ce 100755 --- a/unit_tests/Corekaburra_test.py +++ b/unit_tests/Corekaburra_test.py @@ -3304,28 +3304,6 @@ def test_count_gene_co_occurrence_no_occurence(self): self.assertEqual(b_occurrence, individual_occurrences["B"]) -# class TestMultiProcessing(unittest.TestCase): -# @classmethod -# def setUpClass(cls): -# cls.logger = logging.getLogger('test_logger.log') -# cls.logger.setLevel(logging.INFO) -# -# def test_ttt(self): -# core_neighbour_pairs = {'pan_cluster_1--pan_cluster_2': 10, -# 
'pan_cluster_2--pan_cluster_3': 10, -# 'pan_cluster_4--pan_cluster_5': 10, -# 'pan_cluster_5--pan_cluster_6': 10, -# 'pan_cluster_1--Sequence_break': 10, -# 'pan_cluster_3--Sequence_break': 10, -# 'pan_cluster_4--Sequence_break': 10, -# 'pan_cluster_6--Sequence_break': 10} -# -# combined_acc_gene_count = {'pan_cluster_1--pan_cluster_6': 0, 'pan_cluster_5--pan_cluster_6': 1, -# 'pan_cluster_2--pan_cluster_3': 0, 'pan_cluster_3--pan_cluster_4': 0} -# -# consesus_core_genome.determine_genome_segments(core_neighbour_pairs, combined_acc_gene_count, 10, {}, 1, self.logger) - - class TestSegmentationIdentification(unittest.TestCase): """ Test the function that identifies core gene segments from a pan-genome. @@ -3434,8 +3412,69 @@ def test_double_edge_segment_identification_segments_node_w_four_degrees(self): self.assertEqual(expected_segments, double_edge_segements) + def test_segments_w_segment_between_multi_connect_genes(self): + expected_segments = {'pan_cluster_A--pan_cluster_B': ['pan_cluster_B', 'pan_cluster_A'], + 'pan_cluster_C--pan_cluster_I': ['pan_cluster_C', 'pan_cluster_D', 'pan_cluster_H', 'pan_cluster_I'], + 'pan_cluster_B--pan_cluster_C': ['pan_cluster_C', 'pan_cluster_Q', 'pan_cluster_B']} + + core_neighbour_pairs = {'pan_cluster_A--pan_cluster_B': 3, + 'pan_cluster_B--pan_cluster_C': 1, + 'pan_cluster_B--pan_cluster_Q': 2, + 'pan_cluster_C--pan_cluster_Q': 2, + 'pan_cluster_C--pan_cluster_D': 3, + 'pan_cluster_D--pan_cluster_H': 3, + 'pan_cluster_H--pan_cluster_I': 3, + } + core_gene_dict = {'genome_1': {'tag_7': 'pan_cluster_Q', 'tag_6': 'pan_cluster_I', 'tag_5': 'pan_cluster_H', 'tag_4': 'pan_cluster_D', 'tag_3': 'pan_cluster_C', + 'tag_2': 'pan_cluster_B', 'tag_1': 'pan_cluster_A'}, + 'genome_2': {'tag_7': 'pan_cluster_Q', 'tag_6': 'pan_cluster_I', 'tag_5': 'pan_cluster_H', 'tag_4': 'pan_cluster_D', 'tag_3': 'pan_cluster_C', + 'tag_2': 'pan_cluster_B', 'tag_1': 'pan_cluster_A'}, + 'genome_3': {'tag_7': 'pan_cluster_Q', 'tag_6': 'pan_cluster_I', 
'tag_5': 'pan_cluster_H', 'tag_4': 'pan_cluster_D', 'tag_3': 'pan_cluster_C', + 'tag_2': 'pan_cluster_B', 'tag_1': 'pan_cluster_A'}} + + core_graph = consesus_core_genome.construct_core_graph(core_neighbour_pairs) + num_components = number_connected_components(core_graph) + + double_edge_segements = consesus_core_genome.identify_segments(core_graph, 8, core_gene_dict, num_components, + self.logger) + + self.assertEqual(expected_segments, double_edge_segements) + + def test_segments_w_large_segment_between_multi_connect_genes(self): + expected_segments = {'pan_cluster_A--pan_cluster_B': ['pan_cluster_B', 'pan_cluster_A'], + 'pan_cluster_H--pan_cluster_I': ['pan_cluster_H', 'pan_cluster_I'], + 'pan_cluster_C--pan_cluster_D': ['pan_cluster_D', 'pan_cluster_Q', 'pan_cluster_Z', 'pan_cluster_Y', 'pan_cluster_X', 'pan_cluster_C']} + + core_neighbour_pairs = {'pan_cluster_A--pan_cluster_B': 3, + 'pan_cluster_B--pan_cluster_C': 2, + 'pan_cluster_B--pan_cluster_D': 1, + 'pan_cluster_C--pan_cluster_D': 2, + 'pan_cluster_C--pan_cluster_H': 1, + 'pan_cluster_C--pan_cluster_X': 1, + 'pan_cluster_D--pan_cluster_H': 2, + 'pan_cluster_D--pan_cluster_Q': 1, + 'pan_cluster_H--pan_cluster_I': 3, + 'pan_cluster_X--pan_cluster_Y': 1, + 'pan_cluster_Y--pan_cluster_Z': 1, + 'pan_cluster_Z--pan_cluster_Q': 1 + } + core_gene_dict = {'genome_1': {'tag_10': 'pan_cluster_Q', 'tag_9': 'pan_cluster_Z', 'tag_8': 'pan_cluster_Y', 'tag_7': 'pan_cluster_X', 'tag_6': 'pan_cluster_I', 'tag_5': 'pan_cluster_H', 'tag_4': 'pan_cluster_D', 'tag_3': 'pan_cluster_C', + 'tag_2': 'pan_cluster_B', 'tag_1': 'pan_cluster_A'}, + 'genome_2': {'tag_6': 'pan_cluster_I', 'tag_5': 'pan_cluster_H', 'tag_4': 'pan_cluster_D', 'tag_3': 'pan_cluster_C', + 'tag_2': 'pan_cluster_B', 'tag_1': 'pan_cluster_A'}, + 'genome_3': {'tag_6': 'pan_cluster_I', 'tag_5': 'pan_cluster_H', 'tag_4': 'pan_cluster_D', 'tag_3': 'pan_cluster_C', + 'tag_2': 'pan_cluster_B', 'tag_1': 'pan_cluster_A'}} + + core_graph = 
consesus_core_genome.construct_core_graph(core_neighbour_pairs) + num_components = number_connected_components(core_graph) + + double_edge_segements = consesus_core_genome.identify_segments(core_graph, 8, core_gene_dict, num_components, + self.logger) + + self.assertEqual(expected_segments, double_edge_segements) + def test_double_edge_segment_identification_segments_node_w_challenging_paths(self): - expected_segments = {'pan_cluster_A--pan_cluster_B': ['pan_cluster_A', 'pan_cluster_E', 'pan_cluster_F', 'pan_cluster_G', 'pan_cluster_B'], + expected_segments = {'pan_cluster_A--pan_cluster_B': ['pan_cluster_B', 'pan_cluster_G', 'pan_cluster_F', 'pan_cluster_E', 'pan_cluster_A'], 'pan_cluster_B--pan_cluster_C': ['pan_cluster_C', 'pan_cluster_B']} core_neighbour_pairs = {'pan_cluster_A--pan_cluster_C': 4, @@ -3492,39 +3531,6 @@ def test_double_edge_segment_identification_segments_node_w_challenging_paths_2( self.assertEqual(expected_segments, double_edge_segements) - def test_double_edge_segment_identification_segments_node_w_all_challenging_paths(self): - expected_segments = {'pan_cluster_A--pan_cluster_D': ['pan_cluster_A', 'pan_cluster_G', 'pan_cluster_F', 'pan_cluster_E', 'pan_cluster_D'], - 'pan_cluster_B--pan_cluster_C': ['pan_cluster_B', 'pan_cluster_H', 'pan_cluster_I', 'pan_cluster_J', 'pan_cluster_C']}#,} - - core_neighbour_pairs = {'pan_cluster_A--pan_cluster_B': 4, - 'pan_cluster_A--pan_cluster_K': 4, - 'pan_cluster_A--pan_cluster_G': 2, - 'pan_cluster_B--pan_cluster_H': 2, - 'pan_cluster_B--pan_cluster_L': 4, - 'pan_cluster_C--pan_cluster_J': 2, - 'pan_cluster_C--pan_cluster_K': 4, - 'pan_cluster_D--pan_cluster_C': 4, - 'pan_cluster_D--pan_cluster_L': 4, - 'pan_cluster_D--pan_cluster_E': 2, - 'pan_cluster_E--pan_cluster_F': 2, - 'pan_cluster_F--pan_cluster_G': 2, - 'pan_cluster_H--pan_cluster_I': 2, - 'pan_cluster_I--pan_cluster_J': 2, - 'pan_cluster_K--pan_cluster_L': 1 - } - core_gene_dict = {'genome_1': {'tag_5': 'pan_cluster_K', 'tag_4': 
'pan_cluster_L', 'tag_3': 'pan_cluster_A', 'tag_2': 'pan_cluster_B', 'tag_1': 'pan_cluster_A', 'tag_6': 'pan_cluster_C', 'tag_7': 'pan_cluster_D'}, - 'genome_2': {'tag_5': 'pan_cluster_K', 'tag_4': 'pan_cluster_L', 'tag_3': 'pan_cluster_A', 'tag_2': 'pan_cluster_B', 'tag_1': 'pan_cluster_A', 'tag_6': 'pan_cluster_C', 'tag_7': 'pan_cluster_D'}, - 'genome_3': {'tag_5': 'pan_cluster_K', 'tag_4': 'pan_cluster_L', 'tag_3': 'pan_cluster_A', 'tag_2': 'pan_cluster_B', 'tag_1': 'pan_cluster_A', 'tag_6': 'pan_cluster_C', 'tag_7': 'pan_cluster_D'}, - 'genome_4': {'tag_5': 'pan_cluster_K', 'tag_4': 'pan_cluster_L', 'tag_3': 'pan_cluster_A', 'tag_2': 'pan_cluster_B', 'tag_1': 'pan_cluster_A', 'tag_6': 'pan_cluster_C', 'tag_7': 'pan_cluster_D'}, - 'genome_5': {'tag_5': 'pan_cluster_K', 'tag_4': 'pan_cluster_L', 'tag_3': 'pan_cluster_A', 'tag_2': 'pan_cluster_B', 'tag_1': 'pan_cluster_A', 'tag_6': 'pan_cluster_C', 'tag_7': 'pan_cluster_D'}} - - core_graph = consesus_core_genome.construct_core_graph(core_neighbour_pairs) - num_components = number_connected_components(core_graph) - - double_edge_segements = consesus_core_genome.identify_segments(core_graph, 5, core_gene_dict, num_components, self.logger) - - self.assertEqual(expected_segments, double_edge_segements) - def test_double_edge_segment_identification_segments_node_w_less_than_all_present(self): expected_segments = {'pan_cluster_B--pan_cluster_D': ['pan_cluster_B', 'pan_cluster_C', 'pan_cluster_D'], 'pan_cluster_F--pan_cluster_H': ['pan_cluster_H', 'pan_cluster_G', 'pan_cluster_F'], From feaf2a8b4f4f0341c602cbe8a17d3f618ac18275 Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Tue, 31 May 2022 17:14:31 +1000 Subject: [PATCH 107/135] Add in data for functional tests.... 
--- .../gene_presence_absence.csv | 16 ++++++ .../gene_presence_absence.csv | 16 ++++++ .../core_core_accessory_gene_content.expected | 1 + .../core_pair_summary.expected | 20 ++++++++ .../core_segments.expected | 20 ++++++++ .../low_frequency_gene_placement.expected | 50 +++++++++++++++++++ .../no_accessory_core_segments.expected | 20 ++++++++ .../core_core_accessory_gene_content.expected | 1 + .../core_pair_summary.expected | 18 +++++++ .../core_segments.expected | 14 ++++++ .../low_frequency_gene_placement.expected | 40 +++++++++++++++ .../no_accessory_core_segments.expected | 14 ++++++ ...lete_genome_double_chrom_3_extra_large.gff | 21 ++++++++ ...mplete_genome_double_chrom_extra_large.gff | 21 ++++++++ 14 files changed, 272 insertions(+) create mode 100755 functional_tests/test_data/Less_than_all_core_complex/gene_presence_absence.csv create mode 100755 functional_tests/test_data/Less_than_all_core_simple/gene_presence_absence.csv create mode 100644 functional_tests/test_data/Less_than_all_core_simple_expected/core_core_accessory_gene_content.expected create mode 100644 functional_tests/test_data/Less_than_all_core_simple_expected/core_pair_summary.expected create mode 100644 functional_tests/test_data/Less_than_all_core_simple_expected/core_segments.expected create mode 100644 functional_tests/test_data/Less_than_all_core_simple_expected/low_frequency_gene_placement.expected create mode 100644 functional_tests/test_data/Less_than_all_core_simple_expected/no_accessory_core_segments.expected create mode 100644 functional_tests/test_data/Multi_complete_chromosome_expected/core_core_accessory_gene_content.expected create mode 100644 functional_tests/test_data/Multi_complete_chromosome_expected/core_pair_summary.expected create mode 100644 functional_tests/test_data/Multi_complete_chromosome_expected/core_segments.expected create mode 100644 functional_tests/test_data/Multi_complete_chromosome_expected/low_frequency_gene_placement.expected create mode 100644 
functional_tests/test_data/Multi_complete_chromosome_expected/no_accessory_core_segments.expected create mode 100755 functional_tests/test_data/complete_genome_double_chrom_3_extra_large.gff create mode 100755 functional_tests/test_data/complete_genome_double_chrom_extra_large.gff diff --git a/functional_tests/test_data/Less_than_all_core_complex/gene_presence_absence.csv b/functional_tests/test_data/Less_than_all_core_complex/gene_presence_absence.csv new file mode 100755 index 0000000..180056b --- /dev/null +++ b/functional_tests/test_data/Less_than_all_core_complex/gene_presence_absence.csv @@ -0,0 +1,16 @@ +","","","","","","","","","","","","","","complete_genome_double_chrom_extra_large","complete_genome_double_chrom_2_larger","complete_genome_double_chrom_3_extra_large" +"A","","","3","3","1","","","","","","","","","dub_chrom_A","dub_chrom_2_A","dub_chrom_A" +"B","","","3","3","1","","","","","","","","","dub_chrom_B","dub_chrom_2_B","dub_chrom_B" +"C","","","3","3","1","","","","","","","","","dub_chrom_C","dub_chrom_2_C","dub_chrom_C" +"D","","","3","3","1","","","","","","","","","dub_chrom_D","dub_chrom_2_D","dub_chrom_D" +"E","","","3","3","1","","","","","","","","","dub_chrom_E","dub_chrom_2_E","dub_chrom_E" +"F","","","3","3","1","","","","","","","","","dub_chrom_F","dub_chrom_2_F","dub_chrom_F" +"G","","","3","3","1","","","","","","","","","dub_chrom_G","dub_chrom_2_G","dub_chrom_G" +"H","","","3","3","1","","","","","","","","","dub_chrom_H","dub_chrom_2_H","dub_chrom_H" +"I","","","3","3","1","","","","","","","","","dub_chrom_I","dub_chrom_2_I","dub_chrom_I" +"J","","","3","3","1","","","","","","","","","dub_chrom_J","dub_chrom_2_J","dub_chrom_J" +"K","","","3","3","1","","","","","","","","","dub_chrom_K","dub_chrom_2_K","dub_chrom_K" +"L","","","3","3","1","","","","","","","","","dub_chrom_L","dub_chrom_2_L","dub_chrom_L" +"M","","","3","3","1","","","","","","","","","dub_chrom_M","dub_chrom_2_M","dub_chrom_M" 
+"Z","","","2","2","1","","","","","","","","","dub_chrom_Z","","dub_chrom_Z" +"Q","","","2","2","1","","","","","","","","","dub_chrom_Q","","dub_chrom_Q" \ No newline at end of file diff --git a/functional_tests/test_data/Less_than_all_core_simple/gene_presence_absence.csv b/functional_tests/test_data/Less_than_all_core_simple/gene_presence_absence.csv new file mode 100755 index 0000000..a8450a1 --- /dev/null +++ b/functional_tests/test_data/Less_than_all_core_simple/gene_presence_absence.csv @@ -0,0 +1,16 @@ +","","","","","","","","","","","","","","complete_genome_double_chrom_extra_large","complete_genome_double_chrom_larger","complete_genome_double_chrom_3_extra_large" +"A","","","3","3","1","","","","","","","","","dub_chrom_A","dub_chrom_A","dub_chrom_A" +"B","","","3","3","1","","","","","","","","","dub_chrom_B","dub_chrom_B","dub_chrom_B" +"C","","","3","3","1","","","","","","","","","dub_chrom_C","dub_chrom_C","dub_chrom_C" +"D","","","3","3","1","","","","","","","","","dub_chrom_D","dub_chrom_D","dub_chrom_D" +"E","","","3","3","1","","","","","","","","","dub_chrom_E","dub_chrom_E","dub_chrom_E" +"F","","","3","3","1","","","","","","","","","dub_chrom_F","dub_chrom_F","dub_chrom_F" +"G","","","3","3","1","","","","","","","","","dub_chrom_G","dub_chrom_G","dub_chrom_G" +"H","","","3","3","1","","","","","","","","","dub_chrom_H","dub_chrom_H","dub_chrom_H" +"I","","","3","3","1","","","","","","","","","dub_chrom_I","dub_chrom_I","dub_chrom_I" +"J","","","3","3","1","","","","","","","","","dub_chrom_J","dub_chrom_J","dub_chrom_J" +"K","","","3","3","1","","","","","","","","","dub_chrom_K","dub_chrom_K","dub_chrom_K" +"L","","","3","3","1","","","","","","","","","dub_chrom_L","dub_chrom_L","dub_chrom_L" +"M","","","3","3","1","","","","","","","","","dub_chrom_M","dub_chrom_M","dub_chrom_M" +"Z","","","2","2","1","","","","","","","","","dub_chrom_Z","","dub_chrom_Z" +"Q","","","2","2","1","","","","","","","","","dub_chrom_Q","","dub_chrom_Q" \ 
No newline at end of file diff --git a/functional_tests/test_data/Less_than_all_core_simple_expected/core_core_accessory_gene_content.expected b/functional_tests/test_data/Less_than_all_core_simple_expected/core_core_accessory_gene_content.expected new file mode 100644 index 0000000..fee984c --- /dev/null +++ b/functional_tests/test_data/Less_than_all_core_simple_expected/core_core_accessory_gene_content.expected @@ -0,0 +1 @@ +Gff Core_gene_1 Core_gene_2 gene type diff --git a/functional_tests/test_data/Less_than_all_core_simple_expected/core_pair_summary.expected b/functional_tests/test_data/Less_than_all_core_simple_expected/core_pair_summary.expected new file mode 100644 index 0000000..c2e0303 --- /dev/null +++ b/functional_tests/test_data/Less_than_all_core_simple_expected/core_pair_summary.expected @@ -0,0 +1,20 @@ +Core_pair,n,occurrence_core_1,occurrence_core_2,co_occurrence,min_dist,max_dist,mean_dist,median_dist,min_acc,max_acc,mean_acc,median_acc +A-B,3,3,3,3,9,9,9.0,9.0,0,0,0.0,0.0 +A-I,3,3,3,3,0,0,0.0,0.0,0,0,0.0,0.0 +B-C,1,3,3,3,9,9,9.0,9.0,0,0,0.0,0.0 +B-Q,2,3,2,2,1,1,1.0,1.0,0,0,0.0,0.0 +C-D,3,3,3,3,3,3,3.0,3.0,0,0,0.0,0.0 +C-Q,2,3,2,2,-3,-3,-3.0,-3.0,0,0,0.0,0.0 +D-J,3,3,3,3,-3,0,-2.0,-3.0,0,0,0.0,0.0 +E-F,3,3,3,3,9,9,9.0,9.0,0,0,0.0,0.0 +E-K,3,3,3,3,0,0,0.0,0.0,0,0,0.0,0.0 +F-G,1,3,3,3,9,9,9.0,9.0,0,0,0.0,0.0 +F-Z,2,3,2,2,1,1,1.0,1.0,0,0,0.0,0.0 +G-H,3,3,3,3,0,0,0.0,0.0,0,0,0.0,0.0 +G-Z,2,3,2,2,-3,-3,-3.0,-3.0,0,0,0.0,0.0 +H-L,3,3,3,3,0,0,0.0,0.0,0,0,0.0,0.0 +J-Sequence_break,3,3,0,0,1,3,1.7,1.0,0,0,0.0,0.0 +L-M,3,3,3,3,-700,0,-233.3,0.0,0,0,0.0,0.0 +M-Sequence_break,3,3,0,0,2,698,234.0,2.0,0,0,0.0,0.0 +Sequence_break-I,3,0,3,0,0,0,0.0,0.0,0,0,0.0,0.0 +Sequence_break-K,3,0,3,0,0,0,0.0,0.0,0,0,0.0,0.0 diff --git a/functional_tests/test_data/Less_than_all_core_simple_expected/core_segments.expected b/functional_tests/test_data/Less_than_all_core_simple_expected/core_segments.expected new file mode 100644 index 0000000..c576538 --- /dev/null +++ 
b/functional_tests/test_data/Less_than_all_core_simple_expected/core_segments.expected @@ -0,0 +1,20 @@ +Segment_name,Segment_position,Core_gene +B-C,1,B +B-C,2,Q +B-C,3,C +B-I,1,B +B-I,2,A +B-I,3,I +C-J,1,C +C-J,2,D +C-J,3,J +F-G,1,F +F-G,2,Z +F-G,3,G +F-K,1,F +F-K,2,E +F-K,3,K +G-M,1,G +G-M,2,H +G-M,3,L +G-M,4,M diff --git a/functional_tests/test_data/Less_than_all_core_simple_expected/low_frequency_gene_placement.expected b/functional_tests/test_data/Less_than_all_core_simple_expected/low_frequency_gene_placement.expected new file mode 100644 index 0000000..00b7150 --- /dev/null +++ b/functional_tests/test_data/Less_than_all_core_simple_expected/low_frequency_gene_placement.expected @@ -0,0 +1,50 @@ +Gff Core_gene_1 Core_gene_2 Core_region_size Core_region_accessory_count +complete_genome_double_chrom_3_extra_large A B 9 0 +complete_genome_double_chrom_extra_large A B 9 0 +complete_genome_double_chrom_larger A B 9 0 +complete_genome_double_chrom_3_extra_large A I 0 0 +complete_genome_double_chrom_extra_large A I 0 0 +complete_genome_double_chrom_larger A I 0 0 +complete_genome_double_chrom_larger B C 9 0 +complete_genome_double_chrom_3_extra_large B Q 1 0 +complete_genome_double_chrom_extra_large B Q 1 0 +complete_genome_double_chrom_3_extra_large C D 3 0 +complete_genome_double_chrom_extra_large C D 3 0 +complete_genome_double_chrom_larger C D 3 0 +complete_genome_double_chrom_3_extra_large C Q -3 0 +complete_genome_double_chrom_extra_large C Q -3 0 +complete_genome_double_chrom_3_extra_large D J 0 0 +complete_genome_double_chrom_extra_large D J -3 0 +complete_genome_double_chrom_larger D J -3 0 +complete_genome_double_chrom_3_extra_large E F 9 0 +complete_genome_double_chrom_extra_large E F 9 0 +complete_genome_double_chrom_larger E F 9 0 +complete_genome_double_chrom_3_extra_large E K 0 0 +complete_genome_double_chrom_extra_large E K 0 0 +complete_genome_double_chrom_larger E K 0 0 +complete_genome_double_chrom_larger F G 9 0 
+complete_genome_double_chrom_3_extra_large F Z 1 0 +complete_genome_double_chrom_extra_large F Z 1 0 +complete_genome_double_chrom_3_extra_large G H 0 0 +complete_genome_double_chrom_extra_large G H 0 0 +complete_genome_double_chrom_larger G H 0 0 +complete_genome_double_chrom_3_extra_large G Z -3 0 +complete_genome_double_chrom_extra_large G Z -3 0 +complete_genome_double_chrom_3_extra_large H L 0 0 +complete_genome_double_chrom_extra_large H L 0 0 +complete_genome_double_chrom_larger H L 0 0 +complete_genome_double_chrom_3_extra_large J Sequence_break 3 0 +complete_genome_double_chrom_extra_large J Sequence_break 1 0 +complete_genome_double_chrom_larger J Sequence_break 1 0 +complete_genome_double_chrom_3_extra_large L M -700 0 +complete_genome_double_chrom_extra_large L M 0 0 +complete_genome_double_chrom_larger L M 0 0 +complete_genome_double_chrom_3_extra_large M Sequence_break 698 0 +complete_genome_double_chrom_extra_large M Sequence_break 2 0 +complete_genome_double_chrom_larger M Sequence_break 2 0 +complete_genome_double_chrom_3_extra_large Sequence_break I 0 0 +complete_genome_double_chrom_extra_large Sequence_break I 0 0 +complete_genome_double_chrom_larger Sequence_break I 0 0 +complete_genome_double_chrom_3_extra_large Sequence_break K 0 0 +complete_genome_double_chrom_extra_large Sequence_break K 0 0 +complete_genome_double_chrom_larger Sequence_break K 0 0 diff --git a/functional_tests/test_data/Less_than_all_core_simple_expected/no_accessory_core_segments.expected b/functional_tests/test_data/Less_than_all_core_simple_expected/no_accessory_core_segments.expected new file mode 100644 index 0000000..10ea9b1 --- /dev/null +++ b/functional_tests/test_data/Less_than_all_core_simple_expected/no_accessory_core_segments.expected @@ -0,0 +1,20 @@ +Parent_segment_name,Sub_segment_name,Parent_segment_position,Sub_segment_position,Core_gene +B-C,B-C,1,1,B +B-C,B-C,1,2,Q +B-C,B-C,1,3,C +B-I,B-I,1,1,B +B-I,B-I,1,2,A +B-I,B-I,1,3,I +C-J,C-J,1,1,C +C-J,C-J,1,2,D 
+C-J,C-J,1,3,J +F-G,F-G,1,1,F +F-G,F-G,1,2,Z +F-G,F-G,1,3,G +F-K,F-K,1,1,F +F-K,F-K,1,2,E +F-K,F-K,1,3,K +G-M,G-M,1,1,G +G-M,G-M,1,2,H +G-M,G-M,1,3,L +G-M,G-M,1,4,M diff --git a/functional_tests/test_data/Multi_complete_chromosome_expected/core_core_accessory_gene_content.expected b/functional_tests/test_data/Multi_complete_chromosome_expected/core_core_accessory_gene_content.expected new file mode 100644 index 0000000..fee984c --- /dev/null +++ b/functional_tests/test_data/Multi_complete_chromosome_expected/core_core_accessory_gene_content.expected @@ -0,0 +1 @@ +Gff Core_gene_1 Core_gene_2 gene type diff --git a/functional_tests/test_data/Multi_complete_chromosome_expected/core_pair_summary.expected b/functional_tests/test_data/Multi_complete_chromosome_expected/core_pair_summary.expected new file mode 100644 index 0000000..66b9420 --- /dev/null +++ b/functional_tests/test_data/Multi_complete_chromosome_expected/core_pair_summary.expected @@ -0,0 +1,18 @@ +Core_pair,n,occurrence_core_1,occurrence_core_2,co_occurrence,min_dist,max_dist,mean_dist,median_dist,min_acc,max_acc,mean_acc,median_acc +A-B,2,3,3,3,9,9,9.0,9.0,0,0,0.0,0.0 +A-C,1,3,3,3,9,9,9.0,9.0,0,0,0.0,0.0 +A-I,3,3,3,3,0,0,0.0,0.0,0,0,0.0,0.0 +B-C,3,3,3,3,9,9,9.0,9.0,0,0,0.0,0.0 +B-D,1,3,3,3,3,3,3.0,3.0,0,0,0.0,0.0 +C-D,2,3,3,3,3,3,3.0,3.0,0,0,0.0,0.0 +D-J,3,3,3,3,-3,1,-0.7,0.0,0,0,0.0,0.0 +E-F,2,3,3,3,9,9,9.0,9.0,0,0,0.0,0.0 +E-G,1,3,3,3,9,9,9.0,9.0,0,0,0.0,0.0 +E-K,3,3,3,3,0,0,0.0,0.0,0,0,0.0,0.0 +F-G,3,3,3,3,9,9,9.0,9.0,0,0,0.0,0.0 +F-H,1,3,3,3,0,0,0.0,0.0,0,0,0.0,0.0 +G-H,2,3,3,3,0,0,0.0,0.0,0,0,0.0,0.0 +H-L,3,3,3,3,0,0,0.0,0.0,0,0,0.0,0.0 +I-J,3,3,3,3,1,3,1.7,1.0,0,0,0.0,0.0 +K-M,3,3,3,3,-698,2,-231.3,2.0,0,0,0.0,0.0 +L-M,3,3,3,3,-700,0,-233.3,0.0,0,0,0.0,0.0 diff --git a/functional_tests/test_data/Multi_complete_chromosome_expected/core_segments.expected b/functional_tests/test_data/Multi_complete_chromosome_expected/core_segments.expected new file mode 100644 index 0000000..6450104 --- /dev/null 
+++ b/functional_tests/test_data/Multi_complete_chromosome_expected/core_segments.expected @@ -0,0 +1,14 @@ +Segment_name,Segment_position,Core_gene +A-D,1,A +A-D,2,I +A-D,3,J +A-D,4,D +B-C,1,B +B-C,2,C +E-H,1,E +E-H,2,K +E-H,3,M +E-H,4,L +E-H,5,H +F-G,1,F +F-G,2,G diff --git a/functional_tests/test_data/Multi_complete_chromosome_expected/low_frequency_gene_placement.expected b/functional_tests/test_data/Multi_complete_chromosome_expected/low_frequency_gene_placement.expected new file mode 100644 index 0000000..5d41d78 --- /dev/null +++ b/functional_tests/test_data/Multi_complete_chromosome_expected/low_frequency_gene_placement.expected @@ -0,0 +1,40 @@ +Gff Core_gene_1 Core_gene_2 Core_region_size Core_region_accessory_count +complete_genome_double_chrom_3_larger A B 9 0 +complete_genome_double_chrom_larger A B 9 0 +complete_genome_double_chrom_2_larger A C 9 0 +complete_genome_double_chrom_2_larger A I 0 0 +complete_genome_double_chrom_3_larger A I 0 0 +complete_genome_double_chrom_larger A I 0 0 +complete_genome_double_chrom_2_larger B C 9 0 +complete_genome_double_chrom_3_larger B C 9 0 +complete_genome_double_chrom_larger B C 9 0 +complete_genome_double_chrom_2_larger B D 3 0 +complete_genome_double_chrom_3_larger C D 3 0 +complete_genome_double_chrom_larger C D 3 0 +complete_genome_double_chrom_2_larger D J 1 0 +complete_genome_double_chrom_3_larger D J 0 0 +complete_genome_double_chrom_larger D J -3 0 +complete_genome_double_chrom_3_larger E F 9 0 +complete_genome_double_chrom_larger E F 9 0 +complete_genome_double_chrom_2_larger E G 9 0 +complete_genome_double_chrom_2_larger E K 0 0 +complete_genome_double_chrom_3_larger E K 0 0 +complete_genome_double_chrom_larger E K 0 0 +complete_genome_double_chrom_2_larger F G 9 0 +complete_genome_double_chrom_3_larger F G 9 0 +complete_genome_double_chrom_larger F G 9 0 +complete_genome_double_chrom_2_larger F H 0 0 +complete_genome_double_chrom_3_larger G H 0 0 +complete_genome_double_chrom_larger G H 0 0 
+complete_genome_double_chrom_2_larger H L 0 0 +complete_genome_double_chrom_3_larger H L 0 0 +complete_genome_double_chrom_larger H L 0 0 +complete_genome_double_chrom_2_larger I J 1 0 +complete_genome_double_chrom_3_larger I J 3 0 +complete_genome_double_chrom_larger I J 1 0 +complete_genome_double_chrom_2_larger K M 2 0 +complete_genome_double_chrom_3_larger K M -698 0 +complete_genome_double_chrom_larger K M 2 0 +complete_genome_double_chrom_2_larger L M 0 0 +complete_genome_double_chrom_3_larger L M -700 0 +complete_genome_double_chrom_larger L M 0 0 diff --git a/functional_tests/test_data/Multi_complete_chromosome_expected/no_accessory_core_segments.expected b/functional_tests/test_data/Multi_complete_chromosome_expected/no_accessory_core_segments.expected new file mode 100644 index 0000000..2ba270c --- /dev/null +++ b/functional_tests/test_data/Multi_complete_chromosome_expected/no_accessory_core_segments.expected @@ -0,0 +1,14 @@ +Parent_segment_name,Sub_segment_name,Parent_segment_position,Sub_segment_position,Core_gene +A-D,A-D,1,1,A +A-D,A-D,1,2,I +A-D,A-D,1,3,J +A-D,A-D,1,4,D +B-C,B-C,1,1,B +B-C,B-C,1,2,C +E-H,E-H,1,1,E +E-H,E-H,1,2,K +E-H,E-H,1,3,M +E-H,E-H,1,4,L +E-H,E-H,1,5,H +F-G,F-G,1,1,F +F-G,F-G,1,2,G diff --git a/functional_tests/test_data/complete_genome_double_chrom_3_extra_large.gff b/functional_tests/test_data/complete_genome_double_chrom_3_extra_large.gff new file mode 100755 index 0000000..9fb9fc3 --- /dev/null +++ b/functional_tests/test_data/complete_genome_double_chrom_3_extra_large.gff @@ -0,0 +1,21 @@ +##gff-version3 +contig_1 . CDS 1 2 . . . ID=dub_chrom_I;Other_info +contig_1 . CDS 3 90 . . . ID=dub_chrom_A;Other_info +contig_1 . CDS 100 190 . . . ID=dub_chrom_B;Other_info +contig_1 . CDS 192 202 . . . ID=dub_chrom_Q;Other_info +contig_1 . CDS 200 290 . . . ID=dub_chrom_C;Other_info +contig_1 . CDS 294 295 . . . ID=dub_chrom_D;Other_info +contig_1 . CDS 296 297 . . . ID=dub_chrom_J;Other_info +contig_2 . CDS 1 2 . . . 
ID=dub_chrom_K;Other_info +contig_2 . CDS 3 90 . . . ID=dub_chrom_E;Other_info +contig_2 . CDS 100 190 . . . ID=dub_chrom_F;Other_info +contig_2 . CDS 192 202 . . . ID=dub_chrom_Z;Other_info +contig_2 . CDS 200 290 . . . ID=dub_chrom_G;Other_info +contig_2 . CDS 291 294 . . . ID=dub_chrom_H;Other_info +contig_2 . CDS 295 996 . . . ID=dub_chrom_L;Other_info +contig_2 . CDS 297 998 . . . ID=dub_chrom_M;Other_info +##FASTA +>contig_1 +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +>contig_2 +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN \ No newline at end of file diff --git a/functional_tests/test_data/complete_genome_double_chrom_extra_large.gff b/functional_tests/test_data/complete_genome_double_chrom_extra_large.gff new file mode 100755 index 0000000..af5f073 --- /dev/null +++ b/functional_tests/test_data/complete_genome_double_chrom_extra_large.gff @@ -0,0 +1,21 @@ +##gff-version3 +contig_1 . CDS 1 2 . . . ID=dub_chrom_I;Other_info +contig_1 . CDS 3 90 . . . ID=dub_chrom_A;Other_info +contig_1 . CDS 100 190 . . . ID=dub_chrom_B;Other_info +contig_1 . CDS 192 202 . . . ID=dub_chrom_Q;Other_info +contig_1 . CDS 200 290 . . . ID=dub_chrom_C;Other_info +contig_1 . CDS 294 299 . . . ID=dub_chrom_D;Other_info +contig_1 . CDS 297 299 . . . ID=dub_chrom_J;Other_info +contig_2 . CDS 1 2 . . . ID=dub_chrom_K;Other_info +contig_2 . CDS 3 90 . . . ID=dub_chrom_E;Other_info +contig_2 . CDS 100 190 . . . ID=dub_chrom_F;Other_info +contig_2 . CDS 192 202 . . . 
ID=dub_chrom_Z;Other_info +contig_2 . CDS 200 290 . . . ID=dub_chrom_G;Other_info +contig_2 . CDS 291 294 . . . ID=dub_chrom_H;Other_info +contig_2 . CDS 295 296 . . . ID=dub_chrom_L;Other_info +contig_2 . CDS 297 298 . . . ID=dub_chrom_M;Other_info +##FASTA +>contig_1 +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +>contig_2 +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN \ No newline at end of file From 688f0c6f9f86188b9d17905d8bd1c6e9ba3c4f97 Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Wed, 1 Jun 2022 08:41:49 +1000 Subject: [PATCH 108/135] Add in small changes in functional test --- functional_tests/Corekaburra-test.sh | 33 ++++++++++--------- .../no_accessory_core_segments.csv.expected | 5 +-- 2 files changed, 21 insertions(+), 17 deletions(-) diff --git a/functional_tests/Corekaburra-test.sh b/functional_tests/Corekaburra-test.sh index 47b4f72..39c3fec 100755 --- a/functional_tests/Corekaburra-test.sh +++ b/functional_tests/Corekaburra-test.sh @@ -248,23 +248,25 @@ test_output_file test_out_folder/no_accessory_core_segments.csv Rearrangement_ru rm -r test_out_folder # TODO - Test that segmnets can be identified with a core-cutoff that is less than all genomes. 
- not done paper graph 2 - implemented remove if successfull +call_new_test "Test that segmnets can be identified with a core-cutoff that is less than all genomes" Corekaburra -ip Less_than_all_core_simple/ -ig complete_genome_double_chrom_extra_large.gff complete_genome_double_chrom_larger.gff complete_genome_double_chrom_3_extra_large.gff -o test_out_folder/ -cc 0.9 > /dev/null 2>&1 -test_output_file test_out_folder/core_core_accessory_gene_content.tsv Less_than_all_gffs_simple_expected/core_core_accessory_gene_content.tsv.expected -test_output_file test_out_folder/low_frequency_gene_placement.tsv Less_than_all_gffs_simple_expected/low_frequency_gene_placement.tsv.expected -test_output_file test_out_folder/core_pair_summary.csv Less_than_all_gffs_simple_expected/core_pair_summary.csv.expected -test_output_file test_out_folder/core_segments.csv Less_than_all_gffs_simple_expected/core_segments.csv.expected -test_output_file test_out_folder/no_accessory_core_segments.csv Less_than_all_gffs_simple_expected/no_accessory_core_segments.csv.expected +test_output_file test_out_folder/core_core_accessory_gene_content.tsv Less_than_all_core_simple_expected/core_core_accessory_gene_content.tsv.expected +test_output_file test_out_folder/low_frequency_gene_placement.tsv Less_than_all_core_simple_expected/low_frequency_gene_placement.tsv.expected +test_output_file test_out_folder/core_pair_summary.csv Less_than_all_core_simple_expected/core_pair_summary.csv.expected +test_output_file test_out_folder/core_segments.csv Less_than_all_core_simple_expected/core_segments.csv.expected +test_output_file test_out_folder/no_accessory_core_segments.csv Less_than_all_core_simple_expected/no_accessory_core_segments.csv.expected rm -r test_out_folder # TODO - Test that segments are produced for instances where a core gene may be left out. 
- not done paper graph 1 +call_new_test "Test that segments are produced for instances where a core gene may be left out" Corekaburra -ip Less_than_all_core_complex/ -ig complete_genome_double_chrom_extra_large.gff complete_genome_double_chrom_2_larger.gff complete_genome_double_chrom_3_extra_large.gff -o test_out_folder/ -cc 0.9 > /dev/null 2>&1 -test_output_file test_out_folder/core_core_accessory_gene_content.tsv Less_than_all_gffs_complex_expected/core_core_accessory_gene_content.tsv.expected -test_output_file test_out_folder/low_frequency_gene_placement.tsv Less_than_all_gffs_complex_expected/low_frequency_gene_placement.tsv.expected -test_output_file test_out_folder/core_pair_summary.csv Less_than_all_gffs_complex_expected/core_pair_summary.csv.expected -test_output_file test_out_folder/core_segments.csv Less_than_all_gffs_complex_expected/core_segments.csv.expected -test_output_file test_out_folder/no_accessory_core_segments.csv Less_than_all_gffs_complex_expected/no_accessory_core_segments.csv.expected +test_output_file test_out_folder/core_core_accessory_gene_content.tsv Less_than_all_core_complex/core_core_accessory_gene_content.tsv.expected +test_output_file test_out_folder/low_frequency_gene_placement.tsv Less_than_all_core_complex/low_frequency_gene_placement.tsv.expected +test_output_file test_out_folder/core_pair_summary.csv Less_than_all_core_complex/core_pair_summary.csv.expected +test_output_file test_out_folder/core_segments.csv Less_than_all_core_complex/core_segments.csv.expected +test_output_file test_out_folder/no_accessory_core_segments.csv Less_than_all_core_complex/no_accessory_core_segments.csv.expected rm -r test_out_folder # TODO - Do functional test with GFF where reannotated in panaroo is called 'candidate_gene' instead of CDS. @@ -273,12 +275,13 @@ rm -r test_out_folder # TODO - Test using two complete chromosomes (complete genome) when >1 contig test. 
- implemented remove if successful +call_new_test "Test using two complete chromosomes (complete genome) when >1 contig test" Corekaburra -ip Multiple_component_graph/ -ig complete_genome_double_chrom_larger.gff complete_genome_double_chrom_2_larger.gff complete_genome_double_chrom_3_larger.gff -o test_out_folder/ -cc 0.9 -cg complete_larger_double_chr_genome_list.txt > /dev/null 2>&1 -test_output_file test_out_folder/core_core_accessory_gene_content.tsv Multi_complete_chromosome_expected/core_core_accessory_gene_content.tsv.expected -test_output_file test_out_folder/low_frequency_gene_placement.tsv Multi_complete_chromosome_expected/low_frequency_gene_placement.tsv.expected -test_output_file test_out_folder/core_pair_summary.csv Multi_complete_chromosome_expected/core_pair_summary.csv.expected -test_output_file test_out_folder/core_segments.csv Multi_complete_chromosome_expected/core_segments.csv.expected -test_output_file test_out_folder/no_accessory_core_segments.csv Multi_complete_chromosome_expected/no_accessory_core_segments.csv.expected +test_output_file test_out_folder/core_core_accessory_gene_content.tsv Multiple_component_graph_complete_expected/core_core_accessory_gene_content.tsv.expected +test_output_file test_out_folder/low_frequency_gene_placement.tsv Multiple_component_graph_complete_expected/low_frequency_gene_placement.tsv.expected +test_output_file test_out_folder/core_pair_summary.csv Multiple_component_graph_complete_expected/core_pair_summary.csv.expected +test_output_file test_out_folder/core_segments.csv Multiple_component_graph_complete_expected/core_segments.csv.expected +test_output_file test_out_folder/no_accessory_core_segments.csv Multiple_component_graph_complete_expected/no_accessory_core_segments.csv.expected rm -r test_out_folder call_new_test "Test when core graph forms multiple components - not forming a single 'chromosome' - non circular input gffs" diff --git 
a/functional_tests/test_data/Less_than_all_gffs_run_expected/no_accessory_core_segments.csv.expected b/functional_tests/test_data/Less_than_all_gffs_run_expected/no_accessory_core_segments.csv.expected index 8cdcbeb..d994679 100755 --- a/functional_tests/test_data/Less_than_all_gffs_run_expected/no_accessory_core_segments.csv.expected +++ b/functional_tests/test_data/Less_than_all_gffs_run_expected/no_accessory_core_segments.csv.expected @@ -1,3 +1,4 @@ Parent_segment_name,Sub_segment_name,Parent_segment_position,Sub_segment_position,Core_gene -B-C,B-C,1,1,B -B-C,B-C,1,2,C +B-C,B-A,1,1,B +B-C,B-A,1,2,A +B-C,C-C,2,1,C From 7f38ee207ecc34cf1f772de405cfb14a0c27471e Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Wed, 1 Jun 2022 08:46:22 +1000 Subject: [PATCH 109/135] Make small change in functional test output and point to correct expected files --- functional_tests/Corekaburra-test.sh | 20 +++++++++---------- .../core_segments.csv.expected | 5 +++-- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/functional_tests/Corekaburra-test.sh b/functional_tests/Corekaburra-test.sh index 39c3fec..e74c594 100755 --- a/functional_tests/Corekaburra-test.sh +++ b/functional_tests/Corekaburra-test.sh @@ -250,11 +250,11 @@ rm -r test_out_folder # TODO - Test that segmnets can be identified with a core-cutoff that is less than all genomes. 
- not done paper graph 2 - implemented remove if successfull call_new_test "Test that segmnets can be identified with a core-cutoff that is less than all genomes" Corekaburra -ip Less_than_all_core_simple/ -ig complete_genome_double_chrom_extra_large.gff complete_genome_double_chrom_larger.gff complete_genome_double_chrom_3_extra_large.gff -o test_out_folder/ -cc 0.9 > /dev/null 2>&1 -test_output_file test_out_folder/core_core_accessory_gene_content.tsv Less_than_all_core_simple_expected/core_core_accessory_gene_content.tsv.expected -test_output_file test_out_folder/low_frequency_gene_placement.tsv Less_than_all_core_simple_expected/low_frequency_gene_placement.tsv.expected -test_output_file test_out_folder/core_pair_summary.csv Less_than_all_core_simple_expected/core_pair_summary.csv.expected -test_output_file test_out_folder/core_segments.csv Less_than_all_core_simple_expected/core_segments.csv.expected -test_output_file test_out_folder/no_accessory_core_segments.csv Less_than_all_core_simple_expected/no_accessory_core_segments.csv.expected +test_output_file test_out_folder/core_core_accessory_gene_content.tsv Less_than_all_core_simple_expected/core_core_accessory_gene_content.expected +test_output_file test_out_folder/low_frequency_gene_placement.tsv Less_than_all_core_simple_expected/low_frequency_gene_placement.expected +test_output_file test_out_folder/core_pair_summary.csv Less_than_all_core_simple_expected/core_pair_summary.expected +test_output_file test_out_folder/core_segments.csv Less_than_all_core_simple_expected/core_segments.expected +test_output_file test_out_folder/no_accessory_core_segments.csv Less_than_all_core_simple_expected/no_accessory_core_segments.expected rm -r test_out_folder @@ -262,11 +262,11 @@ rm -r test_out_folder # TODO - Test that segments are produced for instances where a core gene may be left out. 
- not done paper graph 1 call_new_test "Test that segments are produced for instances where a core gene may be left out" Corekaburra -ip Less_than_all_core_complex/ -ig complete_genome_double_chrom_extra_large.gff complete_genome_double_chrom_2_larger.gff complete_genome_double_chrom_3_extra_large.gff -o test_out_folder/ -cc 0.9 > /dev/null 2>&1 -test_output_file test_out_folder/core_core_accessory_gene_content.tsv Less_than_all_core_complex/core_core_accessory_gene_content.tsv.expected -test_output_file test_out_folder/low_frequency_gene_placement.tsv Less_than_all_core_complex/low_frequency_gene_placement.tsv.expected -test_output_file test_out_folder/core_pair_summary.csv Less_than_all_core_complex/core_pair_summary.csv.expected -test_output_file test_out_folder/core_segments.csv Less_than_all_core_complex/core_segments.csv.expected -test_output_file test_out_folder/no_accessory_core_segments.csv Less_than_all_core_complex/no_accessory_core_segments.csv.expected +test_output_file test_out_folder/core_core_accessory_gene_content.tsv Less_than_all_core_complex/core_core_accessory_gene_content.expected +test_output_file test_out_folder/low_frequency_gene_placement.tsv Less_than_all_core_complex/low_frequency_gene_placement.expected +test_output_file test_out_folder/core_pair_summary.csv Less_than_all_core_complex/core_pair_summary.expected +test_output_file test_out_folder/core_segments.csv Less_than_all_core_complex/core_segments.expected +test_output_file test_out_folder/no_accessory_core_segments.csv Less_than_all_core_complex/no_accessory_core_segments.expected rm -r test_out_folder # TODO - Do functional test with GFF where reannotated in panaroo is called 'candidate_gene' instead of CDS. 
diff --git a/functional_tests/test_data/Less_than_all_gffs_run_expected/core_segments.csv.expected b/functional_tests/test_data/Less_than_all_gffs_run_expected/core_segments.csv.expected index a380ae3..420f81a 100755 --- a/functional_tests/test_data/Less_than_all_gffs_run_expected/core_segments.csv.expected +++ b/functional_tests/test_data/Less_than_all_gffs_run_expected/core_segments.csv.expected @@ -1,3 +1,4 @@ Segment_name,Segment_position,Core_gene -B-C,1,B -B-C,2,C +B-C,1,B +B-C,2,A +B-C,3,C From 2beeb0a90618a324911be16e5364e524b96d6e87 Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Wed, 1 Jun 2022 08:57:01 +1000 Subject: [PATCH 110/135] Remove test_folder, add in test results --- ...core_core_accessory_gene_content.expected} | 0 .../core_pair_summary.expected | 24 +++++++++ .../core_segments.expected | 16 ++++++ .../low_frequency_gene_placement.expected | 50 +++++++++++++++++++ .../no_accessory_core_segments.expected | 16 ++++++ .../core_segments.csv.expected | 2 +- .../test_out_folder/Corekaburra.log | 40 --------------- .../test_out_folder/core_pair_summary.csv | 4 -- .../test_out_folder/core_segments.csv | 1 - ...coreless_contig_accessory_gene_content.tsv | 3 -- .../low_frequency_gene_placement.tsv | 7 --- .../no_accessory_core_segments.csv | 1 - 12 files changed, 107 insertions(+), 57 deletions(-) rename functional_tests/{test_out_folder/core_core_accessory_gene_content.tsv => test_data/Less_than_all_core_complex_expected/core_core_accessory_gene_content.expected} (100%) mode change 100755 => 100644 create mode 100644 functional_tests/test_data/Less_than_all_core_complex_expected/core_pair_summary.expected create mode 100644 functional_tests/test_data/Less_than_all_core_complex_expected/core_segments.expected create mode 100644 functional_tests/test_data/Less_than_all_core_complex_expected/low_frequency_gene_placement.expected create mode 100644 
functional_tests/test_data/Less_than_all_core_complex_expected/no_accessory_core_segments.expected delete mode 100755 functional_tests/test_out_folder/Corekaburra.log delete mode 100755 functional_tests/test_out_folder/core_pair_summary.csv delete mode 100755 functional_tests/test_out_folder/core_segments.csv delete mode 100755 functional_tests/test_out_folder/coreless_contig_accessory_gene_content.tsv delete mode 100755 functional_tests/test_out_folder/low_frequency_gene_placement.tsv delete mode 100755 functional_tests/test_out_folder/no_accessory_core_segments.csv diff --git a/functional_tests/test_out_folder/core_core_accessory_gene_content.tsv b/functional_tests/test_data/Less_than_all_core_complex_expected/core_core_accessory_gene_content.expected old mode 100755 new mode 100644 similarity index 100% rename from functional_tests/test_out_folder/core_core_accessory_gene_content.tsv rename to functional_tests/test_data/Less_than_all_core_complex_expected/core_core_accessory_gene_content.expected diff --git a/functional_tests/test_data/Less_than_all_core_complex_expected/core_pair_summary.expected b/functional_tests/test_data/Less_than_all_core_complex_expected/core_pair_summary.expected new file mode 100644 index 0000000..c17bace --- /dev/null +++ b/functional_tests/test_data/Less_than_all_core_complex_expected/core_pair_summary.expected @@ -0,0 +1,24 @@ +Core_pair,n,occurrence_core_1,occurrence_core_2,co_occurrence,min_dist,max_dist,mean_dist,median_dist,min_acc,max_acc,mean_acc,median_acc +A-B,2,3,3,3,9,9,9.0,9.0,0,0,0.0,0.0 +A-C,1,3,3,3,9,9,9.0,9.0,0,0,0.0,0.0 +A-I,3,3,3,3,0,0,0.0,0.0,0,0,0.0,0.0 +B-C,1,3,3,3,9,9,9.0,9.0,0,0,0.0,0.0 +B-D,1,3,3,3,3,3,3.0,3.0,0,0,0.0,0.0 +B-Q,2,3,2,2,1,1,1.0,1.0,0,0,0.0,0.0 +C-D,2,3,3,3,3,3,3.0,3.0,0,0,0.0,0.0 +C-Q,2,3,2,2,-3,-3,-3.0,-3.0,0,0,0.0,0.0 +D-J,3,3,3,3,-3,1,-0.7,0.0,0,0,0.0,0.0 +E-F,2,3,3,3,9,9,9.0,9.0,0,0,0.0,0.0 +E-G,1,3,3,3,9,9,9.0,9.0,0,0,0.0,0.0 +E-K,3,3,3,3,0,0,0.0,0.0,0,0,0.0,0.0 
+F-G,1,3,3,3,9,9,9.0,9.0,0,0,0.0,0.0 +F-H,1,3,3,3,0,0,0.0,0.0,0,0,0.0,0.0 +F-Z,2,3,2,2,1,1,1.0,1.0,0,0,0.0,0.0 +G-H,2,3,3,3,0,0,0.0,0.0,0,0,0.0,0.0 +G-Z,2,3,2,2,-3,-3,-3.0,-3.0,0,0,0.0,0.0 +H-L,3,3,3,3,0,0,0.0,0.0,0,0,0.0,0.0 +J-Sequence_break,3,3,0,0,1,3,1.7,1.0,0,0,0.0,0.0 +L-M,3,3,3,3,-700,0,-233.3,0.0,0,0,0.0,0.0 +M-Sequence_break,3,3,0,0,2,698,234.0,2.0,0,0,0.0,0.0 +Sequence_break-I,3,0,3,0,0,0,0.0,0.0,0,0,0.0,0.0 +Sequence_break-K,3,0,3,0,0,0,0.0,0.0,0,0,0.0,0.0 diff --git a/functional_tests/test_data/Less_than_all_core_complex_expected/core_segments.expected b/functional_tests/test_data/Less_than_all_core_complex_expected/core_segments.expected new file mode 100644 index 0000000..205602d --- /dev/null +++ b/functional_tests/test_data/Less_than_all_core_complex_expected/core_segments.expected @@ -0,0 +1,16 @@ +Segment_name,Segment_position,Core_gene +A-I,1,A +A-I,2,I +B-C,1,B +B-C,2,Q +B-C,3,C +D-J,1,D +D-J,2,J +E-K,1,E +E-K,2,K +F-G,1,F +F-G,2,Z +F-G,3,G +H-M,1,H +H-M,2,L +H-M,3,M diff --git a/functional_tests/test_data/Less_than_all_core_complex_expected/low_frequency_gene_placement.expected b/functional_tests/test_data/Less_than_all_core_complex_expected/low_frequency_gene_placement.expected new file mode 100644 index 0000000..afa3ba0 --- /dev/null +++ b/functional_tests/test_data/Less_than_all_core_complex_expected/low_frequency_gene_placement.expected @@ -0,0 +1,50 @@ +Gff Core_gene_1 Core_gene_2 Core_region_size Core_region_accessory_count +complete_genome_double_chrom_3_extra_large A B 9 0 +complete_genome_double_chrom_extra_large A B 9 0 +complete_genome_double_chrom_2_larger A C 9 0 +complete_genome_double_chrom_2_larger A I 0 0 +complete_genome_double_chrom_3_extra_large A I 0 0 +complete_genome_double_chrom_extra_large A I 0 0 +complete_genome_double_chrom_2_larger B C 9 0 +complete_genome_double_chrom_2_larger B D 3 0 +complete_genome_double_chrom_3_extra_large B Q 1 0 +complete_genome_double_chrom_extra_large B Q 1 0 
+complete_genome_double_chrom_3_extra_large C D 3 0 +complete_genome_double_chrom_extra_large C D 3 0 +complete_genome_double_chrom_3_extra_large C Q -3 0 +complete_genome_double_chrom_extra_large C Q -3 0 +complete_genome_double_chrom_2_larger D J 1 0 +complete_genome_double_chrom_3_extra_large D J 0 0 +complete_genome_double_chrom_extra_large D J -3 0 +complete_genome_double_chrom_3_extra_large E F 9 0 +complete_genome_double_chrom_extra_large E F 9 0 +complete_genome_double_chrom_2_larger E G 9 0 +complete_genome_double_chrom_2_larger E K 0 0 +complete_genome_double_chrom_3_extra_large E K 0 0 +complete_genome_double_chrom_extra_large E K 0 0 +complete_genome_double_chrom_2_larger F G 9 0 +complete_genome_double_chrom_2_larger F H 0 0 +complete_genome_double_chrom_3_extra_large F Z 1 0 +complete_genome_double_chrom_extra_large F Z 1 0 +complete_genome_double_chrom_3_extra_large G H 0 0 +complete_genome_double_chrom_extra_large G H 0 0 +complete_genome_double_chrom_3_extra_large G Z -3 0 +complete_genome_double_chrom_extra_large G Z -3 0 +complete_genome_double_chrom_2_larger H L 0 0 +complete_genome_double_chrom_3_extra_large H L 0 0 +complete_genome_double_chrom_extra_large H L 0 0 +complete_genome_double_chrom_2_larger J Sequence_break 1 0 +complete_genome_double_chrom_3_extra_large J Sequence_break 3 0 +complete_genome_double_chrom_extra_large J Sequence_break 1 0 +complete_genome_double_chrom_2_larger L M 0 0 +complete_genome_double_chrom_3_extra_large L M -700 0 +complete_genome_double_chrom_extra_large L M 0 0 +complete_genome_double_chrom_2_larger M Sequence_break 2 0 +complete_genome_double_chrom_3_extra_large M Sequence_break 698 0 +complete_genome_double_chrom_extra_large M Sequence_break 2 0 +complete_genome_double_chrom_2_larger Sequence_break I 0 0 +complete_genome_double_chrom_3_extra_large Sequence_break I 0 0 +complete_genome_double_chrom_extra_large Sequence_break I 0 0 +complete_genome_double_chrom_2_larger Sequence_break K 0 0 
+complete_genome_double_chrom_3_extra_large Sequence_break K 0 0 +complete_genome_double_chrom_extra_large Sequence_break K 0 0 diff --git a/functional_tests/test_data/Less_than_all_core_complex_expected/no_accessory_core_segments.expected b/functional_tests/test_data/Less_than_all_core_complex_expected/no_accessory_core_segments.expected new file mode 100644 index 0000000..79af895 --- /dev/null +++ b/functional_tests/test_data/Less_than_all_core_complex_expected/no_accessory_core_segments.expected @@ -0,0 +1,16 @@ +Parent_segment_name,Sub_segment_name,Parent_segment_position,Sub_segment_position,Core_gene +A-I,A-I,1,1,A +A-I,A-I,1,2,I +B-C,B-C,1,1,B +B-C,B-C,1,2,Q +B-C,B-C,1,3,C +D-J,D-J,1,1,D +D-J,D-J,1,2,J +E-K,E-K,1,1,E +E-K,E-K,1,2,K +F-G,F-G,1,1,F +F-G,F-G,1,2,Z +F-G,F-G,1,3,G +H-M,H-M,1,1,H +H-M,H-M,1,2,L +H-M,H-M,1,3,M diff --git a/functional_tests/test_data/Less_than_all_gffs_run_expected/core_segments.csv.expected b/functional_tests/test_data/Less_than_all_gffs_run_expected/core_segments.csv.expected index 420f81a..5950ca0 100755 --- a/functional_tests/test_data/Less_than_all_gffs_run_expected/core_segments.csv.expected +++ b/functional_tests/test_data/Less_than_all_gffs_run_expected/core_segments.csv.expected @@ -1,4 +1,4 @@ Segment_name,Segment_position,Core_gene -B-C,1,B +B-C,1,B B-C,2,A B-C,3,C diff --git a/functional_tests/test_out_folder/Corekaburra.log b/functional_tests/test_out_folder/Corekaburra.log deleted file mode 100755 index 5d1b83b..0000000 --- a/functional_tests/test_out_folder/Corekaburra.log +++ /dev/null @@ -1,40 +0,0 @@ -[2022-01-25 16:53:46+1100] - INFO - __main__: command line: Corekaburra/__main__.py -ig functional_tests/test_data/complete_genome_double_chrom_2.gff functional_tests/test_data/complete_genome_double_chrom.gff -ip functional_tests/test_data/Coreless_contig_run/ -o functional_tests/test_out_folder/ -cg functional_tests/test_data/Complete_double_chromosomes.txt -[2022-01-25 16:53:46+1100] - INFO - __main__: 
-----------------------Processing started---------------------- - -[2022-01-25 16:53:47+1100] - INFO - __main__: Initial checks successful - -[2022-01-25 16:53:47+1100] - INFO - parse_gene_presence_absence: ------------Opening the gene presence/absence file------------ -Core genes must be found in 2 or more genomes -Low frequency genes must be found in less than 1 genomes - -[2022-01-25 16:53:47+1100] - INFO - parse_gene_presence_absence: A total of: -3 core gene clusters were identified -6 low frequency gene clusters were identified -0 intermediate accessory gene clusters were identified - -[2022-01-25 16:53:47+1100] - INFO - __main__: ------Start core region identification of given gff files----- - -[2022-01-25 16:53:47+1100] - INFO - __main__: 2 GFF files to process -[2022-01-25 16:53:47+1100] - INFO - __main__: GFF file #1 has been processed -[2022-01-25 16:53:47+1100] - INFO - __main__: GFF file #2 has been processed -[2022-01-25 16:55:11+1100] - INFO - __main__: command line: Corekaburra/__main__.py -ig functional_tests/test_data/complete_genome_double_chrom_2.gff functional_tests/test_data/complete_genome_double_chrom.gff -ip functional_tests/test_data/Coreless_contig_run/ -o functional_tests/test_out_folder/ -cg functional_tests/test_data/Complete_double_chromosomes.txt -[2022-01-25 16:55:11+1100] - INFO - __main__: -----------------------Processing started---------------------- - -[2022-01-25 16:55:11+1100] - INFO - __main__: Initial checks successful - -[2022-01-25 16:55:11+1100] - INFO - parse_gene_presence_absence: ------------Opening the gene presence/absence file------------ -Core genes must be found in 2 or more genomes -Low frequency genes must be found in less than 1 genomes - -[2022-01-25 16:55:11+1100] - INFO - parse_gene_presence_absence: A total of: -3 core gene clusters were identified -6 low frequency gene clusters were identified -0 intermediate accessory gene clusters were identified - -[2022-01-25 16:55:11+1100] - INFO - __main__: 
------Start core region identification of given gff files----- - -[2022-01-25 16:55:11+1100] - INFO - __main__: 2 GFF files to process -[2022-01-25 16:55:11+1100] - INFO - __main__: GFF file #1 has been processed -[2022-01-25 16:55:11+1100] - INFO - __main__: GFF file #2 has been processed diff --git a/functional_tests/test_out_folder/core_pair_summary.csv b/functional_tests/test_out_folder/core_pair_summary.csv deleted file mode 100755 index c62e46d..0000000 --- a/functional_tests/test_out_folder/core_pair_summary.csv +++ /dev/null @@ -1,4 +0,0 @@ -Core_pair,n,occurrence_core_1,occurrence_core_2,co_occurrence,min_dist,max_dist,mean_dist,median_dist,min_acc,max_acc,mean_acc,median_acc -A-B,2,2,2,2,9,9,9.0,9.0,0,0,0.0,0.0 -A-C,2,2,2,2,10,10,10.0,10.0,0,0,0.0,0.0 -B-C,2,2,2,2,9,9,9.0,9.0,0,0,0.0,0.0 diff --git a/functional_tests/test_out_folder/core_segments.csv b/functional_tests/test_out_folder/core_segments.csv deleted file mode 100755 index 4048d85..0000000 --- a/functional_tests/test_out_folder/core_segments.csv +++ /dev/null @@ -1 +0,0 @@ -Segment_name,Segment_position,Core_gene diff --git a/functional_tests/test_out_folder/coreless_contig_accessory_gene_content.tsv b/functional_tests/test_out_folder/coreless_contig_accessory_gene_content.tsv deleted file mode 100755 index 8fa9065..0000000 --- a/functional_tests/test_out_folder/coreless_contig_accessory_gene_content.tsv +++ /dev/null @@ -1,3 +0,0 @@ -Gff Contig Accessory_count Intermediate_cunt low_frequency_count -complete_genome_double_chrom contig_2 3 0 3 -complete_genome_double_chrom_2 contig_2 3 0 3 diff --git a/functional_tests/test_out_folder/low_frequency_gene_placement.tsv b/functional_tests/test_out_folder/low_frequency_gene_placement.tsv deleted file mode 100755 index 0e7126e..0000000 --- a/functional_tests/test_out_folder/low_frequency_gene_placement.tsv +++ /dev/null @@ -1,7 +0,0 @@ -Gff Core_gene_1 Core_gene_2 Core_region_size Core_region_accessory_count -complete_genome_double_chrom A B 9 0 
-complete_genome_double_chrom_2 A B 9 0 -complete_genome_double_chrom A C 10 0 -complete_genome_double_chrom_2 A C 10 0 -complete_genome_double_chrom B C 9 0 -complete_genome_double_chrom_2 B C 9 0 diff --git a/functional_tests/test_out_folder/no_accessory_core_segments.csv b/functional_tests/test_out_folder/no_accessory_core_segments.csv deleted file mode 100755 index 81f6eb1..0000000 --- a/functional_tests/test_out_folder/no_accessory_core_segments.csv +++ /dev/null @@ -1 +0,0 @@ -Parent_segment_name,Sub_segment_name,Parent_segment_position,Sub_segment_position,Core_gene From a927ca5d67bc6b38b4d48398eb20369ff783095a Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Wed, 1 Jun 2022 09:00:29 +1000 Subject: [PATCH 111/135] Point to correct expected files --- functional_tests/Corekaburra-test.sh | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/functional_tests/Corekaburra-test.sh b/functional_tests/Corekaburra-test.sh index e74c594..5f9b555 100755 --- a/functional_tests/Corekaburra-test.sh +++ b/functional_tests/Corekaburra-test.sh @@ -262,11 +262,11 @@ rm -r test_out_folder # TODO - Test that segments are produced for instances where a core gene may be left out. 
- not done paper graph 1 call_new_test "Test that segments are produced for instances where a core gene may be left out" Corekaburra -ip Less_than_all_core_complex/ -ig complete_genome_double_chrom_extra_large.gff complete_genome_double_chrom_2_larger.gff complete_genome_double_chrom_3_extra_large.gff -o test_out_folder/ -cc 0.9 > /dev/null 2>&1 -test_output_file test_out_folder/core_core_accessory_gene_content.tsv Less_than_all_core_complex/core_core_accessory_gene_content.expected -test_output_file test_out_folder/low_frequency_gene_placement.tsv Less_than_all_core_complex/low_frequency_gene_placement.expected -test_output_file test_out_folder/core_pair_summary.csv Less_than_all_core_complex/core_pair_summary.expected -test_output_file test_out_folder/core_segments.csv Less_than_all_core_complex/core_segments.expected -test_output_file test_out_folder/no_accessory_core_segments.csv Less_than_all_core_complex/no_accessory_core_segments.expected +test_output_file test_out_folder/core_core_accessory_gene_content.tsv Less_than_all_core_complex_expected/core_core_accessory_gene_content.expected +test_output_file test_out_folder/low_frequency_gene_placement.tsv Less_than_all_core_complex_expected/low_frequency_gene_placement.expected +test_output_file test_out_folder/core_pair_summary.csv Less_than_all_core_complex_expected/core_pair_summary.expected +test_output_file test_out_folder/core_segments.csv Less_than_all_core_complex_expected/core_segments.expected +test_output_file test_out_folder/no_accessory_core_segments.csv Less_than_all_core_complex_expected/no_accessory_core_segments.expected rm -r test_out_folder # TODO - Do functional test with GFF where reannotated in panaroo is called 'candidate_gene' instead of CDS. 
From 57e3a0e4acf4e9ac4be135c214f25312d5e68167 Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Wed, 1 Jun 2022 09:19:47 +1000 Subject: [PATCH 112/135] Add in functional test using a "candidate gene" annotation and gzipped input gff --- Corekaburra/gff_parser.py | 2 +- functional_tests/Corekaburra-test.sh | 25 ++++++++++++------ .../test_data/candidate_gene_1.gff | 7 +++++ .../test_data/candidate_gene_2.gff | 7 +++++ .../complete_genome_single_chrom.gff.gz | Bin 0 -> 184 bytes .../complete_genome_single_chrom_2.gff.gz | Bin 0 -> 194 bytes 6 files changed, 32 insertions(+), 9 deletions(-) create mode 100755 functional_tests/test_data/candidate_gene_1.gff create mode 100755 functional_tests/test_data/candidate_gene_2.gff create mode 100755 functional_tests/test_data/complete_genome_single_chrom.gff.gz create mode 100755 functional_tests/test_data/complete_genome_single_chrom_2.gff.gz diff --git a/Corekaburra/gff_parser.py b/Corekaburra/gff_parser.py index 2131a38..db010e6 100755 --- a/Corekaburra/gff_parser.py +++ b/Corekaburra/gff_parser.py @@ -30,7 +30,7 @@ def parse_gff(input_file): # FASTA found - close file and end loop open_file.close() break - if "#" not in line and 'CDS' in line: + if "#" not in line and ('CDS' in line or 'candidate_gene' in line): # Strip line for newline and split columns into list line = line.strip() line = line.split("\t") diff --git a/functional_tests/Corekaburra-test.sh b/functional_tests/Corekaburra-test.sh index 5f9b555..ef16702 100755 --- a/functional_tests/Corekaburra-test.sh +++ b/functional_tests/Corekaburra-test.sh @@ -217,6 +217,23 @@ test_output_file test_out_folder/low_frequency_gene_placement.tsv Simple_run_exp test_output_file test_out_folder/core_pair_summary.csv Simple_run_expected/core_pair_summary.csv.expected rm -r test_out_folder +# TODO - test gzipped input gffs - implemented - delete if successfull +call_new_test "Test complete genome with single contig and single complete genome 
among input" +Corekaburra -ig complete_genome_single_chrom.gff.gz complete_genome_single_chrom_2.gff.gz -ip Roray_run -o test_out_folder -cg Complete_single_chromosome.txt > /dev/null 2>&1 +test_output_file test_out_folder/core_core_accessory_gene_content.tsv Single_comple_chromosome_expected/core_core_accessory_gene_content.tsv.expected +test_output_file test_out_folder/low_frequency_gene_placement.tsv Single_comple_chromosome_expected/low_frequency_gene_placement.tsv.expected +test_output_file test_out_folder/core_pair_summary.csv Single_comple_chromosome_expected/core_pair_summary.csv.expected +rm -r test_out_folder + +# TODO - Do functional test with GFF where reannotated in panaroo is called 'candidate_gene' instead of CDS. - delete if successfull +call_new_test "Test complete genome with single contig and single complete genome among input" +Corekaburra -ig candidate_gene_1.gff candidate_gene_2.gff.gz -ip Roray_run -o test_out_folder -cg Complete_single_chromosome.txt > /dev/null 2>&1 +test_output_file test_out_folder/core_core_accessory_gene_content.tsv Single_comple_chromosome_expected/core_core_accessory_gene_content.tsv.expected +test_output_file test_out_folder/low_frequency_gene_placement.tsv Single_comple_chromosome_expected/low_frequency_gene_placement.tsv.expected +test_output_file test_out_folder/core_pair_summary.csv Single_comple_chromosome_expected/core_pair_summary.csv.expected +rm -r test_out_folder + + call_new_test "Test complete genome with single contig and single complete genome among input" Corekaburra -ig complete_genome_single_chrom.gff complete_genome_single_chrom_2.gff -ip Roray_run -o test_out_folder -cg Complete_single_chromosome.txt > /dev/null 2>&1 test_output_file test_out_folder/core_core_accessory_gene_content.tsv Single_comple_chromosome_expected/core_core_accessory_gene_content.tsv.expected @@ -247,7 +264,6 @@ test_output_file test_out_folder/core_segments.csv Rearrangement_run_expected/co test_output_file 
test_out_folder/no_accessory_core_segments.csv Rearrangement_run_expected/no_accessory_core_segments.csv.expected rm -r test_out_folder -# TODO - Test that segmnets can be identified with a core-cutoff that is less than all genomes. - not done paper graph 2 - implemented remove if successfull call_new_test "Test that segmnets can be identified with a core-cutoff that is less than all genomes" Corekaburra -ip Less_than_all_core_simple/ -ig complete_genome_double_chrom_extra_large.gff complete_genome_double_chrom_larger.gff complete_genome_double_chrom_3_extra_large.gff -o test_out_folder/ -cc 0.9 > /dev/null 2>&1 test_output_file test_out_folder/core_core_accessory_gene_content.tsv Less_than_all_core_simple_expected/core_core_accessory_gene_content.expected @@ -258,8 +274,6 @@ test_output_file test_out_folder/no_accessory_core_segments.csv Less_than_all_co rm -r test_out_folder - -# TODO - Test that segments are produced for instances where a core gene may be left out. - not done paper graph 1 call_new_test "Test that segments are produced for instances where a core gene may be left out" Corekaburra -ip Less_than_all_core_complex/ -ig complete_genome_double_chrom_extra_large.gff complete_genome_double_chrom_2_larger.gff complete_genome_double_chrom_3_extra_large.gff -o test_out_folder/ -cc 0.9 > /dev/null 2>&1 test_output_file test_out_folder/core_core_accessory_gene_content.tsv Less_than_all_core_complex_expected/core_core_accessory_gene_content.expected @@ -269,12 +283,7 @@ test_output_file test_out_folder/core_segments.csv Less_than_all_core_complex_ex test_output_file test_out_folder/no_accessory_core_segments.csv Less_than_all_core_complex_expected/no_accessory_core_segments.expected rm -r test_out_folder -# TODO - Do functional test with GFF where reannotated in panaroo is called 'candidate_gene' instead of CDS. - -# TODO - test gzipped input gffs - -# TODO - Test using two complete chromosomes (complete genome) when >1 contig test. 
- implemented remove if successful call_new_test "Test using two complete chromosomes (complete genome) when >1 contig test" Corekaburra -ip Multiple_component_graph/ -ig complete_genome_double_chrom_larger.gff complete_genome_double_chrom_2_larger.gff complete_genome_double_chrom_3_larger.gff -o test_out_folder/ -cc 0.9 -cg complete_larger_double_chr_genome_list.txt > /dev/null 2>&1 test_output_file test_out_folder/core_core_accessory_gene_content.tsv Multiple_component_graph_complete_expected/core_core_accessory_gene_content.tsv.expected diff --git a/functional_tests/test_data/candidate_gene_1.gff b/functional_tests/test_data/candidate_gene_1.gff new file mode 100755 index 0000000..05041c8 --- /dev/null +++ b/functional_tests/test_data/candidate_gene_1.gff @@ -0,0 +1,7 @@ +##gff-version3 +contig_1 . candidate_gene 1 90 . . . ID=single_comp_A;Other_info;locus_tag=locus_tag_0001 +contig_1 . candidate_gene 100 190 . . . ID=single_comp_B;Other_info;locus_tag=locus_tag_0002 +contig_1 . candidate_gene 200 290 . . . ID=single_comp_C;Other_info;locus_tag=locus_tag_0003 +##FASTA +>contig_1 +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNATAGT \ No newline at end of file diff --git a/functional_tests/test_data/candidate_gene_2.gff b/functional_tests/test_data/candidate_gene_2.gff new file mode 100755 index 0000000..d2763c2 --- /dev/null +++ b/functional_tests/test_data/candidate_gene_2.gff @@ -0,0 +1,7 @@ +##gff-version3 +contig_1 . candidate_gene 1 90 . . . ID=single_comp_2_A;Other_info;locus_tag=locus_tag_0001 +contig_1 . candidate_gene 100 190 . . . ID=single_comp_2_B;Other_info;locus_tag=locus_tag_0002 +contig_1 . candidate_gene 200 290 . . . 
ID=single_comp_2_C;Other_info;locus_tag=locus_tag_0003 +##FASTA +>contig_1 +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCCCCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTCATA \ No newline at end of file diff --git a/functional_tests/test_data/complete_genome_single_chrom.gff.gz b/functional_tests/test_data/complete_genome_single_chrom.gff.gz new file mode 100755 index 0000000000000000000000000000000000000000..1b2a5c69ec66b1445efaf474568d4a31573093c4 GIT binary patch literal 184 zcmV;p07w5HiwFqOS(IV`17mM(aBO9CWnX7yZf|X6Uvp`0XKZC(V`y@3Z7yeKW&l%G zPESkIElVvb&dkp<=1R`bE6GfcH{{ggban~mG~~2205U<))5W$pGcP?SH9k2%w;6XCqKEgIKCt?-4@P@H!v_T#4yglfYT6{iB3eAXoPN}5zs^v6?>vw|0ssJVD@thq literal 0 HcmV?d00001 diff --git a/functional_tests/test_data/complete_genome_single_chrom_2.gff.gz b/functional_tests/test_data/complete_genome_single_chrom_2.gff.gz new file mode 100755 index 0000000000000000000000000000000000000000..11b983cf258ed47f705d581ae7513b394423708b GIT binary patch literal 194 zcmV;z06qU7iwFqOS(IV`17mM(aBO9CWnX7yZf|X6Uvp`0XKZC(V`y@3ZC^4jXJ%#q zQ&vt-OVce&Eh^5;&okyq&d)2!OpiC@)Z=t^3Fb89v@`%RLD191wm35{Jts9jIX|}` z-YDMD+P@?twJ1I_FD>6XCqKEgIKCt?-4@P@H!v_T#4ymnfYT7SnNCEQX@qX35ztH{ w+-5oxVWu&cva*|FaEK$99n5epKk5NzAfTdgg9?N=JBBy{0P@Q^MMeSu0H|wFA^-pY literal 0 HcmV?d00001 From 3f2937beabb6bea0057047c1a5b71be16e20f720 Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Wed, 1 Jun 2022 09:27:53 +1000 Subject: [PATCH 113/135] Add in changes to accept "candidate_gene" annotations and unit test for reading a gff with this type in it --- functional_tests/Corekaburra-test.sh | 10 +++++++++- unit_tests/Corekaburra_test.py | 14 ++++++++------ .../Silas_the_Salmonella_corrected.gff | 2 +- 3 files changed, 18 insertions(+), 8 deletions(-) diff --git 
a/functional_tests/Corekaburra-test.sh b/functional_tests/Corekaburra-test.sh index ef16702..b1b9509 100755 --- a/functional_tests/Corekaburra-test.sh +++ b/functional_tests/Corekaburra-test.sh @@ -218,13 +218,21 @@ test_output_file test_out_folder/core_pair_summary.csv Simple_run_expected/core_ rm -r test_out_folder # TODO - test gzipped input gffs - implemented - delete if successfull -call_new_test "Test complete genome with single contig and single complete genome among input" +call_new_test "test gzipped input gffs" Corekaburra -ig complete_genome_single_chrom.gff.gz complete_genome_single_chrom_2.gff.gz -ip Roray_run -o test_out_folder -cg Complete_single_chromosome.txt > /dev/null 2>&1 test_output_file test_out_folder/core_core_accessory_gene_content.tsv Single_comple_chromosome_expected/core_core_accessory_gene_content.tsv.expected test_output_file test_out_folder/low_frequency_gene_placement.tsv Single_comple_chromosome_expected/low_frequency_gene_placement.tsv.expected test_output_file test_out_folder/core_pair_summary.csv Single_comple_chromosome_expected/core_pair_summary.csv.expected rm -r test_out_folder +# TODO - test mixed gzipped and non-gzipped input gffs - implemented - delete if successfull +call_new_test "test mixed gzipped and non-gzipped input gffs" +Corekaburra -ig complete_genome_single_chrom.gff.gz complete_genome_single_chrom_2.gff -ip Roray_run -o test_out_folder -cg Complete_single_chromosome.txt > /dev/null 2>&1 +test_output_file test_out_folder/core_core_accessory_gene_content.tsv Single_comple_chromosome_expected/core_core_accessory_gene_content.tsv.expected +test_output_file test_out_folder/low_frequency_gene_placement.tsv Single_comple_chromosome_expected/low_frequency_gene_placement.tsv.expected +test_output_file test_out_folder/core_pair_summary.csv Single_comple_chromosome_expected/core_pair_summary.csv.expected +rm -r test_out_folder + # TODO - Do functional test with GFF where reannotated in panaroo is called 'candidate_gene' 
instead of CDS. - delete if successfull call_new_test "Test complete genome with single contig and single complete genome among input" Corekaburra -ig candidate_gene_1.gff candidate_gene_2.gff.gz -ip Roray_run -o test_out_folder -cg Complete_single_chromosome.txt > /dev/null 2>&1 diff --git a/unit_tests/Corekaburra_test.py b/unit_tests/Corekaburra_test.py index 52cf9ce..9ec78e3 100755 --- a/unit_tests/Corekaburra_test.py +++ b/unit_tests/Corekaburra_test.py @@ -30,6 +30,7 @@ except FileNotFoundError: os.chdir('unit_test_data/') + class TestCutOffViolations(unittest.TestCase): """ Test for the function that examines the cutoffs given for core and low-frequency genes""" @classmethod @@ -839,7 +840,7 @@ def test_read_file(self): self.assertEqual(expected_dict, return_dict) - +# TODO - likely obsolete after addition of Panaroo functions class TestPrepairForReannotation(unittest.TestCase): """ Test for pre-pairing a folder for corrected genomes, and testing if any are present from previous runs """ @classmethod @@ -890,7 +891,7 @@ def test_some_files_annotated(self): # # self.assertEqual(expected_gffs, corrected_files_return) - +# TODO - likely obsolete after addition of Panaroo functions class TestAddGeneToGff(unittest.TestCase): """ Test of the function used to add a gene annotation (line) to a gff file @@ -973,7 +974,7 @@ def test_adding_a_gene_name_and_annotation(self): with open('TestAddGeneToGff/mocky_test_gff.gff', 'r') as added_gff: self.assertEqual(expected_lines, added_gff.readlines()) - +# TODO - likely obsolete after addition of Panaroo functions class TestWriteContig(unittest.TestCase): """ Test of the function used to write a contig in a gff file. 
@@ -1003,7 +1004,7 @@ def test_writing_a_contig(self): with open('TestWriteContig/mocky_test_gff.gff', 'r') as added_gff: self.assertEqual(expected_lines, added_gff.readlines()) - +# TODO - likely obsolete after addition of Panaroo functions class TestAnnotateRefoundGenomes(unittest.TestCase): """ Test of the function used to reannotate refound genes identified by panaroo in a gff file. @@ -1098,6 +1099,7 @@ def test_gene_not_found(self): with self.assertRaises(SystemExit): correct_gffs.annotate_refound_genes(gff_name, gene_data_dict, tmp_folder_path, corrected_gff_out_dir, self.logger) + class TestExtractGenomeFasta(unittest.TestCase): def test_extract_genome_fasta(self): genome_fasta_dict_expected = {'contig} @@ -1153,7 +1155,7 @@ def test_gff_generator_generation_gzipped_input(self): for expected, generated in zip(expected_output, return_generator): self.assertEqual(expected, generated) - def test_gff_generator_generation_corrected_gff(self): + def test_gff_generator_generation_panaroo_produced_gff(self): input_gff_file = 'TestParsingGffFile/Silas_the_Salmonella_corrected.gff' expected_output = [['contig_1', '.', 'CDS', '1', '90', '.', '.', '.', 'Silas_the_Salmonella_tag-1-1'], @@ -1165,7 +1167,7 @@ def test_gff_generator_generation_corrected_gff(self): ['contig_1', '.', 'CDS', '600', '690', '.', '.', '.', 'Silas_the_Salmonella_tag-1-5.1'], ['contig_1', '.', 'CDS', '700', '790', '.', '.', '.', 'Silas_the_Salmonella_tag-1.7'], ['contig_1', '.', 'CDS', '800', '890', '.', '.', '.', "Silas_the_Salmonella_tag-1-5.2"], - ['contig_1', 'Panaroo', 'CDS', '900', '1000', '.', '+', '0', 'refound_gene_1']] + ['contig_1', '.', 'candidate_gene', '900', '1000', '.', '+', '0', 'refound_gene_1']] return_generator = [] for line in gff_parser.parse_gff(input_gff_file): diff --git a/unit_tests/unit_test_data/TestParsingGffFile/Silas_the_Salmonella_corrected.gff b/unit_tests/unit_test_data/TestParsingGffFile/Silas_the_Salmonella_corrected.gff index 0979504..831b7ef 100755 --- 
a/unit_tests/unit_test_data/TestParsingGffFile/Silas_the_Salmonella_corrected.gff +++ b/unit_tests/unit_test_data/TestParsingGffFile/Silas_the_Salmonella_corrected.gff @@ -7,7 +7,7 @@ contig_1 . CDS 500 590 . . . ID=Silas_the_Salmonella_tag-1-4.2;locus_tag=Silas_t contig_1 . CDS 600 690 . . . ID=Silas_the_Salmonella_tag-1-5.1;locus_tag=Silas_the_Salmonella_tag-1-5.1 contig_1 . CDS 700 790 . . . ID=Silas_the_Salmonella_tag-1.7;locus_tag=Silas_the_Salmonella_tag-1.7 contig_1 . CDS 800 890 . . . ID=Silas_the_Salmonella_tag-1-5.2;locus_tag=Silas_the_Salmonella_tag-1-5.2 -contig_1 Panaroo CDS 900 1000 . + 0 ID=Silas_the_Salmonella_tag-1000;annotaitons=CureToCancer;locus_tag=Silas_the_Salmonella_tag-1000;old_locus_tag=refound_gene_1 +contig_1 . candidate_gene 900 1000 . + 0 ID=refound_gene_1;annotaitons=CureToCancer;locus_tag=refound_gene_1 ##FASTA >contigrom e12f25982296efd851b372a3c55c8f2aadf40af8 Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Wed, 1 Jun 2022 09:31:39 +1000 Subject: [PATCH 114/135] Point to correct output folder --- functional_tests/Corekaburra-test.sh | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/functional_tests/Corekaburra-test.sh b/functional_tests/Corekaburra-test.sh index b1b9509..2c2752a 100755 --- a/functional_tests/Corekaburra-test.sh +++ b/functional_tests/Corekaburra-test.sh @@ -220,25 +220,25 @@ rm -r test_out_folder # TODO - test gzipped input gffs - implemented - delete if successfull call_new_test "test gzipped input gffs" Corekaburra -ig complete_genome_single_chrom.gff.gz complete_genome_single_chrom_2.gff.gz -ip Roray_run -o test_out_folder -cg Complete_single_chromosome.txt > /dev/null 2>&1 -test_output_file test_out_folder/core_core_accessory_gene_content.tsv Single_comple_chromosome_expected/core_core_accessory_gene_content.tsv.expected -test_output_file test_out_folder/low_frequency_gene_placement.tsv 
Single_comple_chromosome_expected/low_frequency_gene_placement.tsv.expected -test_output_file test_out_folder/core_pair_summary.csv Single_comple_chromosome_expected/core_pair_summary.csv.expected +test_output_file test_out_folder/core_core_accessory_gene_content.tsv Simple_run_expected/core_core_accessory_gene_content.tsv.expected +test_output_file test_out_folder/low_frequency_gene_placement.tsv Simple_run_expected/low_frequency_gene_placement.tsv.expected +test_output_file test_out_folder/core_pair_summary.csv Simple_run_expected/core_pair_summary.csv.expected rm -r test_out_folder # TODO - test mixed gzipped and non-gzipped input gffs - implemented - delete if successfull call_new_test "test mixed gzipped and non-gzipped input gffs" Corekaburra -ig complete_genome_single_chrom.gff.gz complete_genome_single_chrom_2.gff -ip Roray_run -o test_out_folder -cg Complete_single_chromosome.txt > /dev/null 2>&1 -test_output_file test_out_folder/core_core_accessory_gene_content.tsv Single_comple_chromosome_expected/core_core_accessory_gene_content.tsv.expected -test_output_file test_out_folder/low_frequency_gene_placement.tsv Single_comple_chromosome_expected/low_frequency_gene_placement.tsv.expected -test_output_file test_out_folder/core_pair_summary.csv Single_comple_chromosome_expected/core_pair_summary.csv.expected +test_output_file test_out_folder/core_core_accessory_gene_content.tsv Simple_run_expected/core_core_accessory_gene_content.tsv.expected +test_output_file test_out_folder/low_frequency_gene_placement.tsv Simple_run_expected/low_frequency_gene_placement.tsv.expected +test_output_file test_out_folder/core_pair_summary.csv Simple_run_expected/core_pair_summary.csv.expected rm -r test_out_folder # TODO - Do functional test with GFF where reannotated in panaroo is called 'candidate_gene' instead of CDS. 
- delete if successfull call_new_test "Test complete genome with single contig and single complete genome among input" Corekaburra -ig candidate_gene_1.gff candidate_gene_2.gff.gz -ip Roray_run -o test_out_folder -cg Complete_single_chromosome.txt > /dev/null 2>&1 -test_output_file test_out_folder/core_core_accessory_gene_content.tsv Single_comple_chromosome_expected/core_core_accessory_gene_content.tsv.expected -test_output_file test_out_folder/low_frequency_gene_placement.tsv Single_comple_chromosome_expected/low_frequency_gene_placement.tsv.expected -test_output_file test_out_folder/core_pair_summary.csv Single_comple_chromosome_expected/core_pair_summary.csv.expected +test_output_file test_out_folder/core_core_accessory_gene_content.tsv Simple_run_expected/core_core_accessory_gene_content.tsv.expected +test_output_file test_out_folder/low_frequency_gene_placement.tsv Simple_run_expected/low_frequency_gene_placement.tsv.expected +test_output_file test_out_folder/core_pair_summary.csv Simple_run_expected/core_pair_summary.csv.expected rm -r test_out_folder From cd09d9ad9068191f67b8562e7562b98514c49304 Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Wed, 1 Jun 2022 11:06:19 +1000 Subject: [PATCH 115/135] Add functionallity to make gene_presence_absence and gff match nicely --- Corekaburra/check_inputs.py | 3 ++- Corekaburra/parse_gene_presence_absence.py | 5 ----- unit_tests/Corekaburra_test.py | 16 ++++++++++++++++ 3 files changed, 18 insertions(+), 6 deletions(-) diff --git a/Corekaburra/check_inputs.py b/Corekaburra/check_inputs.py index 5035cfb..bd07e87 100755 --- a/Corekaburra/check_inputs.py +++ b/Corekaburra/check_inputs.py @@ -93,7 +93,8 @@ def check_gff_in_pan(file_list, gene_presence_absence_path, logger): genome_names = [name.replace('"', '') for name in genome_names] file_list = [os.path.basename(file) for file in file_list] - file_list_no_suffix = [file.rstrip('.gff') for file in file_list] + file_list_no_suffix 
= [file.rstrip('.gz') for file in file_list] + file_list_no_suffix = [file.rstrip('.gff') for file in file_list_no_suffix] # Check if all or subset of GFFs from pan genome have been supplied, # if only a subset then raise warning diff --git a/Corekaburra/parse_gene_presence_absence.py b/Corekaburra/parse_gene_presence_absence.py index 927621d..d06d808 100755 --- a/Corekaburra/parse_gene_presence_absence.py +++ b/Corekaburra/parse_gene_presence_absence.py @@ -265,11 +265,6 @@ def read_gene_presence_absence(pres_abs_file, core_gene_presence, low_freq_gene, f"{low_freq_gene_number} low frequency gene clusters were identified\n" f"{acc_gene_number} intermediate accessory gene clusters were identified\n") - # Remove gff databases - # files_in_tmp = os.listdir(tmp_folder_path) - # gff_dbs = [file for file in files_in_tmp if '_db' in file] - # [os.remove(os.path.join(tmp_folder_path, db)) for db in gff_dbs] - return core_gene_dict, low_freq_gene_dict, acc_gene_dict diff --git a/unit_tests/Corekaburra_test.py b/unit_tests/Corekaburra_test.py index 9ec78e3..34f77c3 100755 --- a/unit_tests/Corekaburra_test.py +++ b/unit_tests/Corekaburra_test.py @@ -200,6 +200,22 @@ def test_input_gff_pres_abs_some_file_not_in_pan(self): with self.assertRaises(SystemExit): check_inputs.check_gff_in_pan(input_file_list, input_pres_abs, self.logger) + def test_input_gff_pres_abs_pairing_all_gffs_gzipped(self): + input_pres_abs = 'TestPresenceOfGffsInPresAbsFile/gene_presence_absence_roary.csv' + input_file_list = ['Silas_the_Salmonella.gff.gz', 'Christina_the_Streptococcus.gff.gz', 'Ajwa_the_Shigella.gff.gz'] + + return_bool = check_inputs.check_gff_in_pan(input_file_list, input_pres_abs, self.logger) + + self.assertEqual(return_bool, True) + + def test_input_gff_pres_abs_pairing_all_gffs_mixed_gzipped(self): + input_pres_abs = 'TestPresenceOfGffsInPresAbsFile/gene_presence_absence_roary.csv' + input_file_list = ['Silas_the_Salmonella', 'Christina_the_Streptococcus.gff', 
'Ajwa_the_Shigella.gff.gz'] + + return_bool = check_inputs.check_gff_in_pan(input_file_list, input_pres_abs, self.logger) + + self.assertEqual(return_bool, True) + class TestAddingGeneToDict(unittest.TestCase): """ From 9ae5e2cd4e2e6536e16fc96f7a8b72dd81ec802a Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Wed, 1 Jun 2022 13:13:11 +1000 Subject: [PATCH 116/135] Add in differential reading of file depending on gzipped state --- Corekaburra/gff_parser.py | 75 +++++++++++++++++++--------------- unit_tests/Corekaburra_test.py | 45 ++++++++++++++++++++ 2 files changed, 86 insertions(+), 34 deletions(-) diff --git a/Corekaburra/gff_parser.py b/Corekaburra/gff_parser.py index db010e6..f54172b 100755 --- a/Corekaburra/gff_parser.py +++ b/Corekaburra/gff_parser.py @@ -7,6 +7,23 @@ from correct_gffs import annotate_refound_genes +def open_file_generator(input_file_path): + # Open input file as if it was gzipped + try: + with gzip.open(input_file_path, 'rt') as open_file: + # Test if gzipped by reading line + open_file.readline() + + for line in open_file: + yield line + + except (OSError, gzip.BadGzipFile): + # Open input as if normal + with open(input_file_path, 'r') as open_file: + for line in open_file: + yield line + + def parse_gff(input_file): """ Try to read a GFF file as gzipped then normal @@ -15,20 +32,12 @@ def parse_gff(input_file): :param input_file: File-path to a given gff file to be processed :return: line from generator object returning CDS from a gff file """ + file_generator = open_file_generator(input_file) - # Open input file as if it was gzipped - open_file = gzip.open(input_file, 'rt') - try: - # Test if gzipped by reading line - open_file.readline() - except (OSError, gzip.BadGzipFile): - # Open inout as if normal - open_file = open(input_file, 'r') - - for line in open_file: + for line in file_generator: if "##FASTA" in line: - # FASTA found - close file and end loop - open_file.close() + # FASTA found - end 
loop + file_generator.close() break if "#" not in line and ('CDS' in line or 'candidate_gene' in line): # Strip line for newline and split columns into list @@ -61,29 +70,28 @@ def get_contig_lengths(input_file): contig_size_dir = {} # Open the given gff file, find the fasta section of the file and count the length of each contig. - with open(input_file, 'r', ) as gff_file: - for line in gff_file: - if fasta_reached and '>' not in line: - contig_size += len(line.rstrip()) - if fasta_reached and '>' in line: - if contig_size > 0: - # Record the previous contig - if contig_name not in contig_size_dir: - contig_size_dir[contig_name] = contig_size - else: - raise ValueError(f"contig name: {contig_name}, in file {input_file} is duplicated! Please fix this") - - # Set the contig name to the next contig - contig_name = line.strip().split(' ')[0].replace('>', '') - contig_size = 0 + for line in open_file_generator(input_file): + if fasta_reached and '>' not in line: + contig_size += len(line.rstrip()) + if fasta_reached and '>' in line: + if contig_size > 0: + # Record the previous contig + if contig_name not in contig_size_dir: + contig_size_dir[contig_name] = contig_size else: - contig_name = line.strip().split(' ')[0].replace('>', '') + raise ValueError(f"contig name: {contig_name}, in file {input_file} is duplicated! 
Please fix this") - if "##FASTA" in line: - fasta_reached = True + # Set the contig name to the next contig + contig_name = line.strip().split(' ')[0].replace('>', '') + contig_size = 0 + else: + contig_name = line.strip().split(' ')[0].replace('>', '') - # Record last contig - contig_size_dir[contig_name] = contig_size + if "##FASTA" in line: + fasta_reached = True + + # Record last contig + contig_size_dir[contig_name] = contig_size return contig_size_dir @@ -304,7 +312,6 @@ def segment_gff_content(gff_generator, core_genes, low_freq_genes, gff_path, acc :return master_info: A dict of multiple pieces of info for each core gene pair (Gff file, core gene 1, core gene 2, distnace between them, genes between them, list of accesspry genes, list of low-frequency genes) :return coreless_contigs: Dict of contigs found to not encode any core genes on them. The accessory and low-frequency genes are recorded. """ - # Initialize data structures to be returned core_gene_pairs = [] core_gene_pair_distance = {} @@ -317,7 +324,7 @@ def segment_gff_content(gff_generator, core_genes, low_freq_genes, gff_path, acc if complete_genomes is None: complete_genome = False else: - if os.path.basename(gff_path).replace('.gff', '').replace('_corrected', '') in complete_genomes: + if os.path.basename(gff_path).replace('.gz', '').replace('.gff', '') in complete_genomes: complete_genome = True else: complete_genome = False diff --git a/unit_tests/Corekaburra_test.py b/unit_tests/Corekaburra_test.py index 34f77c3..188176d 100755 --- a/unit_tests/Corekaburra_test.py +++ b/unit_tests/Corekaburra_test.py @@ -1116,6 +1116,43 @@ def test_gene_not_found(self): correct_gffs.annotate_refound_genes(gff_name, gene_data_dict, tmp_folder_path, corrected_gff_out_dir, self.logger) +class TestOpeningFileToGenerator(unittest.TestCase): + def test_opening_refular_file(self): + input_file_path = 'TestOpeningFileToGenerator/test_text_file.txt.gz' + + expected_output = 
['contig_1\t.\tCDS\t1\t90\t.\t.\t.\tID=Silas_the_Salmonella_tag-1-1;locus_tag=Silas_the_Salmonella_tag-1-1\n', + '##FASTA\n', + '>contig_1\n', + 'NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN\n', + 'NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN\n'] + + return_generator = gff_parser.open_file_generator(input_file_path) + + given_output = [] + for line in return_generator: + given_output.append(line) + + self.assertEqual(expected_output, given_output) + + def test_opening_gzipped(self): + input_file_path = 'TestOpeningFileToGenerator/test_text_file.txt' + + expected_output = ['##gff version\n', + 'contig_1\t.\tCDS\t1\t90\t.\t.\t.\tID=Silas_the_Salmonella_tag-1-1;locus_tag=Silas_the_Salmonella_tag-1-1\n', + '##FASTA\n', + '>contig_1\n', + 'NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN\n', + 'NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN\n'] + + return_generator = gff_parser.open_file_generator(input_file_path) + + given_output = [] + for line in return_generator: + given_output.append(line) + + self.assertEqual(expected_output, given_output) + + class TestExtractGenomeFasta(unittest.TestCase): def test_extract_genome_fasta(self): genome_fasta_dict_expected = {'contig} @@ -1231,6 +1268,14 @@ def test_multiple_wrapped_contigs(self): self.assertEqual(expected_dict, return_dict) + def test_multiple_wrapped_contigs_gz(self): + input_gff_path = 'TestGetContigLenth/multi_contig_wrapped.txt.gz' + expected_dict = {'contig_1': 1300, + 'contig_2': 1300} + + return_dict = gff_parser.get_contig_lengths(input_gff_path) + + self.assertEqual(expected_dict, return_dict) class TestRecordCoreCoreRegion(unittest.TestCase): """ From c8bcc32e03831e4a8b615c8c5ec54ae7ce0a825b Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Wed, 1 Jun 2022 13:15:30 +1000 Subject: [PATCH 117/135] Actually add in unit test 
data.... --- .../multi_contig_wrapped.txt.gz | Bin 0 -> 173 bytes .../TestOpeningFileToGenerator/test_text_file.txt | 6 ++++++ .../test_text_file.txt.gz | Bin 0 -> 137 bytes 3 files changed, 6 insertions(+) create mode 100755 unit_tests/unit_test_data/TestGetContigLenth/multi_contig_wrapped.txt.gz create mode 100755 unit_tests/unit_test_data/TestOpeningFileToGenerator/test_text_file.txt create mode 100755 unit_tests/unit_test_data/TestOpeningFileToGenerator/test_text_file.txt.gz diff --git a/unit_tests/unit_test_data/TestGetContigLenth/multi_contig_wrapped.txt.gz b/unit_tests/unit_test_data/TestGetContigLenth/multi_contig_wrapped.txt.gz new file mode 100755 index 0000000000000000000000000000000000000000..e165f5550bf4272734c43fcfc75fe933bcc7cbe8 GIT binary patch literal 173 zcmV;e08;-SiwFqOS(IV`18sF|bZK8>Z*FvHXJ2=6VQ_F|WG-}gbO1}v&nwAHk2mDh z<8*cj<}~EAGypO|(9^{>I5Q`)IKCt!H9j~oCpSMYH76%Az9cbS*HG8cIwwE5v>3$0 ztBgxo+08LH#F5J`8DoQWH1q7J!J`IK$6$DoX+*0c b6Jgb;$)hHZnmlUqP&XL>?(*h1&k6tlB^yaP literal 0 HcmV?d00001 diff --git a/unit_tests/unit_test_data/TestOpeningFileToGenerator/test_text_file.txt b/unit_tests/unit_test_data/TestOpeningFileToGenerator/test_text_file.txt new file mode 100755 index 0000000..39d033a --- /dev/null +++ b/unit_tests/unit_test_data/TestOpeningFileToGenerator/test_text_file.txt @@ -0,0 +1,6 @@ +##gff version +contig_1 . CDS 1 90 . . . 
ID=Silas_the_Salmonella_tag-1-1;locus_tag=Silas_the_Salmonella_tag-1-1 +##FASTA +>contig_1 +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN diff --git a/unit_tests/unit_test_data/TestOpeningFileToGenerator/test_text_file.txt.gz b/unit_tests/unit_test_data/TestOpeningFileToGenerator/test_text_file.txt.gz new file mode 100755 index 0000000000000000000000000000000000000000..1303f090e119d9f2fce01d7081a7cea380d2d6d8 GIT binary patch literal 137 zcmV;40CxW$iwFpf)0ScY19W9`bYFC3cywQ8X>4UKba-?CQ&vt-OH(LIEh^5;&*MtY z&nwAHk2mDh<8*cj<}~EAGypO|(9^{>I5Q`)IKCt!H9j~oCpSMYH76%Az9cbS*HG8c rIwwE5v>3$0tBgxo+08LH#F5JmW)qhml>yZagc=I~DlS*bDFFZg?%y@U literal 0 HcmV?d00001 From 791ff3cef6fb48e73918a37e5c0a93c89f7aac63 Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Wed, 1 Jun 2022 13:18:28 +1000 Subject: [PATCH 118/135] Simplify commands for functional test --- functional_tests/Corekaburra-test.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/functional_tests/Corekaburra-test.sh b/functional_tests/Corekaburra-test.sh index 2c2752a..71bfcf4 100755 --- a/functional_tests/Corekaburra-test.sh +++ b/functional_tests/Corekaburra-test.sh @@ -219,7 +219,7 @@ rm -r test_out_folder # TODO - test gzipped input gffs - implemented - delete if successfull call_new_test "test gzipped input gffs" -Corekaburra -ig complete_genome_single_chrom.gff.gz complete_genome_single_chrom_2.gff.gz -ip Roray_run -o test_out_folder -cg Complete_single_chromosome.txt > /dev/null 2>&1 +Corekaburra -ig complete_genome_single_chrom.gff.gz complete_genome_single_chrom_2.gff.gz -ip Roray_run -o test_out_folder > /dev/null 2>&1 test_output_file test_out_folder/core_core_accessory_gene_content.tsv Simple_run_expected/core_core_accessory_gene_content.tsv.expected test_output_file test_out_folder/low_frequency_gene_placement.tsv 
Simple_run_expected/low_frequency_gene_placement.tsv.expected test_output_file test_out_folder/core_pair_summary.csv Simple_run_expected/core_pair_summary.csv.expected @@ -227,7 +227,7 @@ rm -r test_out_folder # TODO - test mixed gzipped and non-gzipped input gffs - implemented - delete if successfull call_new_test "test mixed gzipped and non-gzipped input gffs" -Corekaburra -ig complete_genome_single_chrom.gff.gz complete_genome_single_chrom_2.gff -ip Roray_run -o test_out_folder -cg Complete_single_chromosome.txt > /dev/null 2>&1 +Corekaburra -ig complete_genome_single_chrom.gff.gz complete_genome_single_chrom_2.gff -ip Roray_run -o test_out_folder > /dev/null 2>&1 test_output_file test_out_folder/core_core_accessory_gene_content.tsv Simple_run_expected/core_core_accessory_gene_content.tsv.expected test_output_file test_out_folder/low_frequency_gene_placement.tsv Simple_run_expected/low_frequency_gene_placement.tsv.expected test_output_file test_out_folder/core_pair_summary.csv Simple_run_expected/core_pair_summary.csv.expected @@ -235,7 +235,7 @@ rm -r test_out_folder # TODO - Do functional test with GFF where reannotated in panaroo is called 'candidate_gene' instead of CDS. 
- delete if successfull call_new_test "Test complete genome with single contig and single complete genome among input" -Corekaburra -ig candidate_gene_1.gff candidate_gene_2.gff.gz -ip Roray_run -o test_out_folder -cg Complete_single_chromosome.txt > /dev/null 2>&1 +Corekaburra -ig candidate_gene_1.gff candidate_gene_2.gff.gz -ip Roray_run -o test_out_folder > /dev/null 2>&1 test_output_file test_out_folder/core_core_accessory_gene_content.tsv Simple_run_expected/core_core_accessory_gene_content.tsv.expected test_output_file test_out_folder/low_frequency_gene_placement.tsv Simple_run_expected/low_frequency_gene_placement.tsv.expected test_output_file test_out_folder/core_pair_summary.csv Simple_run_expected/core_pair_summary.csv.expected From 109b06632c9ca36bcee5a40f01ce5888740aadfc Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Wed, 1 Jun 2022 13:27:08 +1000 Subject: [PATCH 119/135] add in functional expected output for candidat_gene test --- functional_tests/Corekaburra-test.sh | 8 ++++---- .../candidate_gene_pan_input/gene_presence_absence.csv | 4 ++++ .../core_core_accessory_gene_content.expected | 1 + .../core_pair_summary.expected | 5 +++++ .../low_frequency_gene_placement.expected | 9 +++++++++ 5 files changed, 23 insertions(+), 4 deletions(-) create mode 100755 functional_tests/test_data/candidate_gene_pan_input/gene_presence_absence.csv create mode 100644 functional_tests/test_data/candidate_gene_pan_input_expected/core_core_accessory_gene_content.expected create mode 100644 functional_tests/test_data/candidate_gene_pan_input_expected/core_pair_summary.expected create mode 100644 functional_tests/test_data/candidate_gene_pan_input_expected/low_frequency_gene_placement.expected diff --git a/functional_tests/Corekaburra-test.sh b/functional_tests/Corekaburra-test.sh index 71bfcf4..e9d804e 100755 --- a/functional_tests/Corekaburra-test.sh +++ b/functional_tests/Corekaburra-test.sh @@ -234,11 +234,11 @@ test_output_file 
test_out_folder/core_pair_summary.csv Simple_run_expected/core_ rm -r test_out_folder # TODO - Do functional test with GFF where reannotated in panaroo is called 'candidate_gene' instead of CDS. - delete if successfull -call_new_test "Test complete genome with single contig and single complete genome among input" +call_new_test "Test with GFF where reannotated in panaroo is called 'candidate_gene' instead of CDS" Corekaburra -ig candidate_gene_1.gff candidate_gene_2.gff.gz -ip Roray_run -o test_out_folder > /dev/null 2>&1 -test_output_file test_out_folder/core_core_accessory_gene_content.tsv Simple_run_expected/core_core_accessory_gene_content.tsv.expected -test_output_file test_out_folder/low_frequency_gene_placement.tsv Simple_run_expected/low_frequency_gene_placement.tsv.expected -test_output_file test_out_folder/core_pair_summary.csv Simple_run_expected/core_pair_summary.csv.expected +test_output_file test_out_folder/core_core_accessory_gene_content.tsv candidate_gene_pan_input_expected/core_core_accessory_gene_content.tsv.expected +test_output_file test_out_folder/low_frequency_gene_placement.tsv candidate_gene_pan_input_expected/low_frequency_gene_placement.tsv.expected +test_output_file test_out_folder/core_pair_summary.csv candidate_gene_pan_input_expected/core_pair_summary.csv.expected rm -r test_out_folder diff --git a/functional_tests/test_data/candidate_gene_pan_input/gene_presence_absence.csv b/functional_tests/test_data/candidate_gene_pan_input/gene_presence_absence.csv new file mode 100755 index 0000000..df704ae --- /dev/null +++ b/functional_tests/test_data/candidate_gene_pan_input/gene_presence_absence.csv @@ -0,0 +1,4 @@ +"","","","","","","","","","","","","","","candidate_gene_1","candidate_gene_2" +"A","","","2","2","1","","","","","","","","","single_comp_A","single_comp_2_A" +"B","","","2","2","1","","","","","","","","","single_comp_B","single_comp_2_B" +"C","","","2","2","1","","","","","","","","","single_comp_C","single_comp_2_C" \ No 
newline at end of file diff --git a/functional_tests/test_data/candidate_gene_pan_input_expected/core_core_accessory_gene_content.expected b/functional_tests/test_data/candidate_gene_pan_input_expected/core_core_accessory_gene_content.expected new file mode 100644 index 0000000..fee984c --- /dev/null +++ b/functional_tests/test_data/candidate_gene_pan_input_expected/core_core_accessory_gene_content.expected @@ -0,0 +1 @@ +Gff Core_gene_1 Core_gene_2 gene type diff --git a/functional_tests/test_data/candidate_gene_pan_input_expected/core_pair_summary.expected b/functional_tests/test_data/candidate_gene_pan_input_expected/core_pair_summary.expected new file mode 100644 index 0000000..0ad556a --- /dev/null +++ b/functional_tests/test_data/candidate_gene_pan_input_expected/core_pair_summary.expected @@ -0,0 +1,5 @@ +Core_pair,n,occurrence_core_1,occurrence_core_2,co_occurrence,min_dist,max_dist,mean_dist,median_dist,min_acc,max_acc,mean_acc,median_acc +A-B,2,2,2,2,9,9,9.0,9.0,0,0,0.0,0.0 +B-C,2,2,2,2,9,9,9.0,9.0,0,0,0.0,0.0 +C-Sequence_break,2,2,0,0,10,10,10.0,10.0,0,0,0.0,0.0 +Sequence_break-A,2,0,2,0,0,0,0.0,0.0,0,0,0.0,0.0 diff --git a/functional_tests/test_data/candidate_gene_pan_input_expected/low_frequency_gene_placement.expected b/functional_tests/test_data/candidate_gene_pan_input_expected/low_frequency_gene_placement.expected new file mode 100644 index 0000000..92a7cd3 --- /dev/null +++ b/functional_tests/test_data/candidate_gene_pan_input_expected/low_frequency_gene_placement.expected @@ -0,0 +1,9 @@ +Gff Core_gene_1 Core_gene_2 Core_region_size Core_region_accessory_count +candidate_gene_1 A B 9 0 +candidate_gene_2 A B 9 0 +candidate_gene_1 B C 9 0 +candidate_gene_2 B C 9 0 +candidate_gene_1 C Sequence_break 10 0 +candidate_gene_2 C Sequence_break 10 0 +candidate_gene_1 Sequence_break A 0 0 +candidate_gene_2 Sequence_break A 0 0 From 7ab3d6c164b6500dabfd66da98234f01529e7bda Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> 
Date: Wed, 1 Jun 2022 13:30:14 +1000 Subject: [PATCH 120/135] Point to the correct expected files --- functional_tests/Corekaburra-test.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/functional_tests/Corekaburra-test.sh b/functional_tests/Corekaburra-test.sh index e9d804e..93939c7 100755 --- a/functional_tests/Corekaburra-test.sh +++ b/functional_tests/Corekaburra-test.sh @@ -236,9 +236,9 @@ rm -r test_out_folder # TODO - Do functional test with GFF where reannotated in panaroo is called 'candidate_gene' instead of CDS. - delete if successfull call_new_test "Test with GFF where reannotated in panaroo is called 'candidate_gene' instead of CDS" Corekaburra -ig candidate_gene_1.gff candidate_gene_2.gff.gz -ip Roray_run -o test_out_folder > /dev/null 2>&1 -test_output_file test_out_folder/core_core_accessory_gene_content.tsv candidate_gene_pan_input_expected/core_core_accessory_gene_content.tsv.expected -test_output_file test_out_folder/low_frequency_gene_placement.tsv candidate_gene_pan_input_expected/low_frequency_gene_placement.tsv.expected -test_output_file test_out_folder/core_pair_summary.csv candidate_gene_pan_input_expected/core_pair_summary.csv.expected +test_output_file test_out_folder/core_core_accessory_gene_content.tsv candidate_gene_pan_input_expected/core_core_accessory_gene_content.expected +test_output_file test_out_folder/low_frequency_gene_placement.tsv candidate_gene_pan_input_expected/low_frequency_gene_placement.expected +test_output_file test_out_folder/core_pair_summary.csv candidate_gene_pan_input_expected/core_pair_summary.expected rm -r test_out_folder From 8a220bbdec26cb6d8cea13fe97483b61fc69137f Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Wed, 1 Jun 2022 16:17:07 +1000 Subject: [PATCH 121/135] point to correct pan-genome folder --- functional_tests/Corekaburra-test.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/functional_tests/Corekaburra-test.sh 
b/functional_tests/Corekaburra-test.sh index 93939c7..839e06e 100755 --- a/functional_tests/Corekaburra-test.sh +++ b/functional_tests/Corekaburra-test.sh @@ -235,7 +235,7 @@ rm -r test_out_folder # TODO - Do functional test with GFF where reannotated in panaroo is called 'candidate_gene' instead of CDS. - delete if successfull call_new_test "Test with GFF where reannotated in panaroo is called 'candidate_gene' instead of CDS" -Corekaburra -ig candidate_gene_1.gff candidate_gene_2.gff.gz -ip Roray_run -o test_out_folder > /dev/null 2>&1 +Corekaburra -ig candidate_gene_1.gff candidate_gene_2.gff.gz -ip candidate_gene_pan_input -o test_out_folder > /dev/null 2>&1 test_output_file test_out_folder/core_core_accessory_gene_content.tsv candidate_gene_pan_input_expected/core_core_accessory_gene_content.expected test_output_file test_out_folder/low_frequency_gene_placement.tsv candidate_gene_pan_input_expected/low_frequency_gene_placement.expected test_output_file test_out_folder/core_pair_summary.csv candidate_gene_pan_input_expected/core_pair_summary.expected From 622bba6265bb75ee94b85d0cf029fdb7b4b9fd86 Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Wed, 1 Jun 2022 16:20:21 +1000 Subject: [PATCH 122/135] Change input file for functional test --- functional_tests/Corekaburra-test.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/functional_tests/Corekaburra-test.sh b/functional_tests/Corekaburra-test.sh index 839e06e..475d944 100755 --- a/functional_tests/Corekaburra-test.sh +++ b/functional_tests/Corekaburra-test.sh @@ -235,7 +235,7 @@ rm -r test_out_folder # TODO - Do functional test with GFF where reannotated in panaroo is called 'candidate_gene' instead of CDS. 
- delete if successfull call_new_test "Test with GFF where reannotated in panaroo is called 'candidate_gene' instead of CDS" -Corekaburra -ig candidate_gene_1.gff candidate_gene_2.gff.gz -ip candidate_gene_pan_input -o test_out_folder > /dev/null 2>&1 +Corekaburra -ig candidate_gene_1.gff candidate_gene_2.gff -ip candidate_gene_pan_input -o test_out_folder > /dev/null 2>&1 test_output_file test_out_folder/core_core_accessory_gene_content.tsv candidate_gene_pan_input_expected/core_core_accessory_gene_content.expected test_output_file test_out_folder/low_frequency_gene_placement.tsv candidate_gene_pan_input_expected/low_frequency_gene_placement.expected test_output_file test_out_folder/core_pair_summary.csv candidate_gene_pan_input_expected/core_pair_summary.expected From 045413174259f02a5e467f39564e9c0db55cc140 Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Wed, 1 Jun 2022 16:32:09 +1000 Subject: [PATCH 123/135] Remove TODO tags for successfull functional tests --- functional_tests/Corekaburra-test.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/functional_tests/Corekaburra-test.sh b/functional_tests/Corekaburra-test.sh index 475d944..6026e54 100755 --- a/functional_tests/Corekaburra-test.sh +++ b/functional_tests/Corekaburra-test.sh @@ -217,7 +217,7 @@ test_output_file test_out_folder/low_frequency_gene_placement.tsv Simple_run_exp test_output_file test_out_folder/core_pair_summary.csv Simple_run_expected/core_pair_summary.csv.expected rm -r test_out_folder -# TODO - test gzipped input gffs - implemented - delete if successfull + call_new_test "test gzipped input gffs" Corekaburra -ig complete_genome_single_chrom.gff.gz complete_genome_single_chrom_2.gff.gz -ip Roray_run -o test_out_folder > /dev/null 2>&1 test_output_file test_out_folder/core_core_accessory_gene_content.tsv Simple_run_expected/core_core_accessory_gene_content.tsv.expected @@ -225,7 +225,7 @@ test_output_file 
test_out_folder/low_frequency_gene_placement.tsv Simple_run_exp test_output_file test_out_folder/core_pair_summary.csv Simple_run_expected/core_pair_summary.csv.expected rm -r test_out_folder -# TODO - test mixed gzipped and non-gzipped input gffs - implemented - delete if successfull + call_new_test "test mixed gzipped and non-gzipped input gffs" Corekaburra -ig complete_genome_single_chrom.gff.gz complete_genome_single_chrom_2.gff -ip Roray_run -o test_out_folder > /dev/null 2>&1 test_output_file test_out_folder/core_core_accessory_gene_content.tsv Simple_run_expected/core_core_accessory_gene_content.tsv.expected @@ -233,7 +233,7 @@ test_output_file test_out_folder/low_frequency_gene_placement.tsv Simple_run_exp test_output_file test_out_folder/core_pair_summary.csv Simple_run_expected/core_pair_summary.csv.expected rm -r test_out_folder -# TODO - Do functional test with GFF where reannotated in panaroo is called 'candidate_gene' instead of CDS. - delete if successfull + call_new_test "Test with GFF where reannotated in panaroo is called 'candidate_gene' instead of CDS" Corekaburra -ig candidate_gene_1.gff candidate_gene_2.gff -ip candidate_gene_pan_input -o test_out_folder > /dev/null 2>&1 test_output_file test_out_folder/core_core_accessory_gene_content.tsv candidate_gene_pan_input_expected/core_core_accessory_gene_content.expected From d33ed87d7bc9cea719872f20f0b82fb22303584f Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Wed, 1 Jun 2022 16:34:10 +1000 Subject: [PATCH 124/135] Comment out code for reannotating GFF files --- Corekaburra/__main__.py | 23 +++++++++++---------- Corekaburra/commandline_interface.py | 30 ++++++++++++++-------------- Corekaburra/gff_parser.py | 12 +++++------ 3 files changed, 34 insertions(+), 31 deletions(-) diff --git a/Corekaburra/__main__.py b/Corekaburra/__main__.py index df47c65..36fc5ab 100755 --- a/Corekaburra/__main__.py +++ b/Corekaburra/__main__.py @@ -164,10 +164,10 @@ def main(): # 
Check if gene_data file is present if Panaroo input is given an gffs should be annotated # TODO Likely not needed anymore with new implementations in Panaroo - if args.annotate and source_program == 'Panaroo': - gene_data_path = check_gene_data(args.input_pan, logger) - else: - gene_data_path = None + # if args.annotate and source_program == 'Panaroo': + # gene_data_path = check_gene_data(args.input_pan, logger) + # else: + # gene_data_path = None # Check that all GFF files given can be found in the pan-genome check_gff_in_pan(args.input_gffs, input_pres_abs_file_path, logger) @@ -182,12 +182,15 @@ def main(): time_start_read_files = time.time() # Prepair folder for reannotated genes and examine if any are already present # TODO - likely not required after new implementations in Panaroo. - if source_program == "Panaroo" and args.annotate: - gene_data_dict, corrected_dir, args.input_gffs = prepair_for_reannotation(gene_data_path, args.output_path, - args.input_gffs, logger) - else: - gene_data_dict = None - corrected_dir = None + # if source_program == "Panaroo" and args.annotate: + # gene_data_dict, corrected_dir, args.input_gffs = prepair_for_reannotation(gene_data_path, args.output_path, + # args.input_gffs, logger) + # else: + # gene_data_dict = None + # corrected_dir = None + # TODO - remove if script works without reannotation + gene_data_dict = None + corrected_dir = None # TODO - Some day it would be awesome to be able to provide a clustering/population structure which could divide genes into the 13 definitions outlined by Horesh et al. 
[DOI: 10.1099/mgen.0.000670] # TODO - Add in so that the user can give a list of genes that they wish to use as 'core genes' diff --git a/Corekaburra/commandline_interface.py b/Corekaburra/commandline_interface.py index 62cbb40..6869e2d 100755 --- a/Corekaburra/commandline_interface.py +++ b/Corekaburra/commandline_interface.py @@ -47,14 +47,14 @@ def get_commandline_arguments(args, version): default=None, dest='comp_genomes') - run_mods.add_argument('-a', - '--no_annotate_refound', - help='Flag to toggle off the creation of new gff files, with annotation of refound genes.\n' - 'Only done if input pangenome is detected as coming from Panaroo', - required=False, - default=True, - action='store_false', - dest='annotate') + # run_mods.add_argument('-a', + # '--no_annotate_refound', + # help='Flag to toggle off the creation of new gff files, with annotation of refound genes.\n' + # 'Only done if input pangenome is detected as coming from Panaroo', + # required=False, + # default=True, + # action='store_false', + # dest='annotate') run_mods.add_argument('-cc', '--core_cutoff', @@ -88,13 +88,13 @@ def get_commandline_arguments(args, version): default=None, dest='output_prefix') - output_control.add_argument('-d', - '--discard_corrected', - help='Discard gff files corrected with refound genes identified by Panaroo - Only compativle if pan-genome comes from Panaroo [Default: Corrected files are kept]', - required=False, - default=False, - action='store_true', - dest='discard_gffs') + # output_control.add_argument('-d', + # '--discard_corrected', + # help='Discard gff files corrected with refound genes identified by Panaroo - Only compativle if pan-genome comes from Panaroo [Default: Corrected files are kept]', + # required=False, + # default=False, + # action='store_true', + # dest='discard_gffs') rem_args.add_argument('-c', '--cpu', diff --git a/Corekaburra/gff_parser.py b/Corekaburra/gff_parser.py index f54172b..976c664 100755 --- a/Corekaburra/gff_parser.py +++ 
b/Corekaburra/gff_parser.py @@ -608,10 +608,10 @@ def segment_genome_content(input_gff_file, core_genes, low_freq_genes, acc_gene_ # Correct input gff file # Add in the refound genes into the gff files and print the corrected GFF files. - if source_program == "Panaroo" and annotate: - # check if not already corrected file and if any gene is to be inserted at all - if "_corrected" not in input_gff_file and any([x in input_gff_file for x in list(gene_data_dict)]): - input_gff_file = annotate_refound_genes(input_gff_file, gene_data_dict, tmp_folder_path, corrected_dir, logger) + # if source_program == "Panaroo" and annotate: + # check if not already corrected file and if any gene is to be inserted at all + # if "_corrected" not in input_gff_file and any([x in input_gff_file for x in list(gene_data_dict)]): + # input_gff_file = annotate_refound_genes(input_gff_file, gene_data_dict, tmp_folder_path, corrected_dir, logger) gff_generator = parse_gff(input_gff_file) return_data = segment_gff_content(gff_generator=gff_generator, @@ -621,7 +621,7 @@ def segment_genome_content(input_gff_file, core_genes, low_freq_genes, acc_gene_ acc_genes=acc_gene_dict, complete_genomes=complete_genomes) - if "_corrected" in input_gff_file and discard_corrected: - os.remove(input_gff_file) + # if "_corrected" in input_gff_file and discard_corrected: + # os.remove(input_gff_file) return return_data From 84c17162cb548e9bc791d8ad3ea342a1675309d2 Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Tue, 14 Jun 2022 11:10:50 +0200 Subject: [PATCH 125/135] Try to make Corekaburra run without the reannotations steps --- Corekaburra/__main__.py | 3 +- Corekaburra/gff_parser.py | 5 ++-- functional_tests/test_data/no_input.expected | 31 +++++--------------- 3 files changed, 11 insertions(+), 28 deletions(-) diff --git a/Corekaburra/__main__.py b/Corekaburra/__main__.py index 36fc5ab..fa9c951 100755 --- a/Corekaburra/__main__.py +++ b/Corekaburra/__main__.py @@ -222,8 
+222,7 @@ def main(): with concurrent.futures.ProcessPoolExecutor(max_workers=args.cpu) as executor: logger.info(f"------Start core region identification of given gff files-----\n") logger.info(f'{len(args.input_gffs)} GFF files to process') - results = [executor.submit(segment_genome_content, gff, core_dict, low_freq_dict, acc_gene_dict, comp_genomes, - source_program, args.annotate, gene_data_dict, corrected_dir, tmp_folder_path.name, args.discard_gffs, logger) + results = [executor.submit(segment_genome_content, gff, core_dict, low_freq_dict, acc_gene_dict, comp_genomes) # ,source_program, args.annotate, gene_data_dict, corrected_dir, tmp_folder_path.name, args.discard_gffs, logger) for gff in args.input_gffs] for output in concurrent.futures.as_completed(results): diff --git a/Corekaburra/gff_parser.py b/Corekaburra/gff_parser.py index 976c664..a7490f1 100755 --- a/Corekaburra/gff_parser.py +++ b/Corekaburra/gff_parser.py @@ -581,8 +581,7 @@ def segment_gff_content(gff_generator, core_genes, low_freq_genes, gff_path, acc low_freq_gene_content, master_info, coreless_contigs -def segment_genome_content(input_gff_file, core_genes, low_freq_genes, acc_gene_dict, complete_genomes, source_program, - annotate, gene_data_dict, corrected_dir, tmp_folder_path, discard_corrected, logger): +def segment_genome_content(input_gff_file, core_genes, low_freq_genes, acc_gene_dict, complete_genomes): #, source_program, annotate, gene_data_dict, corrected_dir, tmp_folder_path, discard_corrected, logger): """ Single function segmenting the gff into core gene regions to be used for simple multi processing :param input_gff_file: File-path to the given gff file to be segmented @@ -591,7 +590,7 @@ def segment_genome_content(input_gff_file, core_genes, low_freq_genes, acc_gene_ :param acc_gene_dict: Dictionary over accessory genes :param complete_genomes: Bool indicating if this genome should be considered as a complete genome :param source_program: String indicating if program comes 
from Roary or Panaroo. - :param annotate: Bool to indicate if refound genes should be annotated + :param annotate: Bool to indicate if refound genes should be annotated # TODOD - remove! :param gene_data_dict: Dict of genes, annotations, names, and sequences found in the gene_data.csv file from Panaroo :param corrected_dir: File path to directory where corrected Gff files are to be stored. :param tmp_folder_path: Path to the temporary working folder. diff --git a/functional_tests/test_data/no_input.expected b/functional_tests/test_data/no_input.expected index f14ac07..799b5a1 100755 --- a/functional_tests/test_data/no_input.expected +++ b/functional_tests/test_data/no_input.expected @@ -1,11 +1,8 @@ -usage: Corekaburra -ig file.gff [file.gff ...] -ip path/to/pan_genome - [-cg complete_genomes.txt] [-a] [-cc 1.0] [-lc 0.05] - [-o path/to/output] [-p OUTPUT_PREFIX] [-d] [-c int] - [-l | -q] [-h] [-v] +usage: Corekaburra -ig file.gff [file.gff ...] -ip path/to/pan_genome [-cg complete_genomes.txt] [-cc 1.0] [-lc 0.05] [-o path/to/output] [-p OUTPUT_PREFIX] [-c int] [-l | -q] [-h] + [-v] -Welcome to Corekaburra! An extension to pan-genome analyses that summarise -genomic regions between core genes and segments of neighbouring core genes -using gene synteny from a set of input genomes and a pan-genome folder. +Welcome to Corekaburra! An extension to pan-genome analyses that summarise genomic regions between core genes and segments of neighbouring core genes using gene synteny from a set of +input genomes and a pan-genome folder. Required arguments: -ig file.gff [file.gff ...], --input_gffs file.gff [file.gff ...] @@ -15,29 +12,17 @@ Required arguments: Analysis modifiers: -cg complete_genomes.txt, --complete_genomes complete_genomes.txt - text file containing names of genomes that are to be - handled as complete genomes - -a, --no_annotate_refound - Flag to toggle off the creation of new gff files, with - annotation of refound genes. 
Only done if input - pangenome is detected as coming from Panaroo + text file containing names of genomes that are to be handled as complete genomes -cc 1.0, --core_cutoff 1.0 - Percentage of isolates in which a core gene must be - present [default: 1.0] + Percentage of isolates in which a core gene must be present [default: 1.0] -lc 0.05, --low_cutoff 0.05 - Percentage of isolates where genes found in less than - these are seen as low-frequency genes [default: 0.05] + Percentage of isolates where genes found in less than these are seen as low-frequency genes [default: 0.05] Output control: -o path/to/output, --output path/to/output - Path to where output files will be placed [default: - current folder] + Path to where output files will be placed [default: current folder] -p OUTPUT_PREFIX, --prefix OUTPUT_PREFIX Prefix for output files, if any is desired - -d, --discard_corrected - Discard gff files corrected with refound genes - identified by Panaroo - Only compativle if pan-genome - comes from Panaroo [Default: Corrected files are kept] Other arguments: -c int, --cpu int Give max number of CPUs [default: 1] From 82252449e1c8c4c27f2e14e1c17ce220115391db Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Mon, 20 Jun 2022 10:36:25 +0200 Subject: [PATCH 126/135] Change the expected output for the help function after commandline option changes --- functional_tests/test_data/no_input.expected | 23 +++++++++++++------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/functional_tests/test_data/no_input.expected b/functional_tests/test_data/no_input.expected index 799b5a1..a2650f2 100755 --- a/functional_tests/test_data/no_input.expected +++ b/functional_tests/test_data/no_input.expected @@ -1,8 +1,11 @@ -usage: Corekaburra -ig file.gff [file.gff ...] 
-ip path/to/pan_genome [-cg complete_genomes.txt] [-cc 1.0] [-lc 0.05] [-o path/to/output] [-p OUTPUT_PREFIX] [-c int] [-l | -q] [-h] - [-v] +usage: __main__.py -ig file.gff [file.gff ...] -ip path/to/pan_genome + [-cg complete_genomes.txt] [-cc 1.0] [-lc 0.05] + [-o path/to/output] [-p OUTPUT_PREFIX] [-c int] [-l | -q] + [-h] [-v] -Welcome to Corekaburra! An extension to pan-genome analyses that summarise genomic regions between core genes and segments of neighbouring core genes using gene synteny from a set of -input genomes and a pan-genome folder. +Welcome to Corekaburra! An extension to pan-genome analyses that summarise +genomic regions between core genes and segments of neighbouring core genes +using gene synteny from a set of input genomes and a pan-genome folder. Required arguments: -ig file.gff [file.gff ...], --input_gffs file.gff [file.gff ...] @@ -12,15 +15,19 @@ Required arguments: Analysis modifiers: -cg complete_genomes.txt, --complete_genomes complete_genomes.txt - text file containing names of genomes that are to be handled as complete genomes + text file containing names of genomes that are to be + handled as complete genomes -cc 1.0, --core_cutoff 1.0 - Percentage of isolates in which a core gene must be present [default: 1.0] + Percentage of isolates in which a core gene must be + present [default: 1.0] -lc 0.05, --low_cutoff 0.05 - Percentage of isolates where genes found in less than these are seen as low-frequency genes [default: 0.05] + Percentage of isolates where genes found in less than + these are seen as low-frequency genes [default: 0.05] Output control: -o path/to/output, --output path/to/output - Path to where output files will be placed [default: current folder] + Path to where output files will be placed [default: + current folder] -p OUTPUT_PREFIX, --prefix OUTPUT_PREFIX Prefix for output files, if any is desired From 31e62214aadf217504a0fb8b9e73f57a0f3bb43b Mon Sep 17 00:00:00 2001 From: milnus 
<44769523+milnus@users.noreply.github.com> Date: Mon, 20 Jun 2022 10:39:29 +0200 Subject: [PATCH 127/135] Remove functional and unit test made obsolete by Panaroo's new function --- functional_tests/Corekaburra-test.sh | 32 +-- unit_tests/Corekaburra_test.py | 314 ++++++++++++++------------- 2 files changed, 174 insertions(+), 172 deletions(-) diff --git a/functional_tests/Corekaburra-test.sh b/functional_tests/Corekaburra-test.sh index 6026e54..3e80ff1 100755 --- a/functional_tests/Corekaburra-test.sh +++ b/functional_tests/Corekaburra-test.sh @@ -197,8 +197,8 @@ test_exit_status "$test_program -ig complete_genome_double_chrom.gff -ip Crash_p call_new_test "Test exit upon unsuccessful identification of source program" test_exit_status "$test_program -ig complete_genome_double_chrom.gff -ip Crash_pan_folder > /dev/null 2>&1" 1 -call_new_test "Test exit upon unsuccessful identification of gene_data, when -a is not given for Panaroo" -test_exit_status "$test_program -ig complete_genome_double_chrom.gff -ip Crash_panaroo_folder > /dev/null 2>&1" 1 +#call_new_test "Test exit upon unsuccessful identification of gene_data, when -a is not given for Panaroo" #TODO - likely remove when gffs are no longer corrected +#test_exit_status "$test_program -ig complete_genome_double_chrom.gff -ip Crash_panaroo_folder > /dev/null 2>&1" 1 call_new_test "Test exit upon gff not found in pan is provided as input" test_exit_status "$test_program -ig complete_genome_single_chrom.gff complete_genome_double_chrom.gff -ip Crash_gff_folder > /dev/null 2>&1" 1 @@ -211,7 +211,7 @@ test_output_file test_out_folder/core_pair_summary.csv Simple_run_expected/core_ rm -r test_out_folder call_new_test "Test Panaroo input" -Corekaburra -ig complete_genome_single_chrom.gff complete_genome_single_chrom_2.gff -ip Panaroo_run -o test_out_folder -a > /dev/null 2>&1 +Corekaburra -ig complete_genome_single_chrom.gff complete_genome_single_chrom_2.gff -ip Panaroo_run -o test_out_folder > /dev/null 2>&1 
test_output_file test_out_folder/core_core_accessory_gene_content.tsv Simple_run_expected/core_core_accessory_gene_content.tsv.expected test_output_file test_out_folder/low_frequency_gene_placement.tsv Simple_run_expected/low_frequency_gene_placement.tsv.expected test_output_file test_out_folder/core_pair_summary.csv Simple_run_expected/core_pair_summary.csv.expected @@ -349,19 +349,19 @@ test_output_file test_out_folder/core_segments.csv Less_than_all_gffs_run_expect test_output_file test_out_folder/no_accessory_core_segments.csv Less_than_all_gffs_run_expected/no_accessory_core_segments.csv.expected rm -r test_out_folder -call_new_test "Test unsuccessful reannotation of Panaroo" -test_exit_status "$test_program -ig complete_genome_single_chrom.gff genome_single_chrom_larger.gff genome_single_chrom_larger_rearrange.gff complete_genome_single_chrom_2.gff -ip Reannotate_run_fail -o test_out_folder > /dev/null 2>&1" 3 -rm -r test_out_folder - -call_new_test "Test Panaroo input with correction of gff files" -Corekaburra -ig complete_genome_single_chrom.gff genome_single_chrom_larger.gff genome_single_chrom_larger_rearrange.gff complete_genome_single_chrom_2.gff -ip Reannotate_run_succes/ -o test_out_folder/ > /dev/null 2>&1 -test_output_file test_out_folder/core_core_accessory_gene_content.tsv Reannotation_sucessful_expected/core_core_accessory_gene_content.tsv.expected -test_output_file test_out_folder/low_frequency_gene_placement.tsv Reannotation_sucessful_expected/low_frequency_gene_placement.tsv.expected -test_output_file test_out_folder/core_pair_summary.csv Reannotation_sucessful_expected/core_pair_summary.csv.expected -test_output_file test_out_folder/Corrected_gff_files/complete_genome_single_chrom_2_corrected.gff Reannotation_sucessful_expected/Corrected_gff_files/complete_genome_single_chrom_2_corrected.gff.expected -test_output_file test_out_folder/Corrected_gff_files/complete_genome_single_chrom_corrected.gff 
Reannotation_sucessful_expected/Corrected_gff_files/complete_genome_single_chrom_corrected.gff.expected -test_output_file test_out_folder/Corrected_gff_files/genome_single_chrom_larger_rearrange_corrected.gff Reannotation_sucessful_expected/Corrected_gff_files/genome_single_chrom_larger_rearrange_corrected.gff.expected -rm -r test_out_folder +#call_new_test "Test unsuccessful reannotation of Panaroo" # TODO - remove after Panaroo reannotates gffs +#test_exit_status "$test_program -ig complete_genome_single_chrom.gff genome_single_chrom_larger.gff genome_single_chrom_larger_rearrange.gff complete_genome_single_chrom_2.gff -ip Reannotate_run_fail -o test_out_folder > /dev/null 2>&1" 3 +#rm -r test_out_folder + +#call_new_test "Test Panaroo input with correction of gff files" # TODO - remove after Panaroo reannotates gffs +#Corekaburra -ig complete_genome_single_chrom.gff genome_single_chrom_larger.gff genome_single_chrom_larger_rearrange.gff complete_genome_single_chrom_2.gff -ip Reannotate_run_succes/ -o test_out_folder/ > /dev/null 2>&1 +#test_output_file test_out_folder/core_core_accessory_gene_content.tsv Reannotation_sucessful_expected/core_core_accessory_gene_content.tsv.expected +#test_output_file test_out_folder/low_frequency_gene_placement.tsv Reannotation_sucessful_expected/low_frequency_gene_placement.tsv.expected +#test_output_file test_out_folder/core_pair_summary.csv Reannotation_sucessful_expected/core_pair_summary.csv.expected +#test_output_file test_out_folder/Corrected_gff_files/complete_genome_single_chrom_2_corrected.gff Reannotation_sucessful_expected/Corrected_gff_files/complete_genome_single_chrom_2_corrected.gff.expected +#test_output_file test_out_folder/Corrected_gff_files/complete_genome_single_chrom_corrected.gff Reannotation_sucessful_expected/Corrected_gff_files/complete_genome_single_chrom_corrected.gff.expected +#test_output_file test_out_folder/Corrected_gff_files/genome_single_chrom_larger_rearrange_corrected.gff 
Reannotation_sucessful_expected/Corrected_gff_files/genome_single_chrom_larger_rearrange_corrected.gff.expected +#rm -r test_out_folder call_new_test "Test with a single core gene on a contig that is not complete" Corekaburra -ig complete_genome_single_chrom.gff complete_genome_double_chrom.gff -ip Single_core_contig/ -o test_out_folder/ > /dev/null 2>&1 diff --git a/unit_tests/Corekaburra_test.py b/unit_tests/Corekaburra_test.py index 188176d..606746b 100755 --- a/unit_tests/Corekaburra_test.py +++ b/unit_tests/Corekaburra_test.py @@ -991,166 +991,167 @@ def test_adding_a_gene_name_and_annotation(self): self.assertEqual(expected_lines, added_gff.readlines()) # TODO - likely obsolete after addition of Panaroo functions -class TestWriteContig(unittest.TestCase): - """ - Test of the function used to write a contig in a gff file. - """ +# class TestWriteContig(unittest.TestCase): +# """ +# Test of the function used to write a contig in a gff file. +# """ # Make a setup and a teardown that copies and renames the mock file - def setUp(self): - """ Class to copy the mock gff before modifying""" - copyfile('TestWriteContig/mocky_test_gff.gff', 'TestWriteContig/mocky_test_gff.gff_copy') - - def tearDown(self): - """ Class to remove modified gff and rename the original""" - os.remove('TestWriteContig/mocky_test_gff.gff') - os.rename('TestWriteContig/mocky_test_gff.gff_copy', 'TestWriteContig/mocky_test_gff.gff') - - def test_writing_a_contig(self): - file_path = 'TestWriteContig/mocky_test_gff.gff' - contig_name = 'Test_contig_name space' - sequence = 'AAATAAATGGGCGGGCAAATAAATGGGCGGGCAAATAAATGGGCGGGCAAATAAATGGGCGGGCAAATAAATGGGCGGGCAAATAAATGGGCGGGC' - - expected_lines = ['##gff-version 3\n', '#test comment line\n', '>Test_contig_name space\n', - 'AAATAAATGGGCGGGCAAATAAATGGGCGGGCAAATAAATGGGCGGGCAAATAAATGGGC\n', - 'GGGCAAATAAATGGGCGGGCAAATAAATGGGCGGGC\n'] - - with open(file_path, 'a') as file: - correct_gffs.write_contig(file, contig_name, sequence) + # def setUp(self): + # 
""" Class to copy the mock gff before modifying""" + # copyfile('TestWriteContig/mocky_test_gff.gff', 'TestWriteContig/mocky_test_gff.gff_copy') + # + # def tearDown(self): + # """ Class to remove modified gff and rename the original""" + # os.remove('TestWriteContig/mocky_test_gff.gff') + # os.rename('TestWriteContig/mocky_test_gff.gff_copy', 'TestWriteContig/mocky_test_gff.gff') + # + # def test_writing_a_contig(self): + # file_path = 'TestWriteContig/mocky_test_gff.gff' + # contig_name = 'Test_contig_name space' + # sequence = 'AAATAAATGGGCGGGCAAATAAATGGGCGGGCAAATAAATGGGCGGGCAAATAAATGGGCGGGCAAATAAATGGGCGGGCAAATAAATGGGCGGGC' + # + # expected_lines = ['##gff-version 3\n', '#test comment line\n', '>Test_contig_name space\n', + # 'AAATAAATGGGCGGGCAAATAAATGGGCGGGCAAATAAATGGGCGGGCAAATAAATGGGC\n', + # 'GGGCAAATAAATGGGCGGGCAAATAAATGGGCGGGC\n'] + # + # with open(file_path, 'a') as file: + # correct_gffs.write_contig(file, contig_name, sequence) + # + # with open('TestWriteContig/mocky_test_gff.gff', 'r') as added_gff: + # self.assertEqual(expected_lines, added_gff.readlines()) - with open('TestWriteContig/mocky_test_gff.gff', 'r') as added_gff: - self.assertEqual(expected_lines, added_gff.readlines()) # TODO - likely obsolete after addition of Panaroo functions -class TestAnnotateRefoundGenomes(unittest.TestCase): - """ - Test of the function used to reannotate refound genes identified by panaroo in a gff file. 
- """ - @classmethod - def setUpClass(cls): - cls.logger = logging.getLogger('test_logger.log') - cls.logger.setLevel(logging.INFO) - - def tearDown(self): - """ Class to remove modified gff and rename the original""" - try: - os.remove('TestAnnotateRefoundGenomes/reannotate_gff_corrected.gff') - except FileNotFoundError: - os.remove('TestAnnotateRefoundGenomes/reannotate_gff_tmp.gff') - os.remove('TestAnnotateRefoundGenomes/reannotate_gff.gff_db') - - def test_annotation_of_pos_stand_gene(self): - gff_name = 'TestAnnotateRefoundGenomes/reannotate_gff.gff' - gene_data_dict = {'reannotate_gff': {'0_refound_0': ['CTCTTCCGATCTAATCAAGATTGAGAGGAATTGCTGTTTTTATTGGCAAGACAATTTTATTTTATCTGATTTTAATGTTGCTGGTCTATTTCTTTGGTTATCTAGGACATGGTCAAAGTAACTTTATTTATAATGAATTTTAG', 'gene_name', 'gene_function'], - '0_refound_100': ['CTCTTCCGATCTAATCAAGATTGAGAGGAATTGCTTTTTTTTTTGGCAAGACAATTTTATTTTATCTGATTTTAATGTTGCTGGTCTATTTCTTTGGTTATCTAGGACATGGTCAAAGTAACTTTATTTATAATGAATTTTAG', '', 'gene_function'], - '0_refound_10': ['CTCTTCCGATCTAATCAAGATTGAGAGGAATTGCGCCTTGGCAAGACAATTTTATTTTATCTGATTTTAATGTTGCTGGTCTATTTCTTTGGTTATCTAGGACATGGTCAAAGTAACTTTATTTATAATGAATTTTAG', 'gene_name', '']}} - tmp_folder_path = 'TestAnnotateRefoundGenomes' - corrected_gff_out_dir = 'TestAnnotateRefoundGenomes' - - expected_lines = \ - ['##gff-version 3\n', - '#test comment line\n', - 'test_contig\tProkka\tCDS\t1\t10\t.\t+\t0\tlocus_tag=locus_tag_0097\n', - 'test_contig\tPanaroo\tCDS\t16\t158\t.\t+\t0\tID=locus_tag_0099;locus_tag=locus_tag_0099;old_locus_tag=0_refound_0;name=gene_name;annotation=gene_function\n', - 'test_contig\tPanaroo\tCDS\t174\t316\t.\t+\t0\tID=locus_tag_0100;locus_tag=locus_tag_0100;old_locus_tag=0_refound_100;annotation=gene_function\n', - 'test_contig\tPanaroo\tCDS\t332\t469\t.\t+\t0\tID=locus_tag_0101;locus_tag=locus_tag_0101;old_locus_tag=0_refound_10;name=gene_name\n', - 'test_contig\tProkka\tCDS\t474\t484\t.\t+\t0\tlocus_tag=locus_tag_0098\n', - '##FASTA\n', - '>test_contig\n', - 
'TTTTTTTTTTTTTTTCTCTTCCGATCTAATCAAGATTGAGAGGAATTGCTGTTTTTATTG\n', - 'GCAAGACAATTTTATTTTATCTGATTTTAATGTTGCTGGTCTATTTCTTTGGTTATCTAG\n', - 'GACATGGTCAAAGTAACTTTATTTATAATGAATTTTAGTTTTTTTTTTTTTTTCTCTTCC\n', - 'GATCTAATCAAGATTGAGAGGAATTGCTTTTTTTTTTGGCAAGACAATTTTATTTTATCT\n', - 'GATTTTAATGTTGCTGGTCTATTTCTTTGGTTATCTAGGACATGGTCAAAGTAACTTTAT\n', - 'TTATAATGAATTTTAGTTTTTTTTTTTTTTTCTCTTCCGATCTAATCAAGATTGAGAGGA\n', - 'ATTGCGCCTTGGCAAGACAATTTTATTTTATCTGATTTTAATGTTGCTGGTCTATTTCTT\n', - 'TGGTTATCTAGGACATGGTCAAAGTAACTTTATTTATAATGAATTTTAGTTTTTTTTTTT\n', - 'TTTT\n' - ] - - correct_gffs.annotate_refound_genes(gff_name, gene_data_dict, tmp_folder_path, corrected_gff_out_dir, self.logger) - - with open('TestAnnotateRefoundGenomes/reannotate_gff_corrected.gff', 'r') as added_gff: - self.assertEqual(expected_lines, added_gff.readlines()) - - def test_annotation_of_neg_stand_gene(self): - gff_name = 'TestAnnotateRefoundGenomes/reannotate_gff.gff' - gene_data_dict = {'reannotate_gff': {'0_refound_0': ['CTAAAATTCATTATAAATAAAGTTACTTTGACCATGTCCTAGATAACCAAAGAAATAGACCAGCAACATTAAAATCAGATAAAATAAAATTGTCTTGCCAATAAAAACAGCAATTCCTCTCAATCTTGATTAGATCGGAAGAG', 'gene_name', 'gene_function'], - '0_refound_100': ['CTAAAATTCATTATAAATAAAGTTACTTTGACCATGTCCTAGATAACCAAAGAAATAGACCAGCAACATTAAAATCAGATAAAATAAAATTGTCTTGCCAAAAAAAAAAGCAATTCCTCTCAATCTTGATTAGATCGGAAGAG', '', 'gene_function'], - '0_refound_10': ['CTAAAATTCATTATAAATAAAGTTACTTTGACCATGTCCTAGATAACCAAAGAAATAGACCAGCAACATTAAAATCAGATAAAATAAAATTGTCTTGCCAAGGCGCAATTCCTCTCAATCTTGATTAGATCGGAAGAG', 'gene_name', '']}} - tmp_folder_path = 'TestAnnotateRefoundGenomes' - corrected_gff_out_dir = 'TestAnnotateRefoundGenomes' - expected_lines = ['##gff-version 3\n', - '#test comment line\n', - 'test_contig\tProkka\tCDS\t1\t10\t.\t+\t0\tlocus_tag=locus_tag_0097\n', - 'test_contig\tPanaroo\tCDS\t16\t158\t.\t-\t0\tID=locus_tag_0099;locus_tag=locus_tag_0099;old_locus_tag=0_refound_0;name=gene_name;annotation=gene_function\n', - 
'test_contig\tPanaroo\tCDS\t174\t316\t.\t-\t0\tID=locus_tag_0100;locus_tag=locus_tag_0100;old_locus_tag=0_refound_100;annotation=gene_function\n', - 'test_contig\tPanaroo\tCDS\t332\t469\t.\t-\t0\tID=locus_tag_0101;locus_tag=locus_tag_0101;old_locus_tag=0_refound_10;name=gene_name\n', - 'test_contig\tProkka\tCDS\t474\t484\t.\t+\t0\tlocus_tag=locus_tag_0098\n', - '##FASTA\n', - '>test_contig\n', - 'TTTTTTTTTTTTTTTCTCTTCCGATCTAATCAAGATTGAGAGGAATTGCTGTTTTTATTG\n', - 'GCAAGACAATTTTATTTTATCTGATTTTAATGTTGCTGGTCTATTTCTTTGGTTATCTAG\n', - 'GACATGGTCAAAGTAACTTTATTTATAATGAATTTTAGTTTTTTTTTTTTTTTCTCTTCC\n', - 'GATCTAATCAAGATTGAGAGGAATTGCTTTTTTTTTTGGCAAGACAATTTTATTTTATCT\n', - 'GATTTTAATGTTGCTGGTCTATTTCTTTGGTTATCTAGGACATGGTCAAAGTAACTTTAT\n', - 'TTATAATGAATTTTAGTTTTTTTTTTTTTTTCTCTTCCGATCTAATCAAGATTGAGAGGA\n', - 'ATTGCGCCTTGGCAAGACAATTTTATTTTATCTGATTTTAATGTTGCTGGTCTATTTCTT\n', - 'TGGTTATCTAGGACATGGTCAAAGTAACTTTATTTATAATGAATTTTAGTTTTTTTTTTT\n', - 'TTTT\n'] - - correct_gffs.annotate_refound_genes(gff_name, gene_data_dict, tmp_folder_path, corrected_gff_out_dir, self.logger) - - with open('TestAnnotateRefoundGenomes/reannotate_gff_corrected.gff', 'r') as added_gff: - self.assertEqual(expected_lines, added_gff.readlines()) - - def test_gene_not_found(self): - gff_name = 'TestAnnotateRefoundGenomes/reannotate_gff.gff' - gene_data_dict = {'reannotate_gff': {'0_refound_0': [ - 'CCCCCCCCCCCCGGGGGGGGGGGGGGGCGGCGCGCGCGCGCGCGGCGCGCGCGGCGCGC', - 'gene_name', 'gene_function']}} - - tmp_folder_path = 'TestAnnotateRefoundGenomes' - corrected_gff_out_dir = 'TestAnnotateRefoundGenomes' - - with self.assertRaises(SystemExit): - correct_gffs.annotate_refound_genes(gff_name, gene_data_dict, tmp_folder_path, corrected_gff_out_dir, self.logger) - - -class TestOpeningFileToGenerator(unittest.TestCase): - def test_opening_refular_file(self): - input_file_path = 'TestOpeningFileToGenerator/test_text_file.txt.gz' - - expected_output = 
['contig_1\t.\tCDS\t1\t90\t.\t.\t.\tID=Silas_the_Salmonella_tag-1-1;locus_tag=Silas_the_Salmonella_tag-1-1\n', - '##FASTA\n', - '>contig_1\n', - 'NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN\n', - 'NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN\n'] - - return_generator = gff_parser.open_file_generator(input_file_path) - - given_output = [] - for line in return_generator: - given_output.append(line) - - self.assertEqual(expected_output, given_output) - - def test_opening_gzipped(self): - input_file_path = 'TestOpeningFileToGenerator/test_text_file.txt' - - expected_output = ['##gff version\n', - 'contig_1\t.\tCDS\t1\t90\t.\t.\t.\tID=Silas_the_Salmonella_tag-1-1;locus_tag=Silas_the_Salmonella_tag-1-1\n', - '##FASTA\n', - '>contig_1\n', - 'NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN\n', - 'NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN\n'] - - return_generator = gff_parser.open_file_generator(input_file_path) - - given_output = [] - for line in return_generator: - given_output.append(line) - - self.assertEqual(expected_output, given_output) +# class TestAnnotateRefoundGenomes(unittest.TestCase): +# """ +# Test of the function used to reannotate refound genes identified by panaroo in a gff file. 
+# """ +# @classmethod +# def setUpClass(cls): +# cls.logger = logging.getLogger('test_logger.log') +# cls.logger.setLevel(logging.INFO) +# +# def tearDown(self): +# """ Class to remove modified gff and rename the original""" +# try: +# os.remove('TestAnnotateRefoundGenomes/reannotate_gff_corrected.gff') +# except FileNotFoundError: +# os.remove('TestAnnotateRefoundGenomes/reannotate_gff_tmp.gff') +# os.remove('TestAnnotateRefoundGenomes/reannotate_gff.gff_db') +# +# def test_annotation_of_pos_stand_gene(self): +# gff_name = 'TestAnnotateRefoundGenomes/reannotate_gff.gff' +# gene_data_dict = {'reannotate_gff': {'0_refound_0': ['CTCTTCCGATCTAATCAAGATTGAGAGGAATTGCTGTTTTTATTGGCAAGACAATTTTATTTTATCTGATTTTAATGTTGCTGGTCTATTTCTTTGGTTATCTAGGACATGGTCAAAGTAACTTTATTTATAATGAATTTTAG', 'gene_name', 'gene_function'], +# '0_refound_100': ['CTCTTCCGATCTAATCAAGATTGAGAGGAATTGCTTTTTTTTTTGGCAAGACAATTTTATTTTATCTGATTTTAATGTTGCTGGTCTATTTCTTTGGTTATCTAGGACATGGTCAAAGTAACTTTATTTATAATGAATTTTAG', '', 'gene_function'], +# '0_refound_10': ['CTCTTCCGATCTAATCAAGATTGAGAGGAATTGCGCCTTGGCAAGACAATTTTATTTTATCTGATTTTAATGTTGCTGGTCTATTTCTTTGGTTATCTAGGACATGGTCAAAGTAACTTTATTTATAATGAATTTTAG', 'gene_name', '']}} +# tmp_folder_path = 'TestAnnotateRefoundGenomes' +# corrected_gff_out_dir = 'TestAnnotateRefoundGenomes' +# +# expected_lines = \ +# ['##gff-version 3\n', +# '#test comment line\n', +# 'test_contig\tProkka\tCDS\t1\t10\t.\t+\t0\tlocus_tag=locus_tag_0097\n', +# 'test_contig\tPanaroo\tCDS\t16\t158\t.\t+\t0\tID=locus_tag_0099;locus_tag=locus_tag_0099;old_locus_tag=0_refound_0;name=gene_name;annotation=gene_function\n', +# 'test_contig\tPanaroo\tCDS\t174\t316\t.\t+\t0\tID=locus_tag_0100;locus_tag=locus_tag_0100;old_locus_tag=0_refound_100;annotation=gene_function\n', +# 'test_contig\tPanaroo\tCDS\t332\t469\t.\t+\t0\tID=locus_tag_0101;locus_tag=locus_tag_0101;old_locus_tag=0_refound_10;name=gene_name\n', +# 'test_contig\tProkka\tCDS\t474\t484\t.\t+\t0\tlocus_tag=locus_tag_0098\n', +# '##FASTA\n', +# 
'>test_contig\n', +# 'TTTTTTTTTTTTTTTCTCTTCCGATCTAATCAAGATTGAGAGGAATTGCTGTTTTTATTG\n', +# 'GCAAGACAATTTTATTTTATCTGATTTTAATGTTGCTGGTCTATTTCTTTGGTTATCTAG\n', +# 'GACATGGTCAAAGTAACTTTATTTATAATGAATTTTAGTTTTTTTTTTTTTTTCTCTTCC\n', +# 'GATCTAATCAAGATTGAGAGGAATTGCTTTTTTTTTTGGCAAGACAATTTTATTTTATCT\n', +# 'GATTTTAATGTTGCTGGTCTATTTCTTTGGTTATCTAGGACATGGTCAAAGTAACTTTAT\n', +# 'TTATAATGAATTTTAGTTTTTTTTTTTTTTTCTCTTCCGATCTAATCAAGATTGAGAGGA\n', +# 'ATTGCGCCTTGGCAAGACAATTTTATTTTATCTGATTTTAATGTTGCTGGTCTATTTCTT\n', +# 'TGGTTATCTAGGACATGGTCAAAGTAACTTTATTTATAATGAATTTTAGTTTTTTTTTTT\n', +# 'TTTT\n' +# ] +# +# correct_gffs.annotate_refound_genes(gff_name, gene_data_dict, tmp_folder_path, corrected_gff_out_dir, self.logger) +# +# with open('TestAnnotateRefoundGenomes/reannotate_gff_corrected.gff', 'r') as added_gff: +# self.assertEqual(expected_lines, added_gff.readlines()) +# +# def test_annotation_of_neg_stand_gene(self): +# gff_name = 'TestAnnotateRefoundGenomes/reannotate_gff.gff' +# gene_data_dict = {'reannotate_gff': {'0_refound_0': ['CTAAAATTCATTATAAATAAAGTTACTTTGACCATGTCCTAGATAACCAAAGAAATAGACCAGCAACATTAAAATCAGATAAAATAAAATTGTCTTGCCAATAAAAACAGCAATTCCTCTCAATCTTGATTAGATCGGAAGAG', 'gene_name', 'gene_function'], +# '0_refound_100': ['CTAAAATTCATTATAAATAAAGTTACTTTGACCATGTCCTAGATAACCAAAGAAATAGACCAGCAACATTAAAATCAGATAAAATAAAATTGTCTTGCCAAAAAAAAAAGCAATTCCTCTCAATCTTGATTAGATCGGAAGAG', '', 'gene_function'], +# '0_refound_10': ['CTAAAATTCATTATAAATAAAGTTACTTTGACCATGTCCTAGATAACCAAAGAAATAGACCAGCAACATTAAAATCAGATAAAATAAAATTGTCTTGCCAAGGCGCAATTCCTCTCAATCTTGATTAGATCGGAAGAG', 'gene_name', '']}} +# tmp_folder_path = 'TestAnnotateRefoundGenomes' +# corrected_gff_out_dir = 'TestAnnotateRefoundGenomes' +# expected_lines = ['##gff-version 3\n', +# '#test comment line\n', +# 'test_contig\tProkka\tCDS\t1\t10\t.\t+\t0\tlocus_tag=locus_tag_0097\n', +# 'test_contig\tPanaroo\tCDS\t16\t158\t.\t-\t0\tID=locus_tag_0099;locus_tag=locus_tag_0099;old_locus_tag=0_refound_0;name=gene_name;annotation=gene_function\n', +# 
'test_contig\tPanaroo\tCDS\t174\t316\t.\t-\t0\tID=locus_tag_0100;locus_tag=locus_tag_0100;old_locus_tag=0_refound_100;annotation=gene_function\n', +# 'test_contig\tPanaroo\tCDS\t332\t469\t.\t-\t0\tID=locus_tag_0101;locus_tag=locus_tag_0101;old_locus_tag=0_refound_10;name=gene_name\n', +# 'test_contig\tProkka\tCDS\t474\t484\t.\t+\t0\tlocus_tag=locus_tag_0098\n', +# '##FASTA\n', +# '>test_contig\n', +# 'TTTTTTTTTTTTTTTCTCTTCCGATCTAATCAAGATTGAGAGGAATTGCTGTTTTTATTG\n', +# 'GCAAGACAATTTTATTTTATCTGATTTTAATGTTGCTGGTCTATTTCTTTGGTTATCTAG\n', +# 'GACATGGTCAAAGTAACTTTATTTATAATGAATTTTAGTTTTTTTTTTTTTTTCTCTTCC\n', +# 'GATCTAATCAAGATTGAGAGGAATTGCTTTTTTTTTTGGCAAGACAATTTTATTTTATCT\n', +# 'GATTTTAATGTTGCTGGTCTATTTCTTTGGTTATCTAGGACATGGTCAAAGTAACTTTAT\n', +# 'TTATAATGAATTTTAGTTTTTTTTTTTTTTTCTCTTCCGATCTAATCAAGATTGAGAGGA\n', +# 'ATTGCGCCTTGGCAAGACAATTTTATTTTATCTGATTTTAATGTTGCTGGTCTATTTCTT\n', +# 'TGGTTATCTAGGACATGGTCAAAGTAACTTTATTTATAATGAATTTTAGTTTTTTTTTTT\n', +# 'TTTT\n'] +# +# correct_gffs.annotate_refound_genes(gff_name, gene_data_dict, tmp_folder_path, corrected_gff_out_dir, self.logger) +# +# with open('TestAnnotateRefoundGenomes/reannotate_gff_corrected.gff', 'r') as added_gff: +# self.assertEqual(expected_lines, added_gff.readlines()) +# +# def test_gene_not_found(self): +# gff_name = 'TestAnnotateRefoundGenomes/reannotate_gff.gff' +# gene_data_dict = {'reannotate_gff': {'0_refound_0': [ +# 'CCCCCCCCCCCCGGGGGGGGGGGGGGGCGGCGCGCGCGCGCGCGGCGCGCGCGGCGCGC', +# 'gene_name', 'gene_function']}} +# +# tmp_folder_path = 'TestAnnotateRefoundGenomes' +# corrected_gff_out_dir = 'TestAnnotateRefoundGenomes' +# +# with self.assertRaises(SystemExit): +# correct_gffs.annotate_refound_genes(gff_name, gene_data_dict, tmp_folder_path, corrected_gff_out_dir, self.logger) +# +# +# class TestOpeningFileToGenerator(unittest.TestCase): +# def test_opening_refular_file(self): +# input_file_path = 'TestOpeningFileToGenerator/test_text_file.txt.gz' +# +# expected_output = 
['contig_1\t.\tCDS\t1\t90\t.\t.\t.\tID=Silas_the_Salmonella_tag-1-1;locus_tag=Silas_the_Salmonella_tag-1-1\n', +# '##FASTA\n', +# '>contig_1\n', +# 'NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN\n', +# 'NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN\n'] +# +# return_generator = gff_parser.open_file_generator(input_file_path) +# +# given_output = [] +# for line in return_generator: +# given_output.append(line) +# +# self.assertEqual(expected_output, given_output) +# +# def test_opening_gzipped(self): +# input_file_path = 'TestOpeningFileToGenerator/test_text_file.txt' +# +# expected_output = ['##gff version\n', +# 'contig_1\t.\tCDS\t1\t90\t.\t.\t.\tID=Silas_the_Salmonella_tag-1-1;locus_tag=Silas_the_Salmonella_tag-1-1\n', +# '##FASTA\n', +# '>contig_1\n', +# 'NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN\n', +# 'NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN\n'] +# +# return_generator = gff_parser.open_file_generator(input_file_path) +# +# given_output = [] +# for line in return_generator: +# given_output.append(line) +# +# self.assertEqual(expected_output, given_output) class TestExtractGenomeFasta(unittest.TestCase): @@ -1277,6 +1278,7 @@ def test_multiple_wrapped_contigs_gz(self): self.assertEqual(expected_dict, return_dict) + class TestRecordCoreCoreRegion(unittest.TestCase): """ Test function that is used to record information of a region identified between two core genes. 
From 820a3966855f3a6a4dc4223dbb5ffac576feea29 Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Mon, 20 Jun 2022 10:43:31 +0200 Subject: [PATCH 128/135] Remove additional tests and change expected help function --- functional_tests/Corekaburra-test.sh | 76 ++++++++++---------- functional_tests/test_data/no_input.expected | 2 +- 2 files changed, 39 insertions(+), 39 deletions(-) diff --git a/functional_tests/Corekaburra-test.sh b/functional_tests/Corekaburra-test.sh index 3e80ff1..933c646 100755 --- a/functional_tests/Corekaburra-test.sh +++ b/functional_tests/Corekaburra-test.sh @@ -391,13 +391,13 @@ test_output_file test_out_folder/low_frequency_gene_placement.tsv Fragmented_cor test_output_file test_out_folder/core_pair_summary.csv Fragmented_core_gene_break_run_expected/core_pair_summary.csv.expected rm -r test_out_folder -call_new_test "Test with part of fragmented gene being a refound gene" -Corekaburra -ig complete_genome_single_chrom.gff genome_single_chrom_larger_refound.gff genome_single_chrom_larger_rearrange.gff complete_genome_single_chrom_2.gff -ip fragmented_refound_core_gene/ -o test_out_folder/ > /dev/null 2>&1 -test_output_file test_out_folder/core_core_accessory_gene_content.tsv fragmented_refound_core_gene_expected/core_core_accessory_gene_content.tsv.expected -test_output_file test_out_folder/low_frequency_gene_placement.tsv fragmented_refound_core_gene_expected/low_frequency_gene_placement.tsv.expected -test_output_file test_out_folder/core_pair_summary.csv fragmented_refound_core_gene_expected/core_pair_summary.csv.expected -test_output_file test_out_folder/Corrected_gff_files/genome_single_chrom_larger_refound_corrected.gff fragmented_refound_core_gene_expected/Corrected_gff_files/genome_single_chrom_larger_refound_corrected.gff.expected -rm -r test_out_folder +#call_new_test "Test with part of fragmented gene being a refound gene" +#Corekaburra -ig complete_genome_single_chrom.gff 
genome_single_chrom_larger_refound.gff genome_single_chrom_larger_rearrange.gff complete_genome_single_chrom_2.gff -ip fragmented_refound_core_gene/ -o test_out_folder/ > /dev/null 2>&1 +#test_output_file test_out_folder/core_core_accessory_gene_content.tsv fragmented_refound_core_gene_expected/core_core_accessory_gene_content.tsv.expected +#test_output_file test_out_folder/low_frequency_gene_placement.tsv fragmented_refound_core_gene_expected/low_frequency_gene_placement.tsv.expected +#test_output_file test_out_folder/core_pair_summary.csv fragmented_refound_core_gene_expected/core_pair_summary.csv.expected +#test_output_file test_out_folder/Corrected_gff_files/genome_single_chrom_larger_refound_corrected.gff fragmented_refound_core_gene_expected/Corrected_gff_files/genome_single_chrom_larger_refound_corrected.gff.expected +#rm -r test_out_folder call_new_test "Test for accessory genes being fragmented" Corekaburra -ig complete_genome_single_chrom.gff genome_single_chrom_larger.gff genome_single_chrom_larger_rearrange.gff complete_genome_single_chrom_2.gff -ip Fragmented_accessory_gene_run/ -o test_out_folder/ > /dev/null 2>&1 @@ -423,37 +423,37 @@ test_output_file test_out_folder/core_pair_summary.csv Coreless_contig_complete_ test_output_file test_out_folder/coreless_contig_accessory_gene_content.tsv Coreless_contig_complete_expected/coreless_contig_accessory_gene_content.tsv.expected rm -r test_out_folder -call_new_test "Test with a genome that have been corrected and one that have not (Resume run)" -Corekaburra -ig genome_single_chrom_larger_refound_2.gff genome_single_chrom_larger_refound.gff genome_single_chrom_larger_rearrange.gff complete_genome_single_chrom_2.gff -ip Resume_refound_run_fragment/ -o Resume_refound_run_fragment/ > /dev/null 2>&1 -test_output_file Resume_refound_run_fragment/core_core_accessory_gene_content.tsv Resume_refound_run_fragment/core_core_accessory_gene_content.tsv.expected -test_output_file 
Resume_refound_run_fragment/low_frequency_gene_placement.tsv Resume_refound_run_fragment/low_frequency_gene_placement.tsv.expected -test_output_file Resume_refound_run_fragment/core_pair_summary.csv Resume_refound_run_fragment/core_pair_summary.csv.expected -test_output_file Resume_refound_run_fragment/Corrected_gff_files/genome_single_chrom_larger_refound_2_corrected.gff Resume_refound_run_fragment/Corrected_gff_files/genome_single_chrom_larger_refound_2_corrected.gff.expected -rm Resume_refound_run_fragment/Corrected_gff_files/genome_single_chrom_larger_refound_2_corrected.gff -rm Resume_refound_run_fragment/low_frequency_gene_placement.tsv -rm Resume_refound_run_fragment/core_core_accessory_gene_content.tsv -rm Resume_refound_run_fragment/core_pair_summary.csv -rm Resume_refound_run_fragment/Corekaburra.log - -call_new_test "Test with all genomes that have been corrected (Resume run)" -Corekaburra -ig genome_single_chrom_larger_refound_2.gff genome_single_chrom_larger_refound.gff genome_single_chrom_larger_rearrange.gff complete_genome_single_chrom_2.gff -ip Resume_all_found_gene_refound_run_fragment/ -o Resume_all_found_gene_refound_run_fragment/ > /dev/null 2>&1 -test_output_file Resume_all_found_gene_refound_run_fragment/core_core_accessory_gene_content.tsv Resume_all_found_gene_refound_run_fragment/core_core_accessory_gene_content.tsv.expected -test_output_file Resume_all_found_gene_refound_run_fragment/low_frequency_gene_placement.tsv Resume_all_found_gene_refound_run_fragment/low_frequency_gene_placement.tsv.expected -test_output_file Resume_all_found_gene_refound_run_fragment/core_pair_summary.csv Resume_all_found_gene_refound_run_fragment/core_pair_summary.csv.expected -rm Resume_all_found_gene_refound_run_fragment/low_frequency_gene_placement.tsv -rm Resume_all_found_gene_refound_run_fragment/core_core_accessory_gene_content.tsv -rm Resume_all_found_gene_refound_run_fragment/core_pair_summary.csv -rm 
Resume_all_found_gene_refound_run_fragment/Corekaburra.log - -call_new_test "Test recognition of corrected gff files in output folder (Resume run)" -Corekaburra -ig genome_single_chrom_larger_refound_2.gff genome_single_chrom_larger_refound.gff genome_single_chrom_larger_rearrange.gff complete_genome_single_chrom_2.gff -ip Resume_refound_gene/ -o Resume_refound_gene/ > /dev/null 2>&1 -test_output_file Resume_refound_gene/core_core_accessory_gene_content.tsv Resume_refound_gene/core_core_accessory_gene_content.tsv.expected -test_output_file Resume_refound_gene/low_frequency_gene_placement.tsv Resume_refound_gene/low_frequency_gene_placement.tsv.expected -test_output_file Resume_refound_gene/core_pair_summary.csv Resume_refound_gene/core_pair_summary.csv.expected -rm Resume_refound_gene/low_frequency_gene_placement.tsv -rm Resume_refound_gene/core_core_accessory_gene_content.tsv -rm Resume_refound_gene/core_pair_summary.csv -rm Resume_refound_gene/Corekaburra.log +#call_new_test "Test with a genome that have been corrected and one that have not (Resume run)" +#Corekaburra -ig genome_single_chrom_larger_refound_2.gff genome_single_chrom_larger_refound.gff genome_single_chrom_larger_rearrange.gff complete_genome_single_chrom_2.gff -ip Resume_refound_run_fragment/ -o Resume_refound_run_fragment/ > /dev/null 2>&1 +#test_output_file Resume_refound_run_fragment/core_core_accessory_gene_content.tsv Resume_refound_run_fragment/core_core_accessory_gene_content.tsv.expected +#test_output_file Resume_refound_run_fragment/low_frequency_gene_placement.tsv Resume_refound_run_fragment/low_frequency_gene_placement.tsv.expected +#test_output_file Resume_refound_run_fragment/core_pair_summary.csv Resume_refound_run_fragment/core_pair_summary.csv.expected +#test_output_file Resume_refound_run_fragment/Corrected_gff_files/genome_single_chrom_larger_refound_2_corrected.gff Resume_refound_run_fragment/Corrected_gff_files/genome_single_chrom_larger_refound_2_corrected.gff.expected +#rm 
Resume_refound_run_fragment/Corrected_gff_files/genome_single_chrom_larger_refound_2_corrected.gff +#rm Resume_refound_run_fragment/low_frequency_gene_placement.tsv +#rm Resume_refound_run_fragment/core_core_accessory_gene_content.tsv +#rm Resume_refound_run_fragment/core_pair_summary.csv +#rm Resume_refound_run_fragment/Corekaburra.log + +#call_new_test "Test with all genomes that have been corrected (Resume run)" +#Corekaburra -ig genome_single_chrom_larger_refound_2.gff genome_single_chrom_larger_refound.gff genome_single_chrom_larger_rearrange.gff complete_genome_single_chrom_2.gff -ip Resume_all_found_gene_refound_run_fragment/ -o Resume_all_found_gene_refound_run_fragment/ > /dev/null 2>&1 +#test_output_file Resume_all_found_gene_refound_run_fragment/core_core_accessory_gene_content.tsv Resume_all_found_gene_refound_run_fragment/core_core_accessory_gene_content.tsv.expected +#test_output_file Resume_all_found_gene_refound_run_fragment/low_frequency_gene_placement.tsv Resume_all_found_gene_refound_run_fragment/low_frequency_gene_placement.tsv.expected +#test_output_file Resume_all_found_gene_refound_run_fragment/core_pair_summary.csv Resume_all_found_gene_refound_run_fragment/core_pair_summary.csv.expected +#rm Resume_all_found_gene_refound_run_fragment/low_frequency_gene_placement.tsv +#rm Resume_all_found_gene_refound_run_fragment/core_core_accessory_gene_content.tsv +#rm Resume_all_found_gene_refound_run_fragment/core_pair_summary.csv +#rm Resume_all_found_gene_refound_run_fragment/Corekaburra.log + +#call_new_test "Test recognition of corrected gff files in output folder (Resume run)" +#Corekaburra -ig genome_single_chrom_larger_refound_2.gff genome_single_chrom_larger_refound.gff genome_single_chrom_larger_rearrange.gff complete_genome_single_chrom_2.gff -ip Resume_refound_gene/ -o Resume_refound_gene/ > /dev/null 2>&1 +#test_output_file Resume_refound_gene/core_core_accessory_gene_content.tsv 
Resume_refound_gene/core_core_accessory_gene_content.tsv.expected +#test_output_file Resume_refound_gene/low_frequency_gene_placement.tsv Resume_refound_gene/low_frequency_gene_placement.tsv.expected +#test_output_file Resume_refound_gene/core_pair_summary.csv Resume_refound_gene/core_pair_summary.csv.expected +#rm Resume_refound_gene/low_frequency_gene_placement.tsv +#rm Resume_refound_gene/core_core_accessory_gene_content.tsv +#rm Resume_refound_gene/core_pair_summary.csv +#rm Resume_refound_gene/Corekaburra.log # 3. End of testing - check if any errors occurrred if [ "$num_errors" -gt 0 ]; then diff --git a/functional_tests/test_data/no_input.expected b/functional_tests/test_data/no_input.expected index a2650f2..13a155e 100755 --- a/functional_tests/test_data/no_input.expected +++ b/functional_tests/test_data/no_input.expected @@ -1,4 +1,4 @@ -usage: __main__.py -ig file.gff [file.gff ...] -ip path/to/pan_genome +usage: Corekaburra -ig file.gff [file.gff ...] -ip path/to/pan_genome [-cg complete_genomes.txt] [-cc 1.0] [-lc 0.05] [-o path/to/output] [-p OUTPUT_PREFIX] [-c int] [-l | -q] [-h] [-v] From 12e60d121f58b80b8564c72d11d6622e162bda04 Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Mon, 20 Jun 2022 10:56:15 +0200 Subject: [PATCH 129/135] Delete obsolete functions and parts of code --- Corekaburra/__main__.py | 2 +- Corekaburra/gff_parser.py | 14 +- functional_tests/Corekaburra-test.sh | 57 ----- unit_tests/Corekaburra_test.py | 312 --------------------------- 4 files changed, 3 insertions(+), 382 deletions(-) diff --git a/Corekaburra/__main__.py b/Corekaburra/__main__.py index fa9c951..c3ada78 100755 --- a/Corekaburra/__main__.py +++ b/Corekaburra/__main__.py @@ -222,7 +222,7 @@ def main(): with concurrent.futures.ProcessPoolExecutor(max_workers=args.cpu) as executor: logger.info(f"------Start core region identification of given gff files-----\n") logger.info(f'{len(args.input_gffs)} GFF files to process') - results = 
[executor.submit(segment_genome_content, gff, core_dict, low_freq_dict, acc_gene_dict, comp_genomes) # ,source_program, args.annotate, gene_data_dict, corrected_dir, tmp_folder_path.name, args.discard_gffs, logger) + results = [executor.submit(segment_genome_content, gff, core_dict, low_freq_dict, acc_gene_dict, comp_genomes) for gff in args.input_gffs] for output in concurrent.futures.as_completed(results): diff --git a/Corekaburra/gff_parser.py b/Corekaburra/gff_parser.py index a7490f1..88b3d95 100755 --- a/Corekaburra/gff_parser.py +++ b/Corekaburra/gff_parser.py @@ -581,8 +581,8 @@ def segment_gff_content(gff_generator, core_genes, low_freq_genes, gff_path, acc low_freq_gene_content, master_info, coreless_contigs -def segment_genome_content(input_gff_file, core_genes, low_freq_genes, acc_gene_dict, complete_genomes): #, source_program, annotate, gene_data_dict, corrected_dir, tmp_folder_path, discard_corrected, logger): - """ +def segment_genome_content(input_gff_file, core_genes, low_freq_genes, acc_gene_dict, complete_genomes): + """ # TODO - Update parameters Single function segmenting the gff into core gene regions to be used for simple multi processing :param input_gff_file: File-path to the given gff file to be segmented :param core_genes: Dictionary over core genes @@ -605,13 +605,6 @@ def segment_genome_content(input_gff_file, core_genes, low_freq_genes, acc_gene_ :return complete_genomes: List of genomes given as complete by the user. """ - # Correct input gff file - # Add in the refound genes into the gff files and print the corrected GFF files. 
- # if source_program == "Panaroo" and annotate: - # check if not already corrected file and if any gene is to be inserted at all - # if "_corrected" not in input_gff_file and any([x in input_gff_file for x in list(gene_data_dict)]): - # input_gff_file = annotate_refound_genes(input_gff_file, gene_data_dict, tmp_folder_path, corrected_dir, logger) - gff_generator = parse_gff(input_gff_file) return_data = segment_gff_content(gff_generator=gff_generator, gff_path=input_gff_file, @@ -620,7 +613,4 @@ def segment_genome_content(input_gff_file, core_genes, low_freq_genes, acc_gene_ acc_genes=acc_gene_dict, complete_genomes=complete_genomes) - # if "_corrected" in input_gff_file and discard_corrected: - # os.remove(input_gff_file) - return return_data diff --git a/functional_tests/Corekaburra-test.sh b/functional_tests/Corekaburra-test.sh index 933c646..402a048 100755 --- a/functional_tests/Corekaburra-test.sh +++ b/functional_tests/Corekaburra-test.sh @@ -197,9 +197,6 @@ test_exit_status "$test_program -ig complete_genome_double_chrom.gff -ip Crash_p call_new_test "Test exit upon unsuccessful identification of source program" test_exit_status "$test_program -ig complete_genome_double_chrom.gff -ip Crash_pan_folder > /dev/null 2>&1" 1 -#call_new_test "Test exit upon unsuccessful identification of gene_data, when -a is not given for Panaroo" #TODO - likely remove when gffs are no longer corrected -#test_exit_status "$test_program -ig complete_genome_double_chrom.gff -ip Crash_panaroo_folder > /dev/null 2>&1" 1 - call_new_test "Test exit upon gff not found in pan is provided as input" test_exit_status "$test_program -ig complete_genome_single_chrom.gff complete_genome_double_chrom.gff -ip Crash_gff_folder > /dev/null 2>&1" 1 @@ -349,20 +346,6 @@ test_output_file test_out_folder/core_segments.csv Less_than_all_gffs_run_expect test_output_file test_out_folder/no_accessory_core_segments.csv Less_than_all_gffs_run_expected/no_accessory_core_segments.csv.expected rm -r 
test_out_folder -#call_new_test "Test unsuccessful reannotation of Panaroo" # TODO - remove after Panaroo reannotates gffs -#test_exit_status "$test_program -ig complete_genome_single_chrom.gff genome_single_chrom_larger.gff genome_single_chrom_larger_rearrange.gff complete_genome_single_chrom_2.gff -ip Reannotate_run_fail -o test_out_folder > /dev/null 2>&1" 3 -#rm -r test_out_folder - -#call_new_test "Test Panaroo input with correction of gff files" # TODO - remove after Panaroo reannotates gffs -#Corekaburra -ig complete_genome_single_chrom.gff genome_single_chrom_larger.gff genome_single_chrom_larger_rearrange.gff complete_genome_single_chrom_2.gff -ip Reannotate_run_succes/ -o test_out_folder/ > /dev/null 2>&1 -#test_output_file test_out_folder/core_core_accessory_gene_content.tsv Reannotation_sucessful_expected/core_core_accessory_gene_content.tsv.expected -#test_output_file test_out_folder/low_frequency_gene_placement.tsv Reannotation_sucessful_expected/low_frequency_gene_placement.tsv.expected -#test_output_file test_out_folder/core_pair_summary.csv Reannotation_sucessful_expected/core_pair_summary.csv.expected -#test_output_file test_out_folder/Corrected_gff_files/complete_genome_single_chrom_2_corrected.gff Reannotation_sucessful_expected/Corrected_gff_files/complete_genome_single_chrom_2_corrected.gff.expected -#test_output_file test_out_folder/Corrected_gff_files/complete_genome_single_chrom_corrected.gff Reannotation_sucessful_expected/Corrected_gff_files/complete_genome_single_chrom_corrected.gff.expected -#test_output_file test_out_folder/Corrected_gff_files/genome_single_chrom_larger_rearrange_corrected.gff Reannotation_sucessful_expected/Corrected_gff_files/genome_single_chrom_larger_rearrange_corrected.gff.expected -#rm -r test_out_folder - call_new_test "Test with a single core gene on a contig that is not complete" Corekaburra -ig complete_genome_single_chrom.gff complete_genome_double_chrom.gff -ip Single_core_contig/ -o test_out_folder/ > 
/dev/null 2>&1 test_output_file test_out_folder/core_core_accessory_gene_content.tsv single_core_contig_draft_expected/core_core_accessory_gene_content.tsv.expected @@ -391,14 +374,6 @@ test_output_file test_out_folder/low_frequency_gene_placement.tsv Fragmented_cor test_output_file test_out_folder/core_pair_summary.csv Fragmented_core_gene_break_run_expected/core_pair_summary.csv.expected rm -r test_out_folder -#call_new_test "Test with part of fragmented gene being a refound gene" -#Corekaburra -ig complete_genome_single_chrom.gff genome_single_chrom_larger_refound.gff genome_single_chrom_larger_rearrange.gff complete_genome_single_chrom_2.gff -ip fragmented_refound_core_gene/ -o test_out_folder/ > /dev/null 2>&1 -#test_output_file test_out_folder/core_core_accessory_gene_content.tsv fragmented_refound_core_gene_expected/core_core_accessory_gene_content.tsv.expected -#test_output_file test_out_folder/low_frequency_gene_placement.tsv fragmented_refound_core_gene_expected/low_frequency_gene_placement.tsv.expected -#test_output_file test_out_folder/core_pair_summary.csv fragmented_refound_core_gene_expected/core_pair_summary.csv.expected -#test_output_file test_out_folder/Corrected_gff_files/genome_single_chrom_larger_refound_corrected.gff fragmented_refound_core_gene_expected/Corrected_gff_files/genome_single_chrom_larger_refound_corrected.gff.expected -#rm -r test_out_folder - call_new_test "Test for accessory genes being fragmented" Corekaburra -ig complete_genome_single_chrom.gff genome_single_chrom_larger.gff genome_single_chrom_larger_rearrange.gff complete_genome_single_chrom_2.gff -ip Fragmented_accessory_gene_run/ -o test_out_folder/ > /dev/null 2>&1 test_output_file test_out_folder/core_core_accessory_gene_content.tsv Fragmented_accessory_gene_run_expected/core_core_accessory_gene_content.tsv.expected @@ -406,7 +381,6 @@ test_output_file test_out_folder/low_frequency_gene_placement.tsv Fragmented_acc test_output_file test_out_folder/core_pair_summary.csv 
Fragmented_accessory_gene_run_expected/core_pair_summary.csv.expected rm -r test_out_folder - call_new_test "Test with a core-less contig draft" Corekaburra -ig complete_genome_double_chrom_2.gff complete_genome_double_chrom.gff -ip Coreless_contig_run/ -o test_out_folder/ > /dev/null 2>&1 test_output_file test_out_folder/core_core_accessory_gene_content.tsv coreless_contig_draft_expected/core_core_accessory_gene_content.tsv.expected @@ -423,37 +397,6 @@ test_output_file test_out_folder/core_pair_summary.csv Coreless_contig_complete_ test_output_file test_out_folder/coreless_contig_accessory_gene_content.tsv Coreless_contig_complete_expected/coreless_contig_accessory_gene_content.tsv.expected rm -r test_out_folder -#call_new_test "Test with a genome that have been corrected and one that have not (Resume run)" -#Corekaburra -ig genome_single_chrom_larger_refound_2.gff genome_single_chrom_larger_refound.gff genome_single_chrom_larger_rearrange.gff complete_genome_single_chrom_2.gff -ip Resume_refound_run_fragment/ -o Resume_refound_run_fragment/ > /dev/null 2>&1 -#test_output_file Resume_refound_run_fragment/core_core_accessory_gene_content.tsv Resume_refound_run_fragment/core_core_accessory_gene_content.tsv.expected -#test_output_file Resume_refound_run_fragment/low_frequency_gene_placement.tsv Resume_refound_run_fragment/low_frequency_gene_placement.tsv.expected -#test_output_file Resume_refound_run_fragment/core_pair_summary.csv Resume_refound_run_fragment/core_pair_summary.csv.expected -#test_output_file Resume_refound_run_fragment/Corrected_gff_files/genome_single_chrom_larger_refound_2_corrected.gff Resume_refound_run_fragment/Corrected_gff_files/genome_single_chrom_larger_refound_2_corrected.gff.expected -#rm Resume_refound_run_fragment/Corrected_gff_files/genome_single_chrom_larger_refound_2_corrected.gff -#rm Resume_refound_run_fragment/low_frequency_gene_placement.tsv -#rm Resume_refound_run_fragment/core_core_accessory_gene_content.tsv -#rm 
Resume_refound_run_fragment/core_pair_summary.csv -#rm Resume_refound_run_fragment/Corekaburra.log - -#call_new_test "Test with all genomes that have been corrected (Resume run)" -#Corekaburra -ig genome_single_chrom_larger_refound_2.gff genome_single_chrom_larger_refound.gff genome_single_chrom_larger_rearrange.gff complete_genome_single_chrom_2.gff -ip Resume_all_found_gene_refound_run_fragment/ -o Resume_all_found_gene_refound_run_fragment/ > /dev/null 2>&1 -#test_output_file Resume_all_found_gene_refound_run_fragment/core_core_accessory_gene_content.tsv Resume_all_found_gene_refound_run_fragment/core_core_accessory_gene_content.tsv.expected -#test_output_file Resume_all_found_gene_refound_run_fragment/low_frequency_gene_placement.tsv Resume_all_found_gene_refound_run_fragment/low_frequency_gene_placement.tsv.expected -#test_output_file Resume_all_found_gene_refound_run_fragment/core_pair_summary.csv Resume_all_found_gene_refound_run_fragment/core_pair_summary.csv.expected -#rm Resume_all_found_gene_refound_run_fragment/low_frequency_gene_placement.tsv -#rm Resume_all_found_gene_refound_run_fragment/core_core_accessory_gene_content.tsv -#rm Resume_all_found_gene_refound_run_fragment/core_pair_summary.csv -#rm Resume_all_found_gene_refound_run_fragment/Corekaburra.log - -#call_new_test "Test recognition of corrected gff files in output folder (Resume run)" -#Corekaburra -ig genome_single_chrom_larger_refound_2.gff genome_single_chrom_larger_refound.gff genome_single_chrom_larger_rearrange.gff complete_genome_single_chrom_2.gff -ip Resume_refound_gene/ -o Resume_refound_gene/ > /dev/null 2>&1 -#test_output_file Resume_refound_gene/core_core_accessory_gene_content.tsv Resume_refound_gene/core_core_accessory_gene_content.tsv.expected -#test_output_file Resume_refound_gene/low_frequency_gene_placement.tsv Resume_refound_gene/low_frequency_gene_placement.tsv.expected -#test_output_file Resume_refound_gene/core_pair_summary.csv 
Resume_refound_gene/core_pair_summary.csv.expected -#rm Resume_refound_gene/low_frequency_gene_placement.tsv -#rm Resume_refound_gene/core_core_accessory_gene_content.tsv -#rm Resume_refound_gene/core_pair_summary.csv -#rm Resume_refound_gene/Corekaburra.log # 3. End of testing - check if any errors occurrred if [ "$num_errors" -gt 0 ]; then diff --git a/unit_tests/Corekaburra_test.py b/unit_tests/Corekaburra_test.py index 606746b..ce04a9d 100755 --- a/unit_tests/Corekaburra_test.py +++ b/unit_tests/Corekaburra_test.py @@ -842,318 +842,6 @@ def test_parsign_fragmented_gene_w_refound_component(self): self.assertEqual(expected_acc_gene_dict, acc_gene_dict) -class TestReadGeneData(unittest.TestCase): - """ Function to test the passing of gene_data.csv file from Panaroo """ - def test_read_file(self): - expected_dict = {'PY_40': {'0_refound_0': ['CTCTTCCGATCTAATCAAGATTGAGAGGAATTGCTGTTTTTATTGGCAAGACAATTTTATTTTATCTGATTTTAATGTTGCTGGTCTATTTCTTTGGTTATCTAGGACATGGTCAAAGTAACTTTATTTATAATGAATTTTAG', 'gene_name', 'gene_function'], - '0_refound_100': ['CTCTTCCGATCTAATCAAGATTGAGAGGAATTGCTTTTTTTTTTGGCAAGACAATTTTATTTTATCTGATTTTAATGTTGCTGGTCTATTTCTTTGGTTATCTAGGACATGGTCAAAGTAACTTTATTTATAATGAATTTTAG', 'gene_name', 'gene_function'], - '0_refound_10': ['CTCTTCCGATCTAATCAAGATTGAGAGGAATTGCGCCTTGGCAAGACAATTTTATTTTATCTGATTTTAATGTTGCTGGTCTATTTCTTTGGTTATCTAGGACATGGTCAAAGTAACTTTATTTATAATGAATTTTAG', 'gene_name', 'gene_function']}, - 'PY_41': {'0_refound_1': ['ATGTTGTAGGAAAATACTTGGAAGAATACGTTGACAGGGGTATTTTTGATAAGGAGCCGTTCCAGACCTTTGATCAGAAAGGGATTGGCCGTCTCTTTAGCCCGTTCGGTTAAGCCTGAGTTGAAACTTGGTATTTGTGGGGAACATGGTGGCGATCCTGCTTCCATTGACTTTTACCACAGCCAAGGCCTGACCTACGTTTCTTGTTCGCCATTTAGAGTGCCGCTTACTCGCTTGGCGGCTGCTCAGGCTGCCATCAAAGCTTCAGGCCACAGTCTTACCCAAGACAAATAG', 'gene_name', 'gene_function']}, - 'PY_42': {'0_refound_2': ['ATGTCACTACTGCATATTCATCACAATAAAAAAAAGACAATAGCCCTAATCGTGCTATTGTCTCAAAATCATTTATTTACTTGAAACTTTATCGTGTTACACCAACAGTTTAA', 'gene_name', 'gene_function']}, - 'PY_43': {'0_refound_4': 
['ATGAAACGCTATCAACAAGATGCCCTGCTTTTCAAAAAAAATAGATAAAGAAAAGGCTGCGACAGTATCTGCAAGCAGGGCAAAAGAACTAGAAGATAGGCTCAGTCATCAGCCATTAATTGATGATTATCGAGAAAAGATGCAAGATGCAAGATGCAAGTGATGTGACTCAGTATATCACCAAACGTATAGAAGATCAGTTAAACAAGGAGTTAACAAATGGCAAAAACTAA', 'gene_name', 'gene_function']}} - - return_dict = correct_gffs.read_gene_data('TestReadGeneData/Mock_gene_data.csv') - - self.assertEqual(expected_dict, return_dict) - -# TODO - likely obsolete after addition of Panaroo functions -class TestPrepairForReannotation(unittest.TestCase): - """ Test for pre-pairing a folder for corrected genomes, and testing if any are present from previous runs """ - @classmethod - def setUpClass(cls): - cls.logger = logging.getLogger('test_logger.log') - cls.logger.setLevel(logging.INFO) - - def tearDown(self): - try: - """ Class to remove created corrected output folder""" - os.rmdir('TestPrepairForReannotation/Corrected_gff_files') - except FileNotFoundError: - pass - - def test_no_files_annotated(self): - input_gffs = ['Mock_1.gff', 'Mock_2.gff'] - gene_data_dict_return, \ - corrected_gff_out_dir_return, \ - corrected_files_return = correct_gffs.prepair_for_reannotation('TestPrepairForReannotation/Mock_gene_data.csv', - 'TestPrepairForReannotation/', - input_gffs, self.logger) - - self.assertTrue(os.path.isdir('TestPrepairForReannotation/Corrected_gff_files')) - self.assertEqual(input_gffs, corrected_files_return) - - def test_some_files_annotated(self): - input_gffs = ['mock/test/path/Mock_1.gff', 'mock/test/path/Mock_2.gff', 'Mocky/mock/mock/path/Mock_3.gff'] - gene_data_dict_return, corrected_gff_out_dir_return, corrected_files_return = correct_gffs.prepair_for_reannotation( - 'TestPrepairForReannotation/Mock_gene_data.csv', - 'TestPrepairForReannotation/Some_genomes', - input_gffs, self.logger) - - expected_gffs = ['mock/test/path/Mock_2.gff', - 'Mocky/mock/mock/path/Mock_3.gff', - 'TestPrepairForReannotation/Some_genomes/Corrected_gff_files/Mock_1_corrected.gff'] - - 
self.assertEqual(expected_gffs, corrected_files_return) - - # def test_all_files_annotated(self): - # input_gffs = ['Mock_1.gff', 'Mock_2.gff'] - # gene_data_dict_return, corrected_gff_out_dir_return, corrected_files_return = correct_gffs.prepair_for_reannotation( - # 'TestPrepairForReannotation/Mock_gene_data.csv', - # 'TestPrepairForReannotation/All_genomes', - # input_gffs, self.logger) - # - # expected_gffs = ['TestPrepairForReannotation/All_genomes/Corrected_gff_files/Mock_1_corrected.gff', - # 'TestPrepairForReannotation/All_genomes/Corrected_gff_files/Mock_2_corrected.gff'] - # - # self.assertEqual(expected_gffs, corrected_files_return) - -# TODO - likely obsolete after addition of Panaroo functions -class TestAddGeneToGff(unittest.TestCase): - """ - Test of the function used to add a gene annotation (line) to a gff file - """ - # Make a setup and a teardown that copies and renames the mock file - def setUp(self): - """ Class to copy the mock gff before modifying""" - copyfile('TestAddGeneToGff/mocky_test_gff.gff', 'TestAddGeneToGff/mocky_test_gff.gff_copy') - - def tearDown(self): - """ Class to remove modified gff and rename the original""" - os.remove('TestAddGeneToGff/mocky_test_gff.gff') - os.rename('TestAddGeneToGff/mocky_test_gff.gff_copy', 'TestAddGeneToGff/mocky_test_gff.gff') - - def test_adding_a_gene_no_info(self): - tmp_gff_file = 'TestAddGeneToGff/mocky_test_gff.gff' - gene_oi = ['TATA', '', ''] - genome_oi = 'CCCCCCCCCCCCTATACCCCCCCC' - contig = 'test_contig_1' - strand = '+' - refound_gene_tag = '0_refound_0' - largest_locus_tag = 'fer_1432' - - expected_lines = ['##gff-version 3\n', '#test comment line\n', 'test_contig_1\tPanaroo\tCDS\t13\t16\t.\t+\t0\tID=fer_1433;locus_tag=fer_1433;old_locus_tag=0_refound_0\n'] - - with open(tmp_gff_file, 'a') as tmp_gff: - correct_gffs.add_gene_to_gff(tmp_gff, gene_oi[0], genome_oi, contig, strand, refound_gene_tag, gene_oi[1:], largest_locus_tag) - - with open('TestAddGeneToGff/mocky_test_gff.gff', 'r') 
as added_gff: - self.assertEqual(expected_lines, added_gff.readlines()) - - def test_adding_a_gene_name(self): - tmp_gff_file = 'TestAddGeneToGff/mocky_test_gff.gff' - gene_oi = ['TATA', 'Gene_name', ''] - genome_oi = 'CCCCCCCCCCCCTATACCCCCCCC' - contig = 'test_contig_1' - strand = '+' - refound_gene_tag = '0_refound_0' - largest_locus_tag = 'fer_1432' - - expected_lines = ['##gff-version 3\n', '#test comment line\n', 'test_contig_1\tPanaroo\tCDS\t13\t16\t.\t+\t0\tID=fer_1433;locus_tag=fer_1433;old_locus_tag=0_refound_0;name=Gene_name\n'] - - with open(tmp_gff_file, 'a') as tmp_gff: - correct_gffs.add_gene_to_gff(tmp_gff, gene_oi[0], genome_oi, contig, strand, refound_gene_tag, gene_oi[1:], largest_locus_tag) - - with open('TestAddGeneToGff/mocky_test_gff.gff', 'r') as added_gff: - self.assertEqual(expected_lines, added_gff.readlines()) - - def test_adding_a_gene_annotation(self): - tmp_gff_file = 'TestAddGeneToGff/mocky_test_gff.gff' - gene_oi = ['TATA', '', 'Gene_annotation'] - genome_oi = 'CCCCCCCCCCCCTATACCCCCCCC' - contig = 'test_contig_1' - strand = '+' - refound_gene_tag = '0_refound_0' - largest_locus_tag = 'fer_1432' - - expected_lines = ['##gff-version 3\n', '#test comment line\n', 'test_contig_1\tPanaroo\tCDS\t13\t16\t.\t+\t0\tID=fer_1433;locus_tag=fer_1433;old_locus_tag=0_refound_0;annotation=Gene_annotation\n'] - - with open(tmp_gff_file, 'a') as tmp_gff: - correct_gffs.add_gene_to_gff(tmp_gff, gene_oi[0], genome_oi, contig, strand, refound_gene_tag, gene_oi[1:], largest_locus_tag) - - with open('TestAddGeneToGff/mocky_test_gff.gff', 'r') as added_gff: - self.assertEqual(expected_lines, added_gff.readlines()) - - def test_adding_a_gene_name_and_annotation(self): - tmp_gff_file = 'TestAddGeneToGff/mocky_test_gff.gff' - gene_oi = ['TATA', 'Gene_name', 'Gene_annotation'] - genome_oi = 'CCCCCCCCCCCCTATACCCCCCCC' - contig = 'test_contig_1' - strand = '+' - refound_gene_tag = '0_refound_0' - largest_locus_tag = 'fer_1432' - - expected_lines = ['##gff-version 
3\n', '#test comment line\n', 'test_contig_1\tPanaroo\tCDS\t13\t16\t.\t+\t0\tID=fer_1433;locus_tag=fer_1433;old_locus_tag=0_refound_0;name=Gene_name;annotation=Gene_annotation\n'] - - with open(tmp_gff_file, 'a') as tmp_gff: - correct_gffs.add_gene_to_gff(tmp_gff, gene_oi[0], genome_oi, contig, strand, refound_gene_tag, gene_oi[1:], largest_locus_tag) - - with open('TestAddGeneToGff/mocky_test_gff.gff', 'r') as added_gff: - self.assertEqual(expected_lines, added_gff.readlines()) - -# TODO - likely obsolete after addition of Panaroo functions -# class TestWriteContig(unittest.TestCase): -# """ -# Test of the function used to write a contig in a gff file. -# """ - # Make a setup and a teardown that copies and renames the mock file - # def setUp(self): - # """ Class to copy the mock gff before modifying""" - # copyfile('TestWriteContig/mocky_test_gff.gff', 'TestWriteContig/mocky_test_gff.gff_copy') - # - # def tearDown(self): - # """ Class to remove modified gff and rename the original""" - # os.remove('TestWriteContig/mocky_test_gff.gff') - # os.rename('TestWriteContig/mocky_test_gff.gff_copy', 'TestWriteContig/mocky_test_gff.gff') - # - # def test_writing_a_contig(self): - # file_path = 'TestWriteContig/mocky_test_gff.gff' - # contig_name = 'Test_contig_name space' - # sequence = 'AAATAAATGGGCGGGCAAATAAATGGGCGGGCAAATAAATGGGCGGGCAAATAAATGGGCGGGCAAATAAATGGGCGGGCAAATAAATGGGCGGGC' - # - # expected_lines = ['##gff-version 3\n', '#test comment line\n', '>Test_contig_name space\n', - # 'AAATAAATGGGCGGGCAAATAAATGGGCGGGCAAATAAATGGGCGGGCAAATAAATGGGC\n', - # 'GGGCAAATAAATGGGCGGGCAAATAAATGGGCGGGC\n'] - # - # with open(file_path, 'a') as file: - # correct_gffs.write_contig(file, contig_name, sequence) - # - # with open('TestWriteContig/mocky_test_gff.gff', 'r') as added_gff: - # self.assertEqual(expected_lines, added_gff.readlines()) - - -# TODO - likely obsolete after addition of Panaroo functions -# class TestAnnotateRefoundGenomes(unittest.TestCase): -# """ -# Test of the 
function used to reannotate refound genes identified by panaroo in a gff file. -# """ -# @classmethod -# def setUpClass(cls): -# cls.logger = logging.getLogger('test_logger.log') -# cls.logger.setLevel(logging.INFO) -# -# def tearDown(self): -# """ Class to remove modified gff and rename the original""" -# try: -# os.remove('TestAnnotateRefoundGenomes/reannotate_gff_corrected.gff') -# except FileNotFoundError: -# os.remove('TestAnnotateRefoundGenomes/reannotate_gff_tmp.gff') -# os.remove('TestAnnotateRefoundGenomes/reannotate_gff.gff_db') -# -# def test_annotation_of_pos_stand_gene(self): -# gff_name = 'TestAnnotateRefoundGenomes/reannotate_gff.gff' -# gene_data_dict = {'reannotate_gff': {'0_refound_0': ['CTCTTCCGATCTAATCAAGATTGAGAGGAATTGCTGTTTTTATTGGCAAGACAATTTTATTTTATCTGATTTTAATGTTGCTGGTCTATTTCTTTGGTTATCTAGGACATGGTCAAAGTAACTTTATTTATAATGAATTTTAG', 'gene_name', 'gene_function'], -# '0_refound_100': ['CTCTTCCGATCTAATCAAGATTGAGAGGAATTGCTTTTTTTTTTGGCAAGACAATTTTATTTTATCTGATTTTAATGTTGCTGGTCTATTTCTTTGGTTATCTAGGACATGGTCAAAGTAACTTTATTTATAATGAATTTTAG', '', 'gene_function'], -# '0_refound_10': ['CTCTTCCGATCTAATCAAGATTGAGAGGAATTGCGCCTTGGCAAGACAATTTTATTTTATCTGATTTTAATGTTGCTGGTCTATTTCTTTGGTTATCTAGGACATGGTCAAAGTAACTTTATTTATAATGAATTTTAG', 'gene_name', '']}} -# tmp_folder_path = 'TestAnnotateRefoundGenomes' -# corrected_gff_out_dir = 'TestAnnotateRefoundGenomes' -# -# expected_lines = \ -# ['##gff-version 3\n', -# '#test comment line\n', -# 'test_contig\tProkka\tCDS\t1\t10\t.\t+\t0\tlocus_tag=locus_tag_0097\n', -# 'test_contig\tPanaroo\tCDS\t16\t158\t.\t+\t0\tID=locus_tag_0099;locus_tag=locus_tag_0099;old_locus_tag=0_refound_0;name=gene_name;annotation=gene_function\n', -# 'test_contig\tPanaroo\tCDS\t174\t316\t.\t+\t0\tID=locus_tag_0100;locus_tag=locus_tag_0100;old_locus_tag=0_refound_100;annotation=gene_function\n', -# 'test_contig\tPanaroo\tCDS\t332\t469\t.\t+\t0\tID=locus_tag_0101;locus_tag=locus_tag_0101;old_locus_tag=0_refound_10;name=gene_name\n', -# 
'test_contig\tProkka\tCDS\t474\t484\t.\t+\t0\tlocus_tag=locus_tag_0098\n', -# '##FASTA\n', -# '>test_contig\n', -# 'TTTTTTTTTTTTTTTCTCTTCCGATCTAATCAAGATTGAGAGGAATTGCTGTTTTTATTG\n', -# 'GCAAGACAATTTTATTTTATCTGATTTTAATGTTGCTGGTCTATTTCTTTGGTTATCTAG\n', -# 'GACATGGTCAAAGTAACTTTATTTATAATGAATTTTAGTTTTTTTTTTTTTTTCTCTTCC\n', -# 'GATCTAATCAAGATTGAGAGGAATTGCTTTTTTTTTTGGCAAGACAATTTTATTTTATCT\n', -# 'GATTTTAATGTTGCTGGTCTATTTCTTTGGTTATCTAGGACATGGTCAAAGTAACTTTAT\n', -# 'TTATAATGAATTTTAGTTTTTTTTTTTTTTTCTCTTCCGATCTAATCAAGATTGAGAGGA\n', -# 'ATTGCGCCTTGGCAAGACAATTTTATTTTATCTGATTTTAATGTTGCTGGTCTATTTCTT\n', -# 'TGGTTATCTAGGACATGGTCAAAGTAACTTTATTTATAATGAATTTTAGTTTTTTTTTTT\n', -# 'TTTT\n' -# ] -# -# correct_gffs.annotate_refound_genes(gff_name, gene_data_dict, tmp_folder_path, corrected_gff_out_dir, self.logger) -# -# with open('TestAnnotateRefoundGenomes/reannotate_gff_corrected.gff', 'r') as added_gff: -# self.assertEqual(expected_lines, added_gff.readlines()) -# -# def test_annotation_of_neg_stand_gene(self): -# gff_name = 'TestAnnotateRefoundGenomes/reannotate_gff.gff' -# gene_data_dict = {'reannotate_gff': {'0_refound_0': ['CTAAAATTCATTATAAATAAAGTTACTTTGACCATGTCCTAGATAACCAAAGAAATAGACCAGCAACATTAAAATCAGATAAAATAAAATTGTCTTGCCAATAAAAACAGCAATTCCTCTCAATCTTGATTAGATCGGAAGAG', 'gene_name', 'gene_function'], -# '0_refound_100': ['CTAAAATTCATTATAAATAAAGTTACTTTGACCATGTCCTAGATAACCAAAGAAATAGACCAGCAACATTAAAATCAGATAAAATAAAATTGTCTTGCCAAAAAAAAAAGCAATTCCTCTCAATCTTGATTAGATCGGAAGAG', '', 'gene_function'], -# '0_refound_10': ['CTAAAATTCATTATAAATAAAGTTACTTTGACCATGTCCTAGATAACCAAAGAAATAGACCAGCAACATTAAAATCAGATAAAATAAAATTGTCTTGCCAAGGCGCAATTCCTCTCAATCTTGATTAGATCGGAAGAG', 'gene_name', '']}} -# tmp_folder_path = 'TestAnnotateRefoundGenomes' -# corrected_gff_out_dir = 'TestAnnotateRefoundGenomes' -# expected_lines = ['##gff-version 3\n', -# '#test comment line\n', -# 'test_contig\tProkka\tCDS\t1\t10\t.\t+\t0\tlocus_tag=locus_tag_0097\n', -# 
'test_contig\tPanaroo\tCDS\t16\t158\t.\t-\t0\tID=locus_tag_0099;locus_tag=locus_tag_0099;old_locus_tag=0_refound_0;name=gene_name;annotation=gene_function\n', -# 'test_contig\tPanaroo\tCDS\t174\t316\t.\t-\t0\tID=locus_tag_0100;locus_tag=locus_tag_0100;old_locus_tag=0_refound_100;annotation=gene_function\n', -# 'test_contig\tPanaroo\tCDS\t332\t469\t.\t-\t0\tID=locus_tag_0101;locus_tag=locus_tag_0101;old_locus_tag=0_refound_10;name=gene_name\n', -# 'test_contig\tProkka\tCDS\t474\t484\t.\t+\t0\tlocus_tag=locus_tag_0098\n', -# '##FASTA\n', -# '>test_contig\n', -# 'TTTTTTTTTTTTTTTCTCTTCCGATCTAATCAAGATTGAGAGGAATTGCTGTTTTTATTG\n', -# 'GCAAGACAATTTTATTTTATCTGATTTTAATGTTGCTGGTCTATTTCTTTGGTTATCTAG\n', -# 'GACATGGTCAAAGTAACTTTATTTATAATGAATTTTAGTTTTTTTTTTTTTTTCTCTTCC\n', -# 'GATCTAATCAAGATTGAGAGGAATTGCTTTTTTTTTTGGCAAGACAATTTTATTTTATCT\n', -# 'GATTTTAATGTTGCTGGTCTATTTCTTTGGTTATCTAGGACATGGTCAAAGTAACTTTAT\n', -# 'TTATAATGAATTTTAGTTTTTTTTTTTTTTTCTCTTCCGATCTAATCAAGATTGAGAGGA\n', -# 'ATTGCGCCTTGGCAAGACAATTTTATTTTATCTGATTTTAATGTTGCTGGTCTATTTCTT\n', -# 'TGGTTATCTAGGACATGGTCAAAGTAACTTTATTTATAATGAATTTTAGTTTTTTTTTTT\n', -# 'TTTT\n'] -# -# correct_gffs.annotate_refound_genes(gff_name, gene_data_dict, tmp_folder_path, corrected_gff_out_dir, self.logger) -# -# with open('TestAnnotateRefoundGenomes/reannotate_gff_corrected.gff', 'r') as added_gff: -# self.assertEqual(expected_lines, added_gff.readlines()) -# -# def test_gene_not_found(self): -# gff_name = 'TestAnnotateRefoundGenomes/reannotate_gff.gff' -# gene_data_dict = {'reannotate_gff': {'0_refound_0': [ -# 'CCCCCCCCCCCCGGGGGGGGGGGGGGGCGGCGCGCGCGCGCGCGGCGCGCGCGGCGCGC', -# 'gene_name', 'gene_function']}} -# -# tmp_folder_path = 'TestAnnotateRefoundGenomes' -# corrected_gff_out_dir = 'TestAnnotateRefoundGenomes' -# -# with self.assertRaises(SystemExit): -# correct_gffs.annotate_refound_genes(gff_name, gene_data_dict, tmp_folder_path, corrected_gff_out_dir, self.logger) -# -# -# class TestOpeningFileToGenerator(unittest.TestCase): -# def 
test_opening_refular_file(self): -# input_file_path = 'TestOpeningFileToGenerator/test_text_file.txt.gz' -# -# expected_output = ['contig_1\t.\tCDS\t1\t90\t.\t.\t.\tID=Silas_the_Salmonella_tag-1-1;locus_tag=Silas_the_Salmonella_tag-1-1\n', -# '##FASTA\n', -# '>contig_1\n', -# 'NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN\n', -# 'NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN\n'] -# -# return_generator = gff_parser.open_file_generator(input_file_path) -# -# given_output = [] -# for line in return_generator: -# given_output.append(line) -# -# self.assertEqual(expected_output, given_output) -# -# def test_opening_gzipped(self): -# input_file_path = 'TestOpeningFileToGenerator/test_text_file.txt' -# -# expected_output = ['##gff version\n', -# 'contig_1\t.\tCDS\t1\t90\t.\t.\t.\tID=Silas_the_Salmonella_tag-1-1;locus_tag=Silas_the_Salmonella_tag-1-1\n', -# '##FASTA\n', -# '>contig_1\n', -# 'NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN\n', -# 'NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN\n'] -# -# return_generator = gff_parser.open_file_generator(input_file_path) -# -# given_output = [] -# for line in return_generator: -# given_output.append(line) -# -# self.assertEqual(expected_output, given_output) - - class TestExtractGenomeFasta(unittest.TestCase): def test_extract_genome_fasta(self): genome_fasta_dict_expected = {'contig} From 5b58f3e2019af254357071fc23158e9386fc167e Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Mon, 20 Jun 2022 13:13:57 +0200 Subject: [PATCH 130/135] Remove redundant code and tests --- Corekaburra/__main__.py | 23 +--- Corekaburra/gff_parser.py | 24 ++-- Corekaburra/parse_gene_presence_absence.py | 9 +- unit_tests/Corekaburra_test.py | 126 ++------------------- 4 files changed, 30 insertions(+), 152 deletions(-) diff --git a/Corekaburra/__main__.py 
b/Corekaburra/__main__.py index c3ada78..2c03bb6 100755 --- a/Corekaburra/__main__.py +++ b/Corekaburra/__main__.py @@ -162,13 +162,6 @@ def main(): # Check if Panaroo or Roary input folder is given source_program, input_pres_abs_file_path = define_pangenome_program(args.input_pan, logger) - # Check if gene_data file is present if Panaroo input is given an gffs should be annotated - # TODO Likely not needed anymore with new implementations in Panaroo - # if args.annotate and source_program == 'Panaroo': - # gene_data_path = check_gene_data(args.input_pan, logger) - # else: - # gene_data_path = None - # Check that all GFF files given can be found in the pan-genome check_gff_in_pan(args.input_gffs, input_pres_abs_file_path, logger) @@ -180,24 +173,13 @@ def main(): ## Read in gene presence absence file time_start_read_files = time.time() - # Prepair folder for reannotated genes and examine if any are already present - # TODO - likely not required after new implementations in Panaroo. - # if source_program == "Panaroo" and args.annotate: - # gene_data_dict, corrected_dir, args.input_gffs = prepair_for_reannotation(gene_data_path, args.output_path, - # args.input_gffs, logger) - # else: - # gene_data_dict = None - # corrected_dir = None - # TODO - remove if script works without reannotation - gene_data_dict = None - corrected_dir = None # TODO - Some day it would be awesome to be able to provide a clustering/population structure which could divide genes into the 13 definitions outlined by Horesh et al. [DOI: 10.1099/mgen.0.000670] # TODO - Add in so that the user can give a list of genes that they wish to use as 'core genes' core_dict, low_freq_dict, acc_gene_dict = read_gene_presence_absence(input_pres_abs_file_path, args.core_cutoff, args.low_cutoff, source_program, args.input_gffs, tmp_folder_path.name, - gene_data_dict, corrected_dir, logger) + logger) time_end_read_files = time.time() @@ -287,11 +269,10 @@ def main(): # TODO - Make this work! 
if len(non_core_contig_info) > 0: + print("hello!") logger.debug("Non-core contig output") non_core_contig_writer(non_core_contig_info, args.output_path, args.output_prefix) - # time_calculator(time_start, time.time(), "writing output files") - # Finish up running total_time = round(time.time() - total_time_start, 1) initial_time = round(inital_check_time_end - inital_check_time_start, 1) diff --git a/Corekaburra/gff_parser.py b/Corekaburra/gff_parser.py index 88b3d95..d26ab39 100755 --- a/Corekaburra/gff_parser.py +++ b/Corekaburra/gff_parser.py @@ -8,6 +8,11 @@ def open_file_generator(input_file_path): + """ + Function to read a gff file as either gzip or text + :param input_file_path: String path to file + :return: Generator with line by line + """ # Open input file as if it was gzipped try: with gzip.open(input_file_path, 'rt') as open_file: @@ -285,6 +290,16 @@ def connect_first_n_last_gene_on_contig(core_genes, gff_name, previous_core_gene def record_coreless_contig(coreless_contigs, acc_genes_in_region, low_freq_genes_in_region, gff_name, contig_name): + """ + Function to record the presence and info of a contig without a core gene + :param coreless_contigs: List with info of other core less contigs + :param acc_genes_in_region: List of accessory genes on contig + :param low_freq_genes_in_region: List of low frequency genes on contig + :param gff_name: String name of input gff file. + :param contig_name: String name of the contg + + :return: Dict of information round coreless contigs as list. 
+ """ acc_genes_in_region = sorted(list(set(acc_genes_in_region))) low_freq_genes_in_region = sorted(list(set(low_freq_genes_in_region))) if len(acc_genes_in_region) + len(low_freq_genes_in_region) > 0: @@ -582,20 +597,13 @@ def segment_gff_content(gff_generator, core_genes, low_freq_genes, gff_path, acc def segment_genome_content(input_gff_file, core_genes, low_freq_genes, acc_gene_dict, complete_genomes): - """ # TODO - Update parameters + """ Single function segmenting the gff into core gene regions to be used for simple multi processing :param input_gff_file: File-path to the given gff file to be segmented :param core_genes: Dictionary over core genes :param low_freq_genes: Dictionary over low-frequency genes :param acc_gene_dict: Dictionary over accessory genes :param complete_genomes: Bool indicating if this genome should be considered as a complete genome - :param source_program: String indicating if program comes from Roary or Panaroo. - :param annotate: Bool to indicate if refound genes should be annotated # TODOD - remove! - :param gene_data_dict: Dict of genes, annotations, names, and sequences found in the gene_data.csv file from Panaroo - :param corrected_dir: File path to directory where corrected Gff files are to be stored. - :param tmp_folder_path: Path to the temporary working folder. - :param discard_corrected: Bool indicating if corrected Gff files should be preserved as an output - :param logger: Progran logger :return input_gff_file: File path to the gff being searched :return core_genes: Dict of core genes passed to genomes and the pan-genome clusters. 
diff --git a/Corekaburra/parse_gene_presence_absence.py b/Corekaburra/parse_gene_presence_absence.py index d06d808..88ada09 100755 --- a/Corekaburra/parse_gene_presence_absence.py +++ b/Corekaburra/parse_gene_presence_absence.py @@ -33,7 +33,7 @@ def add_gene_to_dict(main_dict, gene, pan_gene_name, genome): return main_dict -def check_fragmented_gene(fragment_info, input_gffs, tmp_folder_path, gene_data_dict, corrected_dir, logger): +def check_fragmented_gene(fragment_info, input_gffs, tmp_folder_path, logger): """ Function that check for that placement of fragmented gene parts, to determine if they are neighbouring or have some genomic feature between them :param fragment_info: List of genes that are found to be fragmented, one composite of fragments for each index @@ -67,7 +67,7 @@ def check_fragmented_gene(fragment_info, input_gffs, tmp_folder_path, gene_data_ f'A problem occurred when trying to find a file for reannotation, when passing the ' f'gene_presence_absence_roary.csv! GFF: {gff}, Gene: {gene}') - gff_name = annotate_refound_genes(gff_name, gene_data_dict, tmp_folder_path, corrected_dir, logger) + # gff_name = annotate_refound_genes(gff_name, tmp_folder_path, logger) # TODO - This was commented out fragment_info[i][1] = gff_name @@ -137,7 +137,7 @@ def check_fragmented_gene(fragment_info, input_gffs, tmp_folder_path, gene_data_ return fragments_close -def read_gene_presence_absence(pres_abs_file, core_gene_presence, low_freq_gene, source_program, input_gffs, tmp_folder_path, gene_data_dict, corrected_dir, logger): +def read_gene_presence_absence(pres_abs_file, core_gene_presence, low_freq_gene, source_program, input_gffs, tmp_folder_path, logger): """ Function that pass a Roary style gene presence/absence file. 
:param pres_abs_file: File path to the gene presence/absence file identified @@ -217,8 +217,7 @@ def read_gene_presence_absence(pres_abs_file, core_gene_presence, low_freq_gene, fragment_info = [[genes, gff] for genes, gff in zip(line[14:], gff_file_names[14:]) if ';' in genes] # Check that each annotation is neighboring the other annotation. - fragments_close = check_fragmented_gene(fragment_info, input_gffs, tmp_folder_path, gene_data_dict, - corrected_dir, logger) + fragments_close = check_fragmented_gene(fragment_info, input_gffs, tmp_folder_path, logger) # Check if gene was found to be a core gene if all(fragments_close): # Add the gene to the annotation dict diff --git a/unit_tests/Corekaburra_test.py b/unit_tests/Corekaburra_test.py index ce04a9d..643cad8 100755 --- a/unit_tests/Corekaburra_test.py +++ b/unit_tests/Corekaburra_test.py @@ -277,13 +277,11 @@ def test_fragmented_gene_true(self): 'TestParsingGenePresenceAbsenceFile/Silas_the_Legionella.gff', 'TestParsingGenePresenceAbsenceFile/Lilly_the_Shigella.gff'] tmp_folder_path = 'test_tmp_folder' - gene_data_file = {} - corrected_dir = '' expected_return = [True] return_bool = parse_gene_presence_absence.check_fragmented_gene(fragments_info, input_gffs, tmp_folder_path, - gene_data_file, corrected_dir, self.logger) + self.logger) self.assertEqual(expected_return, return_bool) @@ -301,13 +299,11 @@ def test_fragmented_gene_fasle(self): 'TestParsingGenePresenceAbsenceFile/Silas_the_Legionella.gff', 'TestParsingGenePresenceAbsenceFile/Lilly_the_Shigella.gff'] tmp_folder_path = 'test_tmp_folder' - gene_data_file = {} - corrected_dir = '' expected_return = [False] return_bool = parse_gene_presence_absence.check_fragmented_gene(fragments_info, input_gffs, tmp_folder_path, - gene_data_file, corrected_dir, self.logger) + self.logger) self.assertEqual(expected_return, return_bool) @@ -326,13 +322,11 @@ def test_fragmented_gene_mutiple_genes_fasle(self): 
'TestParsingGenePresenceAbsenceFile/Silas_the_Legionella.gff', 'TestParsingGenePresenceAbsenceFile/Lilly_the_Shigella.gff'] tmp_folder_path = 'test_tmp_folder' - gene_data_file = {} - corrected_dir = '' expected_return = [True, False] return_bool = parse_gene_presence_absence.check_fragmented_gene(fragment_info, input_gffs, tmp_folder_path, - gene_data_file, corrected_dir, self.logger) + self.logger) self.assertEqual(expected_return, return_bool) @@ -345,13 +339,11 @@ def test_fragments_on_separate_contigs(self): 'TestCheckingFragmentedGenes/Silas_the_Legionella.gff', 'TestCheckingFragmentedGenes/Lilly_the_Shigella.gff'] tmp_folder_path = 'test_tmp_folder' - gene_data_file = {} - corrected_dir = '' expected_return = [False, False] return_bool = parse_gene_presence_absence.check_fragmented_gene(fragments_info, input_gffs, tmp_folder_path, - gene_data_file, corrected_dir, self.logger) + self.logger) self.assertEqual(expected_return, return_bool) @@ -394,8 +386,6 @@ def test_parsing_w_100_presence(self): 'TestParsingGenePresenceAbsenceFile/Silas_the_Legionella.gff', 'TestParsingGenePresenceAbsenceFile/Lilly_the_Shigella.gff'] tmp_folder_path = 'TestParsingGenePresenceAbsenceFile/' - gene_data_file = {} - corrected_dir = '' expected_core_gene_dict = {'Silas_the_Salmonella': {'Silas_the_Salmonella_tag-1-1': "A", 'Silas_the_Salmonella_tag-1-2.1': "B", @@ -464,7 +454,7 @@ def test_parsing_w_100_presence(self): parse_gene_presence_absence.read_gene_presence_absence( file_name, core_gene_presence, low_freq_gene, source_program, - input_gffs, tmp_folder_path, gene_data_file, corrected_dir, self.logger) + input_gffs, tmp_folder_path, self.logger) self.assertEqual(expected_core_gene_dict, core_gene_dict) self.assertEqual(expected_low_freq_gene_dict, low_freq_gene_dict) @@ -486,8 +476,6 @@ def test_parsing_w_100_presence_roary(self): 'TestParsingGenePresenceAbsenceFile/Silas_the_Legionella.gff', 'TestParsingGenePresenceAbsenceFile/Lilly_the_Shigella.gff'] tmp_folder_path = 
'TestParsingGenePresenceAbsenceFile/' - gene_data_file = {} - corrected_dir = '' core_gene_dict, low_freq_gene_dict, \ @@ -495,7 +483,7 @@ def test_parsing_w_100_presence_roary(self): parse_gene_presence_absence.read_gene_presence_absence( file_name, core_gene_presence, low_freq_gene, source_program, - input_gffs, tmp_folder_path, gene_data_file, corrected_dir, self.logger) + input_gffs, tmp_folder_path, self.logger) expected_core_gene_dict = {'Silas_the_Salmonella': {'Silas_the_Salmonella_tag-1-1': "A", 'Silas_the_Salmonella_tag-1-2.1': "B", @@ -579,15 +567,13 @@ def test_parsing_w_90_presence(self): 'TestParsingGenePresenceAbsenceFile/Silas_the_Legionella.gff', 'TestParsingGenePresenceAbsenceFile/Lilly_the_Shigella.gff'] tmp_folder_path = 'TestParsingGenePresenceAbsenceFile/' - gene_data_file = {} - corrected_dir = '' core_gene_dict, low_freq_gene_dict, \ acc_gene_dict = \ parse_gene_presence_absence.read_gene_presence_absence( file_name, core_gene_presence, low_freq_gene, source_program, - input_gffs, tmp_folder_path, gene_data_file, corrected_dir, self.logger) + input_gffs, tmp_folder_path, self.logger) expected_core_gene_dict = {'Silas_the_Salmonella': {'Silas_the_Salmonella_tag-1-1': "A", 'Silas_the_Salmonella_tag-1-2.1': "B", @@ -671,15 +657,13 @@ def test_parsing_w_90_presence_roary(self): 'TestParsingGenePresenceAbsenceFile/Silas_the_Legionella.gff', 'TestParsingGenePresenceAbsenceFile/Lilly_the_Shigella.gff'] tmp_folder_path = 'TestParsingGenePresenceAbsenceFile/' - gene_data_file = {} - corrected_dir = '' core_gene_dict, low_freq_gene_dict, \ acc_gene_dict = \ parse_gene_presence_absence.read_gene_presence_absence( file_name, core_gene_presence, low_freq_gene, source_program, - input_gffs, tmp_folder_path, gene_data_file, corrected_dir, self.logger) + input_gffs, tmp_folder_path, self.logger) expected_core_gene_dict = {'Silas_the_Salmonella': {'Silas_the_Salmonella_tag-1-1': "A", 'Silas_the_Salmonella_tag-1-2.1': "B", @@ -747,100 +731,6 @@ def 
test_parsing_w_90_presence_roary(self): self.assertEqual(expected_low_freq_gene_dict, low_freq_gene_dict) self.assertEqual(expected_acc_gene_dict, acc_gene_dict) - def test_parsign_fragmented_gene_w_refound_component(self): - file_name = 'TestParsingGenePresenceAbsenceFile/gene_presence_absence_w_refound_fragment.csv' - core_gene_presence = 0.9 - low_freq_gene = 0.1 - source_program = 'Panaroo' - input_gffs = ['TestParsingGenePresenceAbsenceFile/Christina_the_Streptococcus.gff', - 'TestParsingGenePresenceAbsenceFile/Ajwa_the_Shigella.gff', - 'TestParsingGenePresenceAbsenceFile/Ajwa_the_Legionella.gff', - 'TestParsingGenePresenceAbsenceFile/Silas_the_Salmonella_w_refound.gff', - 'TestParsingGenePresenceAbsenceFile/Cari_the_Listeria.gff', - 'TestParsingGenePresenceAbsenceFile/Aman_the_Streptococcus.gff', - 'TestParsingGenePresenceAbsenceFile/Zion_the_Streptococcus.gff', - 'TestParsingGenePresenceAbsenceFile/Dina_the_Shigella.gff', - 'TestParsingGenePresenceAbsenceFile/Silas_the_Legionella.gff', - 'TestParsingGenePresenceAbsenceFile/Lilly_the_Shigella.gff'] - tmp_folder_path = 'TestParsingGenePresenceAbsenceFile/' - gene_data_file = {'Silas_the_Salmonella_w_refound': {'0_refound_0': ['CTCTTCCGATCTAATCAAGATTGAGAGGAATTGCTGTTTTTATTGGCAAGACAATTTTACTCTTCCGATCTAATCAAGATTGAGAGGAATT', 'gene_name', 'gene_function']}} - corrected_dir ='TestParsingGenePresenceAbsenceFile/Corrected_gffs' - - core_gene_dict, low_freq_gene_dict, \ - acc_gene_dict = \ - parse_gene_presence_absence.read_gene_presence_absence( - file_name, core_gene_presence, - low_freq_gene, source_program, - input_gffs, tmp_folder_path, gene_data_file, corrected_dir, self.logger) - - expected_core_gene_dict = {'Silas_the_Salmonella_w_refound': {'Silas_the_Salmonella_tag-1-1': "A", - '0_refound_0': "B", - 'Silas_the_Salmonella_tag-1-2.2': "B", - 'Silas_the_Salmonella_tag-1-3': 'C', - 'Silas_the_Salmonella_tag-1-4.1': 'D', - 'Silas_the_Salmonella_tag-1-4.2': 'D', }, - 'Christina_the_Streptococcus': 
{'Christina_the_Streptococcus_tag-2-1': "A", - 'Christina_the_Streptococcus_tag-2-2': "B", - 'Christina_the_Streptococcus_tag-2-3': "C", - 'Christina_the_Streptococcus_tag-2-4': "D"}, - 'Ajwa_the_Shigella': {'Ajwa_the_Shigella_tag-3-1': "A", - 'Ajwa_the_Shigella_tag-3-2': "B", - "Ajwa_the_Shigella_tag-3-3": "C", - "Ajwa_the_Shigella_tag-3-4": "D"}, - 'Ajwa_the_Legionella': {'Ajwa_the_Legionella_tag-4-1': "A", - 'Ajwa_the_Legionella_tag-4-2': "B", - 'Ajwa_the_Legionella_tag-4-3': "C", - 'Ajwa_the_Legionella_tag-4-4': "D"}, - 'Cari_the_Listeria': {"Cari_the_Listeria_tag-5-3": "C", - "Cari_the_Listeria_tag-5-4": "D", - 'Cari_the_Listeria_tag-5-1': "A", - 'Cari_the_Listeria_tag-5-2': "B"}, - 'Aman_the_Streptococcus': {'Aman_the_Streptococcus_tag-6-1': "A", - 'Aman_the_Streptococcus_tag-6-2': "B", - "Aman_the_Streptococcus_tag-6-3": "C", - "Aman_the_Streptococcus_tag-6-4": "D"}, - 'Zion_the_Streptococcus': {"Zion_the_Streptococcus_tag-7-3": "C", - "Zion_the_Streptococcus_tag-7-4": "D", - 'Zion_the_Streptococcus_tag-7-1': "A", - 'Zion_the_Streptococcus_tag-7-2': "B"}, - 'Dina_the_Shigella': {"Dina_the_Shigella_tag-8-3": "C", - "Dina_the_Shigella_tag-8-4": "D", - 'Dina_the_Shigella_tag-8-1': "A", - 'Dina_the_Shigella_tag-8-2': "B"}, - 'Silas_the_Legionella': {"Silas_the_Legionella_tag-9-3": "C", - "Silas_the_Legionella_tag-9-4": "D", - 'Silas_the_Legionella_tag-9-1': "A", - 'Silas_the_Legionella_tag-9-2': "B"}, - 'Lilly_the_Shigella': {'Lilly_the_Shigella_tag-10-1': "A", - 'Lilly_the_Shigella_tag-10-2': "B"}} - - expected_low_freq_gene_dict = {'Silas_the_Salmonella_w_refound': {'Silas_the_Salmonella_tag_2': "G"}, - 'Christina_the_Streptococcus': {}, - 'Ajwa_the_Shigella': {}, - 'Ajwa_the_Legionella': {}, - 'Cari_the_Listeria': {}, - 'Aman_the_Streptococcus': {}, - 'Zion_the_Streptococcus': {}, - 'Dina_the_Shigella': {}, - 'Silas_the_Legionella': {}, - 'Lilly_the_Shigella': {'Lilly_the_Shigella_tag-10-6': "F"}} - - expected_acc_gene_dict = 
{'Silas_the_Salmonella_w_refound': {'Silas_the_Salmonella_tag-1-5.1': 'E', - 'Silas_the_Salmonella_tag-1-5.2': 'E'}, - 'Christina_the_Streptococcus': {'Christina_the_Streptococcus_tag-2-5': "E"}, - 'Ajwa_the_Shigella': {"Ajwa_the_Shigella_tag-3-5": "E"}, - 'Ajwa_the_Legionella': {'Ajwa_the_Legionella_tag-4-5': "E"}, - 'Cari_the_Listeria': {"Cari_the_Listeria_tag-5-5": "E"}, - 'Aman_the_Streptococcus': {"Aman_the_Streptococcus_tag-6-5": "E"}, - 'Zion_the_Streptococcus': {"Zion_the_Streptococcus_tag-7-5": "E"}, - 'Dina_the_Shigella': {"Dina_the_Shigella_tag-8-5": "E"}, - 'Silas_the_Legionella': {"Silas_the_Legionella_tag-9-5": "E"}, - 'Lilly_the_Shigella': {'Lilly_the_Shigella_tag-10-5': "E"}} - - self.assertEqual(expected_core_gene_dict, core_gene_dict) - self.assertEqual(expected_low_freq_gene_dict, low_freq_gene_dict) - self.assertEqual(expected_acc_gene_dict, acc_gene_dict) - class TestExtractGenomeFasta(unittest.TestCase): def test_extract_genome_fasta(self): From 3291497470ee9544a78f9d000c49fbb712d53a0a Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Wed, 22 Jun 2022 11:17:44 +0200 Subject: [PATCH 131/135] Remove redundant code, imporove doc-strings --- Corekaburra/__main__.py | 14 ++--- Corekaburra/consesus_core_genome.py | 9 ++- Corekaburra/gff_parser.py | 8 +-- Corekaburra/parse_gene_presence_absence.py | 8 --- unit_tests/Corekaburra_test.py | 69 ++++++++-------------- 5 files changed, 39 insertions(+), 69 deletions(-) diff --git a/Corekaburra/__main__.py b/Corekaburra/__main__.py index 2c03bb6..e1f25fe 100755 --- a/Corekaburra/__main__.py +++ b/Corekaburra/__main__.py @@ -40,10 +40,10 @@ except ModuleNotFoundError: from parse_gene_presence_absence import read_gene_presence_absence -try: - from Corekaburra.correct_gffs import prepair_for_reannotation -except ModuleNotFoundError: - from correct_gffs import prepair_for_reannotation +# try: +# from Corekaburra.correct_gffs import prepair_for_reannotation +# except 
ModuleNotFoundError: +# from correct_gffs import prepair_for_reannotation try: from Corekaburra.gff_parser import segment_genome_content @@ -174,7 +174,6 @@ def main(): ## Read in gene presence absence file time_start_read_files = time.time() - # TODO - Some day it would be awesome to be able to provide a clustering/population structure which could divide genes into the 13 definitions outlined by Horesh et al. [DOI: 10.1099/mgen.0.000670] # TODO - Add in so that the user can give a list of genes that they wish to use as 'core genes' core_dict, low_freq_dict, acc_gene_dict = read_gene_presence_absence(input_pres_abs_file_path, args.core_cutoff, args.low_cutoff, source_program, @@ -197,7 +196,7 @@ def main(): progress_counter = 0 if len(args.input_gffs) > 10: - progress_update = len(args.input_gffs) / 10 + progress_update = int(len(args.input_gffs) / 10) else: progress_update = 1 @@ -209,6 +208,7 @@ def main(): for output in concurrent.futures.as_completed(results): progress_counter += 1 + print(progress_counter) if progress_counter % progress_update == 0 or progress_counter == 1: logger.info(f"GFF file #{progress_counter} has been processed") @@ -267,9 +267,7 @@ def main(): graph_name = f'{args.output_prefix}_core_gene_graph.gml' if args.output_prefix is not None else 'core_gene_graph.gml' write_gml(core_graph, path=os.path.join(args.output_path, graph_name)) - # TODO - Make this work! 
if len(non_core_contig_info) > 0: - print("hello!") logger.debug("Non-core contig output") non_core_contig_writer(non_core_contig_info, args.output_path, args.output_prefix) diff --git a/Corekaburra/consesus_core_genome.py b/Corekaburra/consesus_core_genome.py index c8d9ec7..b51bd4d 100755 --- a/Corekaburra/consesus_core_genome.py +++ b/Corekaburra/consesus_core_genome.py @@ -168,16 +168,16 @@ def search_for_path(core_graph_copy, source_node, target_node, multi_edge_nodes) return -def identify_segments(core_graph, num_gffs, core_gene_dict, num_core_graph_components, logger): +def identify_segments(core_graph, num_gffs, core_gene_dict, logger): """ Function to identify stretches of core genes between core genes neighbouring multiple different genes :param core_graph: Graph over core genes with weights being the number of connections between the genes :param num_gffs: Number of gffs inputted :param core_gene_dict: Dict with keys being genomes, each genome is a dict with keys being genes and values the mapped pan-genome gene cluster. + :param logger: Logger for the program :return: Dict over stretches of core genes found in the core gene graph. """ - # TODO - Describe missing parameters in docstring # Identify all nodes that contain more than two degrees and only one degree. 
multi_edge_nodes = [node for node, connections in core_graph.degree if connections > 2] @@ -312,8 +312,9 @@ def determine_genome_segments(core_neighbour_pairs, combined_acc_gene_count, num :param core_neighbour_pairs: Dict of the number of times core pairs have been detected :param combined_acc_gene_count: Number of accessory and low-frequency genes detected between core gene pairs :param num_gffs: Number of inputted gff files + :param core_gene_dict: A dictionary of core genes across genomes and their identifier + :param max_cpus: Int for the maximum number of cpus allowed to be used during graph component search :param logger: Program logger - # TODO - Add parameters :return double_edge_segements: :return no_acc_segments: @@ -361,5 +362,7 @@ def determine_genome_segments(core_neighbour_pairs, combined_acc_gene_count, num return double_edge_segements, no_acc_segments, core_graph + if __name__ == '__main__': + print('Nothing was computed. This is not a main program. Run __main__.py') pass diff --git a/Corekaburra/gff_parser.py b/Corekaburra/gff_parser.py index d26ab39..7a840bf 100755 --- a/Corekaburra/gff_parser.py +++ b/Corekaburra/gff_parser.py @@ -1,10 +1,10 @@ import os import gzip -try: - from Corekaburra.correct_gffs import annotate_refound_genes -except ModuleNotFoundError: - from correct_gffs import annotate_refound_genes +# try: +# from Corekaburra.correct_gffs import annotate_refound_genes +# except ModuleNotFoundError: +# from correct_gffs import annotate_refound_genes def open_file_generator(input_file_path): diff --git a/Corekaburra/parse_gene_presence_absence.py b/Corekaburra/parse_gene_presence_absence.py index 88ada09..8a89c2d 100755 --- a/Corekaburra/parse_gene_presence_absence.py +++ b/Corekaburra/parse_gene_presence_absence.py @@ -2,12 +2,6 @@ import csv from math import ceil, floor import gffutils -EXIT_GFF_REANNOTATION_ERROR = 3 - -try: - from Corekaburra.correct_gffs import annotate_refound_genes -except ModuleNotFoundError: - from 
correct_gffs import annotate_refound_genes try: from Corekaburra.exit_with_error import exit_with_error @@ -67,8 +61,6 @@ def check_fragmented_gene(fragment_info, input_gffs, tmp_folder_path, logger): f'A problem occurred when trying to find a file for reannotation, when passing the ' f'gene_presence_absence_roary.csv! GFF: {gff}, Gene: {gene}') - # gff_name = annotate_refound_genes(gff_name, tmp_folder_path, logger) # TODO - This was commented out - fragment_info[i][1] = gff_name fragments_close = [] diff --git a/unit_tests/Corekaburra_test.py b/unit_tests/Corekaburra_test.py index 643cad8..69666a4 100755 --- a/unit_tests/Corekaburra_test.py +++ b/unit_tests/Corekaburra_test.py @@ -732,17 +732,17 @@ def test_parsing_w_90_presence_roary(self): self.assertEqual(expected_acc_gene_dict, acc_gene_dict) -class TestExtractGenomeFasta(unittest.TestCase): - def test_extract_genome_fasta(self): - genome_fasta_dict_expected = {'contig} - largest_locus_tag_expected = 'fer_006' - header_lines_expected = ['##gff-version3\n', '#test-line\n'] - - genome_fasta_dict, largest_locus_tag, header_lines = correct_gffs.extract_genome_fasta('TestExtractGenomeFasta/Mock_gff.gff') - - self.assertEqual(genome_fasta_dict_expected, genome_fasta_dict) - self.assertEqual(largest_locus_tag_expected, largest_locus_tag) - self.assertEqual(header_lines_expected, header_lines) +# class TestExtractGenomeFasta(unittest.TestCase): +# def test_extract_genome_fasta(self): +# genome_fasta_dict_expected = {'contig} +# largest_locus_tag_expected = 'fer_006' +# header_lines_expected = ['##gff-version3\n', '#test-line\n'] +# +# genome_fasta_dict, largest_locus_tag, header_lines = correct_gffs.extract_genome_fasta('TestExtractGenomeFasta/Mock_gff.gff') +# +# self.assertEqual(genome_fasta_dict_expected, genome_fasta_dict) +# self.assertEqual(largest_locus_tag_expected, largest_locus_tag) +# self.assertEqual(header_lines_expected, header_lines) class TestParsingGffFile(unittest.TestCase): @@ -2965,9 +2965,8 @@ 
def test_double_edge_segment_identification_all_2_degree_input(self): 'pan_cluster_6--pan_cluster_1': 10} core_graph = consesus_core_genome.construct_core_graph(core_neighbour_pairs) - num_components = number_connected_components(core_graph) - return_1 = consesus_core_genome.identify_segments(core_graph, 10, {}, num_components, self.logger) + return_1 = consesus_core_genome.identify_segments(core_graph, 10, {}, self.logger) self.assertEqual(None, return_1) @@ -2995,9 +2994,8 @@ def test_double_edge_segment_identification_two_segments(self): 'genome_10': {'tag_1': 'pan_cluster_1', 'tag_2': 'pan_cluster_4', 'tag_3': 'pan_cluster_2', 'tag_4': 'pan_cluster_5'},} core_graph = consesus_core_genome.construct_core_graph(core_neighbour_pairs) - num_components = number_connected_components(core_graph) - double_edge_segements = consesus_core_genome.identify_segments(core_graph, 10, core_gene_dict, num_components, self.logger) + double_edge_segements = consesus_core_genome.identify_segments(core_graph, 10, core_gene_dict, self.logger) self.assertEqual(expected_segments, double_edge_segements) @@ -3021,9 +3019,8 @@ def test_double_edge_segment_identification_four_segments(self): 'pan_cluster_1--pan_cluster_10': 10} core_graph = consesus_core_genome.construct_core_graph(core_neighbour_pairs) - num_components = number_connected_components(core_graph) - double_edge_segements = consesus_core_genome.identify_segments(core_graph, 10, {}, num_components, self.logger) + double_edge_segements = consesus_core_genome.identify_segments(core_graph, 10, {}, self.logger) self.assertEqual(expected_segments, double_edge_segements) @@ -3049,9 +3046,8 @@ def test_double_edge_segment_identification_segments_node_w_four_degrees(self): 'pan_cluster_6--pan_cluster_1': 9} core_graph = consesus_core_genome.construct_core_graph(core_neighbour_pairs) - num_components = number_connected_components(core_graph) - double_edge_segements = consesus_core_genome.identify_segments(core_graph, 10, {}, 
num_components, self.logger) + double_edge_segements = consesus_core_genome.identify_segments(core_graph, 10, {}, self.logger) self.assertEqual(expected_segments, double_edge_segements) @@ -3076,10 +3072,8 @@ def test_segments_w_segment_between_multi_connect_genes(self): 'tag_2': 'pan_cluster_B', 'tag_1': 'pan_cluster_A'}} core_graph = consesus_core_genome.construct_core_graph(core_neighbour_pairs) - num_components = number_connected_components(core_graph) - double_edge_segements = consesus_core_genome.identify_segments(core_graph, 8, core_gene_dict, num_components, - self.logger) + double_edge_segements = consesus_core_genome.identify_segments(core_graph, 8, core_gene_dict, self.logger) self.assertEqual(expected_segments, double_edge_segements) @@ -3109,10 +3103,8 @@ def test_segments_w_large_segment_between_multi_connect_genes(self): 'tag_2': 'pan_cluster_B', 'tag_1': 'pan_cluster_A'}} core_graph = consesus_core_genome.construct_core_graph(core_neighbour_pairs) - num_components = number_connected_components(core_graph) - double_edge_segements = consesus_core_genome.identify_segments(core_graph, 8, core_gene_dict, num_components, - self.logger) + double_edge_segements = consesus_core_genome.identify_segments(core_graph, 8, core_gene_dict, self.logger) self.assertEqual(expected_segments, double_edge_segements) @@ -3137,9 +3129,8 @@ def test_double_edge_segment_identification_segments_node_w_challenging_paths(se 'genome_5': {'tag_4': 'pan_cluster_D', 'tag_3': 'pan_cluster_C', 'tag_2': 'pan_cluster_B', 'tag_1': 'pan_cluster_A', }} core_graph = consesus_core_genome.construct_core_graph(core_neighbour_pairs) - num_components = number_connected_components(core_graph) - double_edge_segements = consesus_core_genome.identify_segments(core_graph, 5, core_gene_dict, num_components, self.logger) + double_edge_segements = consesus_core_genome.identify_segments(core_graph, 5, core_gene_dict, self.logger) self.assertEqual(expected_segments, double_edge_segements) @@ -3168,9 
+3159,8 @@ def test_double_edge_segment_identification_segments_node_w_challenging_paths_2( 'genome_8': {'tag_5': 'pan_cluster_E', 'tag_4': 'pan_cluster_D', 'tag_3': 'pan_cluster_C', 'tag_2': 'pan_cluster_B', 'tag_1': 'pan_cluster_A', }} core_graph = consesus_core_genome.construct_core_graph(core_neighbour_pairs) - num_components = number_connected_components(core_graph) - double_edge_segements = consesus_core_genome.identify_segments(core_graph, 8, core_gene_dict, num_components, self.logger) + double_edge_segements = consesus_core_genome.identify_segments(core_graph, 8, core_gene_dict, self.logger) self.assertEqual(expected_segments, double_edge_segements) @@ -3194,9 +3184,8 @@ def test_double_edge_segment_identification_segments_node_w_less_than_all_presen } core_graph = consesus_core_genome.construct_core_graph(core_neighbour_pairs) - num_components = number_connected_components(core_graph) - double_edge_segements = consesus_core_genome.identify_segments(core_graph, 10, {}, num_components, self.logger) + double_edge_segements = consesus_core_genome.identify_segments(core_graph, 10, {}, self.logger) self.assertEqual(expected_segments, double_edge_segements) @@ -3223,9 +3212,8 @@ def test_double_edge_segment_identification_segments_node_w_two_gene_segment(sel 'genome_3': {'gene_1': 'pan_cluster_A', 'gene_2': 'pan_cluster_B', 'gene_3': 'pan_cluster_E', 'gene_4': 'pan_cluster_G', 'gene_5': 'pan_cluster_D', 'gene_7': 'pan_cluster_H'}} core_graph = consesus_core_genome.construct_core_graph(core_neighbour_pairs) - num_components = number_connected_components(core_graph) - double_edge_segements = consesus_core_genome.identify_segments(core_graph, 3, core_gene_dict, num_components, self.logger) + double_edge_segements = consesus_core_genome.identify_segments(core_graph, 3, core_gene_dict, self.logger) self.assertEqual(expected_segments, double_edge_segements) @@ -3271,23 +3259,12 @@ def test_multiple_component_core_graph(self): 'tag_16': 'pan_cluster_Q'}} core_graph = 
consesus_core_genome.construct_core_graph(core_neighbour_pairs) - num_components = number_connected_components(core_graph) - double_edge_segements = {} for component in connected_components(core_graph): component_graph = core_graph.subgraph(component).copy() double_edge_segements = double_edge_segements | consesus_core_genome.identify_segments(component_graph, 2, - core_gene_dict, - num_components, self.logger) - - # comparisons = [True for x in double_edge_segements - # if - # (x in expected_segments and - # (expected_segments[x] == double_edge_segements[x] or expected_segments[x][::-1] == double_edge_segements[x])) - # or - # (f"{x.split('--')[1]}'--'{x.split('--')[0]}" in expected_segments and - # (expected_segments[x] == double_edge_segements[f"{x.split('--')[1]}'--'{x.split('--')[0]}"] or expected_segments[x][::-1] == double_edge_segements[f"{x.split('--')[1]}'--'{x.split('--')[0]}"])) - # ] + core_gene_dict, self.logger) + key_forward = [x for x in double_edge_segements if x in expected_segments] key_reverse = [f"{x.split('--')[1]}--{x.split('--')[0]}" for x in double_edge_segements if f"{x.split('--')[1]}--{x.split('--')[0]}" in expected_segments] expected_key_match = key_forward+key_reverse From 65d563158af8c579dddbe07d785ab15ab3cb1e71 Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Wed, 22 Jun 2022 11:22:15 +0200 Subject: [PATCH 132/135] Remove redundant code --- Corekaburra/parse_gene_presence_absence.py | 52 +++++++++++----------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/Corekaburra/parse_gene_presence_absence.py b/Corekaburra/parse_gene_presence_absence.py index 8a89c2d..db0715f 100755 --- a/Corekaburra/parse_gene_presence_absence.py +++ b/Corekaburra/parse_gene_presence_absence.py @@ -36,32 +36,32 @@ def check_fragmented_gene(fragment_info, input_gffs, tmp_folder_path, logger): :return: A List of booleans indicating if a fragments has nothing in between fragments (True) or not (False) """ # 
Check if any refound genes are in fragments to be checked, if then reannotate the genes before checking: - refound_genes = [[i, gene_gff] for i, gene_gff in enumerate(fragment_info) if 'refound' in gene_gff[0]] - if refound_genes: - for i, gene_gff in refound_genes: - gene, gff = gene_gff - gff_name = None - - try: - gff_name = [gff_name for gff_name in input_gffs - if f"{gff}_corrected" in [os.path.basename(gff_name), - os.path.basename(gff_name).rsplit('.', 1)[0], - os.path.basename(gff_name).rsplit('.', 1)[0].rsplit('.', 1)[0]]][0] - except IndexError: - pass - - if gff_name is None: - try: - gff_name = [gff_name for gff_name in input_gffs - if gff in [os.path.basename(gff_name), - os.path.basename(gff_name).rsplit('.', 1)[0], - os.path.basename(gff_name).rsplit('.', 1)[0].rsplit('.', 1)[0]]][0] - except IndexError: - exit_with_error(EXIT_GFF_REANNOTATION_ERROR, - f'A problem occurred when trying to find a file for reannotation, when passing the ' - f'gene_presence_absence_roary.csv! GFF: {gff}, Gene: {gene}') - - fragment_info[i][1] = gff_name + # refound_genes = [[i, gene_gff] for i, gene_gff in enumerate(fragment_info) if 'refound' in gene_gff[0]] + # if refound_genes: + # for i, gene_gff in refound_genes: + # gene, gff = gene_gff + # gff_name = None + # + # try: + # gff_name = [gff_name for gff_name in input_gffs + # if f"{gff}_corrected" in [os.path.basename(gff_name), + # os.path.basename(gff_name).rsplit('.', 1)[0], + # os.path.basename(gff_name).rsplit('.', 1)[0].rsplit('.', 1)[0]]][0] + # except IndexError: + # pass + # + # if gff_name is None: + # try: + # gff_name = [gff_name for gff_name in input_gffs + # if gff in [os.path.basename(gff_name), + # os.path.basename(gff_name).rsplit('.', 1)[0], + # os.path.basename(gff_name).rsplit('.', 1)[0].rsplit('.', 1)[0]]][0] + # except IndexError: + # exit_with_error(1, + # f'A problem occurred when trying to find a file for reannotation, when passing the ' + # f'gene_presence_absence_roary.csv! 
GFF: {gff}, Gene: {gene}') + # + # fragment_info[i][1] = gff_name fragments_close = [] for fragment in fragment_info: From 32660c42cdc1df8fc7f0329e59db21fde053d989 Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Wed, 22 Jun 2022 21:00:48 +0200 Subject: [PATCH 133/135] Remove redundant code and tidy up here and there --- Corekaburra/__main__.py | 6 - Corekaburra/consesus_core_genome.py | 2 +- Corekaburra/correct_gffs.py | 326 ------------------ Corekaburra/gff_parser.py | 11 +- Corekaburra/parse_gene_presence_absence.py | 27 -- unit_tests/Corekaburra_test.py | 17 +- .../TestExitWithError/tmp_folder/test_file | 0 7 files changed, 5 insertions(+), 384 deletions(-) delete mode 100755 Corekaburra/correct_gffs.py mode change 100644 => 100755 unit_tests/unit_test_data/TestExitWithError/tmp_folder/test_file diff --git a/Corekaburra/__main__.py b/Corekaburra/__main__.py index e1f25fe..4601a5f 100755 --- a/Corekaburra/__main__.py +++ b/Corekaburra/__main__.py @@ -40,11 +40,6 @@ except ModuleNotFoundError: from parse_gene_presence_absence import read_gene_presence_absence -# try: -# from Corekaburra.correct_gffs import prepair_for_reannotation -# except ModuleNotFoundError: -# from correct_gffs import prepair_for_reannotation - try: from Corekaburra.gff_parser import segment_genome_content except ModuleNotFoundError: @@ -208,7 +203,6 @@ def main(): for output in concurrent.futures.as_completed(results): progress_counter += 1 - print(progress_counter) if progress_counter % progress_update == 0 or progress_counter == 1: logger.info(f"GFF file #{progress_counter} has been processed") diff --git a/Corekaburra/consesus_core_genome.py b/Corekaburra/consesus_core_genome.py index b51bd4d..e4d6797 100755 --- a/Corekaburra/consesus_core_genome.py +++ b/Corekaburra/consesus_core_genome.py @@ -336,7 +336,7 @@ def determine_genome_segments(core_neighbour_pairs, combined_acc_gene_count, num with 
concurrent.futures.ProcessPoolExecutor(max_workers=max_cpus) as executor: return_object = [executor.submit(identify_segments, core_graph.subgraph(component).copy(), num_gffs, - core_gene_dict, num_core_graph_components, logger) + core_gene_dict, logger) for component in nx.connected_components(core_graph)] # identify_segments(core_graph.subgraph(component).copy(), num_gffs, core_gene_dict, num_core_graph_components, logger) for output in concurrent.futures.as_completed(return_object): diff --git a/Corekaburra/correct_gffs.py b/Corekaburra/correct_gffs.py deleted file mode 100755 index f0d809f..0000000 --- a/Corekaburra/correct_gffs.py +++ /dev/null @@ -1,326 +0,0 @@ -from Bio import Seq -from Bio.SeqRecord import SeqRecord -import gffutils -from gffutils.gffwriter import GFFWriter -import os -import concurrent.futures -# from Bio.Blast import -from time import time - -try: - from Corekaburra.exit_with_error import exit_with_error -except ModuleNotFoundError: - from exit_with_error import exit_with_error -EXIT_GFF_REANNOTATION_ERROR = 3 - - -def read_gene_data(gene_data_file): - """ - Function to read the gene_data.csv file outputted by Panaroo and - :param gene_data_file: File path to the gene_data.csv file - :return: A dict of genomes with their refound genes - """ - - # Construct dictionary to hold refound genes and sequences for these - gene_data_dict = {} - - # Read the gene_data.csv file and record all refound genes - with open(gene_data_file, 'r') as gene_data: - - for line in gene_data.readlines(): - # Split read line at commas - line = line.split(',') - # TODO - Scaffold (contig) name can be found in second position of a gene_data.csv line. This could possibly be used to speed things up so that the entire set of contigs isn't required for search. - - # Check if refound gene - if 'refound' in line[2]: - # Try to add the refound gene to the gene_data dict as a second key, value being the DNA sequence, name, and function in that order. 
- # if the first key (genome) is not found in gene_data dict, - # then construct dict for the genome and add the gene - try: - gene_data_dict[line[0]][line[2]] = [line[5], line[6], line[7].strip()] - except KeyError: - gene_data_dict[line[0]] = {line[2]: [line[5], line[6], line[7].strip()]} - - return gene_data_dict - - -def prepair_for_reannotation(gene_data_path, output_folder, gffs, logger): - """ - Function for creating an output folder for corrected genomes, check if any are present, and if then which. - :param gene_data_path: Path to the gene_data.csv file from Panaroo - :param output_folder: Folder designated as the output folder for Corekaburra - :param gffs: List of file-paths to gff files. - :param logger: Program logger - - :return gene_data_dict: Dict containing the information expected from the gene_data.csv file - :return corrected_gff_out_dir: File path to the created or identified directory of corrected gff files - :return gffs: List of gff files, some may be altered to be the corrected verison from prior runs/ - """ - - logger.debug('Initialise structures for reannotating genes found by Panaroo') - - # Read Gene_data.csv file into dict with a dict of refound genes for each genome - gene_data_dict = read_gene_data(gene_data_path) - - # Construct directory to hold corrected gff files: - corrected_gff_out_dir = os.path.join(output_folder, 'Corrected_gff_files') - # Try and construct folder, - # if present check if content matches input to avoid process - try: - os.mkdir(corrected_gff_out_dir) - except FileExistsError: - # Get path for input - input_path_dict = {os.path.basename(gff): os.path.split(gff)[0] for gff in gffs} - # input_path = os.path.split(gffs[0])[0] - - corrected_folder_content = os.listdir(corrected_gff_out_dir) - - gff_names = [os.path.basename(gff) for gff in gffs] - - corrected_files = [file for file in corrected_folder_content if - f'{file.split("_corrected")[0]}.gff' in gff_names] - - corrected_files_w_path = 
[os.path.join(corrected_gff_out_dir, file) for file in corrected_files] - - if len(corrected_files) > 0: - gffs = [file for file in gff_names if f'{file.replace(".gff", "")}_corrected.gff' not in corrected_files] - gffs = [os.path.join(input_path_dict[gff], gff) for gff in gffs] - gffs = gffs + corrected_files_w_path - - return gene_data_dict, corrected_gff_out_dir, gffs - - -def extract_genome_fasta(gff_name): - """ - Function to read and extract information from a gff3 file - :param gff_name: File path to a gff file - - :return genome_fasta_dict: Dict over contig names as keys, and values being the contig sequence - :return largest_locus_tag: the largest locus_tag identified in gff file - :return header_lines: the header lines proceeding annotations. - """ - - # Initialise the two return variables a dict for refound gene's annotaton and the largest locus_tag - genome_fasta_dict = {} - largest_locus_tag = '' - header_lines = [] - - # Open the gff file and indicate that the FASTA sequence has not beed reached - with open(gff_name, 'r') as gff_file: - found_fasta = False - - # Go through gff file and find and read the fasta seuqence at the end after the ##FASTA mark - for line in gff_file.readlines(): - if found_fasta: - # Check if line is fasta header, - # If then construct new dict key if not append sequence to current key - if '>' in line: - line = line.split(' ')[0] - try: - current_contig = line.strip() - current_contig = current_contig.split(">")[1] - genome_fasta_dict[current_contig] = '' - except KeyError: - raise KeyError("Some contig names contain redundant names when seperated by white space") - else: - line = line.split('\n')[0] - genome_fasta_dict[current_contig] = genome_fasta_dict[current_contig] + line - - # Check if FASTA part of gff file has been found, - # if then indicate to start recording sequences, - # else compare locus_tag of the line - elif '##FASTA' in line: - found_fasta = True - - else: - # Compare the locus_tag with the previously 
largest locus_tag, - # save the largest of the two - if '#' not in line: - line = line.split('\t') - line = line[8].split(';') - line = [element for element in line if 'locus_tag' in element] - # Examine non empty locus_tags - if len(line) > 0: - line = line[0] - line = line.split('locus_tag=')[1] - line = line.strip() - if line > largest_locus_tag: - largest_locus_tag = line - # Save header lines - else: - header_lines.append(line) - - return genome_fasta_dict, largest_locus_tag, header_lines - - -def add_gene_to_gff(tmp_gff, gene_oi, genome_oi, contig, strand, refound_gene_tag, annotation, largest_locus_tag): - """ - Function to construct and append a line to a file. - :param tmp_gff: An open file to append a line to - :param gene_oi: Gene in question - :param genome_oi: Genome in question - :param contig: Contig of the genome - :param strand: Strand of the gene in question - :param annotation: Any annotation found in Panaroo - :param refound_gene_tag: Tag given by Panaroo - :param largest_locus_tag: The current largest locus_tag - :return: The new largest locus_tag - """ - gene_start = genome_oi.find(gene_oi) + 1 - gene_end = gene_start + len(gene_oi) - 1 - locus_tag_parts = largest_locus_tag.rsplit('_', maxsplit=1) - tag_length = len(locus_tag_parts[1]) - - locus_tag_parts[1] = int(locus_tag_parts[1]) + 1 - preceding_zeros = str(0) * (tag_length - len(str(locus_tag_parts[1]))) - # Add preceding zeros - locus_tag_parts[1] = f'{preceding_zeros}{locus_tag_parts[1]}' - new_locus_tag = f'{locus_tag_parts[0]}_{locus_tag_parts[1]}' - description = annotation[1] - name = annotation[0] - - # Construct gff field 9 - info_field = f'ID={new_locus_tag};locus_tag={new_locus_tag};old_locus_tag={refound_gene_tag}' - - if name != '': - info_field += f';name={name}' - if description != '': - info_field += f';annotation={description}' - - # construct tab delimited string containing the features of the gene - gff_line = f'{contig}\t' \ - f'Panaroo\t' \ - f'CDS\t' \ - 
f'{gene_start}\t' \ - f'{gene_end}\t' \ - f'.\t' \ - f'{strand}\t' \ - f'0\t' \ - f'{info_field}' - - tmp_gff.write(gff_line + '\n') - - return new_locus_tag - - -def write_contig(file, contig_name, sequence): - """ - Write contig into file - :param file: Open file for appending contig to - :param contig_name: Name of the contig to be added - :param sequence: Sequence of contig to be added - :return: Nothing - """ - # Write contig name - file.write(f'>{contig_name}\n') - - # Write bulk of sequence - for i in range(len(sequence) // 60): - file.write(sequence[0+60*i:60+60*i] + '\n') - - # Write remainder of sequence - remainder = len(sequence) % 60 - genome_length = len(sequence) - file.write(sequence[len(sequence) - remainder:genome_length+1] + '\n') - - -def annotate_refound_genes(gff_name, gene_data_dict, tmp_folder_path, corrected_gff_out_dir, logger): - """ - Function to add back in genes that are refound by Panaroo into gff files. - :param gff_name: File path of gff to be corrected - :param gene_data_dict: Dict of refound genes identified from gene_presence_absence.csv file - :param tmp_folder_path: File path to the temporary folder - :param corrected_gff_out_dir: File path to the folder where corrected genomes should be place - :param logger: Program logger - :return: Nothing. - """ - """ Function to annotate the genes refound by Panaroo in a gff3 file""" - # Read in a gff file - # Get base name of gff file and construct path to database in temporary folder - gff_file_name = os.path.basename(gff_name) - data_base = os.path.join(tmp_folder_path, f'{gff_file_name}_db') - - # Create a database for the gff file - gffutils.create_db(gff_name, data_base) - # Attach database - gff_db = gffutils.FeatureDB(data_base) - - # Write quick tmp database for appending new genes. 
- tmp_gff = os.path.join(tmp_folder_path, f'{gff_file_name.split(".gff")[0]}_tmp.gff') - with open(tmp_gff, 'w') as gff_file: - for feature in gff_db.all_features(): - gff_file.writelines(str(feature) + '\n') - - # Pass the gff file manually to extract the genome fasta sequence(s) and the largest locus_tag - fasta_genome, largest_locus_tag, header_lines = extract_genome_fasta(gff_name) - - # Find all refound genes for given genome in gene data file - genome_name = gff_file_name.split('.')[0] - - # Search for the refound genes and record their coordinate, strand and add them to the gff file - with open(tmp_gff, 'a') as tmp_gff_file: - for refound_gene in gene_data_dict[genome_name]: # .keys() - gene_oi = gene_data_dict[genome_name][refound_gene][0] - - strand = None - contig_counter = 0 - contigs = list(fasta_genome) # .keys() - while strand is None and contig_counter < len(contigs) : - contig = contigs[contig_counter] - genome_oi = fasta_genome[contig] - - if gene_oi in genome_oi: - strand = '+' - - else: - # get reverse complement of the gene - gene_oi = Seq.reverse_complement(gene_oi) - if gene_oi in genome_oi: - strand = '-' - - contig_counter += 1 - - if strand is not None: - # Add the gene to the gff file. 
- largest_locus_tag = add_gene_to_gff(tmp_gff_file, gene_oi, genome_oi, contig, strand, - refound_gene, gene_data_dict[genome_name][refound_gene][1:], largest_locus_tag) - else: - exit_with_error(f"When correcting gff {gff_name}, the gene: {refound_gene} " - f"did not have any hit in the genome!", EXIT_GFF_REANNOTATION_ERROR, logger) - - # Construct a database from the temporary gff that contain the added annotations - path_tmp_gff_db = os.path.join(tmp_folder_path, f'{gff_file_name}_tmp_db') - # make database - gffutils.create_db(tmp_gff, path_tmp_gff_db) - # Attach database - tmp_gff_db = gffutils.FeatureDB(path_tmp_gff_db) - - # Print the final GFF3 file - corrected_gff_file = os.path.join(corrected_gff_out_dir, f'{gff_file_name.split(".gff")[0]}_corrected.gff') - with open(corrected_gff_file, 'w') as gff_file: - # Write initial lines - for line in header_lines: - gff_file.write(line) - - # ADD the gff lines - for feature in tmp_gff_db.all_features(order_by=('seqid', 'start')): - gff_file.writelines(str(feature) + '\n') - - # Write line to separate genome fasta - gff_file.write("##FASTA\n") - - # Write the genome fasta - for contig_name in fasta_genome.keys(): - write_contig(gff_file, contig_name, fasta_genome[contig_name]) - - # remove database for gff file and the temporary gff file in temporary folder - os.remove(data_base) - os.remove(tmp_gff) - os.remove(path_tmp_gff_db) - - return corrected_gff_file - - -if __name__ == '__main__': - pass diff --git a/Corekaburra/gff_parser.py b/Corekaburra/gff_parser.py index 7a840bf..31b2f77 100755 --- a/Corekaburra/gff_parser.py +++ b/Corekaburra/gff_parser.py @@ -1,11 +1,6 @@ import os import gzip -# try: -# from Corekaburra.correct_gffs import annotate_refound_genes -# except ModuleNotFoundError: -# from correct_gffs import annotate_refound_genes - def open_file_generator(input_file_path): """ @@ -414,10 +409,9 @@ def segment_gff_content(gff_generator, core_genes, low_freq_genes, gff_path, acc if line[8] in 
low_freq_genes[gff_name]: low_freq_genes_in_region.append(low_freq_genes[gff_name][line[8]]) else: - # acc_genes_in_region.append(acc_genes[gff_name][line[8]]) try: acc_genes_in_region.append(acc_genes[gff_name][line[8]]) - except KeyError: # TODO - WHAT DOES THIS DO? - Likely search for fragment within composite, as fragments were previously storred in their composit strings. + except KeyError: gene_key = [key for key in acc_genes[gff_name].keys() if line[8] in key] if len(gene_key) > 1: acc_genes_in_region.append(acc_genes[gff_name][gene_key][0]) @@ -590,7 +584,8 @@ def segment_gff_content(gff_generator, core_genes, low_freq_genes, gff_path, acc master_info) else: # Add a core-less contig if there has been accessory genes: - coreless_contigs = record_coreless_contig(coreless_contigs, acc_genes_in_region, low_freq_genes_in_region, gff_name, line[0]) + coreless_contigs = record_coreless_contig(coreless_contigs, acc_genes_in_region, + low_freq_genes_in_region, gff_name, line[0]) return core_gene_pairs, core_gene_pair_distance, accessory_gene_content, \ low_freq_gene_content, master_info, coreless_contigs diff --git a/Corekaburra/parse_gene_presence_absence.py b/Corekaburra/parse_gene_presence_absence.py index db0715f..06fbc33 100755 --- a/Corekaburra/parse_gene_presence_absence.py +++ b/Corekaburra/parse_gene_presence_absence.py @@ -35,33 +35,6 @@ def check_fragmented_gene(fragment_info, input_gffs, tmp_folder_path, logger): :param tmp_folder_path: A file-path to the temporary folder of the Corekaburra run :return: A List of booleans indicating if a fragments has nothing in between fragments (True) or not (False) """ - # Check if any refound genes are in fragments to be checked, if then reannotate the genes before checking: - # refound_genes = [[i, gene_gff] for i, gene_gff in enumerate(fragment_info) if 'refound' in gene_gff[0]] - # if refound_genes: - # for i, gene_gff in refound_genes: - # gene, gff = gene_gff - # gff_name = None - # - # try: - # gff_name = 
[gff_name for gff_name in input_gffs - # if f"{gff}_corrected" in [os.path.basename(gff_name), - # os.path.basename(gff_name).rsplit('.', 1)[0], - # os.path.basename(gff_name).rsplit('.', 1)[0].rsplit('.', 1)[0]]][0] - # except IndexError: - # pass - # - # if gff_name is None: - # try: - # gff_name = [gff_name for gff_name in input_gffs - # if gff in [os.path.basename(gff_name), - # os.path.basename(gff_name).rsplit('.', 1)[0], - # os.path.basename(gff_name).rsplit('.', 1)[0].rsplit('.', 1)[0]]][0] - # except IndexError: - # exit_with_error(1, - # f'A problem occurred when trying to find a file for reannotation, when passing the ' - # f'gene_presence_absence_roary.csv! GFF: {gff}, Gene: {gene}') - # - # fragment_info[i][1] = gff_name fragments_close = [] for fragment in fragment_info: diff --git a/unit_tests/Corekaburra_test.py b/unit_tests/Corekaburra_test.py index 69666a4..3468d5f 100755 --- a/unit_tests/Corekaburra_test.py +++ b/unit_tests/Corekaburra_test.py @@ -7,13 +7,12 @@ # import import unittest import os -from shutil import copyfile import logging from networkx import number_connected_components, connected_components + # pylint: disable=no-name-in-module # import Corekaburra functions -from Corekaburra import exit_with_error from Corekaburra import read_complete_genome_file from Corekaburra import check_inputs from Corekaburra import parse_gene_presence_absence @@ -22,7 +21,6 @@ from Corekaburra import consesus_core_genome from Corekaburra import summary_table from Corekaburra import output_writer_functions -from Corekaburra import correct_gffs # move to folder with mock files. 
First try Github structure, then try pulled repository structure try: @@ -732,19 +730,6 @@ def test_parsing_w_90_presence_roary(self): self.assertEqual(expected_acc_gene_dict, acc_gene_dict) -# class TestExtractGenomeFasta(unittest.TestCase): -# def test_extract_genome_fasta(self): -# genome_fasta_dict_expected = {'contig} -# largest_locus_tag_expected = 'fer_006' -# header_lines_expected = ['##gff-version3\n', '#test-line\n'] -# -# genome_fasta_dict, largest_locus_tag, header_lines = correct_gffs.extract_genome_fasta('TestExtractGenomeFasta/Mock_gff.gff') -# -# self.assertEqual(genome_fasta_dict_expected, genome_fasta_dict) -# self.assertEqual(largest_locus_tag_expected, largest_locus_tag) -# self.assertEqual(header_lines_expected, header_lines) - - class TestParsingGffFile(unittest.TestCase): """ Test of the function that is used to pass a gff file and return a generator object of CDS lines """ def test_gff_generator_generation_not_corrected(self): diff --git a/unit_tests/unit_test_data/TestExitWithError/tmp_folder/test_file b/unit_tests/unit_test_data/TestExitWithError/tmp_folder/test_file old mode 100644 new mode 100755 From da9d14a8dc078b56dd3689012735a8ccfa0dd777 Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Wed, 22 Jun 2022 21:27:54 +0200 Subject: [PATCH 134/135] Bump version and update short description --- setup.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index 1371e4f..e9e41e3 100755 --- a/setup.py +++ b/setup.py @@ -12,7 +12,7 @@ setup( name='Corekaburra', - version='0.0.2', + version='0.0.3', author='Magnus Ganer Jespersen', author_email='magnus.ganer.j@gmail.com', packages=['Corekaburra'], @@ -22,10 +22,10 @@ }, url='https://github.com/milnus/Corekaburra', license='LICENSE', - description=('A prototypical bioinformatics command line tool'), + description=('A commandline bioinformatics tool to utilize syntenic information from genomes in the context of 
pan-genomes'), long_description=(LONG_DESCRIPTION), install_requires=["biopython", "networkx", "gffutils", "numpy"], - keywords=['Genomic', 'pan-genome', 'bacteria', 'prokaryotes', 'bioinformatics'], + keywords=['Genomics', 'pan-genome', 'bacteria', 'prokaryotes', 'bioinformatics'], classifiers=[ 'Programming Language :: Python :: 3.9', 'License :: OSI Approved :: MIT License', From f5014cc53d42b3774fe65943e7ea9a5762d5a587 Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Wed, 22 Jun 2022 21:30:58 +0200 Subject: [PATCH 135/135] Add in setup.py --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index e9e41e3..56d7b06 100755 --- a/setup.py +++ b/setup.py @@ -24,7 +24,7 @@ license='LICENSE', description=('A commandline bioinformatics tool to utilize syntenic information from genomes in the context of pan-genomes'), long_description=(LONG_DESCRIPTION), - install_requires=["biopython", "networkx", "gffutils", "numpy"], + install_requires=["biopython==1.79", "networkx", "gffutils", "numpy"], keywords=['Genomics', 'pan-genome', 'bacteria', 'prokaryotes', 'bioinformatics'], classifiers=[ 'Programming Language :: Python :: 3.9',