From 2cc55682d7593108f9d2fab4fa1ae2fc6947b012 Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Tue, 25 Jan 2022 11:27:40 +1100 Subject: [PATCH 01/20] Add in handling of refound genes in fragmented genes. Improve README with a conda environment installation guide. Add version print to commandline --- Corekaburra/__main__.py | 20 +-- Corekaburra/commandline_interface.py | 14 +- Corekaburra/correct_gffs.py | 8 +- Corekaburra/parse_gene_presence_absence.py | 62 ++++++-- README.md | 20 +-- functional_tests/test_data/no_input.expected | 12 +- unit_tests/Corekaburra_test.py | 149 ++++++++++++++++-- .../Corrected_gffs/place_holder | 0 .../Silas_the_Salmonella_w_refound.gff | 11 ++ ...ne_presence_absence_w_refound_fragment.csv | 8 + 10 files changed, 247 insertions(+), 57 deletions(-) create mode 100644 unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/Corrected_gffs/place_holder create mode 100644 unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/Silas_the_Salmonella_w_refound.gff create mode 100644 unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/gene_presence_absence_w_refound_fragment.csv diff --git a/Corekaburra/__main__.py b/Corekaburra/__main__.py index d0af37c..26668ab 100644 --- a/Corekaburra/__main__.py +++ b/Corekaburra/__main__.py @@ -146,7 +146,7 @@ def main(): inital_check_time_start = time.time() # get arguments from the commandline - args = get_commandline_arguments(sys.argv[1:]) + args = get_commandline_arguments(sys.argv[1:], PROGRAM_VERSION) # Construct output folder try: @@ -195,21 +195,23 @@ def main(): ## Read in gene presence absence file time_start_read_files = time.time() + # Prepair folder for reannotated genes and examine if any are already present + if source_program == "Panaroo" and args.annotate: + gene_data_dict, corrected_dir, args.input_gffs = prepair_for_reannotation(gene_data_path, args.output_path, + args.input_gffs, logger) + else: + gene_data_dict = None + 
corrected_dir = None + # TODO - ATM the column with presence of gene in genomes is used to define what is core and not. Is it better to use the number of input gffs instead? # - There are upsides to the current. You can use the same genome to find segments for two different populations with in the dataset using the same reference of core-genes # - Making it depend on the input is not viable for comparing runs, even within the same pan-genome, when using different sets of gff files. # TODO - Some day it would be awesome to be able to provide a clustering/population structure which could divide genes into the 13 definitions outlined by Horesh et al. [DOI: 10.1099/mgen.0.000670] core_dict, low_freq_dict, acc_gene_dict = read_gene_presence_absence(input_pres_abs_file_path, args.core_cutoff, args.low_cutoff, source_program, - args.input_gffs, tmp_folder_path, logger) + args.input_gffs, tmp_folder_path, + gene_data_dict, corrected_dir, logger) - # Prepair folder for reannotated genes and examine if any are already present - if source_program == "Panaroo" and args.annotate: - gene_data_dict, corrected_dir, args.input_gffs = prepair_for_reannotation(gene_data_path, args.output_path, - args.input_gffs, logger) - else: - gene_data_dict = None - corrected_dir = None time_end_read_files = time.time() time_start_passing_gffs = time.time() diff --git a/Corekaburra/commandline_interface.py b/Corekaburra/commandline_interface.py index 72614e6..62cbb40 100644 --- a/Corekaburra/commandline_interface.py +++ b/Corekaburra/commandline_interface.py @@ -4,15 +4,16 @@ EXIT_COMMAND_LINE_ERROR = 2 -def get_commandline_arguments(args): +def get_commandline_arguments(args, version): """ Function that takes the input given to the commandline and passes it. will check for no input and '-help' :param args: List of input arguments given to the commandline + :param version: Version of Corekaburra :return: matched argument object for passing in main function. 
""" # Set up parser - parser = argparse.ArgumentParser(description='Welcome to Corekaburra!' + parser = argparse.ArgumentParser(description='Welcome to Corekaburra! ' 'An extension to pan-genome analyses that summarise genomic regions ' 'between core genes and segments of neighbouring core genes using ' 'gene synteny from a set of input genomes and a pan-genome folder.', @@ -124,7 +125,10 @@ def get_commandline_arguments(args): action='help', help='Show help function') - + rem_args.add_argument('-v', + '--version', + action='version', + version=f'Corekaburra {version}') # Check if any thing is given as input otherwise warn and print help if len(args) < 1: @@ -137,3 +141,7 @@ def get_commandline_arguments(args): args = parser.parse_args(args) return args + + +if __name__ == '__main__': + get_commandline_arguments([], 666) \ No newline at end of file diff --git a/Corekaburra/correct_gffs.py b/Corekaburra/correct_gffs.py index dd6e5c7..40ad9ef 100644 --- a/Corekaburra/correct_gffs.py +++ b/Corekaburra/correct_gffs.py @@ -33,7 +33,7 @@ def read_gene_data(gene_data_file): # Check if refound gene if 'refound' in line[2]: - # Try to add the refound gene to the gene_data dict as a second key, value being the DNA sequence, + # Try to add the refound gene to the gene_data dict as a second key, value being the DNA sequence, name, and function in that order. 
# if the first key (genome) is not found in gene_data dict, # then construct dict for the genome and add the gene try: @@ -316,9 +316,3 @@ def annotate_refound_genes(gff_name, gene_data_dict, tmp_folder_path, corrected_ if __name__ == '__main__': pass - # _, _, attribute_dict = read_gene_presence_absence('/Users/mjespersen/OneDrive - The University of Melbourne/Phd/Parts/Recombination_hotspots/Code/Between_core_variation/data_for_unit_tests/test_pan_genome/50_refseq_pan_split_paralogs/gene_presence_absence_roary.csv', - # 1, 0.05) - # - # correct_gffs(['/Users/mjespersen/OneDrive - The University of Melbourne/Phd/Parts/Recombination_hotspots/Code/Between_core_variation/data_for_unit_tests/test_pan_genome/50_refseq_genomes/GCA_008694005.gff'], '/Users/mjespersen/OneDrive - The University of Melbourne/Phd/Parts/Recombination_hotspots/Code/Between_core_variation/data_for_unit_tests/test_pan_genome/50_refseq_pan_split_paralogs/gene_data.csv', - # "/Users/mjespersen/OneDrive - The University of Melbourne/Phd/Parts/Recombination_hotspots/Code/Between_core_variation/data_for_unit_tests", attribute_dict) - # # genome_dict = extract_genome_fasta('/Users/mjespersen/OneDrive - The University of Melbourne/Phd/Parts/Recombination_hotspots/Code/Between_core_variation/data_for_unit_tests/test_pan_genome/50_refseq_genomes/GCA_000006785.gff') diff --git a/Corekaburra/parse_gene_presence_absence.py b/Corekaburra/parse_gene_presence_absence.py index d680ad1..b1f1eba 100644 --- a/Corekaburra/parse_gene_presence_absence.py +++ b/Corekaburra/parse_gene_presence_absence.py @@ -3,6 +3,12 @@ from math import ceil, floor import gffutils +try: + from Corekaburra.correct_gffs import annotate_refound_genes +except ModuleNotFoundError: + from correct_gffs import annotate_refound_genes + + def add_gene_to_dict(main_dict, gene, pan_gene_name, genome): """ @@ -22,7 +28,7 @@ def add_gene_to_dict(main_dict, gene, pan_gene_name, genome): return main_dict -def check_fragmented_gene(fragment_info, 
input_gffs, tmp_folder_path): +def check_fragmented_gene(fragment_info, input_gffs, tmp_folder_path, gene_data_dict, corrected_dir, logger): """ Function that check for that placement of fragmented gene parts, to determine if they are neighbouring or have some genomic feature between them :param fragment_info: List of genes that are found to be fragmented, one composite of fragments for each index @@ -30,7 +36,20 @@ def check_fragmented_gene(fragment_info, input_gffs, tmp_folder_path): :param tmp_folder_path: A file-path to the temporary folder of the Corekaburra run :return: A List of booleans indicating if a fragments has nothing in between fragments (True) or not (False) """ - return_list = [] + # Check if any refound genes are in fragments to be checked, if then reannotate the genes before checking: + refound_genes = [[i, gene_gff] for i, gene_gff in enumerate(fragment_info) if 'refound' in gene_gff[0]] + if refound_genes: + for i, gene_gff in refound_genes: + # TODO Check if corrected genome is already made if then skip and just correct genome to look in. 
+ gene, gff = gene_gff + gff_name = [gff_name for gff_name in input_gffs + if gff in [os.path.basename(gff_name), + os.path.basename(gff_name).rsplit('.', 1)[0], + os.path.basename(gff_name).rsplit('.', 1)[0].rsplit('.', 1)[0]]][0] + + fragment_info[i][1] = annotate_refound_genes(gff_name, gene_data_dict, tmp_folder_path, corrected_dir, logger) + + fragments_close = [] for fragment in fragment_info: # split the two fragments fragment_pieces = fragment[0].split(';') @@ -39,21 +58,27 @@ def check_fragmented_gene(fragment_info, input_gffs, tmp_folder_path): genome = fragment[1] # Get the gff and its path - try: - gff_file = [file for file in input_gffs if f'{genome}.gff' in file][0] - except IndexError: - raise NotImplementedError(f'No gff match was found when searching fragments for genome: {genome}') + if '.gff' not in genome: + try: + gff_file = [file for file in input_gffs if f'{genome}.gff' in file][0] + db_name = os.path.join(tmp_folder_path, f'{genome}_db') + except IndexError: + raise NotImplementedError(f'No gff match was found when searching fragments for genome: {genome}') + else: + gff_file = genome + db_name = f"{os.path.basename(genome)}_db" + db_name = os.path.join(tmp_folder_path, db_name) # Construct gff database to be searched - db_name = os.path.join(tmp_folder_path, f'{genome}_db') if not os.path.isfile(db_name): - gffutils.create_db(gff_file, db_name, force_gff=True) + gffutils.create_db(gff_file, db_name, force_gff=True, id_spec=['old_locus_tag', 'ID']) # Attach database gff_database = gffutils.FeatureDB(db_name) # Check that all fragments are on the same contig. 
first_fragment_contig = gff_database[fragment_pieces[0]][0] + frag_same_contig = all([first_fragment_contig == gff_database[fragment][0] for fragment in fragment_pieces]) if frag_same_contig: # Get all coordinates @@ -70,23 +95,29 @@ def check_fragmented_gene(fragment_info, input_gffs, tmp_folder_path): # Find all features that are completely within the region region_features = gff_database.region(region=region, completely_within=True) + # Find if some pieces are refound and change old_locus_tag to ID + refound_pieces = [[i, fragment_piece] for i, fragment_piece in enumerate(fragment_pieces) if 'refound' in fragment_piece] + if refound_pieces: + for piece in refound_pieces: + fragment_pieces[i] = gff_database[piece[1]]['ID'][0] + # find all genes that are not part of the fragmented gene region_locus_tags = set([feature[8]['locus_tag'][0] for feature in region_features]) excess_genes = region_locus_tags.difference(fragment_pieces) # check the number of excess genes, if any then False to being core if len(excess_genes) > 0: - return_list.append(False) + fragments_close.append(False) else: - return_list.append(True) + fragments_close.append(True) else: - return_list.append(False) + fragments_close.append(False) - return return_list + return fragments_close # TODO - find out what the non-closed file problem is here! Can be seen when running unit-tests. -def read_gene_presence_absence(pres_abs_file, core_gene_presence, low_freq_gene, source_program, input_gffs, tmp_folder_path, logger): +def read_gene_presence_absence(pres_abs_file, core_gene_presence, low_freq_gene, source_program, input_gffs, tmp_folder_path, gene_data_dict, corrected_dir, logger): """ Function that pass a Roary style gene presence/absence file. 
:param pres_abs_file: File path to the gene presence/absence file identified @@ -166,10 +197,11 @@ def read_gene_presence_absence(pres_abs_file, core_gene_presence, low_freq_gene, fragment_info = [[genes, gff] for genes, gff in zip(line[14:], gff_file_names[14:]) if ';' in genes] # Check that each annotation is neighboring the other annotation. - return_list = check_fragmented_gene(fragment_info, input_gffs, tmp_folder_path) # TODO - If a core gene is found to be made up of fragments not places close enough (With something in between) should this then not be subtracted from the core gene count? - How would this be handled if there is a gff that is not given as input? + fragments_close = check_fragmented_gene(fragment_info, input_gffs, tmp_folder_path, gene_data_dict, + corrected_dir, logger) # TODO - If a core gene is found to be made up of fragments not places close enough (With something in between) should this then not be subtracted from the core gene count? - How would this be handled if there is a gff that is not given as input? # Check if gene was found to be a core gene - if all(return_list): + if all(fragments_close): # Add the gene to the annotation dict for genome in core_gene_dict: # Get the annoations for a specific genome diff --git a/README.md b/README.md index 720d766..e08541d 100644 --- a/README.md +++ b/README.md @@ -7,22 +7,24 @@ and distance between core genes. Information from neighboring core genes is furt gene clusters throughout the pan-genome that appear in all genomes given as input. Corekaburra is compatible with outputs from standard pan-genome pipelines: [Roary](academic.oup.com/bioinformatics/article/31/22/3691/240757) and [Panaroo](genomebiology.biomedcentral.com/articles/10.1186/s13059-020-02090-4). -# When to use +# Why and When to use Corekaburra Corekaburra fits into the existing frameworks of bioinformatics pipelines for pan-genomes. It does not reinvent a new pan-genome pipeline, but leverages the existing ones. 
Because of this, Corekaburra is build to be a natural extension to the analysis of pan-genomes by summarising information and inferring relationships in the pan-genome otherwise not easily accessible via pan-genome graphs. Other tools provide similar outputs or information, but in their own standalone pan-genome analysis framework or pipeline. Such frameworks/pipelines are [PPanGGolin](https://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1007732) and [Panakeia](https://www.biorxiv.org/content/biorxiv/early/2021/03/02/2021.03.02.433540.full.pdf). By building on top of existing tools Corekaburra frees users from potentially cross referencing beteween pan-genomes, which in itself is a challenging task. Corekaburra's workflow also allows it to be extended to any pan-genome tool, with an output similar to the gene_presence_absence.csv produced by Roary, making Corekaburra versatile for future implementations. -# Why use Corekaburra? - - # Installation -Corekaburra can be installed via pip and conda. A Docker container is also available. +Corekaburra is written in Python 3.9, and can be installed via pip and conda. A Docker container is also available. 
## pip -```Comming soon``` +```pip install corekaburra``` + +## building a Conda environment from scratch +```conda create -n Corekaburra python==3.9``` +```conda activate Corekaburra``` +```pip install corekaburra``` -## Conda +## Conda install ```Comming``` ## Docker -See the Wiki for more information (*** Link to wiki's Docker page ***)[] +See the [Wiki for more information](https://github.com/milnus/Corekaburra/wiki/Docker.md) # Help ``` @@ -114,7 +116,7 @@ A folder containing Gff files that have been corrected by annotating the genes r **Notice this will duplicate your Gff files, meaning that ```-a``` or ```-d``` arguments should be used to avoid this, when dealing with memory issues or large datasets** # For more info -For more into on Corekaburra, its workings, inputs, outputs and more see the (wiki)[*** Wiki link ***] +For more info on Corekaburra, its workings, inputs, outputs and more see the [wiki](https://github.com/milnus/Corekaburra/wiki) # Bug reporting and feature requests diff --git a/functional_tests/test_data/no_input.expected b/functional_tests/test_data/no_input.expected index f991755..bbc0960 100644 --- a/functional_tests/test_data/no_input.expected +++ b/functional_tests/test_data/no_input.expected @@ -1,9 +1,10 @@ -usage: Corekaburra -ig file.gff [file.gff ...] -ip path/to/pan_genome - [-cg complete_genomes.txt] [-a] [-cc 1.0] [-lc 0.05] - [-o path/to/output] [-p OUTPUT_PREFIX] [-d] [-c int] - [-l | -q] [-h] +usage: Corekaburra -ig file.gff [file.gff ...] -ip + path/to/pan_genome [-cg complete_genomes.txt] + [-a] [-cc 1.0] [-lc 0.05] [-o path/to/output] + [-p OUTPUT_PREFIX] [-d] [-c int] [-l | -q] + [-h] [-v] -Welcome to Corekaburra!An extension to pan-genome analyses that summarise +Welcome to Corekaburra! An extension to pan-genome analyses that summarise genomic regions between core genes and segments of neighbouring core genes using gene synteny from a set of input genomes and a pan-genome folder. 
@@ -44,3 +45,4 @@ Other arguments: -l, --log Record program progress in for debugging purpose -q, --quiet Only print warnings -h, --help Show help function + -v, --version show program's version number and exit diff --git a/unit_tests/Corekaburra_test.py b/unit_tests/Corekaburra_test.py index 2b073ea..15860ca 100644 --- a/unit_tests/Corekaburra_test.py +++ b/unit_tests/Corekaburra_test.py @@ -259,6 +259,10 @@ class TestCheckingFragmentedGenes(unittest.TestCase): """ Test of the function that examines the placement of a potential core gene's placement, if it is fragmented in at least one genome. """ + @classmethod + def setUpClass(cls): + cls.logger = logging.getLogger('test_logger.log') + cls.logger.setLevel(logging.INFO) def tearDown(self): """ Class to remove created database files of gff files in tmp-folder""" @@ -281,10 +285,13 @@ def test_fragmented_gene_true(self): 'TestParsingGenePresenceAbsenceFile/Silas_the_Legionella.gff', 'TestParsingGenePresenceAbsenceFile/Lilly_the_Shigella.gff'] tmp_folder_path = 'test_tmp_folder' + gene_data_file = {} + corrected_dir = '' expected_return = [True] - return_bool = parse_gene_presence_absence.check_fragmented_gene(fragments_info, input_gffs, tmp_folder_path) + return_bool = parse_gene_presence_absence.check_fragmented_gene(fragments_info, input_gffs, tmp_folder_path, + gene_data_file, corrected_dir, self.logger) self.assertEqual(expected_return, return_bool) @@ -302,10 +309,13 @@ def test_fragmented_gene_fasle(self): 'TestParsingGenePresenceAbsenceFile/Silas_the_Legionella.gff', 'TestParsingGenePresenceAbsenceFile/Lilly_the_Shigella.gff'] tmp_folder_path = 'test_tmp_folder' + gene_data_file = {} + corrected_dir = '' expected_return = [False] - return_bool = parse_gene_presence_absence.check_fragmented_gene(fragments_info, input_gffs, tmp_folder_path) + return_bool = parse_gene_presence_absence.check_fragmented_gene(fragments_info, input_gffs, tmp_folder_path, + gene_data_file, corrected_dir, self.logger) 
self.assertEqual(expected_return, return_bool) @@ -324,10 +334,13 @@ def test_fragmented_gene_mutiple_genes_fasle(self): 'TestParsingGenePresenceAbsenceFile/Silas_the_Legionella.gff', 'TestParsingGenePresenceAbsenceFile/Lilly_the_Shigella.gff'] tmp_folder_path = 'test_tmp_folder' + gene_data_file = {} + corrected_dir = '' expected_return = [True, False] - return_bool = parse_gene_presence_absence.check_fragmented_gene(fragment_info, input_gffs, tmp_folder_path) + return_bool = parse_gene_presence_absence.check_fragmented_gene(fragment_info, input_gffs, tmp_folder_path, + gene_data_file, corrected_dir, self.logger) self.assertEqual(expected_return, return_bool) @@ -340,10 +353,13 @@ def test_fragments_on_separate_contigs(self): 'TestCheckingFragmentedGenes/Silas_the_Legionella.gff', 'TestCheckingFragmentedGenes/Lilly_the_Shigella.gff'] tmp_folder_path = 'test_tmp_folder' + gene_data_file = {} + corrected_dir = '' expected_return = [False, False] - return_bool = parse_gene_presence_absence.check_fragmented_gene(fragments_info, input_gffs, tmp_folder_path) + return_bool = parse_gene_presence_absence.check_fragmented_gene(fragments_info, input_gffs, tmp_folder_path, + gene_data_file, corrected_dir, self.logger) self.assertEqual(expected_return, return_bool) @@ -357,6 +373,20 @@ def setUpClass(cls): cls.logger = logging.getLogger('test_logger.log') cls.logger.setLevel(logging.INFO) + def tearDown(self): + try: + os.remove('TestParsingGenePresenceAbsenceFile/Silas_the_Salmonella_w_refound_db') + except FileNotFoundError: + pass + + try: + for file in os.listdir('TestParsingGenePresenceAbsenceFile/Corrected_gffs/'): + if '.gff' in file: + print(file) + os.remove(os.path.join('TestParsingGenePresenceAbsenceFile/Corrected_gffs/', file)) + except FileNotFoundError: + pass + def test_parsing_w_100_presence(self): file_name = 'TestParsingGenePresenceAbsenceFile/gene_presence_absence_roary.csv' core_gene_presence = 1 @@ -373,6 +403,8 @@ def test_parsing_w_100_presence(self): 
'TestParsingGenePresenceAbsenceFile/Silas_the_Legionella.gff', 'TestParsingGenePresenceAbsenceFile/Lilly_the_Shigella.gff'] tmp_folder_path = 'TestParsingGenePresenceAbsenceFile/' + gene_data_file = {} + corrected_dir = '' expected_core_gene_dict = {'Silas_the_Salmonella': {'Silas_the_Salmonella_tag-1-1': "A", 'Silas_the_Salmonella_tag-1-2.1': "B", @@ -441,7 +473,7 @@ def test_parsing_w_100_presence(self): parse_gene_presence_absence.read_gene_presence_absence( file_name, core_gene_presence, low_freq_gene, source_program, - input_gffs, tmp_folder_path, self.logger) + input_gffs, tmp_folder_path, gene_data_file, corrected_dir, self.logger) self.assertEqual(expected_core_gene_dict, core_gene_dict) self.assertEqual(expected_low_freq_gene_dict, low_freq_gene_dict) @@ -463,7 +495,8 @@ def test_parsing_w_100_presence_roary(self): 'TestParsingGenePresenceAbsenceFile/Silas_the_Legionella.gff', 'TestParsingGenePresenceAbsenceFile/Lilly_the_Shigella.gff'] tmp_folder_path = 'TestParsingGenePresenceAbsenceFile/' - + gene_data_file = {} + corrected_dir = '' core_gene_dict, low_freq_gene_dict, \ @@ -471,7 +504,7 @@ def test_parsing_w_100_presence_roary(self): parse_gene_presence_absence.read_gene_presence_absence( file_name, core_gene_presence, low_freq_gene, source_program, - input_gffs, tmp_folder_path, self.logger) + input_gffs, tmp_folder_path, gene_data_file, corrected_dir, self.logger) expected_core_gene_dict = {'Silas_the_Salmonella': {'Silas_the_Salmonella_tag-1-1': "A", 'Silas_the_Salmonella_tag-1-2.1': "B", @@ -555,13 +588,15 @@ def test_parsing_w_90_presence(self): 'TestParsingGenePresenceAbsenceFile/Silas_the_Legionella.gff', 'TestParsingGenePresenceAbsenceFile/Lilly_the_Shigella.gff'] tmp_folder_path = 'TestParsingGenePresenceAbsenceFile/' + gene_data_file = {} + corrected_dir = '' core_gene_dict, low_freq_gene_dict, \ acc_gene_dict = \ parse_gene_presence_absence.read_gene_presence_absence( file_name, core_gene_presence, low_freq_gene, source_program, - input_gffs, 
tmp_folder_path, self.logger) + input_gffs, tmp_folder_path, gene_data_file, corrected_dir, self.logger) expected_core_gene_dict = {'Silas_the_Salmonella': {'Silas_the_Salmonella_tag-1-1': "A", 'Silas_the_Salmonella_tag-1-2.1': "B", @@ -645,13 +680,15 @@ def test_parsing_w_90_presence_roary(self): 'TestParsingGenePresenceAbsenceFile/Silas_the_Legionella.gff', 'TestParsingGenePresenceAbsenceFile/Lilly_the_Shigella.gff'] tmp_folder_path = 'TestParsingGenePresenceAbsenceFile/' + gene_data_file = {} + corrected_dir = '' core_gene_dict, low_freq_gene_dict, \ acc_gene_dict = \ parse_gene_presence_absence.read_gene_presence_absence( file_name, core_gene_presence, low_freq_gene, source_program, - input_gffs, tmp_folder_path, self.logger) + input_gffs, tmp_folder_path, gene_data_file, corrected_dir, self.logger) expected_core_gene_dict = {'Silas_the_Salmonella': {'Silas_the_Salmonella_tag-1-1': "A", 'Silas_the_Salmonella_tag-1-2.1': "B", @@ -719,6 +756,100 @@ def test_parsing_w_90_presence_roary(self): self.assertEqual(expected_low_freq_gene_dict, low_freq_gene_dict) self.assertEqual(expected_acc_gene_dict, acc_gene_dict) + def test_parsign_fragmented_gene_w_refound_component(self): + file_name = 'TestParsingGenePresenceAbsenceFile/gene_presence_absence_w_refound_fragment.csv' + core_gene_presence = 0.9 + low_freq_gene = 0.1 + source_program = 'Panaroo' + input_gffs = ['TestParsingGenePresenceAbsenceFile/Christina_the_Streptococcus.gff', + 'TestParsingGenePresenceAbsenceFile/Ajwa_the_Shigella.gff', + 'TestParsingGenePresenceAbsenceFile/Ajwa_the_Legionella.gff', + 'TestParsingGenePresenceAbsenceFile/Silas_the_Salmonella_w_refound.gff', + 'TestParsingGenePresenceAbsenceFile/Cari_the_Listeria.gff', + 'TestParsingGenePresenceAbsenceFile/Aman_the_Streptococcus.gff', + 'TestParsingGenePresenceAbsenceFile/Zion_the_Streptococcus.gff', + 'TestParsingGenePresenceAbsenceFile/Dina_the_Shigella.gff', + 'TestParsingGenePresenceAbsenceFile/Silas_the_Legionella.gff', + 
'TestParsingGenePresenceAbsenceFile/Lilly_the_Shigella.gff'] + tmp_folder_path = 'TestParsingGenePresenceAbsenceFile/' + gene_data_file = {'Silas_the_Salmonella_w_refound': {'0_refound_0': ['CTCTTCCGATCTAATCAAGATTGAGAGGAATTGCTGTTTTTATTGGCAAGACAATTTTACTCTTCCGATCTAATCAAGATTGAGAGGAATT', 'gene_name', 'gene_function']}} + corrected_dir ='TestParsingGenePresenceAbsenceFile/Corrected_gffs' + + core_gene_dict, low_freq_gene_dict, \ + acc_gene_dict = \ + parse_gene_presence_absence.read_gene_presence_absence( + file_name, core_gene_presence, + low_freq_gene, source_program, + input_gffs, tmp_folder_path, gene_data_file, corrected_dir, self.logger) + + expected_core_gene_dict = {'Silas_the_Salmonella_w_refound': {'Silas_the_Salmonella_tag-1-1': "A", + '0_refound_0': "B", + 'Silas_the_Salmonella_tag-1-2.2': "B", + 'Silas_the_Salmonella_tag-1-3': 'C', + 'Silas_the_Salmonella_tag-1-4.1': 'D', + 'Silas_the_Salmonella_tag-1-4.2': 'D', }, + 'Christina_the_Streptococcus': {'Christina_the_Streptococcus_tag-2-1': "A", + 'Christina_the_Streptococcus_tag-2-2': "B", + 'Christina_the_Streptococcus_tag-2-3': "C", + 'Christina_the_Streptococcus_tag-2-4': "D"}, + 'Ajwa_the_Shigella': {'Ajwa_the_Shigella_tag-3-1': "A", + 'Ajwa_the_Shigella_tag-3-2': "B", + "Ajwa_the_Shigella_tag-3-3": "C", + "Ajwa_the_Shigella_tag-3-4": "D"}, + 'Ajwa_the_Legionella': {'Ajwa_the_Legionella_tag-4-1': "A", + 'Ajwa_the_Legionella_tag-4-2': "B", + 'Ajwa_the_Legionella_tag-4-3': "C", + 'Ajwa_the_Legionella_tag-4-4': "D"}, + 'Cari_the_Listeria': {"Cari_the_Listeria_tag-5-3": "C", + "Cari_the_Listeria_tag-5-4": "D", + 'Cari_the_Listeria_tag-5-1': "A", + 'Cari_the_Listeria_tag-5-2': "B"}, + 'Aman_the_Streptococcus': {'Aman_the_Streptococcus_tag-6-1': "A", + 'Aman_the_Streptococcus_tag-6-2': "B", + "Aman_the_Streptococcus_tag-6-3": "C", + "Aman_the_Streptococcus_tag-6-4": "D"}, + 'Zion_the_Streptococcus': {"Zion_the_Streptococcus_tag-7-3": "C", + "Zion_the_Streptococcus_tag-7-4": "D", + 
'Zion_the_Streptococcus_tag-7-1': "A", + 'Zion_the_Streptococcus_tag-7-2': "B"}, + 'Dina_the_Shigella': {"Dina_the_Shigella_tag-8-3": "C", + "Dina_the_Shigella_tag-8-4": "D", + 'Dina_the_Shigella_tag-8-1': "A", + 'Dina_the_Shigella_tag-8-2': "B"}, + 'Silas_the_Legionella': {"Silas_the_Legionella_tag-9-3": "C", + "Silas_the_Legionella_tag-9-4": "D", + 'Silas_the_Legionella_tag-9-1': "A", + 'Silas_the_Legionella_tag-9-2': "B"}, + 'Lilly_the_Shigella': {'Lilly_the_Shigella_tag-10-1': "A", + 'Lilly_the_Shigella_tag-10-2': "B"}} + + expected_low_freq_gene_dict = {'Silas_the_Salmonella_w_refound': {'Silas_the_Salmonella_tag_2': "G"}, + 'Christina_the_Streptococcus': {}, + 'Ajwa_the_Shigella': {}, + 'Ajwa_the_Legionella': {}, + 'Cari_the_Listeria': {}, + 'Aman_the_Streptococcus': {}, + 'Zion_the_Streptococcus': {}, + 'Dina_the_Shigella': {}, + 'Silas_the_Legionella': {}, + 'Lilly_the_Shigella': {'Lilly_the_Shigella_tag-10-6': "F"}} + + expected_acc_gene_dict = {'Silas_the_Salmonella_w_refound': {'Silas_the_Salmonella_tag-1-5.1': 'E', + 'Silas_the_Salmonella_tag-1-5.2': 'E'}, + 'Christina_the_Streptococcus': {'Christina_the_Streptococcus_tag-2-5': "E"}, + 'Ajwa_the_Shigella': {"Ajwa_the_Shigella_tag-3-5": "E"}, + 'Ajwa_the_Legionella': {'Ajwa_the_Legionella_tag-4-5': "E"}, + 'Cari_the_Listeria': {"Cari_the_Listeria_tag-5-5": "E"}, + 'Aman_the_Streptococcus': {"Aman_the_Streptococcus_tag-6-5": "E"}, + 'Zion_the_Streptococcus': {"Zion_the_Streptococcus_tag-7-5": "E"}, + 'Dina_the_Shigella': {"Dina_the_Shigella_tag-8-5": "E"}, + 'Silas_the_Legionella': {"Silas_the_Legionella_tag-9-5": "E"}, + 'Lilly_the_Shigella': {'Lilly_the_Shigella_tag-10-5': "E"}} + + self.assertEqual(expected_core_gene_dict, core_gene_dict) + self.assertEqual(expected_low_freq_gene_dict, low_freq_gene_dict) + self.assertEqual(expected_acc_gene_dict, acc_gene_dict) + class TestReadGeneData(unittest.TestCase): """ Function to test the passing of gene_data.csv file from Panaroo """ diff --git 
a/unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/Corrected_gffs/place_holder b/unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/Corrected_gffs/place_holder new file mode 100644 index 0000000..e69de29 diff --git a/unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/Silas_the_Salmonella_w_refound.gff b/unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/Silas_the_Salmonella_w_refound.gff new file mode 100644 index 0000000..a2d074f --- /dev/null +++ b/unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/Silas_the_Salmonella_w_refound.gff @@ -0,0 +1,11 @@ +contig_1 . CDS 1 90 . . . ID=Silas_the_Salmonella_tag-1-1;locus_tag=Silas_the_Salmonella_tag-1-1 +contig_1 . CDS 100 190 . . . ID=Silas_the_Salmonella_tag-1-2.2;locus_tag=Silas_the_Salmonella_tag-1-2.2 +contig_1 . CDS 300 390 . . . ID=Silas_the_Salmonella_tag-1-3;locus_tag=Silas_the_Salmonella_tag-1-3 +contig_1 . CDS 400 490 . . . ID=Silas_the_Salmonella_tag-1-4.1;locus_tag=Silas_the_Salmonella_tag-1-4.1 +contig_1 . CDS 500 590 . . . ID=Silas_the_Salmonella_tag-1-4.2;locus_tag=Silas_the_Salmonella_tag-1-4.2 +contig_1 . CDS 600 690 . . . ID=Silas_the_Salmonella_tag-1-5.1;locus_tag=Silas_the_Salmonella_tag-1-5.1 +contig_1 . CDS 700 790 . . . ID=Silas_the_Salmonella_tag_2;locus_tag=Silas_the_Salmonella_tag_2 +contig_1 . CDS 800 890 . . . 
ID=Silas_the_Salmonella_tag-1-5.2;locus_tag=Silas_the_Salmonella_tag-1-5.2 +##FASTA +>contig_1 +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCTCTTCCGATCTAATCAAGATTGAGAGGAATTGCTGTTTTTATTGGCAAGACAATTTTACTCTTCCGATCTAATCAAGATTGAGAGGAATTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN diff --git a/unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/gene_presence_absence_w_refound_fragment.csv b/unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/gene_presence_absence_w_refound_fragment.csv new file mode 100644 index 0000000..8065be5 --- /dev/null +++ b/unit_tests/unit_test_data/TestParsingGenePresenceAbsenceFile/gene_presence_absence_w_refound_fragment.csv @@ -0,0 +1,8 @@ 
+Gene,Non.unique.Gene.name,Annotation,No..isolates,No..sequences,Avg.sequences.per.isolate,Genome.Fragment,Order.within.Fragment,Accessory.Fragment,Accessory.Order.with.Fragment,QC,Min.group.size.nuc,Max.group.size.nuc,Avg.group.size.nuc,Christina_the_Streptococcus,Ajwa_the_Shigella,Ajwa_the_Legionella,Cari_the_Listeria,Aman_the_Streptococcus,Zion_the_Streptococcus,Dina_the_Shigella,Silas_the_Legionella,Lilly_the_Shigella,Silas_the_Salmonella_w_refound +A,,,10,10,1,,,,,,,,,Christina_the_Streptococcus_tag-2-1,Ajwa_the_Shigella_tag-3-1,Ajwa_the_Legionella_tag-4-1,Cari_the_Listeria_tag-5-1,Aman_the_Streptococcus_tag-6-1,Zion_the_Streptococcus_tag-7-1,Dina_the_Shigella_tag-8-1,Silas_the_Legionella_tag-9-1,Lilly_the_Shigella_tag-10-1,Silas_the_Salmonella_tag-1-1 +B,,,10,11,1.2,,,,,,,,,Christina_the_Streptococcus_tag-2-2,Ajwa_the_Shigella_tag-3-2,Ajwa_the_Legionella_tag-4-2,Cari_the_Listeria_tag-5-2,Aman_the_Streptococcus_tag-6-2,Zion_the_Streptococcus_tag-7-2,Dina_the_Shigella_tag-8-2,Silas_the_Legionella_tag-9-2,Lilly_the_Shigella_tag-10-2,0_refound_0;Silas_the_Salmonella_tag-1-2.2 +C,,,9,9,1,,,,,,,,,Christina_the_Streptococcus_tag-2-3,Ajwa_the_Shigella_tag-3-3,Ajwa_the_Legionella_tag-4-3,Cari_the_Listeria_tag-5-3,Aman_the_Streptococcus_tag-6-3,Zion_the_Streptococcus_tag-7-3,Dina_the_Shigella_tag-8-3,Silas_the_Legionella_tag-9-3,,Silas_the_Salmonella_tag-1-3 +D,,,9,10,1.1,,,,,,,,,Christina_the_Streptococcus_tag-2-4,Ajwa_the_Shigella_tag-3-4,Ajwa_the_Legionella_tag-4-4,Cari_the_Listeria_tag-5-4,Aman_the_Streptococcus_tag-6-4,Zion_the_Streptococcus_tag-7-4,Dina_the_Shigella_tag-8-4,Silas_the_Legionella_tag-9-4,,Silas_the_Salmonella_tag-1-4.1;Silas_the_Salmonella_tag-1-4.2 
+E,,,10,11,1.2,,,,,,,,,Christina_the_Streptococcus_tag-2-5,Ajwa_the_Shigella_tag-3-5,Ajwa_the_Legionella_tag-4-5,Cari_the_Listeria_tag-5-5,Aman_the_Streptococcus_tag-6-5,Zion_the_Streptococcus_tag-7-5,Dina_the_Shigella_tag-8-5,Silas_the_Legionella_tag-9-5,Lilly_the_Shigella_tag-10-5,Silas_the_Salmonella_tag-1-5.1;Silas_the_Salmonella_tag-1-5.2 +F,,,1,1,1,,,,,,,,,,,,,,,,,Lilly_the_Shigella_tag-10-6, +G,,,1,1,1,,,,,,,,,,,,,,,,,,Silas_the_Salmonella_tag_2 \ No newline at end of file From 1a594ea263b1e2efa0e78c269a73a6a96bd54913 Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Tue, 25 Jan 2022 11:36:17 +1100 Subject: [PATCH 02/20] Change the help message from commandline --- functional_tests/test_data/no_input.expected | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/functional_tests/test_data/no_input.expected b/functional_tests/test_data/no_input.expected index bbc0960..dbbc061 100644 --- a/functional_tests/test_data/no_input.expected +++ b/functional_tests/test_data/no_input.expected @@ -1,8 +1,7 @@ -usage: Corekaburra -ig file.gff [file.gff ...] -ip - path/to/pan_genome [-cg complete_genomes.txt] - [-a] [-cc 1.0] [-lc 0.05] [-o path/to/output] - [-p OUTPUT_PREFIX] [-d] [-c int] [-l | -q] - [-h] [-v] +usage: commandline_interface.py -ig file.gff [file.gff ...] -ip path/to/pan_genome + [-cg complete_genomes.txt] [-a] [-cc 1.0] [-lc 0.05] + [-o path/to/output] [-p OUTPUT_PREFIX] [-d] [-c int] + [-l | -q] [-h] [-v] Welcome to Corekaburra! 
An extension to pan-genome analyses that summarise genomic regions between core genes and segments of neighbouring core genes From 1dd5c39a200e4f1ca55a8618756cabac71c537d0 Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Tue, 25 Jan 2022 11:38:46 +1100 Subject: [PATCH 03/20] Change the help message from commandline --- functional_tests/test_data/no_input.expected | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/functional_tests/test_data/no_input.expected b/functional_tests/test_data/no_input.expected index dbbc061..8a72a85 100644 --- a/functional_tests/test_data/no_input.expected +++ b/functional_tests/test_data/no_input.expected @@ -1,4 +1,4 @@ -usage: commandline_interface.py -ig file.gff [file.gff ...] -ip path/to/pan_genome +usage: Corekaburra -ig file.gff [file.gff ...] -ip path/to/pan_genome [-cg complete_genomes.txt] [-a] [-cc 1.0] [-lc 0.05] [-o path/to/output] [-p OUTPUT_PREFIX] [-d] [-c int] [-l | -q] [-h] [-v] From f93b4e3ba79d21bacb9106d700655456ad0ae789 Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Tue, 25 Jan 2022 11:43:16 +1100 Subject: [PATCH 04/20] Change the help message from commandline --- functional_tests/test_data/no_input.expected | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/functional_tests/test_data/no_input.expected b/functional_tests/test_data/no_input.expected index 8a72a85..7bf37b1 100644 --- a/functional_tests/test_data/no_input.expected +++ b/functional_tests/test_data/no_input.expected @@ -1,7 +1,7 @@ usage: Corekaburra -ig file.gff [file.gff ...] -ip path/to/pan_genome - [-cg complete_genomes.txt] [-a] [-cc 1.0] [-lc 0.05] - [-o path/to/output] [-p OUTPUT_PREFIX] [-d] [-c int] - [-l | -q] [-h] [-v] + [-cg complete_genomes.txt] [-a] [-cc 1.0] [-lc 0.05] + [-o path/to/output] [-p OUTPUT_PREFIX] [-d] [-c int] + [-l | -q] [-h] [-v] Welcome to Corekaburra! 
An extension to pan-genome analyses that summarise genomic regions between core genes and segments of neighbouring core genes From 29a6e0304769f8807d90a819401088cd0f8c76fd Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Tue, 25 Jan 2022 11:45:44 +1100 Subject: [PATCH 05/20] Change the help message from commandline --- functional_tests/test_data/no_input.expected | 6 +++--- unit_tests/Corekaburra_test.py | 1 - 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/functional_tests/test_data/no_input.expected b/functional_tests/test_data/no_input.expected index 7bf37b1..b0ccabb 100644 --- a/functional_tests/test_data/no_input.expected +++ b/functional_tests/test_data/no_input.expected @@ -1,7 +1,7 @@ usage: Corekaburra -ig file.gff [file.gff ...] -ip path/to/pan_genome - [-cg complete_genomes.txt] [-a] [-cc 1.0] [-lc 0.05] - [-o path/to/output] [-p OUTPUT_PREFIX] [-d] [-c int] - [-l | -q] [-h] [-v] + [-cg complete_genomes.txt] [-a] [-cc 1.0] [-lc 0.05] + [-o path/to/output] [-p OUTPUT_PREFIX] [-d] [-c int] + [-l | -q] [-h] [-v] Welcome to Corekaburra! 
An extension to pan-genome analyses that summarise genomic regions between core genes and segments of neighbouring core genes diff --git a/unit_tests/Corekaburra_test.py b/unit_tests/Corekaburra_test.py index 15860ca..b112a64 100644 --- a/unit_tests/Corekaburra_test.py +++ b/unit_tests/Corekaburra_test.py @@ -382,7 +382,6 @@ def tearDown(self): try: for file in os.listdir('TestParsingGenePresenceAbsenceFile/Corrected_gffs/'): if '.gff' in file: - print(file) os.remove(os.path.join('TestParsingGenePresenceAbsenceFile/Corrected_gffs/', file)) except FileNotFoundError: pass From 124e7eabacf8487dfd194d2cbfa0ba2dff615965 Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Tue, 25 Jan 2022 11:47:50 +1100 Subject: [PATCH 06/20] Change the help message from commandline --- functional_tests/test_data/no_input.expected | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/functional_tests/test_data/no_input.expected b/functional_tests/test_data/no_input.expected index b0ccabb..f14ac07 100644 --- a/functional_tests/test_data/no_input.expected +++ b/functional_tests/test_data/no_input.expected @@ -1,7 +1,7 @@ usage: Corekaburra -ig file.gff [file.gff ...] -ip path/to/pan_genome - [-cg complete_genomes.txt] [-a] [-cc 1.0] [-lc 0.05] - [-o path/to/output] [-p OUTPUT_PREFIX] [-d] [-c int] - [-l | -q] [-h] [-v] + [-cg complete_genomes.txt] [-a] [-cc 1.0] [-lc 0.05] + [-o path/to/output] [-p OUTPUT_PREFIX] [-d] [-c int] + [-l | -q] [-h] [-v] Welcome to Corekaburra! 
An extension to pan-genome analyses that summarise genomic regions between core genes and segments of neighbouring core genes From 89602d1b297171c5d0e31ac3c134377d187d1199 Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Tue, 25 Jan 2022 16:26:51 +1100 Subject: [PATCH 07/20] Add handling of core gene graphs that form multiple components can be handled, also if not complete (linear) --- Corekaburra/consesus_core_genome.py | 53 ++++++--- functional_tests/Corekaburra-test.sh | 21 ++++ ...e_core_accessory_gene_content.tsv.expected | 1 + .../core_pair_summary.csv.expected | 20 ++++ .../core_segments.csv.expected | 14 +++ .../low_frequency_gene_placement.tsv.expected | 46 ++++++++ .../no_accessory_core_segments.csv.expected | 14 +++ .../gene_presence_absence.csv | 14 +++ ...e_core_accessory_gene_content.tsv.expected | 1 + .../core_pair_summary.csv.expected | 18 +++ .../core_segments.csv.expected | 14 +++ .../low_frequency_gene_placement.tsv.expected | 40 +++++++ .../no_accessory_core_segments.csv.expected | 14 +++ .../complete_genome_double_chrom_2_larger.gff | 19 +++ .../complete_genome_double_chrom_3_larger.gff | 19 +++ .../complete_genome_double_chrom_larger.gff | 19 +++ ...complete_larger_double_chr_genome_list.txt | 3 + unit_tests/Corekaburra_test.py | 108 ++++++++++++++++-- 18 files changed, 415 insertions(+), 23 deletions(-) create mode 100644 functional_tests/test_data/Multi_component_graph_expected/core_core_accessory_gene_content.tsv.expected create mode 100644 functional_tests/test_data/Multi_component_graph_expected/core_pair_summary.csv.expected create mode 100644 functional_tests/test_data/Multi_component_graph_expected/core_segments.csv.expected create mode 100644 functional_tests/test_data/Multi_component_graph_expected/low_frequency_gene_placement.tsv.expected create mode 100644 functional_tests/test_data/Multi_component_graph_expected/no_accessory_core_segments.csv.expected create mode 100644 
functional_tests/test_data/Multiple_component_graph/gene_presence_absence.csv create mode 100644 functional_tests/test_data/Multiple_component_graph_complete_expected/core_core_accessory_gene_content.tsv.expected create mode 100644 functional_tests/test_data/Multiple_component_graph_complete_expected/core_pair_summary.csv.expected create mode 100644 functional_tests/test_data/Multiple_component_graph_complete_expected/core_segments.csv.expected create mode 100644 functional_tests/test_data/Multiple_component_graph_complete_expected/low_frequency_gene_placement.tsv.expected create mode 100644 functional_tests/test_data/Multiple_component_graph_complete_expected/no_accessory_core_segments.csv.expected create mode 100644 functional_tests/test_data/complete_genome_double_chrom_2_larger.gff create mode 100644 functional_tests/test_data/complete_genome_double_chrom_3_larger.gff create mode 100644 functional_tests/test_data/complete_genome_double_chrom_larger.gff create mode 100644 functional_tests/test_data/complete_larger_double_chr_genome_list.txt diff --git a/Corekaburra/consesus_core_genome.py b/Corekaburra/consesus_core_genome.py index 1326abe..1ec8fbb 100644 --- a/Corekaburra/consesus_core_genome.py +++ b/Corekaburra/consesus_core_genome.py @@ -115,27 +115,40 @@ def identify_no_accessory_segments(double_edge_segements, combined_acc_gene_coun return sub_segment_dict -def identify_segments(core_graph, num_gffs, core_gene_dict): +def identify_segments(core_graph, num_gffs, core_gene_dict, num_core_graph_components): """ Function to identify stretches of core genes between core genes neighbouring multiple different genes :param core_graph: Graph over core genes with weights being the number of connections between the genes :param num_gffs: Number of gffs inputted + :param core_gene_dict: Dict with keys being genomes, each genome is a dict with keys being genes and values the mapped pan-genome gene cluster. 
+ :return: Dict over stretches of core genes found in the core gene graph. """ + # TODO - Describe missing parameters in docstring + + # TODO - Fix Ouli's problem where the core gene graph may split into two seperat pieces, and also handle double chromosome. + # - Add a chek if the core gene graph is a single component of multiple. Handle components separately. - Write test then program + # - This likely require a change to the all-vs-all search of multi edge core gene search, by adding a try and expect statement maybe, or just handle each component separately. # Identify all nodes that contain more than two degrees. multi_edge_nodes = [node for node, connections in core_graph.degree if connections > 2] + # Check if multiple components in core graph, if then find single edge core_genes + if num_core_graph_components > 1: + singe_edge_nodes = [node for node, connections in core_graph.degree if connections == 1] + else: + singe_edge_nodes = [] # Check if any node have multiple edges, if not then return. - if len(multi_edge_nodes) == 0: + if len(multi_edge_nodes+singe_edge_nodes) == 0: return None # Dict to hold connections between >2 edge nodes connect_dict = {} # for all nodes with >2 degrees themself, identify neighbouring nodes with >2 degrees - for node in multi_edge_nodes: - connect_dict[node] = [neighbor for neighbor in core_graph.neighbors(node) if neighbor in multi_edge_nodes] + for node in multi_edge_nodes+singe_edge_nodes: + connect_dict[node] = [neighbor for neighbor in core_graph.neighbors(node) + if neighbor in multi_edge_nodes or neighbor in singe_edge_nodes] # Turn the weight into a 'distance' or number of times not found together. 
for edge in core_graph.edges(data=True): @@ -147,8 +160,8 @@ def identify_segments(core_graph, num_gffs, core_gene_dict): # Go through all source and taget nodes, # see if a path can be found where all nodes between them have only two degrees - for source_node in multi_edge_nodes: - for target_node in multi_edge_nodes: + for source_node in multi_edge_nodes+singe_edge_nodes: + for target_node in multi_edge_nodes+singe_edge_nodes: if target_node != source_node: # Get path (segment) from source to target segment = nx.shortest_path(core_graph, source_node, target_node, weight='weight', method='dijkstra') # bellman-ford or dijkstra @@ -157,7 +170,7 @@ def identify_segments(core_graph, num_gffs, core_gene_dict): segment_length = len(segment) # Get length of segment with multi nodes removed - two_degree_segment_length = len([node for node in segment if node not in multi_edge_nodes]) + two_degree_segment_length = len([node for node in segment if node not in multi_edge_nodes+singe_edge_nodes]) # Check if no node between the source and target has more than two edges, # if then move to record the segment/path @@ -183,9 +196,9 @@ def identify_segments(core_graph, num_gffs, core_gene_dict): f"Path from one node to another ({source_target_name}) was found, but did not match previously found path!") # Calculate the expected number of paths - total_edges_from_multi_edge_nodes = sum([connections for _, connections in core_graph.degree if connections > 2]) - num_edges_between_multi_edge_nodes = sum([len(connect_dict[key]) for key in connect_dict]) - expected_segment_number = int((total_edges_from_multi_edge_nodes / 2) - (num_edges_between_multi_edge_nodes / 2)) + len(multi_edge_connect_adjust) + total_edges_from_non_two_edge_core_genes = sum([connections for _, connections in core_graph.degree if connections > 2 or connections < 2]) + num_edges_between_non_two_edge_core_genes = sum([len(connect_dict[key]) for key in connect_dict]) + expected_segment_number = 
int((total_edges_from_non_two_edge_core_genes / 2) - (num_edges_between_non_two_edge_core_genes / 2)) + len(multi_edge_connect_adjust) # Check if less than the number of expected paths has been found, # if then try to identify missing paths @@ -285,6 +298,7 @@ def determine_genome_segments(core_neighbour_pairs, combined_acc_gene_count, num :param combined_acc_gene_count: Number of accessory and low-frequency genes detected between core gene pairs :param num_gffs: Number of inputted gff files # TODO - Should this be the minimum number determined by the input cut-off for core genes :param logger: Program logger + # TODO - Add parameters :return double_edge_segements: :return no_acc_segments: @@ -294,11 +308,22 @@ def determine_genome_segments(core_neighbour_pairs, combined_acc_gene_count, num # Construct a graph from core gene neighbours core_graph = construct_core_graph(core_neighbour_pairs) + num_core_graph_components = nx.number_connected_components(core_graph) + + logger.debug(f'Identified: {num_core_graph_components} components in core genome graph') + + double_edge_segements = {} + # Identify all segments in components of core graph + for component in nx.connected_components(core_graph): + logger.debug(f'Searching component related to: {component}') - # Find segments in the genome between core genes with multiple neighbors - double_edge_segements = identify_segments(core_graph, num_gffs, core_gene_dict) + component_graph = core_graph.subgraph(component).copy() + return_segments = identify_segments(component_graph, num_gffs, core_gene_dict, num_core_graph_components) + if return_segments is not None: + double_edge_segements = double_edge_segements | return_segments - if double_edge_segements is not None: + # if double_edge_segements is not None: + if double_edge_segements: logger.debug(f'A total of {len(double_edge_segements)} core genes were identified to have multiple neighbours.') logger.debug(f'Genes with multiple neighbours: {double_edge_segements}') @@ 
-316,4 +341,4 @@ def determine_genome_segments(core_neighbour_pairs, combined_acc_gene_count, num if __name__ == '__main__': - pass \ No newline at end of file + pass diff --git a/functional_tests/Corekaburra-test.sh b/functional_tests/Corekaburra-test.sh index 5b464ce..3ed12f8 100755 --- a/functional_tests/Corekaburra-test.sh +++ b/functional_tests/Corekaburra-test.sh @@ -249,6 +249,27 @@ rm -r test_out_folder # TODO - Test that segmnets can be identified with a core-cutoff that is less than all genomes. +# TODO - Test that segmnets can be identified on two 'chromosomes'/contigs that are linear and not circular. +call_new_test "Test when core graph forms multiple components - not forming a single 'chromosome' - non circular input gffs" +Corekaburra -ip Multiple_component_graph/ -ig complete_genome_double_chrom_larger.gff complete_genome_double_chrom_2_larger.gff complete_genome_double_chrom_3_larger.gff -o test_out_folder/ > /dev/null 2>&1 +test_output_file test_out_folder/core_core_accessory_gene_content.tsv Multi_component_graph_expected/core_core_accessory_gene_content.tsv.expected +test_output_file test_out_folder/low_frequency_gene_placement.tsv Multi_component_graph_expected/low_frequency_gene_placement.tsv.expected +test_output_file test_out_folder/core_pair_summary.csv Multi_component_graph_expected/core_pair_summary.csv.expected +test_output_file test_out_folder/core_segments.csv Multi_component_graph_expected/core_segments.csv.expected +test_output_file test_out_folder/no_accessory_core_segments.csv Multi_component_graph_expected/no_accessory_core_segments.csv.expected +rm -r test_out_folder + + +# TODO Test the above but with complete genomes +call_new_test "Test when core graph forms multiple components - not forming a single 'chromosome' - circular input gffs" +Corekaburra -ip Multiple_component_graph/ -ig complete_genome_double_chrom_larger.gff complete_genome_double_chrom_2_larger.gff complete_genome_double_chrom_3_larger.gff -o test_out_folder/ -cg 
complete_larger_double_chr_genome_list.txt > /dev/null 2>&1 +test_output_file test_out_folder/core_core_accessory_gene_content.tsv Multiple_component_graph_complete_expected/core_core_accessory_gene_content.tsv.expected +test_output_file test_out_folder/low_frequency_gene_placement.tsv Multiple_component_graph_complete_expected/low_frequency_gene_placement.tsv.expected +test_output_file test_out_folder/core_pair_summary.csv Multiple_component_graph_complete_expected/core_pair_summary.csv.expected +test_output_file test_out_folder/core_segments.csv Multiple_component_graph_complete_expected/core_segments.csv.expected +test_output_file test_out_folder/no_accessory_core_segments.csv Multiple_component_graph_complete_expected/no_accessory_core_segments.csv.expected +rm -r test_out_folder + call_new_test "Test with decreased core-gene cutoff" Corekaburra -ig complete_genome_single_chrom.gff complete_genome_single_chrom_2.gff genome_single_chrom_larger.gff -ip Change_cutoffs -o test_out_folder -cc 0.9 > /dev/null 2>&1 test_output_file test_out_folder/core_core_accessory_gene_content.tsv core_90_cutoff_expected/core_core_accessory_gene_content.tsv.expected diff --git a/functional_tests/test_data/Multi_component_graph_expected/core_core_accessory_gene_content.tsv.expected b/functional_tests/test_data/Multi_component_graph_expected/core_core_accessory_gene_content.tsv.expected new file mode 100644 index 0000000..fee984c --- /dev/null +++ b/functional_tests/test_data/Multi_component_graph_expected/core_core_accessory_gene_content.tsv.expected @@ -0,0 +1 @@ +Gff Core_gene_1 Core_gene_2 gene type diff --git a/functional_tests/test_data/Multi_component_graph_expected/core_pair_summary.csv.expected b/functional_tests/test_data/Multi_component_graph_expected/core_pair_summary.csv.expected new file mode 100644 index 0000000..71a4959 --- /dev/null +++ b/functional_tests/test_data/Multi_component_graph_expected/core_pair_summary.csv.expected @@ -0,0 +1,20 @@ 
+Core_pair,n,occurrence_core_1,occurrence_core_2,co_occurrence,min_dist,max_dist,mean_dist,median_dist,min_acc,max_acc,mean_acc,median_acc +A-B,2,3,3,3,9,9,9.0,9.0,0,0,0.0,0.0 +A-C,1,3,3,3,9,9,9.0,9.0,0,0,0.0,0.0 +A-I,3,3,3,3,0,0,0.0,0.0,0,0,0.0,0.0 +B-C,3,3,3,3,9,9,9.0,9.0,0,0,0.0,0.0 +B-D,1,3,3,3,3,3,3.0,3.0,0,0,0.0,0.0 +C-D,2,3,3,3,3,3,3.0,3.0,0,0,0.0,0.0 +D-J,3,3,3,3,-3,1,-0.7,0.0,0,0,0.0,0.0 +E-F,2,3,3,3,9,9,9.0,9.0,0,0,0.0,0.0 +E-G,1,3,3,3,9,9,9.0,9.0,0,0,0.0,0.0 +E-K,3,3,3,3,0,0,0.0,0.0,0,0,0.0,0.0 +F-G,3,3,3,3,9,9,9.0,9.0,0,0,0.0,0.0 +F-H,1,3,3,3,0,0,0.0,0.0,0,0,0.0,0.0 +G-H,2,3,3,3,0,0,0.0,0.0,0,0,0.0,0.0 +H-L,3,3,3,3,0,0,0.0,0.0,0,0,0.0,0.0 +I-Sequence_break,3,3,0,0,0,0,0.0,0.0,0,0,0.0,0.0 +J-Sequence_break,3,3,0,0,1,3,1.7,1.0,0,0,0.0,0.0 +K-Sequence_break,3,3,0,0,0,0,0.0,0.0,0,0,0.0,0.0 +L-M,3,3,3,3,-700,0,-233.3,0.0,0,0,0.0,0.0 +M-Sequence_break,3,3,0,0,2,698,234.0,2.0,0,0,0.0,0.0 diff --git a/functional_tests/test_data/Multi_component_graph_expected/core_segments.csv.expected b/functional_tests/test_data/Multi_component_graph_expected/core_segments.csv.expected new file mode 100644 index 0000000..f82eee9 --- /dev/null +++ b/functional_tests/test_data/Multi_component_graph_expected/core_segments.csv.expected @@ -0,0 +1,14 @@ +Segment_name,Segment_position,Core_gene +A-I,1,A +A-I,2,I +B-C,1,C +B-C,2,B +D-J,1,D +D-J,2,J +E-K,1,E +E-K,2,K +F-G,1,F +F-G,2,G +H-M,1,H +H-M,2,L +H-M,3,M diff --git a/functional_tests/test_data/Multi_component_graph_expected/low_frequency_gene_placement.tsv.expected b/functional_tests/test_data/Multi_component_graph_expected/low_frequency_gene_placement.tsv.expected new file mode 100644 index 0000000..5b32628 --- /dev/null +++ b/functional_tests/test_data/Multi_component_graph_expected/low_frequency_gene_placement.tsv.expected @@ -0,0 +1,46 @@ +Gff Core_gene_1 Core_gene_2 Core_region_size Core_region_accessory_count +complete_genome_double_chrom_3_larger A B 9 0 +complete_genome_double_chrom_larger A B 9 0 
+complete_genome_double_chrom_2_larger A C 9 0 +complete_genome_double_chrom_2_larger A I 0 0 +complete_genome_double_chrom_3_larger A I 0 0 +complete_genome_double_chrom_larger A I 0 0 +complete_genome_double_chrom_2_larger B C 9 0 +complete_genome_double_chrom_3_larger B C 9 0 +complete_genome_double_chrom_larger B C 9 0 +complete_genome_double_chrom_2_larger B D 3 0 +complete_genome_double_chrom_3_larger C D 3 0 +complete_genome_double_chrom_larger C D 3 0 +complete_genome_double_chrom_2_larger D J 1 0 +complete_genome_double_chrom_3_larger D J 0 0 +complete_genome_double_chrom_larger D J -3 0 +complete_genome_double_chrom_3_larger E F 9 0 +complete_genome_double_chrom_larger E F 9 0 +complete_genome_double_chrom_2_larger E G 9 0 +complete_genome_double_chrom_2_larger E K 0 0 +complete_genome_double_chrom_3_larger E K 0 0 +complete_genome_double_chrom_larger E K 0 0 +complete_genome_double_chrom_2_larger F G 9 0 +complete_genome_double_chrom_3_larger F G 9 0 +complete_genome_double_chrom_larger F G 9 0 +complete_genome_double_chrom_2_larger F H 0 0 +complete_genome_double_chrom_3_larger G H 0 0 +complete_genome_double_chrom_larger G H 0 0 +complete_genome_double_chrom_2_larger H L 0 0 +complete_genome_double_chrom_3_larger H L 0 0 +complete_genome_double_chrom_larger H L 0 0 +complete_genome_double_chrom_2_larger I Sequence_break 0 0 +complete_genome_double_chrom_3_larger I Sequence_break 0 0 +complete_genome_double_chrom_larger I Sequence_break 0 0 +complete_genome_double_chrom_2_larger J Sequence_break 1 0 +complete_genome_double_chrom_3_larger J Sequence_break 3 0 +complete_genome_double_chrom_larger J Sequence_break 1 0 +complete_genome_double_chrom_2_larger K Sequence_break 0 0 +complete_genome_double_chrom_3_larger K Sequence_break 0 0 +complete_genome_double_chrom_larger K Sequence_break 0 0 +complete_genome_double_chrom_2_larger L M 0 0 +complete_genome_double_chrom_3_larger L M -700 0 +complete_genome_double_chrom_larger L M 0 0 
+complete_genome_double_chrom_2_larger M Sequence_break 2 0 +complete_genome_double_chrom_3_larger M Sequence_break 698 0 +complete_genome_double_chrom_larger M Sequence_break 2 0 diff --git a/functional_tests/test_data/Multi_component_graph_expected/no_accessory_core_segments.csv.expected b/functional_tests/test_data/Multi_component_graph_expected/no_accessory_core_segments.csv.expected new file mode 100644 index 0000000..5652fbe --- /dev/null +++ b/functional_tests/test_data/Multi_component_graph_expected/no_accessory_core_segments.csv.expected @@ -0,0 +1,14 @@ +Parent_segment_name,Sub_segment_name,Parent_segment_position,Sub_segment_position,Core_gene +A-I,A-I,1,1,A +A-I,A-I,1,2,I +B-C,C-B,1,1,C +B-C,C-B,1,2,B +D-J,D-J,1,1,D +D-J,D-J,1,2,J +E-K,E-K,1,1,E +E-K,E-K,1,2,K +F-G,F-G,1,1,F +F-G,F-G,1,2,G +H-M,H-M,1,1,H +H-M,H-M,1,2,L +H-M,H-M,1,3,M diff --git a/functional_tests/test_data/Multiple_component_graph/gene_presence_absence.csv b/functional_tests/test_data/Multiple_component_graph/gene_presence_absence.csv new file mode 100644 index 0000000..4c9ca7f --- /dev/null +++ b/functional_tests/test_data/Multiple_component_graph/gene_presence_absence.csv @@ -0,0 +1,14 @@ +","","","","","","","","","","","","","","complete_genome_double_chrom_larger","complete_genome_double_chrom_2_larger","complete_genome_double_chrom_3_larger" +"A","","","3","3","1","","","","","","","","","dub_chrom_A","dub_chrom_2_A","dub_chrom_A" +"B","","","3","3","1","","","","","","","","","dub_chrom_B","dub_chrom_2_B","dub_chrom_B" +"C","","","3","3","1","","","","","","","","","dub_chrom_C","dub_chrom_2_C","dub_chrom_C" +"D","","","3","3","1","","","","","","","","","dub_chrom_D","dub_chrom_2_D","dub_chrom_D" +"E","","","3","3","1","","","","","","","","","dub_chrom_E","dub_chrom_2_E","dub_chrom_E" +"F","","","3","3","1","","","","","","","","","dub_chrom_F","dub_chrom_2_F","dub_chrom_F" +"G","","","3","3","1","","","","","","","","","dub_chrom_G","dub_chrom_2_G","dub_chrom_G" 
+"H","","","3","3","1","","","","","","","","","dub_chrom_H","dub_chrom_2_H","dub_chrom_H" +"I","","","3","3","1","","","","","","","","","dub_chrom_I","dub_chrom_2_I","dub_chrom_I" +"J","","","3","3","1","","","","","","","","","dub_chrom_J","dub_chrom_2_J","dub_chrom_J" +"K","","","3","3","1","","","","","","","","","dub_chrom_K","dub_chrom_2_K","dub_chrom_K" +"L","","","3","3","1","","","","","","","","","dub_chrom_L","dub_chrom_2_L","dub_chrom_L" +"M","","","3","3","1","","","","","","","","","dub_chrom_M","dub_chrom_2_M","dub_chrom_M" \ No newline at end of file diff --git a/functional_tests/test_data/Multiple_component_graph_complete_expected/core_core_accessory_gene_content.tsv.expected b/functional_tests/test_data/Multiple_component_graph_complete_expected/core_core_accessory_gene_content.tsv.expected new file mode 100644 index 0000000..fee984c --- /dev/null +++ b/functional_tests/test_data/Multiple_component_graph_complete_expected/core_core_accessory_gene_content.tsv.expected @@ -0,0 +1 @@ +Gff Core_gene_1 Core_gene_2 gene type diff --git a/functional_tests/test_data/Multiple_component_graph_complete_expected/core_pair_summary.csv.expected b/functional_tests/test_data/Multiple_component_graph_complete_expected/core_pair_summary.csv.expected new file mode 100644 index 0000000..66b9420 --- /dev/null +++ b/functional_tests/test_data/Multiple_component_graph_complete_expected/core_pair_summary.csv.expected @@ -0,0 +1,18 @@ +Core_pair,n,occurrence_core_1,occurrence_core_2,co_occurrence,min_dist,max_dist,mean_dist,median_dist,min_acc,max_acc,mean_acc,median_acc +A-B,2,3,3,3,9,9,9.0,9.0,0,0,0.0,0.0 +A-C,1,3,3,3,9,9,9.0,9.0,0,0,0.0,0.0 +A-I,3,3,3,3,0,0,0.0,0.0,0,0,0.0,0.0 +B-C,3,3,3,3,9,9,9.0,9.0,0,0,0.0,0.0 +B-D,1,3,3,3,3,3,3.0,3.0,0,0,0.0,0.0 +C-D,2,3,3,3,3,3,3.0,3.0,0,0,0.0,0.0 +D-J,3,3,3,3,-3,1,-0.7,0.0,0,0,0.0,0.0 +E-F,2,3,3,3,9,9,9.0,9.0,0,0,0.0,0.0 +E-G,1,3,3,3,9,9,9.0,9.0,0,0,0.0,0.0 +E-K,3,3,3,3,0,0,0.0,0.0,0,0,0.0,0.0 
+F-G,3,3,3,3,9,9,9.0,9.0,0,0,0.0,0.0 +F-H,1,3,3,3,0,0,0.0,0.0,0,0,0.0,0.0 +G-H,2,3,3,3,0,0,0.0,0.0,0,0,0.0,0.0 +H-L,3,3,3,3,0,0,0.0,0.0,0,0,0.0,0.0 +I-J,3,3,3,3,1,3,1.7,1.0,0,0,0.0,0.0 +K-M,3,3,3,3,-698,2,-231.3,2.0,0,0,0.0,0.0 +L-M,3,3,3,3,-700,0,-233.3,0.0,0,0,0.0,0.0 diff --git a/functional_tests/test_data/Multiple_component_graph_complete_expected/core_segments.csv.expected b/functional_tests/test_data/Multiple_component_graph_complete_expected/core_segments.csv.expected new file mode 100644 index 0000000..6450104 --- /dev/null +++ b/functional_tests/test_data/Multiple_component_graph_complete_expected/core_segments.csv.expected @@ -0,0 +1,14 @@ +Segment_name,Segment_position,Core_gene +A-D,1,A +A-D,2,I +A-D,3,J +A-D,4,D +B-C,1,B +B-C,2,C +E-H,1,E +E-H,2,K +E-H,3,M +E-H,4,L +E-H,5,H +F-G,1,F +F-G,2,G diff --git a/functional_tests/test_data/Multiple_component_graph_complete_expected/low_frequency_gene_placement.tsv.expected b/functional_tests/test_data/Multiple_component_graph_complete_expected/low_frequency_gene_placement.tsv.expected new file mode 100644 index 0000000..5d41d78 --- /dev/null +++ b/functional_tests/test_data/Multiple_component_graph_complete_expected/low_frequency_gene_placement.tsv.expected @@ -0,0 +1,40 @@ +Gff Core_gene_1 Core_gene_2 Core_region_size Core_region_accessory_count +complete_genome_double_chrom_3_larger A B 9 0 +complete_genome_double_chrom_larger A B 9 0 +complete_genome_double_chrom_2_larger A C 9 0 +complete_genome_double_chrom_2_larger A I 0 0 +complete_genome_double_chrom_3_larger A I 0 0 +complete_genome_double_chrom_larger A I 0 0 +complete_genome_double_chrom_2_larger B C 9 0 +complete_genome_double_chrom_3_larger B C 9 0 +complete_genome_double_chrom_larger B C 9 0 +complete_genome_double_chrom_2_larger B D 3 0 +complete_genome_double_chrom_3_larger C D 3 0 +complete_genome_double_chrom_larger C D 3 0 +complete_genome_double_chrom_2_larger D J 1 0 +complete_genome_double_chrom_3_larger D J 0 0 
+complete_genome_double_chrom_larger D J -3 0 +complete_genome_double_chrom_3_larger E F 9 0 +complete_genome_double_chrom_larger E F 9 0 +complete_genome_double_chrom_2_larger E G 9 0 +complete_genome_double_chrom_2_larger E K 0 0 +complete_genome_double_chrom_3_larger E K 0 0 +complete_genome_double_chrom_larger E K 0 0 +complete_genome_double_chrom_2_larger F G 9 0 +complete_genome_double_chrom_3_larger F G 9 0 +complete_genome_double_chrom_larger F G 9 0 +complete_genome_double_chrom_2_larger F H 0 0 +complete_genome_double_chrom_3_larger G H 0 0 +complete_genome_double_chrom_larger G H 0 0 +complete_genome_double_chrom_2_larger H L 0 0 +complete_genome_double_chrom_3_larger H L 0 0 +complete_genome_double_chrom_larger H L 0 0 +complete_genome_double_chrom_2_larger I J 1 0 +complete_genome_double_chrom_3_larger I J 3 0 +complete_genome_double_chrom_larger I J 1 0 +complete_genome_double_chrom_2_larger K M 2 0 +complete_genome_double_chrom_3_larger K M -698 0 +complete_genome_double_chrom_larger K M 2 0 +complete_genome_double_chrom_2_larger L M 0 0 +complete_genome_double_chrom_3_larger L M -700 0 +complete_genome_double_chrom_larger L M 0 0 diff --git a/functional_tests/test_data/Multiple_component_graph_complete_expected/no_accessory_core_segments.csv.expected b/functional_tests/test_data/Multiple_component_graph_complete_expected/no_accessory_core_segments.csv.expected new file mode 100644 index 0000000..2ba270c --- /dev/null +++ b/functional_tests/test_data/Multiple_component_graph_complete_expected/no_accessory_core_segments.csv.expected @@ -0,0 +1,14 @@ +Parent_segment_name,Sub_segment_name,Parent_segment_position,Sub_segment_position,Core_gene +A-D,A-D,1,1,A +A-D,A-D,1,2,I +A-D,A-D,1,3,J +A-D,A-D,1,4,D +B-C,B-C,1,1,B +B-C,B-C,1,2,C +E-H,E-H,1,1,E +E-H,E-H,1,2,K +E-H,E-H,1,3,M +E-H,E-H,1,4,L +E-H,E-H,1,5,H +F-G,F-G,1,1,F +F-G,F-G,1,2,G diff --git a/functional_tests/test_data/complete_genome_double_chrom_2_larger.gff 
b/functional_tests/test_data/complete_genome_double_chrom_2_larger.gff new file mode 100644 index 0000000..e8d9bcf --- /dev/null +++ b/functional_tests/test_data/complete_genome_double_chrom_2_larger.gff @@ -0,0 +1,19 @@ +##gff-version3 +contig_1 . CDS 1 2 . . . ID=dub_chrom_2_I;Other_info +contig_1 . CDS 3 90 . . . ID=dub_chrom_2_A;Other_info +contig_1 . CDS 100 190 . . . ID=dub_chrom_2_C;Other_info +contig_1 . CDS 200 290 . . . ID=dub_chrom_2_B;Other_info +contig_1 . CDS 294 295 . . . ID=dub_chrom_2_D;Other_info +contig_1 . CDS 297 299 . . . ID=dub_chrom_2_J;Other_info +contig_2 . CDS 1 2 . . . ID=dub_chrom_2_K;Other_info +contig_2 . CDS 3 90 . . . ID=dub_chrom_2_E;Other_info +contig_2 . CDS 100 190 . . . ID=dub_chrom_2_G;Other_info +contig_2 . CDS 200 290 . . . ID=dub_chrom_2_F;Other_info +contig_2 . CDS 291 294 . . . ID=dub_chrom_2_H;Other_info +contig_2 . CDS 295 296 . . . ID=dub_chrom_2_L;Other_info +contig_2 . CDS 297 298 . . . ID=dub_chrom_2_M;Other_info +##FASTA +>contig_1 +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +>contig_2 +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN \ No newline at end of file diff --git a/functional_tests/test_data/complete_genome_double_chrom_3_larger.gff b/functional_tests/test_data/complete_genome_double_chrom_3_larger.gff new file mode 100644 index 0000000..9a0423f --- /dev/null +++ b/functional_tests/test_data/complete_genome_double_chrom_3_larger.gff @@ -0,0 +1,19 @@ +##gff-version3 +contig_1 . CDS 1 2 . . . 
ID=dub_chrom_I;Other_info +contig_1 . CDS 3 90 . . . ID=dub_chrom_A;Other_info +contig_1 . CDS 100 190 . . . ID=dub_chrom_B;Other_info +contig_1 . CDS 200 290 . . . ID=dub_chrom_C;Other_info +contig_1 . CDS 294 295 . . . ID=dub_chrom_D;Other_info +contig_1 . CDS 296 297 . . . ID=dub_chrom_J;Other_info +contig_2 . CDS 1 2 . . . ID=dub_chrom_K;Other_info +contig_2 . CDS 3 90 . . . ID=dub_chrom_E;Other_info +contig_2 . CDS 100 190 . . . ID=dub_chrom_F;Other_info +contig_2 . CDS 200 290 . . . ID=dub_chrom_G;Other_info +contig_2 . CDS 291 294 . . . ID=dub_chrom_H;Other_info +contig_2 . CDS 295 996 . . . ID=dub_chrom_L;Other_info +contig_2 . CDS 297 998 . . . ID=dub_chrom_M;Other_info +##FASTA +>contig_1 +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +>contig_2 +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN \ No newline at end of file diff --git a/functional_tests/test_data/complete_genome_double_chrom_larger.gff b/functional_tests/test_data/complete_genome_double_chrom_larger.gff new file mode 100644 index 0000000..f26c08d --- /dev/null +++ b/functional_tests/test_data/complete_genome_double_chrom_larger.gff @@ -0,0 +1,19 @@ +##gff-version3 +contig_1 . CDS 1 2 . . . ID=dub_chrom_I;Other_info +contig_1 . CDS 3 90 . . . ID=dub_chrom_A;Other_info +contig_1 . CDS 100 190 . . . ID=dub_chrom_B;Other_info +contig_1 . CDS 200 290 . . . ID=dub_chrom_C;Other_info +contig_1 . CDS 294 299 . . . ID=dub_chrom_D;Other_info +contig_1 . CDS 297 299 . . . 
ID=dub_chrom_J;Other_info +contig_2 . CDS 1 2 . . . ID=dub_chrom_K;Other_info +contig_2 . CDS 3 90 . . . ID=dub_chrom_E;Other_info +contig_2 . CDS 100 190 . . . ID=dub_chrom_F;Other_info +contig_2 . CDS 200 290 . . . ID=dub_chrom_G;Other_info +contig_2 . CDS 291 294 . . . ID=dub_chrom_H;Other_info +contig_2 . CDS 295 296 . . . ID=dub_chrom_L;Other_info +contig_2 . CDS 297 298 . . . ID=dub_chrom_M;Other_info +##FASTA +>contig_1 +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +>contig_2 +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN \ No newline at end of file diff --git a/functional_tests/test_data/complete_larger_double_chr_genome_list.txt b/functional_tests/test_data/complete_larger_double_chr_genome_list.txt new file mode 100644 index 0000000..f291a78 --- /dev/null +++ b/functional_tests/test_data/complete_larger_double_chr_genome_list.txt @@ -0,0 +1,3 @@ +complete_genome_double_chrom_2_larger.gff +complete_genome_double_chrom_3_larger.gff +complete_genome_double_chrom_larger \ No newline at end of file diff --git a/unit_tests/Corekaburra_test.py b/unit_tests/Corekaburra_test.py index b112a64..5846e37 100644 --- a/unit_tests/Corekaburra_test.py +++ b/unit_tests/Corekaburra_test.py @@ -9,6 +9,7 @@ import os from shutil import copyfile import logging +from networkx import number_connected_components, connected_components # pylint: disable=no-name-in-module # import Corekaburra functions @@ -3321,8 +3322,9 @@ def 
test_double_edge_segment_identification_all_2_degree_input(self): 'pan_cluster_6--pan_cluster_1': 10} core_graph = consesus_core_genome.construct_core_graph(core_neighbour_pairs) + num_components = number_connected_components(core_graph) - return_1 = consesus_core_genome.identify_segments(core_graph, 10, {}) + return_1 = consesus_core_genome.identify_segments(core_graph, 10, {}, num_components) self.assertEqual(None, return_1) @@ -3350,8 +3352,9 @@ def test_double_edge_segment_identification_two_segments(self): 'genome_10': {'tag_1': 'pan_cluster_1', 'tag_2': 'pan_cluster_4', 'tag_3': 'pan_cluster_2', 'tag_4': 'pan_cluster_5'},} core_graph = consesus_core_genome.construct_core_graph(core_neighbour_pairs) + num_components = number_connected_components(core_graph) - double_edge_segements = consesus_core_genome.identify_segments(core_graph, 10, core_gene_dict) + double_edge_segements = consesus_core_genome.identify_segments(core_graph, 10, core_gene_dict, num_components) self.assertEqual(expected_segments, double_edge_segements) @@ -3375,8 +3378,9 @@ def test_double_edge_segment_identification_four_segments(self): 'pan_cluster_1--pan_cluster_10': 10} core_graph = consesus_core_genome.construct_core_graph(core_neighbour_pairs) + num_components = number_connected_components(core_graph) - double_edge_segements = consesus_core_genome.identify_segments(core_graph, 10, {}) + double_edge_segements = consesus_core_genome.identify_segments(core_graph, 10, {}, num_components) self.assertEqual(expected_segments, double_edge_segements) @@ -3402,7 +3406,9 @@ def test_double_edge_segment_identification_segments_node_w_four_degrees(self): 'pan_cluster_6--pan_cluster_1': 9} core_graph = consesus_core_genome.construct_core_graph(core_neighbour_pairs) - double_edge_segements = consesus_core_genome.identify_segments(core_graph, 10, {}) + num_components = number_connected_components(core_graph) + + double_edge_segements = consesus_core_genome.identify_segments(core_graph, 10, {}, 
num_components) self.assertEqual(expected_segments, double_edge_segements) @@ -3427,7 +3433,9 @@ def test_double_edge_segment_identification_segments_node_w_challenging_paths(se 'genome_5': {'tag_4': 'pan_cluster_D', 'tag_3': 'pan_cluster_C', 'tag_2': 'pan_cluster_B', 'tag_1': 'pan_cluster_A', }} core_graph = consesus_core_genome.construct_core_graph(core_neighbour_pairs) - double_edge_segements = consesus_core_genome.identify_segments(core_graph, 5, core_gene_dict) + num_components = number_connected_components(core_graph) + + double_edge_segements = consesus_core_genome.identify_segments(core_graph, 5, core_gene_dict, num_components) self.assertEqual(expected_segments, double_edge_segements) @@ -3456,7 +3464,9 @@ def test_double_edge_segment_identification_segments_node_w_challenging_paths_2( 'genome_8': {'tag_5': 'pan_cluster_E', 'tag_4': 'pan_cluster_D', 'tag_3': 'pan_cluster_C', 'tag_2': 'pan_cluster_B', 'tag_1': 'pan_cluster_A', }} core_graph = consesus_core_genome.construct_core_graph(core_neighbour_pairs) - double_edge_segements = consesus_core_genome.identify_segments(core_graph, 8, core_gene_dict) + num_components = number_connected_components(core_graph) + + double_edge_segements = consesus_core_genome.identify_segments(core_graph, 8, core_gene_dict, num_components) self.assertEqual(expected_segments, double_edge_segements) @@ -3487,7 +3497,9 @@ def test_double_edge_segment_identification_segments_node_w_all_challenging_path 'genome_5': {'tag_5': 'pan_cluster_K', 'tag_4': 'pan_cluster_L', 'tag_3': 'pan_cluster_A', 'tag_2': 'pan_cluster_B', 'tag_1': 'pan_cluster_A', 'tag_6': 'pan_cluster_C', 'tag_7': 'pan_cluster_D'}} core_graph = consesus_core_genome.construct_core_graph(core_neighbour_pairs) - double_edge_segements = consesus_core_genome.identify_segments(core_graph, 5, core_gene_dict) + num_components = number_connected_components(core_graph) + + double_edge_segements = consesus_core_genome.identify_segments(core_graph, 5, core_gene_dict, 
num_components) self.assertEqual(expected_segments, double_edge_segements) @@ -3511,7 +3523,9 @@ def test_double_edge_segment_identification_segments_node_w_less_than_all_presen } core_graph = consesus_core_genome.construct_core_graph(core_neighbour_pairs) - double_edge_segements = consesus_core_genome.identify_segments(core_graph, 10, {}) + num_components = number_connected_components(core_graph) + + double_edge_segements = consesus_core_genome.identify_segments(core_graph, 10, {}, num_components) self.assertEqual(expected_segments, double_edge_segements) @@ -3538,10 +3552,86 @@ def test_double_edge_segment_identification_segments_node_w_two_gene_segment(sel 'genome_3': {'gene_1': 'pan_cluster_A', 'gene_2': 'pan_cluster_B', 'gene_3': 'pan_cluster_E', 'gene_4': 'pan_cluster_G', 'gene_5': 'pan_cluster_D', 'gene_7': 'pan_cluster_H'}} core_graph = consesus_core_genome.construct_core_graph(core_neighbour_pairs) - double_edge_segements = consesus_core_genome.identify_segments(core_graph, 3, core_gene_dict) + num_components = number_connected_components(core_graph) + + double_edge_segements = consesus_core_genome.identify_segments(core_graph, 3, core_gene_dict, num_components) self.assertEqual(expected_segments, double_edge_segements) + def test_multiple_component_core_graph(self): + expected_segments = {'pan_cluster_A--pan_cluster_I': ['pan_cluster_A', 'pan_cluster_I'], + 'pan_cluster_B--pan_cluster_C': ['pan_cluster_C', 'pan_cluster_B'], + 'pan_cluster_D--pan_cluster_J': ['pan_cluster_D', 'pan_cluster_J'], + 'pan_cluster_E--pan_cluster_K': ['pan_cluster_E', 'pan_cluster_K'], + 'pan_cluster_F--pan_cluster_G': ['pan_cluster_G', 'pan_cluster_F'], + 'pan_cluster_H--pan_cluster_M': ['pan_cluster_H', 'pan_cluster_L', 'pan_cluster_M'], + 'pan_cluster_Q--pan_cluster_O': ['pan_cluster_Q', 'pan_cluster_P', 'pan_cluster_O']} + + core_neighbour_pairs = {'pan_cluster_A--pan_cluster_B': 1, + 'pan_cluster_A--pan_cluster_C': 1, + 'pan_cluster_A--pan_cluster_I': 2, + 
'pan_cluster_B--pan_cluster_C': 2, + 'pan_cluster_B--pan_cluster_D': 1, + 'pan_cluster_C--pan_cluster_D': 1, + 'pan_cluster_D--pan_cluster_J': 2, + 'pan_cluster_E--pan_cluster_F': 1, + 'pan_cluster_E--pan_cluster_G': 1, + 'pan_cluster_E--pan_cluster_K': 2, + 'pan_cluster_F--pan_cluster_G': 2, + 'pan_cluster_F--pan_cluster_H': 1, + 'pan_cluster_G--pan_cluster_H': 1, + 'pan_cluster_H--pan_cluster_L': 2, + 'pan_cluster_L--pan_cluster_M': 2, + 'pan_cluster_O--pan_cluster_P': 2, + 'pan_cluster_P--pan_cluster_Q': 2, + } + + core_gene_dict = {'genome_1': {'tag_1': 'pan_cluster_A', 'tag_2': 'pan_cluster_B', 'tag_3': 'pan_cluster_C', + 'tag_4': 'pan_cluster_D', 'tag_5': 'pan_cluster_E', 'tag_6': 'pan_cluster_F', + 'tag_7': 'pan_cluster_G', 'tag_8': 'pan_cluster_H', 'tag_9': 'pan_cluster_I', + 'tag_10': 'pan_cluster_J', 'tag_11': 'pan_cluster_K', 'tag_12': 'pan_cluster_L', + 'tag_13': 'pan_cluster_M', 'tag_14': 'pan_cluster_O', 'tag_15': 'pan_cluster_P', + 'tag_16': 'pan_cluster_Q'}, + 'genome_2': {'tag_1': 'pan_cluster_A', 'tag_2': 'pan_cluster_B', 'tag_3': 'pan_cluster_C', + 'tag_4': 'pan_cluster_D', 'tag_5': 'pan_cluster_E', 'tag_6': 'pan_cluster_F', + 'tag_7': 'pan_cluster_G', 'tag_8': 'pan_cluster_H', 'tag_9': 'pan_cluster_I', + 'tag_10': 'pan_cluster_J', 'tag_11': 'pan_cluster_K', 'tag_12': 'pan_cluster_L', + 'tag_13': 'pan_cluster_M', 'tag_14': 'pan_cluster_O', 'tag_15': 'pan_cluster_P', + 'tag_16': 'pan_cluster_Q'}} + + core_graph = consesus_core_genome.construct_core_graph(core_neighbour_pairs) + num_components = number_connected_components(core_graph) + + double_edge_segements = {} + for component in connected_components(core_graph): + component_graph = core_graph.subgraph(component).copy() + double_edge_segements = double_edge_segements | consesus_core_genome.identify_segments(component_graph, 2, + core_gene_dict, + num_components) + + # comparisons = [True for x in double_edge_segements + # if + # (x in expected_segments and + # (expected_segments[x] == 
double_edge_segements[x] or expected_segments[x][::-1] == double_edge_segements[x])) + # or + # (f"{x.split('--')[1]}'--'{x.split('--')[0]}" in expected_segments and + # (expected_segments[x] == double_edge_segements[f"{x.split('--')[1]}'--'{x.split('--')[0]}"] or expected_segments[x][::-1] == double_edge_segements[f"{x.split('--')[1]}'--'{x.split('--')[0]}"])) + # ] + key_forward = [x for x in double_edge_segements if x in expected_segments] + key_reverse = [f"{x.split('--')[1]}--{x.split('--')[0]}" for x in double_edge_segements if f"{x.split('--')[1]}--{x.split('--')[0]}" in expected_segments] + expected_key_match = key_forward+key_reverse + + # Test if the number of expected segments were returned + self.assertEqual(len(expected_key_match), len(expected_segments)) + + comparisons = [True for returned_key, expected_key in zip(double_edge_segements, expected_key_match) + if double_edge_segements[returned_key] == expected_segments[expected_key] + or + double_edge_segements[returned_key] == expected_segments[expected_key][::-1]] + + # Test of all returned segments look as expected + self.assertTrue(all(comparisons)) + # TODO - Chat to Andrew about this function how it works and how we can test it more - possibly just run some things to see if it breaks From 8fa29f2a656318cdb5f08acb33e634f7c7cf44bb Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Tue, 25 Jan 2022 16:44:57 +1100 Subject: [PATCH 08/20] Add in sorting of segmenets, so they are sorted from 'lowest' to 'highest' and the segments is oriented by the sorting of the name --- Corekaburra/output_writer_functions.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/Corekaburra/output_writer_functions.py b/Corekaburra/output_writer_functions.py index 9d8117a..644d398 100644 --- a/Corekaburra/output_writer_functions.py +++ b/Corekaburra/output_writer_functions.py @@ -103,7 +103,7 @@ def segment_writer(segments, out_path, prefix): :param 
prefix: Prefix for any output files :return: Nothing """ - + # TODO - Maybe include presence of core genes in segment output? # Generate file name out_file_name = 'core_segments.csv' if prefix is not None: @@ -119,8 +119,21 @@ def segment_writer(segments, out_path, prefix): # Write remaining rows: for key in sorted(segments.keys()): + + # Examine if key pair is ordered + split_key = sorted(key.split('--')) + if key != f"{split_key[0]}--{split_key[1]}": + sorted_key = f"{split_key[0]}-{split_key[1]}" + else: + sorted_key = key.replace('--', '-') + + # Examine if segment follows ordered key + if sorted_key.split('-')[0] != segments[key][0]: + segments[key] = segments[key][::-1] + + # Write segment for index, gene in enumerate(segments[key]): - info = [key.replace('--', '-'), index+1, gene] + info = [sorted_key, index+1, gene] writer.writerow(info) From a8486326de9f3449ca2cb5a4bb693b93a63ff093 Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Tue, 25 Jan 2022 16:56:13 +1100 Subject: [PATCH 09/20] Change check in main for presence of core gene segments add in changes to expecte output of segments for functional tests after implementation of sorting --- Corekaburra/__main__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Corekaburra/__main__.py b/Corekaburra/__main__.py index 26668ab..888eb11 100644 --- a/Corekaburra/__main__.py +++ b/Corekaburra/__main__.py @@ -261,7 +261,7 @@ def main(): time_end_passing_gffs = time.time() time_start_segments_search = time.time() - time_start = time.time() + time_start = time.time() # TODO - This seems like a lonely start timer? 
# Count number of unique accessory genes inserted into a core-core region across the genomes acc_region_count = {key: len(set(core_neighbour_low_freq[key])) for key in core_neighbour_low_freq} # Count number of unique low frequency genes inserted into a core-core region across the genomes @@ -290,7 +290,7 @@ def main(): logger.debug("Summary output") summary_info_writer(master_summary_info, args.output_path, args.output_prefix) - if double_edge_segements is not None: + if double_edge_segements: logger.debug("Segment output") segment_writer(double_edge_segements, args.output_path, args.output_prefix) From 6d2ebb5876b8fbcdb1a5b27a42c0017bee42152f Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Tue, 25 Jan 2022 16:56:37 +1100 Subject: [PATCH 10/20] Change check in main for presence of core gene segments add in changes to expecte output of segments for functional tests after implementation of sorting --- .../Multi_component_graph_expected/core_segments.csv.expected | 4 ++-- .../no_accessory_core_segments.csv.expected | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/functional_tests/test_data/Multi_component_graph_expected/core_segments.csv.expected b/functional_tests/test_data/Multi_component_graph_expected/core_segments.csv.expected index f82eee9..db78a6b 100644 --- a/functional_tests/test_data/Multi_component_graph_expected/core_segments.csv.expected +++ b/functional_tests/test_data/Multi_component_graph_expected/core_segments.csv.expected @@ -1,8 +1,8 @@ Segment_name,Segment_position,Core_gene A-I,1,A A-I,2,I -B-C,1,C -B-C,2,B +B-C,1,B +B-C,2,C D-J,1,D D-J,2,J E-K,1,E diff --git a/functional_tests/test_data/Multi_component_graph_expected/no_accessory_core_segments.csv.expected b/functional_tests/test_data/Multi_component_graph_expected/no_accessory_core_segments.csv.expected index 5652fbe..89c1ba2 100644 --- a/functional_tests/test_data/Multi_component_graph_expected/no_accessory_core_segments.csv.expected 
+++ b/functional_tests/test_data/Multi_component_graph_expected/no_accessory_core_segments.csv.expected @@ -1,8 +1,8 @@ Parent_segment_name,Sub_segment_name,Parent_segment_position,Sub_segment_position,Core_gene A-I,A-I,1,1,A A-I,A-I,1,2,I -B-C,C-B,1,1,C -B-C,C-B,1,2,B +B-C,C-B,1,1,B +B-C,C-B,1,2,C D-J,D-J,1,1,D D-J,D-J,1,2,J E-K,E-K,1,1,E From cc1f367012de3504f3da2e82a088f588d4a7e30c Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Tue, 25 Jan 2022 17:01:35 +1100 Subject: [PATCH 11/20] Small changes to functional test result files and add in STDOUT to void in functional test --- functional_tests/Corekaburra-test.sh | 4 ++-- .../no_accessory_core_segments.csv.expected | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/functional_tests/Corekaburra-test.sh b/functional_tests/Corekaburra-test.sh index 3ed12f8..0ff335e 100755 --- a/functional_tests/Corekaburra-test.sh +++ b/functional_tests/Corekaburra-test.sh @@ -349,7 +349,7 @@ rm -r test_out_folder call_new_test "Test with a core-less contig draft" -Corekaburra -ig complete_genome_double_chrom_2.gff complete_genome_double_chrom.gff -ip Coreless_contig_run/ -o test_out_folder/ +Corekaburra -ig complete_genome_double_chrom_2.gff complete_genome_double_chrom.gff -ip Coreless_contig_run/ -o test_out_folder/ > /dev/null 2>&1 test_output_file test_out_folder/core_core_accessory_gene_content.tsv coreless_contig_draft_expected/core_core_accessory_gene_content.tsv.expected test_output_file test_out_folder/low_frequency_gene_placement.tsv coreless_contig_draft_expected/low_frequency_gene_placement.tsv.expected test_output_file test_out_folder/core_pair_summary.csv coreless_contig_draft_expected/core_pair_summary.csv.expected @@ -357,7 +357,7 @@ test_output_file test_out_folder/coreless_contig_accessory_gene_content.tsv core rm -r test_out_folder call_new_test "Test with a core-less contig complete" -Corekaburra -ig complete_genome_double_chrom_2.gff 
complete_genome_double_chrom.gff -ip Coreless_contig_run/ -o test_out_folder/ -cg Complete_double_chromosomes.txt +Corekaburra -ig complete_genome_double_chrom_2.gff complete_genome_double_chrom.gff -ip Coreless_contig_run/ -o test_out_folder/ -cg Complete_double_chromosomes.txt > /dev/null 2>&1 test_output_file test_out_folder/core_core_accessory_gene_content.tsv Coreless_contig_complete_expected/core_core_accessory_gene_content.tsv.expected test_output_file test_out_folder/low_frequency_gene_placement.tsv Coreless_contig_complete_expected/low_frequency_gene_placement.tsv.expected test_output_file test_out_folder/core_pair_summary.csv Coreless_contig_complete_expected/core_pair_summary.csv.expected diff --git a/functional_tests/test_data/Multi_component_graph_expected/no_accessory_core_segments.csv.expected b/functional_tests/test_data/Multi_component_graph_expected/no_accessory_core_segments.csv.expected index 89c1ba2..0e13db4 100644 --- a/functional_tests/test_data/Multi_component_graph_expected/no_accessory_core_segments.csv.expected +++ b/functional_tests/test_data/Multi_component_graph_expected/no_accessory_core_segments.csv.expected @@ -1,8 +1,8 @@ Parent_segment_name,Sub_segment_name,Parent_segment_position,Sub_segment_position,Core_gene A-I,A-I,1,1,A A-I,A-I,1,2,I -B-C,C-B,1,1,B -B-C,C-B,1,2,C +B-C,B-C,1,1,B +B-C,B-C,1,2,C D-J,D-J,1,1,D D-J,D-J,1,2,J E-K,E-K,1,1,E From d202856cab56d6a8cb6b9cf6f89d7e8c9ca511da Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Tue, 25 Jan 2022 17:11:29 +1100 Subject: [PATCH 12/20] Add in ordering of non-accessory segments and test for it --- Corekaburra/output_writer_functions.py | 14 +++++++++++++- unit_tests/Corekaburra_test.py | 17 +++++++++-------- 2 files changed, 22 insertions(+), 9 deletions(-) diff --git a/Corekaburra/output_writer_functions.py b/Corekaburra/output_writer_functions.py index 644d398..90e9d84 100644 --- a/Corekaburra/output_writer_functions.py +++ 
b/Corekaburra/output_writer_functions.py @@ -162,10 +162,22 @@ def no_acc_segment_writer(no_acc_segments, out_path, prefix): # Write remaining rows: for key in sorted(no_acc_segments.keys()): + + # Examine if key pair is ordered + split_key = sorted(key.split('--')) + if key != f"{split_key[0]}--{split_key[1]}": + sorted_key = f"{split_key[0]}-{split_key[1]}" + else: + sorted_key = key.replace('--', '-') + + # Examine if segment follows ordered key, if not reverse the element + if sorted_key.split('-')[0] != no_acc_segments[key][0][0]: + no_acc_segments[key] = [sub_seg[::-1] for sub_seg in no_acc_segments[key]][::-1] + for sub_index, subsegment in enumerate(no_acc_segments[key]): sub_name = f'{subsegment[0]}-{subsegment[-1]}' for index, gene in enumerate(subsegment): - info = [key.replace('--', '-'), sub_name, sub_index + 1, index + 1, gene] + info = [sorted_key, sub_name, sub_index + 1, index + 1, gene] writer.writerow(info) diff --git a/unit_tests/Corekaburra_test.py b/unit_tests/Corekaburra_test.py index 5846e37..313b5fb 100644 --- a/unit_tests/Corekaburra_test.py +++ b/unit_tests/Corekaburra_test.py @@ -3836,12 +3836,12 @@ def test_segment_writer(self): input_segments = {'pan_cluster_2--pan_cluster_4': ['pan_cluster_2', 'pan_cluster_3', 'pan_cluster_4'], - 'pan_cluster_2--pan_cluster_6': ['pan_cluster_2', + 'pan_cluster_2--pan_cluster_6': ['pan_cluster_6', 'pan_cluster_1', - 'pan_cluster_6'], - 'pan_cluster_4--pan_cluster_6': ['pan_cluster_4', + 'pan_cluster_2'], + 'pan_cluster_6--pan_cluster_4': ['pan_cluster_6', 'pan_cluster_5', - 'pan_cluster_6']} + 'pan_cluster_4']} out_path = 'TestWritingOutputFunction' prefix = 'test' @@ -3858,12 +3858,13 @@ def test_no_acc_segment_writer(self): input_segments = {'pan_cluster_2--pan_cluster_4': [['pan_cluster_2'], ['pan_cluster_3', 'pan_cluster_4']], - 'pan_cluster_2--pan_cluster_6': [['pan_cluster_2'], + 'pan_cluster_6--pan_cluster_2': [['pan_cluster_2'], ['pan_cluster_1'], ['pan_cluster_6']], - 
'pan_cluster_4--pan_cluster_6': [['pan_cluster_4', - 'pan_cluster_5'], - ['pan_cluster_6']]} + 'pan_cluster_6--pan_cluster_4': [['pan_cluster_6'], + ['pan_cluster_5', + 'pan_cluster_4'] + ]} out_path = 'TestWritingOutputFunction' prefix = 'test' From 18503dfb7b1d147fa3260c911579e1306d8a1cbd Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Wed, 26 Jan 2022 13:31:35 +1100 Subject: [PATCH 13/20] Add in handling of refound genes in both fragments and to see if gffs have been previously corrected. Add in functional tests for multiple things regarding refound genes and resumption of run --- Corekaburra/correct_gffs.py | 9 ++- Corekaburra/gff_parser.py | 28 ++++----- Corekaburra/parse_gene_presence_absence.py | 48 +++++++++++----- functional_tests/Corekaburra-test.sh | 57 ++++++++++++++++--- ...e_core_accessory_gene_content.tsv.expected | 16 ++++++ .../core_pair_summary.csv.expected | 3 + .../low_frequency_gene_placement.tsv.expected | 9 +++ ...ingle_chrom_larger_refound_2_corrected.gff | 21 +++++++ ..._single_chrom_larger_refound_corrected.gff | 21 +++++++ ...e_core_accessory_gene_content.tsv.expected | 7 +++ .../core_pair_summary.csv.expected | 7 +++ .../gene_data.csv | 3 + .../gene_presence_absence_roary.csv | 8 +++ .../low_frequency_gene_placement.tsv.expected | 16 ++++++ ...ingle_chrom_larger_refound_2_corrected.gff | 21 +++++++ ..._single_chrom_larger_refound_corrected.gff | 21 +++++++ ...e_core_accessory_gene_content.tsv.expected | 6 ++ .../core_pair_summary.csv.expected | 10 ++++ .../Resume_refound_gene/gene_data.csv | 3 + .../gene_presence_absence_roary.csv | 8 +++ .../low_frequency_gene_placement.tsv.expected | 19 +++++++ ...om_larger_refound_2_corrected.gff.expected | 21 +++++++ ..._single_chrom_larger_refound_corrected.gff | 21 +++++++ ...e_core_accessory_gene_content.tsv.expected | 7 +++ .../core_pair_summary.csv.expected | 7 +++ .../Resume_refound_run_fragment/gene_data.csv | 3 + .../gene_presence_absence_roary.csv | 8 
+++ .../low_frequency_gene_placement.tsv.expected | 16 ++++++ .../gene_data.csv | 1 + .../gene_presence_absence_roary.csv | 8 +++ ...hrom_larger_refound_corrected.gff.expected | 21 +++++++ ...e_core_accessory_gene_content.tsv.expected | 11 ++++ .../core_pair_summary.csv.expected | 4 ++ .../low_frequency_gene_placement.tsv.expected | 13 +++++ .../genome_single_chrom_larger_refound.gff | 10 ++++ .../genome_single_chrom_larger_refound_2.gff | 10 ++++ 36 files changed, 468 insertions(+), 34 deletions(-) create mode 100644 functional_tests/test_data/Fragmented_core_gene_break_run_expected/core_core_accessory_gene_content.tsv.expected create mode 100644 functional_tests/test_data/Fragmented_core_gene_break_run_expected/core_pair_summary.csv.expected create mode 100644 functional_tests/test_data/Fragmented_core_gene_break_run_expected/low_frequency_gene_placement.tsv.expected create mode 100644 functional_tests/test_data/Resume_all_found_gene_refound_run_fragment/Corrected_gff_files/genome_single_chrom_larger_refound_2_corrected.gff create mode 100644 functional_tests/test_data/Resume_all_found_gene_refound_run_fragment/Corrected_gff_files/genome_single_chrom_larger_refound_corrected.gff create mode 100644 functional_tests/test_data/Resume_all_found_gene_refound_run_fragment/core_core_accessory_gene_content.tsv.expected create mode 100644 functional_tests/test_data/Resume_all_found_gene_refound_run_fragment/core_pair_summary.csv.expected create mode 100644 functional_tests/test_data/Resume_all_found_gene_refound_run_fragment/gene_data.csv create mode 100644 functional_tests/test_data/Resume_all_found_gene_refound_run_fragment/gene_presence_absence_roary.csv create mode 100644 functional_tests/test_data/Resume_all_found_gene_refound_run_fragment/low_frequency_gene_placement.tsv.expected create mode 100644 functional_tests/test_data/Resume_refound_gene/Corrected_gff_files/genome_single_chrom_larger_refound_2_corrected.gff create mode 100644 
functional_tests/test_data/Resume_refound_gene/Corrected_gff_files/genome_single_chrom_larger_refound_corrected.gff create mode 100644 functional_tests/test_data/Resume_refound_gene/core_core_accessory_gene_content.tsv.expected create mode 100644 functional_tests/test_data/Resume_refound_gene/core_pair_summary.csv.expected create mode 100644 functional_tests/test_data/Resume_refound_gene/gene_data.csv create mode 100644 functional_tests/test_data/Resume_refound_gene/gene_presence_absence_roary.csv create mode 100644 functional_tests/test_data/Resume_refound_gene/low_frequency_gene_placement.tsv.expected create mode 100644 functional_tests/test_data/Resume_refound_run_fragment/Corrected_gff_files/genome_single_chrom_larger_refound_2_corrected.gff.expected create mode 100644 functional_tests/test_data/Resume_refound_run_fragment/Corrected_gff_files/genome_single_chrom_larger_refound_corrected.gff create mode 100644 functional_tests/test_data/Resume_refound_run_fragment/core_core_accessory_gene_content.tsv.expected create mode 100644 functional_tests/test_data/Resume_refound_run_fragment/core_pair_summary.csv.expected create mode 100644 functional_tests/test_data/Resume_refound_run_fragment/gene_data.csv create mode 100644 functional_tests/test_data/Resume_refound_run_fragment/gene_presence_absence_roary.csv create mode 100644 functional_tests/test_data/Resume_refound_run_fragment/low_frequency_gene_placement.tsv.expected create mode 100644 functional_tests/test_data/fragmented_refound_core_gene/gene_data.csv create mode 100644 functional_tests/test_data/fragmented_refound_core_gene/gene_presence_absence_roary.csv create mode 100644 functional_tests/test_data/fragmented_refound_core_gene_expected/Corrected_gff_files/genome_single_chrom_larger_refound_corrected.gff.expected create mode 100644 functional_tests/test_data/fragmented_refound_core_gene_expected/core_core_accessory_gene_content.tsv.expected create mode 100644 
functional_tests/test_data/fragmented_refound_core_gene_expected/core_pair_summary.csv.expected create mode 100644 functional_tests/test_data/fragmented_refound_core_gene_expected/low_frequency_gene_placement.tsv.expected create mode 100644 functional_tests/test_data/genome_single_chrom_larger_refound.gff create mode 100644 functional_tests/test_data/genome_single_chrom_larger_refound_2.gff diff --git a/Corekaburra/correct_gffs.py b/Corekaburra/correct_gffs.py index 40ad9ef..a802c9d 100644 --- a/Corekaburra/correct_gffs.py +++ b/Corekaburra/correct_gffs.py @@ -30,6 +30,7 @@ def read_gene_data(gene_data_file): for line in gene_data.readlines(): # Split read line at commas line = line.split(',') + # TODO - Scaffold (contig) name can be found in second position of a gene_data.csv line. This could possibly be used to speed things up so that the entire set of contigs isn't required for search. # Check if refound gene if 'refound' in line[2]: @@ -69,6 +70,9 @@ def prepair_for_reannotation(gene_data_path, output_folder, gffs, logger): try: os.mkdir(corrected_gff_out_dir) except FileExistsError: + # Get path for input + input_path = os.path.split(gffs[0])[0] + corrected_folder_content = os.listdir(corrected_gff_out_dir) gff_names = [os.path.basename(gff) for gff in gffs] @@ -76,9 +80,12 @@ def prepair_for_reannotation(gene_data_path, output_folder, gffs, logger): corrected_files = [file for file in corrected_folder_content if f'{file.split("_corrected")[0]}.gff' in gff_names] + corrected_files_w_path = [os.path.join(corrected_gff_out_dir, file) for file in corrected_files] + if len(corrected_files) > 0: gffs = [file for file in gff_names if f'{file.replace(".gff", "")}_corrected.gff' not in corrected_files] - gffs = gffs + corrected_files + gffs = [os.path.join(input_path, gff) for gff in gffs] + gffs = gffs + corrected_files_w_path return gene_data_dict, corrected_gff_out_dir, gffs diff --git a/Corekaburra/gff_parser.py b/Corekaburra/gff_parser.py index 83079e5..3946acd 
100644 --- a/Corekaburra/gff_parser.py +++ b/Corekaburra/gff_parser.py @@ -99,7 +99,7 @@ def record_core_core_region(core_genes, gff_name, gff_line, contig_end, previous # Check that a line from gff is provided and previous gene is not a sequence break if gff_line is not None and previous_core_gene_id != "Sequence_break": - # Check if core gene is fragmented + # Check if core gene is fragmented, if then change coordinates to the last part of the fragment. if core_genes[gff_name][previous_core_gene_id] == core_genes[gff_name][gff_line[8]]: previous_core_gene_id = gff_line[8] previous_core_gene_end_coor = int(gff_line[4]) @@ -109,17 +109,19 @@ def record_core_core_region(core_genes, gff_name, gff_line, contig_end, previous core_gene_pair_distance, accessory_gene_content, low_freq_gene_content, core_gene_pairs, master_info) # Set core cluster names - # If no line from gff is given there is a sequence break, + # If no line from gff is given there is a sequence-break, # if it is given then set current cluster and try to find previous if not found it is a sequence break if gff_line is not None: current_core_gene_cluster = core_genes[gff_name][gff_line[8]] try: previous_core_gene_cluster = core_genes[gff_name][previous_core_gene_id] + core_gene_neighbours = sorted([previous_core_gene_cluster, current_core_gene_cluster]) # Catch is previous gene was a sequence break. 
except KeyError: previous_core_gene_cluster = previous_core_gene_id + core_gene_neighbours = [previous_core_gene_cluster, current_core_gene_cluster] - core_gene_neighbours = sorted([previous_core_gene_cluster, current_core_gene_cluster]) + # core_gene_neighbours = sorted([previous_core_gene_cluster, current_core_gene_cluster]) else: current_core_gene_cluster = "Sequence_break" @@ -448,7 +450,7 @@ def segment_gff_content(gff_generator, core_genes, low_freq_genes, gff_path, acc # Set that first core gene has been observed first_core_gene = False - # Check if first gene on new contig is a core gene, if the record it. + # Check if first gene on new contig is a core gene, if then record it. elif line[8] in core_genes[gff_name]: previous_core_gene_id = "Sequence_break" @@ -541,15 +543,15 @@ def segment_gff_content(gff_generator, core_genes, low_freq_genes, gff_path, acc accessory_gene_content, low_freq_gene_content, core_gene_pairs, master_info) = record_core_core_region(core_genes, gff_name, None, contig_sizes[previous_contig], - previous_core_gene_id, - previous_core_gene_end_coor, - acc_genes_in_region, - low_freq_genes_in_region, - core_gene_pair_distance, - accessory_gene_content, - low_freq_gene_content, - core_gene_pairs, - master_info) + previous_core_gene_id, + previous_core_gene_end_coor, + acc_genes_in_region, + low_freq_genes_in_region, + core_gene_pair_distance, + accessory_gene_content, + low_freq_gene_content, + core_gene_pairs, + master_info) else: # Add a core-less contig if there has been accessory genes: coreless_contigs = record_coreless_contig(coreless_contigs, acc_genes_in_region, low_freq_genes_in_region, gff_name, line[0]) diff --git a/Corekaburra/parse_gene_presence_absence.py b/Corekaburra/parse_gene_presence_absence.py index b1f1eba..e8dc131 100644 --- a/Corekaburra/parse_gene_presence_absence.py +++ b/Corekaburra/parse_gene_presence_absence.py @@ -2,12 +2,17 @@ import csv from math import ceil, floor import gffutils 
+EXIT_GFF_REANNOTATION_ERROR = 3 try: from Corekaburra.correct_gffs import annotate_refound_genes except ModuleNotFoundError: from correct_gffs import annotate_refound_genes +try: + from Corekaburra.exit_with_error import exit_with_error +except ModuleNotFoundError: + from exit_with_error import exit_with_error def add_gene_to_dict(main_dict, gene, pan_gene_name, genome): @@ -37,17 +42,35 @@ def check_fragmented_gene(fragment_info, input_gffs, tmp_folder_path, gene_data_ :return: A List of booleans indicating if a fragments has nothing in between fragments (True) or not (False) """ # Check if any refound genes are in fragments to be checked, if then reannotate the genes before checking: - refound_genes = [[i, gene_gff] for i, gene_gff in enumerate(fragment_info) if 'refound' in gene_gff[0]] - if refound_genes: - for i, gene_gff in refound_genes: - # TODO Check if corrected genome is already made if then skip and just correct genome to look in. + refound_fregments = [[i, gene_gff] for i, gene_gff in enumerate(fragment_info) if 'refound' in gene_gff[0]] + if refound_fregments: + for i, gene_gff in refound_fregments: gene, gff = gene_gff - gff_name = [gff_name for gff_name in input_gffs - if gff in [os.path.basename(gff_name), - os.path.basename(gff_name).rsplit('.', 1)[0], - os.path.basename(gff_name).rsplit('.', 1)[0].rsplit('.', 1)[0]]][0] + gff_name = None + + try: + gff_name = [gff_name for gff_name in input_gffs + if f"{gff}_corrected" in [os.path.basename(gff_name), + os.path.basename(gff_name).rsplit('.', 1)[0], + os.path.basename(gff_name).rsplit('.', 1)[0].rsplit('.', 1)[0]]][0] + print('HERE') + except IndexError: + pass + + if gff_name is None: + try: + gff_name = [gff_name for gff_name in input_gffs + if gff in [os.path.basename(gff_name), + os.path.basename(gff_name).rsplit('.', 1)[0], + os.path.basename(gff_name).rsplit('.', 1)[0].rsplit('.', 1)[0]]][0] + except IndexError: + exit_with_error(EXIT_GFF_REANNOTATION_ERROR, + f'A problem occurred when 
trying to find a file for reannotation, when passing the ' + f'gene_presence_absence_roary.csv! GFF: {gff}, Gene: {gene}') - fragment_info[i][1] = annotate_refound_genes(gff_name, gene_data_dict, tmp_folder_path, corrected_dir, logger) + gff_name = annotate_refound_genes(gff_name, gene_data_dict, tmp_folder_path, corrected_dir, logger) + + fragment_info[i][1] = gff_name fragments_close = [] for fragment in fragment_info: @@ -93,13 +116,13 @@ def check_fragmented_gene(fragment_info, input_gffs, tmp_folder_path, gene_data_ region = (first_fragment_contig, min_frag_coor, max_frag_coor) # Find all features that are completely within the region - region_features = gff_database.region(region=region, completely_within=True) + region_features = gff_database.region(region=region, completely_within=True, featuretype=['ID']) # Find if some pieces are refound and change old_locus_tag to ID refound_pieces = [[i, fragment_piece] for i, fragment_piece in enumerate(fragment_pieces) if 'refound' in fragment_piece] if refound_pieces: - for piece in refound_pieces: - fragment_pieces[i] = gff_database[piece[1]]['ID'][0] + for i, piece in refound_pieces: + fragment_pieces[i] = gff_database[piece]['ID'][0] # find all genes that are not part of the fragmented gene region_locus_tags = set([feature[8]['locus_tag'][0] for feature in region_features]) @@ -199,7 +222,6 @@ def read_gene_presence_absence(pres_abs_file, core_gene_presence, low_freq_gene, # Check that each annotation is neighboring the other annotation. fragments_close = check_fragmented_gene(fragment_info, input_gffs, tmp_folder_path, gene_data_dict, corrected_dir, logger) # TODO - If a core gene is found to be made up of fragments not places close enough (With something in between) should this then not be subtracted from the core gene count? - How would this be handled if there is a gff that is not given as input? 
- # Check if gene was found to be a core gene if all(fragments_close): # Add the gene to the annotation dict diff --git a/functional_tests/Corekaburra-test.sh b/functional_tests/Corekaburra-test.sh index 0ff335e..40fc5c6 100755 --- a/functional_tests/Corekaburra-test.sh +++ b/functional_tests/Corekaburra-test.sh @@ -249,7 +249,6 @@ rm -r test_out_folder # TODO - Test that segmnets can be identified with a core-cutoff that is less than all genomes. -# TODO - Test that segmnets can be identified on two 'chromosomes'/contigs that are linear and not circular. call_new_test "Test when core graph forms multiple components - not forming a single 'chromosome' - non circular input gffs" Corekaburra -ip Multiple_component_graph/ -ig complete_genome_double_chrom_larger.gff complete_genome_double_chrom_2_larger.gff complete_genome_double_chrom_3_larger.gff -o test_out_folder/ > /dev/null 2>&1 test_output_file test_out_folder/core_core_accessory_gene_content.tsv Multi_component_graph_expected/core_core_accessory_gene_content.tsv.expected @@ -259,8 +258,6 @@ test_output_file test_out_folder/core_segments.csv Multi_component_graph_expecte test_output_file test_out_folder/no_accessory_core_segments.csv Multi_component_graph_expected/no_accessory_core_segments.csv.expected rm -r test_out_folder - -# TODO Test the above but with complete genomes call_new_test "Test when core graph forms multiple components - not forming a single 'chromosome' - circular input gffs" Corekaburra -ip Multiple_component_graph/ -ig complete_genome_double_chrom_larger.gff complete_genome_double_chrom_2_larger.gff complete_genome_double_chrom_3_larger.gff -o test_out_folder/ -cg complete_larger_double_chr_genome_list.txt > /dev/null 2>&1 test_output_file test_out_folder/core_core_accessory_gene_content.tsv Multiple_component_graph_complete_expected/core_core_accessory_gene_content.tsv.expected @@ -336,12 +333,24 @@ test_output_file test_out_folder/core_pair_summary.csv Fragmented_core_run_expec rm -r 
test_out_folder # TODO Test a fragmented core gene not accepted as core -#Corekaburra -ig complete_genome_single_chrom.gff genome_single_chrom_larger.gff genome_single_chrom_larger_rearrange.gff complete_genome_single_chrom_2.gff -ip Fragmented_core_gene_break_run/ -o test_out_folder/ -# TODO - run the test check results and transfer to expected folder -#rm -r test_out_folder +call_new_test "Test a fragmented core gene not accepted as core" +Corekaburra -ig complete_genome_single_chrom.gff genome_single_chrom_larger.gff genome_single_chrom_larger_rearrange.gff complete_genome_single_chrom_2.gff -ip Fragmented_core_gene_break_run/ -o test_out_folder/ > /dev/null 2>&1 +test_output_file test_out_folder/core_core_accessory_gene_content.tsv Fragmented_core_gene_break_run_expected/core_core_accessory_gene_content.tsv.expected +test_output_file test_out_folder/low_frequency_gene_placement.tsv Fragmented_core_gene_break_run_expected/low_frequency_gene_placement.tsv.expected +test_output_file test_out_folder/core_pair_summary.csv Fragmented_core_gene_break_run_expected/core_pair_summary.csv.expected +rm -r test_out_folder + +# TODO Test with part of fragmented gene being a refound gene +call_new_test "Test with part of fragmented gene being a refound gene" +Corekaburra -ig complete_genome_single_chrom.gff genome_single_chrom_larger_refound.gff genome_single_chrom_larger_rearrange.gff complete_genome_single_chrom_2.gff -ip fragmented_refound_core_gene/ -o test_out_folder/ > /dev/null 2>&1 +test_output_file test_out_folder/core_core_accessory_gene_content.tsv fragmented_refound_core_gene_expected/core_core_accessory_gene_content.tsv.expected +test_output_file test_out_folder/low_frequency_gene_placement.tsv fragmented_refound_core_gene_expected/low_frequency_gene_placement.tsv.expected +test_output_file test_out_folder/core_pair_summary.csv fragmented_refound_core_gene_expected/core_pair_summary.csv.expected +test_output_file 
test_out_folder/Corrected_gff_files/genome_single_chrom_larger_refound_corrected.gff fragmented_refound_core_gene_expected/Corrected_gff_files/genome_single_chrom_larger_refound_corrected.gff.expected +rm -r test_out_folder call_new_test "Test for accessory genes being fragmented" -Corekaburra -ig complete_genome_single_chrom.gff genome_single_chrom_larger.gff genome_single_chrom_larger_rearrange.gff complete_genome_single_chrom_2.gff -ip Fragmented_accessory_gene_run/ -o test_out_folder/ > /dev/null 2>&1 +Corekaburra -ig complete_genome_single_chrom.gff genome_single_chrom_larger.gff genome_single_chrom_larger_rearrange.gff complete_genome_single_chrom_2.gff -ip Fragmented_accessory_gene_run/ -o test_out_folder/ > /dev/null 2>&1 test_output_file test_out_folder/core_core_accessory_gene_content.tsv Fragmented_accessory_gene_run_expected/core_core_accessory_gene_content.tsv.expected test_output_file test_out_folder/low_frequency_gene_placement.tsv Fragmented_accessory_gene_run_expected/low_frequency_gene_placement.tsv.expected test_output_file test_out_folder/core_pair_summary.csv Fragmented_accessory_gene_run_expected/core_pair_summary.csv.expected @@ -364,6 +373,40 @@ test_output_file test_out_folder/core_pair_summary.csv Coreless_contig_complete_ test_output_file test_out_folder/coreless_contig_accessory_gene_content.tsv Coreless_contig_complete_expected/coreless_contig_accessory_gene_content.tsv.expected rm -r test_out_folder +# TODO - Test with a genome that have been corrected and one that have not - with fragmented refound gene (Resume run) +call_new_test "Test with a genome that have been corrected and one that have not (Resume run)" +Corekaburra -ig genome_single_chrom_larger_refound_2.gff genome_single_chrom_larger_refound.gff genome_single_chrom_larger_rearrange.gff complete_genome_single_chrom_2.gff -ip Resume_refound_run_fragment/ -o Resume_refound_run_fragment/ > /dev/null 2>&1 +test_output_file 
Resume_refound_run_fragment/core_core_accessory_gene_content.tsv fragmented_refound_core_gene_expected/core_core_accessory_gene_content.tsv.expected +test_output_file Resume_refound_run_fragment/low_frequency_gene_placement.tsv fragmented_refound_core_gene_expected/low_frequency_gene_placement.tsv.expected +test_output_file Resume_refound_run_fragment/core_pair_summary.csv fragmented_refound_core_gene_expected/core_pair_summary.csv.expected +test_output_file Resume_refound_run_fragment/Corrected_gff_files/genome_single_chrom_larger_refound_2_corrected.gff fragmented_refound_core_gene_expected/Corrected_gff_files/genome_single_chrom_larger_refound_2_corrected.gff.expected +rm Resume_refound_run_fragment/Corrected_gff_files/genome_single_chrom_larger_refound_2_corrected.gff +rm Resume_refound_run_fragment/low_frequency_gene_placement.tsv +rm Resume_refound_run_fragment/core_core_accessory_gene_content.tsv +rm Resume_refound_run_fragment/core_pair_summary.csv +rm Resume_refound_run_fragment/Corekaburra.log + +# TODO!! 
- Test with all genomes that have been corrected (Resume run) +call_new_test "Test with all genomes that have been corrected (Resume run)" +Corekaburra -ig genome_single_chrom_larger_refound_2.gff genome_single_chrom_larger_refound.gff genome_single_chrom_larger_rearrange.gff complete_genome_single_chrom_2.gff -ip Resume_all_found_gene_refound_run_fragment/ -o Resume_all_found_gene_refound_run_fragment/ > /dev/null 2>&1 +test_output_file Resume_all_found_gene_refound_run_fragment/core_core_accessory_gene_content.tsv fragmented_refound_core_gene_expected/core_core_accessory_gene_content.tsv.expected +test_output_file Resume_all_found_gene_refound_run_fragment/low_frequency_gene_placement.tsv fragmented_refound_core_gene_expected/low_frequency_gene_placement.tsv.expected +test_output_file Resume_all_found_gene_refound_run_fragment/core_pair_summary.csv fragmented_refound_core_gene_expected/core_pair_summary.csv.expected +rm Resume_all_found_gene_refound_run_fragment/low_frequency_gene_placement.tsv +rm Resume_all_found_gene_refound_run_fragment/core_core_accessory_gene_content.tsv +rm Resume_all_found_gene_refound_run_fragment/core_pair_summary.csv +rm Resume_all_found_gene_refound_run_fragment/Corekaburra.log + +# TODO - Test recognition of corrected gff files in output folder (Resume run) +call_new_test "Test recognition of corrected gff files in output folder (Resume run)" +Corekaburra -ig genome_single_chrom_larger_refound_2.gff genome_single_chrom_larger_refound.gff genome_single_chrom_larger_rearrange.gff complete_genome_single_chrom_2.gff -ip Resume_refound_gene/ -o Resume_refound_gene/ > /dev/null 2>&1 +test_output_file Resume_refound_gene/core_core_accessory_gene_content.tsv fragmented_refound_core_gene_expected/core_core_accessory_gene_content.tsv.expected +test_output_file Resume_refound_gene/low_frequency_gene_placement.tsv fragmented_refound_core_gene_expected/low_frequency_gene_placement.tsv.expected +test_output_file 
Resume_refound_gene/core_pair_summary.csv fragmented_refound_core_gene_expected/core_pair_summary.csv.expected +rm Resume_refound_gene/low_frequency_gene_placement.tsv +rm Resume_refound_gene/core_core_accessory_gene_content.tsv +rm Resume_refound_gene/core_pair_summary.csv +rm Resume_refound_gene/Corekaburra.log # 3. End of testing - check if any errors occurrred if [ "$num_errors" -gt 0 ]; then diff --git a/functional_tests/test_data/Fragmented_core_gene_break_run_expected/core_core_accessory_gene_content.tsv.expected b/functional_tests/test_data/Fragmented_core_gene_break_run_expected/core_core_accessory_gene_content.tsv.expected new file mode 100644 index 0000000..122abf3 --- /dev/null +++ b/functional_tests/test_data/Fragmented_core_gene_break_run_expected/core_core_accessory_gene_content.tsv.expected @@ -0,0 +1,16 @@ +Gff Core_gene_1 Core_gene_2 gene type +genome_single_chrom_larger C Sequence_break A intermediate_frequency +genome_single_chrom_larger C Sequence_break E intermediate_frequency +genome_single_chrom_larger C Sequence_break F intermediate_frequency +genome_single_chrom_larger C Sequence_break G low_frequency +genome_single_chrom_larger_rearrange C Sequence_break B intermediate_frequency +genome_single_chrom_larger_rearrange C Sequence_break E intermediate_frequency +genome_single_chrom_larger_rearrange C Sequence_break F intermediate_frequency +complete_genome_single_chrom Sequence_break C A intermediate_frequency +complete_genome_single_chrom Sequence_break C B intermediate_frequency +complete_genome_single_chrom_2 Sequence_break C A intermediate_frequency +complete_genome_single_chrom_2 Sequence_break C E intermediate_frequency +genome_single_chrom_larger Sequence_break C A intermediate_frequency +genome_single_chrom_larger Sequence_break C B intermediate_frequency +genome_single_chrom_larger_rearrange Sequence_break C A intermediate_frequency +genome_single_chrom_larger_rearrange Sequence_break C D low_frequency diff --git 
a/functional_tests/test_data/Fragmented_core_gene_break_run_expected/core_pair_summary.csv.expected b/functional_tests/test_data/Fragmented_core_gene_break_run_expected/core_pair_summary.csv.expected new file mode 100644 index 0000000..0462fbd --- /dev/null +++ b/functional_tests/test_data/Fragmented_core_gene_break_run_expected/core_pair_summary.csv.expected @@ -0,0 +1,3 @@ +Core_pair,n,occurrence_core_1,occurrence_core_2,co_occurrence,min_dist,max_dist,mean_dist,median_dist,min_acc,max_acc,mean_acc,median_acc +C-Sequence_break,4,4,0,0,10,310,160.0,160.0,0,4,1.8,1.5 +Sequence_break-C,4,0,4,0,199,199,199.0,199.0,2,2,2.0,2.0 diff --git a/functional_tests/test_data/Fragmented_core_gene_break_run_expected/low_frequency_gene_placement.tsv.expected b/functional_tests/test_data/Fragmented_core_gene_break_run_expected/low_frequency_gene_placement.tsv.expected new file mode 100644 index 0000000..092dab2 --- /dev/null +++ b/functional_tests/test_data/Fragmented_core_gene_break_run_expected/low_frequency_gene_placement.tsv.expected @@ -0,0 +1,9 @@ +Gff Core_gene_1 Core_gene_2 Core_region_size Core_region_accessory_count +complete_genome_single_chrom C Sequence_break 10 0 +complete_genome_single_chrom_2 C Sequence_break 10 0 +genome_single_chrom_larger C Sequence_break 310 4 +genome_single_chrom_larger_rearrange C Sequence_break 310 3 +complete_genome_single_chrom Sequence_break C 199 2 +complete_genome_single_chrom_2 Sequence_break C 199 2 +genome_single_chrom_larger Sequence_break C 199 2 +genome_single_chrom_larger_rearrange Sequence_break C 199 2 diff --git a/functional_tests/test_data/Resume_all_found_gene_refound_run_fragment/Corrected_gff_files/genome_single_chrom_larger_refound_2_corrected.gff b/functional_tests/test_data/Resume_all_found_gene_refound_run_fragment/Corrected_gff_files/genome_single_chrom_larger_refound_2_corrected.gff new file mode 100644 index 0000000..60239c7 --- /dev/null +++ 
b/functional_tests/test_data/Resume_all_found_gene_refound_run_fragment/Corrected_gff_files/genome_single_chrom_larger_refound_2_corrected.gff @@ -0,0 +1,21 @@ +##gff-version3 +contig_1 . CDS 1 90 . . . ID=tag_0001;Other_info;locus_tag=tag_0001 +contig_1 Panaroo CDS 100 190 . + 0 ID=tag_0008;locus_tag=tag_0008;old_locus_tag=1_refound_1;name=Gene_name_1;annotation=Gene_function_1 +contig_1 . CDS 200 290 . . . ID=tag_0003;Other_info;locus_tag=tag_0003 +contig_1 . CDS 300 390 . . . ID=tag_0004;Other_info;locus_tag=tag_0004 +contig_1 . CDS 400 490 . . . ID=tag_0005;Other_info;locus_tag=tag_0005 +contig_1 . CDS 500 590 . . . ID=tag_0006;Other_info;locus_tag=tag_0006 +contig_1 . CDS 591 592 . . . ID=tag_0007;Other_info;locus_tag=tag_0007 +##FASTA +>contig_1 +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCTCTTCCGATCTAATCAAGAT +TGAGAGGAATTGCTGTTTTTATTGGCAAGACAATTTTACTCTTCCGATCTAATCAAGATT +GAGAGGAATTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + diff --git a/functional_tests/test_data/Resume_all_found_gene_refound_run_fragment/Corrected_gff_files/genome_single_chrom_larger_refound_corrected.gff b/functional_tests/test_data/Resume_all_found_gene_refound_run_fragment/Corrected_gff_files/genome_single_chrom_larger_refound_corrected.gff new file mode 100644 index 0000000..eb81519 --- /dev/null +++ b/functional_tests/test_data/Resume_all_found_gene_refound_run_fragment/Corrected_gff_files/genome_single_chrom_larger_refound_corrected.gff @@ -0,0 +1,21 @@ +##gff-version3 +contig_1 . CDS 1 90 . . . 
ID=tag_0001;Other_info;locus_tag=tag_0001 +contig_1 Panaroo CDS 100 190 . + 0 ID=tag_0008;locus_tag=tag_0008;old_locus_tag=0_refound_0;name=Gene_name;annotation=Gene_function +contig_1 . CDS 200 290 . . . ID=tag_0003;Other_info;locus_tag=tag_0003 +contig_1 . CDS 300 390 . . . ID=tag_0004;Other_info;locus_tag=tag_0004 +contig_1 . CDS 400 490 . . . ID=tag_0005;Other_info;locus_tag=tag_0005 +contig_1 . CDS 500 590 . . . ID=tag_0006;Other_info;locus_tag=tag_0006 +contig_1 . CDS 591 592 . . . ID=tag_0007;Other_info;locus_tag=tag_0007 +##FASTA +>contig_1 +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCTCTTCCGATCTAATCAAGAT +TGAGAGGAATTGCTGTTTTTATTGGCAAGACAATTTTACTCTTCCGATCTAATCAAGATT +GAGAGGAATTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + diff --git a/functional_tests/test_data/Resume_all_found_gene_refound_run_fragment/core_core_accessory_gene_content.tsv.expected b/functional_tests/test_data/Resume_all_found_gene_refound_run_fragment/core_core_accessory_gene_content.tsv.expected new file mode 100644 index 0000000..9502287 --- /dev/null +++ b/functional_tests/test_data/Resume_all_found_gene_refound_run_fragment/core_core_accessory_gene_content.tsv.expected @@ -0,0 +1,7 @@ +Gff Core_gene_1 Core_gene_2 gene type +genome_single_chrom_larger_rearrange A C D intermediate_frequency +genome_single_chrom_larger_rearrange C E B intermediate_frequency +genome_single_chrom_larger_refound C E D intermediate_frequency +genome_single_chrom_larger_rearrange E Sequence_break F intermediate_frequency +genome_single_chrom_larger_refound E Sequence_break F 
intermediate_frequency +genome_single_chrom_larger_refound E Sequence_break G intermediate_frequency diff --git a/functional_tests/test_data/Resume_all_found_gene_refound_run_fragment/core_pair_summary.csv.expected b/functional_tests/test_data/Resume_all_found_gene_refound_run_fragment/core_pair_summary.csv.expected new file mode 100644 index 0000000..137b761 --- /dev/null +++ b/functional_tests/test_data/Resume_all_found_gene_refound_run_fragment/core_pair_summary.csv.expected @@ -0,0 +1,7 @@ +Core_pair,n,occurrence_core_1,occurrence_core_2,co_occurrence,min_dist,max_dist,mean_dist,median_dist,min_acc,max_acc,mean_acc,median_acc +A-C,3,4,4,4,9,109,42.3,9.0,0,1,0.3,0.0 +A-E,1,4,4,4,9,9,9.0,9.0,0,0,0.0,0.0 +C-E,3,4,4,4,9,109,75.7,109.0,0,1,0.7,1.0 +C-Sequence_break,2,4,0,0,10,310,160.0,160.0,0,0,0.0,0.0 +E-Sequence_break,2,4,0,0,110,110,110.0,110.0,1,2,1.5,1.5 +Sequence_break-A,4,0,4,0,0,0,0.0,0.0,0,0,0.0,0.0 diff --git a/functional_tests/test_data/Resume_all_found_gene_refound_run_fragment/gene_data.csv b/functional_tests/test_data/Resume_all_found_gene_refound_run_fragment/gene_data.csv new file mode 100644 index 0000000..c821e4a --- /dev/null +++ b/functional_tests/test_data/Resume_all_found_gene_refound_run_fragment/gene_data.csv @@ -0,0 +1,3 @@ +gff_file,scaffold_name,clustering_id,annotation_id,prot_sequence,dna_sequence,gene_name,description +genome_single_chrom_larger_refound,contig_1,0_refound_0,0_refound_0,SPQR,CTCTTCCGATCTAATCAAGATTGAGAGGAATTGCTGTTTTTATTGGCAAGACAATTTTACTCTTCCGATCTAATCAAGATTGAGAGGAATT,Gene_name,Gene_function +genome_single_chrom_larger_refound_2,contig_1,1_refound_1,1_refound_1,RQPS,CTCTTCCGATCTAATCAAGATTGAGAGGAATTGCTGTTTTTATTGGCAAGACAATTTTACTCTTCCGATCTAATCAAGATTGAGAGGAATT,Gene_name_1,Gene_function_1 \ No newline at end of file diff --git a/functional_tests/test_data/Resume_all_found_gene_refound_run_fragment/gene_presence_absence_roary.csv 
b/functional_tests/test_data/Resume_all_found_gene_refound_run_fragment/gene_presence_absence_roary.csv new file mode 100644 index 0000000..12775e5 --- /dev/null +++ b/functional_tests/test_data/Resume_all_found_gene_refound_run_fragment/gene_presence_absence_roary.csv @@ -0,0 +1,8 @@ +,,,,,,,,,,,,,,genome_single_chrom_larger_refound,complete_genome_single_chrom_2,genome_single_chrom_larger_refound_2,genome_single_chrom_larger_rearrange +A,,,4,6,1.5,,,,,,,,,tag_0001;0_refound_0,single_comp_2_A,tag_0001;1_refound_1,single_comp_2_A +B,,,2,2,1,,,,,,,,,,,single_comp_B,single_comp_2_B +C,,,4,4,1,,,,,,,,,tag_0003,single_comp_2_C,tag_0003,single_comp_2_C +D,,,3,3,1,,,,,,,,,tag_0004,, tag_0004,single_comp_2_D +E,,,4,4,1,,,,,,,,,tag_0005,single_comp_2_B, tag_0005,single_comp_2_E +F,,,3,3,1,,,,,,,,,tag_0006,, tag_0006,single_comp_2_F +G,,,2,2,1,,,,,,,,,tag_0007,, tag_0007, \ No newline at end of file diff --git a/functional_tests/test_data/Resume_all_found_gene_refound_run_fragment/low_frequency_gene_placement.tsv.expected b/functional_tests/test_data/Resume_all_found_gene_refound_run_fragment/low_frequency_gene_placement.tsv.expected new file mode 100644 index 0000000..08e03ea --- /dev/null +++ b/functional_tests/test_data/Resume_all_found_gene_refound_run_fragment/low_frequency_gene_placement.tsv.expected @@ -0,0 +1,16 @@ +Gff Core_gene_1 Core_gene_2 Core_region_size Core_region_accessory_count +genome_single_chrom_larger_rearrange A C 109 1 +genome_single_chrom_larger_refound A C 9 0 +genome_single_chrom_larger_refound_2 A C 9 0 +complete_genome_single_chrom_2 A E 9 0 +complete_genome_single_chrom_2 C E 9 0 +genome_single_chrom_larger_rearrange C E 109 1 +genome_single_chrom_larger_refound C E 109 1 +complete_genome_single_chrom_2 C Sequence_break 10 0 +genome_single_chrom_larger_refound_2 C Sequence_break 310 0 +genome_single_chrom_larger_rearrange E Sequence_break 110 1 +genome_single_chrom_larger_refound E Sequence_break 110 2 +complete_genome_single_chrom_2 
Sequence_break A 0 0 +genome_single_chrom_larger_rearrange Sequence_break A 0 0 +genome_single_chrom_larger_refound Sequence_break A 0 0 +genome_single_chrom_larger_refound_2 Sequence_break A 0 0 diff --git a/functional_tests/test_data/Resume_refound_gene/Corrected_gff_files/genome_single_chrom_larger_refound_2_corrected.gff b/functional_tests/test_data/Resume_refound_gene/Corrected_gff_files/genome_single_chrom_larger_refound_2_corrected.gff new file mode 100644 index 0000000..60239c7 --- /dev/null +++ b/functional_tests/test_data/Resume_refound_gene/Corrected_gff_files/genome_single_chrom_larger_refound_2_corrected.gff @@ -0,0 +1,21 @@ +##gff-version3 +contig_1 . CDS 1 90 . . . ID=tag_0001;Other_info;locus_tag=tag_0001 +contig_1 Panaroo CDS 100 190 . + 0 ID=tag_0008;locus_tag=tag_0008;old_locus_tag=1_refound_1;name=Gene_name_1;annotation=Gene_function_1 +contig_1 . CDS 200 290 . . . ID=tag_0003;Other_info;locus_tag=tag_0003 +contig_1 . CDS 300 390 . . . ID=tag_0004;Other_info;locus_tag=tag_0004 +contig_1 . CDS 400 490 . . . ID=tag_0005;Other_info;locus_tag=tag_0005 +contig_1 . CDS 500 590 . . . ID=tag_0006;Other_info;locus_tag=tag_0006 +contig_1 . CDS 591 592 . . . 
ID=tag_0007;Other_info;locus_tag=tag_0007 +##FASTA +>contig_1 +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCTCTTCCGATCTAATCAAGAT +TGAGAGGAATTGCTGTTTTTATTGGCAAGACAATTTTACTCTTCCGATCTAATCAAGATT +GAGAGGAATTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + diff --git a/functional_tests/test_data/Resume_refound_gene/Corrected_gff_files/genome_single_chrom_larger_refound_corrected.gff b/functional_tests/test_data/Resume_refound_gene/Corrected_gff_files/genome_single_chrom_larger_refound_corrected.gff new file mode 100644 index 0000000..eb81519 --- /dev/null +++ b/functional_tests/test_data/Resume_refound_gene/Corrected_gff_files/genome_single_chrom_larger_refound_corrected.gff @@ -0,0 +1,21 @@ +##gff-version3 +contig_1 . CDS 1 90 . . . ID=tag_0001;Other_info;locus_tag=tag_0001 +contig_1 Panaroo CDS 100 190 . + 0 ID=tag_0008;locus_tag=tag_0008;old_locus_tag=0_refound_0;name=Gene_name;annotation=Gene_function +contig_1 . CDS 200 290 . . . ID=tag_0003;Other_info;locus_tag=tag_0003 +contig_1 . CDS 300 390 . . . ID=tag_0004;Other_info;locus_tag=tag_0004 +contig_1 . CDS 400 490 . . . ID=tag_0005;Other_info;locus_tag=tag_0005 +contig_1 . CDS 500 590 . . . ID=tag_0006;Other_info;locus_tag=tag_0006 +contig_1 . CDS 591 592 . . . 
ID=tag_0007;Other_info;locus_tag=tag_0007 +##FASTA +>contig_1 +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCTCTTCCGATCTAATCAAGAT +TGAGAGGAATTGCTGTTTTTATTGGCAAGACAATTTTACTCTTCCGATCTAATCAAGATT +GAGAGGAATTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + diff --git a/functional_tests/test_data/Resume_refound_gene/core_core_accessory_gene_content.tsv.expected b/functional_tests/test_data/Resume_refound_gene/core_core_accessory_gene_content.tsv.expected new file mode 100644 index 0000000..974f506 --- /dev/null +++ b/functional_tests/test_data/Resume_refound_gene/core_core_accessory_gene_content.tsv.expected @@ -0,0 +1,6 @@ +Gff Core_gene_1 Core_gene_2 gene type +genome_single_chrom_larger_rearrange A C D intermediate_frequency +genome_single_chrom_larger_refound C E D intermediate_frequency +genome_single_chrom_larger_rearrange E Sequence_break F intermediate_frequency +genome_single_chrom_larger_refound E Sequence_break F intermediate_frequency +genome_single_chrom_larger_refound E Sequence_break G intermediate_frequency diff --git a/functional_tests/test_data/Resume_refound_gene/core_pair_summary.csv.expected b/functional_tests/test_data/Resume_refound_gene/core_pair_summary.csv.expected new file mode 100644 index 0000000..c8e3646 --- /dev/null +++ b/functional_tests/test_data/Resume_refound_gene/core_pair_summary.csv.expected @@ -0,0 +1,10 @@ +Core_pair,n,occurrence_core_1,occurrence_core_2,co_occurrence,min_dist,max_dist,mean_dist,median_dist,min_acc,max_acc,mean_acc,median_acc +A-B,2,4,4,4,9,9,9.0,9.0,0,0,0.0,0.0 
+A-C,1,4,4,4,109,109,109.0,109.0,1,1,1.0,1.0 +A-E,1,4,4,4,9,9,9.0,9.0,0,0,0.0,0.0 +B-C,3,4,4,4,9,9,9.0,9.0,0,0,0.0,0.0 +B-E,1,4,4,4,9,9,9.0,9.0,0,0,0.0,0.0 +C-E,2,4,4,4,9,109,59.0,59.0,0,1,0.5,0.5 +C-Sequence_break,2,4,0,0,10,310,160.0,160.0,0,0,0.0,0.0 +E-Sequence_break,2,4,0,0,110,110,110.0,110.0,1,2,1.5,1.5 +Sequence_break-A,4,0,4,0,0,0,0.0,0.0,0,0,0.0,0.0 diff --git a/functional_tests/test_data/Resume_refound_gene/gene_data.csv b/functional_tests/test_data/Resume_refound_gene/gene_data.csv new file mode 100644 index 0000000..c821e4a --- /dev/null +++ b/functional_tests/test_data/Resume_refound_gene/gene_data.csv @@ -0,0 +1,3 @@ +gff_file,scaffold_name,clustering_id,annotation_id,prot_sequence,dna_sequence,gene_name,description +genome_single_chrom_larger_refound,contig_1,0_refound_0,0_refound_0,SPQR,CTCTTCCGATCTAATCAAGATTGAGAGGAATTGCTGTTTTTATTGGCAAGACAATTTTACTCTTCCGATCTAATCAAGATTGAGAGGAATT,Gene_name,Gene_function +genome_single_chrom_larger_refound_2,contig_1,1_refound_1,1_refound_1,RQPS,CTCTTCCGATCTAATCAAGATTGAGAGGAATTGCTGTTTTTATTGGCAAGACAATTTTACTCTTCCGATCTAATCAAGATTGAGAGGAATT,Gene_name_1,Gene_function_1 \ No newline at end of file diff --git a/functional_tests/test_data/Resume_refound_gene/gene_presence_absence_roary.csv b/functional_tests/test_data/Resume_refound_gene/gene_presence_absence_roary.csv new file mode 100644 index 0000000..750d6c5 --- /dev/null +++ b/functional_tests/test_data/Resume_refound_gene/gene_presence_absence_roary.csv @@ -0,0 +1,8 @@ +,,,,,,,,,,,,,,genome_single_chrom_larger_refound,complete_genome_single_chrom_2,genome_single_chrom_larger_refound_2,genome_single_chrom_larger_rearrange +A,,,4,4,1,,,,,,,,,tag_0001,single_comp_2_A,tag_0001,single_comp_2_A +B,,,4,4,1,,,,,,,,,0_refound_0,single_comp_B,1_refound_1,single_comp_2_B +C,,,4,4,1,,,,,,,,,tag_0003,single_comp_2_C,tag_0003,single_comp_2_C +D,,,3,3,1,,,,,,,,,tag_0004,, tag_0004,single_comp_2_D +E,,,4,4,1,,,,,,,,,tag_0005,single_comp_2_B, tag_0005,single_comp_2_E 
+F,,,3,3,1,,,,,,,,,tag_0006,, tag_0006,single_comp_2_F +G,,,2,2,1,,,,,,,,,tag_0007,, tag_0007, \ No newline at end of file diff --git a/functional_tests/test_data/Resume_refound_gene/low_frequency_gene_placement.tsv.expected b/functional_tests/test_data/Resume_refound_gene/low_frequency_gene_placement.tsv.expected new file mode 100644 index 0000000..62da1fa --- /dev/null +++ b/functional_tests/test_data/Resume_refound_gene/low_frequency_gene_placement.tsv.expected @@ -0,0 +1,19 @@ +Gff Core_gene_1 Core_gene_2 Core_region_size Core_region_accessory_count +genome_single_chrom_larger_refound A B 9 0 +genome_single_chrom_larger_refound_2 A B 9 0 +genome_single_chrom_larger_rearrange A C 109 1 +complete_genome_single_chrom_2 A E 9 0 +genome_single_chrom_larger_rearrange B C 9 0 +genome_single_chrom_larger_refound B C 9 0 +genome_single_chrom_larger_refound_2 B C 9 0 +genome_single_chrom_larger_rearrange B E 9 0 +complete_genome_single_chrom_2 C E 9 0 +genome_single_chrom_larger_refound C E 109 1 +complete_genome_single_chrom_2 C Sequence_break 10 0 +genome_single_chrom_larger_refound_2 C Sequence_break 310 0 +genome_single_chrom_larger_rearrange E Sequence_break 110 1 +genome_single_chrom_larger_refound E Sequence_break 110 2 +complete_genome_single_chrom_2 Sequence_break A 0 0 +genome_single_chrom_larger_rearrange Sequence_break A 0 0 +genome_single_chrom_larger_refound Sequence_break A 0 0 +genome_single_chrom_larger_refound_2 Sequence_break A 0 0 diff --git a/functional_tests/test_data/Resume_refound_run_fragment/Corrected_gff_files/genome_single_chrom_larger_refound_2_corrected.gff.expected b/functional_tests/test_data/Resume_refound_run_fragment/Corrected_gff_files/genome_single_chrom_larger_refound_2_corrected.gff.expected new file mode 100644 index 0000000..60239c7 --- /dev/null +++ b/functional_tests/test_data/Resume_refound_run_fragment/Corrected_gff_files/genome_single_chrom_larger_refound_2_corrected.gff.expected @@ -0,0 +1,21 @@ +##gff-version3 +contig_1 . 
CDS 1 90 . . . ID=tag_0001;Other_info;locus_tag=tag_0001 +contig_1 Panaroo CDS 100 190 . + 0 ID=tag_0008;locus_tag=tag_0008;old_locus_tag=1_refound_1;name=Gene_name_1;annotation=Gene_function_1 +contig_1 . CDS 200 290 . . . ID=tag_0003;Other_info;locus_tag=tag_0003 +contig_1 . CDS 300 390 . . . ID=tag_0004;Other_info;locus_tag=tag_0004 +contig_1 . CDS 400 490 . . . ID=tag_0005;Other_info;locus_tag=tag_0005 +contig_1 . CDS 500 590 . . . ID=tag_0006;Other_info;locus_tag=tag_0006 +contig_1 . CDS 591 592 . . . ID=tag_0007;Other_info;locus_tag=tag_0007 +##FASTA +>contig_1 +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCTCTTCCGATCTAATCAAGAT +TGAGAGGAATTGCTGTTTTTATTGGCAAGACAATTTTACTCTTCCGATCTAATCAAGATT +GAGAGGAATTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + diff --git a/functional_tests/test_data/Resume_refound_run_fragment/Corrected_gff_files/genome_single_chrom_larger_refound_corrected.gff b/functional_tests/test_data/Resume_refound_run_fragment/Corrected_gff_files/genome_single_chrom_larger_refound_corrected.gff new file mode 100644 index 0000000..eb81519 --- /dev/null +++ b/functional_tests/test_data/Resume_refound_run_fragment/Corrected_gff_files/genome_single_chrom_larger_refound_corrected.gff @@ -0,0 +1,21 @@ +##gff-version3 +contig_1 . CDS 1 90 . . . ID=tag_0001;Other_info;locus_tag=tag_0001 +contig_1 Panaroo CDS 100 190 . + 0 ID=tag_0008;locus_tag=tag_0008;old_locus_tag=0_refound_0;name=Gene_name;annotation=Gene_function +contig_1 . CDS 200 290 . . . ID=tag_0003;Other_info;locus_tag=tag_0003 +contig_1 . CDS 300 390 . . . 
ID=tag_0004;Other_info;locus_tag=tag_0004 +contig_1 . CDS 400 490 . . . ID=tag_0005;Other_info;locus_tag=tag_0005 +contig_1 . CDS 500 590 . . . ID=tag_0006;Other_info;locus_tag=tag_0006 +contig_1 . CDS 591 592 . . . ID=tag_0007;Other_info;locus_tag=tag_0007 +##FASTA +>contig_1 +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCTCTTCCGATCTAATCAAGAT +TGAGAGGAATTGCTGTTTTTATTGGCAAGACAATTTTACTCTTCCGATCTAATCAAGATT +GAGAGGAATTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + diff --git a/functional_tests/test_data/Resume_refound_run_fragment/core_core_accessory_gene_content.tsv.expected b/functional_tests/test_data/Resume_refound_run_fragment/core_core_accessory_gene_content.tsv.expected new file mode 100644 index 0000000..9502287 --- /dev/null +++ b/functional_tests/test_data/Resume_refound_run_fragment/core_core_accessory_gene_content.tsv.expected @@ -0,0 +1,7 @@ +Gff Core_gene_1 Core_gene_2 gene type +genome_single_chrom_larger_rearrange A C D intermediate_frequency +genome_single_chrom_larger_rearrange C E B intermediate_frequency +genome_single_chrom_larger_refound C E D intermediate_frequency +genome_single_chrom_larger_rearrange E Sequence_break F intermediate_frequency +genome_single_chrom_larger_refound E Sequence_break F intermediate_frequency +genome_single_chrom_larger_refound E Sequence_break G intermediate_frequency diff --git a/functional_tests/test_data/Resume_refound_run_fragment/core_pair_summary.csv.expected b/functional_tests/test_data/Resume_refound_run_fragment/core_pair_summary.csv.expected new file mode 100644 index 
0000000..137b761 --- /dev/null +++ b/functional_tests/test_data/Resume_refound_run_fragment/core_pair_summary.csv.expected @@ -0,0 +1,7 @@ +Core_pair,n,occurrence_core_1,occurrence_core_2,co_occurrence,min_dist,max_dist,mean_dist,median_dist,min_acc,max_acc,mean_acc,median_acc +A-C,3,4,4,4,9,109,42.3,9.0,0,1,0.3,0.0 +A-E,1,4,4,4,9,9,9.0,9.0,0,0,0.0,0.0 +C-E,3,4,4,4,9,109,75.7,109.0,0,1,0.7,1.0 +C-Sequence_break,2,4,0,0,10,310,160.0,160.0,0,0,0.0,0.0 +E-Sequence_break,2,4,0,0,110,110,110.0,110.0,1,2,1.5,1.5 +Sequence_break-A,4,0,4,0,0,0,0.0,0.0,0,0,0.0,0.0 diff --git a/functional_tests/test_data/Resume_refound_run_fragment/gene_data.csv b/functional_tests/test_data/Resume_refound_run_fragment/gene_data.csv new file mode 100644 index 0000000..c821e4a --- /dev/null +++ b/functional_tests/test_data/Resume_refound_run_fragment/gene_data.csv @@ -0,0 +1,3 @@ +gff_file,scaffold_name,clustering_id,annotation_id,prot_sequence,dna_sequence,gene_name,description +genome_single_chrom_larger_refound,contig_1,0_refound_0,0_refound_0,SPQR,CTCTTCCGATCTAATCAAGATTGAGAGGAATTGCTGTTTTTATTGGCAAGACAATTTTACTCTTCCGATCTAATCAAGATTGAGAGGAATT,Gene_name,Gene_function +genome_single_chrom_larger_refound_2,contig_1,1_refound_1,1_refound_1,RQPS,CTCTTCCGATCTAATCAAGATTGAGAGGAATTGCTGTTTTTATTGGCAAGACAATTTTACTCTTCCGATCTAATCAAGATTGAGAGGAATT,Gene_name_1,Gene_function_1 \ No newline at end of file diff --git a/functional_tests/test_data/Resume_refound_run_fragment/gene_presence_absence_roary.csv b/functional_tests/test_data/Resume_refound_run_fragment/gene_presence_absence_roary.csv new file mode 100644 index 0000000..12775e5 --- /dev/null +++ b/functional_tests/test_data/Resume_refound_run_fragment/gene_presence_absence_roary.csv @@ -0,0 +1,8 @@ +,,,,,,,,,,,,,,genome_single_chrom_larger_refound,complete_genome_single_chrom_2,genome_single_chrom_larger_refound_2,genome_single_chrom_larger_rearrange +A,,,4,6,1.5,,,,,,,,,tag_0001;0_refound_0,single_comp_2_A,tag_0001;1_refound_1,single_comp_2_A 
+B,,,2,2,1,,,,,,,,,,,single_comp_B,single_comp_2_B +C,,,4,4,1,,,,,,,,,tag_0003,single_comp_2_C,tag_0003,single_comp_2_C +D,,,3,3,1,,,,,,,,,tag_0004,, tag_0004,single_comp_2_D +E,,,4,4,1,,,,,,,,,tag_0005,single_comp_2_B, tag_0005,single_comp_2_E +F,,,3,3,1,,,,,,,,,tag_0006,, tag_0006,single_comp_2_F +G,,,2,2,1,,,,,,,,,tag_0007,, tag_0007, \ No newline at end of file diff --git a/functional_tests/test_data/Resume_refound_run_fragment/low_frequency_gene_placement.tsv.expected b/functional_tests/test_data/Resume_refound_run_fragment/low_frequency_gene_placement.tsv.expected new file mode 100644 index 0000000..08e03ea --- /dev/null +++ b/functional_tests/test_data/Resume_refound_run_fragment/low_frequency_gene_placement.tsv.expected @@ -0,0 +1,16 @@ +Gff Core_gene_1 Core_gene_2 Core_region_size Core_region_accessory_count +genome_single_chrom_larger_rearrange A C 109 1 +genome_single_chrom_larger_refound A C 9 0 +genome_single_chrom_larger_refound_2 A C 9 0 +complete_genome_single_chrom_2 A E 9 0 +complete_genome_single_chrom_2 C E 9 0 +genome_single_chrom_larger_rearrange C E 109 1 +genome_single_chrom_larger_refound C E 109 1 +complete_genome_single_chrom_2 C Sequence_break 10 0 +genome_single_chrom_larger_refound_2 C Sequence_break 310 0 +genome_single_chrom_larger_rearrange E Sequence_break 110 1 +genome_single_chrom_larger_refound E Sequence_break 110 2 +complete_genome_single_chrom_2 Sequence_break A 0 0 +genome_single_chrom_larger_rearrange Sequence_break A 0 0 +genome_single_chrom_larger_refound Sequence_break A 0 0 +genome_single_chrom_larger_refound_2 Sequence_break A 0 0 diff --git a/functional_tests/test_data/fragmented_refound_core_gene/gene_data.csv b/functional_tests/test_data/fragmented_refound_core_gene/gene_data.csv new file mode 100644 index 0000000..1f1531f --- /dev/null +++ b/functional_tests/test_data/fragmented_refound_core_gene/gene_data.csv @@ -0,0 +1 @@ 
+genome_single_chrom_larger_refound,contig_1,0_refound_0,0_refound_0,SPQR,CTCTTCCGATCTAATCAAGATTGAGAGGAATTGCTGTTTTTATTGGCAAGACAATTTTACTCTTCCGATCTAATCAAGATTGAGAGGAATT,Gene_name,Gene_function \ No newline at end of file diff --git a/functional_tests/test_data/fragmented_refound_core_gene/gene_presence_absence_roary.csv b/functional_tests/test_data/fragmented_refound_core_gene/gene_presence_absence_roary.csv new file mode 100644 index 0000000..19b3ebb --- /dev/null +++ b/functional_tests/test_data/fragmented_refound_core_gene/gene_presence_absence_roary.csv @@ -0,0 +1,8 @@ +,,,,,,,,,,,,,,genome_single_chrom_larger_refound,complete_genome_single_chrom_2,complete_genome_single_chrom,genome_single_chrom_larger_rearrange +A,,,4,5,1.25,,,,,,,,,tag_0001;0_refound_0,single_comp_2_A,single_comp_A,single_comp_2_A +B,,,2,2,1,,,,,,,,,,,single_comp_B,single_comp_2_B +C,,,4,4,1,,,,,,,,,tag_0003,single_comp_2_C,single_comp_C,single_comp_2_C +D,,,1,1,1,,,,,,,,,tag_0004,,,single_comp_2_D +E,,,3,3,1,,,,,,,,,tag_0005,single_comp_2_B,,single_comp_2_E +F,,,2,2,1,,,,,,,,,tag_0006,,,single_comp_2_F +G,,,1,1,1,,,,,,,,,tag_0007,,, \ No newline at end of file diff --git a/functional_tests/test_data/fragmented_refound_core_gene_expected/Corrected_gff_files/genome_single_chrom_larger_refound_corrected.gff.expected b/functional_tests/test_data/fragmented_refound_core_gene_expected/Corrected_gff_files/genome_single_chrom_larger_refound_corrected.gff.expected new file mode 100644 index 0000000..eb81519 --- /dev/null +++ b/functional_tests/test_data/fragmented_refound_core_gene_expected/Corrected_gff_files/genome_single_chrom_larger_refound_corrected.gff.expected @@ -0,0 +1,21 @@ +##gff-version3 +contig_1 . CDS 1 90 . . . ID=tag_0001;Other_info;locus_tag=tag_0001 +contig_1 Panaroo CDS 100 190 . + 0 ID=tag_0008;locus_tag=tag_0008;old_locus_tag=0_refound_0;name=Gene_name;annotation=Gene_function +contig_1 . CDS 200 290 . . . ID=tag_0003;Other_info;locus_tag=tag_0003 +contig_1 . CDS 300 390 . . . 
ID=tag_0004;Other_info;locus_tag=tag_0004 +contig_1 . CDS 400 490 . . . ID=tag_0005;Other_info;locus_tag=tag_0005 +contig_1 . CDS 500 590 . . . ID=tag_0006;Other_info;locus_tag=tag_0006 +contig_1 . CDS 591 592 . . . ID=tag_0007;Other_info;locus_tag=tag_0007 +##FASTA +>contig_1 +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCTCTTCCGATCTAATCAAGAT +TGAGAGGAATTGCTGTTTTTATTGGCAAGACAATTTTACTCTTCCGATCTAATCAAGATT +GAGAGGAATTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + diff --git a/functional_tests/test_data/fragmented_refound_core_gene_expected/core_core_accessory_gene_content.tsv.expected b/functional_tests/test_data/fragmented_refound_core_gene_expected/core_core_accessory_gene_content.tsv.expected new file mode 100644 index 0000000..521eb93 --- /dev/null +++ b/functional_tests/test_data/fragmented_refound_core_gene_expected/core_core_accessory_gene_content.tsv.expected @@ -0,0 +1,11 @@ +Gff Core_gene_1 Core_gene_2 gene type +complete_genome_single_chrom A C B intermediate_frequency +complete_genome_single_chrom_2 A C E intermediate_frequency +genome_single_chrom_larger_rearrange A C D low_frequency +genome_single_chrom_larger_rearrange C Sequence_break B intermediate_frequency +genome_single_chrom_larger_rearrange C Sequence_break E intermediate_frequency +genome_single_chrom_larger_rearrange C Sequence_break F intermediate_frequency +genome_single_chrom_larger_refound C Sequence_break E intermediate_frequency +genome_single_chrom_larger_refound C Sequence_break F intermediate_frequency +genome_single_chrom_larger_refound C Sequence_break D 
low_frequency +genome_single_chrom_larger_refound C Sequence_break G low_frequency diff --git a/functional_tests/test_data/fragmented_refound_core_gene_expected/core_pair_summary.csv.expected b/functional_tests/test_data/fragmented_refound_core_gene_expected/core_pair_summary.csv.expected new file mode 100644 index 0000000..befdd38 --- /dev/null +++ b/functional_tests/test_data/fragmented_refound_core_gene_expected/core_pair_summary.csv.expected @@ -0,0 +1,4 @@ +Core_pair,n,occurrence_core_1,occurrence_core_2,co_occurrence,min_dist,max_dist,mean_dist,median_dist,min_acc,max_acc,mean_acc,median_acc +A-C,4,4,4,4,9,109,84.0,109.0,0,1,0.8,1.0 +C-Sequence_break,4,4,0,0,10,310,160.0,160.0,0,4,1.8,1.5 +Sequence_break-A,4,0,4,0,0,0,0.0,0.0,0,0,0.0,0.0 diff --git a/functional_tests/test_data/fragmented_refound_core_gene_expected/low_frequency_gene_placement.tsv.expected b/functional_tests/test_data/fragmented_refound_core_gene_expected/low_frequency_gene_placement.tsv.expected new file mode 100644 index 0000000..5a78a0a --- /dev/null +++ b/functional_tests/test_data/fragmented_refound_core_gene_expected/low_frequency_gene_placement.tsv.expected @@ -0,0 +1,13 @@ +Gff Core_gene_1 Core_gene_2 Core_region_size Core_region_accessory_count +complete_genome_single_chrom A C 109 1 +complete_genome_single_chrom_2 A C 109 1 +genome_single_chrom_larger_rearrange A C 109 1 +genome_single_chrom_larger_refound A C 9 0 +complete_genome_single_chrom C Sequence_break 10 0 +complete_genome_single_chrom_2 C Sequence_break 10 0 +genome_single_chrom_larger_rearrange C Sequence_break 310 3 +genome_single_chrom_larger_refound C Sequence_break 310 4 +complete_genome_single_chrom Sequence_break A 0 0 +complete_genome_single_chrom_2 Sequence_break A 0 0 +genome_single_chrom_larger_rearrange Sequence_break A 0 0 +genome_single_chrom_larger_refound Sequence_break A 0 0 diff --git a/functional_tests/test_data/genome_single_chrom_larger_refound.gff 
b/functional_tests/test_data/genome_single_chrom_larger_refound.gff new file mode 100644 index 0000000..7185660 --- /dev/null +++ b/functional_tests/test_data/genome_single_chrom_larger_refound.gff @@ -0,0 +1,10 @@ +##gff-version3 +contig_1 . CDS 1 90 . . . ID=tag_0001;Other_info;locus_tag=tag_0001 +contig_1 . CDS 200 290 . . . ID=tag_0003;Other_info;locus_tag=tag_0003 +contig_1 . CDS 300 390 . . . ID=tag_0004;Other_info;locus_tag=tag_0004 +contig_1 . CDS 400 490 . . . ID=tag_0005;Other_info;locus_tag=tag_0005 +contig_1 . CDS 500 590 . . . ID=tag_0006;Other_info;locus_tag=tag_0006 +contig_1 . CDS 591 592 . . . ID=tag_0007;Other_info;locus_tag=tag_0007 +##FASTA +>contig_1 +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCTCTTCCGATCTAATCAAGATTGAGAGGAATTGCTGTTTTTATTGGCAAGACAATTTTACTCTTCCGATCTAATCAAGATTGAGAGGAATTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN \ No newline at end of file diff --git a/functional_tests/test_data/genome_single_chrom_larger_refound_2.gff b/functional_tests/test_data/genome_single_chrom_larger_refound_2.gff new file mode 100644 index 0000000..7185660 --- /dev/null +++ b/functional_tests/test_data/genome_single_chrom_larger_refound_2.gff @@ -0,0 +1,10 @@ +##gff-version3 +contig_1 . CDS 1 90 . . . ID=tag_0001;Other_info;locus_tag=tag_0001 +contig_1 . CDS 200 290 . . . ID=tag_0003;Other_info;locus_tag=tag_0003 +contig_1 . CDS 300 390 . . . ID=tag_0004;Other_info;locus_tag=tag_0004 +contig_1 . CDS 400 490 . . . ID=tag_0005;Other_info;locus_tag=tag_0005 +contig_1 . CDS 500 590 . . . ID=tag_0006;Other_info;locus_tag=tag_0006 +contig_1 . 
CDS 591 592 . . . ID=tag_0007;Other_info;locus_tag=tag_0007 +##FASTA +>contig_1 +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCTCTTCCGATCTAATCAAGATTGAGAGGAATTGCTGTTTTTATTGGCAAGACAATTTTACTCTTCCGATCTAATCAAGATTGAGAGGAATTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN \ No newline at end of file From 20390d241b2f74c042b443f84db3fc19522eb792 Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Wed, 26 Jan 2022 13:59:53 +1100 Subject: [PATCH 14/20] Add in so that non corrected genomes are returned with paths from prepair reannotation. Remove ID feature selection when looking for feature between gene fragments --- Corekaburra/correct_gffs.py | 5 +++-- Corekaburra/parse_gene_presence_absence.py | 4 +--- unit_tests/Corekaburra_test.py | 9 ++++++--- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/Corekaburra/correct_gffs.py b/Corekaburra/correct_gffs.py index a802c9d..f0d809f 100644 --- a/Corekaburra/correct_gffs.py +++ b/Corekaburra/correct_gffs.py @@ -71,7 +71,8 @@ def prepair_for_reannotation(gene_data_path, output_folder, gffs, logger): os.mkdir(corrected_gff_out_dir) except FileExistsError: # Get path for input - input_path = os.path.split(gffs[0])[0] + input_path_dict = {os.path.basename(gff): os.path.split(gff)[0] for gff in gffs} + # input_path = os.path.split(gffs[0])[0] corrected_folder_content = os.listdir(corrected_gff_out_dir) @@ -84,7 +85,7 @@ def prepair_for_reannotation(gene_data_path, output_folder, gffs, logger): if len(corrected_files) > 0: gffs = [file for file in gff_names if 
f'{file.replace(".gff", "")}_corrected.gff' not in corrected_files] - gffs = [os.path.join(input_path, gff) for gff in gffs] + gffs = [os.path.join(input_path_dict[gff], gff) for gff in gffs] gffs = gffs + corrected_files_w_path return gene_data_dict, corrected_gff_out_dir, gffs diff --git a/Corekaburra/parse_gene_presence_absence.py b/Corekaburra/parse_gene_presence_absence.py index e8dc131..4b4764b 100644 --- a/Corekaburra/parse_gene_presence_absence.py +++ b/Corekaburra/parse_gene_presence_absence.py @@ -53,7 +53,6 @@ def check_fragmented_gene(fragment_info, input_gffs, tmp_folder_path, gene_data_ if f"{gff}_corrected" in [os.path.basename(gff_name), os.path.basename(gff_name).rsplit('.', 1)[0], os.path.basename(gff_name).rsplit('.', 1)[0].rsplit('.', 1)[0]]][0] - print('HERE') except IndexError: pass @@ -116,14 +115,13 @@ def check_fragmented_gene(fragment_info, input_gffs, tmp_folder_path, gene_data_ region = (first_fragment_contig, min_frag_coor, max_frag_coor) # Find all features that are completely within the region - region_features = gff_database.region(region=region, completely_within=True, featuretype=['ID']) + region_features = gff_database.region(region=region, completely_within=True) # Find if some pieces are refound and change old_locus_tag to ID refound_pieces = [[i, fragment_piece] for i, fragment_piece in enumerate(fragment_pieces) if 'refound' in fragment_piece] if refound_pieces: for i, piece in refound_pieces: fragment_pieces[i] = gff_database[piece]['ID'][0] - # find all genes that are not part of the fragmented gene region_locus_tags = set([feature[8]['locus_tag'][0] for feature in region_features]) excess_genes = region_locus_tags.difference(fragment_pieces) diff --git a/unit_tests/Corekaburra_test.py b/unit_tests/Corekaburra_test.py index 313b5fb..0d386ff 100644 --- a/unit_tests/Corekaburra_test.py +++ b/unit_tests/Corekaburra_test.py @@ -892,13 +892,15 @@ def test_no_files_annotated(self): self.assertEqual(input_gffs, 
corrected_files_return) def test_some_files_annotated(self): - input_gffs = ['Mock_1.gff', 'Mock_2.gff'] + input_gffs = ['mock/test/path/Mock_1.gff', 'mock/test/path/Mock_2.gff', 'Mocky/mock/mock/path/Mock_3.gff'] gene_data_dict_return, corrected_gff_out_dir_return, corrected_files_return = correct_gffs.prepair_for_reannotation( 'TestPrepairForReannotation/Mock_gene_data.csv', 'TestPrepairForReannotation/Some_genomes', input_gffs, self.logger) - expected_gffs = ['Mock_2.gff', 'Mock_1_corrected.gff'] + expected_gffs = ['mock/test/path/Mock_2.gff', + 'Mocky/mock/mock/path/Mock_3.gff', + 'TestPrepairForReannotation/Some_genomes/Corrected_gff_files/Mock_1_corrected.gff'] self.assertEqual(expected_gffs, corrected_files_return) @@ -909,7 +911,8 @@ def test_all_files_annotated(self): 'TestPrepairForReannotation/All_genomes', input_gffs, self.logger) - expected_gffs = ['Mock_1_corrected.gff', 'Mock_2_corrected.gff'] + expected_gffs = ['TestPrepairForReannotation/All_genomes/Corrected_gff_files/Mock_1_corrected.gff', + 'TestPrepairForReannotation/All_genomes/Corrected_gff_files/Mock_2_corrected.gff'] self.assertEqual(expected_gffs, corrected_files_return) From 66e72ca59a471e07e71c3a571230d82f671f0840 Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Wed, 26 Jan 2022 14:59:02 +1100 Subject: [PATCH 15/20] Change result for funcitonal tests with new changes --- functional_tests/Corekaburra-test.sh | 20 +++++++++---------- .../core_pair_summary.csv.expected | 4 ++-- .../low_frequency_gene_placement.tsv.expected | 8 ++++---- .../core_pair_summary.csv.expected | 4 ++-- .../low_frequency_gene_placement.tsv.expected | 10 +++++----- .../core_pair_summary.csv.expected | 4 ++-- .../low_frequency_gene_placement.tsv.expected | 6 +++--- .../core_pair_summary.csv.expected | 2 +- .../low_frequency_gene_placement.tsv.expected | 6 +++--- .../core_pair_summary.csv.expected | 4 ++-- .../low_frequency_gene_placement.tsv.expected | 12 +++++------ 
.../core_pair_summary.csv.expected | 4 ++-- .../low_frequency_gene_placement.tsv.expected | 6 +++--- .../core_pair_summary.csv.expected | 4 ++-- .../low_frequency_gene_placement.tsv.expected | 4 ++-- .../core_pair_summary.csv.expected | 4 ++-- .../low_frequency_gene_placement.tsv.expected | 4 ++-- .../core_pair_summary.csv.expected | 2 +- .../low_frequency_gene_placement.tsv.expected | 6 +++--- .../core_pair_summary.csv.expected | 2 +- .../low_frequency_gene_placement.tsv.expected | 4 ++-- .../core_pair_summary.csv.expected | 4 ++-- .../low_frequency_gene_placement.tsv.expected | 6 +++--- ...e_core_accessory_gene_content.tsv.expected | 3 ++- .../core_pair_summary.csv.expected | 5 +++-- .../low_frequency_gene_placement.tsv.expected | 5 +++-- 26 files changed, 73 insertions(+), 70 deletions(-) diff --git a/functional_tests/Corekaburra-test.sh b/functional_tests/Corekaburra-test.sh index 40fc5c6..ce81743 100755 --- a/functional_tests/Corekaburra-test.sh +++ b/functional_tests/Corekaburra-test.sh @@ -376,10 +376,10 @@ rm -r test_out_folder # TODO - Test with a genome that have been corrected and one that have not - with fragmented refound gene (Resume run) call_new_test "Test with a genome that have been corrected and one that have not (Resume run)" Corekaburra -ig genome_single_chrom_larger_refound_2.gff genome_single_chrom_larger_refound.gff genome_single_chrom_larger_rearrange.gff complete_genome_single_chrom_2.gff -ip Resume_refound_run_fragment/ -o Resume_refound_run_fragment/ > /dev/null 2>&1 -test_output_file Resume_refound_run_fragment/core_core_accessory_gene_content.tsv fragmented_refound_core_gene_expected/core_core_accessory_gene_content.tsv.expected -test_output_file Resume_refound_run_fragment/low_frequency_gene_placement.tsv fragmented_refound_core_gene_expected/low_frequency_gene_placement.tsv.expected -test_output_file Resume_refound_run_fragment/core_pair_summary.csv fragmented_refound_core_gene_expected/core_pair_summary.csv.expected 
-test_output_file Resume_refound_run_fragment/Corrected_gff_files/genome_single_chrom_larger_refound_2_corrected.gff fragmented_refound_core_gene_expected/Corrected_gff_files/genome_single_chrom_larger_refound_2_corrected.gff.expected +test_output_file Resume_refound_run_fragment/core_core_accessory_gene_content.tsv Resume_refound_run_fragment/core_core_accessory_gene_content.tsv.expected +test_output_file Resume_refound_run_fragment/low_frequency_gene_placement.tsv Resume_refound_run_fragment/low_frequency_gene_placement.tsv.expected +test_output_file Resume_refound_run_fragment/core_pair_summary.csv Resume_refound_run_fragment/core_pair_summary.csv.expected +test_output_file Resume_refound_run_fragment/Corrected_gff_files/genome_single_chrom_larger_refound_2_corrected.gff Resume_refound_run_fragment/Corrected_gff_files/genome_single_chrom_larger_refound_2_corrected.gff.expected rm Resume_refound_run_fragment/Corrected_gff_files/genome_single_chrom_larger_refound_2_corrected.gff rm Resume_refound_run_fragment/low_frequency_gene_placement.tsv rm Resume_refound_run_fragment/core_core_accessory_gene_content.tsv @@ -389,9 +389,9 @@ rm Resume_refound_run_fragment/Corekaburra.log # TODO!! 
- Test with all genomes that have been corrected (Resume run) call_new_test "Test with all genomes that have been corrected (Resume run)" Corekaburra -ig genome_single_chrom_larger_refound_2.gff genome_single_chrom_larger_refound.gff genome_single_chrom_larger_rearrange.gff complete_genome_single_chrom_2.gff -ip Resume_all_found_gene_refound_run_fragment/ -o Resume_all_found_gene_refound_run_fragment/ > /dev/null 2>&1 -test_output_file Resume_all_found_gene_refound_run_fragment/core_core_accessory_gene_content.tsv fragmented_refound_core_gene_expected/core_core_accessory_gene_content.tsv.expected -test_output_file Resume_all_found_gene_refound_run_fragment/low_frequency_gene_placement.tsv fragmented_refound_core_gene_expected/low_frequency_gene_placement.tsv.expected -test_output_file Resume_all_found_gene_refound_run_fragment/core_pair_summary.csv fragmented_refound_core_gene_expected/core_pair_summary.csv.expected +test_output_file Resume_all_found_gene_refound_run_fragment/core_core_accessory_gene_content.tsv Resume_all_found_gene_refound_run_fragment/core_core_accessory_gene_content.tsv.expected +test_output_file Resume_all_found_gene_refound_run_fragment/low_frequency_gene_placement.tsv Resume_all_found_gene_refound_run_fragment/low_frequency_gene_placement.tsv.expected +test_output_file Resume_all_found_gene_refound_run_fragment/core_pair_summary.csv Resume_all_found_gene_refound_run_fragment/core_pair_summary.csv.expected rm Resume_all_found_gene_refound_run_fragment/low_frequency_gene_placement.tsv rm Resume_all_found_gene_refound_run_fragment/core_core_accessory_gene_content.tsv rm Resume_all_found_gene_refound_run_fragment/core_pair_summary.csv @@ -400,9 +400,9 @@ rm Resume_all_found_gene_refound_run_fragment/Corekaburra.log # TODO - Test recognition of corrected gff files in output folder (Resume run) call_new_test "Test recognition of corrected gff files in output folder (Resume run)" Corekaburra -ig genome_single_chrom_larger_refound_2.gff 
genome_single_chrom_larger_refound.gff genome_single_chrom_larger_rearrange.gff complete_genome_single_chrom_2.gff -ip Resume_refound_gene/ -o Resume_refound_gene/ > /dev/null 2>&1 -test_output_file Resume_refound_gene/core_core_accessory_gene_content.tsv fragmented_refound_core_gene_expected/core_core_accessory_gene_content.tsv.expected -test_output_file Resume_refound_gene/low_frequency_gene_placement.tsv fragmented_refound_core_gene_expected/low_frequency_gene_placement.tsv.expected -test_output_file Resume_refound_gene/core_pair_summary.csv fragmented_refound_core_gene_expected/core_pair_summary.csv.expected +test_output_file Resume_refound_gene/core_core_accessory_gene_content.tsv Resume_refound_gene/core_core_accessory_gene_content.tsv.expected +test_output_file Resume_refound_gene/low_frequency_gene_placement.tsv Resume_refound_gene/low_frequency_gene_placement.tsv.expected +test_output_file Resume_refound_gene/core_pair_summary.csv Resume_refound_gene/core_pair_summary.csv.expected rm Resume_refound_gene/low_frequency_gene_placement.tsv rm Resume_refound_gene/core_core_accessory_gene_content.tsv rm Resume_refound_gene/core_pair_summary.csv diff --git a/functional_tests/test_data/Fragmented_accessory_gene_run_expected/core_pair_summary.csv.expected b/functional_tests/test_data/Fragmented_accessory_gene_run_expected/core_pair_summary.csv.expected index f4afb24..3cae9f5 100644 --- a/functional_tests/test_data/Fragmented_accessory_gene_run_expected/core_pair_summary.csv.expected +++ b/functional_tests/test_data/Fragmented_accessory_gene_run_expected/core_pair_summary.csv.expected @@ -1,4 +1,4 @@ Core_pair,n,occurrence_core_1,occurrence_core_2,co_occurrence,min_dist,max_dist,mean_dist,median_dist,min_acc,max_acc,mean_acc,median_acc A-C,4,4,4,4,109,109,109.0,109.0,1,1,1.0,1.0 -A-Sequence_break,4,4,0,0,0,0,0.0,0.0,0,0,0.0,0.0 -C-Sequence_break,4,4,0,0,10,310,160.0,160.0,0,3,1.5,1.5 +C-Sequence_break,4,4,0,0,10,310,160.0,160.0,0,3,1.5,1.5 
+Sequence_break-A,4,4,0,0,0,0,0.0,0.0,0,0,0.0,0.0 diff --git a/functional_tests/test_data/Fragmented_accessory_gene_run_expected/low_frequency_gene_placement.tsv.expected b/functional_tests/test_data/Fragmented_accessory_gene_run_expected/low_frequency_gene_placement.tsv.expected index e8df429..530a15b 100644 --- a/functional_tests/test_data/Fragmented_accessory_gene_run_expected/low_frequency_gene_placement.tsv.expected +++ b/functional_tests/test_data/Fragmented_accessory_gene_run_expected/low_frequency_gene_placement.tsv.expected @@ -3,11 +3,11 @@ complete_genome_single_chrom A C 109 1 complete_genome_single_chrom_2 A C 109 1 genome_single_chrom_larger A C 109 1 genome_single_chrom_larger_rearrange A C 109 1 -complete_genome_single_chrom A Sequence_break 0 0 -complete_genome_single_chrom_2 A Sequence_break 0 0 -genome_single_chrom_larger A Sequence_break 0 0 -genome_single_chrom_larger_rearrange A Sequence_break 0 0 complete_genome_single_chrom C Sequence_break 10 0 complete_genome_single_chrom_2 C Sequence_break 10 0 genome_single_chrom_larger C Sequence_break 310 3 genome_single_chrom_larger_rearrange C Sequence_break 310 3 +complete_genome_single_chrom Sequence_break A 0 0 +complete_genome_single_chrom_2 Sequence_break A 0 0 +genome_single_chrom_larger Sequence_break A 0 0 +genome_single_chrom_larger_rearrange Sequence_break A 0 0 diff --git a/functional_tests/test_data/Fragmented_core_run_expected/core_pair_summary.csv.expected b/functional_tests/test_data/Fragmented_core_run_expected/core_pair_summary.csv.expected index 6167deb..d5a9336 100644 --- a/functional_tests/test_data/Fragmented_core_run_expected/core_pair_summary.csv.expected +++ b/functional_tests/test_data/Fragmented_core_run_expected/core_pair_summary.csv.expected @@ -1,4 +1,4 @@ Core_pair,n,occurrence_core_1,occurrence_core_2,co_occurrence,min_dist,max_dist,mean_dist,median_dist,min_acc,max_acc,mean_acc,median_acc A-C,4,4,4,4,9,109,84.0,109.0,0,1,0.8,1.0 
-A-Sequence_break,4,4,0,0,0,0,0.0,0.0,0,0,0.0,0.0 -C-Sequence_break,4,4,0,0,10,310,160.0,160.0,0,4,1.8,1.5 +C-Sequence_break,4,4,0,0,10,310,160.0,160.0,0,4,1.8,1.5 +A-Sequence_break,4,0,4,0,0,0,0.0,0.0,0,0,0.0,0.0 diff --git a/functional_tests/test_data/Fragmented_core_run_expected/low_frequency_gene_placement.tsv.expected b/functional_tests/test_data/Fragmented_core_run_expected/low_frequency_gene_placement.tsv.expected index 5558b6d..6405fe0 100644 --- a/functional_tests/test_data/Fragmented_core_run_expected/low_frequency_gene_placement.tsv.expected +++ b/functional_tests/test_data/Fragmented_core_run_expected/low_frequency_gene_placement.tsv.expected @@ -3,11 +3,11 @@ complete_genome_single_chrom A C 109 1 complete_genome_single_chrom_2 A C 109 1 genome_single_chrom_larger_2 A C 9 0 genome_single_chrom_larger_rearrange A C 109 1 -complete_genome_single_chrom A Sequence_break 0 0 -complete_genome_single_chrom_2 A Sequence_break 0 0 -genome_single_chrom_larger_2 A Sequence_break 0 0 -genome_single_chrom_larger_rearrange A Sequence_break 0 0 complete_genome_single_chrom C Sequence_break 10 0 complete_genome_single_chrom_2 C Sequence_break 10 0 genome_single_chrom_larger_2 C Sequence_break 310 4 -genome_single_chrom_larger_rearrange C Sequence_break 310 3 +genome_single_chrom_larger_rearrange C Sequence_break 310 3 +complete_genome_single_chrom Sequence_break A 0 0 +complete_genome_single_chrom_2 Sequence_break A 0 0 +genome_single_chrom_larger_2 Sequence_break A 0 0 +genome_single_chrom_larger_rearrange Sequence_break A 0 0 diff --git a/functional_tests/test_data/Increase_low_cutoff_expected/core_pair_summary.csv.expected b/functional_tests/test_data/Increase_low_cutoff_expected/core_pair_summary.csv.expected index 551faf3..45dace7 100644 --- a/functional_tests/test_data/Increase_low_cutoff_expected/core_pair_summary.csv.expected +++ b/functional_tests/test_data/Increase_low_cutoff_expected/core_pair_summary.csv.expected @@ -1,4 +1,4 @@ 
Core_pair,n,occurrence_core_1,occurrence_core_2,co_occurrence,min_dist,max_dist,mean_dist,median_dist,min_acc,max_acc,mean_acc,median_acc A-C,3,3,3,3,109,109,109.0,109.0,1,1,1.0,1.0 -A-Sequence_break,3,3,0,0,0,0,0.0,0.0,0,0,0.0,0.0 -C-Sequence_break,3,3,0,0,10,310,110.0,10.0,0,4,1.3,0.0 +C-Sequence_break,3,3,0,0,10,310,110.0,10.0,0,4,1.3,0.0 +Sequence_break-A,3,0,3,0,0,0,0.0,0.0,0,0,0.0,0.0 diff --git a/functional_tests/test_data/Increase_low_cutoff_expected/low_frequency_gene_placement.tsv.expected b/functional_tests/test_data/Increase_low_cutoff_expected/low_frequency_gene_placement.tsv.expected index e0313a2..e88cbe7 100644 --- a/functional_tests/test_data/Increase_low_cutoff_expected/low_frequency_gene_placement.tsv.expected +++ b/functional_tests/test_data/Increase_low_cutoff_expected/low_frequency_gene_placement.tsv.expected @@ -2,9 +2,9 @@ Gff Core_gene_1 Core_gene_2 Core_region_size Core_region_accessory_count complete_genome_single_chrom A C 109 1 complete_genome_single_chrom_2 A C 109 1 genome_single_chrom_larger A C 109 1 -complete_genome_single_chrom A Sequence_break 0 0 -complete_genome_single_chrom_2 A Sequence_break 0 0 -genome_single_chrom_larger A Sequence_break 0 0 complete_genome_single_chrom C Sequence_break 10 0 complete_genome_single_chrom_2 C Sequence_break 10 0 genome_single_chrom_larger C Sequence_break 310 4 +complete_genome_single_chrom Sequence_break A 0 0 +complete_genome_single_chrom_2 Sequence_break A 0 0 +genome_single_chrom_larger Sequence_break A 0 0 diff --git a/functional_tests/test_data/Less_than_all_gffs_run_expected/core_pair_summary.csv.expected b/functional_tests/test_data/Less_than_all_gffs_run_expected/core_pair_summary.csv.expected index bac3707..b9225b3 100644 --- a/functional_tests/test_data/Less_than_all_gffs_run_expected/core_pair_summary.csv.expected +++ b/functional_tests/test_data/Less_than_all_gffs_run_expected/core_pair_summary.csv.expected @@ -1,9 +1,9 @@ 
Core_pair,n,occurrence_core_1,occurrence_core_2,co_occurrence,min_dist,max_dist,mean_dist,median_dist,min_acc,max_acc,mean_acc,median_acc A-B,2,4,3,3,9,9,9.0,9.0,0,0,0.0,0.0 A-C,1,4,4,4,109,109,109.0,109.0,1,1,1.0,1.0 -A-Sequence_break,3,4,0,0,0,0,0.0,0.0,0,0,0.0,0.0 B-C,3,3,4,3,9,9,9.0,9.0,0,0,0.0,0.0 B-E,1,3,3,2,9,9,9.0,9.0,0,0,0.0,0.0 C-E,1,4,3,3,109,109,109.0,109.0,1,1,1.0,1.0 C-Sequence_break,1,4,0,0,10,10,10.0,10.0,0,0,0.0,0.0 E-Sequence_break,2,3,0,0,110,110,110.0,110.0,1,2,1.5,1.5 +Sequence_break-A,3,0,4,0,0,0,0.0,0.0,0,0,0.0,0.0 \ No newline at end of file diff --git a/functional_tests/test_data/Less_than_all_gffs_run_expected/low_frequency_gene_placement.tsv.expected b/functional_tests/test_data/Less_than_all_gffs_run_expected/low_frequency_gene_placement.tsv.expected index 929708d..46da0ae 100644 --- a/functional_tests/test_data/Less_than_all_gffs_run_expected/low_frequency_gene_placement.tsv.expected +++ b/functional_tests/test_data/Less_than_all_gffs_run_expected/low_frequency_gene_placement.tsv.expected @@ -2,9 +2,6 @@ Gff Core_gene_1 Core_gene_2 Core_region_size Core_region_accessory_count complete_genome_single_chrom A B 9 0 genome_single_chrom_larger A B 9 0 genome_single_chrom_larger_rearrange A C 109 1 -complete_genome_single_chrom A Sequence_break 0 0 -genome_single_chrom_larger A Sequence_break 0 0 -genome_single_chrom_larger_rearrange A Sequence_break 0 0 complete_genome_single_chrom B C 9 0 genome_single_chrom_larger B C 9 0 genome_single_chrom_larger_rearrange B C 9 0 @@ -13,3 +10,6 @@ genome_single_chrom_larger C E 109 1 complete_genome_single_chrom C Sequence_break 10 0 genome_single_chrom_larger E Sequence_break 110 2 genome_single_chrom_larger_rearrange E Sequence_break 110 1 +complete_genome_single_chrom Sequence_break A 0 0 +genome_single_chrom_larger Sequence_break A 0 0 +genome_single_chrom_larger_rearrange Sequence_break A 0 0 diff --git a/functional_tests/test_data/Multi_component_graph_expected/core_pair_summary.csv.expected 
b/functional_tests/test_data/Multi_component_graph_expected/core_pair_summary.csv.expected index 71a4959..b05e67d 100644 --- a/functional_tests/test_data/Multi_component_graph_expected/core_pair_summary.csv.expected +++ b/functional_tests/test_data/Multi_component_graph_expected/core_pair_summary.csv.expected @@ -13,8 +13,8 @@ F-G,3,3,3,3,9,9,9.0,9.0,0,0,0.0,0.0 F-H,1,3,3,3,0,0,0.0,0.0,0,0,0.0,0.0 G-H,2,3,3,3,0,0,0.0,0.0,0,0,0.0,0.0 H-L,3,3,3,3,0,0,0.0,0.0,0,0,0.0,0.0 -I-Sequence_break,3,3,0,0,0,0,0.0,0.0,0,0,0.0,0.0 J-Sequence_break,3,3,0,0,1,3,1.7,1.0,0,0,0.0,0.0 -K-Sequence_break,3,3,0,0,0,0,0.0,0.0,0,0,0.0,0.0 L-M,3,3,3,3,-700,0,-233.3,0.0,0,0,0.0,0.0 M-Sequence_break,3,3,0,0,2,698,234.0,2.0,0,0,0.0,0.0 +Sequence_break-I,3,0,3,0,0,0,0.0,0.0,0,0,0.0,0.0 +Sequence_break-K,3,0,3,0,0,0,0.0,0.0,0,0,0.0,0.0 \ No newline at end of file diff --git a/functional_tests/test_data/Multi_component_graph_expected/low_frequency_gene_placement.tsv.expected b/functional_tests/test_data/Multi_component_graph_expected/low_frequency_gene_placement.tsv.expected index 5b32628..4ae9bca 100644 --- a/functional_tests/test_data/Multi_component_graph_expected/low_frequency_gene_placement.tsv.expected +++ b/functional_tests/test_data/Multi_component_graph_expected/low_frequency_gene_placement.tsv.expected @@ -29,18 +29,18 @@ complete_genome_double_chrom_larger G H 0 0 complete_genome_double_chrom_2_larger H L 0 0 complete_genome_double_chrom_3_larger H L 0 0 complete_genome_double_chrom_larger H L 0 0 -complete_genome_double_chrom_2_larger I Sequence_break 0 0 -complete_genome_double_chrom_3_larger I Sequence_break 0 0 -complete_genome_double_chrom_larger I Sequence_break 0 0 complete_genome_double_chrom_2_larger J Sequence_break 1 0 complete_genome_double_chrom_3_larger J Sequence_break 3 0 complete_genome_double_chrom_larger J Sequence_break 1 0 -complete_genome_double_chrom_2_larger K Sequence_break 0 0 -complete_genome_double_chrom_3_larger K Sequence_break 0 0 
-complete_genome_double_chrom_larger K Sequence_break 0 0 complete_genome_double_chrom_2_larger L M 0 0 complete_genome_double_chrom_3_larger L M -700 0 complete_genome_double_chrom_larger L M 0 0 complete_genome_double_chrom_2_larger M Sequence_break 2 0 complete_genome_double_chrom_3_larger M Sequence_break 698 0 complete_genome_double_chrom_larger M Sequence_break 2 0 +complete_genome_double_chrom_2_larger Sequence_break I 0 0 +complete_genome_double_chrom_3_larger Sequence_break I 0 0 +complete_genome_double_chrom_larger Sequence_break I 0 0 +complete_genome_double_chrom_2_larger Sequence_break K 0 0 +complete_genome_double_chrom_3_larger Sequence_break K 0 0 +complete_genome_double_chrom_larger Sequence_break K 0 0 diff --git a/functional_tests/test_data/Reannotation_sucessful_expected/core_pair_summary.csv.expected b/functional_tests/test_data/Reannotation_sucessful_expected/core_pair_summary.csv.expected index 83f14d5..413f596 100644 --- a/functional_tests/test_data/Reannotation_sucessful_expected/core_pair_summary.csv.expected +++ b/functional_tests/test_data/Reannotation_sucessful_expected/core_pair_summary.csv.expected @@ -1,4 +1,4 @@ Core_pair,n,occurrence_core_1,occurrence_core_2,co_occurrence,min_dist,max_dist,mean_dist,median_dist,min_acc,max_acc,mean_acc,median_acc A-C,4,4,4,4,109,109,109.0,109.0,1,2,1.2,1.0 -A-Sequence_break,4,4,0,0,0,0,0.0,0.0,0,0,0.0,0.0 -C-Sequence_break,4,4,0,0,10,310,160.0,160.0,1,4,2.5,2.5 +C-Sequence_break,4,4,0,0,10,310,160.0,160.0,1,4,2.5,2.5 +Sequence_break-A,4,0,4,0,0,0,0.0,0.0,0,0,0.0,0.0 diff --git a/functional_tests/test_data/Reannotation_sucessful_expected/low_frequency_gene_placement.tsv.expected b/functional_tests/test_data/Reannotation_sucessful_expected/low_frequency_gene_placement.tsv.expected index b41a429..616887b 100644 --- a/functional_tests/test_data/Reannotation_sucessful_expected/low_frequency_gene_placement.tsv.expected +++ 
b/functional_tests/test_data/Reannotation_sucessful_expected/low_frequency_gene_placement.tsv.expected @@ -3,11 +3,11 @@ complete_genome_single_chrom A C 109 1 complete_genome_single_chrom_2 A C 109 2 genome_single_chrom_larger A C 109 1 genome_single_chrom_larger_rearrange A C 109 1 -complete_genome_single_chrom A Sequence_break 0 0 -complete_genome_single_chrom_2 A Sequence_break 0 0 -genome_single_chrom_larger A Sequence_break 0 0 genome_single_chrom_larger_rearrange A Sequence_break 0 0 complete_genome_single_chrom C Sequence_break 10 1 complete_genome_single_chrom_2 C Sequence_break 10 1 genome_single_chrom_larger C Sequence_break 310 4 genome_single_chrom_larger_rearrange C Sequence_break 310 4 +complete_genome_single_chrom Sequence_break A 0 0 +complete_genome_single_chrom_2 Sequence_break A 0 0 +genome_single_chrom_larger Sequence_break A 0 0 diff --git a/functional_tests/test_data/Simple_run_expected/core_pair_summary.csv.expected b/functional_tests/test_data/Simple_run_expected/core_pair_summary.csv.expected index e48e928..78fd9f1 100644 --- a/functional_tests/test_data/Simple_run_expected/core_pair_summary.csv.expected +++ b/functional_tests/test_data/Simple_run_expected/core_pair_summary.csv.expected @@ -1,5 +1,5 @@ Core_pair,n,occurrence_core_1,occurrence_core_2,co_occurrence,min_dist,max_dist,mean_dist,median_dist,min_acc,max_acc,mean_acc,median_acc A-B,2,2,2,2,9,9,9.0,9.0,0,0,0.0,0.0 -A-Sequence_break,2,2,0,0,0,0,0.0,0.0,0,0,0.0,0.0 B-C,2,2,2,2,9,9,9.0,9.0,0,0,0.0,0.0 -C-Sequence_break,2,2,0,0,10,10,10.0,10.0,0,0,0.0,0.0 +C-Sequence_break,2,2,0,0,10,10,10.0,10.0,0,0,0.0,0.0 +Sequence_break-A,2,0,2,0,0,0,0.0,0.0,0,0,0.0,0.0 diff --git a/functional_tests/test_data/Simple_run_expected/low_frequency_gene_placement.tsv.expected b/functional_tests/test_data/Simple_run_expected/low_frequency_gene_placement.tsv.expected index 9c3b77e..1357baf 100644 --- a/functional_tests/test_data/Simple_run_expected/low_frequency_gene_placement.tsv.expected +++ 
b/functional_tests/test_data/Simple_run_expected/low_frequency_gene_placement.tsv.expected @@ -1,9 +1,9 @@ Gff Core_gene_1 Core_gene_2 Core_region_size Core_region_accessory_count complete_genome_single_chrom A B 9 0 complete_genome_single_chrom_2 A B 9 0 -complete_genome_single_chrom A Sequence_break 0 0 -complete_genome_single_chrom_2 A Sequence_break 0 0 complete_genome_single_chrom B C 9 0 complete_genome_single_chrom_2 B C 9 0 complete_genome_single_chrom C Sequence_break 10 0 complete_genome_single_chrom_2 C Sequence_break 10 0 +complete_genome_single_chrom Sequence_break A 0 0 +complete_genome_single_chrom_2 Sequence_break A 0 0 diff --git a/functional_tests/test_data/Single_comple_chromosome_expected/core_pair_summary.csv.expected b/functional_tests/test_data/Single_comple_chromosome_expected/core_pair_summary.csv.expected index 57bef6d..8b628b8 100644 --- a/functional_tests/test_data/Single_comple_chromosome_expected/core_pair_summary.csv.expected +++ b/functional_tests/test_data/Single_comple_chromosome_expected/core_pair_summary.csv.expected @@ -1,6 +1,6 @@ Core_pair,n,occurrence_core_1,occurrence_core_2,co_occurrence,min_dist,max_dist,mean_dist,median_dist,min_acc,max_acc,mean_acc,median_acc A-B,2,2,2,2,9,9,9.0,9.0,0,0,0.0,0.0 A-C,1,2,2,2,10,10,10.0,10.0,0,0,0.0,0.0 -A-Sequence_break,1,2,0,0,0,0,0.0,0.0,0,0,0.0,0.0 B-C,2,2,2,2,9,9,9.0,9.0,0,0,0.0,0.0 -C-Sequence_break,1,2,0,0,10,10,10.0,10.0,0,0,0.0,0.0 +C-Sequence_break,1,2,0,0,10,10,10.0,10.0,0,0,0.0,0.0 +Sequence_break-A,1,0,2,0,0,0,0.0,0.0,0,0,0.0,0.0 diff --git a/functional_tests/test_data/Single_comple_chromosome_expected/low_frequency_gene_placement.tsv.expected b/functional_tests/test_data/Single_comple_chromosome_expected/low_frequency_gene_placement.tsv.expected index 89bad36..18b6ce2 100644 --- a/functional_tests/test_data/Single_comple_chromosome_expected/low_frequency_gene_placement.tsv.expected +++ 
b/functional_tests/test_data/Single_comple_chromosome_expected/low_frequency_gene_placement.tsv.expected @@ -2,7 +2,7 @@ Gff Core_gene_1 Core_gene_2 Core_region_size Core_region_accessory_count complete_genome_single_chrom A B 9 0 complete_genome_single_chrom_2 A B 9 0 complete_genome_single_chrom A C 10 0 -complete_genome_single_chrom_2 A Sequence_break 0 0 complete_genome_single_chrom B C 9 0 complete_genome_single_chrom_2 B C 9 0 -complete_genome_single_chrom_2 C Sequence_break 10 0 +complete_genome_single_chrom_2 C Sequence_break 10 0 +complete_genome_single_chrom_2 Sequence_break A 0 0 diff --git a/functional_tests/test_data/core_90_cutoff_expected/core_pair_summary.csv.expected b/functional_tests/test_data/core_90_cutoff_expected/core_pair_summary.csv.expected index 89a2209..67512ab 100644 --- a/functional_tests/test_data/core_90_cutoff_expected/core_pair_summary.csv.expected +++ b/functional_tests/test_data/core_90_cutoff_expected/core_pair_summary.csv.expected @@ -1,8 +1,8 @@ Core_pair,n,occurrence_core_1,occurrence_core_2,co_occurrence,min_dist,max_dist,mean_dist,median_dist,min_acc,max_acc,mean_acc,median_acc A-B,2,3,2,2,9,9,9.0,9.0,0,0,0.0,0.0 A-E,1,3,2,2,9,9,9.0,9.0,0,0,0.0,0.0 -A-Sequence_break,3,3,0,0,0,0,0.0,0.0,0,0,0.0,0.0 B-C,2,2,3,2,9,9,9.0,9.0,0,0,0.0,0.0 C-E,2,3,2,2,9,109,59.0,59.0,0,1,0.5,0.5 C-Sequence_break,2,3,0,0,10,10,10.0,10.0,0,0,0.0,0.0 E-Sequence_break,1,2,0,0,110,110,110.0,110.0,2,2,2.0,2.0 +Sequence_break-A,3,0,3,0,0,0,0.0,0.0,0,0,0.0,0.0 \ No newline at end of file diff --git a/functional_tests/test_data/core_90_cutoff_expected/low_frequency_gene_placement.tsv.expected b/functional_tests/test_data/core_90_cutoff_expected/low_frequency_gene_placement.tsv.expected index 90a8203..89a3b74 100644 --- a/functional_tests/test_data/core_90_cutoff_expected/low_frequency_gene_placement.tsv.expected +++ b/functional_tests/test_data/core_90_cutoff_expected/low_frequency_gene_placement.tsv.expected @@ -2,9 +2,6 @@ Gff Core_gene_1 Core_gene_2 
Core_region_size Core_region_accessory_count complete_genome_single_chrom A B 9 0 genome_single_chrom_larger A B 9 0 complete_genome_single_chrom_2 A E 9 0 -complete_genome_single_chrom A Sequence_break 0 0 -complete_genome_single_chrom_2 A Sequence_break 0 0 -genome_single_chrom_larger A Sequence_break 0 0 complete_genome_single_chrom B C 9 0 genome_single_chrom_larger B C 9 0 complete_genome_single_chrom_2 C E 9 0 @@ -12,3 +9,6 @@ genome_single_chrom_larger C E 109 1 complete_genome_single_chrom C Sequence_break 10 0 complete_genome_single_chrom_2 C Sequence_break 10 0 genome_single_chrom_larger E Sequence_break 110 2 +complete_genome_single_chrom Sequence_break A 0 0 +complete_genome_single_chrom_2 Sequence_break A 0 0 +genome_single_chrom_larger Sequence_break A 0 0 diff --git a/functional_tests/test_data/coreless_contig_draft_expected/core_pair_summary.csv.expected b/functional_tests/test_data/coreless_contig_draft_expected/core_pair_summary.csv.expected index e48e928..be20079 100644 --- a/functional_tests/test_data/coreless_contig_draft_expected/core_pair_summary.csv.expected +++ b/functional_tests/test_data/coreless_contig_draft_expected/core_pair_summary.csv.expected @@ -1,5 +1,5 @@ Core_pair,n,occurrence_core_1,occurrence_core_2,co_occurrence,min_dist,max_dist,mean_dist,median_dist,min_acc,max_acc,mean_acc,median_acc A-B,2,2,2,2,9,9,9.0,9.0,0,0,0.0,0.0 -A-Sequence_break,2,2,0,0,0,0,0.0,0.0,0,0,0.0,0.0 B-C,2,2,2,2,9,9,9.0,9.0,0,0,0.0,0.0 C-Sequence_break,2,2,0,0,10,10,10.0,10.0,0,0,0.0,0.0 +Sequence_break-A,2,0,2,0,0,0,0.0,0.0,0,0,0.0,0.0 diff --git a/functional_tests/test_data/coreless_contig_draft_expected/low_frequency_gene_placement.tsv.expected b/functional_tests/test_data/coreless_contig_draft_expected/low_frequency_gene_placement.tsv.expected index 0aa11ff..31959d6 100644 --- a/functional_tests/test_data/coreless_contig_draft_expected/low_frequency_gene_placement.tsv.expected +++ 
b/functional_tests/test_data/coreless_contig_draft_expected/low_frequency_gene_placement.tsv.expected @@ -1,9 +1,9 @@ Gff Core_gene_1 Core_gene_2 Core_region_size Core_region_accessory_count complete_genome_double_chrom A B 9 0 complete_genome_double_chrom_2 A B 9 0 -complete_genome_double_chrom A Sequence_break 0 0 -complete_genome_double_chrom_2 A Sequence_break 0 0 complete_genome_double_chrom B C 9 0 complete_genome_double_chrom_2 B C 9 0 complete_genome_double_chrom C Sequence_break 10 0 complete_genome_double_chrom_2 C Sequence_break 10 0 +complete_genome_double_chrom Sequence_break A 0 0 +complete_genome_double_chrom_2 Sequence_break A 0 0 \ No newline at end of file diff --git a/functional_tests/test_data/low_freq_cutoff_0_expected/core_pair_summary.csv.expected b/functional_tests/test_data/low_freq_cutoff_0_expected/core_pair_summary.csv.expected index 551faf3..45dace7 100644 --- a/functional_tests/test_data/low_freq_cutoff_0_expected/core_pair_summary.csv.expected +++ b/functional_tests/test_data/low_freq_cutoff_0_expected/core_pair_summary.csv.expected @@ -1,4 +1,4 @@ Core_pair,n,occurrence_core_1,occurrence_core_2,co_occurrence,min_dist,max_dist,mean_dist,median_dist,min_acc,max_acc,mean_acc,median_acc A-C,3,3,3,3,109,109,109.0,109.0,1,1,1.0,1.0 -A-Sequence_break,3,3,0,0,0,0,0.0,0.0,0,0,0.0,0.0 -C-Sequence_break,3,3,0,0,10,310,110.0,10.0,0,4,1.3,0.0 +C-Sequence_break,3,3,0,0,10,310,110.0,10.0,0,4,1.3,0.0 +Sequence_break-A,3,0,3,0,0,0,0.0,0.0,0,0,0.0,0.0 diff --git a/functional_tests/test_data/low_freq_cutoff_0_expected/low_frequency_gene_placement.tsv.expected b/functional_tests/test_data/low_freq_cutoff_0_expected/low_frequency_gene_placement.tsv.expected index e0313a2..e88cbe7 100644 --- a/functional_tests/test_data/low_freq_cutoff_0_expected/low_frequency_gene_placement.tsv.expected +++ b/functional_tests/test_data/low_freq_cutoff_0_expected/low_frequency_gene_placement.tsv.expected @@ -2,9 +2,9 @@ Gff Core_gene_1 Core_gene_2 Core_region_size 
Core_region_accessory_count complete_genome_single_chrom A C 109 1 complete_genome_single_chrom_2 A C 109 1 genome_single_chrom_larger A C 109 1 -complete_genome_single_chrom A Sequence_break 0 0 -complete_genome_single_chrom_2 A Sequence_break 0 0 -genome_single_chrom_larger A Sequence_break 0 0 complete_genome_single_chrom C Sequence_break 10 0 complete_genome_single_chrom_2 C Sequence_break 10 0 genome_single_chrom_larger C Sequence_break 310 4 +complete_genome_single_chrom Sequence_break A 0 0 +complete_genome_single_chrom_2 Sequence_break A 0 0 +genome_single_chrom_larger Sequence_break A 0 0 diff --git a/functional_tests/test_data/single_core_contig_draft_expected/core_core_accessory_gene_content.tsv.expected b/functional_tests/test_data/single_core_contig_draft_expected/core_core_accessory_gene_content.tsv.expected index ec996bc..bed9362 100644 --- a/functional_tests/test_data/single_core_contig_draft_expected/core_core_accessory_gene_content.tsv.expected +++ b/functional_tests/test_data/single_core_contig_draft_expected/core_core_accessory_gene_content.tsv.expected @@ -1,3 +1,4 @@ Gff Core_gene_1 Core_gene_2 gene type complete_genome_double_chrom B Sequence_break C low_frequency -complete_genome_double_chrom E Sequence_break F low_frequency +complete_genome_double_chrom E Sequence_break F low_frequency +complete_genome_double_chrom Sequence_break E D low_frequency diff --git a/functional_tests/test_data/single_core_contig_draft_expected/core_pair_summary.csv.expected b/functional_tests/test_data/single_core_contig_draft_expected/core_pair_summary.csv.expected index aacd5c6..fff00cc 100644 --- a/functional_tests/test_data/single_core_contig_draft_expected/core_pair_summary.csv.expected +++ b/functional_tests/test_data/single_core_contig_draft_expected/core_pair_summary.csv.expected @@ -1,6 +1,7 @@ Core_pair,n,occurrence_core_1,occurrence_core_2,co_occurrence,min_dist,max_dist,mean_dist,median_dist,min_acc,max_acc,mean_acc,median_acc 
A-B,2,2,2,2,9,9,9.0,9.0,0,0,0.0,0.0 -A-Sequence_break,2,2,0,0,0,0,0.0,0.0,0,0,0.0,0.0 B-E,1,2,2,2,9,9,9.0,9.0,0,0,0.0,0.0 B-Sequence_break,1,2,0,0,110,110,110.0,110.0,1,1,1.0,1.0 -E-Sequence_break,2,2,0,0,10,110,60.0,60.0,0,1,0.5,0.5 +E-Sequence_break,2,2,0,0,10,110,60.0,60.0,0,1,0.5,0.5 +Sequence_break-A,2,0,2,0,0,0,0.0,0.0,0,0,0.0,0.0 +Sequence_break-E,1,0,1,0,99,99,99.0,99.0,0,1,1.0,1.0 diff --git a/functional_tests/test_data/single_core_contig_draft_expected/low_frequency_gene_placement.tsv.expected b/functional_tests/test_data/single_core_contig_draft_expected/low_frequency_gene_placement.tsv.expected index 96337b1..3c10cbb 100644 --- a/functional_tests/test_data/single_core_contig_draft_expected/low_frequency_gene_placement.tsv.expected +++ b/functional_tests/test_data/single_core_contig_draft_expected/low_frequency_gene_placement.tsv.expected @@ -1,9 +1,10 @@ Gff Core_gene_1 Core_gene_2 Core_region_size Core_region_accessory_count complete_genome_double_chrom A B 9 0 complete_genome_single_chrom A B 9 0 -complete_genome_double_chrom A Sequence_break 0 0 -complete_genome_single_chrom A Sequence_break 0 0 complete_genome_single_chrom B E 9 0 complete_genome_double_chrom B Sequence_break 110 1 complete_genome_double_chrom E Sequence_break 110 1 complete_genome_single_chrom E Sequence_break 10 0 +complete_genome_double_chrom A Sequence_break 0 0 +complete_genome_single_chrom A Sequence_break 0 0 +complete_genome_single_chrom E Sequence_break 99 1 From 8b1206f51b0e9dc736e5e56f0a0f84157688c377 Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Wed, 26 Jan 2022 15:18:20 +1100 Subject: [PATCH 16/20] Small changes to expected files for functional tests --- .../core_pair_summary.csv.expected | 2 +- .../core_pair_summary.csv.expected | 2 +- .../low_frequency_gene_placement.tsv.expected | 2 +- .../core_pair_summary.csv.expected | 2 +- .../core_pair_summary.csv.expected | 2 +- .../core_pair_summary.csv.expected | 4 ++-- 
.../core_pair_summary.csv.expected | 2 +- .../low_frequency_gene_placement.tsv.expected | 2 +- .../Simple_run_expected/core_pair_summary.csv.expected | 2 +- .../core_pair_summary.csv.expected | 2 +- .../low_frequency_gene_placement.tsv.expected | 2 +- .../core_90_cutoff_expected/core_pair_summary.csv.expected | 2 +- .../core_pair_summary.csv.expected | 2 +- .../low_frequency_gene_placement.tsv.expected | 2 +- .../low_freq_cutoff_0_expected/core_pair_summary.csv.expected | 2 +- .../core_core_accessory_gene_content.tsv.expected | 2 +- .../core_pair_summary.csv.expected | 4 ++-- 17 files changed, 19 insertions(+), 19 deletions(-) diff --git a/functional_tests/test_data/Fragmented_accessory_gene_run_expected/core_pair_summary.csv.expected b/functional_tests/test_data/Fragmented_accessory_gene_run_expected/core_pair_summary.csv.expected index 3cae9f5..fb54173 100644 --- a/functional_tests/test_data/Fragmented_accessory_gene_run_expected/core_pair_summary.csv.expected +++ b/functional_tests/test_data/Fragmented_accessory_gene_run_expected/core_pair_summary.csv.expected @@ -1,4 +1,4 @@ Core_pair,n,occurrence_core_1,occurrence_core_2,co_occurrence,min_dist,max_dist,mean_dist,median_dist,min_acc,max_acc,mean_acc,median_acc A-C,4,4,4,4,109,109,109.0,109.0,1,1,1.0,1.0 -C-Sequence_break,4,4,0,0,10,310,160.0,160.0,0,3,1.5,1.5 +C-Sequence_break,4,4,0,0,10,310,160.0,160.0,0,3,1.5,1.5 Sequence_break-A,4,4,0,0,0,0,0.0,0.0,0,0,0.0,0.0 diff --git a/functional_tests/test_data/Fragmented_core_run_expected/core_pair_summary.csv.expected b/functional_tests/test_data/Fragmented_core_run_expected/core_pair_summary.csv.expected index d5a9336..82662bf 100644 --- a/functional_tests/test_data/Fragmented_core_run_expected/core_pair_summary.csv.expected +++ b/functional_tests/test_data/Fragmented_core_run_expected/core_pair_summary.csv.expected @@ -1,4 +1,4 @@ Core_pair,n,occurrence_core_1,occurrence_core_2,co_occurrence,min_dist,max_dist,mean_dist,median_dist,min_acc,max_acc,mean_acc,median_acc 
A-C,4,4,4,4,9,109,84.0,109.0,0,1,0.8,1.0 -C-Sequence_break,4,4,0,0,10,310,160.0,160.0,0,4,1.8,1.5 +C-Sequence_break,4,4,0,0,10,310,160.0,160.0,0,4,1.8,1.5 A-Sequence_break,4,0,4,0,0,0,0.0,0.0,0,0,0.0,0.0 diff --git a/functional_tests/test_data/Fragmented_core_run_expected/low_frequency_gene_placement.tsv.expected b/functional_tests/test_data/Fragmented_core_run_expected/low_frequency_gene_placement.tsv.expected index 6405fe0..73993a4 100644 --- a/functional_tests/test_data/Fragmented_core_run_expected/low_frequency_gene_placement.tsv.expected +++ b/functional_tests/test_data/Fragmented_core_run_expected/low_frequency_gene_placement.tsv.expected @@ -6,7 +6,7 @@ genome_single_chrom_larger_rearrange A C 109 1 complete_genome_single_chrom C Sequence_break 10 0 complete_genome_single_chrom_2 C Sequence_break 10 0 genome_single_chrom_larger_2 C Sequence_break 310 4 -genome_single_chrom_larger_rearrange C Sequence_break 310 3 +genome_single_chrom_larger_rearrange C Sequence_break 310 3 complete_genome_single_chrom Sequence_break A 0 0 complete_genome_single_chrom_2 Sequence_break A 0 0 genome_single_chrom_larger_2 Sequence_break A 0 0 diff --git a/functional_tests/test_data/Increase_low_cutoff_expected/core_pair_summary.csv.expected b/functional_tests/test_data/Increase_low_cutoff_expected/core_pair_summary.csv.expected index 45dace7..e7b2b7f 100644 --- a/functional_tests/test_data/Increase_low_cutoff_expected/core_pair_summary.csv.expected +++ b/functional_tests/test_data/Increase_low_cutoff_expected/core_pair_summary.csv.expected @@ -1,4 +1,4 @@ Core_pair,n,occurrence_core_1,occurrence_core_2,co_occurrence,min_dist,max_dist,mean_dist,median_dist,min_acc,max_acc,mean_acc,median_acc A-C,3,3,3,3,109,109,109.0,109.0,1,1,1.0,1.0 -C-Sequence_break,3,3,0,0,10,310,110.0,10.0,0,4,1.3,0.0 +C-Sequence_break,3,3,0,0,10,310,110.0,10.0,0,4,1.3,0.0 Sequence_break-A,3,0,3,0,0,0,0.0,0.0,0,0,0.0,0.0 diff --git 
a/functional_tests/test_data/Less_than_all_gffs_run_expected/core_pair_summary.csv.expected b/functional_tests/test_data/Less_than_all_gffs_run_expected/core_pair_summary.csv.expected index b9225b3..fe68367 100644 --- a/functional_tests/test_data/Less_than_all_gffs_run_expected/core_pair_summary.csv.expected +++ b/functional_tests/test_data/Less_than_all_gffs_run_expected/core_pair_summary.csv.expected @@ -6,4 +6,4 @@ B-E,1,3,3,2,9,9,9.0,9.0,0,0,0.0,0.0 C-E,1,4,3,3,109,109,109.0,109.0,1,1,1.0,1.0 C-Sequence_break,1,4,0,0,10,10,10.0,10.0,0,0,0.0,0.0 E-Sequence_break,2,3,0,0,110,110,110.0,110.0,1,2,1.5,1.5 -Sequence_break-A,3,0,4,0,0,0,0.0,0.0,0,0,0.0,0.0 \ No newline at end of file +Sequence_break-A,3,0,4,0,0,0,0.0,0.0,0,0,0.0,0.0 diff --git a/functional_tests/test_data/Multi_component_graph_expected/core_pair_summary.csv.expected b/functional_tests/test_data/Multi_component_graph_expected/core_pair_summary.csv.expected index b05e67d..fea2c4f 100644 --- a/functional_tests/test_data/Multi_component_graph_expected/core_pair_summary.csv.expected +++ b/functional_tests/test_data/Multi_component_graph_expected/core_pair_summary.csv.expected @@ -16,5 +16,5 @@ H-L,3,3,3,3,0,0,0.0,0.0,0,0,0.0,0.0 J-Sequence_break,3,3,0,0,1,3,1.7,1.0,0,0,0.0,0.0 L-M,3,3,3,3,-700,0,-233.3,0.0,0,0,0.0,0.0 M-Sequence_break,3,3,0,0,2,698,234.0,2.0,0,0,0.0,0.0 -Sequence_break-I,3,0,3,0,0,0,0.0,0.0,0,0,0.0,0.0 -Sequence_break-K,3,0,3,0,0,0,0.0,0.0,0,0,0.0,0.0 \ No newline at end of file +Sequence_break-I,3,0,3,0,0,0,0.0,0.0,0,0,0.0,0.0 +Sequence_break-K,3,0,3,0,0,0,0.0,0.0,0,0,0.0,0.0 diff --git a/functional_tests/test_data/Reannotation_sucessful_expected/core_pair_summary.csv.expected b/functional_tests/test_data/Reannotation_sucessful_expected/core_pair_summary.csv.expected index 413f596..ff0d453 100644 --- a/functional_tests/test_data/Reannotation_sucessful_expected/core_pair_summary.csv.expected +++ b/functional_tests/test_data/Reannotation_sucessful_expected/core_pair_summary.csv.expected @@ 
-1,4 +1,4 @@ Core_pair,n,occurrence_core_1,occurrence_core_2,co_occurrence,min_dist,max_dist,mean_dist,median_dist,min_acc,max_acc,mean_acc,median_acc A-C,4,4,4,4,109,109,109.0,109.0,1,2,1.2,1.0 -C-Sequence_break,4,4,0,0,10,310,160.0,160.0,1,4,2.5,2.5 +C-Sequence_break,4,4,0,0,10,310,160.0,160.0,1,4,2.5,2.5 Sequence_break-A,4,0,4,0,0,0,0.0,0.0,0,0,0.0,0.0 diff --git a/functional_tests/test_data/Reannotation_sucessful_expected/low_frequency_gene_placement.tsv.expected b/functional_tests/test_data/Reannotation_sucessful_expected/low_frequency_gene_placement.tsv.expected index 616887b..5bf8eaa 100644 --- a/functional_tests/test_data/Reannotation_sucessful_expected/low_frequency_gene_placement.tsv.expected +++ b/functional_tests/test_data/Reannotation_sucessful_expected/low_frequency_gene_placement.tsv.expected @@ -3,7 +3,6 @@ complete_genome_single_chrom A C 109 1 complete_genome_single_chrom_2 A C 109 2 genome_single_chrom_larger A C 109 1 genome_single_chrom_larger_rearrange A C 109 1 -genome_single_chrom_larger_rearrange A Sequence_break 0 0 complete_genome_single_chrom C Sequence_break 10 1 complete_genome_single_chrom_2 C Sequence_break 10 1 genome_single_chrom_larger C Sequence_break 310 4 @@ -11,3 +10,4 @@ genome_single_chrom_larger_rearrange C Sequence_break 310 4 complete_genome_single_chrom Sequence_break A 0 0 complete_genome_single_chrom_2 Sequence_break A 0 0 genome_single_chrom_larger Sequence_break A 0 0 +genome_single_chrom_larger_rearrange Sequence_break A 0 0 diff --git a/functional_tests/test_data/Simple_run_expected/core_pair_summary.csv.expected b/functional_tests/test_data/Simple_run_expected/core_pair_summary.csv.expected index 78fd9f1..0ad556a 100644 --- a/functional_tests/test_data/Simple_run_expected/core_pair_summary.csv.expected +++ b/functional_tests/test_data/Simple_run_expected/core_pair_summary.csv.expected @@ -1,5 +1,5 @@ 
Core_pair,n,occurrence_core_1,occurrence_core_2,co_occurrence,min_dist,max_dist,mean_dist,median_dist,min_acc,max_acc,mean_acc,median_acc A-B,2,2,2,2,9,9,9.0,9.0,0,0,0.0,0.0 B-C,2,2,2,2,9,9,9.0,9.0,0,0,0.0,0.0 -C-Sequence_break,2,2,0,0,10,10,10.0,10.0,0,0,0.0,0.0 +C-Sequence_break,2,2,0,0,10,10,10.0,10.0,0,0,0.0,0.0 Sequence_break-A,2,0,2,0,0,0,0.0,0.0,0,0,0.0,0.0 diff --git a/functional_tests/test_data/Single_comple_chromosome_expected/core_pair_summary.csv.expected b/functional_tests/test_data/Single_comple_chromosome_expected/core_pair_summary.csv.expected index 8b628b8..318be40 100644 --- a/functional_tests/test_data/Single_comple_chromosome_expected/core_pair_summary.csv.expected +++ b/functional_tests/test_data/Single_comple_chromosome_expected/core_pair_summary.csv.expected @@ -2,5 +2,5 @@ Core_pair,n,occurrence_core_1,occurrence_core_2,co_occurrence,min_dist,max_dist, A-B,2,2,2,2,9,9,9.0,9.0,0,0,0.0,0.0 A-C,1,2,2,2,10,10,10.0,10.0,0,0,0.0,0.0 B-C,2,2,2,2,9,9,9.0,9.0,0,0,0.0,0.0 -C-Sequence_break,1,2,0,0,10,10,10.0,10.0,0,0,0.0,0.0 +C-Sequence_break,1,2,0,0,10,10,10.0,10.0,0,0,0.0,0.0 Sequence_break-A,1,0,2,0,0,0,0.0,0.0,0,0,0.0,0.0 diff --git a/functional_tests/test_data/Single_comple_chromosome_expected/low_frequency_gene_placement.tsv.expected b/functional_tests/test_data/Single_comple_chromosome_expected/low_frequency_gene_placement.tsv.expected index 18b6ce2..88d24e9 100644 --- a/functional_tests/test_data/Single_comple_chromosome_expected/low_frequency_gene_placement.tsv.expected +++ b/functional_tests/test_data/Single_comple_chromosome_expected/low_frequency_gene_placement.tsv.expected @@ -4,5 +4,5 @@ complete_genome_single_chrom_2 A B 9 0 complete_genome_single_chrom A C 10 0 complete_genome_single_chrom B C 9 0 complete_genome_single_chrom_2 B C 9 0 -complete_genome_single_chrom_2 C Sequence_break 10 0 +complete_genome_single_chrom_2 C Sequence_break 10 0 complete_genome_single_chrom_2 Sequence_break A 0 0 diff --git 
a/functional_tests/test_data/core_90_cutoff_expected/core_pair_summary.csv.expected b/functional_tests/test_data/core_90_cutoff_expected/core_pair_summary.csv.expected index 67512ab..fcbf02e 100644 --- a/functional_tests/test_data/core_90_cutoff_expected/core_pair_summary.csv.expected +++ b/functional_tests/test_data/core_90_cutoff_expected/core_pair_summary.csv.expected @@ -5,4 +5,4 @@ B-C,2,2,3,2,9,9,9.0,9.0,0,0,0.0,0.0 C-E,2,3,2,2,9,109,59.0,59.0,0,1,0.5,0.5 C-Sequence_break,2,3,0,0,10,10,10.0,10.0,0,0,0.0,0.0 E-Sequence_break,1,2,0,0,110,110,110.0,110.0,2,2,2.0,2.0 -Sequence_break-A,3,0,3,0,0,0,0.0,0.0,0,0,0.0,0.0 \ No newline at end of file +Sequence_break-A,3,0,3,0,0,0,0.0,0.0,0,0,0.0,0.0 diff --git a/functional_tests/test_data/coreless_contig_draft_expected/core_pair_summary.csv.expected b/functional_tests/test_data/coreless_contig_draft_expected/core_pair_summary.csv.expected index be20079..0ad556a 100644 --- a/functional_tests/test_data/coreless_contig_draft_expected/core_pair_summary.csv.expected +++ b/functional_tests/test_data/coreless_contig_draft_expected/core_pair_summary.csv.expected @@ -2,4 +2,4 @@ Core_pair,n,occurrence_core_1,occurrence_core_2,co_occurrence,min_dist,max_dist, A-B,2,2,2,2,9,9,9.0,9.0,0,0,0.0,0.0 B-C,2,2,2,2,9,9,9.0,9.0,0,0,0.0,0.0 C-Sequence_break,2,2,0,0,10,10,10.0,10.0,0,0,0.0,0.0 -Sequence_break-A,2,0,2,0,0,0,0.0,0.0,0,0,0.0,0.0 +Sequence_break-A,2,0,2,0,0,0,0.0,0.0,0,0,0.0,0.0 diff --git a/functional_tests/test_data/coreless_contig_draft_expected/low_frequency_gene_placement.tsv.expected b/functional_tests/test_data/coreless_contig_draft_expected/low_frequency_gene_placement.tsv.expected index 31959d6..8d781b1 100644 --- a/functional_tests/test_data/coreless_contig_draft_expected/low_frequency_gene_placement.tsv.expected +++ b/functional_tests/test_data/coreless_contig_draft_expected/low_frequency_gene_placement.tsv.expected @@ -6,4 +6,4 @@ complete_genome_double_chrom_2 B C 9 0 complete_genome_double_chrom C Sequence_break 10 
0 complete_genome_double_chrom_2 C Sequence_break 10 0 complete_genome_double_chrom Sequence_break A 0 0 -complete_genome_double_chrom_2 Sequence_break A 0 0 \ No newline at end of file +complete_genome_double_chrom_2 Sequence_break A 0 0 diff --git a/functional_tests/test_data/low_freq_cutoff_0_expected/core_pair_summary.csv.expected b/functional_tests/test_data/low_freq_cutoff_0_expected/core_pair_summary.csv.expected index 45dace7..e7b2b7f 100644 --- a/functional_tests/test_data/low_freq_cutoff_0_expected/core_pair_summary.csv.expected +++ b/functional_tests/test_data/low_freq_cutoff_0_expected/core_pair_summary.csv.expected @@ -1,4 +1,4 @@ Core_pair,n,occurrence_core_1,occurrence_core_2,co_occurrence,min_dist,max_dist,mean_dist,median_dist,min_acc,max_acc,mean_acc,median_acc A-C,3,3,3,3,109,109,109.0,109.0,1,1,1.0,1.0 -C-Sequence_break,3,3,0,0,10,310,110.0,10.0,0,4,1.3,0.0 +C-Sequence_break,3,3,0,0,10,310,110.0,10.0,0,4,1.3,0.0 Sequence_break-A,3,0,3,0,0,0,0.0,0.0,0,0,0.0,0.0 diff --git a/functional_tests/test_data/single_core_contig_draft_expected/core_core_accessory_gene_content.tsv.expected b/functional_tests/test_data/single_core_contig_draft_expected/core_core_accessory_gene_content.tsv.expected index bed9362..76d4e0a 100644 --- a/functional_tests/test_data/single_core_contig_draft_expected/core_core_accessory_gene_content.tsv.expected +++ b/functional_tests/test_data/single_core_contig_draft_expected/core_core_accessory_gene_content.tsv.expected @@ -1,4 +1,4 @@ Gff Core_gene_1 Core_gene_2 gene type complete_genome_double_chrom B Sequence_break C low_frequency -complete_genome_double_chrom E Sequence_break F low_frequency +complete_genome_double_chrom E Sequence_break F low_frequency complete_genome_double_chrom Sequence_break E D low_frequency diff --git a/functional_tests/test_data/single_core_contig_draft_expected/core_pair_summary.csv.expected b/functional_tests/test_data/single_core_contig_draft_expected/core_pair_summary.csv.expected index 
fff00cc..b9862ed 100644 --- a/functional_tests/test_data/single_core_contig_draft_expected/core_pair_summary.csv.expected +++ b/functional_tests/test_data/single_core_contig_draft_expected/core_pair_summary.csv.expected @@ -2,6 +2,6 @@ Core_pair,n,occurrence_core_1,occurrence_core_2,co_occurrence,min_dist,max_dist, A-B,2,2,2,2,9,9,9.0,9.0,0,0,0.0,0.0 B-E,1,2,2,2,9,9,9.0,9.0,0,0,0.0,0.0 B-Sequence_break,1,2,0,0,110,110,110.0,110.0,1,1,1.0,1.0 -E-Sequence_break,2,2,0,0,10,110,60.0,60.0,0,1,0.5,0.5 -Sequence_break-A,2,0,2,0,0,0,0.0,0.0,0,0,0.0,0.0 +E-Sequence_break,2,2,0,0,10,110,60.0,60.0,0,1,0.5,0.5 +Sequence_break-A,2,0,2,0,0,0,0.0,0.0,0,0,0.0,0.0 Sequence_break-E,1,0,1,0,99,99,99.0,99.0,0,1,1.0,1.0 From b47fdf9bf4c1d3bc9e0c23fc5fc40301a4b314eb Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Wed, 26 Jan 2022 15:25:12 +1100 Subject: [PATCH 17/20] More small changes to expected files for functional tests --- .../core_pair_summary.csv.expected | 2 +- .../core_pair_summary.csv.expected | 2 +- .../core_pair_summary.csv.expected | 2 +- .../low_frequency_gene_placement.tsv.expected | 6 +++--- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/functional_tests/test_data/Fragmented_accessory_gene_run_expected/core_pair_summary.csv.expected b/functional_tests/test_data/Fragmented_accessory_gene_run_expected/core_pair_summary.csv.expected index fb54173..b34d600 100644 --- a/functional_tests/test_data/Fragmented_accessory_gene_run_expected/core_pair_summary.csv.expected +++ b/functional_tests/test_data/Fragmented_accessory_gene_run_expected/core_pair_summary.csv.expected @@ -1,4 +1,4 @@ Core_pair,n,occurrence_core_1,occurrence_core_2,co_occurrence,min_dist,max_dist,mean_dist,median_dist,min_acc,max_acc,mean_acc,median_acc A-C,4,4,4,4,109,109,109.0,109.0,1,1,1.0,1.0 C-Sequence_break,4,4,0,0,10,310,160.0,160.0,0,3,1.5,1.5 -Sequence_break-A,4,4,0,0,0,0,0.0,0.0,0,0,0.0,0.0 +Sequence_break-A,4,0,4,0,0,0,0.0,0.0,0,0,0.0,0.0 diff 
--git a/functional_tests/test_data/Fragmented_core_run_expected/core_pair_summary.csv.expected b/functional_tests/test_data/Fragmented_core_run_expected/core_pair_summary.csv.expected index 82662bf..befdd38 100644 --- a/functional_tests/test_data/Fragmented_core_run_expected/core_pair_summary.csv.expected +++ b/functional_tests/test_data/Fragmented_core_run_expected/core_pair_summary.csv.expected @@ -1,4 +1,4 @@ Core_pair,n,occurrence_core_1,occurrence_core_2,co_occurrence,min_dist,max_dist,mean_dist,median_dist,min_acc,max_acc,mean_acc,median_acc A-C,4,4,4,4,9,109,84.0,109.0,0,1,0.8,1.0 C-Sequence_break,4,4,0,0,10,310,160.0,160.0,0,4,1.8,1.5 -A-Sequence_break,4,0,4,0,0,0,0.0,0.0,0,0,0.0,0.0 +Sequence_break-A,4,0,4,0,0,0,0.0,0.0,0,0,0.0,0.0 diff --git a/functional_tests/test_data/single_core_contig_draft_expected/core_pair_summary.csv.expected b/functional_tests/test_data/single_core_contig_draft_expected/core_pair_summary.csv.expected index b9862ed..ab752ec 100644 --- a/functional_tests/test_data/single_core_contig_draft_expected/core_pair_summary.csv.expected +++ b/functional_tests/test_data/single_core_contig_draft_expected/core_pair_summary.csv.expected @@ -4,4 +4,4 @@ B-E,1,2,2,2,9,9,9.0,9.0,0,0,0.0,0.0 B-Sequence_break,1,2,0,0,110,110,110.0,110.0,1,1,1.0,1.0 E-Sequence_break,2,2,0,0,10,110,60.0,60.0,0,1,0.5,0.5 Sequence_break-A,2,0,2,0,0,0,0.0,0.0,0,0,0.0,0.0 -Sequence_break-E,1,0,1,0,99,99,99.0,99.0,0,1,1.0,1.0 +Sequence_break-E,1,0,2,0,99,99,99.0,99.0,0,1,1.0,1.0 diff --git a/functional_tests/test_data/single_core_contig_draft_expected/low_frequency_gene_placement.tsv.expected b/functional_tests/test_data/single_core_contig_draft_expected/low_frequency_gene_placement.tsv.expected index 3c10cbb..e7ae4b2 100644 --- a/functional_tests/test_data/single_core_contig_draft_expected/low_frequency_gene_placement.tsv.expected +++ b/functional_tests/test_data/single_core_contig_draft_expected/low_frequency_gene_placement.tsv.expected @@ -5,6 +5,6 @@ 
complete_genome_single_chrom B E 9 0 complete_genome_double_chrom B Sequence_break 110 1 complete_genome_double_chrom E Sequence_break 110 1 complete_genome_single_chrom E Sequence_break 10 0 -complete_genome_double_chrom A Sequence_break 0 0 -complete_genome_single_chrom A Sequence_break 0 0 -complete_genome_single_chrom E Sequence_break 99 1 +complete_genome_double_chrom Sequence_break A 0 0 +complete_genome_single_chrom Sequence_break A 0 0 +complete_genome_single_chrom Sequence_break E 99 1 From bdf14b48c542b034f401652fa1cd88fe0058915a Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Wed, 26 Jan 2022 15:31:08 +1100 Subject: [PATCH 18/20] Even more small changes to expected files for functional tests --- .../low_frequency_gene_placement.tsv.expected | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/functional_tests/test_data/single_core_contig_draft_expected/low_frequency_gene_placement.tsv.expected b/functional_tests/test_data/single_core_contig_draft_expected/low_frequency_gene_placement.tsv.expected index e7ae4b2..86b1099 100644 --- a/functional_tests/test_data/single_core_contig_draft_expected/low_frequency_gene_placement.tsv.expected +++ b/functional_tests/test_data/single_core_contig_draft_expected/low_frequency_gene_placement.tsv.expected @@ -7,4 +7,4 @@ complete_genome_double_chrom E Sequence_break 110 1 complete_genome_single_chrom E Sequence_break 10 0 complete_genome_double_chrom Sequence_break A 0 0 complete_genome_single_chrom Sequence_break A 0 0 -complete_genome_single_chrom Sequence_break E 99 1 +complete_genome_double_chrom Sequence_break E 99 1 From ab898780a09ca948674f475d7b19ccb6985ab088 Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Wed, 26 Jan 2022 15:37:26 +1100 Subject: [PATCH 19/20] Final small changes to expected files for functional tests --- .../core_pair_summary.csv.expected | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/functional_tests/test_data/single_core_contig_draft_expected/core_pair_summary.csv.expected b/functional_tests/test_data/single_core_contig_draft_expected/core_pair_summary.csv.expected index ab752ec..aa6dc0d 100644 --- a/functional_tests/test_data/single_core_contig_draft_expected/core_pair_summary.csv.expected +++ b/functional_tests/test_data/single_core_contig_draft_expected/core_pair_summary.csv.expected @@ -4,4 +4,4 @@ B-E,1,2,2,2,9,9,9.0,9.0,0,0,0.0,0.0 B-Sequence_break,1,2,0,0,110,110,110.0,110.0,1,1,1.0,1.0 E-Sequence_break,2,2,0,0,10,110,60.0,60.0,0,1,0.5,0.5 Sequence_break-A,2,0,2,0,0,0,0.0,0.0,0,0,0.0,0.0 -Sequence_break-E,1,0,2,0,99,99,99.0,99.0,0,1,1.0,1.0 +Sequence_break-E,1,0,2,0,99,99,99.0,99.0,1,1,1.0,1.0 From c01a8a7ac66be3a86c67c9ac6f000ffb8f69e52b Mon Sep 17 00:00:00 2001 From: milnus <44769523+milnus@users.noreply.github.com> Date: Wed, 26 Jan 2022 15:44:39 +1100 Subject: [PATCH 20/20] Bump version for new PyPi upload and add keywords and classifiers for PyPi --- setup.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 4513c01..1371e4f 100644 --- a/setup.py +++ b/setup.py @@ -12,7 +12,7 @@ setup( name='Corekaburra', - version='0.0.1', + version='0.0.2', author='Magnus Ganer Jespersen', author_email='magnus.ganer.j@gmail.com', packages=['Corekaburra'], @@ -25,4 +25,11 @@ description=('A prototypical bioinformatics command line tool'), long_description=(LONG_DESCRIPTION), install_requires=["biopython", "networkx", "gffutils", "numpy"], + keywords=['Genomic', 'pan-genome', 'bacteria', 'prokaryotes', 'bioinformatics'], + classifiers=[ + 'Programming Language :: Python :: 3.9', + 'License :: OSI Approved :: MIT License', + 'Intended Audience :: Science/Research', + 'Topic :: Scientific/Engineering :: Bio-Informatics', + 'Development Status :: 4 - Beta'] )