Merge pull request #5 from milnus/dev

Dev
milnus · Jan 11, 2022 · 99b075c · 99b075c
2 parents 9171ac6 + 4093dcb
commit 99b075c
Show file tree

Hide file tree

Showing 10 changed files with 1,899 additions and 265 deletions.
diff --git a/Corekaburra/__main__.py b/Corekaburra/__main__.py
@@ -6,8 +6,11 @@
 Maintainer  : [email protected] 
 Portability : POSIX
 
-The program reads one or more input FASTA files. For each file it computes a
-variety of statistics, and then prints a summary of the statistics as output. # TODO - Change description
+Corekaburra looks at the gene synteny across genomes used to build a pan-genome. Using syntenic information, Corekaburra
+identifies regions between core gene clusters. Regions are described in terms of their content of accessory gene clusters
+and distance between core genes. Information from neighboring core genes is further used to identify stretches of core
+gene clusters throughout the pan-genome that appear in all genomes given as input. Corekaburra is compatible with outputs
+from standard pan-genome pipelines: Roary and Panaroo.
 '''
 
 import os
@@ -322,5 +325,6 @@ def main():
     if args.discard_gffs:
         os.rmdir(os.path.join(args.output_path, 'Corrected_gff_files'))
 
+
 if __name__ == '__main__':
     main()
diff --git a/Corekaburra/commandline_interface.py b/Corekaburra/commandline_interface.py
@@ -12,12 +12,11 @@ def get_commandline_arguments(args):
     :return: matched argument object for passing in main function.
     """
     # Set up parser
-    parser = argparse.ArgumentParser(description='Welcome to Corekaburra!\n '
-                                                 'Program to determine consensus core sequence from multiple genomes.\n'
-                                                 'Outputs consensus core gene alignment, distance between core genes, '
-                                                 'number of accessory genes between core genes and low frequency genes '
-                                                 'between core genes',
-                                     add_help=False) #TODO - Change
+    parser = argparse.ArgumentParser(description='Welcome to Corekaburra!'
+                                                 'An extension to pan-genome analyses that summarise genomic regions '
+                                                 'between core genes and segments of neighbouring core genes using '
+                                                 'gene synteny from a set of input genomes and a pan-genome folder.',
+                                     add_help=False)
 
     required = parser.add_argument_group('Required arguments')
     run_mods = parser.add_argument_group('Analysis modifiers')

diff --git a/Corekaburra/output_writer_functions.py b/Corekaburra/output_writer_functions.py
@@ -13,7 +13,7 @@ def master_info_writer(master_info, out_path, prefix):
     """
 
     # Write general content
-    out_file_name = 'low_frequency_gene_placement.tsv'
+    out_file_name = 'low_frequency_gene_placement.tsv' # Name unchanged for now - Proposed name: core_region_content.tsv
     if prefix is not None:
         out_file_name = f'{prefix}_{out_file_name}'
     with open(os.path.join(out_path, out_file_name), 'w', newline='', encoding='utf-8') as out_file:
@@ -31,7 +31,7 @@ def master_info_writer(master_info, out_path, prefix):
             writer.writerow(info)
 
     # Write gene content in long format
-    out_file_name = 'core_core_accessory_gene_content.tsv'
+    out_file_name = 'core_core_accessory_gene_content.tsv' # Name unchanged for now - Proposed name: accessory_gene_placement.tsv
     if prefix is not None:
         out_file_name = f'{prefix}_{out_file_name}'
 
@@ -73,7 +73,7 @@ def summary_info_writer(master_summary_info, out_path, prefix):
     :return: Nothing
     """
     # Generate file name
-    out_file_name = 'core_pair_summary.csv'
+    out_file_name = 'core_pair_summary.csv' # Name unchanged for now - Proposed name: core_region_summary.csv
     if prefix is not None:
         out_file_name = prefix + '_' + out_file_name
 

diff --git a/README.md b/README.md
diff --git a/functional_tests/Corekaburra-test.sh b/functional_tests/Corekaburra-test.sh
@@ -247,6 +247,8 @@ test_output_file test_out_folder/core_segments.csv Rearrangement_run_expected/co
 test_output_file test_out_folder/no_accessory_core_segments.csv Rearrangement_run_expected/no_accessory_core_segments.csv.expected
 rm -r test_out_folder
 
+# TODO - Test that segments can be identified with a core-cutoff that is less than all genomes.
+
 call_new_test "Test with decreased core-gene cutoff"
 Corekaburra -ig complete_genome_single_chrom.gff complete_genome_single_chrom_2.gff genome_single_chrom_larger.gff -ip Change_cutoffs -o test_out_folder -cc 0.9 > /dev/null 2>&1
 test_output_file test_out_folder/core_core_accessory_gene_content.tsv core_90_cutoff_expected/core_core_accessory_gene_content.tsv.expected

diff --git a/functional_tests/test_data/no_input.expected b/functional_tests/test_data/no_input.expected
@@ -3,10 +3,9 @@ usage: Corekaburra -ig file.gff [file.gff ...] -ip path/to/pan_genome
                    [-o path/to/output] [-p OUTPUT_PREFIX] [-d] [-c int]
                    [-l | -q] [-h]
 
-Welcome to Corekaburra! Program to determine consensus core sequence from
-multiple genomes. Outputs consensus core gene alignment, distance between core
-genes, number of accessory genes between core genes and low frequency genes
-between core genes
+Welcome to Corekaburra!An extension to pan-genome analyses that summarise
+genomic regions between core genes and segments of neighbouring core genes
+using gene synteny from a set of input genomes and a pan-genome folder.
 
 Required arguments:
   -ig file.gff [file.gff ...], --input_gffs file.gff [file.gff ...]

diff --git a/setup.py b/setup.py
@@ -3,12 +3,11 @@
 from distutils.core import setup
 
 LONG_DESCRIPTION = \
-'''The program reads one or more input FASTA files.
-For each file it computes a variety of statistics, and then
-prints a summary of the statistics as output.
-
-The goal is to provide a solid foundation for new bioinformatics command line tools,
-and is an ideal starting place for new projects.'''
+'''Corekaburra looks at the gene synteny across genomes used to build a pan-genome. Using syntenic information, Corekaburra 
+identifies regions between core gene clusters. Regions are described in terms of their content of accessory gene clusters 
+and distance between core genes. Information from neighboring core genes is further used to identify stretches of core  
+gene clusters throughout the pan-genome that appear in all genomes given as input. Corekaburra is compatible with outputs 
+from standard pan-genome pipelines: Roary and Panaroo.'''
 
 
 setup(

diff --git a/unit_tests/Corekaburra_test.py b/unit_tests/Corekaburra_test.py
@@ -87,6 +87,7 @@ def test_all_files_found(self):
                      '/path/complete_genome_2.gff.gz',
                      'complete_genome_3.gff.gz',
                      'complete_genome_4.gff',
+                     'complete_genome_5',
                      'dummy_index_1',
                      'dummy_index_2']
 
@@ -95,7 +96,8 @@ def test_all_files_found(self):
         expected_return = ['complete_genome_1',
                            'complete_genome_2',
                            'complete_genome_3',
-                           'complete_genome_4']
+                           'complete_genome_4',
+                           'complete_genome_5']
 
         return_object = read_complete_genome_file.parse_complete_genome_file(complete_genome_file, gff_files, self.logger)
 
@@ -105,6 +107,7 @@ def test_correct_one_files_not_found(self):
         gff_files = ['/path/complete_genome_2.gff.gz',
                      'complete_genome_3.gff.gz',
                      'complete_genome_4.gff',
+                     'complete_genome_5',
                      'dummy_index_1',
                      'dummy_index_2']
 
@@ -138,6 +141,14 @@ def test_panaroo_input(self):
         self.assertEqual("Panaroo", return_program)
         self.assertEqual(input_folder_path + '/gene_presence_absence_roary.csv', return_path)
 
+    def test_minimal_panaroo_input(self):
+        input_folder_path = 'TestPangenomeSourceProgram/Mock_minimal_panaroo'
+
+        return_program, return_path = check_inputs.define_pangenome_program(input_folder_path, self.logger)
+
+        self.assertEqual("Panaroo", return_program)
+        self.assertEqual(input_folder_path + '/gene_presence_absence_roary.csv', return_path)
+
     # def test_pirate_input(self): TODO - Make Corekaburra take Pirate input!
     #     pass
     #     input_folder_path = 'TestPangenomeSourceProgram/Mock_pirate'