Skip to content

Commit

Permalink
Merge pull request #5 from milnus/dev
Browse files Browse the repository at this point in the history
Dev
  • Loading branch information
milnus authored Jan 11, 2022
2 parents 9171ac6 + 4093dcb commit 99b075c
Show file tree
Hide file tree
Showing 10 changed files with 1,899 additions and 265 deletions.
8 changes: 6 additions & 2 deletions Corekaburra/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,11 @@
Maintainer : [email protected]
Portability : POSIX
The program reads one or more input FASTA files. For each file it computes a
variety of statistics, and then prints a summary of the statistics as output. # TODO - Change description
Corekaburra looks at the gene synteny across genomes used to build a pan-genome. Using syntenic information Corekaburra
identifies regions between core gene clusters. Regions are described in terms of their content of accessory gene clusters
and distance between core genes. Information from neighboring core genes is further used to identify stretches of core
gene clusters throughout the pan-genome that appear in all genomes given as input. Corekaburra is compatible with outputs
from standard pan-genome pipelines: Roary and Panaroo.
'''

import os
Expand Down Expand Up @@ -322,5 +325,6 @@ def main():
if args.discard_gffs:
os.rmdir(os.path.join(args.output_path, 'Corrected_gff_files'))


if __name__ == '__main__':
main()
11 changes: 5 additions & 6 deletions Corekaburra/commandline_interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,11 @@ def get_commandline_arguments(args):
:return: matched argument object for passing in main function.
"""
# Set up parser
parser = argparse.ArgumentParser(description='Welcome to Corekaburra!\n '
'Program to determine consensus core sequence from multiple genomes.\n'
'Outputs consensus core gene alignment, distance between core genes, '
'number of accessory genes between core genes and low frequency genes '
'between core genes',
add_help=False) #TODO - Change
parser = argparse.ArgumentParser(description='Welcome to Corekaburra!'
'An extension to pan-genome analyses that summarise genomic regions '
'between core genes and segments of neighbouring core genes using '
'gene synteny from a set of input genomes and a pan-genome folder.',
add_help=False)

required = parser.add_argument_group('Required arguments')
run_mods = parser.add_argument_group('Analysis modifiers')
Expand Down
6 changes: 3 additions & 3 deletions Corekaburra/output_writer_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ def master_info_writer(master_info, out_path, prefix):
"""

# Write general content
out_file_name = 'low_frequency_gene_placement.tsv'
out_file_name = 'low_frequency_gene_placement.tsv' # TODO - Proposed rename: core_region_content.tsv (currently 'low_frequency_gene_placement.tsv')
if prefix is not None:
out_file_name = f'{prefix}_{out_file_name}'
with open(os.path.join(out_path, out_file_name), 'w', newline='', encoding='utf-8') as out_file:
Expand All @@ -31,7 +31,7 @@ def master_info_writer(master_info, out_path, prefix):
writer.writerow(info)

# Write gene content in long format
out_file_name = 'core_core_accessory_gene_content.tsv'
out_file_name = 'core_core_accessory_gene_content.tsv' # TODO - Proposed rename: accessory_gene_placement.tsv (currently core_core_accessory_gene_content.tsv)
if prefix is not None:
out_file_name = f'{prefix}_{out_file_name}'

Expand Down Expand Up @@ -73,7 +73,7 @@ def summary_info_writer(master_summary_info, out_path, prefix):
:return: Nothing
"""
# Generate file name
out_file_name = 'core_pair_summary.csv'
out_file_name = 'core_pair_summary.csv' # TODO - Proposed rename: core_region_summary.csv (currently core_pair_summary.csv)
if prefix is not None:
out_file_name = prefix + '_' + out_file_name

Expand Down
330 changes: 87 additions & 243 deletions README.md

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions functional_tests/Corekaburra-test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -247,6 +247,8 @@ test_output_file test_out_folder/core_segments.csv Rearrangement_run_expected/co
test_output_file test_out_folder/no_accessory_core_segments.csv Rearrangement_run_expected/no_accessory_core_segments.csv.expected
rm -r test_out_folder

# TODO - Test that segments can be identified with a core-cutoff that is less than all genomes.

call_new_test "Test with decreased core-gene cutoff"
Corekaburra -ig complete_genome_single_chrom.gff complete_genome_single_chrom_2.gff genome_single_chrom_larger.gff -ip Change_cutoffs -o test_out_folder -cc 0.9 > /dev/null 2>&1
test_output_file test_out_folder/core_core_accessory_gene_content.tsv core_90_cutoff_expected/core_core_accessory_gene_content.tsv.expected
Expand Down
7 changes: 3 additions & 4 deletions functional_tests/test_data/no_input.expected
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,9 @@ usage: Corekaburra -ig file.gff [file.gff ...] -ip path/to/pan_genome
[-o path/to/output] [-p OUTPUT_PREFIX] [-d] [-c int]
[-l | -q] [-h]

Welcome to Corekaburra! Program to determine consensus core sequence from
multiple genomes. Outputs consensus core gene alignment, distance between core
genes, number of accessory genes between core genes and low frequency genes
between core genes
Welcome to Corekaburra!An extension to pan-genome analyses that summarise
genomic regions between core genes and segments of neighbouring core genes
using gene synteny from a set of input genomes and a pan-genome folder.

Required arguments:
-ig file.gff [file.gff ...], --input_gffs file.gff [file.gff ...]
Expand Down
11 changes: 5 additions & 6 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,11 @@
from distutils.core import setup

LONG_DESCRIPTION = \
'''The program reads one or more input FASTA files.
For each file it computes a variety of statistics, and then
prints a summary of the statistics as output.
The goal is to provide a solid foundation for new bioinformatics command line tools,
and is an ideal starting place for new projects.'''
'''Corekaburra looks at the gene synteny across genomes used to build a pan-genome. Using syntenic information Corekaburra
identifies regions between core gene clusters. Regions are described in terms of their content of accessory gene clusters
and distance between core genes. Information from neighboring core genes is further used to identify stretches of core
gene clusters throughout the pan-genome that appear in all genomes given as input. Corekaburra is compatible with outputs
from standard pan-genome pipelines: Roary and Panaroo.'''


setup(
Expand Down
13 changes: 12 additions & 1 deletion unit_tests/Corekaburra_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ def test_all_files_found(self):
'/path/complete_genome_2.gff.gz',
'complete_genome_3.gff.gz',
'complete_genome_4.gff',
'complete_genome_5',
'dummy_index_1',
'dummy_index_2']

Expand All @@ -95,7 +96,8 @@ def test_all_files_found(self):
expected_return = ['complete_genome_1',
'complete_genome_2',
'complete_genome_3',
'complete_genome_4']
'complete_genome_4',
'complete_genome_5']

return_object = read_complete_genome_file.parse_complete_genome_file(complete_genome_file, gff_files, self.logger)

Expand All @@ -105,6 +107,7 @@ def test_correct_one_files_not_found(self):
gff_files = ['/path/complete_genome_2.gff.gz',
'complete_genome_3.gff.gz',
'complete_genome_4.gff',
'complete_genome_5',
'dummy_index_1',
'dummy_index_2']

Expand Down Expand Up @@ -138,6 +141,14 @@ def test_panaroo_input(self):
self.assertEqual("Panaroo", return_program)
self.assertEqual(input_folder_path + '/gene_presence_absence_roary.csv', return_path)

def test_minimal_panaroo_input(self):
input_folder_path = 'TestPangenomeSourceProgram/Mock_minimal_panaroo'

return_program, return_path = check_inputs.define_pangenome_program(input_folder_path, self.logger)

self.assertEqual("Panaroo", return_program)
self.assertEqual(input_folder_path + '/gene_presence_absence_roary.csv', return_path)

# def test_pirate_input(self): TODO - Make Corekaburra take Pirate input!
# pass
# input_folder_path = 'TestPangenomeSourceProgram/Mock_pirate'
Expand Down
Loading

0 comments on commit 99b075c

Please sign in to comment.