Skip to content

Commit

Permalink
Merge pull request #6 from milnus/dev
Browse files Browse the repository at this point in the history
Dev
  • Loading branch information
milnus authored Jan 26, 2022
2 parents 99b075c + c01a8a7 commit ac96c30
Show file tree
Hide file tree
Showing 85 changed files with 1,209 additions and 157 deletions.
24 changes: 13 additions & 11 deletions Corekaburra/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,7 @@ def main():
inital_check_time_start = time.time()

# get arguments from the commandline
args = get_commandline_arguments(sys.argv[1:])
args = get_commandline_arguments(sys.argv[1:], PROGRAM_VERSION)

# Construct output folder
try:
Expand Down Expand Up @@ -195,21 +195,23 @@ def main():

## Read in gene presence absence file
time_start_read_files = time.time()
# Prepare folder for reannotated genes and examine if any are already present
if source_program == "Panaroo" and args.annotate:
gene_data_dict, corrected_dir, args.input_gffs = prepair_for_reannotation(gene_data_path, args.output_path,
args.input_gffs, logger)
else:
gene_data_dict = None
corrected_dir = None

# TODO - ATM the column with presence of gene in genomes is used to define what is core and not. Is it better to use the number of input gffs instead?
# - There are upsides to the current approach. You can use the same genome to find segments for two different populations within the dataset using the same reference of core-genes
# - Making it depend on the input is not viable for comparing runs, even within the same pan-genome, when using different sets of gff files.
# TODO - Some day it would be awesome to be able to provide a clustering/population structure which could divide genes into the 13 definitions outlined by Horesh et al. [DOI: 10.1099/mgen.0.000670]
core_dict, low_freq_dict, acc_gene_dict = read_gene_presence_absence(input_pres_abs_file_path, args.core_cutoff,
args.low_cutoff, source_program,
args.input_gffs, tmp_folder_path, logger)
args.input_gffs, tmp_folder_path,
gene_data_dict, corrected_dir, logger)

# Prepare folder for reannotated genes and examine if any are already present
if source_program == "Panaroo" and args.annotate:
gene_data_dict, corrected_dir, args.input_gffs = prepair_for_reannotation(gene_data_path, args.output_path,
args.input_gffs, logger)
else:
gene_data_dict = None
corrected_dir = None

time_end_read_files = time.time()
time_start_passing_gffs = time.time()
Expand Down Expand Up @@ -259,7 +261,7 @@ def main():
time_end_passing_gffs = time.time()
time_start_segments_search = time.time()

time_start = time.time()
time_start = time.time() # TODO - This seems like a lonely start timer?
# Count number of unique accessory genes inserted into a core-core region across the genomes
acc_region_count = {key: len(set(core_neighbour_low_freq[key])) for key in core_neighbour_low_freq}
# Count number of unique low frequency genes inserted into a core-core region across the genomes
Expand Down Expand Up @@ -288,7 +290,7 @@ def main():
logger.debug("Summary output")
summary_info_writer(master_summary_info, args.output_path, args.output_prefix)

if double_edge_segements is not None:
if double_edge_segements:
logger.debug("Segment output")
segment_writer(double_edge_segements, args.output_path, args.output_prefix)

Expand Down
14 changes: 11 additions & 3 deletions Corekaburra/commandline_interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,16 @@
EXIT_COMMAND_LINE_ERROR = 2


def get_commandline_arguments(args):
def get_commandline_arguments(args, version):
"""
Function that takes the input given to the commandline and parses it.
Will check for no input and '-help'
:param args: List of input arguments given to the commandline
:param version: Version of Corekaburra
:return: matched argument object for passing in main function.
"""
# Set up parser
parser = argparse.ArgumentParser(description='Welcome to Corekaburra!'
parser = argparse.ArgumentParser(description='Welcome to Corekaburra! '
'An extension to pan-genome analyses that summarise genomic regions '
'between core genes and segments of neighbouring core genes using '
'gene synteny from a set of input genomes and a pan-genome folder.',
Expand Down Expand Up @@ -124,7 +125,10 @@ def get_commandline_arguments(args):
action='help',
help='Show help function')


rem_args.add_argument('-v',
'--version',
action='version',
version=f'Corekaburra {version}')

# Check if anything is given as input; otherwise warn and print help
if len(args) < 1:
Expand All @@ -137,3 +141,7 @@ def get_commandline_arguments(args):
args = parser.parse_args(args)

return args


if __name__ == '__main__':
get_commandline_arguments([], 666)
53 changes: 39 additions & 14 deletions Corekaburra/consesus_core_genome.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,27 +115,40 @@ def identify_no_accessory_segments(double_edge_segements, combined_acc_gene_coun
return sub_segment_dict


def identify_segments(core_graph, num_gffs, core_gene_dict):
def identify_segments(core_graph, num_gffs, core_gene_dict, num_core_graph_components):
"""
Function to identify stretches of core genes between core genes neighbouring multiple different genes
:param core_graph: Graph over core genes with weights being the number of connections between the genes
:param num_gffs: Number of gffs inputted
:param core_gene_dict: Dict with keys being genomes, each genome is a dict with keys being genes and values the mapped pan-genome gene cluster.
:return: Dict over stretches of core genes found in the core gene graph.
"""
# TODO - Describe missing parameters in docstring

# TODO - Fix Ouli's problem where the core gene graph may split into two separate pieces, and also handle double chromosomes.
# - Add a check for whether the core gene graph is a single component or multiple. Handle components separately. - Write test, then program
# - This likely requires a change to the all-vs-all search for multi-edge core genes, maybe by adding a try/except statement, or by just handling each component separately.

# Identify all nodes that contain more than two degrees.
multi_edge_nodes = [node for node, connections in core_graph.degree if connections > 2]
# Check if multiple components in core graph, if then find single edge core_genes
if num_core_graph_components > 1:
singe_edge_nodes = [node for node, connections in core_graph.degree if connections == 1]
else:
singe_edge_nodes = []

# Check if any node have multiple edges, if not then return.
if len(multi_edge_nodes) == 0:
if len(multi_edge_nodes+singe_edge_nodes) == 0:
return None

# Dict to hold connections between >2 edge nodes
connect_dict = {}

# for all nodes with >2 degrees themself, identify neighbouring nodes with >2 degrees
for node in multi_edge_nodes:
connect_dict[node] = [neighbor for neighbor in core_graph.neighbors(node) if neighbor in multi_edge_nodes]
for node in multi_edge_nodes+singe_edge_nodes:
connect_dict[node] = [neighbor for neighbor in core_graph.neighbors(node)
if neighbor in multi_edge_nodes or neighbor in singe_edge_nodes]

# Turn the weight into a 'distance' or number of times not found together.
for edge in core_graph.edges(data=True):
Expand All @@ -147,8 +160,8 @@ def identify_segments(core_graph, num_gffs, core_gene_dict):

# Go through all source and target nodes,
# see if a path can be found where all nodes between them have only two degrees
for source_node in multi_edge_nodes:
for target_node in multi_edge_nodes:
for source_node in multi_edge_nodes+singe_edge_nodes:
for target_node in multi_edge_nodes+singe_edge_nodes:
if target_node != source_node:
# Get path (segment) from source to target
segment = nx.shortest_path(core_graph, source_node, target_node, weight='weight', method='dijkstra') # bellman-ford or dijkstra
Expand All @@ -157,7 +170,7 @@ def identify_segments(core_graph, num_gffs, core_gene_dict):
segment_length = len(segment)

# Get length of segment with multi nodes removed
two_degree_segment_length = len([node for node in segment if node not in multi_edge_nodes])
two_degree_segment_length = len([node for node in segment if node not in multi_edge_nodes+singe_edge_nodes])

# Check if no node between the source and target has more than two edges,
# if then move to record the segment/path
Expand All @@ -183,9 +196,9 @@ def identify_segments(core_graph, num_gffs, core_gene_dict):
f"Path from one node to another ({source_target_name}) was found, but did not match previously found path!")

# Calculate the expected number of paths
total_edges_from_multi_edge_nodes = sum([connections for _, connections in core_graph.degree if connections > 2])
num_edges_between_multi_edge_nodes = sum([len(connect_dict[key]) for key in connect_dict])
expected_segment_number = int((total_edges_from_multi_edge_nodes / 2) - (num_edges_between_multi_edge_nodes / 2)) + len(multi_edge_connect_adjust)
total_edges_from_non_two_edge_core_genes = sum([connections for _, connections in core_graph.degree if connections > 2 or connections < 2])
num_edges_between_non_two_edge_core_genes = sum([len(connect_dict[key]) for key in connect_dict])
expected_segment_number = int((total_edges_from_non_two_edge_core_genes / 2) - (num_edges_between_non_two_edge_core_genes / 2)) + len(multi_edge_connect_adjust)

# Check if less than the number of expected paths has been found,
# if then try to identify missing paths
Expand Down Expand Up @@ -285,6 +298,7 @@ def determine_genome_segments(core_neighbour_pairs, combined_acc_gene_count, num
:param combined_acc_gene_count: Number of accessory and low-frequency genes detected between core gene pairs
:param num_gffs: Number of inputted gff files # TODO - Should this be the minimum number determined by the input cut-off for core genes
:param logger: Program logger
# TODO - Add parameters
:return double_edge_segements:
:return no_acc_segments:
Expand All @@ -294,11 +308,22 @@ def determine_genome_segments(core_neighbour_pairs, combined_acc_gene_count, num

# Construct a graph from core gene neighbours
core_graph = construct_core_graph(core_neighbour_pairs)
num_core_graph_components = nx.number_connected_components(core_graph)

logger.debug(f'Identified: {num_core_graph_components} components in core genome graph')

double_edge_segements = {}
# Identify all segments in components of core graph
for component in nx.connected_components(core_graph):
logger.debug(f'Searching component related to: {component}')

# Find segments in the genome between core genes with multiple neighbors
double_edge_segements = identify_segments(core_graph, num_gffs, core_gene_dict)
component_graph = core_graph.subgraph(component).copy()
return_segments = identify_segments(component_graph, num_gffs, core_gene_dict, num_core_graph_components)
if return_segments is not None:
double_edge_segements = double_edge_segements | return_segments

if double_edge_segements is not None:
# if double_edge_segements is not None:
if double_edge_segements:
logger.debug(f'A total of {len(double_edge_segements)} core genes were identified to have multiple neighbours.')
logger.debug(f'Genes with multiple neighbours: {double_edge_segements}')

Expand All @@ -316,4 +341,4 @@ def determine_genome_segments(core_neighbour_pairs, combined_acc_gene_count, num


if __name__ == '__main__':
pass
pass
18 changes: 10 additions & 8 deletions Corekaburra/correct_gffs.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,10 +30,11 @@ def read_gene_data(gene_data_file):
for line in gene_data.readlines():
# Split read line at commas
line = line.split(',')
# TODO - Scaffold (contig) name can be found in second position of a gene_data.csv line. This could possibly be used to speed things up so that the entire set of contigs isn't required for search.

# Check if refound gene
if 'refound' in line[2]:
# Try to add the refound gene to the gene_data dict as a second key, value being the DNA sequence,
# Try to add the refound gene to the gene_data dict as a second key, value being the DNA sequence, name, and function in that order.
# if the first key (genome) is not found in gene_data dict,
# then construct dict for the genome and add the gene
try:
Expand Down Expand Up @@ -69,16 +70,23 @@ def prepair_for_reannotation(gene_data_path, output_folder, gffs, logger):
try:
os.mkdir(corrected_gff_out_dir)
except FileExistsError:
# Get path for input
input_path_dict = {os.path.basename(gff): os.path.split(gff)[0] for gff in gffs}
# input_path = os.path.split(gffs[0])[0]

corrected_folder_content = os.listdir(corrected_gff_out_dir)

gff_names = [os.path.basename(gff) for gff in gffs]

corrected_files = [file for file in corrected_folder_content if
f'{file.split("_corrected")[0]}.gff' in gff_names]

corrected_files_w_path = [os.path.join(corrected_gff_out_dir, file) for file in corrected_files]

if len(corrected_files) > 0:
gffs = [file for file in gff_names if f'{file.replace(".gff", "")}_corrected.gff' not in corrected_files]
gffs = gffs + corrected_files
gffs = [os.path.join(input_path_dict[gff], gff) for gff in gffs]
gffs = gffs + corrected_files_w_path

return gene_data_dict, corrected_gff_out_dir, gffs

Expand Down Expand Up @@ -316,9 +324,3 @@ def annotate_refound_genes(gff_name, gene_data_dict, tmp_folder_path, corrected_

if __name__ == '__main__':
pass
# _, _, attribute_dict = read_gene_presence_absence('/Users/mjespersen/OneDrive - The University of Melbourne/Phd/Parts/Recombination_hotspots/Code/Between_core_variation/data_for_unit_tests/test_pan_genome/50_refseq_pan_split_paralogs/gene_presence_absence_roary.csv',
# 1, 0.05)
#
# correct_gffs(['/Users/mjespersen/OneDrive - The University of Melbourne/Phd/Parts/Recombination_hotspots/Code/Between_core_variation/data_for_unit_tests/test_pan_genome/50_refseq_genomes/GCA_008694005.gff'], '/Users/mjespersen/OneDrive - The University of Melbourne/Phd/Parts/Recombination_hotspots/Code/Between_core_variation/data_for_unit_tests/test_pan_genome/50_refseq_pan_split_paralogs/gene_data.csv',
# "/Users/mjespersen/OneDrive - The University of Melbourne/Phd/Parts/Recombination_hotspots/Code/Between_core_variation/data_for_unit_tests", attribute_dict)
# # genome_dict = extract_genome_fasta('/Users/mjespersen/OneDrive - The University of Melbourne/Phd/Parts/Recombination_hotspots/Code/Between_core_variation/data_for_unit_tests/test_pan_genome/50_refseq_genomes/GCA_000006785.gff')
28 changes: 15 additions & 13 deletions Corekaburra/gff_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ def record_core_core_region(core_genes, gff_name, gff_line, contig_end, previous

# Check that a line from gff is provided and previous gene is not a sequence break
if gff_line is not None and previous_core_gene_id != "Sequence_break":
# Check if core gene is fragmented
# Check if core gene is fragmented, if then change coordinates to the last part of the fragment.
if core_genes[gff_name][previous_core_gene_id] == core_genes[gff_name][gff_line[8]]:
previous_core_gene_id = gff_line[8]
previous_core_gene_end_coor = int(gff_line[4])
Expand All @@ -109,17 +109,19 @@ def record_core_core_region(core_genes, gff_name, gff_line, contig_end, previous
core_gene_pair_distance, accessory_gene_content, low_freq_gene_content, core_gene_pairs, master_info)

# Set core cluster names
# If no line from gff is given there is a sequence break,
# If no line from gff is given there is a sequence-break,
# if it is given then set current cluster and try to find previous if not found it is a sequence break
if gff_line is not None:
current_core_gene_cluster = core_genes[gff_name][gff_line[8]]
try:
previous_core_gene_cluster = core_genes[gff_name][previous_core_gene_id]
core_gene_neighbours = sorted([previous_core_gene_cluster, current_core_gene_cluster])
# Catch if previous gene was a sequence break.
except KeyError:
previous_core_gene_cluster = previous_core_gene_id
core_gene_neighbours = [previous_core_gene_cluster, current_core_gene_cluster]

core_gene_neighbours = sorted([previous_core_gene_cluster, current_core_gene_cluster])
# core_gene_neighbours = sorted([previous_core_gene_cluster, current_core_gene_cluster])

else:
current_core_gene_cluster = "Sequence_break"
Expand Down Expand Up @@ -448,7 +450,7 @@ def segment_gff_content(gff_generator, core_genes, low_freq_genes, gff_path, acc
# Set that first core gene has been observed
first_core_gene = False

# Check if first gene on new contig is a core gene, if the record it.
# Check if first gene on new contig is a core gene, if then record it.
elif line[8] in core_genes[gff_name]:
previous_core_gene_id = "Sequence_break"

Expand Down Expand Up @@ -541,15 +543,15 @@ def segment_gff_content(gff_generator, core_genes, low_freq_genes, gff_path, acc
accessory_gene_content,
low_freq_gene_content, core_gene_pairs,
master_info) = record_core_core_region(core_genes, gff_name, None, contig_sizes[previous_contig],
previous_core_gene_id,
previous_core_gene_end_coor,
acc_genes_in_region,
low_freq_genes_in_region,
core_gene_pair_distance,
accessory_gene_content,
low_freq_gene_content,
core_gene_pairs,
master_info)
previous_core_gene_id,
previous_core_gene_end_coor,
acc_genes_in_region,
low_freq_genes_in_region,
core_gene_pair_distance,
accessory_gene_content,
low_freq_gene_content,
core_gene_pairs,
master_info)
else:
# Add a core-less contig if there has been accessory genes:
coreless_contigs = record_coreless_contig(coreless_contigs, acc_genes_in_region, low_freq_genes_in_region, gff_name, line[0])
Expand Down
Loading

0 comments on commit ac96c30

Please sign in to comment.