Merge pull request #6 from milnus/dev

Dev
milnus · Jan 26, 2022 · ac96c30 · ac96c30
2 parents 99b075c + c01a8a7
commit ac96c30
Show file tree

Hide file tree

Showing 85 changed files with 1,209 additions and 157 deletions.
diff --git a/Corekaburra/__main__.py b/Corekaburra/__main__.py
@@ -146,7 +146,7 @@ def main():
     inital_check_time_start = time.time()
 
     # get arguments from the commandline
-    args = get_commandline_arguments(sys.argv[1:])
+    args = get_commandline_arguments(sys.argv[1:], PROGRAM_VERSION)
 
     # Construct output folder
     try:
@@ -195,21 +195,23 @@ def main():
 
     ## Read in gene presence absence file
     time_start_read_files = time.time()
+    # Prepare folder for reannotated genes and examine if any are already present
+    if source_program == "Panaroo" and args.annotate:
+        gene_data_dict, corrected_dir, args.input_gffs = prepair_for_reannotation(gene_data_path, args.output_path,
+                                                                                  args.input_gffs, logger)
+    else:
+        gene_data_dict = None
+        corrected_dir = None
+
     # TODO - ATM the column with presence of gene in genomes is used to define what is core and not. Is it better to use the number of input gffs instead?
     #   - There are upsides to the current. You can use the same genome to find segments for two different populations with in the dataset using the same reference of core-genes
     #   - Making it depend on the input is not viable for comparing runs, even within the same pan-genome, when using different sets of gff files.
     # TODO - Some day it would be awesome to be able to provide a clustering/population structure which could divide genes into the 13 definitions outlined by Horesh et al. [DOI: 10.1099/mgen.0.000670]
     core_dict, low_freq_dict, acc_gene_dict = read_gene_presence_absence(input_pres_abs_file_path, args.core_cutoff,
                                                                          args.low_cutoff, source_program,
-                                                                         args.input_gffs, tmp_folder_path, logger)
+                                                                         args.input_gffs, tmp_folder_path,
+                                                                         gene_data_dict, corrected_dir, logger)
 
-    # Prepair folder for reannotated genes and examine if any are already present
-    if source_program == "Panaroo" and args.annotate:
-        gene_data_dict, corrected_dir, args.input_gffs = prepair_for_reannotation(gene_data_path, args.output_path,
-                                                                                   args.input_gffs, logger)
-    else:
-        gene_data_dict = None
-        corrected_dir = None
 
     time_end_read_files = time.time()
     time_start_passing_gffs = time.time()
@@ -259,7 +261,7 @@ def main():
     time_end_passing_gffs = time.time()
     time_start_segments_search = time.time()
 
-    time_start = time.time()
+    time_start = time.time() # TODO - This seems like a lonely start timer?
     # Count number of unique accessory genes inserted into a core-core region across the genomes
     acc_region_count = {key: len(set(core_neighbour_low_freq[key])) for key in core_neighbour_low_freq}
     # Count number of unique low frequency genes inserted into a core-core region across the genomes
@@ -288,7 +290,7 @@ def main():
     logger.debug("Summary output")
     summary_info_writer(master_summary_info, args.output_path, args.output_prefix)
 
-    if double_edge_segements is not None:
+    if double_edge_segements:
         logger.debug("Segment output")
         segment_writer(double_edge_segements, args.output_path, args.output_prefix)
 

diff --git a/Corekaburra/commandline_interface.py b/Corekaburra/commandline_interface.py
@@ -4,15 +4,16 @@
 EXIT_COMMAND_LINE_ERROR = 2
 
 
-def get_commandline_arguments(args):
+def get_commandline_arguments(args, version):
     """
     Function that takes the input given to the commandline and passes it.
     will check for no input and '-help'
     :param args: List of input arguments given to the commandline
+    :param version: Version of Corekaburra
     :return: matched argument object for passing in main function.
     """
     # Set up parser
-    parser = argparse.ArgumentParser(description='Welcome to Corekaburra!'
+    parser = argparse.ArgumentParser(description='Welcome to Corekaburra! '
                                                  'An extension to pan-genome analyses that summarise genomic regions '
                                                  'between core genes and segments of neighbouring core genes using '
                                                  'gene synteny from a set of input genomes and a pan-genome folder.',
@@ -124,7 +125,10 @@ def get_commandline_arguments(args):
                           action='help',
                           help='Show help function')
 
-
+    rem_args.add_argument('-v',
+                          '--version',
+                          action='version',
+                          version=f'Corekaburra {version}')
 
     # Check if any thing is given as input otherwise warn and print help
     if len(args) < 1:
@@ -137,3 +141,7 @@ def get_commandline_arguments(args):
     args = parser.parse_args(args)
 
     return args
+
+
+if __name__ == '__main__':
+    get_commandline_arguments([], 666)
diff --git a/Corekaburra/consesus_core_genome.py b/Corekaburra/consesus_core_genome.py
@@ -115,27 +115,40 @@ def identify_no_accessory_segments(double_edge_segements, combined_acc_gene_coun
     return sub_segment_dict
 
 
-def identify_segments(core_graph, num_gffs, core_gene_dict):
+def identify_segments(core_graph, num_gffs, core_gene_dict, num_core_graph_components):
     """
     Function to identify stretches of core genes between core genes neighbouring multiple different genes
     :param core_graph: Graph over core genes with weights being the number of connections between the genes
     :param num_gffs: Number of gffs inputted
+    :param core_gene_dict: Dict with keys being genomes, each genome is a dict with keys being genes and values the mapped pan-genome gene cluster.
+
     :return: Dict over stretches of core genes found in the core gene graph.
     """
+    # TODO - Describe missing parameters in docstring
+
+    # TODO - Fix Ouli's problem where the core gene graph may split into two separate pieces, and also handle double chromosomes.
+    #  - Add a check for whether the core gene graph is a single component or multiple. Handle components separately. - Write test, then program
+    #  - This likely requires a change to the all-vs-all search for multi-edge core genes, maybe by adding a try/except statement, or just by handling each component separately.
 
     # Identify all nodes that contain more than two degrees.
     multi_edge_nodes = [node for node, connections in core_graph.degree if connections > 2]
+    # Check if there are multiple components in the core graph; if so, find single-edge core genes
+    if num_core_graph_components > 1:
+        singe_edge_nodes = [node for node, connections in core_graph.degree if connections == 1]
+    else:
+        singe_edge_nodes = []
 
     # Check if any node have multiple edges, if not then return.
-    if len(multi_edge_nodes) == 0:
+    if len(multi_edge_nodes+singe_edge_nodes) == 0:
         return None
 
     # Dict to hold connections between >2 edge nodes
     connect_dict = {}
 
     # for all nodes with >2 degrees themself, identify neighbouring nodes with >2 degrees
-    for node in multi_edge_nodes:
-        connect_dict[node] = [neighbor for neighbor in core_graph.neighbors(node) if neighbor in multi_edge_nodes]
+    for node in multi_edge_nodes+singe_edge_nodes:
+        connect_dict[node] = [neighbor for neighbor in core_graph.neighbors(node)
+                              if neighbor in multi_edge_nodes or neighbor in singe_edge_nodes]
 
     # Turn the weight into a 'distance' or number of times not found together.
     for edge in core_graph.edges(data=True):
@@ -147,8 +160,8 @@ def identify_segments(core_graph, num_gffs, core_gene_dict):
 
     # Go through all source and taget nodes,
     # see if a path can be found where all nodes between them have only two degrees
-    for source_node in multi_edge_nodes:
-        for target_node in multi_edge_nodes:
+    for source_node in multi_edge_nodes+singe_edge_nodes:
+        for target_node in multi_edge_nodes+singe_edge_nodes:
             if target_node != source_node:
                 # Get path (segment) from source to target
                 segment = nx.shortest_path(core_graph, source_node, target_node, weight='weight', method='dijkstra') # bellman-ford or dijkstra
@@ -157,7 +170,7 @@ def identify_segments(core_graph, num_gffs, core_gene_dict):
                 segment_length = len(segment)
 
                 # Get length of segment with multi nodes removed
-                two_degree_segment_length = len([node for node in segment if node not in multi_edge_nodes])
+                two_degree_segment_length = len([node for node in segment if node not in multi_edge_nodes+singe_edge_nodes])
 
                 # Check if no node between the source and target has more than two edges,
                 # if then move to record the segment/path
@@ -183,9 +196,9 @@ def identify_segments(core_graph, num_gffs, core_gene_dict):
                                             f"Path from one node to another ({source_target_name}) was found, but did not match previously found path!")
 
     # Calculate the expected number of paths
-    total_edges_from_multi_edge_nodes = sum([connections for _, connections in core_graph.degree if connections > 2])
-    num_edges_between_multi_edge_nodes = sum([len(connect_dict[key]) for key in connect_dict])
-    expected_segment_number = int((total_edges_from_multi_edge_nodes / 2) - (num_edges_between_multi_edge_nodes / 2)) + len(multi_edge_connect_adjust)
+    total_edges_from_non_two_edge_core_genes = sum([connections for _, connections in core_graph.degree if connections > 2 or connections < 2])
+    num_edges_between_non_two_edge_core_genes = sum([len(connect_dict[key]) for key in connect_dict])
+    expected_segment_number = int((total_edges_from_non_two_edge_core_genes / 2) - (num_edges_between_non_two_edge_core_genes / 2)) + len(multi_edge_connect_adjust)
 
     # Check if less than the number of expected paths has been found,
     # if then try to identify missing paths
@@ -285,6 +298,7 @@ def determine_genome_segments(core_neighbour_pairs, combined_acc_gene_count, num
     :param combined_acc_gene_count: Number of accessory and low-frequency genes detected between core gene pairs
     :param num_gffs: Number of inputted gff files # TODO - Should this be the minimum number determined by the input cut-off for core genes
     :param logger: Program logger
+    # TODO - Add parameters
 
     :return double_edge_segements:
     :return no_acc_segments:
@@ -294,11 +308,22 @@ def determine_genome_segments(core_neighbour_pairs, combined_acc_gene_count, num
 
     # Construct a graph from core gene neighbours
     core_graph = construct_core_graph(core_neighbour_pairs)
+    num_core_graph_components = nx.number_connected_components(core_graph)
+
+    logger.debug(f'Identified: {num_core_graph_components} components in core genome graph')
+
+    double_edge_segements = {}
+    # Identify all segments in components of core graph
+    for component in nx.connected_components(core_graph):
+        logger.debug(f'Searching component related to: {component}')
 
-    # Find segments in the genome between core genes with multiple neighbors
-    double_edge_segements = identify_segments(core_graph, num_gffs, core_gene_dict)
+        component_graph = core_graph.subgraph(component).copy()
+        return_segments = identify_segments(component_graph, num_gffs, core_gene_dict, num_core_graph_components)
+        if return_segments is not None:
+            double_edge_segements = double_edge_segements | return_segments
 
-    if double_edge_segements is not None:
+    # if double_edge_segements is not None:
+    if double_edge_segements:
         logger.debug(f'A total of {len(double_edge_segements)} core genes were identified to have multiple neighbours.')
         logger.debug(f'Genes with multiple neighbours: {double_edge_segements}')
 
@@ -316,4 +341,4 @@ def determine_genome_segments(core_neighbour_pairs, combined_acc_gene_count, num
 
 
 if __name__ == '__main__':
-    pass
+    pass
diff --git a/Corekaburra/correct_gffs.py b/Corekaburra/correct_gffs.py
@@ -30,10 +30,11 @@ def read_gene_data(gene_data_file):
         for line in gene_data.readlines():
             # Split read line at commas
             line = line.split(',')
+            # TODO - Scaffold (contig) name can be found in second position of a gene_data.csv line. This could possibly be used to speed things up so that the entire set of contigs isn't required for search.
 
             # Check if refound gene
             if 'refound' in line[2]:
-                # Try to add the refound gene to the gene_data dict as a second key, value being the DNA sequence,
+                # Try to add the refound gene to the gene_data dict as a second key, value being the DNA sequence, name, and function in that order.
                 # if the first key (genome) is not found in gene_data dict,
                 # then construct dict for the genome and add the gene
                 try:
@@ -69,16 +70,23 @@ def prepair_for_reannotation(gene_data_path, output_folder, gffs, logger):
     try:
         os.mkdir(corrected_gff_out_dir)
     except FileExistsError:
+        # Get path for input
+        input_path_dict = {os.path.basename(gff): os.path.split(gff)[0] for gff in gffs}
+        # input_path = os.path.split(gffs[0])[0]
+
         corrected_folder_content = os.listdir(corrected_gff_out_dir)
 
         gff_names = [os.path.basename(gff) for gff in gffs]
 
         corrected_files = [file for file in corrected_folder_content if
                            f'{file.split("_corrected")[0]}.gff' in gff_names]
 
+        corrected_files_w_path = [os.path.join(corrected_gff_out_dir, file) for file in corrected_files]
+
         if len(corrected_files) > 0:
             gffs = [file for file in gff_names if f'{file.replace(".gff", "")}_corrected.gff' not in corrected_files]
-            gffs = gffs + corrected_files
+            gffs = [os.path.join(input_path_dict[gff], gff) for gff in gffs]
+            gffs = gffs + corrected_files_w_path
 
     return gene_data_dict, corrected_gff_out_dir, gffs
 
@@ -316,9 +324,3 @@ def annotate_refound_genes(gff_name, gene_data_dict, tmp_folder_path, corrected_
 
 if __name__ == '__main__':
     pass
-    # _, _, attribute_dict = read_gene_presence_absence('/Users/mjespersen/OneDrive - The University of Melbourne/Phd/Parts/Recombination_hotspots/Code/Between_core_variation/data_for_unit_tests/test_pan_genome/50_refseq_pan_split_paralogs/gene_presence_absence_roary.csv',
-    #                                                   1, 0.05)
-    #
-    # correct_gffs(['/Users/mjespersen/OneDrive - The University of Melbourne/Phd/Parts/Recombination_hotspots/Code/Between_core_variation/data_for_unit_tests/test_pan_genome/50_refseq_genomes/GCA_008694005.gff'], '/Users/mjespersen/OneDrive - The University of Melbourne/Phd/Parts/Recombination_hotspots/Code/Between_core_variation/data_for_unit_tests/test_pan_genome/50_refseq_pan_split_paralogs/gene_data.csv',
-    #                  "/Users/mjespersen/OneDrive - The University of Melbourne/Phd/Parts/Recombination_hotspots/Code/Between_core_variation/data_for_unit_tests", attribute_dict)
-    # # genome_dict = extract_genome_fasta('/Users/mjespersen/OneDrive - The University of Melbourne/Phd/Parts/Recombination_hotspots/Code/Between_core_variation/data_for_unit_tests/test_pan_genome/50_refseq_genomes/GCA_000006785.gff')
diff --git a/Corekaburra/gff_parser.py b/Corekaburra/gff_parser.py
@@ -99,7 +99,7 @@ def record_core_core_region(core_genes, gff_name, gff_line, contig_end, previous
 
     # Check that a line from gff is provided and previous gene is not a sequence break
     if gff_line is not None and previous_core_gene_id != "Sequence_break":
-        # Check if core gene is fragmented
+        # Check if core gene is fragmented; if so, change coordinates to the last part of the fragment.
         if core_genes[gff_name][previous_core_gene_id] == core_genes[gff_name][gff_line[8]]:
             previous_core_gene_id = gff_line[8]
             previous_core_gene_end_coor = int(gff_line[4])
@@ -109,17 +109,19 @@ def record_core_core_region(core_genes, gff_name, gff_line, contig_end, previous
                     core_gene_pair_distance, accessory_gene_content, low_freq_gene_content, core_gene_pairs, master_info)
 
     # Set core cluster names
-    # If no line from gff is given there is a sequence break,
+    # If no line from gff is given there is a sequence-break,
     # if it is given then set current cluster and try to find previous if not found it is a sequence break
     if gff_line is not None:
         current_core_gene_cluster = core_genes[gff_name][gff_line[8]]
         try:
             previous_core_gene_cluster = core_genes[gff_name][previous_core_gene_id]
+            core_gene_neighbours = sorted([previous_core_gene_cluster, current_core_gene_cluster])
         # Catch is previous gene was a sequence break.
         except KeyError:
             previous_core_gene_cluster = previous_core_gene_id
+            core_gene_neighbours = [previous_core_gene_cluster, current_core_gene_cluster]
 
-        core_gene_neighbours = sorted([previous_core_gene_cluster, current_core_gene_cluster])
+        # core_gene_neighbours = sorted([previous_core_gene_cluster, current_core_gene_cluster])
 
     else:
         current_core_gene_cluster = "Sequence_break"
@@ -448,7 +450,7 @@ def segment_gff_content(gff_generator, core_genes, low_freq_genes, gff_path, acc
                 # Set that first core gene has been observed
                 first_core_gene = False
 
-            # Check if first gene on new contig is a core gene, if the record it.
+            # Check if first gene on new contig is a core gene; if so, record it.
             elif line[8] in core_genes[gff_name]:
                 previous_core_gene_id = "Sequence_break"
 
@@ -541,15 +543,15 @@ def segment_gff_content(gff_generator, core_genes, low_freq_genes, gff_path, acc
              accessory_gene_content,
              low_freq_gene_content, core_gene_pairs,
             master_info) = record_core_core_region(core_genes, gff_name, None, contig_sizes[previous_contig],
-                                                                             previous_core_gene_id,
-                                                                             previous_core_gene_end_coor,
-                                                                             acc_genes_in_region,
-                                                                             low_freq_genes_in_region,
-                                                                             core_gene_pair_distance,
-                                                                             accessory_gene_content,
-                                                                             low_freq_gene_content,
-                                                                             core_gene_pairs,
-                                                                             master_info)
+                                                   previous_core_gene_id,
+                                                   previous_core_gene_end_coor,
+                                                   acc_genes_in_region,
+                                                   low_freq_genes_in_region,
+                                                   core_gene_pair_distance,
+                                                   accessory_gene_content,
+                                                   low_freq_gene_content,
+                                                   core_gene_pairs,
+                                                   master_info)
         else:
             # Add a core-less contig if there has been accessory genes:
             coreless_contigs = record_coreless_contig(coreless_contigs, acc_genes_in_region, low_freq_genes_in_region, gff_name, line[0])