From e23b183cde41b496146055e43ef7dc2a177b16be Mon Sep 17 00:00:00 2001 From: AndreaGuarracino Date: Wed, 11 Dec 2024 20:56:55 -0600 Subject: [PATCH 01/45] it seems to work! --- src/main.rs | 169 +++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 166 insertions(+), 3 deletions(-) diff --git a/src/main.rs b/src/main.rs index 6f63cf2..bc159d7 100644 --- a/src/main.rs +++ b/src/main.rs @@ -31,7 +31,9 @@ struct RangeInfo { start: usize, end: usize, gfa_id: usize, - steps: Vec, // Store the path steps for this range + steps: Vec, // Path steps for this range + step_positions: Vec<(usize, usize)>, // Start and end positions of each step + step_lengths: Vec, // Lengths of each step } // Helper function to check if two ranges are contiguous @@ -176,10 +178,24 @@ fn main() { if let Some((sample_hap_name, start, end)) = split_path_name(&path_name) { // Get the path steps and translate their IDs let mut translated_steps = Vec::new(); + let mut step_positions = Vec::new(); + let mut step_lengths = Vec::new(); + let mut cumulative_pos = start; + if let Some(path_ref) = block_graph.get_path_ref(path_id) { for step in path_ref.nodes.iter() { let translated_id = id_translation + step.id().into(); - translated_steps.push(Handle::pack(translated_id, step.is_reverse())); + let translated_step = Handle::pack(translated_id, step.is_reverse()); + translated_steps.push(translated_step); + + // Get the sequence length of the node + let node_seq = block_graph.sequence(*step).collect::>(); + let node_length = node_seq.len(); + step_lengths.push(node_length); + + // Record the end position of this step + step_positions.push((cumulative_pos, cumulative_pos + node_length)); + cumulative_pos = cumulative_pos + node_length; } } @@ -190,6 +206,8 @@ fn main() { end, gfa_id, steps: translated_steps, + step_positions, + step_lengths, }); } } @@ -202,11 +220,156 @@ fn main() { eprintln!("Total path ranges collected: {}", path_key_ranges.len()); } + let mut next_node_id_value = u64::from(combined_graph.max_node_id()) + 1; + // Sort ranges and create merged paths in the combined graph for (path_key, ranges) in path_key_ranges.iter_mut() { // Sort ranges by start position ranges.sort_by_key(|r| (r.start, r.end)); - + + // Process overlaps + for i in 1..ranges.len() { + let (left, right) = ranges.split_at_mut(i); + let r1 = &mut left[left.len()-1]; + let r2 = &mut right[0]; + + if has_overlap(r1, r2) { + // Calculate the overlap region + let overlap_start = r2.start; + let overlap_end = r1.end; + + // Adjust r2 to remove the overlap + let mut steps_to_remove = Vec::new(); + let mut steps_to_split = Vec::new(); + + for (idx, &(step_start, step_end)) in r2.step_positions.iter().enumerate() { + if step_end <= overlap_start { + //println!("\tStep {} before overlap", idx); + continue; + } else if step_start >= overlap_end { + //println!("\tStep {} after overlap", idx); + break; + } else if step_start >= overlap_start && step_end <= overlap_end { + //println!("\tStep {} within overlap", idx); + steps_to_remove.push(idx); + } else { + //println!("\tStep {} partially overlaps", idx); + steps_to_split.push(idx); + } + } + + // Initialize new vectors to store updated steps + let mut new_steps = Vec::new(); + let mut new_step_positions = Vec::new(); + let mut new_step_lengths = Vec::new(); + + // Iterate over the original steps + for idx in 0..r2.steps.len() { + let step_handle = r2.steps[idx]; + let (step_start, step_end) = r2.step_positions[idx]; + + if steps_to_remove.contains(&idx) { + // Skip steps to remove + continue; + } else if steps_to_split.contains(&idx) { + // Split nodes for steps that partially overlap + let node_seq = combined_graph.sequence(step_handle).collect::>(); + let overlap_within_step_start = std::cmp::max(step_start, overlap_start); + let overlap_within_step_end = std::cmp::min(step_end, overlap_end); + let overlap_start_offset = overlap_within_step_start - step_start; + let overlap_end_offset = overlap_within_step_end - step_start; + + if step_start < overlap_start && step_end > overlap_end { + // Split into three parts + let left_seq = &node_seq[0..overlap_start_offset]; + let right_seq = &node_seq[overlap_end_offset..]; + + let left_id = NodeId::from(next_node_id_value); + next_node_id_value += 1; + let right_id = NodeId::from(next_node_id_value); + next_node_id_value += 1; + + let left_node = combined_graph.create_handle(left_seq, left_id); + let right_node = combined_graph.create_handle(right_seq, right_id); + + let left_handle = if step_handle.is_reverse() { + left_node.flip() + } else { + left_node + }; + let right_handle = if step_handle.is_reverse() { + right_node.flip() + } else { + right_node + }; + + new_steps.push(left_handle); + new_step_positions.push((step_start, overlap_start)); + new_step_lengths.push(left_seq.len()); + + new_steps.push(right_handle); + new_step_positions.push((overlap_end, step_end)); + new_step_lengths.push(right_seq.len()); + + combined_graph.create_edge(Edge(left_handle, right_handle)); + } else if step_start < overlap_start { + // Keep left part + let new_seq = &node_seq[0..overlap_start_offset]; + let node_id = NodeId::from(next_node_id_value); + next_node_id_value += 1; + let new_node = combined_graph.create_handle(new_seq, node_id); + let new_handle = if step_handle.is_reverse() { + new_node.flip() + } else { + new_node + }; + + new_steps.push(new_handle); + new_step_positions.push((step_start, overlap_start)); + new_step_lengths.push(new_seq.len()); + } else if step_end > overlap_end { + // Keep right part + let new_seq = &node_seq[overlap_end_offset..]; + let node_id = NodeId::from(next_node_id_value); + next_node_id_value += 1; + let new_node = combined_graph.create_handle(new_seq, node_id); + let new_handle = if step_handle.is_reverse() { + new_node.flip() + } else { + new_node + }; + + new_steps.push(new_handle); + new_step_positions.push((overlap_end, step_end)); + new_step_lengths.push(new_seq.len()); + } + } else { + // Keep steps that are not to be removed or split + new_steps.push(step_handle); + new_step_positions.push((step_start, step_end)); + new_step_lengths.push(r2.step_lengths[idx]); + } + } + + // Update r2 with the new steps + r2.steps = new_steps; + r2.step_positions = new_step_positions; + r2.step_lengths = new_step_lengths; + + // Update edges for the modified steps + for idx in 0..r2.steps.len() { + if idx > 0 { + let prev_step = r2.steps[idx - 1]; + if !combined_graph.has_edge(prev_step, r2.steps[idx]) { + combined_graph.create_edge(Edge(prev_step, r2.steps[idx])); + } + } + } + + r2.start = overlap_end; + } + } + // Check for overlaps and contiguity let mut has_overlaps = false; let mut all_contiguous = true; From 3391811ade869011eaaade234fc2ad33b91e405b Mon Sep 17 00:00:00 2001 From: AndreaGuarracino Date: Wed, 11 Dec 2024 21:15:49 -0600 Subject: [PATCH 02/45] style: Improve debug output formatting in main function --- src/main.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main.rs b/src/main.rs index bc159d7..348ce02 100644 --- a/src/main.rs +++ b/src/main.rs @@ -215,8 +215,8 @@ fn main() { } if args.debug { - eprintln!("Total nodes in combined graph: {}", combined_graph.node_count()); - eprintln!("Total edges in combined graph: {}", combined_graph.edge_count()); + eprintln!("Total nodes in the combined graph: {}", combined_graph.node_count()); + eprintln!("Total edges in the combined graph: {}", combined_graph.edge_count()); eprintln!("Total path ranges collected: {}", path_key_ranges.len()); } From 287fc69eb2c2bd6fae232e5a29b5e1d59d5a9e05 Mon Sep 17 00:00:00 2001 From: "AndreaGuarracino (aider)" Date: Wed, 11 Dec 2024 21:17:53 -0600 Subject: [PATCH 03/45] feat: Add debug message for detected overlaps with range details --- src/main.rs | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/main.rs b/src/main.rs index 348ce02..ac5e7df 100644 --- a/src/main.rs +++ b/src/main.rs @@ -234,6 +234,17 @@ fn main() { let r2 = &mut right[0]; if has_overlap(r1, r2) { + if args.debug { + let overlap_start = std::cmp::max(r1.start, r2.start); + let overlap_end = std::cmp::min(r1.end, r2.end); + let overlap_amount = overlap_end - overlap_start; + + eprintln!( + "Overlap detected in path '{}': Range1 [start={}, end={}], Range2 [start={}, end={}], overlap size={}", + path_key, r1.start, r1.end, r2.start, r2.start, overlap_amount + ); + } + // Calculate the overlap region let overlap_start = r2.start; let overlap_end = r1.end; From f4f230e3e20d8afd6ae3af27b4119e186d17127e Mon Sep 17 00:00:00 2001 From: AndreaGuarracino Date: Thu, 12 Dec 2024 11:41:09 -0600 Subject: [PATCH 04/45] typo --- Cargo.lock | 38 +++++++++++++++++++------------------- src/main.rs | 2 +- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 42c737b..0880f37 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -125,9 +125,9 @@ dependencies = [ [[package]] name = "bstr" -version = "1.11.0" +version = "1.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a68f1f47cdf0ec8ee4b941b2eee2a80cb796db73118c0dd09ac63fbe405be22" +checksum = "786a307d683a5bf92e6fd5fd69a7eb613751668d1d8d67d802846dfe367c62c8" dependencies = [ "memchr", "regex-automata 0.4.9", @@ -183,9 +183,9 @@ dependencies = [ [[package]] name = "cc" -version = "1.2.2" +version = "1.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f34d93e62b03caf570cccc334cbc6c2fceca82f39211051345108adcba3eebdc" +checksum = "27f657647bcff5394bf56c7317665bbf790a137a50eaaa5c6bfbb9e27a518f2d" dependencies = [ "jobserver", "libc", @@ -200,9 +200,9 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] name = "clap" -version = "4.5.22" +version = "4.5.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "69371e34337c4c984bbe322360c2547210bf632eb2814bbe78a6e87a2935bd2b" +checksum = "3135e7ec2ef7b10c6ed8950f0f792ed96ee093fa088608f1c76e569722700c84" dependencies = [ "clap_builder", "clap_derive", @@ -210,9 +210,9 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.5.22" +version = "4.5.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6e24c1b4099818523236a8ca881d2b45db98dadfb4625cf6608c12069fcbbde1" +checksum = "30582fc632330df2bd26877bde0c1f4470d57c582bbc070376afcd04d8cb4838" dependencies = [ "anstream", "anstyle", @@ -234,9 +234,9 @@ dependencies = [ [[package]] name = "clap_lex" -version = "0.7.3" +version = "0.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "afb84c814227b90d6895e01398aee0d8033c00e7466aca416fb6a8e0eb19d8a7" +checksum = "f46ad14479a25103f283c0f10005961cf086d8dc42205bb44c46ac563475dca6" [[package]] name = "colorchoice" @@ -329,7 +329,7 @@ dependencies = [ name = "gfalace" version = "0.1.0" dependencies = [ - "bstr 1.11.0", + "bstr 1.11.1", "clap", "gfa", "handlegraph", @@ -394,9 +394,9 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.167" +version = "0.2.168" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09d6582e104315a817dff97f75133544b2e094ee22447d2acf4a74e189ba06fc" +checksum = "5aaeb2981e0606ca11d79718f8bb01164f1d6ed75080182d3abf017e6d244b6d" [[package]] name = "liblzma" @@ -409,9 +409,9 @@ dependencies = [ [[package]] name = "liblzma-sys" -version = "0.3.9" +version = "0.3.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6630cb23edeb2e563cd6c30d4117554c69646871455843c33ddcb1d9aef82ecf" +checksum = "41e2171ce6827cbab9bc97238a58361bf9a526080475f21dbc470e1842258b2d" dependencies = [ "cc", "libc", @@ -577,18 +577,18 @@ checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f" [[package]] name = "serde" -version = "1.0.215" +version = "1.0.216" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6513c1ad0b11a9376da888e3e0baa0077f1aed55c17f50e7b2397136129fb88f" +checksum = "0b9781016e935a97e8beecf0c933758c97a5520d32930e460142b4cd80c6338e" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.215" +version = "1.0.216" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ad1e866f866923f252f05c889987993144fb74e722403468a4ebd70c3cd756c0" +checksum = "46f859dbbf73865c6627ed570e78961cd3ac92407a2d117204c49232485da55e" dependencies = [ "proc-macro2", "quote", diff --git a/src/main.rs b/src/main.rs index ac5e7df..e7a6b0a 100644 --- a/src/main.rs +++ b/src/main.rs @@ -241,7 +241,7 @@ fn main() { eprintln!( "Overlap detected in path '{}': Range1 [start={}, end={}], Range2 [start={}, end={}], overlap size={}", - path_key, r1.start, r1.end, r2.start, r2.start, overlap_amount + path_key, r1.start, r1.end, r2.start, r2.end, overlap_amount ); } From febd44ecfe9625bf46cef4b694b15717814e9cbd Mon Sep 17 00:00:00 2001 From: AndreaGuarracino Date: Thu, 12 Dec 2024 11:59:14 -0600 Subject: [PATCH 05/45] fix: Improve overlap detection logging in main function --- src/main.rs | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/src/main.rs b/src/main.rs index e7a6b0a..e05ee18 100644 --- a/src/main.rs +++ b/src/main.rs @@ -234,25 +234,22 @@ fn main() { let r2 = &mut right[0]; if has_overlap(r1, r2) { + // Calculate the overlap region + let overlap_start = r2.start; + let overlap_end = r1.end; + if args.debug { - let overlap_start = std::cmp::max(r1.start, r2.start); - let overlap_end = std::cmp::min(r1.end, r2.end); let overlap_amount = overlap_end - overlap_start; eprintln!( - "Overlap detected in path '{}': Range1 [start={}, end={}], Range2 [start={}, end={}], overlap size={}", - path_key, r1.start, r1.end, r2.start, r2.end, overlap_amount + "Overlap detected in path '{}': Range1 [start={}, end={}], Range2 [start={}, end={}], Overlap [start={}, end={}], Overlap size={}", + path_key, r1.start, r1.end, r2.start, r2.end, overlap_start, overlap_end, overlap_amount ); } - // Calculate the overlap region - let overlap_start = r2.start; - let overlap_end = r1.end; - // Adjust r2 to remove the overlap let mut steps_to_remove = Vec::new(); let mut steps_to_split = Vec::new(); - for (idx, &(step_start, step_end)) in r2.step_positions.iter().enumerate() { if step_end <= overlap_start { //println!("\tStep {} before overlap", idx); From 84f0a91b3876ad7c4eb739689397c2e7ef3a24c0 Mon Sep 17 00:00:00 2001 From: "AndreaGuarracino (aider)" Date: Thu, 12 Dec 2024 11:59:15 -0600 Subject: [PATCH 06/45] feat: Remove ranges contained within other ranges before processing overlaps --- src/main.rs | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/src/main.rs b/src/main.rs index e05ee18..3ad7062 100644 --- a/src/main.rs +++ b/src/main.rs @@ -26,7 +26,7 @@ struct Args { debug: bool, } -#[derive(Debug)] +#[derive(Debug, Clone)] struct RangeInfo { start: usize, end: usize, @@ -227,6 +227,31 @@ fn main() { // Sort ranges by start position ranges.sort_by_key(|r| (r.start, r.end)); + // Remove ranges that are contained within other ranges + if !ranges.is_empty() { + let mut filtered_ranges = Vec::new(); + let mut prev_range = &ranges[0]; + filtered_ranges.push(prev_range.clone()); + + for range in ranges.iter().skip(1) { + if range.start >= prev_range.start && range.end <= prev_range.end { + // Current range is fully contained within prev_range, skip it + if args.debug { + eprintln!( + "Range [start={}, end={}] is contained within [start={}, end={}] and will be removed.", + range.start, range.end, prev_range.start, prev_range.end + ); + } + continue; + } else { + filtered_ranges.push(range.clone()); + prev_range = range; + } + } + + *ranges = filtered_ranges; + } + // Process overlaps for i in 1..ranges.len() { let (left, right) = ranges.split_at_mut(i); From 4098f5e1ad2c9f14a1498befa2cffcd667654d78 Mon Sep 17 00:00:00 2001 From: AndreaGuarracino Date: Thu, 12 Dec 2024 12:10:15 -0600 Subject: [PATCH 07/45] feat: Add debug logging for range processing and overlap detection --- src/main.rs | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/src/main.rs b/src/main.rs index 3ad7062..5cbe607 100644 --- a/src/main.rs +++ b/src/main.rs @@ -227,6 +227,13 @@ fn main() { // Sort ranges by start position ranges.sort_by_key(|r| (r.start, r.end)); + if args.debug { + eprintln!("Processing path key '{}'", path_key); + for range in ranges.iter() { + eprintln!(" Range: start={}, end={}, gfa_id={}", range.start, range.end, range.gfa_id); + } + } + // Remove ranges that are contained within other ranges if !ranges.is_empty() { let mut filtered_ranges = Vec::new(); @@ -238,7 +245,7 @@ fn main() { // Current range is fully contained within prev_range, skip it if args.debug { eprintln!( - "Range [start={}, end={}] is contained within [start={}, end={}] and will be removed.", + "Redundant range detected: Range [start={}, end={}] is contained within [start={}, end={}] and will be removed.", range.start, range.end, prev_range.start, prev_range.end ); } @@ -267,8 +274,8 @@ fn main() { let overlap_amount = overlap_end - overlap_start; eprintln!( - "Overlap detected in path '{}': Range1 [start={}, end={}], Range2 [start={}, end={}], Overlap [start={}, end={}], Overlap size={}", - path_key, r1.start, r1.end, r2.start, r2.end, overlap_start, overlap_end, overlap_amount + "Overlap detected: Range1 [start={}, end={}], Range2 [start={}, end={}], Overlap [start={}, end={}], Overlap size={}", + r1.start, r1.end, r2.start, r2.end, overlap_start, overlap_end, overlap_amount ); } From c3a23ad38ce47d03a9125eeb63ce0a7449965d66 Mon Sep 17 00:00:00 2001 From: "AndreaGuarracino (aider)" Date: Thu, 12 Dec 2024 12:10:16 -0600 Subject: [PATCH 08/45] fix: Handle bidirectional range containment to remove redundancies --- src/main.rs | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/src/main.rs b/src/main.rs index 5cbe607..021705f 100644 --- a/src/main.rs +++ b/src/main.rs @@ -237,10 +237,9 @@ fn main() { // Remove ranges that are contained within other ranges if !ranges.is_empty() { let mut filtered_ranges = Vec::new(); - let mut prev_range = &ranges[0]; - filtered_ranges.push(prev_range.clone()); + let mut prev_range = ranges[0].clone(); - for range in ranges.iter().skip(1) { + for range in ranges.iter() { if range.start >= prev_range.start && range.end <= prev_range.end { // Current range is fully contained within prev_range, skip it if args.debug { @@ -250,12 +249,25 @@ fn main() { ); } continue; + } else if prev_range.start >= range.start && prev_range.end <= range.end { + // Prev_range is fully contained within current range, replace prev_range with current range + if args.debug { + eprintln!( + "Redundant range detected: Previous range [start={}, end={}] is contained within [start={}, end={}] and will be replaced.", + prev_range.start, prev_range.end, range.start, range.end + ); + } + prev_range = range.clone(); } else { - filtered_ranges.push(range.clone()); - prev_range = range; + // No containment, add the previous range and update prev_range + filtered_ranges.push(prev_range.clone()); + prev_range = range.clone(); } } + // Don't forget to add the last range + filtered_ranges.push(prev_range); + *ranges = filtered_ranges; } From 24c302604bc015ed6f59653a2c81542f47cadd12 Mon Sep 17 00:00:00 2001 From: AndreaGuarracino Date: Thu, 12 Dec 2024 15:43:59 -0600 Subject: [PATCH 09/45] refactor: Skip the first range in the iteration for clarity --- src/main.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main.rs b/src/main.rs index 021705f..90c52cc 100644 --- a/src/main.rs +++ b/src/main.rs @@ -239,7 +239,7 @@ fn main() { let mut filtered_ranges = Vec::new(); let mut prev_range = ranges[0].clone(); - for range in ranges.iter() { + for range in ranges.iter().skip(1) { if range.start >= prev_range.start && range.end <= prev_range.end { // Current range is fully contained within prev_range, skip it if args.debug { From bf88718e2923d758a5b5061b9f21e4452f598e0c Mon Sep 17 00:00:00 2001 From: "AndreaGuarracino (aider)" Date: Thu, 12 Dec 2024 15:44:00 -0600 Subject: [PATCH 10/45] fix: Adjust overlap offset calculations for handle orientation in sequences --- src/main.rs | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/src/main.rs b/src/main.rs index 90c52cc..f38d52d 100644 --- a/src/main.rs +++ b/src/main.rs @@ -328,8 +328,18 @@ fn main() { let node_seq = combined_graph.sequence(step_handle).collect::>(); let overlap_within_step_start = std::cmp::max(step_start, overlap_start); let overlap_within_step_end = std::cmp::min(step_end, overlap_end); - let overlap_start_offset = overlap_within_step_start - step_start; - let overlap_end_offset = overlap_within_step_end - step_start; + + let (overlap_start_offset, overlap_end_offset) = if !step_handle.is_reverse() { + // Forward handle + let start_offset = overlap_within_step_start - step_start; + let end_offset = overlap_within_step_end - step_start; + (start_offset, end_offset) + } else { + // Reverse handle + let start_offset = step_end - overlap_within_step_end; + let end_offset = step_end - overlap_within_step_start; + (start_offset, end_offset) + }; if step_start < overlap_start && step_end > overlap_end { // Split into three parts From 3a2b2597af3c95a4fd1c1f412a59813a39bc6a2f Mon Sep 17 00:00:00 2001 From: "AndreaGuarracino (aider)" Date: Thu, 12 Dec 2024 15:47:00 -0600 Subject: [PATCH 11/45] fix: Prevent panic by checking for empty sequences before node creation --- src/main.rs | 116 +++++++++++++++++++++++++++++----------------------- 1 file changed, 64 insertions(+), 52 deletions(-) diff --git a/src/main.rs b/src/main.rs index f38d52d..b05748e 100644 --- a/src/main.rs +++ b/src/main.rs @@ -346,64 +346,76 @@ fn main() { let left_seq = &node_seq[0..overlap_start_offset]; let right_seq = &node_seq[overlap_end_offset..]; - let left_id = NodeId::from(next_node_id_value); - next_node_id_value += 1; - let right_id = NodeId::from(next_node_id_value); - next_node_id_value += 1; - - let left_node = combined_graph.create_handle(left_seq, left_id); - let right_node = combined_graph.create_handle(right_seq, right_id); - - let left_handle = if step_handle.is_reverse() { - left_node.flip() - } else { - left_node - }; - let right_handle = if step_handle.is_reverse() { - right_node.flip() - } else { - right_node - }; - - new_steps.push(left_handle); - new_step_positions.push((step_start, overlap_start)); - new_step_lengths.push(left_seq.len()); - - new_steps.push(right_handle); - new_step_positions.push((overlap_end, step_end)); - new_step_lengths.push(right_seq.len()); - - combined_graph.create_edge(Edge(left_handle, right_handle)); + // Only create left node if sequence is not empty + let mut left_handle = None; + if !left_seq.is_empty() { + let left_id = NodeId::from(next_node_id_value); + next_node_id_value += 1; + let left_node = combined_graph.create_handle(left_seq, left_id); + left_handle = Some(if step_handle.is_reverse() { + left_node.flip() + } else { + left_node + }); + new_steps.push(left_handle.unwrap()); + new_step_positions.push((step_start, overlap_start)); + new_step_lengths.push(left_seq.len()); + } + + // Only create right node if sequence is not empty + let mut right_handle = None; + if !right_seq.is_empty() { + let right_id = NodeId::from(next_node_id_value); + next_node_id_value += 1; + let right_node = combined_graph.create_handle(right_seq, right_id); + right_handle = Some(if step_handle.is_reverse() { + right_node.flip() + } else { + right_node + }); + new_steps.push(right_handle.unwrap()); + new_step_positions.push((overlap_end, step_end)); + new_step_lengths.push(right_seq.len()); + } + + // Create edge only if both nodes exist + if let (Some(left), Some(right)) = (left_handle, right_handle) { + combined_graph.create_edge(Edge(left, right)); + } } else if step_start < overlap_start { // Keep left part let new_seq = &node_seq[0..overlap_start_offset]; - let node_id = NodeId::from(next_node_id_value); - next_node_id_value += 1; - let new_node = combined_graph.create_handle(new_seq, node_id); - let new_handle = if step_handle.is_reverse() { - new_node.flip() - } else { - new_node - }; - - new_steps.push(new_handle); - new_step_positions.push((step_start, overlap_start)); - new_step_lengths.push(new_seq.len()); + if !new_seq.is_empty() { + let node_id = NodeId::from(next_node_id_value); + next_node_id_value += 1; + let new_node = combined_graph.create_handle(new_seq, node_id); + let new_handle = if step_handle.is_reverse() { + new_node.flip() + } else { + new_node + }; + + new_steps.push(new_handle); + new_step_positions.push((step_start, overlap_start)); + new_step_lengths.push(new_seq.len()); + } } else if step_end > overlap_end { // Keep right part let new_seq = &node_seq[overlap_end_offset..]; - let node_id = NodeId::from(next_node_id_value); - next_node_id_value += 1; - let new_node = combined_graph.create_handle(new_seq, node_id); - let new_handle = if step_handle.is_reverse() { - new_node.flip() - } else { - new_node - }; - - new_steps.push(new_handle); - new_step_positions.push((overlap_end, step_end)); - new_step_lengths.push(new_seq.len()); + if !new_seq.is_empty() { + let node_id = NodeId::from(next_node_id_value); + next_node_id_value += 1; + let new_node = combined_graph.create_handle(new_seq, node_id); + let new_handle = if step_handle.is_reverse() { + new_node.flip() + } else { + new_node + }; + + new_steps.push(new_handle); + new_step_positions.push((overlap_end, step_end)); + new_step_lengths.push(new_seq.len()); + } } } else { // Keep steps that are not to be removed or split From 1fc2e9f205935e655166e58eacd674fd0e29dcc6 Mon Sep 17 00:00:00 2001 From: "AndreaGuarracino (aider)" Date: Thu, 12 Dec 2024 15:49:18 -0600 Subject: [PATCH 12/45] fix: Improve overlap handling to prevent sequence discrepancies in GFA processing --- src/main.rs | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/src/main.rs b/src/main.rs index b05748e..8c81447 100644 --- a/src/main.rs +++ b/src/main.rs @@ -329,15 +329,17 @@ fn main() { let overlap_within_step_start = std::cmp::max(step_start, overlap_start); let overlap_within_step_end = std::cmp::min(step_end, overlap_end); + // Calculate offsets relative to the node sequence + let node_len = node_seq.len(); let (overlap_start_offset, overlap_end_offset) = if !step_handle.is_reverse() { - // Forward handle - let start_offset = overlap_within_step_start - step_start; - let end_offset = overlap_within_step_end - step_start; + // Forward handle - offsets from start of sequence + let start_offset = (overlap_within_step_start - step_start).min(node_len); + let end_offset = (overlap_within_step_end - step_start).min(node_len); (start_offset, end_offset) } else { - // Reverse handle - let start_offset = step_end - overlap_within_step_end; - let end_offset = step_end - overlap_within_step_start; + // Reverse handle - offsets from end of sequence + let start_offset = (step_end - overlap_within_step_end).min(node_len); + let end_offset = (step_end - overlap_within_step_start).min(node_len); (start_offset, end_offset) }; @@ -346,9 +348,9 @@ fn main() { let left_seq = &node_seq[0..overlap_start_offset]; let right_seq = &node_seq[overlap_end_offset..]; - // Only create left node if sequence is not empty + // Only create left node if sequence is not empty and within bounds let mut left_handle = None; - if !left_seq.is_empty() { + if !left_seq.is_empty() && overlap_start_offset > 0 { let left_id = NodeId::from(next_node_id_value); next_node_id_value += 1; let left_node = combined_graph.create_handle(left_seq, left_id); @@ -358,13 +360,15 @@ fn main() { left_node }); new_steps.push(left_handle.unwrap()); - new_step_positions.push((step_start, overlap_start)); + let left_start = step_start; + let left_end = left_start + left_seq.len(); + new_step_positions.push((left_start, left_end)); new_step_lengths.push(left_seq.len()); } - // Only create right node if sequence is not empty + // Only create right node if sequence is not empty and within bounds let mut right_handle = None; - if !right_seq.is_empty() { + if !right_seq.is_empty() && overlap_end_offset < node_seq.len() { let right_id = NodeId::from(next_node_id_value); next_node_id_value += 1; let right_node = combined_graph.create_handle(right_seq, right_id); @@ -374,7 +378,9 @@ fn main() { right_node }); new_steps.push(right_handle.unwrap()); - new_step_positions.push((overlap_end, step_end)); + let right_start = overlap_end; + let right_end = right_start + right_seq.len(); + new_step_positions.push((right_start, right_end)); new_step_lengths.push(right_seq.len()); } From 0a120485c5e652cdf6a14bfeaca0b083bc0c8492 Mon Sep 17 00:00:00 2001 From: "AndreaGuarracino (aider)" Date: Thu, 12 Dec 2024 15:51:36 -0600 Subject: [PATCH 13/45] fix: Correct overlap region calculation to prevent extra bases in alignment --- src/main.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/main.rs b/src/main.rs index 8c81447..95f6f6e 100644 --- a/src/main.rs +++ b/src/main.rs @@ -278,9 +278,9 @@ fn main() { let r2 = &mut right[0]; if has_overlap(r1, r2) { - // Calculate the overlap region - let overlap_start = r2.start; - let overlap_end = r1.end; + // Calculate the overlap region - use max/min to get precise overlap bounds + let overlap_start = std::cmp::max(r1.start, r2.start); + let overlap_end = std::cmp::min(r1.end, r2.end); if args.debug { let overlap_amount = overlap_end - overlap_start; From fa68310aeb34da583786e855a1b10b53e7f51519 Mon Sep 17 00:00:00 2001 From: "AndreaGuarracino (aider)" Date: Thu, 12 Dec 2024 16:46:24 -0600 Subject: [PATCH 14/45] fix: Update r2.start and r2.end after trimming overlapping steps --- src/main.rs | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/main.rs b/src/main.rs index 95f6f6e..a845b8d 100644 --- a/src/main.rs +++ b/src/main.rs @@ -446,7 +446,15 @@ fn main() { } } - r2.start = overlap_end; + // Update r2.start and r2.end based on the new step positions + if !r2.step_positions.is_empty() { + r2.start = r2.step_positions.first().unwrap().0; + r2.end = r2.step_positions.last().unwrap().1; + } else { + // If no steps remain, set start and end to overlap_end to effectively remove this range + r2.start = overlap_end; + r2.end = overlap_end; + } } } From 500108a943db5a1944d15b383958884e5946a68b Mon Sep 17 00:00:00 2001 From: AndreaGuarracino Date: Fri, 13 Dec 2024 14:02:25 -0600 Subject: [PATCH 15/45] wip --- src/main.rs | 120 +++++++++++++++++++++++++++++++++------------------- 1 file changed, 76 insertions(+), 44 deletions(-) diff --git a/src/main.rs b/src/main.rs index a845b8d..67b633e 100644 --- a/src/main.rs +++ b/src/main.rs @@ -198,8 +198,9 @@ fn main() { cumulative_pos = cumulative_pos + node_length; } } - - path_key_ranges.entry(sample_hap_name) + + if !translated_steps.is_empty() { + path_key_ranges.entry(sample_hap_name) .or_default() .push(RangeInfo { start, @@ -209,15 +210,17 @@ fn main() { step_positions, step_lengths, }); + } else if args.debug { + eprintln!(" Warning: Path '{}' has no steps", path_name); + } } } } } if args.debug { - eprintln!("Total nodes in the combined graph: {}", combined_graph.node_count()); - eprintln!("Total edges in the combined graph: {}", combined_graph.edge_count()); - eprintln!("Total path ranges collected: {}", path_key_ranges.len()); + eprintln!("Collected {} nodes, {} edges, {} paths, and {} path ranges from all GFA files", + combined_graph.node_count(), combined_graph.edge_count(), path_key_ranges.len(), path_key_ranges.iter().map(|(_, ranges)| ranges.len()).sum::()); } let mut next_node_id_value = u64::from(combined_graph.max_node_id()) + 1; @@ -230,45 +233,60 @@ fn main() { if args.debug { eprintln!("Processing path key '{}'", path_key); for range in ranges.iter() { - eprintln!(" Range: start={}, end={}, gfa_id={}", range.start, range.end, range.gfa_id); + eprintln!(" Range: start={}, end={}, num.steps={}, gfa_id={}", range.start, range.end, range.steps.len(), range.gfa_id); } + + eprintln!("Removing redundant ranges"); } // Remove ranges that are contained within other ranges - if !ranges.is_empty() { - let mut filtered_ranges = Vec::new(); - let mut prev_range = ranges[0].clone(); - - for range in ranges.iter().skip(1) { - if range.start >= prev_range.start && range.end <= prev_range.end { - // Current range is fully contained within prev_range, skip it - if args.debug { - eprintln!( - "Redundant range detected: Range [start={}, end={}] is contained within [start={}, end={}] and will be removed.", - range.start, range.end, prev_range.start, prev_range.end - ); - } - continue; - } else if prev_range.start >= range.start && prev_range.end <= range.end { - // Prev_range is fully contained within current range, replace prev_range with current range - if args.debug { - eprintln!( - "Redundant range detected: Previous range [start={}, end={}] is contained within [start={}, end={}] and will be replaced.", - prev_range.start, prev_range.end, range.start, range.end - ); - } - prev_range = range.clone(); - } else { - // No containment, add the previous range and update prev_range - filtered_ranges.push(prev_range.clone()); - prev_range = range.clone(); + let mut write_idx = 0; + for read_idx in 1..ranges.len() { + let (prev_start, prev_end) = (ranges[write_idx].start, ranges[write_idx].end); + let (curr_start, curr_end) = (ranges[read_idx].start, ranges[read_idx].end); + + if curr_start == prev_start && curr_end == prev_end { + // Current range is a duplicate of the previous range, skip it + if args.debug { + eprintln!( + " Duplicate range detected: Range [start={}, end={}] is identical to previous range and will be removed.", + curr_start, curr_end + ); + } + continue; + } else if curr_start >= prev_start && curr_end <= prev_end { + // Current range is contained within previous range, skip it + continue; + } else if prev_start >= curr_start && prev_end <= curr_end { + // Previous range is contained within current range + if args.debug { + eprintln!( + " Redundant range detected: Range [start={}, end={}, len={}] is contained within [start={}, end={} | len={}] and will be removed.", + prev_start, prev_end, prev_end - prev_start, curr_start, curr_end, curr_end - curr_start + ); + } + ranges.swap(write_idx, read_idx); + } else { + // No containment, advance write_idx and copy current range + write_idx += 1; + if write_idx != read_idx { + ranges.swap(write_idx, read_idx); } } + } - // Don't forget to add the last range - filtered_ranges.push(prev_range); + // Truncate the ranges vector to remove any skipped elements + ranges.truncate(write_idx + 1); - *ranges = filtered_ranges; + if args.debug { + eprintln!("Path key '{}' without redundancy", path_key); + for range in ranges.iter() { + eprintln!(" Range: start={}, end={}, num.steps={}, gfa_id={}", range.start, range.end, range.steps.len(), range.gfa_id); + } + } + + if args.debug { + eprintln!("Trimming overlapping ranges"); } // Process overlaps @@ -286,7 +304,7 @@ fn main() { let overlap_amount = overlap_end - overlap_start; eprintln!( - "Overlap detected: Range1 [start={}, end={}], Range2 [start={}, end={}], Overlap [start={}, end={}], Overlap size={}", + " Overlap detected: Range1 [start={}, end={}], Range2 [start={}, end={}], Overlap [start={}, end={}], Overlap size={}", r1.start, r1.end, r2.start, r2.end, overlap_start, overlap_end, overlap_amount ); } @@ -455,25 +473,38 @@ fn main() { r2.start = overlap_end; r2.end = overlap_end; } + + + if args.debug { + eprintln!(" Updated overlaps: Range2 [start={}, end={}]", r2.start, r2.end); + } + } + } + + if args.debug { + eprintln!("Path key '{}' without overlaps", path_key); + for range in ranges.iter() { + eprintln!(" Range: start={}, end={}, num.steps={}, gfa_id={}", range.start, range.end, range.steps.len(), range.gfa_id); } } // Check for overlaps and contiguity - let mut has_overlaps = false; let mut all_contiguous = true; for window in ranges.windows(2) { if has_overlap(&window[0], &window[1]) { - has_overlaps = true; + if args.debug { + eprintln!("Unresolved overlaps detected between ranges: [start={}, end={}] and [start={}, end={}]", + window[0].start, window[0].end, window[1].start, window[1].end); + } + panic!("Unresolved overlaps detected in path key '{}'", path_key); } if !is_contiguous(&window[0], &window[1]) { all_contiguous = false; } } - - if (has_overlaps || !all_contiguous) && args.debug { - eprintln!(" Path key '{}' ranges analysis:", path_key); - + + if !all_contiguous && args.debug { let mut current_start = ranges[0].start; let mut current_end = ranges[0].end; let mut current_gfa_ids = vec![ranges[0].gfa_id]; @@ -510,7 +541,7 @@ fn main() { current_start, current_end, current_gfa_ids); } - if all_contiguous && !has_overlaps { + if all_contiguous { // Create a single path with the original key let path_id = combined_graph.create_path(path_key.as_bytes(), false).unwrap(); let mut prev_step = None; @@ -546,6 +577,7 @@ fn main() { // Create path name with range information let path_name = format!("{}:{}-{}", path_key, start_range.start, end_range.end); + println!("Creating path: {} with {} steps", path_name, steps.len()); let path_id = combined_graph.create_path(path_name.as_bytes(), false).unwrap(); // Add steps to the path From 6ed8e4b5f6f1ad0758e114177aa2bf709412f899 Mon Sep 17 00:00:00 2001 From: AndreaGuarracino Date: Fri, 13 Dec 2024 14:10:39 -0600 Subject: [PATCH 16/45] chore: Add a blank line before the main function for better readability --- src/main.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/main.rs b/src/main.rs index 67b633e..8749343 100644 --- a/src/main.rs +++ b/src/main.rs @@ -132,6 +132,7 @@ fn write_graph_to_gfa(graph: &HashGraph, output_path: &str) -> std::io::Result<( Ok(()) } + fn main() { let args = Args::parse(); From b361a047e44ff748c3bf3d6063d3ecdccb2b2331 Mon Sep 17 00:00:00 2001 From: AndreaGuarracino Date: Fri, 13 Dec 2024 14:34:06 -0600 Subject: [PATCH 17/45] style: Update debug log messages for consistency and clarity --- src/main.rs | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/src/main.rs b/src/main.rs index 8749343..a83f937 100644 --- a/src/main.rs +++ b/src/main.rs @@ -212,7 +212,7 @@ fn main() { step_lengths, }); } else if args.debug { - eprintln!(" Warning: Path '{}' has no steps", path_name); + eprintln!(" Warning: path '{}' has no steps", path_name); } } } @@ -275,7 +275,6 @@ fn main() { } } } - // Truncate the ranges vector to remove any skipped elements ranges.truncate(write_idx + 1); @@ -315,20 +314,33 @@ fn main() { let mut steps_to_split = Vec::new(); for (idx, &(step_start, step_end)) in r2.step_positions.iter().enumerate() { if step_end <= overlap_start { - //println!("\tStep {} before overlap", idx); + if args.debug && r2.start == 154089 { + eprintln!(" Step {} [start={}, end={}, len={}] before overlap", idx, step_start, step_end, step_end - step_start); + } continue; } else if step_start >= overlap_end { - //println!("\tStep {} after overlap", idx); + if args.debug && r2.start == 154089 { + eprintln!(" Step {} [start={}, end={}, len={}] after overlap", idx, step_start, step_end, step_end - step_start); + } break; } else if step_start >= overlap_start && step_end <= overlap_end { - //println!("\tStep {} within overlap", idx); + if args.debug && r2.start == 154089 { + eprintln!(" Step {} [start={}, end={}, len={}] fully overlaps", idx, step_start, step_end, step_end - step_start); + } steps_to_remove.push(idx); } else { - //println!("\tStep {} partially overlaps", idx); + if args.debug && r2.start == 154089 { + eprintln!(" Step {} [start={}, end={}, len={}] partially overlaps", idx, step_start, step_end, step_end - step_start); + } steps_to_split.push(idx); } } + if args.debug && r2.start == 154089 { + eprintln!(" {} steps to remove", steps_to_remove.len()); + eprintln!(" Steps to split: {:?}", steps_to_split); + } + // Initialize new vectors to store updated steps let mut new_steps = Vec::new(); let mut new_step_positions = Vec::new(); From 54ba991e4122d3c20fce86c140ec5f8d822086c2 Mon Sep 17 00:00:00 2001 From: "AndreaGuarracino (aider)" Date: Fri, 13 Dec 2024 14:34:08 -0600 Subject: [PATCH 18/45] feat: Update to handle only one partially overlapping step at a time --- src/main.rs | 60 ++++++++--------------------------------------------- 1 file changed, 9 insertions(+), 51 deletions(-) diff --git a/src/main.rs b/src/main.rs index a83f937..17c0a0e 100644 --- a/src/main.rs +++ b/src/main.rs @@ -311,7 +311,7 @@ fn main() { // Adjust r2 to remove the overlap let mut steps_to_remove = Vec::new(); - let mut steps_to_split = Vec::new(); + let mut step_to_split: Option = None; for (idx, &(step_start, step_end)) in r2.step_positions.iter().enumerate() { if step_end <= overlap_start { if args.debug && r2.start == 154089 { @@ -332,13 +332,16 @@ fn main() { if args.debug && r2.start == 154089 { eprintln!(" Step {} [start={}, end={}, len={}] partially overlaps", idx, step_start, step_end, step_end - step_start); } - steps_to_split.push(idx); + if step_to_split.is_some() { + panic!("Error: More than one step is partially overlapping, which is not allowed."); + } + step_to_split = Some(idx); } } if args.debug && r2.start == 154089 { eprintln!(" {} steps to remove", steps_to_remove.len()); - eprintln!(" Steps to split: {:?}", steps_to_split); + eprintln!(" Step to split: {:?}", step_to_split); } // Initialize new vectors to store updated steps @@ -354,8 +357,8 @@ fn main() { if steps_to_remove.contains(&idx) { // Skip steps to remove continue; - } else if steps_to_split.contains(&idx) { - // Split nodes for steps that partially overlap + } else if step_to_split == Some(idx) { + // Split node for the single partially overlapping step let node_seq = combined_graph.sequence(step_handle).collect::>(); let overlap_within_step_start = std::cmp::max(step_start, overlap_start); let overlap_within_step_end = std::cmp::min(step_end, overlap_end); @@ -374,52 +377,7 @@ fn main() { (start_offset, end_offset) }; - if step_start < overlap_start && step_end > overlap_end { - // Split into three parts - let left_seq = &node_seq[0..overlap_start_offset]; - let right_seq = &node_seq[overlap_end_offset..]; - - // Only create left node if sequence is not empty and within bounds - let mut left_handle = None; - if !left_seq.is_empty() && overlap_start_offset > 0 { - let left_id = NodeId::from(next_node_id_value); - next_node_id_value += 1; - let left_node = combined_graph.create_handle(left_seq, left_id); - left_handle = Some(if step_handle.is_reverse() { - left_node.flip() - } else { - left_node - }); - new_steps.push(left_handle.unwrap()); - let left_start = step_start; - let left_end = left_start + left_seq.len(); - new_step_positions.push((left_start, left_end)); - new_step_lengths.push(left_seq.len()); - } - - // Only create right node if sequence is not empty and within bounds - let mut right_handle = None; - if !right_seq.is_empty() && overlap_end_offset < node_seq.len() { - let right_id = NodeId::from(next_node_id_value); - next_node_id_value += 1; - let right_node = combined_graph.create_handle(right_seq, right_id); - right_handle = Some(if step_handle.is_reverse() { - right_node.flip() - } else { - right_node - }); - new_steps.push(right_handle.unwrap()); - let right_start = overlap_end; - let right_end = right_start + right_seq.len(); - new_step_positions.push((right_start, right_end)); - new_step_lengths.push(right_seq.len()); - } - - // Create edge only if both nodes exist - if let (Some(left), Some(right)) = (left_handle, right_handle) { - combined_graph.create_edge(Edge(left, right)); - } - } else if step_start < overlap_start { + if step_start < overlap_start { // Keep left part let new_seq = &node_seq[0..overlap_start_offset]; if !new_seq.is_empty() { From 8ebcec2d51952e346b5a0251a555cf4b8ded80c3 Mon Sep 17 00:00:00 2001 From: AndreaGuarracino Date: Fri, 13 Dec 2024 16:42:33 -0600 Subject: [PATCH 19/45] fix orientation and offsets --- src/main.rs | 106 +++++++++++++++++++++++++--------------------------- 1 file changed, 51 insertions(+), 55 deletions(-) diff --git a/src/main.rs b/src/main.rs index 17c0a0e..4166b40 100644 --- a/src/main.rs +++ b/src/main.rs @@ -212,7 +212,7 @@ fn main() { step_lengths, }); } else if args.debug { - eprintln!(" Warning: path '{}' has no steps", path_name); + eprintln!(" Warning: Path '{}' has no steps", path_name); } } } @@ -314,24 +314,24 @@ fn main() { let mut step_to_split: Option = None; for (idx, &(step_start, step_end)) in r2.step_positions.iter().enumerate() { if step_end <= overlap_start { - if args.debug && r2.start == 154089 { - eprintln!(" Step {} [start={}, end={}, len={}] before overlap", idx, step_start, step_end, step_end - step_start); - } + // if args.debug { + // eprintln!(" Step {} [start={}, end={}, len={}] before overlap", idx, step_start, step_end, step_end - step_start); + // } continue; } else if step_start >= overlap_end { - if args.debug && r2.start == 154089 { - eprintln!(" Step {} [start={}, end={}, len={}] after overlap", idx, step_start, step_end, step_end - step_start); - } + // if args.debug { + // eprintln!(" Step {} [start={}, end={}, len={}] after overlap", idx, step_start, step_end, step_end - step_start); + // } break; } else if step_start >= overlap_start && step_end <= overlap_end { - if args.debug && r2.start == 154089 { - eprintln!(" Step {} [start={}, end={}, len={}] fully overlaps", idx, step_start, step_end, step_end - step_start); - } + // if args.debug { + // eprintln!(" Step {} [start={}, end={}, len={}] fully overlaps", idx, step_start, step_end, step_end - step_start); + // } steps_to_remove.push(idx); } else { - if args.debug && r2.start == 154089 { - eprintln!(" Step {} [start={}, end={}, len={}] partially overlaps", idx, step_start, step_end, step_end - step_start); - } + // if args.debug { + // eprintln!(" Step {} [start={}, end={}, len={}] partially overlaps", idx, step_start, step_end, step_end - step_start); + // } if step_to_split.is_some() { panic!("Error: More than one step is partially overlapping, which is not allowed."); } @@ -340,7 +340,7 @@ fn main() { } if args.debug && r2.start == 154089 { - eprintln!(" {} steps to remove", steps_to_remove.len()); + eprintln!(" Total steps to remove: {}", steps_to_remove.len()); eprintln!(" Step to split: {:?}", step_to_split); } @@ -365,52 +365,48 @@ fn main() { // Calculate offsets relative to the node sequence let node_len = node_seq.len(); - let (overlap_start_offset, overlap_end_offset) = if !step_handle.is_reverse() { - // Forward handle - offsets from start of sequence - let start_offset = (overlap_within_step_start - step_start).min(node_len); - let end_offset = (overlap_within_step_end - step_start).min(node_len); - (start_offset, end_offset) - } else { - // Reverse handle - offsets from end of sequence - let start_offset = (step_end - overlap_within_step_end).min(node_len); - let end_offset = (step_end - overlap_within_step_start).min(node_len); - (start_offset, end_offset) - }; + // Calculate offsets consistently regardless of strand + let overlap_start_offset = (overlap_within_step_start - step_start).min(node_len); + let overlap_end_offset = (overlap_within_step_end - step_start).min(node_len); + + if args.debug { + eprintln!(" Splitting step {} [start={}, end={}, len={}] to remove overlap at [start={}, end={}]", + idx, step_start, step_end, step_end - step_start, overlap_within_step_start, overlap_within_step_end); + eprintln!(" Overlap offsets: start={}, end={}", overlap_start_offset, overlap_end_offset); + } if step_start < overlap_start { - // Keep left part - let new_seq = &node_seq[0..overlap_start_offset]; - if !new_seq.is_empty() { - let node_id = NodeId::from(next_node_id_value); - next_node_id_value += 1; - let new_node = combined_graph.create_handle(new_seq, node_id); - let new_handle = if step_handle.is_reverse() { - new_node.flip() - } else { - new_node - }; - - new_steps.push(new_handle); - new_step_positions.push((step_start, overlap_start)); - new_step_lengths.push(new_seq.len()); + if args.debug { + eprintln!(" Adding left part of step [start={}, end={}]", step_start, overlap_within_step_start); } + assert!(overlap_start_offset > 0); + + // Keep left part + let new_seq = node_seq[0..overlap_start_offset].to_vec(); + + let node_id = NodeId::from(next_node_id_value); + next_node_id_value += 1; + let new_node = combined_graph.create_handle(&new_seq, node_id); + + new_steps.push(new_node); + new_step_positions.push((step_start, overlap_start)); + new_step_lengths.push(new_seq.len()); } else if step_end > overlap_end { - // Keep right part - let new_seq = &node_seq[overlap_end_offset..]; - if !new_seq.is_empty() { - let node_id = NodeId::from(next_node_id_value); - next_node_id_value += 1; - let new_node = combined_graph.create_handle(new_seq, node_id); - let new_handle = if step_handle.is_reverse() { - new_node.flip() - } else { - new_node - }; - - new_steps.push(new_handle); - new_step_positions.push((overlap_end, step_end)); - new_step_lengths.push(new_seq.len()); + if args.debug { + eprintln!(" Adding right part of step [start={}, end={}]", overlap_within_step_end, step_end); } + assert!(overlap_end_offset < node_len); + + // Keep right part + let new_seq = node_seq[overlap_end_offset..].to_vec(); + + let node_id = NodeId::from(next_node_id_value); + next_node_id_value += 1; + let new_node = combined_graph.create_handle(&new_seq, node_id); + + new_steps.push(new_node); + new_step_positions.push((overlap_end, step_end)); + new_step_lengths.push(new_seq.len()); } } else { // Keep steps that are not to be removed or split From e9cc5054ef6e93dae5c977e602b9deb8d26490c1 Mon Sep 17 00:00:00 2001 From: AndreaGuarracino Date: Fri, 13 Dec 2024 16:43:56 -0600 Subject: [PATCH 20/45] update version --- Cargo.lock | 6 +++--- Cargo.toml | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 0880f37..da73213 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -183,9 +183,9 @@ dependencies = [ [[package]] name = "cc" -version = "1.2.3" +version = "1.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "27f657647bcff5394bf56c7317665bbf790a137a50eaaa5c6bfbb9e27a518f2d" +checksum = "9157bbaa6b165880c27a4293a474c91cdcf265cc68cc829bf10be0964a391caf" dependencies = [ "jobserver", "libc", @@ -327,7 +327,7 @@ dependencies = [ [[package]] name = "gfalace" -version = "0.1.0" +version = "0.1.1" dependencies = [ "bstr 1.11.1", "clap", diff --git a/Cargo.toml b/Cargo.toml index b410919..896718d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "gfalace" -version = "0.1.0" +version = "0.1.1" edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html From 578bdc4eb5c6ef29127970be6b418ecb5f71c423 Mon Sep 17 00:00:00 2001 From: AndreaGuarracino Date: Fri, 13 Dec 2024 19:52:16 -0600 Subject: [PATCH 21/45] better containment management --- src/main.rs | 217 +++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 208 insertions(+), 9 deletions(-) diff --git a/src/main.rs b/src/main.rs index 4166b40..11fedf3 100644 --- a/src/main.rs +++ b/src/main.rs @@ -247,6 +247,7 @@ fn main() { let (curr_start, curr_end) = (ranges[read_idx].start, ranges[read_idx].end); if curr_start == prev_start && curr_end == prev_end { + // Skip duplicate range // Current range is a duplicate of the previous range, skip it if args.debug { eprintln!( @@ -256,17 +257,38 @@ fn main() { } continue; } else if curr_start >= prev_start && curr_end <= prev_end { - // Current range is contained within previous range, skip it - continue; - } else if prev_start >= curr_start && prev_end <= curr_end { - // Previous range is contained within current range + // Skip range that is fully contained within previous range if args.debug { eprintln!( - " Redundant range detected: Range [start={}, end={}, len={}] is contained within [start={}, end={} | len={}] and will be removed.", - prev_start, prev_end, prev_end - prev_start, curr_start, curr_end, curr_end - curr_start + " Contained range detected: Range [start={}, end={}] is fully contained within previous range [start={}, end={}] and will be removed.", + curr_start, curr_end, prev_start, prev_end ); } + continue; + } else if prev_start >= curr_start && prev_end <= curr_end { + // Previous range is fully contained within current range ranges.swap(write_idx, read_idx); + } else if curr_start < prev_end { + // Handle overlapping ranges + if read_idx < ranges.len() - 1 { + let next_start = ranges[read_idx + 1].start; + let next_end = ranges[read_idx + 1].end; + + // Only skip current range if it's substantially overlapped by both prev and next ranges + if curr_start > prev_start && next_start < curr_end && next_end > curr_end { + // Check if the current range doesn't extend far enough beyond previous range + let extension_beyond_prev = curr_end - prev_end; + let overlap_with_prev = prev_end - curr_start; + if extension_beyond_prev < overlap_with_prev { + continue; + } + } + } + // Keep current range + write_idx += 1; + if write_idx != read_idx { + ranges.swap(write_idx, read_idx); + } } else { // No containment, advance write_idx and copy current range write_idx += 1; @@ -314,17 +336,17 @@ fn main() { let mut step_to_split: Option = None; for (idx, &(step_start, step_end)) in r2.step_positions.iter().enumerate() { if step_end <= overlap_start { - // if args.debug { + // if args.debug && r2.start == 11000 { // eprintln!(" Step {} [start={}, end={}, len={}] before overlap", idx, step_start, step_end, step_end - step_start); // } continue; } else if step_start >= overlap_end { - // if args.debug { + // if args.debug && r2.start == 11000 { // eprintln!(" Step {} [start={}, end={}, len={}] after overlap", idx, step_start, step_end, step_end - step_start); // } break; } else if step_start >= overlap_start && step_end <= overlap_end { - // if args.debug { + // if args.debug && r2.start == 11000 { // eprintln!(" Step {} [start={}, end={}, len={}] fully overlaps", idx, step_start, step_end, step_end - step_start); // } steps_to_remove.push(idx); @@ -592,3 +614,180 @@ fn split_path_name(path_name: &str) -> Option<(String, usize, usize)> { } None } + +#[cfg(test)] +mod tests { + use super::*; + + // Helper function to create a simple RangeInfo for testing + fn create_range_info(start: usize, end: usize, gfa_id: usize) -> RangeInfo { + RangeInfo { + start, + end, + gfa_id, + steps: vec![], // Empty steps for testing + step_positions: vec![], // Empty positions for testing + step_lengths: vec![], // Empty lengths for testing + } + } + + #[test] + fn test_range_containment_removal() { + // Test cases + let test_cases = vec![ + // Test case 1: Basic containment + ( + vec![(10, 50, 0), (20, 30, 1)], // Input ranges (start, end, gfa_id) + vec![(10, 50, 0)], // Expected result + "Basic containment" + ), + + // Test case 2: No containment + ( + vec![(10, 20, 0), (30, 40, 1)], + vec![(10, 20, 0), (30, 40, 1)], + "No containment" + ), + + // Test case 3: Multiple contained ranges + ( + vec![(10, 100, 0), (20, 30, 1), (40, 50, 2), (60, 70, 3)], + vec![(10, 100, 0)], + "Multiple contained ranges" + ), + + // Test case 4: Identical ranges + ( + vec![(10, 20, 0), (10, 20, 1)], + vec![(10, 20, 0)], + "Identical ranges" + ), + + // Test case 5: Nested containment + ( + vec![(10, 100, 0), (20, 80, 1), (30, 40, 2)], + vec![(10, 100, 0)], + "Nested containment" + ), + + // Test case 6: Partial overlap (should keep both) + ( + vec![(10, 30, 0), (20, 40, 1)], + vec![(10, 30, 0), (20, 40, 1)], + "Partial overlap" + ), + + // Test case 7: Edge cases - touching ranges + ( + vec![(10, 20, 0), (20, 30, 1)], + vec![(10, 20, 0), (20, 30, 1)], + "Touching ranges" + ), + + // Test case 8: Overlapping ranges from same GFA + ( + vec![(0, 11742, 0), (9714, 13000, 1), (11000, 19000, 1)], + vec![(0, 11742, 0), (11000, 19000, 1)], + "Overlapping ranges from same GFA" + ), + + // Test case 9: Overlapping ranges with different GFA IDs + ( + vec![(0, 11742, 0), (9714, 13000, 1), (11000, 19000, 2)], + vec![(0, 11742, 0), (11000, 19000, 2)], + "Overlapping ranges" + ), + + // Test case 10: Overlapping ranges with different GFA IDs 2 + ( + vec![(0, 10, 0), (8, 20, 1), (15, 30, 2)], + vec![(0, 10, 0), (8, 20, 1), (15, 30, 2)], + "Overlapping ranges" + ), + ]; + + // Run each test case + for (case_index, (input_ranges, expected_ranges, case_name)) in test_cases.iter().enumerate() { + println!("Running test case {}: {}", case_index + 1, case_name); + + // Create input ranges + let mut ranges: Vec = input_ranges + .iter() + .map(|(start, end, gfa_id)| create_range_info(*start, *end, *gfa_id)) + .collect(); +// Sort ranges by start position +ranges.sort_by_key(|r| (r.start, r.end)); + +let mut write_idx = 0; +for read_idx in 1..ranges.len() { + let (prev_start, prev_end) = (ranges[write_idx].start, ranges[write_idx].end); + let (curr_start, curr_end) = (ranges[read_idx].start, ranges[read_idx].end); + + if curr_start == prev_start && curr_end == prev_end { + // Skip duplicate range + continue; + } else if curr_start >= prev_start && curr_end <= prev_end { + // Skip range that is fully contained within previous range + continue; + } else if prev_start >= curr_start && prev_end <= curr_end { + // Previous range is fully contained within current range + ranges.swap(write_idx, read_idx); + } else if curr_start < prev_end { + // Handle overlapping ranges + if read_idx < ranges.len() - 1 { + let next_start = ranges[read_idx + 1].start; + let next_end = ranges[read_idx + 1].end; + + // Only skip current range if it's substantially overlapped by both prev and next ranges + if curr_start > prev_start && next_start < curr_end && next_end > curr_end { + // Check if the current range doesn't extend far enough beyond previous range + let extension_beyond_prev = curr_end - prev_end; + let overlap_with_prev = prev_end - curr_start; + if extension_beyond_prev < overlap_with_prev { + continue; + } + } + } + // Keep current range + write_idx += 1; + if write_idx != read_idx { + ranges.swap(write_idx, read_idx); + } + } else { + // No overlap - keep both ranges + write_idx += 1; + if write_idx != read_idx { + ranges.swap(write_idx, read_idx); + } + } +} +ranges.truncate(write_idx + 1); + + // Create expected ranges + let expected: Vec = expected_ranges + .iter() + .map(|(start, end, gfa_id)| create_range_info(*start, *end, *gfa_id)) + .collect(); + + // Compare results + assert_eq!( + ranges.len(), + expected.len(), + "Test case '{}': Wrong number of ranges after containment removal", + case_name + ); + + for (i, (result, expected)) in ranges.iter().zip(expected.iter()).enumerate() { + assert_eq!( + (result.start, result.end, result.gfa_id), + (expected.start, expected.end, expected.gfa_id), + "Test case '{}': Mismatch at position {}", + case_name, + i + ); + } + + println!("Test case {} passed: {}", case_index + 1, case_name); + } + } +} From 6b24c9b5576222f97c20e4be75a7fbf9e8788baf Mon Sep 17 00:00:00 2001 From: AndreaGuarracino Date: Fri, 13 Dec 2024 19:55:26 -0600 Subject: [PATCH 22/45] more debugging log --- src/main.rs | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/main.rs b/src/main.rs index 11fedf3..ed187bd 100644 --- a/src/main.rs +++ b/src/main.rs @@ -267,6 +267,12 @@ fn main() { continue; } else if prev_start >= curr_start && prev_end <= curr_end { // Previous range is fully contained within current range + if args.debug { + eprintln!( + " Containing range detected: Previous range [start={}, end={}] is fully contained within current range [start={}, end={}] and will be removed.", + prev_start, prev_end, curr_start, curr_end + ); + } ranges.swap(write_idx, read_idx); } else if curr_start < prev_end { // Handle overlapping ranges @@ -280,6 +286,12 @@ fn main() { let extension_beyond_prev = curr_end - prev_end; let overlap_with_prev = prev_end - curr_start; if extension_beyond_prev < overlap_with_prev { + if args.debug { + eprintln!( + " Overlapping range detected: Range [start={}, end={}] is overlapped by both previous range [start={}, end={}] and next range [start={}, end={}] and will be removed.", + curr_start, curr_end, prev_start, prev_end, next_start, next_end + ); + } continue; } } From 7b15330cf6ffc189c795a0ef7ffec19788dda48c Mon Sep 17 00:00:00 2001 From: AndreaGuarracino Date: Fri, 13 Dec 2024 20:50:53 -0600 Subject: [PATCH 23/45] manage complex cases --- src/main.rs | 128 ++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 99 insertions(+), 29 deletions(-) diff --git a/src/main.rs b/src/main.rs index ed187bd..0554402 100644 --- a/src/main.rs +++ b/src/main.rs @@ -245,7 +245,7 @@ fn main() { for read_idx in 1..ranges.len() { let (prev_start, prev_end) = (ranges[write_idx].start, ranges[write_idx].end); let (curr_start, curr_end) = (ranges[read_idx].start, ranges[read_idx].end); - + if curr_start == prev_start && curr_end == prev_end { // Skip duplicate range // Current range is a duplicate of the previous range, skip it @@ -274,24 +274,84 @@ fn main() { ); } ranges.swap(write_idx, read_idx); + } else if curr_start < prev_end { + // Handle overlapping ranges - check both previous and next ranges + let mut should_skip = false; + + if read_idx < ranges.len() - 1 { + let next_start = ranges[read_idx + 1].start; + + // Check if current range is significantly overlapped by both neighbors + if curr_start > prev_start && next_start < curr_end { + let overlap_with_prev = prev_end - curr_start; + let overlap_with_next = curr_end - next_start; + let range_length = curr_end - curr_start; + + // Skip if the range is mostly covered by its neighbors + if overlap_with_prev + overlap_with_next > range_length { + should_skip = true; + } + } + } + + if !should_skip { + if args.debug { + eprintln!( + " Overlapping range detected: Range [start={}, end={}] overlaps with previous range [start={}, end={}] and will be kept.", + curr_start, curr_end, prev_start, prev_end + ); + } + write_idx += 1; + if write_idx != read_idx { + ranges.swap(write_idx, read_idx); + } + } + } else { + // No overlap - keep both ranges + write_idx += 1; + if write_idx != read_idx { + ranges.swap(write_idx, read_idx); + } + } + } + ranges.truncate(write_idx + 1); + + + if args.debug { + eprintln!("Path key '{}' without redundancy", path_key); + for range in ranges.iter() { + eprintln!(" Range: start={}, end={}, num.steps={}, gfa_id={}", range.start, range.end, range.steps.len(), range.gfa_id); + } + } + + let mut write_idx = 0; + for read_idx in 1..ranges.len() { + let (prev_start, prev_end) = (ranges[write_idx].start, ranges[write_idx].end); + let (curr_start, curr_end) = (ranges[read_idx].start, ranges[read_idx].end); + + if curr_start == prev_start && curr_end == prev_end { + // Skip duplicate range + continue; + } else if curr_start >= prev_start && curr_end <= prev_end { + // Skip range that is fully contained within previous range + continue; + } else if prev_start >= curr_start && prev_end <= curr_end { + // Previous range is fully contained within current range + ranges.swap(write_idx, read_idx); } else if curr_start < prev_end { // Handle overlapping ranges if read_idx < ranges.len() - 1 { let next_start = ranges[read_idx + 1].start; - let next_end = ranges[read_idx + 1].end; - // Only skip current range if it's substantially overlapped by both prev and next ranges - if curr_start > prev_start && next_start < curr_end && next_end > curr_end { - // Check if the current range doesn't extend far enough beyond previous range - let extension_beyond_prev = curr_end - prev_end; + // Check if current range is significantly overlapped by both previous and next ranges + if curr_start > prev_start && next_start < curr_end { + // Calculate overlap percentages let overlap_with_prev = prev_end - curr_start; - if extension_beyond_prev < overlap_with_prev { - if args.debug { - eprintln!( - " Overlapping range detected: Range [start={}, end={}] is overlapped by both previous range [start={}, end={}] and next range [start={}, end={}] and will be removed.", - curr_start, curr_end, prev_start, prev_end, next_start, next_end - ); - } + let overlap_with_next = curr_end - next_start; + let range_length = curr_end - curr_start; + + // If the range is mostly overlapped by both neighbors, skip it + if overlap_with_prev + overlap_with_next > range_length { continue; } } @@ -302,18 +362,16 @@ fn main() { ranges.swap(write_idx, read_idx); } } else { - // No containment, advance write_idx and copy current range + // No overlap - keep both ranges write_idx += 1; if write_idx != read_idx { ranges.swap(write_idx, read_idx); } } } - // Truncate the ranges vector to remove any skipped elements ranges.truncate(write_idx + 1); - if args.debug { - eprintln!("Path key '{}' without redundancy", path_key); + eprintln!("Path key '{}' without redundancy 2", path_key); for range in ranges.iter() { eprintln!(" Range: start={}, end={}, num.steps={}, gfa_id={}", range.start, range.end, range.steps.len(), range.gfa_id); } @@ -716,6 +774,13 @@ mod tests { vec![(0, 10, 0), (8, 20, 1), (15, 30, 2)], "Overlapping ranges" ), + + // Test case 11: Overlapping ranges with different GFA IDs 3 + ( + vec![(8000, 11000, 0), (9694, 12313, 1), (10908, 13908, 2)], + vec![(8000, 11000, 0), (10908, 13908, 2)], + "Overlapping ranges" + ), ]; // Run each test case @@ -745,25 +810,30 @@ for read_idx in 1..ranges.len() { // Previous range is fully contained within current range ranges.swap(write_idx, read_idx); } else if curr_start < prev_end { - // Handle overlapping ranges + // Handle overlapping ranges - check both previous and next ranges + let mut should_skip = false; + if read_idx < ranges.len() - 1 { let next_start = ranges[read_idx + 1].start; - let next_end = ranges[read_idx + 1].end; - // Only skip current range if it's substantially overlapped by both prev and next ranges - if curr_start > prev_start && next_start < curr_end && next_end > curr_end { - // Check if the current range doesn't extend far enough beyond previous range - let extension_beyond_prev = curr_end - prev_end; + // Check if current range is significantly overlapped by both neighbors + if curr_start > prev_start && next_start < curr_end { let overlap_with_prev = prev_end - curr_start; - if extension_beyond_prev < overlap_with_prev { - continue; + let overlap_with_next = curr_end - next_start; + let range_length = curr_end - curr_start; + + // Skip if the range is mostly covered by its neighbors + if overlap_with_prev + overlap_with_next > range_length { + should_skip = true; } } } - // Keep current range - write_idx += 1; - if write_idx != read_idx { - ranges.swap(write_idx, read_idx); + + if !should_skip { + write_idx += 1; + if write_idx != read_idx { + ranges.swap(write_idx, read_idx); + } } } else { // No overlap - keep both ranges From fdd78e19eac31a7fe84ef3cce5d840bd1649013b Mon Sep 17 00:00:00 2001 From: AndreaGuarracino Date: Fri, 13 Dec 2024 21:26:26 -0600 Subject: [PATCH 24/45] do not write unused nodes and edges --- Cargo.lock | 40 ++++++++++++++++++++++++++++++++++ Cargo.toml | 1 + src/main.rs | 63 ++++++++++++++++++++++++++++++++++++++--------------- 3 files changed, 87 insertions(+), 17 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index da73213..aa32e9b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -100,6 +100,18 @@ version = "1.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" +[[package]] +name = "bitvec" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1bc2832c24239b0141d5674bb9174f9d68a8b5b3f2753311927c172ca46f7e9c" +dependencies = [ + "funty", + "radium", + "tap", + "wyz", +] + [[package]] name = "boomphf" version = "0.5.9" @@ -309,6 +321,12 @@ version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" +[[package]] +name = "funty" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6d5a32815ae3f33302d95fdcb2ce17862f8c65363dcfd29360480ba1001fc9c" + [[package]] name = "gfa" version = "0.10.1" @@ -329,6 +347,7 @@ dependencies = [ name = "gfalace" version = "0.1.1" dependencies = [ + "bitvec", "bstr 1.11.1", "clap", "gfa", @@ -508,6 +527,12 @@ dependencies = [ "proc-macro2", ] +[[package]] +name = "radium" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc33ff2d4973d518d823d61aa239014831e521c75da58e3df4840d3f47749d09" + [[package]] name = "rand_core" version = "0.6.4" @@ -634,6 +659,12 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "tap" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369" + [[package]] name = "thiserror" version = "1.0.69" @@ -776,6 +807,15 @@ dependencies = [ "rand_core", ] +[[package]] +name = "wyz" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05f360fc0b24296329c78fda852a1e9ae82de9cf7b27dae4b7f62f118f77b9ed" +dependencies = [ + "tap", +] + [[package]] name = "zstd" version = "0.13.2" diff --git a/Cargo.toml b/Cargo.toml index 896718d..fc645eb 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -11,3 +11,4 @@ niffler = "2.6.0" clap = { version = "4.5.22", features = ["derive"] } handlegraph = { git = "https://github.com/chfi/rs-handlegraph", rev = "3ac575e4216ce16a16667503a8875e469a40a97a" } gfa = "0.10.1" +bitvec = "1.0.1" diff --git a/src/main.rs b/src/main.rs index 0554402..17a6504 100644 --- a/src/main.rs +++ b/src/main.rs @@ -9,6 +9,7 @@ use handlegraph::pathhandlegraph::{IntoPathIds, GraphPathNames, GraphPathsRef, M use handlegraph::hashgraph::HashGraph; //use handlegraph::pathhandlegraph::PathStep; use gfa::{gfa::GFA, parser::GFAParser}; +use bitvec::{bitvec, prelude::BitVec}; #[derive(Parser, Debug)] #[clap(author, version, about)] @@ -70,14 +71,16 @@ fn read_gfa(gfa_path: &str, parser: &GFAParser) -> std::io::Result std::io::Result<()> { +fn write_graph_to_gfa(graph: &HashGraph, output_path: &str, nodes_to_remove: &BitVec) -> std::io::Result<()> { let mut file = File::create(output_path)?; // Write GFA version writeln!(file, "H\tVN:Z:1.0")?; - // Collect and sort nodes by ID - let mut nodes: Vec = graph.handles().collect(); + // Collect and sort nodes by ID, excluding marked nodes + let mut nodes: Vec = graph.handles() + .filter(|handle| !nodes_to_remove[u64::from(handle.id()) as usize]) + .collect(); nodes.sort_by_key(|handle| handle.id()); // Write sorted nodes (Segments) @@ -88,8 +91,13 @@ fn write_graph_to_gfa(graph: &HashGraph, output_path: &str) -> std::io::Result<( writeln!(file, "S\t{}\t{}", handle.id(), sequence_str)?; } - // Collect and sort edges - let mut edges: Vec = graph.edges().collect(); + // Collect and sort edges, excluding those connected to marked nodes + let mut edges: Vec = graph.edges() + .filter(|edge| { + !nodes_to_remove[u64::from(edge.0.id()) as usize] && + !nodes_to_remove[u64::from(edge.1.id()) as usize] + }) + .collect(); edges.sort_by(|a, b| { a.0.id().cmp(&b.0.id()) .then(a.1.id().cmp(&b.1.id())) @@ -104,16 +112,8 @@ fn write_graph_to_gfa(graph: &HashGraph, output_path: &str) -> std::io::Result<( writeln!(file, "L\t{}\t{}\t{}\t{}\t0M", from_id, from_orient, to_id, to_orient)?; } - // Collect and sort paths by name - let mut paths: Vec<_> = graph.path_ids().collect(); - paths.sort_by_key(|&path_id| { - graph.get_path_name(path_id) - .map(|name_iter| name_iter.collect::>()) - .unwrap_or_default() - }); - - // Write sorted paths - for path_id in paths { + // Write paths + for path_id in graph.path_ids() { if let Some(name_iter) = graph.get_path_name(path_id) { let path_name = String::from_utf8(name_iter.collect::>()) .unwrap_or_else(|_| String::from("unknown_path")); @@ -661,13 +661,42 @@ fn main() { eprintln!("Total paths created: {}", GraphPaths::path_count(&combined_graph)); } - // Write the combined graph to GFA file - match write_graph_to_gfa(&combined_graph, &args.output) { + // After all paths are created but before writing the graph + if args.debug { + eprintln!("Total nodes before filtering: {}", combined_graph.node_count()); + } + + let nodes_to_skip = mark_nodes_for_removal(&combined_graph); + + if args.debug { + eprintln!("Nodes to be filtered out: {}", nodes_to_skip.count_ones()); + } + + // Write the combined graph to GFA file, skipping unused nodes + match write_graph_to_gfa(&combined_graph, &args.output, &nodes_to_skip) { Ok(_) => if args.debug {eprintln!("Successfully wrote combined graph to {}", args.output)}, Err(e) => eprintln!("Error writing GFA file: {}", e), } } + +fn mark_nodes_for_removal(graph: &HashGraph) -> BitVec { + // Create a bitvector with all nodes initially marked for removal + let max_node_id = u64::from(graph.max_node_id()); + let mut nodes_to_remove = bitvec![1; max_node_id as usize + 1]; + + // Mark nodes used in paths as not to be removed (set bit to 0) + for path_id in graph.path_ids() { + if let Some(path_ref) = graph.get_path_ref(path_id) { + for handle in &path_ref.nodes { + nodes_to_remove.set(u64::from(handle.id()) as usize, false); + } + } + } + + nodes_to_remove +} + fn split_path_name(path_name: &str) -> Option<(String, usize, usize)> { // Find the last ':' to split the range from the key if let Some(last_colon) = path_name.rfind(':') { From 8e4b32981dcdc3ba5929cf47cf5fe4ad192a46d0 Mon Sep 17 00:00:00 2001 From: AndreaGuarracino Date: Fri, 13 Dec 2024 21:37:02 -0600 Subject: [PATCH 25/45] optimized node IDs --- src/main.rs | 48 +++++++++++++++++++++++++++++++++++------------- 1 file changed, 35 insertions(+), 13 deletions(-) diff --git a/src/main.rs b/src/main.rs index 17a6504..21350b8 100644 --- a/src/main.rs +++ b/src/main.rs @@ -77,18 +77,27 @@ fn write_graph_to_gfa(graph: &HashGraph, output_path: &str, nodes_to_remove: &Bi // Write GFA version writeln!(file, "H\tVN:Z:1.0")?; - // Collect and sort nodes by ID, excluding marked nodes + // Collect nodes, excluding marked nodes let mut nodes: Vec = graph.handles() .filter(|handle| !nodes_to_remove[u64::from(handle.id()) as usize]) .collect(); nodes.sort_by_key(|handle| handle.id()); - // Write sorted nodes (Segments) - for handle in nodes { - let sequence = graph.sequence(handle).collect::>(); + // Create mapping from old IDs to new sequential IDs using a Vec + // We allocate a vector with capacity for the max node ID + let max_id = u64::from(nodes.last().unwrap().id()) as usize; + let mut id_mapping = vec![0; max_id + 1]; + for (new_id, handle) in nodes.iter().enumerate() { + id_mapping[u64::from(handle.id()) as usize] = new_id + 1; // +1 to start from 1 + } + + // Write nodes with new IDs + for handle in &nodes { + let sequence = graph.sequence(*handle).collect::>(); let sequence_str = String::from_utf8(sequence) .unwrap_or_else(|_| String::from("N")); - writeln!(file, "S\t{}\t{}", handle.id(), sequence_str)?; + let new_id = id_mapping[u64::from(handle.id()) as usize]; + writeln!(file, "S\t{}\t{}", new_id, sequence_str)?; } // Collect and sort edges, excluding those connected to marked nodes @@ -103,17 +112,25 @@ fn write_graph_to_gfa(graph: &HashGraph, output_path: &str, nodes_to_remove: &Bi .then(a.1.id().cmp(&b.1.id())) }); - // Write sorted edges (Links) + // Write edges with new IDs for edge in edges { - let from_id = edge.0.id(); - let to_id = edge.1.id(); + let from_id = id_mapping[u64::from(edge.0.id()) as usize]; + let to_id = id_mapping[u64::from(edge.1.id()) as usize]; let from_orient = if edge.0.is_reverse() { "-" } else { "+" }; let to_orient = if edge.1.is_reverse() { "-" } else { "+" }; writeln!(file, "L\t{}\t{}\t{}\t{}\t0M", from_id, from_orient, to_id, to_orient)?; } - // Write paths - for path_id in graph.path_ids() { + // Collect and sort paths by name + let mut paths: Vec<_> = graph.path_ids().collect(); + paths.sort_by_key(|&path_id| { + graph.get_path_name(path_id) + .map(|name_iter| name_iter.collect::>()) + .unwrap_or_default() + }); + + // Write paths with new IDs + for path_id in paths { if let Some(name_iter) = graph.get_path_name(path_id) { let path_name = String::from_utf8(name_iter.collect::>()) .unwrap_or_else(|_| String::from("unknown_path")); @@ -121,12 +138,17 @@ fn write_graph_to_gfa(graph: &HashGraph, output_path: &str, nodes_to_remove: &Bi let mut path_elements = Vec::new(); if let Some(path_ref) = graph.get_path_ref(path_id) { for handle in &path_ref.nodes { - let orient = if handle.is_reverse() { "-" } else { "+" }; - path_elements.push(format!("{}{}", handle.id(), orient)); + let new_id = id_mapping[u64::from(handle.id()) as usize]; + if new_id > 0 { // Only include nodes that weren't removed + let orient = if handle.is_reverse() { "-" } else { "+" }; + path_elements.push(format!("{}{}", new_id, orient)); + } } } - writeln!(file, "P\t{}\t{}\t*", path_name, path_elements.join(","))?; + if !path_elements.is_empty() { + writeln!(file, "P\t{}\t{}\t*", path_name, path_elements.join(","))?; + } } } From f5959ab529ecf2274152eae599e1be47026959cb Mon Sep 17 00:00:00 2001 From: AndreaGuarracino Date: Fri, 13 Dec 2024 21:47:20 -0600 Subject: [PATCH 26/45] better management of temp files --- Cargo.lock | 63 ++++++++++++++++++++++++++++++++++- Cargo.toml | 1 + src/main.rs | 94 +++++++++++++++++++++++++++++++++++++++-------------- 3 files changed, 133 insertions(+), 25 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index aa32e9b..3ecc40f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -100,6 +100,12 @@ version = "1.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" +[[package]] +name = "bitflags" +version = "2.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b048fb63fd8b5923fc5aa7b340d8e156aec7ec02f0c78fa8a6ddc2613f6f71de" + [[package]] name = "bitvec" version = "1.0.1" @@ -305,6 +311,22 @@ version = "1.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0" +[[package]] +name = "errno" +version = "0.3.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "33d852cb9b869c2a9b3df2f71a3074817f01e1844f839a144f5fcef059a4eb5d" +dependencies = [ + "libc", + "windows-sys", +] + +[[package]] +name = "fastrand" +version = "2.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" + [[package]] name = "flate2" version = "1.0.35" @@ -353,6 +375,7 @@ dependencies = [ "gfa", "handlegraph", "niffler", + "tempfile", ] [[package]] @@ -405,7 +428,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6607c62aa161d23d17a9072cc5da0be67cdfc89d3afb1e8d9c842bebc2525ffe" dependencies = [ "arrayvec", - "bitflags", + "bitflags 1.3.2", "cfg-if", "ryu", "static_assertions", @@ -437,6 +460,12 @@ dependencies = [ "pkg-config", ] +[[package]] +name = "linux-raw-sys" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89" + [[package]] name = "log" version = "0.4.22" @@ -503,6 +532,12 @@ dependencies = [ "autocfg", ] +[[package]] +name = "once_cell" +version = "1.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775" + [[package]] name = "pkg-config" version = "0.3.31" @@ -594,6 +629,19 @@ version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" +[[package]] +name = "rustix" +version = "0.38.42" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f93dc38ecbab2eb790ff964bb77fa94faf256fd3e73285fd7ba0903b76bedb85" +dependencies = [ + "bitflags 2.6.0", + "errno", + "libc", + "linux-raw-sys", + "windows-sys", +] + [[package]] name = "ryu" version = "1.0.18" @@ -665,6 +713,19 @@ version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369" +[[package]] +name = "tempfile" +version = "3.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "28cce251fcbc87fac86a866eeb0d6c2d536fc16d06f184bb61aeae11aa4cee0c" +dependencies = [ + "cfg-if", + "fastrand", + "once_cell", + "rustix", + "windows-sys", +] + [[package]] name = "thiserror" version = "1.0.69" diff --git a/Cargo.toml b/Cargo.toml index fc645eb..54238a5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,3 +12,4 @@ clap = { version = "4.5.22", features = ["derive"] } handlegraph = { git = "https://github.com/chfi/rs-handlegraph", rev = "3ac575e4216ce16a16667503a8875e469a40a97a" } gfa = "0.10.1" bitvec = "1.0.1" +tempfile = "3.14.0" diff --git a/src/main.rs b/src/main.rs index 21350b8..201de25 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,15 +1,26 @@ -use std::collections::{BTreeMap}; +use std::{ + collections::BTreeMap, + fs::File, + io::{self, Write}, + path::Path, +}; use clap::Parser; -use std::fs::File; -use std::io::Write; -use handlegraph::handle::{Handle, NodeId, Edge}; -use handlegraph::handlegraph::*; -use handlegraph::mutablehandlegraph::*; -use handlegraph::pathhandlegraph::{IntoPathIds, GraphPathNames, GraphPathsRef, MutableGraphPaths, GraphPaths}; -use handlegraph::hashgraph::HashGraph; -//use handlegraph::pathhandlegraph::PathStep; +use handlegraph::{ + handle::{Handle, NodeId, Edge}, + handlegraph::*, + mutablehandlegraph::*, + pathhandlegraph::{ + IntoPathIds, + GraphPathNames, + GraphPathsRef, + MutableGraphPaths, + GraphPaths + }, + hashgraph::HashGraph, +}; use gfa::{gfa::GFA, parser::GFAParser}; use bitvec::{bitvec, prelude::BitVec}; +use tempfile::NamedTempFile; #[derive(Parser, Debug)] #[clap(author, version, about)] @@ -47,27 +58,62 @@ fn has_overlap(r1: &RangeInfo, r2: &RangeInfo) -> bool { r1.start < r2.end && r2.start < r1.end } -// Helper function to read GFA file -fn read_gfa(gfa_path: &str, parser: &GFAParser) -> std::io::Result> { +fn read_gfa(gfa_path: &str, parser: &GFAParser) -> io::Result> { if gfa_path.ends_with(".gz") { - let file = std::fs::File::open(gfa_path)?; - let (mut reader, _format) = niffler::get_reader(Box::new(file)).unwrap(); + let file = std::fs::File::open(gfa_path).map_err(|e| { + io::Error::new( + e.kind(), + format!("Failed to open gzipped file '{}': {}", gfa_path, e) + ) + })?; + + let (mut reader, _format) = niffler::get_reader(Box::new(file)) + .map_err(|e| io::Error::new(io::ErrorKind::Other, e))?; let mut decompressed = Vec::new(); - reader.read_to_end(&mut decompressed)?; + reader.read_to_end(&mut decompressed).map_err(|e| { + io::Error::new( + e.kind(), + format!("Failed to decompress file '{}': {}", gfa_path, e) + ) + })?; - let temp_path = format!("{}.tmp", gfa_path); - { - let mut temp_file = std::fs::File::create(&temp_path)?; - temp_file.write_all(&decompressed)?; - } + // Create temporary file in the same directory as the input file for better performance + let parent_dir = Path::new(gfa_path).parent().unwrap_or(Path::new(".")); + let temp_file = NamedTempFile::new_in(parent_dir).map_err(|e| { + io::Error::new( + e.kind(), + format!("Failed to create temporary file: {}", e) + ) + })?; - let result = parser.parse_file(&temp_path).unwrap(); - std::fs::remove_file(&temp_path)?; + // Write decompressed data + temp_file.as_file().write_all(&decompressed).map_err(|e| { + io::Error::new( + e.kind(), + format!("Failed to write to temporary file: {}", e) + ) + })?; - Ok(result) + // Parse GFA + parser.parse_file(temp_file.path().to_str().ok_or_else(|| { + io::Error::new( + io::ErrorKind::InvalidData, + "Invalid temporary file path" + ) + })?).map_err(|e| { + io::Error::new( + io::ErrorKind::InvalidData, + format!("Failed to parse GFA: {}", e) + ) + }) } else { - Ok(parser.parse_file(gfa_path).unwrap()) + parser.parse_file(gfa_path).map_err(|e| { + io::Error::new( + io::ErrorKind::InvalidData, + format!("Failed to parse GFA file '{}': {}", gfa_path, e) + ) + }) } } @@ -120,7 +166,7 @@ fn write_graph_to_gfa(graph: &HashGraph, output_path: &str, nodes_to_remove: &Bi let to_orient = if edge.1.is_reverse() { "-" } else { "+" }; writeln!(file, "L\t{}\t{}\t{}\t{}\t0M", from_id, from_orient, to_id, to_orient)?; } - + // Collect and sort paths by name let mut paths: Vec<_> = graph.path_ids().collect(); paths.sort_by_key(|&path_id| { From 75fe859219b0934a2a69ca114b7ac1ef58f7427a Mon Sep 17 00:00:00 2001 From: AndreaGuarracino Date: Fri, 13 Dec 2024 21:55:23 -0600 Subject: [PATCH 27/45] FxHashMap --- Cargo.lock | 7 +++++++ Cargo.toml | 1 + src/main.rs | 4 ++-- 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 3ecc40f..ea3e602 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -375,6 +375,7 @@ dependencies = [ "gfa", "handlegraph", "niffler", + "rustc-hash", "tempfile", ] @@ -629,6 +630,12 @@ version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" +[[package]] +name = "rustc-hash" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7fb8039b3032c191086b10f11f319a6e99e1e82889c5cc6046f515c9db1d497" + [[package]] name = "rustix" version = "0.38.42" diff --git a/Cargo.toml b/Cargo.toml index 54238a5..9110992 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,3 +13,4 @@ handlegraph = { git = "https://github.com/chfi/rs-handlegraph", rev = "3ac575e42 gfa = "0.10.1" bitvec = "1.0.1" tempfile = "3.14.0" +rustc-hash = "2.1.0" diff --git a/src/main.rs b/src/main.rs index 201de25..e8a91f4 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,9 +1,9 @@ use std::{ - collections::BTreeMap, fs::File, io::{self, Write}, path::Path, }; +use rustc_hash::FxHashMap; use clap::Parser; use handlegraph::{ handle::{Handle, NodeId, Edge}, @@ -206,7 +206,7 @@ fn main() { // Create a single combined graph let mut combined_graph = HashGraph::new(); - let mut path_key_ranges: BTreeMap> = BTreeMap::new(); + let mut path_key_ranges: FxHashMap> = FxHashMap::default(); let mut id_translations = Vec::new(); // Process each GFA file From d14c951bb1543d38b54dee369ba293402e15a9e9 Mon Sep 17 00:00:00 2001 From: AndreaGuarracino Date: Sat, 14 Dec 2024 11:11:14 -0600 Subject: [PATCH 28/45] remove redundant loop --- src/main.rs | 53 ----------------------------------------------------- 1 file changed, 53 deletions(-) diff --git a/src/main.rs b/src/main.rs index e8a91f4..d7a5f15 100644 --- a/src/main.rs +++ b/src/main.rs @@ -392,59 +392,6 @@ fn main() { } } - let mut write_idx = 0; - for read_idx in 1..ranges.len() { - let (prev_start, prev_end) = (ranges[write_idx].start, ranges[write_idx].end); - let (curr_start, curr_end) = (ranges[read_idx].start, ranges[read_idx].end); - - if curr_start == prev_start && curr_end == prev_end { - // Skip duplicate range - continue; - } else if curr_start >= prev_start && curr_end <= prev_end { - // Skip range that is fully contained within previous range - continue; - } else if prev_start >= curr_start && prev_end <= curr_end { - // Previous range is fully contained within current range - ranges.swap(write_idx, read_idx); - } else if curr_start < prev_end { - // Handle overlapping ranges - if read_idx < ranges.len() - 1 { - let next_start = ranges[read_idx + 1].start; - - // Check if current range is significantly overlapped by both previous and next ranges - if curr_start > prev_start && next_start < curr_end { - // Calculate overlap percentages - let overlap_with_prev = prev_end - curr_start; - let overlap_with_next = curr_end - next_start; - let range_length = curr_end - curr_start; - - // If the range is mostly overlapped by both neighbors, skip it - if overlap_with_prev + overlap_with_next > range_length { - continue; - } - } - } - // Keep current range - write_idx += 1; - if write_idx != read_idx { - ranges.swap(write_idx, read_idx); - } - } else { - // No overlap - keep both ranges - write_idx += 1; - if write_idx != read_idx { - ranges.swap(write_idx, read_idx); - } - } - } - ranges.truncate(write_idx + 1); - if args.debug { - eprintln!("Path key '{}' without redundancy 2", path_key); - for range in ranges.iter() { - eprintln!(" Range: start={}, end={}, num.steps={}, gfa_id={}", range.start, range.end, range.steps.len(), range.gfa_id); - } - } - if args.debug { eprintln!("Trimming overlapping ranges"); } From c692c9fcd5830f67348cfef2cf6cdcf5f5265613 Mon Sep 17 00:00:00 2001 From: AndreaGuarracino Date: Sat, 14 Dec 2024 11:55:08 -0600 Subject: [PATCH 29/45] less memory --- src/main.rs | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/src/main.rs b/src/main.rs index d7a5f15..ea7a95a 100644 --- a/src/main.rs +++ b/src/main.rs @@ -45,7 +45,6 @@ struct RangeInfo { gfa_id: usize, steps: Vec, // Path steps for this range step_positions: Vec<(usize, usize)>, // Start and end positions of each step - step_lengths: Vec, // Lengths of each step } // Helper function to check if two ranges are contiguous @@ -277,7 +276,6 @@ fn main() { gfa_id, steps: translated_steps, step_positions, - step_lengths, }); } else if args.debug { eprintln!(" Warning: Path '{}' has no steps", path_name); @@ -384,15 +382,12 @@ fn main() { } ranges.truncate(write_idx + 1); - if args.debug { eprintln!("Path key '{}' without redundancy", path_key); for range in ranges.iter() { eprintln!(" Range: start={}, end={}, num.steps={}, gfa_id={}", range.start, range.end, range.steps.len(), range.gfa_id); } - } - if args.debug { eprintln!("Trimming overlapping ranges"); } @@ -454,7 +449,6 @@ fn main() { // Initialize new vectors to store updated steps let mut new_steps = Vec::new(); let mut new_step_positions = Vec::new(); - let mut new_step_lengths = Vec::new(); // Iterate over the original steps for idx in 0..r2.steps.len() { @@ -497,7 +491,6 @@ fn main() { new_steps.push(new_node); new_step_positions.push((step_start, overlap_start)); - new_step_lengths.push(new_seq.len()); } else if step_end > overlap_end { if args.debug { eprintln!(" Adding right part of step [start={}, end={}]", overlap_within_step_end, step_end); @@ -513,20 +506,17 @@ fn main() { new_steps.push(new_node); new_step_positions.push((overlap_end, step_end)); - new_step_lengths.push(new_seq.len()); } } else { // Keep steps that are not to be removed or split new_steps.push(step_handle); new_step_positions.push((step_start, step_end)); - new_step_lengths.push(r2.step_lengths[idx]); } } // Update r2 with the new steps r2.steps = new_steps; r2.step_positions = new_step_positions; - r2.step_lengths = new_step_lengths; // Update edges for the modified steps for idx in 0..r2.steps.len() { From 411708ca8cf0f4f6621a7f1f72c1ee58b975a661 Mon Sep 17 00:00:00 2001 From: AndreaGuarracino Date: Sat, 14 Dec 2024 13:00:57 -0600 Subject: [PATCH 30/45] improve performance --- src/main.rs | 161 ++++++++++++++++++++++++++-------------------------- 1 file changed, 79 insertions(+), 82 deletions(-) diff --git a/src/main.rs b/src/main.rs index ea7a95a..12df35f 100644 --- a/src/main.rs +++ b/src/main.rs @@ -166,37 +166,35 @@ fn write_graph_to_gfa(graph: &HashGraph, output_path: &str, nodes_to_remove: &Bi writeln!(file, "L\t{}\t{}\t{}\t{}\t0M", from_id, from_orient, to_id, to_orient)?; } - // Collect and sort paths by name - let mut paths: Vec<_> = graph.path_ids().collect(); - paths.sort_by_key(|&path_id| { - graph.get_path_name(path_id) - .map(|name_iter| name_iter.collect::>()) - .unwrap_or_default() + // Collect and sort paths directly with references to avoid multiple lookups + let mut path_entries: Vec<_> = graph.paths.iter().collect(); + path_entries.sort_by_key(|(_, path)| { + // Get name directly from path struct instead of doing lookups + path.name.as_slice() }); // Write paths with new IDs - for path_id in paths { - if let Some(name_iter) = graph.get_path_name(path_id) { - let path_name = String::from_utf8(name_iter.collect::>()) - .unwrap_or_else(|_| String::from("unknown_path")); - - let mut path_elements = Vec::new(); - if let Some(path_ref) = graph.get_path_ref(path_id) { - for handle in &path_ref.nodes { - let new_id = id_mapping[u64::from(handle.id()) as usize]; - if new_id > 0 { // Only include nodes that weren't removed - let orient = if handle.is_reverse() { "-" } else { "+" }; - path_elements.push(format!("{}{}", new_id, orient)); - } - } - } - - if !path_elements.is_empty() { - writeln!(file, "P\t{}\t{}\t*", path_name, path_elements.join(","))?; + for (_path_id, path) in path_entries { + // Convert name bytes to string once + let path_name = String::from_utf8_lossy(&path.name); + + // Pre-allocate vector with capacity + let mut path_elements = Vec::with_capacity(path.nodes.len()); + + // Process nodes directly from path reference + for handle in &path.nodes { + let new_id = id_mapping[u64::from(handle.id()) as usize]; + if new_id > 0 { // Only include nodes that weren't removed + let orient = if handle.is_reverse() { "-" } else { "+" }; + path_elements.push(format!("{}{}", new_id, orient)); } } + + if !path_elements.is_empty() { + writeln!(file, "P\t{}\t{}\t*", path_name, path_elements.join(","))?; + } } - + Ok(()) } @@ -447,8 +445,8 @@ fn main() { } // Initialize new vectors to store updated steps - let mut new_steps = Vec::new(); - let mut new_step_positions = Vec::new(); + let mut new_steps = Vec::with_capacity(r2.steps.len() / 2); + let mut new_step_positions = Vec::with_capacity(r2.steps.len() / 2); // Iterate over the original steps for idx in 0..r2.steps.len() { @@ -691,11 +689,9 @@ fn mark_nodes_for_removal(graph: &HashGraph) -> BitVec { let mut nodes_to_remove = bitvec![1; max_node_id as usize + 1]; // Mark nodes used in paths as not to be removed (set bit to 0) - for path_id in graph.path_ids() { - if let Some(path_ref) = graph.get_path_ref(path_id) { - for handle in &path_ref.nodes { - nodes_to_remove.set(u64::from(handle.id()) as usize, false); - } + for (_path_id, path_ref) in graph.paths.iter() { + for handle in &path_ref.nodes { + nodes_to_remove.set(u64::from(handle.id()) as usize, false); } } @@ -826,58 +822,59 @@ mod tests { .iter() .map(|(start, end, gfa_id)| create_range_info(*start, *end, *gfa_id)) .collect(); -// Sort ranges by start position -ranges.sort_by_key(|r| (r.start, r.end)); - -let mut write_idx = 0; -for read_idx in 1..ranges.len() { - let (prev_start, prev_end) = (ranges[write_idx].start, ranges[write_idx].end); - let (curr_start, curr_end) = (ranges[read_idx].start, ranges[read_idx].end); - - if curr_start == prev_start && curr_end == prev_end { - // Skip duplicate range - continue; - } else if curr_start >= prev_start && curr_end <= prev_end { - // Skip range that is fully contained within previous range - continue; - } else if prev_start >= curr_start && prev_end <= curr_end { - // Previous range is fully contained within current range - ranges.swap(write_idx, read_idx); - } else if curr_start < prev_end { - // Handle overlapping ranges - check both previous and next ranges - let mut should_skip = false; - - if read_idx < ranges.len() - 1 { - let next_start = ranges[read_idx + 1].start; - - // Check if current range is significantly overlapped by both neighbors - if curr_start > prev_start && next_start < curr_end { - let overlap_with_prev = prev_end - curr_start; - let overlap_with_next = curr_end - next_start; - let range_length = curr_end - curr_start; - - // Skip if the range is mostly covered by its neighbors - if overlap_with_prev + overlap_with_next > range_length { - should_skip = true; + + // Sort ranges by start position + ranges.sort_by_key(|r| (r.start, r.end)); + + let mut write_idx = 0; + for read_idx in 1..ranges.len() { + let (prev_start, prev_end) = (ranges[write_idx].start, ranges[write_idx].end); + let (curr_start, curr_end) = (ranges[read_idx].start, ranges[read_idx].end); + + if curr_start == prev_start && curr_end == prev_end { + // Skip duplicate range + continue; + } else if curr_start >= prev_start && curr_end <= prev_end { + // Skip range that is fully contained within previous range + continue; + } else if prev_start >= curr_start && prev_end <= curr_end { + // Previous range is fully contained within current range + ranges.swap(write_idx, read_idx); + } else if curr_start < prev_end { + // Handle overlapping ranges - check both previous and next ranges + let mut should_skip = false; + + if read_idx < ranges.len() - 1 { + let next_start = ranges[read_idx + 1].start; + + // Check if current range is significantly overlapped by both neighbors + if curr_start > prev_start && next_start < curr_end { + let overlap_with_prev = prev_end - curr_start; + let overlap_with_next = curr_end - next_start; + let range_length = curr_end - curr_start; + + // Skip if the range is mostly covered by its neighbors + if overlap_with_prev + overlap_with_next > range_length { + should_skip = true; + } + } + } + + if !should_skip { + write_idx += 1; + if write_idx != read_idx { + ranges.swap(write_idx, read_idx); + } + } + } else { + // No overlap - keep both ranges + write_idx += 1; + if write_idx != read_idx { + ranges.swap(write_idx, read_idx); + } } } - } - - if !should_skip { - write_idx += 1; - if write_idx != read_idx { - ranges.swap(write_idx, read_idx); - } - } - } else { - // No overlap - keep both ranges - write_idx += 1; - if write_idx != read_idx { - ranges.swap(write_idx, read_idx); - } - } -} -ranges.truncate(write_idx + 1); + ranges.truncate(write_idx + 1); // Create expected ranges let expected: Vec = expected_ranges From c4beda4a5274e39da1919aa0000c2140c46348e4 Mon Sep 17 00:00:00 2001 From: AndreaGuarracino Date: Sat, 14 Dec 2024 13:16:20 -0600 Subject: [PATCH 31/45] ~2X faster --- src/main.rs | 77 ++++++++++++++++++++++++----------------------------- 1 file changed, 35 insertions(+), 42 deletions(-) diff --git a/src/main.rs b/src/main.rs index 12df35f..cd938e3 100644 --- a/src/main.rs +++ b/src/main.rs @@ -10,9 +10,6 @@ use handlegraph::{ handlegraph::*, mutablehandlegraph::*, pathhandlegraph::{ - IntoPathIds, - GraphPathNames, - GraphPathsRef, MutableGraphPaths, GraphPaths }, @@ -237,47 +234,43 @@ fn main() { } // Process paths and collect ranges with their steps - for path_id in block_graph.path_ids() { - if let Some(name_iter) = block_graph.get_path_name(path_id) { - let path_name = String::from_utf8(name_iter.collect::>()).unwrap(); - - if let Some((sample_hap_name, start, end)) = split_path_name(&path_name) { - // Get the path steps and translate their IDs - let mut translated_steps = Vec::new(); - let mut step_positions = Vec::new(); - let mut step_lengths = Vec::new(); - let mut cumulative_pos = start; - - if let Some(path_ref) = block_graph.get_path_ref(path_id) { - for step in path_ref.nodes.iter() { - let translated_id = id_translation + step.id().into(); - let translated_step = Handle::pack(translated_id, step.is_reverse()); - translated_steps.push(translated_step); - - // Get the sequence length of the node - let node_seq = block_graph.sequence(*step).collect::>(); - let node_length = node_seq.len(); - step_lengths.push(node_length); - - // Record the end position of this step - step_positions.push((cumulative_pos, cumulative_pos + node_length)); - cumulative_pos = cumulative_pos + node_length; - } + for (_path_id, path_ref) in block_graph.paths.iter() { + let path_name = String::from_utf8_lossy(&path_ref.name); + + if let Some((sample_hap_name, start, end)) = split_path_name(&path_name) { + // Get the path steps and translate their IDs + let mut translated_steps = Vec::new(); + let mut step_positions = Vec::new(); + let mut step_lengths = Vec::new(); + let mut cumulative_pos = start; + + for step in path_ref.nodes.iter() { + let translated_id = id_translation + step.id().into(); + let translated_step = Handle::pack(translated_id, step.is_reverse()); + translated_steps.push(translated_step); + + // Get the sequence length of the node + let node_seq = block_graph.sequence(*step).collect::>(); + let node_length = node_seq.len(); + step_lengths.push(node_length); + + // Record the end position of this step + step_positions.push((cumulative_pos, cumulative_pos + node_length)); + cumulative_pos = cumulative_pos + node_length; } - if !translated_steps.is_empty() { - path_key_ranges.entry(sample_hap_name) - .or_default() - .push(RangeInfo { - start, - end, - gfa_id, - steps: translated_steps, - step_positions, - }); - } else if args.debug { - eprintln!(" Warning: Path '{}' has no steps", path_name); - } + if !translated_steps.is_empty() { + path_key_ranges.entry(sample_hap_name) + .or_default() + .push(RangeInfo { + start, + end, + gfa_id, + steps: translated_steps, + step_positions, + }); + } else if args.debug { + eprintln!(" Warning: Path '{}' has no steps", path_name); } } } From 999e2196215c5e0c4e38b30a454cbd03767dff5a Mon Sep 17 00:00:00 2001 From: AndreaGuarracino Date: Sat, 14 Dec 2024 14:37:02 -0600 Subject: [PATCH 32/45] tweaks --- src/main.rs | 52 +++++++++++++++++++++++++--------------------------- 1 file changed, 25 insertions(+), 27 deletions(-) diff --git a/src/main.rs b/src/main.rs index cd938e3..aa8a51a 100644 --- a/src/main.rs +++ b/src/main.rs @@ -165,21 +165,21 @@ fn write_graph_to_gfa(graph: &HashGraph, output_path: &str, nodes_to_remove: &Bi // Collect and sort paths directly with references to avoid multiple lookups let mut path_entries: Vec<_> = graph.paths.iter().collect(); - path_entries.sort_by_key(|(_, path)| { + path_entries.sort_by_key(|(_, path_ref)| { // Get name directly from path struct instead of doing lookups - path.name.as_slice() + path_ref.name.as_slice() }); // Write paths with new IDs - for (_path_id, path) in path_entries { + for (_path_id, path_ref) in path_entries { // Convert name bytes to string once - let path_name = String::from_utf8_lossy(&path.name); + let path_name = String::from_utf8_lossy(&path_ref.name); // Pre-allocate vector with capacity - let mut path_elements = Vec::with_capacity(path.nodes.len()); + let mut path_elements = Vec::with_capacity(path_ref.nodes.len()); // Process nodes directly from path reference - for handle in &path.nodes { + for handle in &path_ref.nodes { let new_id = id_mapping[u64::from(handle.id()) as usize]; if new_id > 0 { // Only include nodes that weren't removed let orient = if handle.is_reverse() { "-" } else { "+" }; @@ -241,23 +241,21 @@ fn main() { // Get the path steps and translate their IDs let mut translated_steps = Vec::new(); let mut step_positions = Vec::new(); - let mut step_lengths = Vec::new(); let mut cumulative_pos = start; - for step in path_ref.nodes.iter() { - let translated_id = id_translation + step.id().into(); - let translated_step = Handle::pack(translated_id, step.is_reverse()); - translated_steps.push(translated_step); + for step in path_ref.nodes.iter() { + let translated_id = id_translation + step.id().into(); + let translated_step = Handle::pack(translated_id, step.is_reverse()); + translated_steps.push(translated_step); - // Get the sequence length of the node - let node_seq = block_graph.sequence(*step).collect::>(); - let node_length = node_seq.len(); - step_lengths.push(node_length); + // Get the sequence length of the node + let node_seq = block_graph.sequence(*step).collect::>(); + let node_length = node_seq.len(); - // Record the end position of this step - step_positions.push((cumulative_pos, cumulative_pos + node_length)); - cumulative_pos = cumulative_pos + node_length; - } + // Record the end position of this step + step_positions.push((cumulative_pos, cumulative_pos + node_length)); + cumulative_pos = cumulative_pos + node_length; + } if !translated_steps.is_empty() { path_key_ranges.entry(sample_hap_name) @@ -394,11 +392,9 @@ fn main() { let overlap_end = std::cmp::min(r1.end, r2.end); if args.debug { - let overlap_amount = overlap_end - overlap_start; - eprintln!( " Overlap detected: Range1 [start={}, end={}], Range2 [start={}, end={}], Overlap [start={}, end={}], Overlap size={}", - r1.start, r1.end, r2.start, r2.end, overlap_start, overlap_end, overlap_amount + r1.start, r1.end, r2.start, r2.end, overlap_start, overlap_end, overlap_end - overlap_start ); } @@ -547,14 +543,17 @@ fn main() { let mut all_contiguous = true; for window in ranges.windows(2) { - if has_overlap(&window[0], &window[1]) { + let r1 = &window[0]; + let r2 = &window[1]; + + if has_overlap(r1, r2) { if args.debug { eprintln!("Unresolved overlaps detected between ranges: [start={}, end={}] and [start={}, end={}]", - window[0].start, window[0].end, window[1].start, window[1].end); + r1.start, r1.end, r2.start, r2.end); } panic!("Unresolved overlaps detected in path key '{}'", path_key); } - if !is_contiguous(&window[0], &window[1]) { + if !is_contiguous(r1, r2) { all_contiguous = false; } } @@ -718,9 +717,8 @@ mod tests { start, end, gfa_id, - steps: vec![], // Empty steps for testing + steps: vec![], // Empty steps for testing step_positions: vec![], // Empty positions for testing - step_lengths: vec![], // Empty lengths for testing } } From 7956b68218121775e58cde77a3a492cb5ebf85b7 Mon Sep 17 00:00:00 2001 From: AndreaGuarracino Date: Sat, 14 Dec 2024 14:44:19 -0600 Subject: [PATCH 33/45] remove unecessary vector --- src/main.rs | 28 +++++++++++----------------- 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/src/main.rs b/src/main.rs index aa8a51a..147f482 100644 --- a/src/main.rs +++ b/src/main.rs @@ -22,11 +22,11 @@ use tempfile::NamedTempFile; #[derive(Parser, Debug)] #[clap(author, version, about)] struct Args { - /// List of GFA file paths. + /// List of GFA file paths to combine #[clap(short, long, value_parser, num_args = 1.., value_delimiter = ' ')] gfa_list: Vec, - /// Output GFA file path + /// Output GFA file path for the combined graph #[clap(short, long, value_parser)] output: String, @@ -418,7 +418,7 @@ fn main() { // } steps_to_remove.push(idx); } else { - // if args.debug { + // if args.debug && r2.start == 11000 { // eprintln!(" Step {} [start={}, end={}, len={}] partially overlaps", idx, step_start, step_end, step_end - step_start); // } if step_to_split.is_some() { @@ -428,10 +428,10 @@ fn main() { } } - if args.debug && r2.start == 154089 { - eprintln!(" Total steps to remove: {}", steps_to_remove.len()); - eprintln!(" Step to split: {:?}", step_to_split); - } + // if args.debug && r2.start == 11000 { + // eprintln!(" Total steps to remove: {}", steps_to_remove.len()); + // eprintln!(" Step to split: {:?}", step_to_split); + // } // Initialize new vectors to store updated steps let mut new_steps = Vec::with_capacity(r2.steps.len() / 2); @@ -561,17 +561,15 @@ fn main() { if !all_contiguous && args.debug { let mut current_start = ranges[0].start; let mut current_end = ranges[0].end; - let mut current_gfa_ids = vec![ranges[0].gfa_id]; for i in 1..ranges.len() { if is_contiguous(&ranges[i-1], &ranges[i]) { // Extend current merged range current_end = ranges[i].end; - current_gfa_ids.push(ranges[i].gfa_id); } else { // Print current merged range - eprintln!(" Merged range: start={}, end={}, gfa_ids={:?}", - current_start, current_end, current_gfa_ids); + eprintln!(" Merged range: start={}, end={}", + current_start, current_end); if !has_overlap(&ranges[i-1], &ranges[i]) { // Calculate and print gap @@ -586,13 +584,12 @@ fn main() { // Start new merged range current_start = ranges[i].start; current_end = ranges[i].end; - current_gfa_ids = vec![ranges[i].gfa_id]; } } // Print final merged range - eprintln!(" Merged range: start={}, end={}, gfa_ids={:?}", - current_start, current_end, current_gfa_ids); + eprintln!(" Merged range: start={}, end={}", + current_start, current_end); } if all_contiguous { @@ -654,10 +651,7 @@ fn main() { if args.debug { eprintln!("Total paths created: {}", GraphPaths::path_count(&combined_graph)); - } - // After all paths are created but before writing the graph - if args.debug { eprintln!("Total nodes before filtering: {}", combined_graph.node_count()); } From f82584ec94bc2826fb6edb363a2b4fbc54e7378b Mon Sep 17 00:00:00 2001 From: AndreaGuarracino Date: Sat, 14 Dec 2024 15:19:34 -0600 Subject: [PATCH 34/45] a bit faster and use less memory --- src/main.rs | 49 +++++++++++++++++++++++++++++-------------------- 1 file changed, 29 insertions(+), 20 deletions(-) diff --git a/src/main.rs b/src/main.rs index 147f482..6d46809 100644 --- a/src/main.rs +++ b/src/main.rs @@ -40,8 +40,8 @@ struct RangeInfo { start: usize, end: usize, gfa_id: usize, - steps: Vec, // Path steps for this range - step_positions: Vec<(usize, usize)>, // Start and end positions of each step + steps: Vec, // Path steps for this range + step_ends: Vec, // End positions of each step (start is either the range start (for index 0) or the previous step's end position) } // Helper function to check if two ranges are contiguous @@ -240,7 +240,7 @@ fn main() { if let Some((sample_hap_name, start, end)) = split_path_name(&path_name) { // Get the path steps and translate their IDs let mut translated_steps = Vec::new(); - let mut step_positions = Vec::new(); + let mut step_ends = Vec::new(); let mut cumulative_pos = start; for step in path_ref.nodes.iter() { @@ -248,13 +248,11 @@ fn main() { let translated_step = Handle::pack(translated_id, step.is_reverse()); translated_steps.push(translated_step); - // Get the sequence length of the node + // Record the end position of this step let node_seq = block_graph.sequence(*step).collect::>(); let node_length = node_seq.len(); - - // Record the end position of this step - step_positions.push((cumulative_pos, cumulative_pos + node_length)); cumulative_pos = cumulative_pos + node_length; + step_ends.push(cumulative_pos); } if !translated_steps.is_empty() { @@ -265,7 +263,7 @@ fn main() { end, gfa_id, steps: translated_steps, - step_positions, + step_ends, }); } else if args.debug { eprintln!(" Warning: Path '{}' has no steps", path_name); @@ -401,7 +399,8 @@ fn main() { // Adjust r2 to remove the overlap let mut steps_to_remove = Vec::new(); let mut step_to_split: Option = None; - for (idx, &(step_start, step_end)) in r2.step_positions.iter().enumerate() { + for (idx, &step_end) in r2.step_ends.iter().enumerate() { + let step_start = if idx == 0 { r2.start } else { r2.step_ends[idx - 1] }; if step_end <= overlap_start { // if args.debug && r2.start == 11000 { // eprintln!(" Step {} [start={}, end={}, len={}] before overlap", idx, step_start, step_end, step_end - step_start); @@ -435,12 +434,14 @@ fn main() { // Initialize new vectors to store updated steps let mut new_steps = Vec::with_capacity(r2.steps.len() / 2); - let mut new_step_positions = Vec::with_capacity(r2.steps.len() / 2); + let mut new_step_ends = Vec::with_capacity(r2.steps.len() / 2); + let mut range_new_start = None; // Iterate over the original steps for idx in 0..r2.steps.len() { let step_handle = r2.steps[idx]; - let (step_start, step_end) = r2.step_positions[idx]; + let step_start = if idx == 0 { r2.start } else { r2.step_ends[idx - 1] }; + let step_end = r2.step_ends[idx]; if steps_to_remove.contains(&idx) { // Skip steps to remove @@ -477,7 +478,10 @@ fn main() { let new_node = combined_graph.create_handle(&new_seq, node_id); new_steps.push(new_node); - new_step_positions.push((step_start, overlap_start)); + new_step_ends.push(overlap_start); + if range_new_start.is_none() { + range_new_start = Some(step_start); + } } else if step_end > overlap_end { if args.debug { eprintln!(" Adding right part of step [start={}, end={}]", overlap_within_step_end, step_end); @@ -492,18 +496,24 @@ fn main() { let new_node = combined_graph.create_handle(&new_seq, node_id); new_steps.push(new_node); - new_step_positions.push((overlap_end, step_end)); + new_step_ends.push(step_end); + if range_new_start.is_none() { + range_new_start = Some(overlap_end); + } } } else { // Keep steps that are not to be removed or split new_steps.push(step_handle); - new_step_positions.push((step_start, step_end)); + new_step_ends.push(step_end); + if range_new_start.is_none() { + range_new_start = Some(step_start); + } } } // Update r2 with the new steps r2.steps = new_steps; - r2.step_positions = new_step_positions; + r2.step_ends = new_step_ends; // Update edges for the modified steps for idx in 0..r2.steps.len() { @@ -516,16 +526,15 @@ fn main() { } // Update r2.start and r2.end based on the new step positions - if !r2.step_positions.is_empty() { - r2.start = r2.step_positions.first().unwrap().0; - r2.end = r2.step_positions.last().unwrap().1; + if !r2.step_ends.is_empty() { + r2.start = range_new_start.unwrap(); // Safe because if we have positions, we must have set range_new_start + r2.end = *r2.step_ends.last().unwrap(); } else { // If no steps remain, set start and end to overlap_end to effectively remove this range r2.start = overlap_end; r2.end = overlap_end; } - if args.debug { eprintln!(" Updated overlaps: Range2 [start={}, end={}]", r2.start, r2.end); } @@ -712,7 +721,7 @@ mod tests { end, gfa_id, steps: vec![], // Empty steps for testing - step_positions: vec![], // Empty positions for testing + step_ends: vec![], // Empty positions for testing } } From 1d2bf00387ca578fb438c63fab1be79f82198298 Mon Sep 17 00:00:00 2001 From: AndreaGuarracino Date: Sat, 14 Dec 2024 16:21:29 -0600 Subject: [PATCH 35/45] tweaks --- src/main.rs | 67 +++++++++++++++++++++++------------------------------ 1 file changed, 29 insertions(+), 38 deletions(-) diff --git a/src/main.rs b/src/main.rs index 6d46809..907a961 100644 --- a/src/main.rs +++ b/src/main.rs @@ -43,15 +43,18 @@ struct RangeInfo { steps: Vec, // Path steps for this range step_ends: Vec, // End positions of each step (start is either the range start (for index 0) or the previous step's end position) } +impl RangeInfo { + /// Returns true if this range is immediately followed by another range + /// with no gap between them + fn is_contiguous_with(&self, other: &Self) -> bool { + self.end == other.start + } -// Helper function to check if two ranges are contiguous -fn is_contiguous(r1: &RangeInfo, r2: &RangeInfo) -> bool { - r1.end == r2.start -} - -// Helper function to check if two ranges overlap -fn has_overlap(r1: &RangeInfo, r2: &RangeInfo) -> bool { - r1.start < r2.end && r2.start < r1.end + /// Returns true if this range overlaps with another range + /// Two ranges overlap if one starts before the other ends + fn overlaps_with(&self, other: &Self) -> bool { + self.start < other.end && other.start < self.end + } } fn read_gfa(gfa_path: &str, parser: &GFAParser) -> io::Result> { @@ -120,14 +123,13 @@ fn write_graph_to_gfa(graph: &HashGraph, output_path: &str, nodes_to_remove: &Bi writeln!(file, "H\tVN:Z:1.0")?; // Collect nodes, excluding marked nodes - let mut nodes: Vec = graph.handles() + let nodes: Vec = graph.handles() .filter(|handle| !nodes_to_remove[u64::from(handle.id()) as usize]) .collect(); - nodes.sort_by_key(|handle| handle.id()); - // Create mapping from old IDs to new sequential IDs using a Vec - // We allocate a vector with capacity for the max node ID - let max_id = u64::from(nodes.last().unwrap().id()) as usize; + // Create mapping from old IDs to new sequential IDs + // We allocate a vector with capacity for the max node ID + 1 + let max_id = graph.node_count(); let mut id_mapping = vec![0; max_id + 1]; for (new_id, handle) in nodes.iter().enumerate() { id_mapping[u64::from(handle.id()) as usize] = new_id + 1; // +1 to start from 1 @@ -136,23 +138,18 @@ fn write_graph_to_gfa(graph: &HashGraph, output_path: &str, nodes_to_remove: &Bi // Write nodes with new IDs for handle in &nodes { let sequence = graph.sequence(*handle).collect::>(); - let sequence_str = String::from_utf8(sequence) - .unwrap_or_else(|_| String::from("N")); - let new_id = id_mapping[u64::from(handle.id()) as usize]; - writeln!(file, "S\t{}\t{}", new_id, sequence_str)?; + let sequence_str = String::from_utf8(sequence).unwrap_or_else(|_| String::from("N")); + let node_id = id_mapping[u64::from(handle.id()) as usize]; + writeln!(file, "S\t{}\t{}", node_id, sequence_str)?; } // Collect and sort edges, excluding those connected to marked nodes - let mut edges: Vec = graph.edges() + let edges: Vec = graph.edges() .filter(|edge| { !nodes_to_remove[u64::from(edge.0.id()) as usize] && !nodes_to_remove[u64::from(edge.1.id()) as usize] }) .collect(); - edges.sort_by(|a, b| { - a.0.id().cmp(&b.0.id()) - .then(a.1.id().cmp(&b.1.id())) - }); // Write edges with new IDs for edge in edges { @@ -172,22 +169,18 @@ fn write_graph_to_gfa(graph: &HashGraph, output_path: &str, nodes_to_remove: &Bi // Write paths with new IDs for (_path_id, path_ref) in path_entries { - // Convert name bytes to string once - let path_name = String::from_utf8_lossy(&path_ref.name); - // Pre-allocate vector with capacity let mut path_elements = Vec::with_capacity(path_ref.nodes.len()); // Process nodes directly from path reference for handle in &path_ref.nodes { - let new_id = id_mapping[u64::from(handle.id()) as usize]; - if new_id > 0 { // Only include nodes that weren't removed - let orient = if handle.is_reverse() { "-" } else { "+" }; - path_elements.push(format!("{}{}", new_id, orient)); - } + let node_id = id_mapping[u64::from(handle.id()) as usize]; + let orient = if handle.is_reverse() { "-" } else { "+" }; + path_elements.push(format!("{}{}", node_id, orient)); } if !path_elements.is_empty() { + let path_name = String::from_utf8_lossy(&path_ref.name); writeln!(file, "P\t{}\t{}\t*", path_name, path_elements.join(","))?; } } @@ -384,7 +377,7 @@ fn main() { let r1 = &mut left[left.len()-1]; let r2 = &mut right[0]; - if has_overlap(r1, r2) { + if r1.overlaps_with(&r2) { // Calculate the overlap region - use max/min to get precise overlap bounds let overlap_start = std::cmp::max(r1.start, r2.start); let overlap_end = std::cmp::min(r1.end, r2.end); @@ -555,14 +548,14 @@ fn main() { let r1 = &window[0]; let r2 = &window[1]; - if has_overlap(r1, r2) { + if r1.overlaps_with(&r2) { if args.debug { eprintln!("Unresolved overlaps detected between ranges: [start={}, end={}] and [start={}, end={}]", r1.start, r1.end, r2.start, r2.end); } panic!("Unresolved overlaps detected in path key '{}'", path_key); } - if !is_contiguous(r1, r2) { + if !r1.is_contiguous_with(&r2) { all_contiguous = false; } } @@ -572,7 +565,7 @@ fn main() { let mut current_end = ranges[0].end; for i in 1..ranges.len() { - if is_contiguous(&ranges[i-1], &ranges[i]) { + if ranges[i-1].is_contiguous_with(&ranges[i]) { // Extend current merged range current_end = ranges[i].end; } else { @@ -580,7 +573,7 @@ fn main() { eprintln!(" Merged range: start={}, end={}", current_start, current_end); - if !has_overlap(&ranges[i-1], &ranges[i]) { + if !ranges[i-1].overlaps_with(&ranges[i]) { // Calculate and print gap let gap = ranges[i].start - current_end; eprintln!(" Gap to next range: {} positions", gap); @@ -629,7 +622,7 @@ fn main() { let mut end_range = start_range; // Merge contiguous ranges - while next_idx < ranges.len() && is_contiguous(&ranges[next_idx - 1], &ranges[next_idx]) { + while next_idx < ranges.len() && ranges[next_idx - 1].is_contiguous_with(&ranges[next_idx]) { steps.extend(ranges[next_idx].steps.clone()); end_range = &ranges[next_idx]; next_idx += 1; @@ -637,7 +630,6 @@ fn main() { // Create path name with range information let path_name = format!("{}:{}-{}", path_key, start_range.start, end_range.end); - println!("Creating path: {} with {} steps", path_name, steps.len()); let path_id = combined_graph.create_path(path_name.as_bytes(), false).unwrap(); // Add steps to the path @@ -677,7 +669,6 @@ fn main() { } } - fn mark_nodes_for_removal(graph: &HashGraph) -> BitVec { // Create a bitvector with all nodes initially marked for removal let max_node_id = u64::from(graph.max_node_id()); From 1dd5a368fbd7db0d324e20c53f45461667a36964 Mon Sep 17 00:00:00 2001 From: AndreaGuarracino Date: Sat, 14 Dec 2024 18:55:40 -0600 Subject: [PATCH 36/45] a bit of structure in the code --- src/main.rs | 999 +++++++++++++++++++++++++++------------------------- 1 file changed, 517 insertions(+), 482 deletions(-) diff --git a/src/main.rs b/src/main.rs index 907a961..bdbeefd 100644 --- a/src/main.rs +++ b/src/main.rs @@ -35,6 +35,38 @@ struct Args { debug: bool, } +fn main() { + let args = Args::parse(); + + // Create a single combined graph without paths and a map of path key to ranges + let (mut combined_graph, mut path_key_ranges) = process_gfa_files(&args.gfa_list, args.debug); + + // Sort ranges and create merged paths in the combined graph + for (path_key, ranges) in path_key_ranges.iter_mut() { + sort_and_filter_ranges(path_key, ranges, args.debug); + trim_range_overlaps(path_key, ranges, &mut combined_graph, args.debug); + create_paths_from_ranges(path_key, ranges, &mut combined_graph, args.debug); + } + + if args.debug { + eprintln!("Total paths created: {}", GraphPaths::path_count(&combined_graph)); + + eprintln!("Total nodes before filtering: {}", combined_graph.node_count()); + } + + let nodes_to_skip = mark_nodes_for_removal(&combined_graph); + + if args.debug { + eprintln!("Nodes to be filtered out: {}", nodes_to_skip.count_ones()); + } + + // Write the combined graph to GFA file, skipping unused nodes + match write_graph_to_gfa(&combined_graph, &args.output, &nodes_to_skip) { + Ok(_) => if args.debug {eprintln!("Successfully wrote combined graph to {}", args.output)}, + Err(e) => eprintln!("Error writing GFA file: {}", e), + } +} + #[derive(Debug, Clone)] struct RangeInfo { start: usize, @@ -57,148 +89,17 @@ impl RangeInfo { } } -fn read_gfa(gfa_path: &str, parser: &GFAParser) -> io::Result> { - if gfa_path.ends_with(".gz") { - let file = std::fs::File::open(gfa_path).map_err(|e| { - io::Error::new( - e.kind(), - format!("Failed to open gzipped file '{}': {}", gfa_path, e) - ) - })?; - - let (mut reader, _format) = niffler::get_reader(Box::new(file)) - .map_err(|e| io::Error::new(io::ErrorKind::Other, e))?; - - let mut decompressed = Vec::new(); - reader.read_to_end(&mut decompressed).map_err(|e| { - io::Error::new( - e.kind(), - format!("Failed to decompress file '{}': {}", gfa_path, e) - ) - })?; - - // Create temporary file in the same directory as the input file for better performance - let parent_dir = Path::new(gfa_path).parent().unwrap_or(Path::new(".")); - let temp_file = NamedTempFile::new_in(parent_dir).map_err(|e| { - io::Error::new( - e.kind(), - format!("Failed to create temporary file: {}", e) - ) - })?; - - // Write decompressed data - temp_file.as_file().write_all(&decompressed).map_err(|e| { - io::Error::new( - e.kind(), - format!("Failed to write to temporary file: {}", e) - ) - })?; - - // Parse GFA - parser.parse_file(temp_file.path().to_str().ok_or_else(|| { - io::Error::new( - io::ErrorKind::InvalidData, - "Invalid temporary file path" - ) - })?).map_err(|e| { - io::Error::new( - io::ErrorKind::InvalidData, - format!("Failed to parse GFA: {}", e) - ) - }) - } else { - parser.parse_file(gfa_path).map_err(|e| { - io::Error::new( - io::ErrorKind::InvalidData, - format!("Failed to parse GFA file '{}': {}", gfa_path, e) - ) - }) - } -} - -fn write_graph_to_gfa(graph: &HashGraph, output_path: &str, nodes_to_remove: &BitVec) -> std::io::Result<()> { - let mut file = File::create(output_path)?; - - // Write GFA version - writeln!(file, "H\tVN:Z:1.0")?; - - // Collect nodes, excluding marked nodes - let nodes: Vec = graph.handles() - .filter(|handle| !nodes_to_remove[u64::from(handle.id()) as usize]) - .collect(); - - // Create mapping from old IDs to new sequential IDs - // We allocate a vector with capacity for the max node ID + 1 - let max_id = graph.node_count(); - let mut id_mapping = vec![0; max_id + 1]; - for (new_id, handle) in nodes.iter().enumerate() { - id_mapping[u64::from(handle.id()) as usize] = new_id + 1; // +1 to start from 1 - } - - // Write nodes with new IDs - for handle in &nodes { - let sequence = graph.sequence(*handle).collect::>(); - let sequence_str = String::from_utf8(sequence).unwrap_or_else(|_| String::from("N")); - let node_id = id_mapping[u64::from(handle.id()) as usize]; - writeln!(file, "S\t{}\t{}", node_id, sequence_str)?; - } - - // Collect and sort edges, excluding those connected to marked nodes - let edges: Vec = graph.edges() - .filter(|edge| { - !nodes_to_remove[u64::from(edge.0.id()) as usize] && - !nodes_to_remove[u64::from(edge.1.id()) as usize] - }) - .collect(); - - // Write edges with new IDs - for edge in edges { - let from_id = id_mapping[u64::from(edge.0.id()) as usize]; - let to_id = id_mapping[u64::from(edge.1.id()) as usize]; - let from_orient = if edge.0.is_reverse() { "-" } else { "+" }; - let to_orient = if edge.1.is_reverse() { "-" } else { "+" }; - writeln!(file, "L\t{}\t{}\t{}\t{}\t0M", from_id, from_orient, to_id, to_orient)?; - } - - // Collect and sort paths directly with references to avoid multiple lookups - let mut path_entries: Vec<_> = graph.paths.iter().collect(); - path_entries.sort_by_key(|(_, path_ref)| { - // Get name directly from path struct instead of doing lookups - path_ref.name.as_slice() - }); - - // Write paths with new IDs - for (_path_id, path_ref) in path_entries { - // Pre-allocate vector with capacity - let mut path_elements = Vec::with_capacity(path_ref.nodes.len()); - - // Process nodes directly from path reference - for handle in &path_ref.nodes { - let node_id = id_mapping[u64::from(handle.id()) as usize]; - let orient = if handle.is_reverse() { "-" } else { "+" }; - path_elements.push(format!("{}{}", node_id, orient)); - } - - if !path_elements.is_empty() { - let path_name = String::from_utf8_lossy(&path_ref.name); - writeln!(file, "P\t{}\t{}\t*", path_name, path_elements.join(","))?; - } - } - - Ok(()) -} - -fn main() { - let args = Args::parse(); - - // Create a single combined graph +fn process_gfa_files( + gfa_list: &[String], + debug: bool, +) -> (HashGraph, FxHashMap>) { let mut combined_graph = HashGraph::new(); let mut path_key_ranges: FxHashMap> = FxHashMap::default(); let mut id_translations = Vec::new(); // Process each GFA file let parser = GFAParser::new(); - for (gfa_id, gfa_path) in args.gfa_list.iter().enumerate() { + for (gfa_id, gfa_path) in gfa_list.iter().enumerate() { let gfa = read_gfa(gfa_path, &parser).unwrap(); let block_graph = HashGraph::from_gfa(&gfa); @@ -222,7 +123,7 @@ fn main() { combined_graph.create_edge(translated_edge); } - if args.debug { + if debug { eprintln!("GFA file {} ({}) processed: Added {} nodes and {} edges", gfa_id, gfa_path, block_graph.node_count(), block_graph.edge_count()); } @@ -258,415 +159,494 @@ fn main() { steps: translated_steps, step_ends, }); - } else if args.debug { + } else if debug { eprintln!(" Warning: Path '{}' has no steps", path_name); } } } } - if args.debug { + if debug { eprintln!("Collected {} nodes, {} edges, {} paths, and {} path ranges from all GFA files", combined_graph.node_count(), combined_graph.edge_count(), path_key_ranges.len(), path_key_ranges.iter().map(|(_, ranges)| ranges.len()).sum::()); } - let mut next_node_id_value = u64::from(combined_graph.max_node_id()) + 1; + (combined_graph, path_key_ranges) +} - // Sort ranges and create merged paths in the combined graph - for (path_key, ranges) in path_key_ranges.iter_mut() { - // Sort ranges by start position - ranges.sort_by_key(|r| (r.start, r.end)); +fn read_gfa(gfa_path: &str, parser: &GFAParser) -> io::Result> { + if gfa_path.ends_with(".gz") { + let file = std::fs::File::open(gfa_path).map_err(|e| { + io::Error::new( + e.kind(), + format!("Failed to open gzipped file '{}': {}", gfa_path, e) + ) + })?; + + let (mut reader, _format) = niffler::get_reader(Box::new(file)) + .map_err(|e| io::Error::new(io::ErrorKind::Other, e))?; + + let mut decompressed = Vec::new(); + reader.read_to_end(&mut decompressed).map_err(|e| { + io::Error::new( + e.kind(), + format!("Failed to decompress file '{}': {}", gfa_path, e) + ) + })?; + + // Create temporary file in the same directory as the input file for better performance + let parent_dir = Path::new(gfa_path).parent().unwrap_or(Path::new(".")); + let temp_file = NamedTempFile::new_in(parent_dir).map_err(|e| { + io::Error::new( + e.kind(), + format!("Failed to create temporary file: {}", e) + ) + })?; + + // Write decompressed data + temp_file.as_file().write_all(&decompressed).map_err(|e| { + io::Error::new( + e.kind(), + format!("Failed to write to temporary file: {}", e) + ) + })?; + + // Parse GFA + parser.parse_file(temp_file.path().to_str().ok_or_else(|| { + io::Error::new( + io::ErrorKind::InvalidData, + "Invalid temporary file path" + ) + })?).map_err(|e| { + io::Error::new( + io::ErrorKind::InvalidData, + format!("Failed to parse GFA: {}", e) + ) + }) + } else { + parser.parse_file(gfa_path).map_err(|e| { + io::Error::new( + io::ErrorKind::InvalidData, + format!("Failed to parse GFA file '{}': {}", gfa_path, e) + ) + }) + } +} - if args.debug { - eprintln!("Processing path key '{}'", path_key); - for range in ranges.iter() { - eprintln!(" Range: start={}, end={}, num.steps={}, gfa_id={}", range.start, range.end, range.steps.len(), range.gfa_id); +fn split_path_name(path_name: &str) -> Option<(String, usize, usize)> { + // Find the last ':' to split the range from the key + if let Some(last_colon) = path_name.rfind(':') { + let (key, range_str) = path_name.split_at(last_colon); + // Skip the ':' character + let range_str = &range_str[1..]; + + // Find the '-' in the range portion + if let Some((start_str, end_str)) = range_str.split_once('-') { + if let (Ok(start), Ok(end)) = (start_str.parse(), end_str.parse()) { + return Some((key.to_string(), start, end)); } + } + } + None +} - eprintln!("Removing redundant ranges"); +fn sort_and_filter_ranges( + path_key: &String, + ranges: &mut Vec, + debug: bool +) { + // Sort ranges by start position + ranges.sort_by_key(|r| (r.start, r.end)); + + if debug { + eprintln!("Processing path key '{}'", path_key); + for range in ranges.iter() { + eprintln!(" Range: start={}, end={}, num.steps={}, gfa_id={}", range.start, range.end, range.steps.len(), range.gfa_id); } - // Remove ranges that are contained within other ranges - let mut write_idx = 0; - for read_idx in 1..ranges.len() { - let (prev_start, prev_end) = (ranges[write_idx].start, ranges[write_idx].end); - let (curr_start, curr_end) = (ranges[read_idx].start, ranges[read_idx].end); - - if curr_start == prev_start && curr_end == prev_end { - // Skip duplicate range - // Current range is a duplicate of the previous range, skip it - if args.debug { - eprintln!( - " Duplicate range detected: Range [start={}, end={}] is identical to previous range and will be removed.", - curr_start, curr_end - ); - } - continue; - } else if curr_start >= prev_start && curr_end <= prev_end { - // Skip range that is fully contained within previous range - if args.debug { - eprintln!( - " Contained range detected: Range [start={}, end={}] is fully contained within previous range [start={}, end={}] and will be removed.", - curr_start, curr_end, prev_start, prev_end - ); - } - continue; - } else if prev_start >= curr_start && prev_end <= curr_end { - // Previous range is fully contained within current range - if args.debug { - eprintln!( - " Containing range detected: Previous range [start={}, end={}] is fully contained within current range [start={}, end={}] and will be removed.", - prev_start, prev_end, curr_start, curr_end - ); - } - ranges.swap(write_idx, read_idx); - } else if curr_start < prev_end { - // Handle overlapping ranges - check both previous and next ranges - let mut should_skip = false; + eprintln!("Removing redundant ranges"); + } + + // Remove ranges that are contained within other ranges + let mut write_idx = 0; + for read_idx in 1..ranges.len() { + let (prev_start, prev_end) = (ranges[write_idx].start, ranges[write_idx].end); + let (curr_start, curr_end) = (ranges[read_idx].start, ranges[read_idx].end); + + if curr_start == prev_start && curr_end == prev_end { + // Skip duplicate range + // Current range is a duplicate of the previous range, skip it + if debug { + eprintln!( + " Duplicate range detected: Range [start={}, end={}] is identical to previous range and will be removed.", + curr_start, curr_end + ); + } + continue; + } else if curr_start >= prev_start && curr_end <= prev_end { + // Skip range that is fully contained within previous range + if debug { + eprintln!( + " Contained range detected: Range [start={}, end={}] is fully contained within previous range [start={}, end={}] and will be removed.", + curr_start, curr_end, prev_start, prev_end + ); + } + continue; + } else if prev_start >= curr_start && prev_end <= curr_end { + // Previous range is fully contained within current range + if debug { + eprintln!( + " Containing range detected: Previous range [start={}, end={}] is fully contained within current range [start={}, end={}] and will be removed.", + prev_start, prev_end, curr_start, curr_end + ); + } + ranges.swap(write_idx, read_idx); + } else if curr_start < prev_end { + // Handle overlapping ranges - check both previous and next ranges + let mut should_skip = false; + + if read_idx < ranges.len() - 1 { + let next_start = ranges[read_idx + 1].start; - if read_idx < ranges.len() - 1 { - let next_start = ranges[read_idx + 1].start; + // Check if current range is significantly overlapped by both neighbors + if curr_start > prev_start && next_start < curr_end { + let overlap_with_prev = prev_end - curr_start; + let overlap_with_next = curr_end - next_start; + let range_length = curr_end - curr_start; - // Check if current range is significantly overlapped by both neighbors - if curr_start > prev_start && next_start < curr_end { - let overlap_with_prev = prev_end - curr_start; - let overlap_with_next = curr_end - next_start; - let range_length = curr_end - curr_start; - - // Skip if the range is mostly covered by its neighbors - if overlap_with_prev + overlap_with_next > range_length { - should_skip = true; - } + // Skip if the range is mostly covered by its neighbors + if overlap_with_prev + overlap_with_next > range_length { + should_skip = true; } } - - if !should_skip { - if args.debug { - eprintln!( - " Overlapping range detected: Range [start={}, end={}] overlaps with previous range [start={}, end={}] and will be kept.", - curr_start, curr_end, prev_start, prev_end - ); - } - write_idx += 1; - if write_idx != read_idx { - ranges.swap(write_idx, read_idx); - } + } + + if !should_skip { + if debug { + eprintln!( + " Overlapping range detected: Range [start={}, end={}] overlaps with previous range [start={}, end={}] and will be kept.", + curr_start, curr_end, prev_start, prev_end + ); } - } else { - // No overlap - keep both ranges write_idx += 1; if write_idx != read_idx { ranges.swap(write_idx, read_idx); } } - } - ranges.truncate(write_idx + 1); - - if args.debug { - eprintln!("Path key '{}' without redundancy", path_key); - for range in ranges.iter() { - eprintln!(" Range: start={}, end={}, num.steps={}, gfa_id={}", range.start, range.end, range.steps.len(), range.gfa_id); + } else { + // No overlap - keep both ranges + write_idx += 1; + if write_idx != read_idx { + ranges.swap(write_idx, read_idx); } - - eprintln!("Trimming overlapping ranges"); } + } + ranges.truncate(write_idx + 1); + + if debug { + eprintln!("Path key '{}' without redundancy", path_key); + for range in ranges.iter() { + eprintln!(" Range: start={}, end={}, num.steps={}, gfa_id={}", range.start, range.end, range.steps.len(), range.gfa_id); + } + } +} - // Process overlaps - for i in 1..ranges.len() { - let (left, right) = ranges.split_at_mut(i); - let r1 = &mut left[left.len()-1]; - let r2 = &mut right[0]; +fn trim_range_overlaps( + path_key: &String, + ranges: &mut Vec, + combined_graph: &mut HashGraph, + debug: bool +) { + // Trim overlaps + if debug { + eprintln!("Trimming overlapping ranges"); + } - if r1.overlaps_with(&r2) { - // Calculate the overlap region - use max/min to get precise overlap bounds - let overlap_start = std::cmp::max(r1.start, r2.start); - let overlap_end = std::cmp::min(r1.end, r2.end); + let mut next_node_id_value = u64::from(combined_graph.max_node_id()) + 1; - if args.debug { - eprintln!( - " Overlap detected: Range1 [start={}, end={}], Range2 [start={}, end={}], Overlap [start={}, end={}], Overlap size={}", - r1.start, r1.end, r2.start, r2.end, overlap_start, overlap_end, overlap_end - overlap_start - ); - } + for i in 1..ranges.len() { + let (left, right) = ranges.split_at_mut(i); + let r1 = &mut left[left.len()-1]; + let r2 = &mut right[0]; - // Adjust r2 to remove the overlap - let mut steps_to_remove = Vec::new(); - let mut step_to_split: Option = None; - for (idx, &step_end) in r2.step_ends.iter().enumerate() { - let step_start = if idx == 0 { r2.start } else { r2.step_ends[idx - 1] }; - if step_end <= overlap_start { - // if args.debug && r2.start == 11000 { - // eprintln!(" Step {} [start={}, end={}, len={}] before overlap", idx, step_start, step_end, step_end - step_start); - // } - continue; - } else if step_start >= overlap_end { - // if args.debug && r2.start == 11000 { - // eprintln!(" Step {} [start={}, end={}, len={}] after overlap", idx, step_start, step_end, step_end - step_start); - // } - break; - } else if step_start >= overlap_start && step_end <= overlap_end { - // if args.debug && r2.start == 11000 { - // eprintln!(" Step {} [start={}, end={}, len={}] fully overlaps", idx, step_start, step_end, step_end - step_start); - // } - steps_to_remove.push(idx); - } else { - // if args.debug && r2.start == 11000 { - // eprintln!(" Step {} [start={}, end={}, len={}] partially overlaps", idx, step_start, step_end, step_end - step_start); - // } - if step_to_split.is_some() { - panic!("Error: More than one step is partially overlapping, which is not allowed."); - } - step_to_split = Some(idx); + if r1.overlaps_with(&r2) { + // Calculate the overlap region - use max/min to get precise overlap bounds + let overlap_start = std::cmp::max(r1.start, r2.start); + let overlap_end = std::cmp::min(r1.end, r2.end); + + if debug { + eprintln!( + " Overlap detected: Range1 [start={}, end={}], Range2 [start={}, end={}], Overlap [start={}, end={}], Overlap size={}", + r1.start, r1.end, r2.start, r2.end, overlap_start, overlap_end, overlap_end - overlap_start + ); + } + + // Adjust r2 to remove the overlap + let mut steps_to_remove = Vec::new(); + let mut step_to_split: Option = None; + for (idx, &step_end) in r2.step_ends.iter().enumerate() { + let step_start = if idx == 0 { r2.start } else { r2.step_ends[idx - 1] }; + if step_end <= overlap_start { + // if debug && r2.start == 11000 { + // eprintln!(" Step {} [start={}, end={}, len={}] before overlap", idx, step_start, step_end, step_end - step_start); + // } + continue; + } else if step_start >= overlap_end { + // if debug && r2.start == 11000 { + // eprintln!(" Step {} [start={}, end={}, len={}] after overlap", idx, step_start, step_end, step_end - step_start); + // } + break; + } else if step_start >= overlap_start && step_end <= overlap_end { + // if debug && r2.start == 11000 { + // eprintln!(" Step {} [start={}, end={}, len={}] fully overlaps", idx, step_start, step_end, step_end - step_start); + // } + steps_to_remove.push(idx); + } else { + // if debug && r2.start == 11000 { + // eprintln!(" Step {} [start={}, end={}, len={}] partially overlaps", idx, step_start, step_end, step_end - step_start); + // } + if step_to_split.is_some() { + panic!("Error: More than one step is partially overlapping, which is not allowed."); } + step_to_split = Some(idx); } + } - // if args.debug && r2.start == 11000 { - // eprintln!(" Total steps to remove: {}", steps_to_remove.len()); - // eprintln!(" Step to split: {:?}", step_to_split); - // } - - // Initialize new vectors to store updated steps - let mut new_steps = Vec::with_capacity(r2.steps.len() / 2); - let mut new_step_ends = Vec::with_capacity(r2.steps.len() / 2); - let mut range_new_start = None; - - // Iterate over the original steps - for idx in 0..r2.steps.len() { - let step_handle = r2.steps[idx]; - let step_start = if idx == 0 { r2.start } else { r2.step_ends[idx - 1] }; - let step_end = r2.step_ends[idx]; - - if steps_to_remove.contains(&idx) { - // Skip steps to remove - continue; - } else if step_to_split == Some(idx) { - // Split node for the single partially overlapping step - let node_seq = combined_graph.sequence(step_handle).collect::>(); - let overlap_within_step_start = std::cmp::max(step_start, overlap_start); - let overlap_within_step_end = std::cmp::min(step_end, overlap_end); - - // Calculate offsets relative to the node sequence - let node_len = node_seq.len(); - // Calculate offsets consistently regardless of strand - let overlap_start_offset = (overlap_within_step_start - step_start).min(node_len); - let overlap_end_offset = (overlap_within_step_end - step_start).min(node_len); - - if args.debug { - eprintln!(" Splitting step {} [start={}, end={}, len={}] to remove overlap at [start={}, end={}]", - idx, step_start, step_end, step_end - step_start, overlap_within_step_start, overlap_within_step_end); - eprintln!(" Overlap offsets: start={}, end={}", overlap_start_offset, overlap_end_offset); - } + // if debug && r2.start == 11000 { + // eprintln!(" Total steps to remove: {}", steps_to_remove.len()); + // eprintln!(" Step to split: {:?}", step_to_split); + // } - if step_start < overlap_start { - if args.debug { - eprintln!(" Adding left part of step [start={}, end={}]", step_start, overlap_within_step_start); - } - assert!(overlap_start_offset > 0); + // Initialize new vectors to store updated steps + let mut new_steps = Vec::with_capacity(r2.steps.len() / 2); + let mut new_step_ends = Vec::with_capacity(r2.steps.len() / 2); + let mut range_new_start = None; - // Keep left part - let new_seq = node_seq[0..overlap_start_offset].to_vec(); + // Iterate over the original steps + for idx in 0..r2.steps.len() { + let step_handle = r2.steps[idx]; + let step_start = if idx == 0 { r2.start } else { r2.step_ends[idx - 1] }; + let step_end = r2.step_ends[idx]; - let node_id = NodeId::from(next_node_id_value); - next_node_id_value += 1; - let new_node = combined_graph.create_handle(&new_seq, node_id); + if steps_to_remove.contains(&idx) { + // Skip steps to remove + continue; + } else if step_to_split == Some(idx) { + // Split node for the single partially overlapping step + let node_seq = combined_graph.sequence(step_handle).collect::>(); + let overlap_within_step_start = std::cmp::max(step_start, overlap_start); + let overlap_within_step_end = std::cmp::min(step_end, overlap_end); + + // Calculate offsets relative to the node sequence + let node_len = node_seq.len(); + // Calculate offsets consistently regardless of strand + let overlap_start_offset = (overlap_within_step_start - step_start).min(node_len); + let overlap_end_offset = (overlap_within_step_end - step_start).min(node_len); + + if debug { + eprintln!(" Splitting step {} [start={}, end={}, len={}] to remove overlap at [start={}, end={}]", + idx, step_start, step_end, step_end - step_start, overlap_within_step_start, overlap_within_step_end); + eprintln!(" Overlap offsets: start={}, end={}", overlap_start_offset, overlap_end_offset); + } - new_steps.push(new_node); - new_step_ends.push(overlap_start); - if range_new_start.is_none() { - range_new_start = Some(step_start); - } - } else if step_end > overlap_end { - if args.debug { - eprintln!(" Adding right part of step [start={}, end={}]", overlap_within_step_end, step_end); - } - assert!(overlap_end_offset < node_len); + if step_start < overlap_start { + if debug { + eprintln!(" Adding left part of step [start={}, end={}]", step_start, overlap_within_step_start); + } + assert!(overlap_start_offset > 0); - // Keep right part - let new_seq = node_seq[overlap_end_offset..].to_vec(); + // Keep left part + let new_seq = node_seq[0..overlap_start_offset].to_vec(); - let node_id = NodeId::from(next_node_id_value); - next_node_id_value += 1; - let new_node = combined_graph.create_handle(&new_seq, node_id); + let node_id = NodeId::from(next_node_id_value); + next_node_id_value += 1; + let new_node = combined_graph.create_handle(&new_seq, node_id); - new_steps.push(new_node); - new_step_ends.push(step_end); - if range_new_start.is_none() { - range_new_start = Some(overlap_end); - } + new_steps.push(new_node); + new_step_ends.push(overlap_start); + if range_new_start.is_none() { + range_new_start = Some(step_start); } - } else { - // Keep steps that are not to be removed or split - new_steps.push(step_handle); + } else if step_end > overlap_end { + if debug { + eprintln!(" Adding right part of step [start={}, end={}]", overlap_within_step_end, step_end); + } + assert!(overlap_end_offset < node_len); + + // Keep right part + let new_seq = node_seq[overlap_end_offset..].to_vec(); + + let node_id = NodeId::from(next_node_id_value); + next_node_id_value += 1; + let new_node = combined_graph.create_handle(&new_seq, node_id); + + new_steps.push(new_node); new_step_ends.push(step_end); if range_new_start.is_none() { - range_new_start = Some(step_start); + range_new_start = Some(overlap_end); } } + } else { + // Keep steps that are not to be removed or split + new_steps.push(step_handle); + new_step_ends.push(step_end); + if range_new_start.is_none() { + range_new_start = Some(step_start); + } } + } - // Update r2 with the new steps - r2.steps = new_steps; - r2.step_ends = new_step_ends; + // Update r2 with the new steps + r2.steps = new_steps; + r2.step_ends = new_step_ends; - // Update edges for the modified steps - for idx in 0..r2.steps.len() { - if idx > 0 { - let prev_step = r2.steps[idx - 1]; - if !combined_graph.has_edge(prev_step, r2.steps[idx]) { - combined_graph.create_edge(Edge(prev_step, r2.steps[idx])); - } + // Update edges for the modified steps + for idx in 0..r2.steps.len() { + if idx > 0 { + let prev_step = r2.steps[idx - 1]; + if !combined_graph.has_edge(prev_step, r2.steps[idx]) { + combined_graph.create_edge(Edge(prev_step, r2.steps[idx])); } } + } - // Update r2.start and r2.end based on the new step positions - if !r2.step_ends.is_empty() { - r2.start = range_new_start.unwrap(); // Safe because if we have positions, we must have set range_new_start - r2.end = *r2.step_ends.last().unwrap(); - } else { - // If no steps remain, set start and end to overlap_end to effectively remove this range - r2.start = overlap_end; - r2.end = overlap_end; - } + // Update r2.start and r2.end based on the new step positions + if !r2.step_ends.is_empty() { + r2.start = range_new_start.unwrap(); // Safe because if we have positions, we must have set range_new_start + r2.end = *r2.step_ends.last().unwrap(); + } else { + // If no steps remain, set start and end to overlap_end to effectively remove this range + r2.start = overlap_end; + r2.end = overlap_end; + } - if args.debug { - eprintln!(" Updated overlaps: Range2 [start={}, end={}]", r2.start, r2.end); - } + if debug { + eprintln!(" Updated overlaps: Range2 [start={}, end={}]", r2.start, r2.end); } } + } - if args.debug { - eprintln!("Path key '{}' without overlaps", path_key); - for range in ranges.iter() { - eprintln!(" Range: start={}, end={}, num.steps={}, gfa_id={}", range.start, range.end, range.steps.len(), range.gfa_id); - } + if debug { + eprintln!("Path key '{}' without overlaps", path_key); + for range in ranges.iter() { + eprintln!(" Range: start={}, end={}, num.steps={}, gfa_id={}", range.start, range.end, range.steps.len(), range.gfa_id); } + } +} - // Check for overlaps and contiguity - let mut all_contiguous = true; - - for window in ranges.windows(2) { - let r1 = &window[0]; - let r2 = &window[1]; - - if r1.overlaps_with(&r2) { - if args.debug { - eprintln!("Unresolved overlaps detected between ranges: [start={}, end={}] and [start={}, end={}]", - r1.start, r1.end, r2.start, r2.end); - } - panic!("Unresolved overlaps detected in path key '{}'", path_key); - } - if !r1.is_contiguous_with(&r2) { - all_contiguous = false; +fn create_paths_from_ranges( + path_key: &str, + ranges: &[RangeInfo], + combined_graph: &mut HashGraph, + debug: bool +) { + // Check for overlaps and contiguity + let mut all_contiguous = true; + + for window in ranges.windows(2) { + let r1 = &window[0]; + let r2 = &window[1]; + + if r1.overlaps_with(&r2) { + if debug { + eprintln!("Unresolved overlaps detected between ranges: [start={}, end={}] and [start={}, end={}]", + r1.start, r1.end, r2.start, r2.end); } + panic!("Unresolved overlaps detected in path key '{}'", path_key); } - - if !all_contiguous && args.debug { - let mut current_start = ranges[0].start; - let mut current_end = ranges[0].end; + if !r1.is_contiguous_with(&r2) { + all_contiguous = false; + } + } - for i in 1..ranges.len() { - if ranges[i-1].is_contiguous_with(&ranges[i]) { - // Extend current merged range - current_end = ranges[i].end; + if !all_contiguous && debug { + let mut current_start = ranges[0].start; + let mut current_end = ranges[0].end; + + for i in 1..ranges.len() { + if ranges[i-1].is_contiguous_with(&ranges[i]) { + // Extend current merged range + current_end = ranges[i].end; + } else { + // Print current merged range + eprintln!(" Merged range: start={}, end={}", + current_start, current_end); + + if !ranges[i-1].overlaps_with(&ranges[i]) { + // Calculate and print gap + let gap = ranges[i].start - current_end; + eprintln!(" Gap to next range: {} positions", gap); } else { - // Print current merged range - eprintln!(" Merged range: start={}, end={}", - current_start, current_end); - - if !ranges[i-1].overlaps_with(&ranges[i]) { - // Calculate and print gap - let gap = ranges[i].start - current_end; - eprintln!(" Gap to next range: {} positions", gap); - } else { - // Calculate and print overlap - let overlap = current_end - ranges[i].start; - eprintln!(" Overlap with next range: {} positions", overlap); - } - - // Start new merged range - current_start = ranges[i].start; - current_end = ranges[i].end; + // Calculate and print overlap + let overlap = current_end - ranges[i].start; + eprintln!(" Overlap with next range: {} positions", overlap); } + + // Start new merged range + current_start = ranges[i].start; + current_end = ranges[i].end; } - - // Print final merged range - eprintln!(" Merged range: start={}, end={}", - current_start, current_end); } + + // Print final merged range + eprintln!(" Merged range: start={}, end={}", + current_start, current_end); + } - if all_contiguous { - // Create a single path with the original key - let path_id = combined_graph.create_path(path_key.as_bytes(), false).unwrap(); - let mut prev_step = None; - - // Add all steps from all ranges - for range in ranges.iter() { - for step in &range.steps { - combined_graph.path_append_step(path_id, *step); - - if let Some(prev) = prev_step { - if !combined_graph.has_edge(prev, *step) { - combined_graph.create_edge(Edge(prev, *step)); - } + if all_contiguous { + // Create a single path with the original key + let path_id = combined_graph.create_path(path_key.as_bytes(), false).unwrap(); + let mut prev_step = None; + + // Add all steps from all ranges + for range in ranges.iter() { + for step in &range.steps { + combined_graph.path_append_step(path_id, *step); + + if let Some(prev) = prev_step { + if !combined_graph.has_edge(prev, *step) { + combined_graph.create_edge(Edge(prev, *step)); } - prev_step = Some(*step); } + prev_step = Some(*step); } - } else { - // Handle non-contiguous ranges by creating separate paths for each contiguous group - let mut current_range_idx = 0; - while current_range_idx < ranges.len() { - let start_range = &ranges[current_range_idx]; - let mut steps = start_range.steps.clone(); - let mut next_idx = current_range_idx + 1; - let mut end_range = start_range; - - // Merge contiguous ranges - while next_idx < ranges.len() && ranges[next_idx - 1].is_contiguous_with(&ranges[next_idx]) { - steps.extend(ranges[next_idx].steps.clone()); - end_range = &ranges[next_idx]; - next_idx += 1; - } - - // Create path name with range information - let path_name = format!("{}:{}-{}", path_key, start_range.start, end_range.end); - let path_id = combined_graph.create_path(path_name.as_bytes(), false).unwrap(); + } + } else { + // Handle non-contiguous ranges by creating separate paths for each contiguous group + let mut current_range_idx = 0; + while current_range_idx < ranges.len() { + let start_range = &ranges[current_range_idx]; + let mut steps = start_range.steps.clone(); + let mut next_idx = current_range_idx + 1; + let mut end_range = start_range; + + // Merge contiguous ranges + while next_idx < ranges.len() && ranges[next_idx - 1].is_contiguous_with(&ranges[next_idx]) { + steps.extend(ranges[next_idx].steps.clone()); + end_range = &ranges[next_idx]; + next_idx += 1; + } + + // Create path name with range information + let path_name = format!("{}:{}-{}", path_key, start_range.start, end_range.end); + let path_id = combined_graph.create_path(path_name.as_bytes(), false).unwrap(); + + // Add steps to the path + let mut prev_step = None; + for step in steps { + combined_graph.path_append_step(path_id, step); - // Add steps to the path - let mut prev_step = None; - for step in steps { - combined_graph.path_append_step(path_id, step); - - if let Some(prev) = prev_step { - if !combined_graph.has_edge(prev, step) { - combined_graph.create_edge(Edge(prev, step)); - } + if let Some(prev) = prev_step { + if !combined_graph.has_edge(prev, step) { + combined_graph.create_edge(Edge(prev, step)); } - prev_step = Some(step); } - - current_range_idx = next_idx; + prev_step = Some(step); } + + current_range_idx = next_idx; } } - - if args.debug { - eprintln!("Total paths created: {}", GraphPaths::path_count(&combined_graph)); - - eprintln!("Total nodes before filtering: {}", combined_graph.node_count()); - } - - let nodes_to_skip = mark_nodes_for_removal(&combined_graph); - - if args.debug { - eprintln!("Nodes to be filtered out: {}", nodes_to_skip.count_ones()); - } - - // Write the combined graph to GFA file, skipping unused nodes - match write_graph_to_gfa(&combined_graph, &args.output, &nodes_to_skip) { - Ok(_) => if args.debug {eprintln!("Successfully wrote combined graph to {}", args.output)}, - Err(e) => eprintln!("Error writing GFA file: {}", e), - } } fn mark_nodes_for_removal(graph: &HashGraph) -> BitVec { @@ -684,21 +664,76 @@ fn mark_nodes_for_removal(graph: &HashGraph) -> BitVec { nodes_to_remove } -fn split_path_name(path_name: &str) -> Option<(String, usize, usize)> { - // Find the last ':' to split the range from the key - if let Some(last_colon) = path_name.rfind(':') { - let (key, range_str) = path_name.split_at(last_colon); - // Skip the ':' character - let range_str = &range_str[1..]; +fn write_graph_to_gfa(graph: &HashGraph, output_path: &str, nodes_to_remove: &BitVec) -> std::io::Result<()> { + let mut file = File::create(output_path)?; + + // Write GFA version + writeln!(file, "H\tVN:Z:1.0")?; + + // Collect nodes, excluding marked nodes + let nodes: Vec = graph.handles() + .filter(|handle| !nodes_to_remove[u64::from(handle.id()) as usize]) + .collect(); + + // Create mapping from old IDs to new sequential IDs + // We allocate a vector with capacity for the max node ID + 1 + let max_id = graph.node_count(); + let mut id_mapping = vec![0; max_id + 1]; + for (new_id, handle) in nodes.iter().enumerate() { + id_mapping[u64::from(handle.id()) as usize] = new_id + 1; // +1 to start from 1 + } + + // Write nodes with new IDs + for handle in &nodes { + let sequence = graph.sequence(*handle).collect::>(); + let sequence_str = String::from_utf8(sequence).unwrap_or_else(|_| String::from("N")); + let node_id = id_mapping[u64::from(handle.id()) as usize]; + writeln!(file, "S\t{}\t{}", node_id, sequence_str)?; + } + + // Collect and sort edges, excluding those connected to marked nodes + let edges: Vec = graph.edges() + .filter(|edge| { + !nodes_to_remove[u64::from(edge.0.id()) as usize] && + !nodes_to_remove[u64::from(edge.1.id()) as usize] + }) + .collect(); + + // Write edges with new IDs + for edge in edges { + let from_id = id_mapping[u64::from(edge.0.id()) as usize]; + let to_id = id_mapping[u64::from(edge.1.id()) as usize]; + let from_orient = if edge.0.is_reverse() { "-" } else { "+" }; + let to_orient = if edge.1.is_reverse() { "-" } else { "+" }; + writeln!(file, "L\t{}\t{}\t{}\t{}\t0M", from_id, from_orient, to_id, to_orient)?; + } + + // Collect and sort paths directly with references to avoid multiple lookups + let mut path_entries: Vec<_> = graph.paths.iter().collect(); + path_entries.sort_by_key(|(_, path_ref)| { + // Get name directly from path struct instead of doing lookups + path_ref.name.as_slice() + }); + + // Write paths with new IDs + for (_path_id, path_ref) in path_entries { + // Pre-allocate vector with capacity + let mut path_elements = Vec::with_capacity(path_ref.nodes.len()); - // Find the '-' in the range portion - if let Some((start_str, end_str)) = range_str.split_once('-') { - if let (Ok(start), Ok(end)) = (start_str.parse(), end_str.parse()) { - return Some((key.to_string(), start, end)); - } + // Process nodes directly from path reference + for handle in &path_ref.nodes { + let node_id = id_mapping[u64::from(handle.id()) as usize]; + let orient = if handle.is_reverse() { "-" } else { "+" }; + path_elements.push(format!("{}{}", node_id, orient)); + } + + if !path_elements.is_empty() { + let path_name = String::from_utf8_lossy(&path_ref.name); + writeln!(file, "P\t{}\t{}\t*", path_name, path_elements.join(","))?; } } - None + + Ok(()) } #[cfg(test)] From 9e2de3e7f1bb4a3093ab19d33e22836660ce51b0 Mon Sep 17 00:00:00 2001 From: AndreaGuarracino Date: Sat, 14 Dec 2024 19:24:27 -0600 Subject: [PATCH 37/45] clippy is happy --- src/main.rs | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/main.rs b/src/main.rs index bdbeefd..d9ee1f7 100644 --- a/src/main.rs +++ b/src/main.rs @@ -145,7 +145,7 @@ fn process_gfa_files( // Record the end position of this step let node_seq = block_graph.sequence(*step).collect::>(); let node_length = node_seq.len(); - cumulative_pos = cumulative_pos + node_length; + cumulative_pos += node_length; step_ends.push(cumulative_pos); } @@ -168,7 +168,7 @@ fn process_gfa_files( if debug { eprintln!("Collected {} nodes, {} edges, {} paths, and {} path ranges from all GFA files", - combined_graph.node_count(), combined_graph.edge_count(), path_key_ranges.len(), path_key_ranges.iter().map(|(_, ranges)| ranges.len()).sum::()); + combined_graph.node_count(), combined_graph.edge_count(), path_key_ranges.len(), path_key_ranges.values().map(|ranges| ranges.len()).sum::()); } (combined_graph, path_key_ranges) @@ -251,7 +251,7 @@ fn split_path_name(path_name: &str) -> Option<(String, usize, usize)> { } fn sort_and_filter_ranges( - path_key: &String, + path_key: &str, ranges: &mut Vec, debug: bool ) { @@ -352,8 +352,8 @@ fn sort_and_filter_ranges( } fn trim_range_overlaps( - path_key: &String, - ranges: &mut Vec, + path_key: &str, + ranges: &mut [RangeInfo], combined_graph: &mut HashGraph, debug: bool ) { @@ -369,7 +369,7 @@ fn trim_range_overlaps( let r1 = &mut left[left.len()-1]; let r2 = &mut right[0]; - if r1.overlaps_with(&r2) { + if r1.overlaps_with(r2) { // Calculate the overlap region - use max/min to get precise overlap bounds let overlap_start = std::cmp::max(r1.start, r2.start); let overlap_end = std::cmp::min(r1.end, r2.end); @@ -547,14 +547,14 @@ fn create_paths_from_ranges( let r1 = &window[0]; let r2 = &window[1]; - if r1.overlaps_with(&r2) { + if r1.overlaps_with(r2) { if debug { eprintln!("Unresolved overlaps detected between ranges: [start={}, end={}] and [start={}, end={}]", r1.start, r1.end, r2.start, r2.end); } panic!("Unresolved overlaps detected in path key '{}'", path_key); } - if !r1.is_contiguous_with(&r2) { + if !r1.is_contiguous_with(r2) { all_contiguous = false; } } From 64372b0ca04c7154a1e7a04468014ae0d4e92af9 Mon Sep 17 00:00:00 2001 From: AndreaGuarracino Date: Sat, 14 Dec 2024 22:33:00 -0600 Subject: [PATCH 38/45] logging --- Cargo.lock | 33 ++++++++++++++- Cargo.toml | 2 + src/main.rs | 115 +++++++++++++++++++++++++++------------------------- 3 files changed, 93 insertions(+), 57 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ea3e602..8f49aaa 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1,6 +1,6 @@ # This file is automatically @generated by Cargo. # It is not intended for manual editing. -version = 3 +version = 4 [[package]] name = "adler2" @@ -311,6 +311,29 @@ version = "1.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0" +[[package]] +name = "env_filter" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4f2c92ceda6ceec50f43169f9ee8424fe2db276791afde7b2cd8bc084cb376ab" +dependencies = [ + "log", + "regex", +] + +[[package]] +name = "env_logger" +version = "0.11.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e13fa619b91fb2381732789fc5de83b45675e882f66623b7d8cb4f643017018d" +dependencies = [ + "anstream", + "anstyle", + "env_filter", + "humantime", + "log", +] + [[package]] name = "errno" version = "0.3.10" @@ -372,8 +395,10 @@ dependencies = [ "bitvec", "bstr 1.11.1", "clap", + "env_logger", "gfa", "handlegraph", + "log", "niffler", "rustc-hash", "tempfile", @@ -401,6 +426,12 @@ version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" +[[package]] +name = "humantime" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" + [[package]] name = "is_terminal_polyfill" version = "1.70.1" diff --git a/Cargo.toml b/Cargo.toml index 9110992..756012d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,3 +14,5 @@ gfa = "0.10.1" bitvec = "1.0.1" tempfile = "3.14.0" rustc-hash = "2.1.0" +log = "0.4.22" +env_logger = "0.11.5" diff --git a/src/main.rs b/src/main.rs index d9ee1f7..098790e 100644 --- a/src/main.rs +++ b/src/main.rs @@ -18,6 +18,7 @@ use handlegraph::{ use gfa::{gfa::GFA, parser::GFAParser}; use bitvec::{bitvec, prelude::BitVec}; use tempfile::NamedTempFile; +use log::{debug, info, warn, error}; #[derive(Parser, Debug)] #[clap(author, version, about)] @@ -30,40 +31,45 @@ struct Args { #[clap(short, long, value_parser)] output: String, - /// Enable debug output - #[clap(short, long)] - debug: bool, + /// Verbosity level (0 = error, 1 = info, 2 = debug) + #[clap(short, long, default_value = "0")] + verbose: u8, } fn main() { let args = Args::parse(); + // Initialize logger based on verbosity + env_logger::Builder::new() + .filter_level(match args.verbose { + 0 => log::LevelFilter::Error, + 1 => log::LevelFilter::Info, + _ => log::LevelFilter::Debug, + }) + .init(); + // Create a single combined graph without paths and a map of path key to ranges - let (mut combined_graph, mut path_key_ranges) = process_gfa_files(&args.gfa_list, args.debug); + let (mut combined_graph, mut path_key_ranges) = process_gfa_files(&args.gfa_list); // Sort ranges and create merged paths in the combined graph for (path_key, ranges) in path_key_ranges.iter_mut() { - sort_and_filter_ranges(path_key, ranges, args.debug); - trim_range_overlaps(path_key, ranges, &mut combined_graph, args.debug); - create_paths_from_ranges(path_key, ranges, &mut combined_graph, args.debug); + info!("Processing path key '{}' with {} ranges", path_key, ranges.len()); + sort_and_filter_ranges(path_key, ranges, args.verbose > 1); + trim_range_overlaps(path_key, ranges, &mut combined_graph, args.verbose > 1); + create_paths_from_ranges(path_key, ranges, &mut combined_graph, args.verbose > 1); } + info!("Total paths created: {}", GraphPaths::path_count(&combined_graph)); - if args.debug { - eprintln!("Total paths created: {}", GraphPaths::path_count(&combined_graph)); - - eprintln!("Total nodes before filtering: {}", combined_graph.node_count()); - } + info!("Total nodes before filtering: {}", combined_graph.node_count()); let nodes_to_skip = mark_nodes_for_removal(&combined_graph); - if args.debug { - eprintln!("Nodes to be filtered out: {}", nodes_to_skip.count_ones()); - } - + info!("Total nodes to be filtered out: {}", nodes_to_skip.count_ones()); + // Write the combined graph to GFA file, skipping unused nodes match write_graph_to_gfa(&combined_graph, &args.output, &nodes_to_skip) { - Ok(_) => if args.debug {eprintln!("Successfully wrote combined graph to {}", args.output)}, - Err(e) => eprintln!("Error writing GFA file: {}", e), + Ok(_) => info!("Successfully wrote the combined graph to {}", args.output), + Err(e) => error!("Error writing the GFA file: {}", e), } } @@ -91,12 +97,13 @@ impl RangeInfo { fn process_gfa_files( gfa_list: &[String], - debug: bool, ) -> (HashGraph, FxHashMap>) { let mut combined_graph = HashGraph::new(); let mut path_key_ranges: FxHashMap> = FxHashMap::default(); let mut id_translations = Vec::new(); + info!("Processing {} GFA files", gfa_list.len()); + // Process each GFA file let parser = GFAParser::new(); for (gfa_id, gfa_path) in gfa_list.iter().enumerate() { @@ -123,9 +130,7 @@ fn process_gfa_files( combined_graph.create_edge(translated_edge); } - if debug { - eprintln!("GFA file {} ({}) processed: Added {} nodes and {} edges", gfa_id, gfa_path, block_graph.node_count(), block_graph.edge_count()); - } + debug!("GFA file {} ({}) processed: Added {} nodes and {} edges", gfa_id, gfa_path, block_graph.node_count(), block_graph.edge_count()); // Process paths and collect ranges with their steps for (_path_id, path_ref) in block_graph.paths.iter() { @@ -159,17 +164,15 @@ fn process_gfa_files( steps: translated_steps, step_ends, }); - } else if debug { - eprintln!(" Warning: Path '{}' has no steps", path_name); + } else { + warn!(" Path '{}' has no steps", path_name); } } } } - if debug { - eprintln!("Collected {} nodes, {} edges, {} paths, and {} path ranges from all GFA files", + info!("Collected {} nodes, {} edges, {} paths, and {} path ranges from all GFA files", combined_graph.node_count(), combined_graph.edge_count(), path_key_ranges.len(), path_key_ranges.values().map(|ranges| ranges.len()).sum::()); - } (combined_graph, path_key_ranges) } @@ -259,12 +262,12 @@ fn sort_and_filter_ranges( ranges.sort_by_key(|r| (r.start, r.end)); if debug { - eprintln!("Processing path key '{}'", path_key); + debug!("Processing path key '{}'", path_key); for range in ranges.iter() { - eprintln!(" Range: start={}, end={}, num.steps={}, gfa_id={}", range.start, range.end, range.steps.len(), range.gfa_id); + debug!(" Range: start={}, end={}, num.steps={}, gfa_id={}", range.start, range.end, range.steps.len(), range.gfa_id); } - eprintln!("Removing redundant ranges"); + debug!("Removing redundant ranges"); } // Remove ranges that are contained within other ranges @@ -277,7 +280,7 @@ fn sort_and_filter_ranges( // Skip duplicate range // Current range is a duplicate of the previous range, skip it if debug { - eprintln!( + debug!( " Duplicate range detected: Range [start={}, end={}] is identical to previous range and will be removed.", curr_start, curr_end ); @@ -286,7 +289,7 @@ fn sort_and_filter_ranges( } else if curr_start >= prev_start && curr_end <= prev_end { // Skip range that is fully contained within previous range if debug { - eprintln!( + debug!( " Contained range detected: Range [start={}, end={}] is fully contained within previous range [start={}, end={}] and will be removed.", curr_start, curr_end, prev_start, prev_end ); @@ -295,7 +298,7 @@ fn sort_and_filter_ranges( } else if prev_start >= curr_start && prev_end <= curr_end { // Previous range is fully contained within current range if debug { - eprintln!( + debug!( " Containing range detected: Previous range [start={}, end={}] is fully contained within current range [start={}, end={}] and will be removed.", prev_start, prev_end, curr_start, curr_end ); @@ -323,7 +326,7 @@ fn sort_and_filter_ranges( if !should_skip { if debug { - eprintln!( + debug!( " Overlapping range detected: Range [start={}, end={}] overlaps with previous range [start={}, end={}] and will be kept.", curr_start, curr_end, prev_start, prev_end ); @@ -344,9 +347,9 @@ fn sort_and_filter_ranges( ranges.truncate(write_idx + 1); if debug { - eprintln!("Path key '{}' without redundancy", path_key); + debug!("Path key '{}' without redundancy", path_key); for range in ranges.iter() { - eprintln!(" Range: start={}, end={}, num.steps={}, gfa_id={}", range.start, range.end, range.steps.len(), range.gfa_id); + debug!(" Range: start={}, end={}, num.steps={}, gfa_id={}", range.start, range.end, range.steps.len(), range.gfa_id); } } } @@ -359,7 +362,7 @@ fn trim_range_overlaps( ) { // Trim overlaps if debug { - eprintln!("Trimming overlapping ranges"); + debug!("Trimming overlapping ranges"); } let mut next_node_id_value = u64::from(combined_graph.max_node_id()) + 1; @@ -375,7 +378,7 @@ fn trim_range_overlaps( let overlap_end = std::cmp::min(r1.end, r2.end); if debug { - eprintln!( + debug!( " Overlap detected: Range1 [start={}, end={}], Range2 [start={}, end={}], Overlap [start={}, end={}], Overlap size={}", r1.start, r1.end, r2.start, r2.end, overlap_start, overlap_end, overlap_end - overlap_start ); @@ -388,22 +391,22 @@ fn trim_range_overlaps( let step_start = if idx == 0 { r2.start } else { r2.step_ends[idx - 1] }; if step_end <= overlap_start { // if debug && r2.start == 11000 { - // eprintln!(" Step {} [start={}, end={}, len={}] before overlap", idx, step_start, step_end, step_end - step_start); + // debug!(" Step {} [start={}, end={}, len={}] before overlap", idx, step_start, step_end, step_end - step_start); // } continue; } else if step_start >= overlap_end { // if debug && r2.start == 11000 { - // eprintln!(" Step {} [start={}, end={}, len={}] after overlap", idx, step_start, step_end, step_end - step_start); + // debug!(" Step {} [start={}, end={}, len={}] after overlap", idx, step_start, step_end, step_end - step_start); // } break; } else if step_start >= overlap_start && step_end <= overlap_end { // if debug && r2.start == 11000 { - // eprintln!(" Step {} [start={}, end={}, len={}] fully overlaps", idx, step_start, step_end, step_end - step_start); + // debug!(" Step {} [start={}, end={}, len={}] fully overlaps", idx, step_start, step_end, step_end - step_start); // } steps_to_remove.push(idx); } else { // if debug && r2.start == 11000 { - // eprintln!(" Step {} [start={}, end={}, len={}] partially overlaps", idx, step_start, step_end, step_end - step_start); + // debug!(" Step {} [start={}, end={}, len={}] partially overlaps", idx, step_start, step_end, step_end - step_start); // } if step_to_split.is_some() { panic!("Error: More than one step is partially overlapping, which is not allowed."); @@ -413,8 +416,8 @@ fn trim_range_overlaps( } // if debug && r2.start == 11000 { - // eprintln!(" Total steps to remove: {}", steps_to_remove.len()); - // eprintln!(" Step to split: {:?}", step_to_split); + // debug!(" Total steps to remove: {}", steps_to_remove.len()); + // debug!(" Step to split: {:?}", step_to_split); // } // Initialize new vectors to store updated steps @@ -444,14 +447,14 @@ fn trim_range_overlaps( let overlap_end_offset = (overlap_within_step_end - step_start).min(node_len); if debug { - eprintln!(" Splitting step {} [start={}, end={}, len={}] to remove overlap at [start={}, end={}]", + debug!(" Splitting step {} [start={}, end={}, len={}] to remove overlap at [start={}, end={}]", idx, step_start, step_end, step_end - step_start, overlap_within_step_start, overlap_within_step_end); - eprintln!(" Overlap offsets: start={}, end={}", overlap_start_offset, overlap_end_offset); + debug!(" Overlap offsets: start={}, end={}", overlap_start_offset, overlap_end_offset); } if step_start < overlap_start { if debug { - eprintln!(" Adding left part of step [start={}, end={}]", step_start, overlap_within_step_start); + debug!(" Adding left part of step [start={}, end={}]", step_start, overlap_within_step_start); } assert!(overlap_start_offset > 0); @@ -469,7 +472,7 @@ fn trim_range_overlaps( } } else if step_end > overlap_end { if debug { - eprintln!(" Adding right part of step [start={}, end={}]", overlap_within_step_end, step_end); + debug!(" Adding right part of step [start={}, end={}]", overlap_within_step_end, step_end); } assert!(overlap_end_offset < node_len); @@ -521,15 +524,15 @@ fn trim_range_overlaps( } if debug { - eprintln!(" Updated overlaps: Range2 [start={}, end={}]", r2.start, r2.end); + debug!(" Updated overlaps: Range2 [start={}, end={}]", r2.start, r2.end); } } } if debug { - eprintln!("Path key '{}' without overlaps", path_key); + debug!("Path key '{}' without overlaps", path_key); for range in ranges.iter() { - eprintln!(" Range: start={}, end={}, num.steps={}, gfa_id={}", range.start, range.end, range.steps.len(), range.gfa_id); + debug!(" Range: start={}, end={}, num.steps={}, gfa_id={}", range.start, range.end, range.steps.len(), range.gfa_id); } } } @@ -549,7 +552,7 @@ fn create_paths_from_ranges( if r1.overlaps_with(r2) { if debug { - eprintln!("Unresolved overlaps detected between ranges: [start={}, end={}] and [start={}, end={}]", + debug!("Unresolved overlaps detected between ranges: [start={}, end={}] and [start={}, end={}]", r1.start, r1.end, r2.start, r2.end); } panic!("Unresolved overlaps detected in path key '{}'", path_key); @@ -569,17 +572,17 @@ fn create_paths_from_ranges( current_end = ranges[i].end; } else { // Print current merged range - eprintln!(" Merged range: start={}, end={}", + debug!(" Merged range: start={}, end={}", current_start, current_end); if !ranges[i-1].overlaps_with(&ranges[i]) { // Calculate and print gap let gap = ranges[i].start - current_end; - eprintln!(" Gap to next range: {} positions", gap); + debug!(" Gap to next range: {} positions", gap); } else { // Calculate and print overlap let overlap = current_end - ranges[i].start; - eprintln!(" Overlap with next range: {} positions", overlap); + debug!(" Overlap with next range: {} positions", overlap); } // Start new merged range @@ -589,7 +592,7 @@ fn create_paths_from_ranges( } // Print final merged range - eprintln!(" Merged range: start={}, end={}", + debug!(" Merged range: start={}, end={}", current_start, current_end); } From 5edcf0ce18e6c4b718ce218e87497eac91950835 Mon Sep 17 00:00:00 2001 From: AndreaGuarracino Date: Sun, 15 Dec 2024 10:03:11 -0600 Subject: [PATCH 39/45] more log --- Cargo.lock | 12 ++++++------ src/main.rs | 23 ++++++++++++----------- 2 files changed, 18 insertions(+), 17 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 8f49aaa..80c0fc3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -273,18 +273,18 @@ dependencies = [ [[package]] name = "crossbeam-channel" -version = "0.5.13" +version = "0.5.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "33480d6946193aa8033910124896ca395333cae7e2d1113d1fef6c3272217df2" +checksum = "06ba6d68e24814cb8de6bb986db8222d3a027d15872cabc0d18817bc3c0e4471" dependencies = [ "crossbeam-utils", ] [[package]] name = "crossbeam-deque" -version = "0.8.5" +version = "0.8.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "613f8cc01fe9cf1a3eb3d7f488fd2fa8388403e97039e2f73692932e291a770d" +checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51" dependencies = [ "crossbeam-epoch", "crossbeam-utils", @@ -301,9 +301,9 @@ dependencies = [ [[package]] name = "crossbeam-utils" -version = "0.8.20" +version = "0.8.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "22ec99545bb0ed0ea7bb9b8e1e9122ea386ff8a48c0922e43f36d45ab09e0e80" +checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" [[package]] name = "either" diff --git a/src/main.rs b/src/main.rs index 098790e..215e462 100644 --- a/src/main.rs +++ b/src/main.rs @@ -51,22 +51,23 @@ fn main() { // Create a single combined graph without paths and a map of path key to ranges let (mut combined_graph, mut path_key_ranges) = process_gfa_files(&args.gfa_list); - // Sort ranges and create merged paths in the combined graph + // Sort, deduplicate, and trim path ranges and create merged paths in the combined graph + info!("Processing {} path ranges", path_key_ranges.values().map(|ranges| ranges.len()).sum::()); for (path_key, ranges) in path_key_ranges.iter_mut() { - info!("Processing path key '{}' with {} ranges", path_key, ranges.len()); + debug!("Processing path key '{}' with {} ranges", path_key, ranges.len()); + sort_and_filter_ranges(path_key, ranges, args.verbose > 1); trim_range_overlaps(path_key, ranges, &mut combined_graph, args.verbose > 1); create_paths_from_ranges(path_key, ranges, &mut combined_graph, args.verbose > 1); } - info!("Total paths created: {}", GraphPaths::path_count(&combined_graph)); + info!("Created {} nodes, {} edges, and {} paths in the combined graph", + combined_graph.node_count(), combined_graph.edge_count(), combined_graph.path_count()); - info!("Total nodes before filtering: {}", combined_graph.node_count()); - - let nodes_to_skip = mark_nodes_for_removal(&combined_graph); - - info!("Total nodes to be filtered out: {}", nodes_to_skip.count_ones()); + info!("Marking unused nodes for removal"); + let nodes_to_skip = mark_nodes_for_removal(&combined_graph); + info!("Total unused nodes to remove: {}", nodes_to_skip.count_ones()); - // Write the combined graph to GFA file, skipping unused nodes + info!("Writing the combined graph by removing unused nodes and compacting node IDs"); match write_graph_to_gfa(&combined_graph, &args.output, &nodes_to_skip) { Ok(_) => info!("Successfully wrote the combined graph to {}", args.output), Err(e) => error!("Error writing the GFA file: {}", e), @@ -171,8 +172,8 @@ fn process_gfa_files( } } - info!("Collected {} nodes, {} edges, {} paths, and {} path ranges from all GFA files", - combined_graph.node_count(), combined_graph.edge_count(), path_key_ranges.len(), path_key_ranges.values().map(|ranges| ranges.len()).sum::()); + info!("Collected {} nodes, {} edges, and {} path keys", + combined_graph.node_count(), combined_graph.edge_count(), path_key_ranges.len()); (combined_graph, path_key_ranges) } From 4845e06394288d784e4e9b052cfd43792bb67c7f Mon Sep 17 00:00:00 2001 From: AndreaGuarracino Date: Sun, 15 Dec 2024 11:54:10 -0600 Subject: [PATCH 40/45] tweaks --- src/main.rs | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/main.rs b/src/main.rs index 215e462..e58c0e0 100644 --- a/src/main.rs +++ b/src/main.rs @@ -49,10 +49,10 @@ fn main() { .init(); // Create a single combined graph without paths and a map of path key to ranges - let (mut combined_graph, mut path_key_ranges) = process_gfa_files(&args.gfa_list); + let (mut combined_graph, mut path_key_ranges) = read_gfa_files(&args.gfa_list); // Sort, deduplicate, and trim path ranges and create merged paths in the combined graph - info!("Processing {} path ranges", path_key_ranges.values().map(|ranges| ranges.len()).sum::()); + info!("Sorting, deduplicating, and trimming {} path ranges", path_key_ranges.values().map(|ranges| ranges.len()).sum::()); for (path_key, ranges) in path_key_ranges.iter_mut() { debug!("Processing path key '{}' with {} ranges", path_key, ranges.len()); @@ -60,15 +60,11 @@ fn main() { trim_range_overlaps(path_key, ranges, &mut combined_graph, args.verbose > 1); create_paths_from_ranges(path_key, ranges, &mut combined_graph, args.verbose > 1); } - info!("Created {} nodes, {} edges, and {} paths in the combined graph", + info!("Created {} nodes, {} edges, and {} paths", combined_graph.node_count(), combined_graph.edge_count(), combined_graph.path_count()); - info!("Marking unused nodes for removal"); - let nodes_to_skip = mark_nodes_for_removal(&combined_graph); - info!("Total unused nodes to remove: {}", nodes_to_skip.count_ones()); - - info!("Writing the combined graph by removing unused nodes and compacting node IDs"); - match write_graph_to_gfa(&combined_graph, &args.output, &nodes_to_skip) { + info!("Writing the result by removing unused nodes/edges and compacting node IDs"); + match write_graph_to_gfa(&combined_graph, &args.output) { Ok(_) => info!("Successfully wrote the combined graph to {}", args.output), Err(e) => error!("Error writing the GFA file: {}", e), } @@ -96,14 +92,14 @@ impl RangeInfo { } } -fn process_gfa_files( +fn read_gfa_files( gfa_list: &[String], ) -> (HashGraph, FxHashMap>) { let mut combined_graph = HashGraph::new(); let mut path_key_ranges: FxHashMap> = FxHashMap::default(); let mut id_translations = Vec::new(); - info!("Processing {} GFA files", gfa_list.len()); + info!("Reading {} GFA files", gfa_list.len()); // Process each GFA file let parser = GFAParser::new(); @@ -668,7 +664,11 @@ fn mark_nodes_for_removal(graph: &HashGraph) -> BitVec { nodes_to_remove } -fn write_graph_to_gfa(graph: &HashGraph, output_path: &str, nodes_to_remove: &BitVec) -> std::io::Result<()> { +fn write_graph_to_gfa(graph: &HashGraph, output_path: &str) -> std::io::Result<()> { + debug!("Marking unused nodes for removal"); + let nodes_to_remove : BitVec = mark_nodes_for_removal(&graph); + debug!("Marked {} nodes", nodes_to_remove.count_ones()); + let mut file = File::create(output_path)?; // Write GFA version From 62904bdceff999f4d9dc009d568013ce8e7c618b Mon Sep 17 00:00:00 2001 From: AndreaGuarracino Date: Sun, 15 Dec 2024 13:37:54 -0600 Subject: [PATCH 41/45] reduce memory usage --- src/main.rs | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/main.rs b/src/main.rs index e58c0e0..3db24c6 100644 --- a/src/main.rs +++ b/src/main.rs @@ -53,12 +53,12 @@ fn main() { // Sort, deduplicate, and trim path ranges and create merged paths in the combined graph info!("Sorting, deduplicating, and trimming {} path ranges", path_key_ranges.values().map(|ranges| ranges.len()).sum::()); - for (path_key, ranges) in path_key_ranges.iter_mut() { + for (path_key, mut ranges) in path_key_ranges.drain() { debug!("Processing path key '{}' with {} ranges", path_key, ranges.len()); - - sort_and_filter_ranges(path_key, ranges, args.verbose > 1); - trim_range_overlaps(path_key, ranges, &mut combined_graph, args.verbose > 1); - create_paths_from_ranges(path_key, ranges, &mut combined_graph, args.verbose > 1); + + sort_and_filter_ranges(&path_key, &mut ranges, args.verbose > 1); + trim_range_overlaps(&path_key, &mut ranges, &mut combined_graph, args.verbose > 1); + create_paths_from_ranges(&path_key, &ranges, &mut combined_graph, args.verbose > 1); } info!("Created {} nodes, {} edges, and {} paths", combined_graph.node_count(), combined_graph.edge_count(), combined_graph.path_count()); @@ -68,6 +68,7 @@ fn main() { Ok(_) => info!("Successfully wrote the combined graph to {}", args.output), Err(e) => error!("Error writing the GFA file: {}", e), } + } #[derive(Debug, Clone)] @@ -446,7 +447,7 @@ fn trim_range_overlaps( if debug { debug!(" Splitting step {} [start={}, end={}, len={}] to remove overlap at [start={}, end={}]", idx, step_start, step_end, step_end - step_start, overlap_within_step_start, overlap_within_step_end); - debug!(" Overlap offsets: start={}, end={}", overlap_start_offset, overlap_end_offset); + debug!(" Overlap offsets: start={}, end={}", overlap_start_offset, overlap_end_offset); } if step_start < overlap_start { From 2be3e3e2ba91993ef3fde6724d1dbcb1d709566a Mon Sep 17 00:00:00 2001 From: AndreaGuarracino Date: Sun, 15 Dec 2024 14:06:31 -0600 Subject: [PATCH 42/45] tiny improvements; profile memory --- src/main.rs | 91 ++++++++++++++++++++++++++++++++++------------------- 1 file changed, 59 insertions(+), 32 deletions(-) diff --git a/src/main.rs b/src/main.rs index 3db24c6..3c11f3a 100644 --- a/src/main.rs +++ b/src/main.rs @@ -20,6 +20,24 @@ use bitvec::{bitvec, prelude::BitVec}; use tempfile::NamedTempFile; use log::{debug, info, warn, error}; +use std::process::Command; + +#[cfg(not(debug_assertions))] +fn log_memory_usage(stage: &str) { + let output = Command::new("ps") + .args(&["-o", "rss=", "-p", &std::process::id().to_string()]) + .output() + .expect("Failed to execute ps command"); + + let memory_kb = String::from_utf8_lossy(&output.stdout) + .trim() + .parse::() + .unwrap_or(0); + + let memory_mb = memory_kb as f64 / 1024.0; + info!("Memory usage at {}: {:.2} MB", stage, memory_mb); +} + #[derive(Parser, Debug)] #[clap(author, version, about)] struct Args { @@ -48,9 +66,16 @@ fn main() { }) .init(); + log_memory_usage("start"); + // Create a single combined graph without paths and a map of path key to ranges let (mut combined_graph, mut path_key_ranges) = read_gfa_files(&args.gfa_list); + log_memory_usage("after_reading_files"); + + let total_path_keys = path_key_ranges.len(); + let mut processed_path_keys = 0; + // Sort, deduplicate, and trim path ranges and create merged paths in the combined graph info!("Sorting, deduplicating, and trimming {} path ranges", path_key_ranges.values().map(|ranges| ranges.len()).sum::()); for (path_key, mut ranges) in path_key_ranges.drain() { @@ -59,16 +84,25 @@ fn main() { sort_and_filter_ranges(&path_key, &mut ranges, args.verbose > 1); trim_range_overlaps(&path_key, &mut ranges, &mut combined_graph, args.verbose > 1); create_paths_from_ranges(&path_key, &ranges, &mut combined_graph, args.verbose > 1); + + processed_path_keys += 1; + if processed_path_keys % 10 == 0 { + info!("Processed {}/{} path keys", processed_path_keys, total_path_keys); + log_memory_usage(&format!("Processed {} path keys", processed_path_keys)); + } } info!("Created {} nodes, {} edges, and {} paths", combined_graph.node_count(), combined_graph.edge_count(), combined_graph.path_count()); + log_memory_usage("before_writing"); + info!("Writing the result by removing unused nodes/edges and compacting node IDs"); match write_graph_to_gfa(&combined_graph, &args.output) { Ok(_) => info!("Successfully wrote the combined graph to {}", args.output), Err(e) => error!("Error writing the GFA file: {}", e), } + log_memory_usage("end"); } #[derive(Debug, Clone)] @@ -675,42 +709,35 @@ fn write_graph_to_gfa(graph: &HashGraph, output_path: &str) -> std::io::Result<( // Write GFA version writeln!(file, "H\tVN:Z:1.0")?; - // Collect nodes, excluding marked nodes - let nodes: Vec = graph.handles() - .filter(|handle| !nodes_to_remove[u64::from(handle.id()) as usize]) - .collect(); - - // Create mapping from old IDs to new sequential IDs - // We allocate a vector with capacity for the max node ID + 1 + + // Write nodes by exluding marked ones and create the id_mapping let max_id = graph.node_count(); let mut id_mapping = vec![0; max_id + 1]; - for (new_id, handle) in nodes.iter().enumerate() { - id_mapping[u64::from(handle.id()) as usize] = new_id + 1; // +1 to start from 1 - } - - // Write nodes with new IDs - for handle in &nodes { - let sequence = graph.sequence(*handle).collect::>(); - let sequence_str = String::from_utf8(sequence).unwrap_or_else(|_| String::from("N")); - let node_id = id_mapping[u64::from(handle.id()) as usize]; - writeln!(file, "S\t{}\t{}", node_id, sequence_str)?; + let mut new_id = 1; // Start from 1 + for handle in graph.handles() { + let node_id = usize::from(handle.id()); + if !nodes_to_remove[node_id] { + id_mapping[node_id] = new_id; + + let sequence = graph.sequence(handle).collect::>(); + let sequence_str = String::from_utf8(sequence).unwrap_or_else(|_| String::from("N")); + writeln!(file, "S\t{}\t{}", new_id, sequence_str)?; + + new_id += 1; + } } - // Collect and sort edges, excluding those connected to marked nodes - let edges: Vec = graph.edges() - .filter(|edge| { - !nodes_to_remove[u64::from(edge.0.id()) as usize] && - !nodes_to_remove[u64::from(edge.1.id()) as usize] - }) - .collect(); - - // Write edges with new IDs - for edge in edges { - let from_id = id_mapping[u64::from(edge.0.id()) as usize]; - let to_id = id_mapping[u64::from(edge.1.id()) as usize]; - let from_orient = if edge.0.is_reverse() { "-" } else { "+" }; - let to_orient = if edge.1.is_reverse() { "-" } else { "+" }; - writeln!(file, "L\t{}\t{}\t{}\t{}\t0M", from_id, from_orient, to_id, to_orient)?; + // Write edges by excluding those connected to marked nodes + for edge in graph.edges() { + if !nodes_to_remove[u64::from(edge.0.id()) as usize] && + !nodes_to_remove[u64::from(edge.1.id()) as usize] + { + let from_id = id_mapping[u64::from(edge.0.id()) as usize]; + let to_id = id_mapping[u64::from(edge.1.id()) as usize]; + let from_orient = if edge.0.is_reverse() { "-" } else { "+" }; + let to_orient = if edge.1.is_reverse() { "-" } else { "+" }; + writeln!(file, "L\t{}\t{}\t{}\t{}\t0M", from_id, from_orient, to_id, to_orient)?; + } } // Collect and sort paths directly with references to avoid multiple lookups From c9e482d28b00c8f98a0cc0509998f07f3e8b915d Mon Sep 17 00:00:00 2001 From: AndreaGuarracino Date: Sun, 15 Dec 2024 15:06:01 -0600 Subject: [PATCH 43/45] faster and 2X less memory --- src/main.rs | 458 +++++++++++++++++++++++++++++++--------------------- 1 file changed, 274 insertions(+), 184 deletions(-) diff --git a/src/main.rs b/src/main.rs index 3c11f3a..c4f719d 100644 --- a/src/main.rs +++ b/src/main.rs @@ -9,10 +9,7 @@ use handlegraph::{ handle::{Handle, NodeId, Edge}, handlegraph::*, mutablehandlegraph::*, - pathhandlegraph::{ - MutableGraphPaths, - GraphPaths - }, + pathhandlegraph::GraphPaths, hashgraph::HashGraph, }; use gfa::{gfa::GFA, parser::GFAParser}; @@ -20,23 +17,23 @@ use bitvec::{bitvec, prelude::BitVec}; use tempfile::NamedTempFile; use log::{debug, info, warn, error}; -use std::process::Command; +// use std::process::Command; -#[cfg(not(debug_assertions))] -fn log_memory_usage(stage: &str) { - let output = Command::new("ps") - .args(&["-o", "rss=", "-p", &std::process::id().to_string()]) - .output() - .expect("Failed to execute ps command"); +// #[cfg(not(debug_assertions))] +// fn log_memory_usage(stage: &str) { +// let output = Command::new("ps") +// .args(&["-o", "rss=", "-p", &std::process::id().to_string()]) +// .output() +// .expect("Failed to execute ps command"); - let memory_kb = String::from_utf8_lossy(&output.stdout) - .trim() - .parse::() - .unwrap_or(0); +// let memory_kb = String::from_utf8_lossy(&output.stdout) +// .trim() +// .parse::() +// .unwrap_or(0); - let memory_mb = memory_kb as f64 / 1024.0; - info!("Memory usage at {}: {:.2} MB", stage, memory_mb); -} +// let memory_mb = memory_kb as f64 / 1024.0; +// info!("Memory usage at {}: {:.2} MB", stage, memory_mb); +// } #[derive(Parser, Debug)] #[clap(author, version, about)] @@ -66,43 +63,34 @@ fn main() { }) .init(); - log_memory_usage("start"); + // log_memory_usage("start"); // Create a single combined graph without paths and a map of path key to ranges let (mut combined_graph, mut path_key_ranges) = read_gfa_files(&args.gfa_list); - log_memory_usage("after_reading_files"); - - let total_path_keys = path_key_ranges.len(); - let mut processed_path_keys = 0; + // log_memory_usage("after_reading_files"); - // Sort, deduplicate, and trim path ranges and create merged paths in the combined graph + // Sort, deduplicate, trim, and link path ranges info!("Sorting, deduplicating, and trimming {} path ranges", path_key_ranges.values().map(|ranges| ranges.len()).sum::()); - for (path_key, mut ranges) in path_key_ranges.drain() { + for (path_key, ranges) in path_key_ranges.iter_mut() { debug!("Processing path key '{}' with {} ranges", path_key, ranges.len()); - sort_and_filter_ranges(&path_key, &mut ranges, args.verbose > 1); - trim_range_overlaps(&path_key, &mut ranges, &mut combined_graph, args.verbose > 1); - create_paths_from_ranges(&path_key, &ranges, &mut combined_graph, args.verbose > 1); - - processed_path_keys += 1; - if processed_path_keys % 10 == 0 { - info!("Processed {}/{} path keys", processed_path_keys, total_path_keys); - log_memory_usage(&format!("Processed {} path keys", processed_path_keys)); - } + sort_and_filter_ranges(path_key, ranges, args.verbose > 1); + trim_range_overlaps(path_key, ranges, &mut combined_graph, args.verbose > 1); + link_contiguous_ranges(path_key, ranges, &mut combined_graph, args.verbose > 1); } info!("Created {} nodes, {} edges, and {} paths", combined_graph.node_count(), combined_graph.edge_count(), combined_graph.path_count()); - log_memory_usage("before_writing"); + // log_memory_usage("before_writing"); - info!("Writing the result by removing unused nodes/edges and compacting node IDs"); - match write_graph_to_gfa(&combined_graph, &args.output) { + info!("Writing the result by adding paths, removing unused nodes/edges, and compacting node IDs"); + match write_graph_to_gfa(&combined_graph, &path_key_ranges, &args.output, args.verbose > 1) { Ok(_) => info!("Successfully wrote the combined graph to {}", args.output), Err(e) => error!("Error writing the GFA file: {}", e), } - log_memory_usage("end"); + // log_memory_usage("end"); } #[derive(Debug, Clone)] @@ -393,9 +381,7 @@ fn trim_range_overlaps( debug: bool ) { // Trim overlaps - if debug { - debug!("Trimming overlapping ranges"); - } + debug!("Trimming overlapping ranges"); let mut next_node_id_value = u64::from(combined_graph.max_node_id()) + 1; @@ -409,12 +395,10 @@ fn trim_range_overlaps( let overlap_start = std::cmp::max(r1.start, r2.start); let overlap_end = std::cmp::min(r1.end, r2.end); - if debug { - debug!( - " Overlap detected: Range1 [start={}, end={}], Range2 [start={}, end={}], Overlap [start={}, end={}], Overlap size={}", - r1.start, r1.end, r2.start, r2.end, overlap_start, overlap_end, overlap_end - overlap_start - ); - } + debug!( + " Overlap detected: Range1 [start={}, end={}], Range2 [start={}, end={}], Overlap [start={}, end={}], Overlap size={}", + r1.start, r1.end, r2.start, r2.end, overlap_start, overlap_end, overlap_end - overlap_start + ); // Adjust r2 to remove the overlap let mut steps_to_remove = Vec::new(); @@ -478,16 +462,11 @@ fn trim_range_overlaps( let overlap_start_offset = (overlap_within_step_start - step_start).min(node_len); let overlap_end_offset = (overlap_within_step_end - step_start).min(node_len); - if debug { - debug!(" Splitting step {} [start={}, end={}, len={}] to remove overlap at [start={}, end={}]", - idx, step_start, step_end, step_end - step_start, overlap_within_step_start, overlap_within_step_end); - debug!(" Overlap offsets: start={}, end={}", overlap_start_offset, overlap_end_offset); - } + debug!(" Splitting step {} [start={}, end={}, len={}] to remove overlap at [start={}, end={}]\n Overlap offsets: start={}, end={}", + idx, step_start, step_end, step_end - step_start, overlap_within_step_start, overlap_within_step_end, overlap_start_offset, overlap_end_offset); if step_start < overlap_start { - if debug { - debug!(" Adding left part of step [start={}, end={}]", step_start, overlap_within_step_start); - } + debug!(" Adding left part of step [start={}, end={}]", step_start, overlap_within_step_start); assert!(overlap_start_offset > 0); // Keep left part @@ -503,9 +482,7 @@ fn trim_range_overlaps( range_new_start = Some(step_start); } } else if step_end > overlap_end { - if debug { - debug!(" Adding right part of step [start={}, end={}]", overlap_within_step_end, step_end); - } + debug!(" Adding right part of step [start={}, end={}]", overlap_within_step_end, step_end); assert!(overlap_end_offset < node_len); // Keep right part @@ -555,9 +532,7 @@ fn trim_range_overlaps( r2.end = overlap_end; } - if debug { - debug!(" Updated overlaps: Range2 [start={}, end={}]", r2.start, r2.end); - } + debug!(" Updated overlaps: Range2 [start={}, end={}]", r2.start, r2.end); } } @@ -569,139 +544,184 @@ fn trim_range_overlaps( } } -fn create_paths_from_ranges( +fn link_contiguous_ranges( path_key: &str, - ranges: &[RangeInfo], + ranges: &mut [RangeInfo], combined_graph: &mut HashGraph, debug: bool ) { - // Check for overlaps and contiguity - let mut all_contiguous = true; - - for window in ranges.windows(2) { - let r1 = &window[0]; - let r2 = &window[1]; + // Trim overlaps + debug!("Linking overlapping ranges"); - if r1.overlaps_with(r2) { - if debug { - debug!("Unresolved overlaps detected between ranges: [start={}, end={}] and [start={}, end={}]", - r1.start, r1.end, r2.start, r2.end); + for i in 1..ranges.len() { + let (left, right) = ranges.split_at_mut(i); + let r1 = &mut left[left.len()-1]; + let r2 = &mut right[0]; + + // Check if ranges are contiguous + if r1.is_contiguous_with(r2) { + // Get last handle from previous range and first handle from current range + if let (Some(&last_handle), Some(&first_handle)) = (r1.steps.last(), r2.steps.first()) { + // Create edge if it doesn't exist + if !combined_graph.has_edge(last_handle, first_handle) { + combined_graph.create_edge(Edge(last_handle, first_handle)); + debug!(" Created edge between contiguous ranges at position {}", r1.end); + } } - panic!("Unresolved overlaps detected in path key '{}'", path_key); } - if !r1.is_contiguous_with(r2) { - all_contiguous = false; + } + + if debug { + debug!("Path key '{}' without overlaps", path_key); + for range in ranges.iter() { + debug!(" Range: start={}, end={}, num.steps={}, gfa_id={}", range.start, range.end, range.steps.len(), range.gfa_id); } } +} + +// fn create_paths_from_ranges( +// path_key: &str, +// ranges: &[RangeInfo], +// combined_graph: &mut HashGraph, +// debug: bool +// ) { +// // Check for overlaps and contiguity +// let mut all_contiguous = true; + +// for window in ranges.windows(2) { +// let r1 = &window[0]; +// let r2 = &window[1]; + +// if r1.overlaps_with(r2) { +// if debug { +// debug!("Unresolved overlaps detected between ranges: [start={}, end={}] and [start={}, end={}]", +// r1.start, r1.end, r2.start, r2.end); +// } +// panic!("Unresolved overlaps detected in path key '{}'", path_key); +// } +// if !r1.is_contiguous_with(r2) { +// all_contiguous = false; +// } +// } - if !all_contiguous && debug { - let mut current_start = ranges[0].start; - let mut current_end = ranges[0].end; +// if !all_contiguous && debug { +// let mut current_start = ranges[0].start; +// let mut current_end = ranges[0].end; - for i in 1..ranges.len() { - if ranges[i-1].is_contiguous_with(&ranges[i]) { - // Extend current merged range - current_end = ranges[i].end; - } else { - // Print current merged range - debug!(" Merged range: start={}, end={}", - current_start, current_end); +// for i in 1..ranges.len() { +// if ranges[i-1].is_contiguous_with(&ranges[i]) { +// // Extend current merged range +// current_end = ranges[i].end; +// } else { +// // Print current merged range +// debug!(" Merged range: start={}, end={}", +// current_start, current_end); - if !ranges[i-1].overlaps_with(&ranges[i]) { - // Calculate and print gap - let gap = ranges[i].start - current_end; - debug!(" Gap to next range: {} positions", gap); - } else { - // Calculate and print overlap - let overlap = current_end - ranges[i].start; - debug!(" Overlap with next range: {} positions", overlap); - } - - // Start new merged range - current_start = ranges[i].start; - current_end = ranges[i].end; - } - } +// if !ranges[i-1].overlaps_with(&ranges[i]) { +// // Calculate and print gap +// let gap = ranges[i].start - current_end; +// debug!(" Gap to next range: {} positions", gap); +// } else { +// // Calculate and print overlap +// let overlap = current_end - ranges[i].start; +// debug!(" Overlap with next range: {} positions", overlap); +// } + +// // Start new merged range +// current_start = ranges[i].start; +// current_end = ranges[i].end; +// } +// } - // Print final merged range - debug!(" Merged range: start={}, end={}", - current_start, current_end); - } - - if all_contiguous { - // Create a single path with the original key - let path_id = combined_graph.create_path(path_key.as_bytes(), false).unwrap(); - let mut prev_step = None; +// // Print final merged range +// debug!(" Merged range: start={}, end={}", +// current_start, current_end); +// } + +// if all_contiguous { +// // Create a single path with the original key +// let path_id = combined_graph.create_path(path_key.as_bytes(), false).unwrap(); +// let mut prev_step = None; - // Add all steps from all ranges - for range in ranges.iter() { - for step in &range.steps { - combined_graph.path_append_step(path_id, *step); +// // Add all steps from all ranges +// for range in ranges.iter() { +// for step in &range.steps { +// combined_graph.path_append_step(path_id, *step); - if let Some(prev) = prev_step { - if !combined_graph.has_edge(prev, *step) { - combined_graph.create_edge(Edge(prev, *step)); - } - } - prev_step = Some(*step); - } - } - } else { - // Handle non-contiguous ranges by creating separate paths for each contiguous group - let mut current_range_idx = 0; - while current_range_idx < ranges.len() { - let start_range = &ranges[current_range_idx]; - let mut steps = start_range.steps.clone(); - let mut next_idx = current_range_idx + 1; - let mut end_range = start_range; +// if let Some(prev) = prev_step { +// if !combined_graph.has_edge(prev, *step) { +// combined_graph.create_edge(Edge(prev, *step)); +// } +// } +// prev_step = Some(*step); +// } +// } +// } else { +// // Handle non-contiguous ranges by creating separate paths for each contiguous group +// let mut current_range_idx = 0; +// while current_range_idx < ranges.len() { +// let start_range = &ranges[current_range_idx]; +// let mut steps = start_range.steps.clone(); +// let mut next_idx = current_range_idx + 1; +// let mut end_range = start_range; - // Merge contiguous ranges - while next_idx < ranges.len() && ranges[next_idx - 1].is_contiguous_with(&ranges[next_idx]) { - steps.extend(ranges[next_idx].steps.clone()); - end_range = &ranges[next_idx]; - next_idx += 1; - } +// // Merge contiguous ranges +// while next_idx < ranges.len() && ranges[next_idx - 1].is_contiguous_with(&ranges[next_idx]) { +// steps.extend(ranges[next_idx].steps.clone()); +// end_range = &ranges[next_idx]; +// next_idx += 1; +// } - // Create path name with range information - let path_name = format!("{}:{}-{}", path_key, start_range.start, end_range.end); - let path_id = combined_graph.create_path(path_name.as_bytes(), false).unwrap(); +// // Create path name with range information +// let path_name = format!("{}:{}-{}", path_key, start_range.start, end_range.end); +// let path_id = combined_graph.create_path(path_name.as_bytes(), false).unwrap(); - // Add steps to the path - let mut prev_step = None; - for step in steps { - combined_graph.path_append_step(path_id, step); +// // Add steps to the path +// let mut prev_step = None; +// for step in steps { +// combined_graph.path_append_step(path_id, step); - if let Some(prev) = prev_step { - if !combined_graph.has_edge(prev, step) { - combined_graph.create_edge(Edge(prev, step)); - } - } - prev_step = Some(step); - } +// if let Some(prev) = prev_step { +// if !combined_graph.has_edge(prev, step) { +// combined_graph.create_edge(Edge(prev, step)); +// } +// } +// prev_step = Some(step); +// } - current_range_idx = next_idx; - } - } -} - -fn mark_nodes_for_removal(graph: &HashGraph) -> BitVec { +// current_range_idx = next_idx; +// } +// } +// } + +fn mark_nodes_for_removal( + graph: &HashGraph, + path_key_ranges: &FxHashMap> +) -> BitVec { // Create a bitvector with all nodes initially marked for removal let max_node_id = u64::from(graph.max_node_id()); let mut nodes_to_remove = bitvec![1; max_node_id as usize + 1]; - // Mark nodes used in paths as not to be removed (set bit to 0) - for (_path_id, path_ref) in graph.paths.iter() { - for handle in &path_ref.nodes { - nodes_to_remove.set(u64::from(handle.id()) as usize, false); + // Mark nodes used in path ranges as not to be removed (set bit to 0) + for ranges in path_key_ranges.values() { + for range in ranges { + for handle in &range.steps { + nodes_to_remove.set(u64::from(handle.id()) as usize, false); + } } } nodes_to_remove } -fn write_graph_to_gfa(graph: &HashGraph, output_path: &str) -> std::io::Result<()> { +fn write_graph_to_gfa( + graph: &HashGraph, + path_key_ranges: &FxHashMap>, + output_path: &str, + debug: bool +) -> std::io::Result<()> { debug!("Marking unused nodes for removal"); - let nodes_to_remove : BitVec = mark_nodes_for_removal(&graph); + let nodes_to_remove : BitVec = mark_nodes_for_removal(&graph, path_key_ranges); debug!("Marked {} nodes", nodes_to_remove.count_ones()); let mut file = File::create(output_path)?; @@ -709,7 +729,6 @@ fn write_graph_to_gfa(graph: &HashGraph, output_path: &str) -> std::io::Result<( // Write GFA version writeln!(file, "H\tVN:Z:1.0")?; - // Write nodes by exluding marked ones and create the id_mapping let max_id = graph.node_count(); let mut id_mapping = vec![0; max_id + 1]; @@ -740,28 +759,99 @@ fn write_graph_to_gfa(graph: &HashGraph, output_path: &str) -> std::io::Result<( } } - // Collect and sort paths directly with references to avoid multiple lookups - let mut path_entries: Vec<_> = graph.paths.iter().collect(); - path_entries.sort_by_key(|(_, path_ref)| { - // Get name directly from path struct instead of doing lookups - path_ref.name.as_slice() - }); - - // Write paths with new IDs - for (_path_id, path_ref) in path_entries { - // Pre-allocate vector with capacity - let mut path_elements = Vec::with_capacity(path_ref.nodes.len()); + // Write paths by processing ranges directly + let mut path_key_vec: Vec<_> = path_key_ranges.keys().collect(); + path_key_vec.sort(); // Sort path keys for consistent output + + for path_key in path_key_vec { + let ranges = &path_key_ranges[path_key]; + + // Check for overlaps and contiguity + let mut all_contiguous = true; - // Process nodes directly from path reference - for handle in &path_ref.nodes { - let node_id = id_mapping[u64::from(handle.id()) as usize]; - let orient = if handle.is_reverse() { "-" } else { "+" }; - path_elements.push(format!("{}{}", node_id, orient)); + for window in ranges.windows(2) { + let r1 = &window[0]; + let r2 = &window[1]; + + if r1.overlaps_with(r2) { + panic!("Unresolved overlaps detected in path key '{}': [start={}, end={}] and [start={}, end={}]", path_key, + r1.start, r1.end, r2.start, r2.end); + } + if !r1.is_contiguous_with(r2) { + all_contiguous = false; + } } - - if !path_elements.is_empty() { - let path_name = String::from_utf8_lossy(&path_ref.name); - writeln!(file, "P\t{}\t{}\t*", path_name, path_elements.join(","))?; + + if !all_contiguous && debug { + let mut current_start = ranges[0].start; + let mut current_end = ranges[0].end; + + for i in 1..ranges.len() { + if ranges[i-1].is_contiguous_with(&ranges[i]) { + // Extend current merged range + current_end = ranges[i].end; + } else { + // Print current merged range + debug!(" Merged range: start={}, end={}", + current_start, current_end); + + if !ranges[i-1].overlaps_with(&ranges[i]) { + // Calculate and print gap + let gap = ranges[i].start - current_end; + debug!(" Gap to next range: {} positions", gap); + } else { + // Calculate and print overlap + let overlap = current_end - ranges[i].start; + debug!(" Overlap with next range: {} positions", overlap); + } + + // Start new merged range + current_start = ranges[i].start; + current_end = ranges[i].end; + } + } + + // Print final merged range + debug!(" Merged range: start={}, end={}", + current_start, current_end); + } + + let mut current_range_idx = 0; + + while current_range_idx < ranges.len() { + let start_range = &ranges[current_range_idx]; + let mut next_idx = current_range_idx + 1; + let mut end_range = start_range; + + // Find contiguous ranges + while next_idx < ranges.len() && ranges[next_idx - 1].is_contiguous_with(&ranges[next_idx]) { + end_range = &ranges[next_idx]; + next_idx += 1; + } + + // Create path elements for contiguous range group + let mut path_elements = Vec::new(); + for idx in current_range_idx..next_idx { + for handle in &ranges[idx].steps { + let node_id = id_mapping[u64::from(handle.id()) as usize]; + let orient = if handle.is_reverse() { "-" } else { "+" }; + path_elements.push(format!("{}{}", node_id, orient)); + } + } + + if !path_elements.is_empty() { + let path_name = if next_idx - current_range_idx == ranges.len() { + // All ranges are contiguous - use original path key + path_key.to_string() + } else { + // Create path name with range information + format!("{}:{}-{}", path_key, start_range.start, end_range.end) + }; + + writeln!(file, "P\t{}\t{}\t*", path_name, path_elements.join(","))?; + } + + current_range_idx = next_idx; } } From ed5306f986d4a700a95081c79ba08f2826976daf Mon Sep 17 00:00:00 2001 From: AndreaGuarracino Date: Sun, 15 Dec 2024 15:19:29 -0600 Subject: [PATCH 44/45] tweak log --- src/main.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/main.rs b/src/main.rs index c4f719d..bf9a40c 100644 --- a/src/main.rs +++ b/src/main.rs @@ -84,7 +84,6 @@ fn main() { // log_memory_usage("before_writing"); - info!("Writing the result by adding paths, removing unused nodes/edges, and compacting node IDs"); match write_graph_to_gfa(&combined_graph, &path_key_ranges, &args.output, args.verbose > 1) { Ok(_) => info!("Successfully wrote the combined graph to {}", args.output), Err(e) => error!("Error writing the GFA file: {}", e), @@ -720,7 +719,7 @@ fn write_graph_to_gfa( output_path: &str, debug: bool ) -> std::io::Result<()> { - debug!("Marking unused nodes for removal"); + info!("Marking unused nodes"); let nodes_to_remove : BitVec = mark_nodes_for_removal(&graph, path_key_ranges); debug!("Marked {} nodes", nodes_to_remove.count_ones()); @@ -730,6 +729,7 @@ fn write_graph_to_gfa( writeln!(file, "H\tVN:Z:1.0")?; // Write nodes by exluding marked ones and create the id_mapping + info!("Writing used nodes by compacting their IDs"); let max_id = graph.node_count(); let mut id_mapping = vec![0; max_id + 1]; let mut new_id = 1; // Start from 1 @@ -747,6 +747,7 @@ fn write_graph_to_gfa( } // Write edges by excluding those connected to marked nodes + info!("Writing edges connecting used nodes"); for edge in graph.edges() { if !nodes_to_remove[u64::from(edge.0.id()) as usize] && !nodes_to_remove[u64::from(edge.1.id()) as usize] @@ -760,6 +761,7 @@ fn write_graph_to_gfa( } // Write paths by processing ranges directly + info!("Writing paths by merging contiguous path ranges"); let mut path_key_vec: Vec<_> = path_key_ranges.keys().collect(); path_key_vec.sort(); // Sort path keys for consistent output From 2445a8ec84c41680d5fde022548bad2c732d5eae Mon Sep 17 00:00:00 2001 From: AndreaGuarracino Date: Sun, 15 Dec 2024 15:38:20 -0600 Subject: [PATCH 45/45] typo --- src/main.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main.rs b/src/main.rs index bf9a40c..6136085 100644 --- a/src/main.rs +++ b/src/main.rs @@ -71,7 +71,7 @@ fn main() { // log_memory_usage("after_reading_files"); // Sort, deduplicate, trim, and link path ranges - info!("Sorting, deduplicating, and trimming {} path ranges", path_key_ranges.values().map(|ranges| ranges.len()).sum::()); + info!("Sorting, deduplicating, trimming, and linking {} path ranges", path_key_ranges.values().map(|ranges| ranges.len()).sum::()); for (path_key, ranges) in path_key_ranges.iter_mut() { debug!("Processing path key '{}' with {} ranges", path_key, ranges.len());