From e23b183cde41b496146055e43ef7dc2a177b16be Mon Sep 17 00:00:00 2001
From: AndreaGuarracino <andreaguarracino@outlook.com>
Date: Wed, 11 Dec 2024 20:56:55 -0600
Subject: [PATCH 01/45] it seems to work!

---
 src/main.rs | 169 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 166 insertions(+), 3 deletions(-)
diff --git a/src/main.rs b/src/main.rs
index 6f63cf2..bc159d7 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -31,7 +31,9 @@ struct RangeInfo {
     start: usize,
     end: usize,
     gfa_id: usize,
-    steps: Vec<Handle>,  // Store the path steps for this range
+    steps: Vec<Handle>,                    // Path steps for this range
+    step_positions: Vec<(usize, usize)>,   // Start and end positions of each step
+    step_lengths: Vec<usize>,              // Lengths of each step
 }
 
 // Helper function to check if two ranges are contiguous
@@ -176,10 +178,24 @@ fn main() {
                 if let Some((sample_hap_name, start, end)) = split_path_name(&path_name) {
                     // Get the path steps and translate their IDs
                     let mut translated_steps = Vec::new();
+                    let mut step_positions = Vec::new();
+                    let mut step_lengths = Vec::new();
+                    let mut cumulative_pos = start;
+
                     if let Some(path_ref) = block_graph.get_path_ref(path_id) {
                         for step in path_ref.nodes.iter() {
                             let translated_id = id_translation + step.id().into();
-                            translated_steps.push(Handle::pack(translated_id, step.is_reverse()));
+                            let translated_step = Handle::pack(translated_id, step.is_reverse());
+                            translated_steps.push(translated_step);
+
+                            // Get the sequence length of the node
+                            let node_seq = block_graph.sequence(*step).collect::<Vec<_>>();
+                            let node_length = node_seq.len();
+                            step_lengths.push(node_length);
+
+                            // Record the end position of this step
+                            step_positions.push((cumulative_pos, cumulative_pos + node_length));
+                            cumulative_pos = cumulative_pos + node_length;
                         }
                     }
                     
@@ -190,6 +206,8 @@ fn main() {
                             end, 
                             gfa_id,
                             steps: translated_steps,
+                            step_positions,
+                            step_lengths,
                         });
                 }
             }
@@ -202,11 +220,156 @@ fn main() {
         eprintln!("Total path ranges collected: {}", path_key_ranges.len());
     }
 
+    let mut next_node_id_value = u64::from(combined_graph.max_node_id()) + 1;
+
     // Sort ranges and create merged paths in the combined graph
     for (path_key, ranges) in path_key_ranges.iter_mut() {
         // Sort ranges by start position
         ranges.sort_by_key(|r| (r.start, r.end));
-        
+
+        // Process overlaps
+        for i in 1..ranges.len() {
+            let (left, right) = ranges.split_at_mut(i);
+            let r1 = &mut left[left.len()-1];
+            let r2 = &mut right[0];
+
+            if has_overlap(r1, r2) {
+                // Calculate the overlap region
+                let overlap_start = r2.start;
+                let overlap_end = r1.end;
+
+                // Adjust r2 to remove the overlap
+                let mut steps_to_remove = Vec::new();
+                let mut steps_to_split = Vec::new();
+
+                for (idx, &(step_start, step_end)) in r2.step_positions.iter().enumerate() {
+                    if step_end <= overlap_start {
+                        //println!("\tStep {} before overlap", idx);
+                        continue;
+                    } else if step_start >= overlap_end {
+                        //println!("\tStep {} after overlap", idx);
+                        break;
+                    } else if step_start >= overlap_start && step_end <= overlap_end {
+                        //println!("\tStep {} within overlap", idx);
+                        steps_to_remove.push(idx);
+                    } else {
+                        //println!("\tStep {} partially overlaps", idx);
+                        steps_to_split.push(idx);
+                    }
+                }
+
+                // Initialize new vectors to store updated steps
+                let mut new_steps = Vec::new();
+                let mut new_step_positions = Vec::new();
+                let mut new_step_lengths = Vec::new();
+
+                // Iterate over the original steps
+                for idx in 0..r2.steps.len() {
+                    let step_handle = r2.steps[idx];
+                    let (step_start, step_end) = r2.step_positions[idx];
+
+                    if steps_to_remove.contains(&idx) {
+                        // Skip steps to remove
+                        continue;
+                    } else if steps_to_split.contains(&idx) {
+                        // Split nodes for steps that partially overlap
+                        let node_seq = combined_graph.sequence(step_handle).collect::<Vec<_>>();
+                        let overlap_within_step_start = std::cmp::max(step_start, overlap_start);
+                        let overlap_within_step_end = std::cmp::min(step_end, overlap_end);
+                        let overlap_start_offset = overlap_within_step_start - step_start;
+                        let overlap_end_offset = overlap_within_step_end - step_start;
+
+                        if step_start < overlap_start && step_end > overlap_end {
+                            // Split into three parts
+                            let left_seq = &node_seq[0..overlap_start_offset];
+                            let right_seq = &node_seq[overlap_end_offset..];
+
+                            let left_id = NodeId::from(next_node_id_value);
+                            next_node_id_value += 1;
+                            let right_id = NodeId::from(next_node_id_value);
+                            next_node_id_value += 1;
+
+                            let left_node = combined_graph.create_handle(left_seq, left_id);
+                            let right_node = combined_graph.create_handle(right_seq, right_id);
+
+                            let left_handle = if step_handle.is_reverse() {
+                                left_node.flip()
+                            } else {
+                                left_node
+                            };
+                            let right_handle = if step_handle.is_reverse() {
+                                right_node.flip()
+                            } else {
+                                right_node
+                            };
+
+                            new_steps.push(left_handle);
+                            new_step_positions.push((step_start, overlap_start));
+                            new_step_lengths.push(left_seq.len());
+
+                            new_steps.push(right_handle);
+                            new_step_positions.push((overlap_end, step_end));
+                            new_step_lengths.push(right_seq.len());
+
+                            combined_graph.create_edge(Edge(left_handle, right_handle));
+                        } else if step_start < overlap_start {
+                            // Keep left part
+                            let new_seq = &node_seq[0..overlap_start_offset];
+                            let node_id = NodeId::from(next_node_id_value);
+                            next_node_id_value += 1;
+                            let new_node = combined_graph.create_handle(new_seq, node_id);
+                            let new_handle = if step_handle.is_reverse() {
+                                new_node.flip()
+                            } else {
+                                new_node
+                            };
+
+                            new_steps.push(new_handle);
+                            new_step_positions.push((step_start, overlap_start));
+                            new_step_lengths.push(new_seq.len());
+                        } else if step_end > overlap_end {
+                            // Keep right part
+                            let new_seq = &node_seq[overlap_end_offset..];
+                            let node_id = NodeId::from(next_node_id_value);
+                            next_node_id_value += 1;
+                            let new_node = combined_graph.create_handle(new_seq, node_id);
+                            let new_handle = if step_handle.is_reverse() {
+                                new_node.flip()
+                            } else {
+                                new_node
+                            };
+
+                            new_steps.push(new_handle);
+                            new_step_positions.push((overlap_end, step_end));
+                            new_step_lengths.push(new_seq.len());
+                        }
+                    } else {
+                        // Keep steps that are not to be removed or split
+                        new_steps.push(step_handle);
+                        new_step_positions.push((step_start, step_end));
+                        new_step_lengths.push(r2.step_lengths[idx]);
+                    }
+                }
+
+                // Update r2 with the new steps
+                r2.steps = new_steps;
+                r2.step_positions = new_step_positions;
+                r2.step_lengths = new_step_lengths;
+
+                // Update edges for the modified steps
+                for idx in 0..r2.steps.len() {
+                    if idx > 0 {
+                        let prev_step = r2.steps[idx - 1];
+                        if !combined_graph.has_edge(prev_step, r2.steps[idx]) {
+                            combined_graph.create_edge(Edge(prev_step, r2.steps[idx]));
+                        }
+                    }
+                }
+
+                r2.start = overlap_end;
+            }
+        }
+
         // Check for overlaps and contiguity
         let mut has_overlaps = false;
         let mut all_contiguous = true;

From 3391811ade869011eaaade234fc2ad33b91e405b Mon Sep 17 00:00:00 2001
From: AndreaGuarracino <andreaguarracino@outlook.com>
Date: Wed, 11 Dec 2024 21:15:49 -0600
Subject: [PATCH 02/45] style: Improve debug output formatting in main function

---
 src/main.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/main.rs b/src/main.rs
index bc159d7..348ce02 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -215,8 +215,8 @@ fn main() {
     }
 
     if args.debug {
-        eprintln!("Total nodes in combined graph: {}", combined_graph.node_count());
-        eprintln!("Total edges in combined graph: {}", combined_graph.edge_count());
+        eprintln!("Total nodes in the combined graph: {}", combined_graph.node_count());
+        eprintln!("Total edges in the combined graph: {}", combined_graph.edge_count());
         eprintln!("Total path ranges collected: {}", path_key_ranges.len());
     }
 

From 287fc69eb2c2bd6fae232e5a29b5e1d59d5a9e05 Mon Sep 17 00:00:00 2001
From: "AndreaGuarracino (aider)" <andreaguarracino@outlook.com>
Date: Wed, 11 Dec 2024 21:17:53 -0600
Subject: [PATCH 03/45] feat: Add debug message for detected overlaps with
 range details

---
 src/main.rs | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/src/main.rs b/src/main.rs
index 348ce02..ac5e7df 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -234,6 +234,17 @@ fn main() {
             let r2 = &mut right[0];
 
             if has_overlap(r1, r2) {
+                if args.debug {
+                    let overlap_start = std::cmp::max(r1.start, r2.start);
+                    let overlap_end = std::cmp::min(r1.end, r2.end);
+                    let overlap_amount = overlap_end - overlap_start;
+
+                    eprintln!(
+                        "Overlap detected in path '{}': Range1 [start={}, end={}], Range2 [start={}, end={}], overlap size={}",
+                        path_key, r1.start, r1.end, r2.start, r2.start, overlap_amount
+                    );
+                }
+
                 // Calculate the overlap region
                 let overlap_start = r2.start;
                 let overlap_end = r1.end;

From f4f230e3e20d8afd6ae3af27b4119e186d17127e Mon Sep 17 00:00:00 2001
From: AndreaGuarracino <andreaguarracino@outlook.com>
Date: Thu, 12 Dec 2024 11:41:09 -0600
Subject: [PATCH 04/45] typo

---
 Cargo.lock  | 38 +++++++++++++++++++-------------------
 src/main.rs |  2 +-
 2 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 42c737b..0880f37 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -125,9 +125,9 @@ dependencies = [
 
 [[package]]
 name = "bstr"
-version = "1.11.0"
+version = "1.11.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1a68f1f47cdf0ec8ee4b941b2eee2a80cb796db73118c0dd09ac63fbe405be22"
+checksum = "786a307d683a5bf92e6fd5fd69a7eb613751668d1d8d67d802846dfe367c62c8"
 dependencies = [
  "memchr",
  "regex-automata 0.4.9",
@@ -183,9 +183,9 @@ dependencies = [
 
 [[package]]
 name = "cc"
-version = "1.2.2"
+version = "1.2.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f34d93e62b03caf570cccc334cbc6c2fceca82f39211051345108adcba3eebdc"
+checksum = "27f657647bcff5394bf56c7317665bbf790a137a50eaaa5c6bfbb9e27a518f2d"
 dependencies = [
  "jobserver",
  "libc",
@@ -200,9 +200,9 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
 
 [[package]]
 name = "clap"
-version = "4.5.22"
+version = "4.5.23"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "69371e34337c4c984bbe322360c2547210bf632eb2814bbe78a6e87a2935bd2b"
+checksum = "3135e7ec2ef7b10c6ed8950f0f792ed96ee093fa088608f1c76e569722700c84"
 dependencies = [
  "clap_builder",
  "clap_derive",
@@ -210,9 +210,9 @@ dependencies = [
 
 [[package]]
 name = "clap_builder"
-version = "4.5.22"
+version = "4.5.23"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6e24c1b4099818523236a8ca881d2b45db98dadfb4625cf6608c12069fcbbde1"
+checksum = "30582fc632330df2bd26877bde0c1f4470d57c582bbc070376afcd04d8cb4838"
 dependencies = [
  "anstream",
  "anstyle",
@@ -234,9 +234,9 @@ dependencies = [
 
 [[package]]
 name = "clap_lex"
-version = "0.7.3"
+version = "0.7.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "afb84c814227b90d6895e01398aee0d8033c00e7466aca416fb6a8e0eb19d8a7"
+checksum = "f46ad14479a25103f283c0f10005961cf086d8dc42205bb44c46ac563475dca6"
 
 [[package]]
 name = "colorchoice"
@@ -329,7 +329,7 @@ dependencies = [
 name = "gfalace"
 version = "0.1.0"
 dependencies = [
- "bstr 1.11.0",
+ "bstr 1.11.1",
  "clap",
  "gfa",
  "handlegraph",
@@ -394,9 +394,9 @@ dependencies = [
 
 [[package]]
 name = "libc"
-version = "0.2.167"
+version = "0.2.168"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "09d6582e104315a817dff97f75133544b2e094ee22447d2acf4a74e189ba06fc"
+checksum = "5aaeb2981e0606ca11d79718f8bb01164f1d6ed75080182d3abf017e6d244b6d"
 
 [[package]]
 name = "liblzma"
@@ -409,9 +409,9 @@ dependencies = [
 
 [[package]]
 name = "liblzma-sys"
-version = "0.3.9"
+version = "0.3.11"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6630cb23edeb2e563cd6c30d4117554c69646871455843c33ddcb1d9aef82ecf"
+checksum = "41e2171ce6827cbab9bc97238a58361bf9a526080475f21dbc470e1842258b2d"
 dependencies = [
  "cc",
  "libc",
@@ -577,18 +577,18 @@ checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f"
 
 [[package]]
 name = "serde"
-version = "1.0.215"
+version = "1.0.216"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6513c1ad0b11a9376da888e3e0baa0077f1aed55c17f50e7b2397136129fb88f"
+checksum = "0b9781016e935a97e8beecf0c933758c97a5520d32930e460142b4cd80c6338e"
 dependencies = [
  "serde_derive",
 ]
 
 [[package]]
 name = "serde_derive"
-version = "1.0.215"
+version = "1.0.216"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ad1e866f866923f252f05c889987993144fb74e722403468a4ebd70c3cd756c0"
+checksum = "46f859dbbf73865c6627ed570e78961cd3ac92407a2d117204c49232485da55e"
 dependencies = [
  "proc-macro2",
  "quote",
diff --git a/src/main.rs b/src/main.rs
index ac5e7df..e7a6b0a 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -241,7 +241,7 @@ fn main() {
 
                     eprintln!(
                         "Overlap detected in path '{}': Range1 [start={}, end={}], Range2 [start={}, end={}], overlap size={}",
-                        path_key, r1.start, r1.end, r2.start, r2.start, overlap_amount
+                        path_key, r1.start, r1.end, r2.start, r2.end, overlap_amount
                     );
                 }
 

From febd44ecfe9625bf46cef4b694b15717814e9cbd Mon Sep 17 00:00:00 2001
From: AndreaGuarracino <andreaguarracino@outlook.com>
Date: Thu, 12 Dec 2024 11:59:14 -0600
Subject: [PATCH 05/45] fix: Improve overlap detection logging in main function

---
 src/main.rs | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/src/main.rs b/src/main.rs
index e7a6b0a..e05ee18 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -234,25 +234,22 @@ fn main() {
             let r2 = &mut right[0];
 
             if has_overlap(r1, r2) {
+                // Calculate the overlap region
+                let overlap_start = r2.start;
+                let overlap_end = r1.end;
+
                 if args.debug {
-                    let overlap_start = std::cmp::max(r1.start, r2.start);
-                    let overlap_end = std::cmp::min(r1.end, r2.end);
                     let overlap_amount = overlap_end - overlap_start;
 
                     eprintln!(
-                        "Overlap detected in path '{}': Range1 [start={}, end={}], Range2 [start={}, end={}], overlap size={}",
-                        path_key, r1.start, r1.end, r2.start, r2.end, overlap_amount
+                        "Overlap detected in path '{}': Range1 [start={}, end={}], Range2 [start={}, end={}], Overlap [start={}, end={}], Overlap size={}",
+                        path_key, r1.start, r1.end, r2.start, r2.end, overlap_start, overlap_end, overlap_amount
                     );
                 }
 
-                // Calculate the overlap region
-                let overlap_start = r2.start;
-                let overlap_end = r1.end;
-
                 // Adjust r2 to remove the overlap
                 let mut steps_to_remove = Vec::new();
                 let mut steps_to_split = Vec::new();
-
                 for (idx, &(step_start, step_end)) in r2.step_positions.iter().enumerate() {
                     if step_end <= overlap_start {
                         //println!("\tStep {} before overlap", idx);

From 84f0a91b3876ad7c4eb739689397c2e7ef3a24c0 Mon Sep 17 00:00:00 2001
From: "AndreaGuarracino (aider)" <andreaguarracino@outlook.com>
Date: Thu, 12 Dec 2024 11:59:15 -0600
Subject: [PATCH 06/45] feat: Remove ranges contained within other ranges
 before processing overlaps

---
 src/main.rs | 27 ++++++++++++++++++++++++++-
 1 file changed, 26 insertions(+), 1 deletion(-)

diff --git a/src/main.rs b/src/main.rs
index e05ee18..3ad7062 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -26,7 +26,7 @@ struct Args {
     debug: bool,
 }
 
-#[derive(Debug)]
+#[derive(Debug, Clone)]
 struct RangeInfo {
     start: usize,
     end: usize,
@@ -227,6 +227,31 @@ fn main() {
         // Sort ranges by start position
         ranges.sort_by_key(|r| (r.start, r.end));
 
+        // Remove ranges that are contained within other ranges
+        if !ranges.is_empty() {
+            let mut filtered_ranges = Vec::new();
+            let mut prev_range = &ranges[0];
+            filtered_ranges.push(prev_range.clone());
+
+            for range in ranges.iter().skip(1) {
+                if range.start >= prev_range.start && range.end <= prev_range.end {
+                    // Current range is fully contained within prev_range, skip it
+                    if args.debug {
+                        eprintln!(
+                            "Range [start={}, end={}] is contained within [start={}, end={}] and will be removed.",
+                            range.start, range.end, prev_range.start, prev_range.end
+                        );
+                    }
+                    continue;
+                } else {
+                    filtered_ranges.push(range.clone());
+                    prev_range = range;
+                }
+            }
+
+            *ranges = filtered_ranges;
+        }
+
         // Process overlaps
         for i in 1..ranges.len() {
             let (left, right) = ranges.split_at_mut(i);

From 4098f5e1ad2c9f14a1498befa2cffcd667654d78 Mon Sep 17 00:00:00 2001
From: AndreaGuarracino <andreaguarracino@outlook.com>
Date: Thu, 12 Dec 2024 12:10:15 -0600
Subject: [PATCH 07/45] feat: Add debug logging for range processing and
 overlap detection

---
 src/main.rs | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/src/main.rs b/src/main.rs
index 3ad7062..5cbe607 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -227,6 +227,13 @@ fn main() {
         // Sort ranges by start position
         ranges.sort_by_key(|r| (r.start, r.end));
 
+        if args.debug {
+            eprintln!("Processing path key '{}'", path_key);
+            for range in ranges.iter() {
+                eprintln!("  Range: start={}, end={}, gfa_id={}", range.start, range.end, range.gfa_id);
+            }
+        }
+
         // Remove ranges that are contained within other ranges
         if !ranges.is_empty() {
             let mut filtered_ranges = Vec::new();
@@ -238,7 +245,7 @@ fn main() {
                     // Current range is fully contained within prev_range, skip it
                     if args.debug {
                         eprintln!(
-                            "Range [start={}, end={}] is contained within [start={}, end={}] and will be removed.",
+                            "Redundant range detected: Range [start={}, end={}] is contained within [start={}, end={}] and will be removed.",
                             range.start, range.end, prev_range.start, prev_range.end
                         );
                     }
@@ -267,8 +274,8 @@ fn main() {
                     let overlap_amount = overlap_end - overlap_start;
 
                     eprintln!(
-                        "Overlap detected in path '{}': Range1 [start={}, end={}], Range2 [start={}, end={}], Overlap [start={}, end={}], Overlap size={}",
-                        path_key, r1.start, r1.end, r2.start, r2.end, overlap_start, overlap_end, overlap_amount
+                        "Overlap detected: Range1 [start={}, end={}], Range2 [start={}, end={}], Overlap [start={}, end={}], Overlap size={}",
+                        r1.start, r1.end, r2.start, r2.end, overlap_start, overlap_end, overlap_amount
                     );
                 }
 

From c3a23ad38ce47d03a9125eeb63ce0a7449965d66 Mon Sep 17 00:00:00 2001
From: "AndreaGuarracino (aider)" <andreaguarracino@outlook.com>
Date: Thu, 12 Dec 2024 12:10:16 -0600
Subject: [PATCH 08/45] fix: Handle bidirectional range containment to remove
 redundancies

---
 src/main.rs | 22 +++++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

diff --git a/src/main.rs b/src/main.rs
index 5cbe607..021705f 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -237,10 +237,9 @@ fn main() {
         // Remove ranges that are contained within other ranges
         if !ranges.is_empty() {
             let mut filtered_ranges = Vec::new();
-            let mut prev_range = &ranges[0];
-            filtered_ranges.push(prev_range.clone());
+            let mut prev_range = ranges[0].clone();
 
-            for range in ranges.iter().skip(1) {
+            for range in ranges.iter() {
                 if range.start >= prev_range.start && range.end <= prev_range.end {
                     // Current range is fully contained within prev_range, skip it
                     if args.debug {
@@ -250,12 +249,25 @@ fn main() {
                         );
                     }
                     continue;
+                } else if prev_range.start >= range.start && prev_range.end <= range.end {
+                    // Prev_range is fully contained within current range, replace prev_range with current range
+                    if args.debug {
+                        eprintln!(
+                            "Redundant range detected: Previous range [start={}, end={}] is contained within [start={}, end={}] and will be replaced.",
+                            prev_range.start, prev_range.end, range.start, range.end
+                        );
+                    }
+                    prev_range = range.clone();
                 } else {
-                    filtered_ranges.push(range.clone());
-                    prev_range = range;
+                    // No containment, add the previous range and update prev_range
+                    filtered_ranges.push(prev_range.clone());
+                    prev_range = range.clone();
                 }
             }
 
+            // Don't forget to add the last range
+            filtered_ranges.push(prev_range);
+
             *ranges = filtered_ranges;
         }
 

From 24c302604bc015ed6f59653a2c81542f47cadd12 Mon Sep 17 00:00:00 2001
From: AndreaGuarracino <andreaguarracino@outlook.com>
Date: Thu, 12 Dec 2024 15:43:59 -0600
Subject: [PATCH 09/45] refactor: Skip the first range in the iteration for
 clarity

---
 src/main.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/main.rs b/src/main.rs
index 021705f..90c52cc 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -239,7 +239,7 @@ fn main() {
             let mut filtered_ranges = Vec::new();
             let mut prev_range = ranges[0].clone();
 
-            for range in ranges.iter() {
+            for range in ranges.iter().skip(1)  {
                 if range.start >= prev_range.start && range.end <= prev_range.end {
                     // Current range is fully contained within prev_range, skip it
                     if args.debug {

From bf88718e2923d758a5b5061b9f21e4452f598e0c Mon Sep 17 00:00:00 2001
From: "AndreaGuarracino (aider)" <andreaguarracino@outlook.com>
Date: Thu, 12 Dec 2024 15:44:00 -0600
Subject: [PATCH 10/45] fix: Adjust overlap offset calculations for handle
 orientation in sequences

---
 src/main.rs | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/src/main.rs b/src/main.rs
index 90c52cc..f38d52d 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -328,8 +328,18 @@ fn main() {
                         let node_seq = combined_graph.sequence(step_handle).collect::<Vec<_>>();
                         let overlap_within_step_start = std::cmp::max(step_start, overlap_start);
                         let overlap_within_step_end = std::cmp::min(step_end, overlap_end);
-                        let overlap_start_offset = overlap_within_step_start - step_start;
-                        let overlap_end_offset = overlap_within_step_end - step_start;
+                        
+                        let (overlap_start_offset, overlap_end_offset) = if !step_handle.is_reverse() {
+                            // Forward handle
+                            let start_offset = overlap_within_step_start - step_start;
+                            let end_offset = overlap_within_step_end - step_start;
+                            (start_offset, end_offset)
+                        } else {
+                            // Reverse handle
+                            let start_offset = step_end - overlap_within_step_end;
+                            let end_offset = step_end - overlap_within_step_start;
+                            (start_offset, end_offset)
+                        };
 
                         if step_start < overlap_start && step_end > overlap_end {
                             // Split into three parts

From 3a2b2597af3c95a4fd1c1f412a59813a39bc6a2f Mon Sep 17 00:00:00 2001
From: "AndreaGuarracino (aider)" <andreaguarracino@outlook.com>
Date: Thu, 12 Dec 2024 15:47:00 -0600
Subject: [PATCH 11/45] fix: Prevent panic by checking for empty sequences
 before node creation

---
 src/main.rs | 116 +++++++++++++++++++++++++++++-----------------------
 1 file changed, 64 insertions(+), 52 deletions(-)

diff --git a/src/main.rs b/src/main.rs
index f38d52d..b05748e 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -346,64 +346,76 @@ fn main() {
                             let left_seq = &node_seq[0..overlap_start_offset];
                             let right_seq = &node_seq[overlap_end_offset..];
 
-                            let left_id = NodeId::from(next_node_id_value);
-                            next_node_id_value += 1;
-                            let right_id = NodeId::from(next_node_id_value);
-                            next_node_id_value += 1;
-
-                            let left_node = combined_graph.create_handle(left_seq, left_id);
-                            let right_node = combined_graph.create_handle(right_seq, right_id);
-
-                            let left_handle = if step_handle.is_reverse() {
-                                left_node.flip()
-                            } else {
-                                left_node
-                            };
-                            let right_handle = if step_handle.is_reverse() {
-                                right_node.flip()
-                            } else {
-                                right_node
-                            };
-
-                            new_steps.push(left_handle);
-                            new_step_positions.push((step_start, overlap_start));
-                            new_step_lengths.push(left_seq.len());
-
-                            new_steps.push(right_handle);
-                            new_step_positions.push((overlap_end, step_end));
-                            new_step_lengths.push(right_seq.len());
-
-                            combined_graph.create_edge(Edge(left_handle, right_handle));
+                            // Only create left node if sequence is not empty
+                            let mut left_handle = None;
+                            if !left_seq.is_empty() {
+                                let left_id = NodeId::from(next_node_id_value);
+                                next_node_id_value += 1;
+                                let left_node = combined_graph.create_handle(left_seq, left_id);
+                                left_handle = Some(if step_handle.is_reverse() {
+                                    left_node.flip()
+                                } else {
+                                    left_node
+                                });
+                                new_steps.push(left_handle.unwrap());
+                                new_step_positions.push((step_start, overlap_start));
+                                new_step_lengths.push(left_seq.len());
+                            }
+
+                            // Only create right node if sequence is not empty
+                            let mut right_handle = None;
+                            if !right_seq.is_empty() {
+                                let right_id = NodeId::from(next_node_id_value);
+                                next_node_id_value += 1;
+                                let right_node = combined_graph.create_handle(right_seq, right_id);
+                                right_handle = Some(if step_handle.is_reverse() {
+                                    right_node.flip()
+                                } else {
+                                    right_node
+                                });
+                                new_steps.push(right_handle.unwrap());
+                                new_step_positions.push((overlap_end, step_end));
+                                new_step_lengths.push(right_seq.len());
+                            }
+
+                            // Create edge only if both nodes exist
+                            if let (Some(left), Some(right)) = (left_handle, right_handle) {
+                                combined_graph.create_edge(Edge(left, right));
+                            }
                         } else if step_start < overlap_start {
                             // Keep left part
                             let new_seq = &node_seq[0..overlap_start_offset];
-                            let node_id = NodeId::from(next_node_id_value);
-                            next_node_id_value += 1;
-                            let new_node = combined_graph.create_handle(new_seq, node_id);
-                            let new_handle = if step_handle.is_reverse() {
-                                new_node.flip()
-                            } else {
-                                new_node
-                            };
-
-                            new_steps.push(new_handle);
-                            new_step_positions.push((step_start, overlap_start));
-                            new_step_lengths.push(new_seq.len());
+                            if !new_seq.is_empty() {
+                                let node_id = NodeId::from(next_node_id_value);
+                                next_node_id_value += 1;
+                                let new_node = combined_graph.create_handle(new_seq, node_id);
+                                let new_handle = if step_handle.is_reverse() {
+                                    new_node.flip()
+                                } else {
+                                    new_node
+                                };
+
+                                new_steps.push(new_handle);
+                                new_step_positions.push((step_start, overlap_start));
+                                new_step_lengths.push(new_seq.len());
+                            }
                         } else if step_end > overlap_end {
                             // Keep right part
                             let new_seq = &node_seq[overlap_end_offset..];
-                            let node_id = NodeId::from(next_node_id_value);
-                            next_node_id_value += 1;
-                            let new_node = combined_graph.create_handle(new_seq, node_id);
-                            let new_handle = if step_handle.is_reverse() {
-                                new_node.flip()
-                            } else {
-                                new_node
-                            };
-
-                            new_steps.push(new_handle);
-                            new_step_positions.push((overlap_end, step_end));
-                            new_step_lengths.push(new_seq.len());
+                            if !new_seq.is_empty() {
+                                let node_id = NodeId::from(next_node_id_value);
+                                next_node_id_value += 1;
+                                let new_node = combined_graph.create_handle(new_seq, node_id);
+                                let new_handle = if step_handle.is_reverse() {
+                                    new_node.flip()
+                                } else {
+                                    new_node
+                                };
+
+                                new_steps.push(new_handle);
+                                new_step_positions.push((overlap_end, step_end));
+                                new_step_lengths.push(new_seq.len());
+                            }
                         }
                     } else {
                         // Keep steps that are not to be removed or split

From 1fc2e9f205935e655166e58eacd674fd0e29dcc6 Mon Sep 17 00:00:00 2001
From: "AndreaGuarracino (aider)" <andreaguarracino@outlook.com>
Date: Thu, 12 Dec 2024 15:49:18 -0600
Subject: [PATCH 12/45] fix: Improve overlap handling to prevent sequence
 discrepancies in GFA processing

---
 src/main.rs | 30 ++++++++++++++++++------------
 1 file changed, 18 insertions(+), 12 deletions(-)

diff --git a/src/main.rs b/src/main.rs
index b05748e..8c81447 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -329,15 +329,17 @@ fn main() {
                         let overlap_within_step_start = std::cmp::max(step_start, overlap_start);
                         let overlap_within_step_end = std::cmp::min(step_end, overlap_end);
                         
+                        // Calculate offsets relative to the node sequence
+                        let node_len = node_seq.len();
                         let (overlap_start_offset, overlap_end_offset) = if !step_handle.is_reverse() {
-                            // Forward handle
-                            let start_offset = overlap_within_step_start - step_start;
-                            let end_offset = overlap_within_step_end - step_start;
+                            // Forward handle - offsets from start of sequence
+                            let start_offset = (overlap_within_step_start - step_start).min(node_len);
+                            let end_offset = (overlap_within_step_end - step_start).min(node_len);
                             (start_offset, end_offset)
                         } else {
-                            // Reverse handle
-                            let start_offset = step_end - overlap_within_step_end;
-                            let end_offset = step_end - overlap_within_step_start;
+                            // Reverse handle - offsets from end of sequence
+                            let start_offset = (step_end - overlap_within_step_end).min(node_len);
+                            let end_offset = (step_end - overlap_within_step_start).min(node_len);
                             (start_offset, end_offset)
                         };
 
@@ -346,9 +348,9 @@ fn main() {
                             let left_seq = &node_seq[0..overlap_start_offset];
                             let right_seq = &node_seq[overlap_end_offset..];
 
-                            // Only create left node if sequence is not empty
+                            // Only create left node if sequence is not empty and within bounds
                             let mut left_handle = None;
-                            if !left_seq.is_empty() {
+                            if !left_seq.is_empty() && overlap_start_offset > 0 {
                                 let left_id = NodeId::from(next_node_id_value);
                                 next_node_id_value += 1;
                                 let left_node = combined_graph.create_handle(left_seq, left_id);
@@ -358,13 +360,15 @@ fn main() {
                                     left_node
                                 });
                                 new_steps.push(left_handle.unwrap());
-                                new_step_positions.push((step_start, overlap_start));
+                                let left_start = step_start;
+                                let left_end = left_start + left_seq.len();
+                                new_step_positions.push((left_start, left_end));
                                 new_step_lengths.push(left_seq.len());
                             }
 
-                            // Only create right node if sequence is not empty
+                            // Only create right node if sequence is not empty and within bounds
                             let mut right_handle = None;
-                            if !right_seq.is_empty() {
+                            if !right_seq.is_empty() && overlap_end_offset < node_seq.len() {
                                 let right_id = NodeId::from(next_node_id_value);
                                 next_node_id_value += 1;
                                 let right_node = combined_graph.create_handle(right_seq, right_id);
@@ -374,7 +378,9 @@ fn main() {
                                     right_node
                                 });
                                 new_steps.push(right_handle.unwrap());
-                                new_step_positions.push((overlap_end, step_end));
+                                let right_start = overlap_end;
+                                let right_end = right_start + right_seq.len();
+                                new_step_positions.push((right_start, right_end));
                                 new_step_lengths.push(right_seq.len());
                             }
 

From 0a120485c5e652cdf6a14bfeaca0b083bc0c8492 Mon Sep 17 00:00:00 2001
From: "AndreaGuarracino (aider)" <andreaguarracino@outlook.com>
Date: Thu, 12 Dec 2024 15:51:36 -0600
Subject: [PATCH 13/45] fix: Correct overlap region calculation to prevent
 extra bases in alignment

---
 src/main.rs | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/main.rs b/src/main.rs
index 8c81447..95f6f6e 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -278,9 +278,9 @@ fn main() {
             let r2 = &mut right[0];
 
             if has_overlap(r1, r2) {
-                // Calculate the overlap region
-                let overlap_start = r2.start;
-                let overlap_end = r1.end;
+                // Calculate the overlap region - use max/min to get precise overlap bounds
+                let overlap_start = std::cmp::max(r1.start, r2.start);
+                let overlap_end = std::cmp::min(r1.end, r2.end);
 
                 if args.debug {
                     let overlap_amount = overlap_end - overlap_start;

From fa68310aeb34da583786e855a1b10b53e7f51519 Mon Sep 17 00:00:00 2001
From: "AndreaGuarracino (aider)" <andreaguarracino@outlook.com>
Date: Thu, 12 Dec 2024 16:46:24 -0600
Subject: [PATCH 14/45] fix: Update r2.start and r2.end after trimming
 overlapping steps

---
 src/main.rs | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/src/main.rs b/src/main.rs
index 95f6f6e..a845b8d 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -446,7 +446,15 @@ fn main() {
                     }
                 }
 
-                r2.start = overlap_end;
+                // Update r2.start and r2.end based on the new step positions
+                if !r2.step_positions.is_empty() {
+                    r2.start = r2.step_positions.first().unwrap().0;
+                    r2.end = r2.step_positions.last().unwrap().1;
+                } else {
+                    // If no steps remain, set start and end to overlap_end to effectively remove this range
+                    r2.start = overlap_end;
+                    r2.end = overlap_end;
+                }
             }
         }
 

From 500108a943db5a1944d15b383958884e5946a68b Mon Sep 17 00:00:00 2001
From: AndreaGuarracino <andreaguarracino@outlook.com>
Date: Fri, 13 Dec 2024 14:02:25 -0600
Subject: [PATCH 15/45] wip

---
 src/main.rs | 120 +++++++++++++++++++++++++++++++++-------------------
 1 file changed, 76 insertions(+), 44 deletions(-)

diff --git a/src/main.rs b/src/main.rs
index a845b8d..67b633e 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -198,8 +198,9 @@ fn main() {
                             cumulative_pos = cumulative_pos + node_length;
                         }
                     }
-                    
-                    path_key_ranges.entry(sample_hap_name)
+
+                    if !translated_steps.is_empty() {
+                        path_key_ranges.entry(sample_hap_name)
                         .or_default()
                         .push(RangeInfo { 
                             start, 
@@ -209,15 +210,17 @@ fn main() {
                             step_positions,
                             step_lengths,
                         });
+                    } else if args.debug {
+                        eprintln!("  Warning: Path '{}' has no steps", path_name);
+                    }
                 }
             }
         }
     }
 
     if args.debug {
-        eprintln!("Total nodes in the combined graph: {}", combined_graph.node_count());
-        eprintln!("Total edges in the combined graph: {}", combined_graph.edge_count());
-        eprintln!("Total path ranges collected: {}", path_key_ranges.len());
+        eprintln!("Collected {} nodes, {} edges, {} paths, and {} path ranges from all GFA files",
+            combined_graph.node_count(), combined_graph.edge_count(), path_key_ranges.len(), path_key_ranges.iter().map(|(_, ranges)| ranges.len()).sum::<usize>());
     }
 
     let mut next_node_id_value = u64::from(combined_graph.max_node_id()) + 1;
@@ -230,45 +233,60 @@ fn main() {
         if args.debug {
             eprintln!("Processing path key '{}'", path_key);
             for range in ranges.iter() {
-                eprintln!("  Range: start={}, end={}, gfa_id={}", range.start, range.end, range.gfa_id);
+                eprintln!("  Range: start={}, end={}, num.steps={}, gfa_id={}", range.start, range.end, range.steps.len(), range.gfa_id);
             }
+
+            eprintln!("Removing redundant ranges");
         }
 
         // Remove ranges that are contained within other ranges
-        if !ranges.is_empty() {
-            let mut filtered_ranges = Vec::new();
-            let mut prev_range = ranges[0].clone();
-
-            for range in ranges.iter().skip(1)  {
-                if range.start >= prev_range.start && range.end <= prev_range.end {
-                    // Current range is fully contained within prev_range, skip it
-                    if args.debug {
-                        eprintln!(
-                            "Redundant range detected: Range [start={}, end={}] is contained within [start={}, end={}] and will be removed.",
-                            range.start, range.end, prev_range.start, prev_range.end
-                        );
-                    }
-                    continue;
-                } else if prev_range.start >= range.start && prev_range.end <= range.end {
-                    // Prev_range is fully contained within current range, replace prev_range with current range
-                    if args.debug {
-                        eprintln!(
-                            "Redundant range detected: Previous range [start={}, end={}] is contained within [start={}, end={}] and will be replaced.",
-                            prev_range.start, prev_range.end, range.start, range.end
-                        );
-                    }
-                    prev_range = range.clone();
-                } else {
-                    // No containment, add the previous range and update prev_range
-                    filtered_ranges.push(prev_range.clone());
-                    prev_range = range.clone();
+        let mut write_idx = 0;
+        for read_idx in 1..ranges.len() {
+            let (prev_start, prev_end) = (ranges[write_idx].start, ranges[write_idx].end);
+            let (curr_start, curr_end) = (ranges[read_idx].start, ranges[read_idx].end);
+
+            if curr_start == prev_start && curr_end == prev_end {
+                // Current range is a duplicate of the previous range, skip it
+                if args.debug {
+                    eprintln!(
+                        "  Duplicate range detected: Range [start={}, end={}] is identical to previous range and will be removed.",
+                        curr_start, curr_end
+                    );
+                }
+                continue;
+            } else if curr_start >= prev_start && curr_end <= prev_end {
+                // Current range is contained within previous range, skip it
+                continue;
+            } else if prev_start >= curr_start && prev_end <= curr_end {
+                // Previous range is contained within current range
+                if args.debug {
+                    eprintln!(
+                        "  Redundant range detected: Range [start={}, end={}, len={}] is contained within [start={}, end={} | len={}] and will be removed.",
+                        prev_start, prev_end, prev_end - prev_start, curr_start, curr_end, curr_end - curr_start
+                    );
+                }
+                ranges.swap(write_idx, read_idx);
+            } else {
+                // No containment, advance write_idx and copy current range
+                write_idx += 1;
+                if write_idx != read_idx {
+                    ranges.swap(write_idx, read_idx);
                 }
             }
+        }
 
-            // Don't forget to add the last range
-            filtered_ranges.push(prev_range);
+        // Truncate the ranges vector to remove any skipped elements
+        ranges.truncate(write_idx + 1);
 
-            *ranges = filtered_ranges;
+        if args.debug {
+            eprintln!("Path key '{}' without redundancy", path_key);
+            for range in ranges.iter() {
+                eprintln!("  Range: start={}, end={}, num.steps={}, gfa_id={}", range.start, range.end, range.steps.len(), range.gfa_id);
+            }
+        }
+
+        if args.debug {
+            eprintln!("Trimming overlapping ranges");
         }
 
         // Process overlaps
@@ -286,7 +304,7 @@ fn main() {
                     let overlap_amount = overlap_end - overlap_start;
 
                     eprintln!(
-                        "Overlap detected: Range1 [start={}, end={}], Range2 [start={}, end={}], Overlap [start={}, end={}], Overlap size={}",
+                        "  Overlap detected: Range1 [start={}, end={}], Range2 [start={}, end={}], Overlap [start={}, end={}], Overlap size={}",
                         r1.start, r1.end, r2.start, r2.end, overlap_start, overlap_end, overlap_amount
                     );
                 }
@@ -455,25 +473,38 @@ fn main() {
                     r2.start = overlap_end;
                     r2.end = overlap_end;
                 }
+
+
+                if args.debug {
+                    eprintln!("    Updated overlaps: Range2 [start={}, end={}]", r2.start, r2.end);
+                }
+            }
+        }
+
+        if args.debug {
+            eprintln!("Path key '{}' without overlaps", path_key);
+            for range in ranges.iter() {
+                eprintln!("  Range: start={}, end={}, num.steps={}, gfa_id={}", range.start, range.end, range.steps.len(), range.gfa_id);
             }
         }
 
         // Check for overlaps and contiguity
-        let mut has_overlaps = false;
         let mut all_contiguous = true;
         
         for window in ranges.windows(2) {
             if has_overlap(&window[0], &window[1]) {
-                has_overlaps = true;
+                if args.debug {
+                    eprintln!("Unresolved overlaps detected between ranges: [start={}, end={}] and [start={}, end={}]", 
+                    window[0].start, window[0].end, window[1].start, window[1].end);
+                }
+                panic!("Unresolved overlaps detected in path key '{}'", path_key);
             }
             if !is_contiguous(&window[0], &window[1]) {
                 all_contiguous = false;
             }
         }
-        
-        if (has_overlaps || !all_contiguous) && args.debug {
-            eprintln!("  Path key '{}' ranges analysis:", path_key);
-            
+                
+        if !all_contiguous && args.debug {
             let mut current_start = ranges[0].start;
             let mut current_end = ranges[0].end;
             let mut current_gfa_ids = vec![ranges[0].gfa_id];
@@ -510,7 +541,7 @@ fn main() {
                 current_start, current_end, current_gfa_ids);
         }
 
-        if all_contiguous && !has_overlaps {
+        if all_contiguous {
             // Create a single path with the original key
             let path_id = combined_graph.create_path(path_key.as_bytes(), false).unwrap();
             let mut prev_step = None;
@@ -546,6 +577,7 @@ fn main() {
                 
                 // Create path name with range information
                 let path_name = format!("{}:{}-{}", path_key, start_range.start, end_range.end);
+                println!("Creating path: {} with {} steps", path_name, steps.len());
                 let path_id = combined_graph.create_path(path_name.as_bytes(), false).unwrap();
                 
                 // Add steps to the path

From 6ed8e4b5f6f1ad0758e114177aa2bf709412f899 Mon Sep 17 00:00:00 2001
From: AndreaGuarracino <andreaguarracino@outlook.com>
Date: Fri, 13 Dec 2024 14:10:39 -0600
Subject: [PATCH 16/45] chore: Add a blank line before the main function for
 better readability

---
 src/main.rs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/main.rs b/src/main.rs
index 67b633e..8749343 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -132,6 +132,7 @@ fn write_graph_to_gfa(graph: &HashGraph, output_path: &str) -> std::io::Result<(
     
     Ok(())
 }
+
 fn main() {
     let args = Args::parse();
 

From b361a047e44ff748c3bf3d6063d3ecdccb2b2331 Mon Sep 17 00:00:00 2001
From: AndreaGuarracino <andreaguarracino@outlook.com>
Date: Fri, 13 Dec 2024 14:34:06 -0600
Subject: [PATCH 17/45] style: Update debug log messages for consistency and
 clarity

---
 src/main.rs | 24 ++++++++++++++++++------
 1 file changed, 18 insertions(+), 6 deletions(-)

diff --git a/src/main.rs b/src/main.rs
index 8749343..a83f937 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -212,7 +212,7 @@ fn main() {
                             step_lengths,
                         });
                     } else if args.debug {
-                        eprintln!("  Warning: Path '{}' has no steps", path_name);
+                        eprintln!("  Warning: path '{}' has no steps", path_name);
                     }
                 }
             }
@@ -275,7 +275,6 @@ fn main() {
                 }
             }
         }
-
         // Truncate the ranges vector to remove any skipped elements
         ranges.truncate(write_idx + 1);
 
@@ -315,20 +314,33 @@ fn main() {
                 let mut steps_to_split = Vec::new();
                 for (idx, &(step_start, step_end)) in r2.step_positions.iter().enumerate() {
                     if step_end <= overlap_start {
-                        //println!("\tStep {} before overlap", idx);
+                        if args.debug && r2.start == 154089 {
+                            eprintln!("    Step {} [start={}, end={}, len={}] before overlap", idx, step_start, step_end, step_end - step_start);
+                        }
                         continue;
                     } else if step_start >= overlap_end {
-                        //println!("\tStep {} after overlap", idx);
+                        if args.debug && r2.start == 154089 {
+                            eprintln!("    Step {} [start={}, end={}, len={}] after overlap", idx, step_start, step_end, step_end - step_start);
+                        }
                         break;
                     } else if step_start >= overlap_start && step_end <= overlap_end {
-                        //println!("\tStep {} within overlap", idx);
+                        if args.debug && r2.start == 154089 {
+                            eprintln!("    Step {} [start={}, end={}, len={}] fully overlaps", idx, step_start, step_end, step_end - step_start);
+                        }
                         steps_to_remove.push(idx);
                     } else {
-                        //println!("\tStep {} partially overlaps", idx);
+                        if args.debug && r2.start == 154089 {
+                            eprintln!("    Step {} [start={}, end={}, len={}] partially overlaps", idx, step_start, step_end, step_end - step_start);
+                        }
                         steps_to_split.push(idx);
                     }
                 }
 
+                if args.debug && r2.start == 154089 {
+                    eprintln!("    {} steps to remove", steps_to_remove.len());
+                    eprintln!("    Steps to split: {:?}", steps_to_split);   
+                }
+
                 // Initialize new vectors to store updated steps
                 let mut new_steps = Vec::new();
                 let mut new_step_positions = Vec::new();

From 54ba991e4122d3c20fce86c140ec5f8d822086c2 Mon Sep 17 00:00:00 2001
From: "AndreaGuarracino (aider)" <andreaguarracino@outlook.com>
Date: Fri, 13 Dec 2024 14:34:08 -0600
Subject: [PATCH 18/45] feat: Update to handle only one partially overlapping
 step at a time

---
 src/main.rs | 60 ++++++++---------------------------------------------
 1 file changed, 9 insertions(+), 51 deletions(-)

diff --git a/src/main.rs b/src/main.rs
index a83f937..17c0a0e 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -311,7 +311,7 @@ fn main() {
 
                 // Adjust r2 to remove the overlap
                 let mut steps_to_remove = Vec::new();
-                let mut steps_to_split = Vec::new();
+                let mut step_to_split: Option<usize> = None;
                 for (idx, &(step_start, step_end)) in r2.step_positions.iter().enumerate() {
                     if step_end <= overlap_start {
                         if args.debug && r2.start == 154089 {
@@ -332,13 +332,16 @@ fn main() {
                         if args.debug && r2.start == 154089 {
                             eprintln!("    Step {} [start={}, end={}, len={}] partially overlaps", idx, step_start, step_end, step_end - step_start);
                         }
-                        steps_to_split.push(idx);
+                        if step_to_split.is_some() {
+                            panic!("Error: More than one step is partially overlapping, which is not allowed.");
+                        }
+                        step_to_split = Some(idx);
                     }
                 }
 
                 if args.debug && r2.start == 154089 {
                     eprintln!("    {} steps to remove", steps_to_remove.len());
-                    eprintln!("    Steps to split: {:?}", steps_to_split);   
+                    eprintln!("    Step to split: {:?}", step_to_split);   
                 }
 
                 // Initialize new vectors to store updated steps
@@ -354,8 +357,8 @@ fn main() {
                     if steps_to_remove.contains(&idx) {
                         // Skip steps to remove
                         continue;
-                    } else if steps_to_split.contains(&idx) {
-                        // Split nodes for steps that partially overlap
+                    } else if step_to_split == Some(idx) {
+                        // Split node for the single partially overlapping step
                         let node_seq = combined_graph.sequence(step_handle).collect::<Vec<_>>();
                         let overlap_within_step_start = std::cmp::max(step_start, overlap_start);
                         let overlap_within_step_end = std::cmp::min(step_end, overlap_end);
@@ -374,52 +377,7 @@ fn main() {
                             (start_offset, end_offset)
                         };
 
-                        if step_start < overlap_start && step_end > overlap_end {
-                            // Split into three parts
-                            let left_seq = &node_seq[0..overlap_start_offset];
-                            let right_seq = &node_seq[overlap_end_offset..];
-
-                            // Only create left node if sequence is not empty and within bounds
-                            let mut left_handle = None;
-                            if !left_seq.is_empty() && overlap_start_offset > 0 {
-                                let left_id = NodeId::from(next_node_id_value);
-                                next_node_id_value += 1;
-                                let left_node = combined_graph.create_handle(left_seq, left_id);
-                                left_handle = Some(if step_handle.is_reverse() {
-                                    left_node.flip()
-                                } else {
-                                    left_node
-                                });
-                                new_steps.push(left_handle.unwrap());
-                                let left_start = step_start;
-                                let left_end = left_start + left_seq.len();
-                                new_step_positions.push((left_start, left_end));
-                                new_step_lengths.push(left_seq.len());
-                            }
-
-                            // Only create right node if sequence is not empty and within bounds
-                            let mut right_handle = None;
-                            if !right_seq.is_empty() && overlap_end_offset < node_seq.len() {
-                                let right_id = NodeId::from(next_node_id_value);
-                                next_node_id_value += 1;
-                                let right_node = combined_graph.create_handle(right_seq, right_id);
-                                right_handle = Some(if step_handle.is_reverse() {
-                                    right_node.flip()
-                                } else {
-                                    right_node
-                                });
-                                new_steps.push(right_handle.unwrap());
-                                let right_start = overlap_end;
-                                let right_end = right_start + right_seq.len();
-                                new_step_positions.push((right_start, right_end));
-                                new_step_lengths.push(right_seq.len());
-                            }
-
-                            // Create edge only if both nodes exist
-                            if let (Some(left), Some(right)) = (left_handle, right_handle) {
-                                combined_graph.create_edge(Edge(left, right));
-                            }
-                        } else if step_start < overlap_start {
+                        if step_start < overlap_start {
                             // Keep left part
                             let new_seq = &node_seq[0..overlap_start_offset];
                             if !new_seq.is_empty() {

From 8ebcec2d51952e346b5a0251a555cf4b8ded80c3 Mon Sep 17 00:00:00 2001
From: AndreaGuarracino <andreaguarracino@outlook.com>
Date: Fri, 13 Dec 2024 16:42:33 -0600
Subject: [PATCH 19/45] fix orientation and offsets

---
 src/main.rs | 106 +++++++++++++++++++++++++---------------------------
 1 file changed, 51 insertions(+), 55 deletions(-)

diff --git a/src/main.rs b/src/main.rs
index 17c0a0e..4166b40 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -212,7 +212,7 @@ fn main() {
                             step_lengths,
                         });
                     } else if args.debug {
-                        eprintln!("  Warning: path '{}' has no steps", path_name);
+                        eprintln!("  Warning: Path '{}' has no steps", path_name);
                     }
                 }
             }
@@ -314,24 +314,24 @@ fn main() {
                 let mut step_to_split: Option<usize> = None;
                 for (idx, &(step_start, step_end)) in r2.step_positions.iter().enumerate() {
                     if step_end <= overlap_start {
-                        if args.debug && r2.start == 154089 {
-                            eprintln!("    Step {} [start={}, end={}, len={}] before overlap", idx, step_start, step_end, step_end - step_start);
-                        }
+                        // if args.debug {
+                        //     eprintln!("    Step {} [start={}, end={}, len={}] before overlap", idx, step_start, step_end, step_end - step_start);
+                        // }
                         continue;
                     } else if step_start >= overlap_end {
-                        if args.debug && r2.start == 154089 {
-                            eprintln!("    Step {} [start={}, end={}, len={}] after overlap", idx, step_start, step_end, step_end - step_start);
-                        }
+                        // if args.debug {
+                        //     eprintln!("    Step {} [start={}, end={}, len={}] after overlap", idx, step_start, step_end, step_end - step_start);
+                        // }
                         break;
                     } else if step_start >= overlap_start && step_end <= overlap_end {
-                        if args.debug && r2.start == 154089 {
-                            eprintln!("    Step {} [start={}, end={}, len={}] fully overlaps", idx, step_start, step_end, step_end - step_start);
-                        }
+                        // if args.debug {
+                        //     eprintln!("    Step {} [start={}, end={}, len={}] fully overlaps", idx, step_start, step_end, step_end - step_start);
+                        // }
                         steps_to_remove.push(idx);
                     } else {
-                        if args.debug && r2.start == 154089 {
-                            eprintln!("    Step {} [start={}, end={}, len={}] partially overlaps", idx, step_start, step_end, step_end - step_start);
-                        }
+                        // if args.debug {
+                        //     eprintln!("    Step {} [start={}, end={}, len={}] partially overlaps", idx, step_start, step_end, step_end - step_start);
+                        // }
                         if step_to_split.is_some() {
                             panic!("Error: More than one step is partially overlapping, which is not allowed.");
                         }
@@ -340,7 +340,7 @@ fn main() {
                 }
 
                 if args.debug && r2.start == 154089 {
-                    eprintln!("    {} steps to remove", steps_to_remove.len());
+                    eprintln!("    Total steps to remove: {}", steps_to_remove.len());
                     eprintln!("    Step to split: {:?}", step_to_split);   
                 }
 
@@ -365,52 +365,48 @@ fn main() {
                         
                         // Calculate offsets relative to the node sequence
                         let node_len = node_seq.len();
-                        let (overlap_start_offset, overlap_end_offset) = if !step_handle.is_reverse() {
-                            // Forward handle - offsets from start of sequence
-                            let start_offset = (overlap_within_step_start - step_start).min(node_len);
-                            let end_offset = (overlap_within_step_end - step_start).min(node_len);
-                            (start_offset, end_offset)
-                        } else {
-                            // Reverse handle - offsets from end of sequence
-                            let start_offset = (step_end - overlap_within_step_end).min(node_len);
-                            let end_offset = (step_end - overlap_within_step_start).min(node_len);
-                            (start_offset, end_offset)
-                        };
+                        // Calculate offsets consistently regardless of strand
+                        let overlap_start_offset = (overlap_within_step_start - step_start).min(node_len);
+                        let overlap_end_offset = (overlap_within_step_end - step_start).min(node_len);
+
+                        if args.debug {
+                            eprintln!("    Splitting step {} [start={}, end={}, len={}] to remove overlap at [start={}, end={}]", 
+                                idx, step_start, step_end, step_end - step_start, overlap_within_step_start, overlap_within_step_end);
+                            eprintln!("    Overlap offsets: start={}, end={}", overlap_start_offset, overlap_end_offset);
+                        }
 
                         if step_start < overlap_start {
-                            // Keep left part
-                            let new_seq = &node_seq[0..overlap_start_offset];
-                            if !new_seq.is_empty() {
-                                let node_id = NodeId::from(next_node_id_value);
-                                next_node_id_value += 1;
-                                let new_node = combined_graph.create_handle(new_seq, node_id);
-                                let new_handle = if step_handle.is_reverse() {
-                                    new_node.flip()
-                                } else {
-                                    new_node
-                                };
-
-                                new_steps.push(new_handle);
-                                new_step_positions.push((step_start, overlap_start));
-                                new_step_lengths.push(new_seq.len());
+                            if args.debug {
+                                eprintln!("    Adding left part of step [start={}, end={}]", step_start, overlap_within_step_start);
                             }
+                            assert!(overlap_start_offset > 0);
+
+                            // Keep left part
+                            let new_seq = node_seq[0..overlap_start_offset].to_vec();
+
+                            let node_id = NodeId::from(next_node_id_value);
+                            next_node_id_value += 1;
+                            let new_node = combined_graph.create_handle(&new_seq, node_id);
+
+                            new_steps.push(new_node);
+                            new_step_positions.push((step_start, overlap_start));
+                            new_step_lengths.push(new_seq.len());
                         } else if step_end > overlap_end {
-                            // Keep right part
-                            let new_seq = &node_seq[overlap_end_offset..];
-                            if !new_seq.is_empty() {
-                                let node_id = NodeId::from(next_node_id_value);
-                                next_node_id_value += 1;
-                                let new_node = combined_graph.create_handle(new_seq, node_id);
-                                let new_handle = if step_handle.is_reverse() {
-                                    new_node.flip()
-                                } else {
-                                    new_node
-                                };
-
-                                new_steps.push(new_handle);
-                                new_step_positions.push((overlap_end, step_end));
-                                new_step_lengths.push(new_seq.len());
+                            if args.debug {
+                                eprintln!("    Adding right part of step [start={}, end={}]", overlap_within_step_end, step_end);
                             }
+                            assert!(overlap_end_offset < node_len);
+
+                            // Keep right part
+                            let new_seq = node_seq[overlap_end_offset..].to_vec();
+
+                            let node_id = NodeId::from(next_node_id_value);
+                            next_node_id_value += 1;
+                            let new_node = combined_graph.create_handle(&new_seq, node_id);
+
+                            new_steps.push(new_node);
+                            new_step_positions.push((overlap_end, step_end));
+                            new_step_lengths.push(new_seq.len());
                         }
                     } else {
                         // Keep steps that are not to be removed or split

From e9cc5054ef6e93dae5c977e602b9deb8d26490c1 Mon Sep 17 00:00:00 2001
From: AndreaGuarracino <andreaguarracino@outlook.com>
Date: Fri, 13 Dec 2024 16:43:56 -0600
Subject: [PATCH 20/45] update version

---
 Cargo.lock | 6 +++---
 Cargo.toml | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 0880f37..da73213 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -183,9 +183,9 @@ dependencies = [
 
 [[package]]
 name = "cc"
-version = "1.2.3"
+version = "1.2.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "27f657647bcff5394bf56c7317665bbf790a137a50eaaa5c6bfbb9e27a518f2d"
+checksum = "9157bbaa6b165880c27a4293a474c91cdcf265cc68cc829bf10be0964a391caf"
 dependencies = [
  "jobserver",
  "libc",
@@ -327,7 +327,7 @@ dependencies = [
 
 [[package]]
 name = "gfalace"
-version = "0.1.0"
+version = "0.1.1"
 dependencies = [
  "bstr 1.11.1",
  "clap",
diff --git a/Cargo.toml b/Cargo.toml
index b410919..896718d 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "gfalace"
-version = "0.1.0"
+version = "0.1.1"
 edition = "2021"
 
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

From 578bdc4eb5c6ef29127970be6b418ecb5f71c423 Mon Sep 17 00:00:00 2001
From: AndreaGuarracino <andreaguarracino@outlook.com>
Date: Fri, 13 Dec 2024 19:52:16 -0600
Subject: [PATCH 21/45] better containment management

---
 src/main.rs | 217 +++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 208 insertions(+), 9 deletions(-)

diff --git a/src/main.rs b/src/main.rs
index 4166b40..11fedf3 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -247,6 +247,7 @@ fn main() {
             let (curr_start, curr_end) = (ranges[read_idx].start, ranges[read_idx].end);
 
             if curr_start == prev_start && curr_end == prev_end {
+                // Skip duplicate range
                 // Current range is a duplicate of the previous range, skip it
                 if args.debug {
                     eprintln!(
@@ -256,17 +257,38 @@ fn main() {
                 }
                 continue;
             } else if curr_start >= prev_start && curr_end <= prev_end {
-                // Current range is contained within previous range, skip it
-                continue;
-            } else if prev_start >= curr_start && prev_end <= curr_end {
-                // Previous range is contained within current range
+                // Skip range that is fully contained within previous range
                 if args.debug {
                     eprintln!(
-                        "  Redundant range detected: Range [start={}, end={}, len={}] is contained within [start={}, end={} | len={}] and will be removed.",
-                        prev_start, prev_end, prev_end - prev_start, curr_start, curr_end, curr_end - curr_start
+                        "  Contained range detected: Range [start={}, end={}] is fully contained within previous range [start={}, end={}] and will be removed.",
+                        curr_start, curr_end, prev_start, prev_end
                     );
                 }
+                continue;
+            } else if prev_start >= curr_start && prev_end <= curr_end {
+                // Previous range is fully contained within current range
                 ranges.swap(write_idx, read_idx);
+            } else if curr_start < prev_end {
+                // Handle overlapping ranges
+                if read_idx < ranges.len() - 1 {
+                    let next_start = ranges[read_idx + 1].start;
+                    let next_end = ranges[read_idx + 1].end;
+                    
+                    // Only skip current range if it's substantially overlapped by both prev and next ranges
+                    if curr_start > prev_start && next_start < curr_end && next_end > curr_end {
+                        // Check if the current range doesn't extend far enough beyond previous range
+                        let extension_beyond_prev = curr_end - prev_end;
+                        let overlap_with_prev = prev_end - curr_start;
+                        if extension_beyond_prev < overlap_with_prev {
+                            continue;
+                        }
+                    }
+                }
+                // Keep current range
+                write_idx += 1;
+                if write_idx != read_idx {
+                    ranges.swap(write_idx, read_idx);
+                }
             } else {
                 // No containment, advance write_idx and copy current range
                 write_idx += 1;
@@ -314,17 +336,17 @@ fn main() {
                 let mut step_to_split: Option<usize> = None;
                 for (idx, &(step_start, step_end)) in r2.step_positions.iter().enumerate() {
                     if step_end <= overlap_start {
-                        // if args.debug {
+                        // if args.debug && r2.start == 11000 {
                         //     eprintln!("    Step {} [start={}, end={}, len={}] before overlap", idx, step_start, step_end, step_end - step_start);
                         // }
                         continue;
                     } else if step_start >= overlap_end {
-                        // if args.debug {
+                        // if args.debug && r2.start == 11000 {
                         //     eprintln!("    Step {} [start={}, end={}, len={}] after overlap", idx, step_start, step_end, step_end - step_start);
                         // }
                         break;
                     } else if step_start >= overlap_start && step_end <= overlap_end {
-                        // if args.debug {
+                        // if args.debug && r2.start == 11000 {
                         //     eprintln!("    Step {} [start={}, end={}, len={}] fully overlaps", idx, step_start, step_end, step_end - step_start);
                         // }
                         steps_to_remove.push(idx);
@@ -592,3 +614,180 @@ fn split_path_name(path_name: &str) -> Option<(String, usize, usize)> {
     }
     None
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    // Helper function to create a simple RangeInfo for testing
+    fn create_range_info(start: usize, end: usize, gfa_id: usize) -> RangeInfo {
+        RangeInfo {
+            start,
+            end,
+            gfa_id,
+            steps: vec![],           // Empty steps for testing
+            step_positions: vec![],   // Empty positions for testing
+            step_lengths: vec![],     // Empty lengths for testing
+        }
+    }
+
+    #[test]
+    fn test_range_containment_removal() {
+        // Test cases
+        let test_cases = vec![
+            // Test case 1: Basic containment
+            (
+                vec![(10, 50, 0), (20, 30, 1)],  // Input ranges (start, end, gfa_id)
+                vec![(10, 50, 0)],               // Expected result
+                "Basic containment"
+            ),
+            
+            // Test case 2: No containment
+            (
+                vec![(10, 20, 0), (30, 40, 1)],
+                vec![(10, 20, 0), (30, 40, 1)],
+                "No containment"
+            ),
+            
+            // Test case 3: Multiple contained ranges
+            (
+                vec![(10, 100, 0), (20, 30, 1), (40, 50, 2), (60, 70, 3)],
+                vec![(10, 100, 0)],
+                "Multiple contained ranges"
+            ),
+            
+            // Test case 4: Identical ranges
+            (
+                vec![(10, 20, 0), (10, 20, 1)],
+                vec![(10, 20, 0)],
+                "Identical ranges"
+            ),
+            
+            // Test case 5: Nested containment
+            (
+                vec![(10, 100, 0), (20, 80, 1), (30, 40, 2)],
+                vec![(10, 100, 0)],
+                "Nested containment"
+            ),
+            
+            // Test case 6: Partial overlap (should keep both)
+            (
+                vec![(10, 30, 0), (20, 40, 1)],
+                vec![(10, 30, 0), (20, 40, 1)],
+                "Partial overlap"
+            ),
+            
+            // Test case 7: Edge cases - touching ranges
+            (
+                vec![(10, 20, 0), (20, 30, 1)],
+                vec![(10, 20, 0), (20, 30, 1)],
+                "Touching ranges"
+            ),
+
+            // Test case 8: Overlapping ranges from same GFA
+            (
+                vec![(0, 11742, 0), (9714, 13000, 1), (11000, 19000, 1)],
+                vec![(0, 11742, 0), (11000, 19000, 1)],
+                "Overlapping ranges from same GFA"
+            ),
+
+            // Test case 9: Overlapping ranges with different GFA IDs
+            (
+                vec![(0, 11742, 0), (9714, 13000, 1), (11000, 19000, 2)],
+                vec![(0, 11742, 0), (11000, 19000, 2)],
+                "Overlapping ranges"
+            ),
+
+            // Test case 10: Overlapping ranges with different GFA IDs 2
+            (
+                vec![(0, 10, 0), (8, 20, 1), (15, 30, 2)],
+                vec![(0, 10, 0), (8, 20, 1), (15, 30, 2)],
+                "Overlapping ranges"
+            ),
+        ];
+
+        // Run each test case
+        for (case_index, (input_ranges, expected_ranges, case_name)) in test_cases.iter().enumerate() {
+            println!("Running test case {}: {}", case_index + 1, case_name);
+            
+            // Create input ranges
+            let mut ranges: Vec<RangeInfo> = input_ranges
+                .iter()
+                .map(|(start, end, gfa_id)| create_range_info(*start, *end, *gfa_id))
+                .collect();
+// Sort ranges by start position
+ranges.sort_by_key(|r| (r.start, r.end));
+
+let mut write_idx = 0;
+for read_idx in 1..ranges.len() {
+    let (prev_start, prev_end) = (ranges[write_idx].start, ranges[write_idx].end);
+    let (curr_start, curr_end) = (ranges[read_idx].start, ranges[read_idx].end);
+
+    if curr_start == prev_start && curr_end == prev_end {
+        // Skip duplicate range
+        continue;
+    } else if curr_start >= prev_start && curr_end <= prev_end {
+        // Skip range that is fully contained within previous range
+        continue;
+    } else if prev_start >= curr_start && prev_end <= curr_end {
+        // Previous range is fully contained within current range
+        ranges.swap(write_idx, read_idx);
+    } else if curr_start < prev_end {
+        // Handle overlapping ranges
+        if read_idx < ranges.len() - 1 {
+            let next_start = ranges[read_idx + 1].start;
+            let next_end = ranges[read_idx + 1].end;
+            
+            // Only skip current range if it's substantially overlapped by both prev and next ranges
+            if curr_start > prev_start && next_start < curr_end && next_end > curr_end {
+                // Check if the current range doesn't extend far enough beyond previous range
+                let extension_beyond_prev = curr_end - prev_end;
+                let overlap_with_prev = prev_end - curr_start;
+                if extension_beyond_prev < overlap_with_prev {
+                    continue;
+                }
+            }
+        }
+        // Keep current range
+        write_idx += 1;
+        if write_idx != read_idx {
+            ranges.swap(write_idx, read_idx);
+        }
+    } else {
+        // No overlap - keep both ranges
+        write_idx += 1;
+        if write_idx != read_idx {
+            ranges.swap(write_idx, read_idx);
+        }
+    }
+}
+ranges.truncate(write_idx + 1);
+
+            // Create expected ranges
+            let expected: Vec<RangeInfo> = expected_ranges
+                .iter()
+                .map(|(start, end, gfa_id)| create_range_info(*start, *end, *gfa_id))
+                .collect();
+
+            // Compare results
+            assert_eq!(
+                ranges.len(),
+                expected.len(),
+                "Test case '{}': Wrong number of ranges after containment removal",
+                case_name
+            );
+
+            for (i, (result, expected)) in ranges.iter().zip(expected.iter()).enumerate() {
+                assert_eq!(
+                    (result.start, result.end, result.gfa_id),
+                    (expected.start, expected.end, expected.gfa_id),
+                    "Test case '{}': Mismatch at position {}",
+                    case_name,
+                    i
+                );
+            }
+            
+            println!("Test case {} passed: {}", case_index + 1, case_name);
+        }
+    }
+}

From 6b24c9b5576222f97c20e4be75a7fbf9e8788baf Mon Sep 17 00:00:00 2001
From: AndreaGuarracino <andreaguarracino@outlook.com>
Date: Fri, 13 Dec 2024 19:55:26 -0600
Subject: [PATCH 22/45] more debugging log

---
 src/main.rs | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/src/main.rs b/src/main.rs
index 11fedf3..ed187bd 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -267,6 +267,12 @@ fn main() {
                 continue;
             } else if prev_start >= curr_start && prev_end <= curr_end {
                 // Previous range is fully contained within current range
+                if args.debug {
+                    eprintln!(
+                        "  Containing range detected: Previous range [start={}, end={}] is fully contained within current range [start={}, end={}] and will be removed.",
+                        prev_start, prev_end, curr_start, curr_end
+                    );
+                }
                 ranges.swap(write_idx, read_idx);
             } else if curr_start < prev_end {
                 // Handle overlapping ranges
@@ -280,6 +286,12 @@ fn main() {
                         let extension_beyond_prev = curr_end - prev_end;
                         let overlap_with_prev = prev_end - curr_start;
                         if extension_beyond_prev < overlap_with_prev {
+                            if args.debug {
+                                eprintln!(
+                                    "  Overlapping range detected: Range [start={}, end={}] is overlapped by both previous range [start={}, end={}] and next range [start={}, end={}] and will be removed.",
+                                    curr_start, curr_end, prev_start, prev_end, next_start, next_end
+                                );
+                            }
                             continue;
                         }
                     }

From 7b15330cf6ffc189c795a0ef7ffec19788dda48c Mon Sep 17 00:00:00 2001
From: AndreaGuarracino <andreaguarracino@outlook.com>
Date: Fri, 13 Dec 2024 20:50:53 -0600
Subject: [PATCH 23/45] manage complex cases

---
 src/main.rs | 128 ++++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 99 insertions(+), 29 deletions(-)

diff --git a/src/main.rs b/src/main.rs
index ed187bd..0554402 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -245,7 +245,7 @@ fn main() {
         for read_idx in 1..ranges.len() {
             let (prev_start, prev_end) = (ranges[write_idx].start, ranges[write_idx].end);
             let (curr_start, curr_end) = (ranges[read_idx].start, ranges[read_idx].end);
-
+        
             if curr_start == prev_start && curr_end == prev_end {
                 // Skip duplicate range
                 // Current range is a duplicate of the previous range, skip it
@@ -274,24 +274,84 @@ fn main() {
                     );
                 }
                 ranges.swap(write_idx, read_idx);
+            } else if curr_start < prev_end {
+                // Handle overlapping ranges - check both previous and next ranges
+                let mut should_skip = false;
+                
+                if read_idx < ranges.len() - 1 {
+                    let next_start = ranges[read_idx + 1].start;
+                    
+                    // Check if current range is significantly overlapped by both neighbors
+                    if curr_start > prev_start && next_start < curr_end {
+                        let overlap_with_prev = prev_end - curr_start;
+                        let overlap_with_next = curr_end - next_start;
+                        let range_length = curr_end - curr_start;
+                        
+                        // Skip if the range is mostly covered by its neighbors
+                        if overlap_with_prev + overlap_with_next > range_length {
+                            should_skip = true;
+                        }
+                    }
+                }
+                
+                if !should_skip {
+                    if args.debug {
+                        eprintln!(
+                            "  Overlapping range detected: Range [start={}, end={}] overlaps with previous range [start={}, end={}] and will be kept.",
+                            curr_start, curr_end, prev_start, prev_end
+                        );
+                    }
+                    write_idx += 1;
+                    if write_idx != read_idx {
+                        ranges.swap(write_idx, read_idx);
+                    }
+                }
+            } else {
+                // No overlap - keep both ranges
+                write_idx += 1;
+                if write_idx != read_idx {
+                    ranges.swap(write_idx, read_idx);
+                }
+            }
+        }
+        ranges.truncate(write_idx + 1);
+        
+
+        if args.debug {
+            eprintln!("Path key '{}' without redundancy", path_key);
+            for range in ranges.iter() {
+                eprintln!("  Range: start={}, end={}, num.steps={}, gfa_id={}", range.start, range.end, range.steps.len(), range.gfa_id);
+            }
+        }
+
+        let mut write_idx = 0;
+        for read_idx in 1..ranges.len() {
+            let (prev_start, prev_end) = (ranges[write_idx].start, ranges[write_idx].end);
+            let (curr_start, curr_end) = (ranges[read_idx].start, ranges[read_idx].end);
+        
+            if curr_start == prev_start && curr_end == prev_end {
+                // Skip duplicate range
+                continue;
+            } else if curr_start >= prev_start && curr_end <= prev_end {
+                // Skip range that is fully contained within previous range
+                continue;
+            } else if prev_start >= curr_start && prev_end <= curr_end {
+                // Previous range is fully contained within current range
+                ranges.swap(write_idx, read_idx);
             } else if curr_start < prev_end {
                 // Handle overlapping ranges
                 if read_idx < ranges.len() - 1 {
                     let next_start = ranges[read_idx + 1].start;
-                    let next_end = ranges[read_idx + 1].end;
                     
-                    // Only skip current range if it's substantially overlapped by both prev and next ranges
-                    if curr_start > prev_start && next_start < curr_end && next_end > curr_end {
-                        // Check if the current range doesn't extend far enough beyond previous range
-                        let extension_beyond_prev = curr_end - prev_end;
+                    // Check if current range is significantly overlapped by both previous and next ranges
+                    if curr_start > prev_start && next_start < curr_end {
+                        // Calculate overlap percentages
                         let overlap_with_prev = prev_end - curr_start;
-                        if extension_beyond_prev < overlap_with_prev {
-                            if args.debug {
-                                eprintln!(
-                                    "  Overlapping range detected: Range [start={}, end={}] is overlapped by both previous range [start={}, end={}] and next range [start={}, end={}] and will be removed.",
-                                    curr_start, curr_end, prev_start, prev_end, next_start, next_end
-                                );
-                            }
+                        let overlap_with_next = curr_end - next_start;
+                        let range_length = curr_end - curr_start;
+                        
+                        // If the range is mostly overlapped by both neighbors, skip it
+                        if overlap_with_prev + overlap_with_next > range_length {
                             continue;
                         }
                     }
@@ -302,18 +362,16 @@ fn main() {
                     ranges.swap(write_idx, read_idx);
                 }
             } else {
-                // No containment, advance write_idx and copy current range
+                // No overlap - keep both ranges
                 write_idx += 1;
                 if write_idx != read_idx {
                     ranges.swap(write_idx, read_idx);
                 }
             }
         }
-        // Truncate the ranges vector to remove any skipped elements
         ranges.truncate(write_idx + 1);
-
         if args.debug {
-            eprintln!("Path key '{}' without redundancy", path_key);
+            eprintln!("Path key '{}' without redundancy 2", path_key);
             for range in ranges.iter() {
                 eprintln!("  Range: start={}, end={}, num.steps={}, gfa_id={}", range.start, range.end, range.steps.len(), range.gfa_id);
             }
@@ -716,6 +774,13 @@ mod tests {
                 vec![(0, 10, 0), (8, 20, 1), (15, 30, 2)],
                 "Overlapping ranges"
             ),
+
+            // Test case 11: Overlapping ranges with different GFA IDs 3
+            (
+                vec![(8000, 11000, 0), (9694, 12313, 1), (10908, 13908, 2)],
+                vec![(8000, 11000, 0), (10908, 13908, 2)],
+                "Overlapping ranges"
+            ),
         ];
 
         // Run each test case
@@ -745,25 +810,30 @@ for read_idx in 1..ranges.len() {
         // Previous range is fully contained within current range
         ranges.swap(write_idx, read_idx);
     } else if curr_start < prev_end {
-        // Handle overlapping ranges
+        // Handle overlapping ranges - check both previous and next ranges
+        let mut should_skip = false;
+        
         if read_idx < ranges.len() - 1 {
             let next_start = ranges[read_idx + 1].start;
-            let next_end = ranges[read_idx + 1].end;
             
-            // Only skip current range if it's substantially overlapped by both prev and next ranges
-            if curr_start > prev_start && next_start < curr_end && next_end > curr_end {
-                // Check if the current range doesn't extend far enough beyond previous range
-                let extension_beyond_prev = curr_end - prev_end;
+            // Check if current range is significantly overlapped by both neighbors
+            if curr_start > prev_start && next_start < curr_end {
                 let overlap_with_prev = prev_end - curr_start;
-                if extension_beyond_prev < overlap_with_prev {
-                    continue;
+                let overlap_with_next = curr_end - next_start;
+                let range_length = curr_end - curr_start;
+                
+                // Skip if the range is mostly covered by its neighbors
+                if overlap_with_prev + overlap_with_next > range_length {
+                    should_skip = true;
                 }
             }
         }
-        // Keep current range
-        write_idx += 1;
-        if write_idx != read_idx {
-            ranges.swap(write_idx, read_idx);
+        
+        if !should_skip {
+            write_idx += 1;
+            if write_idx != read_idx {
+                ranges.swap(write_idx, read_idx);
+            }
         }
     } else {
         // No overlap - keep both ranges

From fdd78e19eac31a7fe84ef3cce5d840bd1649013b Mon Sep 17 00:00:00 2001
From: AndreaGuarracino <andreaguarracino@outlook.com>
Date: Fri, 13 Dec 2024 21:26:26 -0600
Subject: [PATCH 24/45] do not write unused nodes and edges

---
 Cargo.lock  | 40 ++++++++++++++++++++++++++++++++++
 Cargo.toml  |  1 +
 src/main.rs | 63 ++++++++++++++++++++++++++++++++++++++---------------
 3 files changed, 87 insertions(+), 17 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index da73213..aa32e9b 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -100,6 +100,18 @@ version = "1.3.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
 
+[[package]]
+name = "bitvec"
+version = "1.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1bc2832c24239b0141d5674bb9174f9d68a8b5b3f2753311927c172ca46f7e9c"
+dependencies = [
+ "funty",
+ "radium",
+ "tap",
+ "wyz",
+]
+
 [[package]]
 name = "boomphf"
 version = "0.5.9"
@@ -309,6 +321,12 @@ version = "1.0.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"
 
+[[package]]
+name = "funty"
+version = "2.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e6d5a32815ae3f33302d95fdcb2ce17862f8c65363dcfd29360480ba1001fc9c"
+
 [[package]]
 name = "gfa"
 version = "0.10.1"
@@ -329,6 +347,7 @@ dependencies = [
 name = "gfalace"
 version = "0.1.1"
 dependencies = [
+ "bitvec",
  "bstr 1.11.1",
  "clap",
  "gfa",
@@ -508,6 +527,12 @@ dependencies = [
  "proc-macro2",
 ]
 
+[[package]]
+name = "radium"
+version = "0.7.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dc33ff2d4973d518d823d61aa239014831e521c75da58e3df4840d3f47749d09"
+
 [[package]]
 name = "rand_core"
 version = "0.6.4"
@@ -634,6 +659,12 @@ dependencies = [
  "unicode-ident",
 ]
 
+[[package]]
+name = "tap"
+version = "1.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369"
+
 [[package]]
 name = "thiserror"
 version = "1.0.69"
@@ -776,6 +807,15 @@ dependencies = [
  "rand_core",
 ]
 
+[[package]]
+name = "wyz"
+version = "0.5.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "05f360fc0b24296329c78fda852a1e9ae82de9cf7b27dae4b7f62f118f77b9ed"
+dependencies = [
+ "tap",
+]
+
 [[package]]
 name = "zstd"
 version = "0.13.2"
diff --git a/Cargo.toml b/Cargo.toml
index 896718d..fc645eb 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -11,3 +11,4 @@ niffler = "2.6.0"
 clap = { version = "4.5.22", features = ["derive"] }
 handlegraph = { git = "https://github.com/chfi/rs-handlegraph", rev = "3ac575e4216ce16a16667503a8875e469a40a97a" }
 gfa = "0.10.1"
+bitvec = "1.0.1"
diff --git a/src/main.rs b/src/main.rs
index 0554402..17a6504 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -9,6 +9,7 @@ use handlegraph::pathhandlegraph::{IntoPathIds, GraphPathNames, GraphPathsRef, M
 use handlegraph::hashgraph::HashGraph;
 //use handlegraph::pathhandlegraph::PathStep;
 use gfa::{gfa::GFA, parser::GFAParser};
+use bitvec::{bitvec, prelude::BitVec};
 
 #[derive(Parser, Debug)]
 #[clap(author, version, about)]
@@ -70,14 +71,16 @@ fn read_gfa(gfa_path: &str, parser: &GFAParser<usize, ()>) -> std::io::Result<GF
     }
 }
 
-fn write_graph_to_gfa(graph: &HashGraph, output_path: &str) -> std::io::Result<()> {
+fn write_graph_to_gfa(graph: &HashGraph, output_path: &str, nodes_to_remove: &BitVec) -> std::io::Result<()> {
     let mut file = File::create(output_path)?;
     
     // Write GFA version
     writeln!(file, "H\tVN:Z:1.0")?;
     
-    // Collect and sort nodes by ID
-    let mut nodes: Vec<Handle> = graph.handles().collect();
+    // Collect and sort nodes by ID, excluding marked nodes
+    let mut nodes: Vec<Handle> = graph.handles()
+        .filter(|handle| !nodes_to_remove[u64::from(handle.id()) as usize])
+        .collect();
     nodes.sort_by_key(|handle| handle.id());
     
     // Write sorted nodes (Segments)
@@ -88,8 +91,13 @@ fn write_graph_to_gfa(graph: &HashGraph, output_path: &str) -> std::io::Result<(
         writeln!(file, "S\t{}\t{}", handle.id(), sequence_str)?;
     }
     
-    // Collect and sort edges
-    let mut edges: Vec<Edge> = graph.edges().collect();
+    // Collect and sort edges, excluding those connected to marked nodes
+    let mut edges: Vec<Edge> = graph.edges()
+        .filter(|edge| {
+            !nodes_to_remove[u64::from(edge.0.id()) as usize] && 
+            !nodes_to_remove[u64::from(edge.1.id()) as usize]
+        })
+        .collect();
     edges.sort_by(|a, b| {
         a.0.id().cmp(&b.0.id())
             .then(a.1.id().cmp(&b.1.id()))
@@ -104,16 +112,8 @@ fn write_graph_to_gfa(graph: &HashGraph, output_path: &str) -> std::io::Result<(
         writeln!(file, "L\t{}\t{}\t{}\t{}\t0M", from_id, from_orient, to_id, to_orient)?;
     }
     
-    // Collect and sort paths by name
-    let mut paths: Vec<_> = graph.path_ids().collect();
-    paths.sort_by_key(|&path_id| {
-        graph.get_path_name(path_id)
-            .map(|name_iter| name_iter.collect::<Vec<u8>>())
-            .unwrap_or_default()
-    });
-    
-    // Write sorted paths
-    for path_id in paths {
+    // Write paths
+    for path_id in graph.path_ids() {
         if let Some(name_iter) = graph.get_path_name(path_id) {
             let path_name = String::from_utf8(name_iter.collect::<Vec<u8>>())
                 .unwrap_or_else(|_| String::from("unknown_path"));
@@ -661,13 +661,42 @@ fn main() {
         eprintln!("Total paths created: {}", GraphPaths::path_count(&combined_graph));
     }
 
-    // Write the combined graph to GFA file
-    match write_graph_to_gfa(&combined_graph, &args.output) {
+    // After all paths are created but before writing the graph
+    if args.debug {
+        eprintln!("Total nodes before filtering: {}", combined_graph.node_count());
+    }
+    
+    let nodes_to_skip = mark_nodes_for_removal(&combined_graph);
+    
+    if args.debug {
+        eprintln!("Nodes to be filtered out: {}", nodes_to_skip.count_ones());
+    }
+
+    // Write the combined graph to GFA file, skipping unused nodes
+    match write_graph_to_gfa(&combined_graph, &args.output, &nodes_to_skip) {
         Ok(_) => if args.debug {eprintln!("Successfully wrote combined graph to {}", args.output)},
         Err(e) => eprintln!("Error writing GFA file: {}", e),
     }
 }
 
+
+fn mark_nodes_for_removal(graph: &HashGraph) -> BitVec {
+    // Create a bitvector with all nodes initially marked for removal
+    let max_node_id = u64::from(graph.max_node_id());
+    let mut nodes_to_remove = bitvec![1; max_node_id as usize + 1];
+    
+    // Mark nodes used in paths as not to be removed (set bit to 0)
+    for path_id in graph.path_ids() {
+        if let Some(path_ref) = graph.get_path_ref(path_id) {
+            for handle in &path_ref.nodes {
+                nodes_to_remove.set(u64::from(handle.id()) as usize, false);
+            }
+        }
+    }
+    
+    nodes_to_remove
+}
+
 fn split_path_name(path_name: &str) -> Option<(String, usize, usize)> {
     // Find the last ':' to split the range from the key
     if let Some(last_colon) = path_name.rfind(':') {

From 8e4b32981dcdc3ba5929cf47cf5fe4ad192a46d0 Mon Sep 17 00:00:00 2001
From: AndreaGuarracino <andreaguarracino@outlook.com>
Date: Fri, 13 Dec 2024 21:37:02 -0600
Subject: [PATCH 25/45] optimized node IDs

---
 src/main.rs | 48 +++++++++++++++++++++++++++++++++++-------------
 1 file changed, 35 insertions(+), 13 deletions(-)

diff --git a/src/main.rs b/src/main.rs
index 17a6504..21350b8 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -77,18 +77,27 @@ fn write_graph_to_gfa(graph: &HashGraph, output_path: &str, nodes_to_remove: &Bi
     // Write GFA version
     writeln!(file, "H\tVN:Z:1.0")?;
     
-    // Collect and sort nodes by ID, excluding marked nodes
+    // Collect nodes, excluding marked nodes
     let mut nodes: Vec<Handle> = graph.handles()
         .filter(|handle| !nodes_to_remove[u64::from(handle.id()) as usize])
         .collect();
     nodes.sort_by_key(|handle| handle.id());
     
-    // Write sorted nodes (Segments)
-    for handle in nodes {
-        let sequence = graph.sequence(handle).collect::<Vec<_>>();
+    // Create mapping from old IDs to new sequential IDs using a Vec
+    // We allocate a vector with capacity for the max node ID
+    let max_id = u64::from(nodes.last().unwrap().id()) as usize;
+    let mut id_mapping = vec![0; max_id + 1];
+    for (new_id, handle) in nodes.iter().enumerate() {
+        id_mapping[u64::from(handle.id()) as usize] = new_id + 1; // +1 to start from 1
+    }
+    
+    // Write nodes with new IDs
+    for handle in &nodes {
+        let sequence = graph.sequence(*handle).collect::<Vec<_>>();
         let sequence_str = String::from_utf8(sequence)
             .unwrap_or_else(|_| String::from("N"));
-        writeln!(file, "S\t{}\t{}", handle.id(), sequence_str)?;
+        let new_id = id_mapping[u64::from(handle.id()) as usize];
+        writeln!(file, "S\t{}\t{}", new_id, sequence_str)?;
     }
     
     // Collect and sort edges, excluding those connected to marked nodes
@@ -103,17 +112,25 @@ fn write_graph_to_gfa(graph: &HashGraph, output_path: &str, nodes_to_remove: &Bi
             .then(a.1.id().cmp(&b.1.id()))
     });
     
-    // Write sorted edges (Links)
+    // Write edges with new IDs
     for edge in edges {
-        let from_id = edge.0.id();
-        let to_id = edge.1.id();
+        let from_id = id_mapping[u64::from(edge.0.id()) as usize];
+        let to_id = id_mapping[u64::from(edge.1.id()) as usize];
         let from_orient = if edge.0.is_reverse() { "-" } else { "+" };
         let to_orient = if edge.1.is_reverse() { "-" } else { "+" };
         writeln!(file, "L\t{}\t{}\t{}\t{}\t0M", from_id, from_orient, to_id, to_orient)?;
     }
     
-    // Write paths
-    for path_id in graph.path_ids() {
+    // Collect and sort paths by name
+    let mut paths: Vec<_> = graph.path_ids().collect();
+    paths.sort_by_key(|&path_id| {
+        graph.get_path_name(path_id)
+            .map(|name_iter| name_iter.collect::<Vec<u8>>())
+            .unwrap_or_default()
+    });
+
+    // Write paths with new IDs
+    for path_id in paths {
         if let Some(name_iter) = graph.get_path_name(path_id) {
             let path_name = String::from_utf8(name_iter.collect::<Vec<u8>>())
                 .unwrap_or_else(|_| String::from("unknown_path"));
@@ -121,12 +138,17 @@ fn write_graph_to_gfa(graph: &HashGraph, output_path: &str, nodes_to_remove: &Bi
             let mut path_elements = Vec::new();
             if let Some(path_ref) = graph.get_path_ref(path_id) {
                 for handle in &path_ref.nodes {
-                    let orient = if handle.is_reverse() { "-" } else { "+" };
-                    path_elements.push(format!("{}{}", handle.id(), orient));
+                    let new_id = id_mapping[u64::from(handle.id()) as usize];
+                    if new_id > 0 { // Only include nodes that weren't removed
+                        let orient = if handle.is_reverse() { "-" } else { "+" };
+                        path_elements.push(format!("{}{}", new_id, orient));
+                    }
                 }
             }
             
-            writeln!(file, "P\t{}\t{}\t*", path_name, path_elements.join(","))?;
+            if !path_elements.is_empty() {
+                writeln!(file, "P\t{}\t{}\t*", path_name, path_elements.join(","))?;
+            }
         }
     }
     

From f5959ab529ecf2274152eae599e1be47026959cb Mon Sep 17 00:00:00 2001
From: AndreaGuarracino <andreaguarracino@outlook.com>
Date: Fri, 13 Dec 2024 21:47:20 -0600
Subject: [PATCH 26/45] better management of temp files

---
 Cargo.lock  | 63 ++++++++++++++++++++++++++++++++++-
 Cargo.toml  |  1 +
 src/main.rs | 94 +++++++++++++++++++++++++++++++++++++++--------------
 3 files changed, 133 insertions(+), 25 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index aa32e9b..3ecc40f 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -100,6 +100,12 @@ version = "1.3.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
 
+[[package]]
+name = "bitflags"
+version = "2.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b048fb63fd8b5923fc5aa7b340d8e156aec7ec02f0c78fa8a6ddc2613f6f71de"
+
 [[package]]
 name = "bitvec"
 version = "1.0.1"
@@ -305,6 +311,22 @@ version = "1.13.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0"
 
+[[package]]
+name = "errno"
+version = "0.3.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "33d852cb9b869c2a9b3df2f71a3074817f01e1844f839a144f5fcef059a4eb5d"
+dependencies = [
+ "libc",
+ "windows-sys",
+]
+
+[[package]]
+name = "fastrand"
+version = "2.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be"
+
 [[package]]
 name = "flate2"
 version = "1.0.35"
@@ -353,6 +375,7 @@ dependencies = [
  "gfa",
  "handlegraph",
  "niffler",
+ "tempfile",
 ]
 
 [[package]]
@@ -405,7 +428,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "6607c62aa161d23d17a9072cc5da0be67cdfc89d3afb1e8d9c842bebc2525ffe"
 dependencies = [
  "arrayvec",
- "bitflags",
+ "bitflags 1.3.2",
  "cfg-if",
  "ryu",
  "static_assertions",
@@ -437,6 +460,12 @@ dependencies = [
  "pkg-config",
 ]
 
+[[package]]
+name = "linux-raw-sys"
+version = "0.4.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89"
+
 [[package]]
 name = "log"
 version = "0.4.22"
@@ -503,6 +532,12 @@ dependencies = [
  "autocfg",
 ]
 
+[[package]]
+name = "once_cell"
+version = "1.20.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775"
+
 [[package]]
 name = "pkg-config"
 version = "0.3.31"
@@ -594,6 +629,19 @@ version = "0.8.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c"
 
+[[package]]
+name = "rustix"
+version = "0.38.42"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f93dc38ecbab2eb790ff964bb77fa94faf256fd3e73285fd7ba0903b76bedb85"
+dependencies = [
+ "bitflags 2.6.0",
+ "errno",
+ "libc",
+ "linux-raw-sys",
+ "windows-sys",
+]
+
 [[package]]
 name = "ryu"
 version = "1.0.18"
@@ -665,6 +713,19 @@ version = "1.0.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369"
 
+[[package]]
+name = "tempfile"
+version = "3.14.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "28cce251fcbc87fac86a866eeb0d6c2d536fc16d06f184bb61aeae11aa4cee0c"
+dependencies = [
+ "cfg-if",
+ "fastrand",
+ "once_cell",
+ "rustix",
+ "windows-sys",
+]
+
 [[package]]
 name = "thiserror"
 version = "1.0.69"
diff --git a/Cargo.toml b/Cargo.toml
index fc645eb..54238a5 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -12,3 +12,4 @@ clap = { version = "4.5.22", features = ["derive"] }
 handlegraph = { git = "https://github.com/chfi/rs-handlegraph", rev = "3ac575e4216ce16a16667503a8875e469a40a97a" }
 gfa = "0.10.1"
 bitvec = "1.0.1"
+tempfile = "3.14.0"
diff --git a/src/main.rs b/src/main.rs
index 21350b8..201de25 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,15 +1,26 @@
-use std::collections::{BTreeMap};
+use std::{
+    collections::BTreeMap,
+    fs::File,
+    io::{self, Write},
+    path::Path,
+};
 use clap::Parser;
-use std::fs::File;
-use std::io::Write;
-use handlegraph::handle::{Handle, NodeId, Edge};
-use handlegraph::handlegraph::*;
-use handlegraph::mutablehandlegraph::*;
-use handlegraph::pathhandlegraph::{IntoPathIds, GraphPathNames, GraphPathsRef, MutableGraphPaths, GraphPaths};
-use handlegraph::hashgraph::HashGraph;
-//use handlegraph::pathhandlegraph::PathStep;
+use handlegraph::{
+    handle::{Handle, NodeId, Edge},
+    handlegraph::*,
+    mutablehandlegraph::*,
+    pathhandlegraph::{
+        IntoPathIds, 
+        GraphPathNames, 
+        GraphPathsRef,
+        MutableGraphPaths,
+        GraphPaths
+    },
+    hashgraph::HashGraph,
+};
 use gfa::{gfa::GFA, parser::GFAParser};
 use bitvec::{bitvec, prelude::BitVec};
+use tempfile::NamedTempFile;
 
 #[derive(Parser, Debug)]
 #[clap(author, version, about)]
@@ -47,27 +58,62 @@ fn has_overlap(r1: &RangeInfo, r2: &RangeInfo) -> bool {
     r1.start < r2.end && r2.start < r1.end
 }
 
-// Helper function to read GFA file
-fn read_gfa(gfa_path: &str, parser: &GFAParser<usize, ()>) -> std::io::Result<GFA<usize, ()>> {
+fn read_gfa(gfa_path: &str, parser: &GFAParser<usize, ()>) -> io::Result<GFA<usize, ()>> {
     if gfa_path.ends_with(".gz") {
-        let file = std::fs::File::open(gfa_path)?;
-        let (mut reader, _format) = niffler::get_reader(Box::new(file)).unwrap();
+        let file = std::fs::File::open(gfa_path).map_err(|e| {
+            io::Error::new(
+                e.kind(),
+                format!("Failed to open gzipped file '{}': {}", gfa_path, e)
+            )
+        })?;
+        
+        let (mut reader, _format) = niffler::get_reader(Box::new(file))
+            .map_err(|e| io::Error::new(io::ErrorKind::Other, e))?;
         
         let mut decompressed = Vec::new();
-        reader.read_to_end(&mut decompressed)?;
+        reader.read_to_end(&mut decompressed).map_err(|e| {
+            io::Error::new(
+                e.kind(),
+                format!("Failed to decompress file '{}': {}", gfa_path, e)
+            )
+        })?;
         
-        let temp_path = format!("{}.tmp", gfa_path);
-        {
-            let mut temp_file = std::fs::File::create(&temp_path)?;
-            temp_file.write_all(&decompressed)?;
-        }
+        // Create temporary file in the same directory as the input file for better performance
+        let parent_dir = Path::new(gfa_path).parent().unwrap_or(Path::new("."));
+        let temp_file = NamedTempFile::new_in(parent_dir).map_err(|e| {
+            io::Error::new(
+                e.kind(),
+                format!("Failed to create temporary file: {}", e)
+            )
+        })?;
         
-        let result = parser.parse_file(&temp_path).unwrap();
-        std::fs::remove_file(&temp_path)?;
+        // Write decompressed data
+        temp_file.as_file().write_all(&decompressed).map_err(|e| {
+            io::Error::new(
+                e.kind(),
+                format!("Failed to write to temporary file: {}", e)
+            )
+        })?;
         
-        Ok(result)
+        // Parse GFA
+        parser.parse_file(temp_file.path().to_str().ok_or_else(|| {
+            io::Error::new(
+                io::ErrorKind::InvalidData,
+                "Invalid temporary file path"
+            )
+        })?).map_err(|e| {
+            io::Error::new(
+                io::ErrorKind::InvalidData,
+                format!("Failed to parse GFA: {}", e)
+            )
+        })
     } else {
-        Ok(parser.parse_file(gfa_path).unwrap())
+        parser.parse_file(gfa_path).map_err(|e| {
+            io::Error::new(
+                io::ErrorKind::InvalidData,
+                format!("Failed to parse GFA file '{}': {}", gfa_path, e)
+            )
+        })
     }
 }
 
@@ -120,7 +166,7 @@ fn write_graph_to_gfa(graph: &HashGraph, output_path: &str, nodes_to_remove: &Bi
         let to_orient = if edge.1.is_reverse() { "-" } else { "+" };
         writeln!(file, "L\t{}\t{}\t{}\t{}\t0M", from_id, from_orient, to_id, to_orient)?;
     }
-    
+
     // Collect and sort paths by name
     let mut paths: Vec<_> = graph.path_ids().collect();
     paths.sort_by_key(|&path_id| {

From 75fe859219b0934a2a69ca114b7ac1ef58f7427a Mon Sep 17 00:00:00 2001
From: AndreaGuarracino <andreaguarracino@outlook.com>
Date: Fri, 13 Dec 2024 21:55:23 -0600
Subject: [PATCH 27/45] FxHashMap

---
 Cargo.lock  | 7 +++++++
 Cargo.toml  | 1 +
 src/main.rs | 4 ++--
 3 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 3ecc40f..ea3e602 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -375,6 +375,7 @@ dependencies = [
  "gfa",
  "handlegraph",
  "niffler",
+ "rustc-hash",
  "tempfile",
 ]
 
@@ -629,6 +630,12 @@ version = "0.8.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c"
 
+[[package]]
+name = "rustc-hash"
+version = "2.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c7fb8039b3032c191086b10f11f319a6e99e1e82889c5cc6046f515c9db1d497"
+
 [[package]]
 name = "rustix"
 version = "0.38.42"
diff --git a/Cargo.toml b/Cargo.toml
index 54238a5..9110992 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -13,3 +13,4 @@ handlegraph = { git = "https://github.com/chfi/rs-handlegraph", rev = "3ac575e42
 gfa = "0.10.1"
 bitvec = "1.0.1"
 tempfile = "3.14.0"
+rustc-hash = "2.1.0"
diff --git a/src/main.rs b/src/main.rs
index 201de25..e8a91f4 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,9 +1,9 @@
 use std::{
-    collections::BTreeMap,
     fs::File,
     io::{self, Write},
     path::Path,
 };
+use rustc_hash::FxHashMap;
 use clap::Parser;
 use handlegraph::{
     handle::{Handle, NodeId, Edge},
@@ -206,7 +206,7 @@ fn main() {
 
     // Create a single combined graph
     let mut combined_graph = HashGraph::new();
-    let mut path_key_ranges: BTreeMap<String, Vec<RangeInfo>> = BTreeMap::new();
+    let mut path_key_ranges: FxHashMap<String, Vec<RangeInfo>> = FxHashMap::default();
     let mut id_translations = Vec::new();
 
     // Process each GFA file

From d14c951bb1543d38b54dee369ba293402e15a9e9 Mon Sep 17 00:00:00 2001
From: AndreaGuarracino <andreaguarracino@outlook.com>
Date: Sat, 14 Dec 2024 11:11:14 -0600
Subject: [PATCH 28/45] remove redundant loop

---
 src/main.rs | 53 -----------------------------------------------------
 1 file changed, 53 deletions(-)

diff --git a/src/main.rs b/src/main.rs
index e8a91f4..d7a5f15 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -392,59 +392,6 @@ fn main() {
             }
         }
 
-        let mut write_idx = 0;
-        for read_idx in 1..ranges.len() {
-            let (prev_start, prev_end) = (ranges[write_idx].start, ranges[write_idx].end);
-            let (curr_start, curr_end) = (ranges[read_idx].start, ranges[read_idx].end);
-        
-            if curr_start == prev_start && curr_end == prev_end {
-                // Skip duplicate range
-                continue;
-            } else if curr_start >= prev_start && curr_end <= prev_end {
-                // Skip range that is fully contained within previous range
-                continue;
-            } else if prev_start >= curr_start && prev_end <= curr_end {
-                // Previous range is fully contained within current range
-                ranges.swap(write_idx, read_idx);
-            } else if curr_start < prev_end {
-                // Handle overlapping ranges
-                if read_idx < ranges.len() - 1 {
-                    let next_start = ranges[read_idx + 1].start;
-                    
-                    // Check if current range is significantly overlapped by both previous and next ranges
-                    if curr_start > prev_start && next_start < curr_end {
-                        // Calculate overlap percentages
-                        let overlap_with_prev = prev_end - curr_start;
-                        let overlap_with_next = curr_end - next_start;
-                        let range_length = curr_end - curr_start;
-                        
-                        // If the range is mostly overlapped by both neighbors, skip it
-                        if overlap_with_prev + overlap_with_next > range_length {
-                            continue;
-                        }
-                    }
-                }
-                // Keep current range
-                write_idx += 1;
-                if write_idx != read_idx {
-                    ranges.swap(write_idx, read_idx);
-                }
-            } else {
-                // No overlap - keep both ranges
-                write_idx += 1;
-                if write_idx != read_idx {
-                    ranges.swap(write_idx, read_idx);
-                }
-            }
-        }
-        ranges.truncate(write_idx + 1);
-        if args.debug {
-            eprintln!("Path key '{}' without redundancy 2", path_key);
-            for range in ranges.iter() {
-                eprintln!("  Range: start={}, end={}, num.steps={}, gfa_id={}", range.start, range.end, range.steps.len(), range.gfa_id);
-            }
-        }
-
         if args.debug {
             eprintln!("Trimming overlapping ranges");
         }

From c692c9fcd5830f67348cfef2cf6cdcf5f5265613 Mon Sep 17 00:00:00 2001
From: AndreaGuarracino <andreaguarracino@outlook.com>
Date: Sat, 14 Dec 2024 11:55:08 -0600
Subject: [PATCH 29/45] less memory

---
 src/main.rs | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/src/main.rs b/src/main.rs
index d7a5f15..ea7a95a 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -45,7 +45,6 @@ struct RangeInfo {
     gfa_id: usize,
     steps: Vec<Handle>,                    // Path steps for this range
     step_positions: Vec<(usize, usize)>,   // Start and end positions of each step
-    step_lengths: Vec<usize>,              // Lengths of each step
 }
 
 // Helper function to check if two ranges are contiguous
@@ -277,7 +276,6 @@ fn main() {
                             gfa_id,
                             steps: translated_steps,
                             step_positions,
-                            step_lengths,
                         });
                     } else if args.debug {
                         eprintln!("  Warning: Path '{}' has no steps", path_name);
@@ -384,15 +382,12 @@ fn main() {
         }
         ranges.truncate(write_idx + 1);
         
-
         if args.debug {
             eprintln!("Path key '{}' without redundancy", path_key);
             for range in ranges.iter() {
                 eprintln!("  Range: start={}, end={}, num.steps={}, gfa_id={}", range.start, range.end, range.steps.len(), range.gfa_id);
             }
-        }
 
-        if args.debug {
             eprintln!("Trimming overlapping ranges");
         }
 
@@ -454,7 +449,6 @@ fn main() {
                 // Initialize new vectors to store updated steps
                 let mut new_steps = Vec::new();
                 let mut new_step_positions = Vec::new();
-                let mut new_step_lengths = Vec::new();
 
                 // Iterate over the original steps
                 for idx in 0..r2.steps.len() {
@@ -497,7 +491,6 @@ fn main() {
 
                             new_steps.push(new_node);
                             new_step_positions.push((step_start, overlap_start));
-                            new_step_lengths.push(new_seq.len());
                         } else if step_end > overlap_end {
                             if args.debug {
                                 eprintln!("    Adding right part of step [start={}, end={}]", overlap_within_step_end, step_end);
@@ -513,20 +506,17 @@ fn main() {
 
                             new_steps.push(new_node);
                             new_step_positions.push((overlap_end, step_end));
-                            new_step_lengths.push(new_seq.len());
                         }
                     } else {
                         // Keep steps that are not to be removed or split
                         new_steps.push(step_handle);
                         new_step_positions.push((step_start, step_end));
-                        new_step_lengths.push(r2.step_lengths[idx]);
                     }
                 }
 
                 // Update r2 with the new steps
                 r2.steps = new_steps;
                 r2.step_positions = new_step_positions;
-                r2.step_lengths = new_step_lengths;
 
                 // Update edges for the modified steps
                 for idx in 0..r2.steps.len() {

From 411708ca8cf0f4f6621a7f1f72c1ee58b975a661 Mon Sep 17 00:00:00 2001
From: AndreaGuarracino <andreaguarracino@outlook.com>
Date: Sat, 14 Dec 2024 13:00:57 -0600
Subject: [PATCH 30/45] improve performance

---
 src/main.rs | 161 ++++++++++++++++++++++++++--------------------------
 1 file changed, 79 insertions(+), 82 deletions(-)

diff --git a/src/main.rs b/src/main.rs
index ea7a95a..12df35f 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -166,37 +166,35 @@ fn write_graph_to_gfa(graph: &HashGraph, output_path: &str, nodes_to_remove: &Bi
         writeln!(file, "L\t{}\t{}\t{}\t{}\t0M", from_id, from_orient, to_id, to_orient)?;
     }
 
-    // Collect and sort paths by name
-    let mut paths: Vec<_> = graph.path_ids().collect();
-    paths.sort_by_key(|&path_id| {
-        graph.get_path_name(path_id)
-            .map(|name_iter| name_iter.collect::<Vec<u8>>())
-            .unwrap_or_default()
+    // Collect and sort paths directly with references to avoid multiple lookups
+    let mut path_entries: Vec<_> = graph.paths.iter().collect();
+    path_entries.sort_by_key(|(_, path)| {
+        // Get name directly from path struct instead of doing lookups
+        path.name.as_slice()
     });
 
     // Write paths with new IDs
-    for path_id in paths {
-        if let Some(name_iter) = graph.get_path_name(path_id) {
-            let path_name = String::from_utf8(name_iter.collect::<Vec<u8>>())
-                .unwrap_or_else(|_| String::from("unknown_path"));
-            
-            let mut path_elements = Vec::new();
-            if let Some(path_ref) = graph.get_path_ref(path_id) {
-                for handle in &path_ref.nodes {
-                    let new_id = id_mapping[u64::from(handle.id()) as usize];
-                    if new_id > 0 { // Only include nodes that weren't removed
-                        let orient = if handle.is_reverse() { "-" } else { "+" };
-                        path_elements.push(format!("{}{}", new_id, orient));
-                    }
-                }
-            }
-            
-            if !path_elements.is_empty() {
-                writeln!(file, "P\t{}\t{}\t*", path_name, path_elements.join(","))?;
+    for (_path_id, path) in path_entries {
+        // Convert name bytes to string once
+        let path_name = String::from_utf8_lossy(&path.name);
+        
+        // Pre-allocate vector with capacity
+        let mut path_elements = Vec::with_capacity(path.nodes.len());
+        
+        // Process nodes directly from path reference
+        for handle in &path.nodes {
+            let new_id = id_mapping[u64::from(handle.id()) as usize];
+            if new_id > 0 { // Only include nodes that weren't removed
+                let orient = if handle.is_reverse() { "-" } else { "+" };
+                path_elements.push(format!("{}{}", new_id, orient));
             }
         }
+        
+        if !path_elements.is_empty() {
+            writeln!(file, "P\t{}\t{}\t*", path_name, path_elements.join(","))?;
+        }
     }
-    
+        
     Ok(())
 }
 
@@ -447,8 +445,8 @@ fn main() {
                 }
 
                 // Initialize new vectors to store updated steps
-                let mut new_steps = Vec::new();
-                let mut new_step_positions = Vec::new();
+                let mut new_steps = Vec::with_capacity(r2.steps.len() / 2);
+                let mut new_step_positions = Vec::with_capacity(r2.steps.len() / 2);
 
                 // Iterate over the original steps
                 for idx in 0..r2.steps.len() {
@@ -691,11 +689,9 @@ fn mark_nodes_for_removal(graph: &HashGraph) -> BitVec {
     let mut nodes_to_remove = bitvec![1; max_node_id as usize + 1];
     
     // Mark nodes used in paths as not to be removed (set bit to 0)
-    for path_id in graph.path_ids() {
-        if let Some(path_ref) = graph.get_path_ref(path_id) {
-            for handle in &path_ref.nodes {
-                nodes_to_remove.set(u64::from(handle.id()) as usize, false);
-            }
+    for (_path_id, path_ref) in graph.paths.iter() {
+        for handle in &path_ref.nodes {
+            nodes_to_remove.set(u64::from(handle.id()) as usize, false);
         }
     }
     
@@ -826,58 +822,59 @@ mod tests {
                 .iter()
                 .map(|(start, end, gfa_id)| create_range_info(*start, *end, *gfa_id))
                 .collect();
-// Sort ranges by start position
-ranges.sort_by_key(|r| (r.start, r.end));
-
-let mut write_idx = 0;
-for read_idx in 1..ranges.len() {
-    let (prev_start, prev_end) = (ranges[write_idx].start, ranges[write_idx].end);
-    let (curr_start, curr_end) = (ranges[read_idx].start, ranges[read_idx].end);
-
-    if curr_start == prev_start && curr_end == prev_end {
-        // Skip duplicate range
-        continue;
-    } else if curr_start >= prev_start && curr_end <= prev_end {
-        // Skip range that is fully contained within previous range
-        continue;
-    } else if prev_start >= curr_start && prev_end <= curr_end {
-        // Previous range is fully contained within current range
-        ranges.swap(write_idx, read_idx);
-    } else if curr_start < prev_end {
-        // Handle overlapping ranges - check both previous and next ranges
-        let mut should_skip = false;
-        
-        if read_idx < ranges.len() - 1 {
-            let next_start = ranges[read_idx + 1].start;
-            
-            // Check if current range is significantly overlapped by both neighbors
-            if curr_start > prev_start && next_start < curr_end {
-                let overlap_with_prev = prev_end - curr_start;
-                let overlap_with_next = curr_end - next_start;
-                let range_length = curr_end - curr_start;
-                
-                // Skip if the range is mostly covered by its neighbors
-                if overlap_with_prev + overlap_with_next > range_length {
-                    should_skip = true;
+
+            // Sort ranges by start position
+            ranges.sort_by_key(|r| (r.start, r.end));
+
+            let mut write_idx = 0;
+            for read_idx in 1..ranges.len() {
+                let (prev_start, prev_end) = (ranges[write_idx].start, ranges[write_idx].end);
+                let (curr_start, curr_end) = (ranges[read_idx].start, ranges[read_idx].end);
+
+                if curr_start == prev_start && curr_end == prev_end {
+                    // Skip duplicate range
+                    continue;
+                } else if curr_start >= prev_start && curr_end <= prev_end {
+                    // Skip range that is fully contained within previous range
+                    continue;
+                } else if prev_start >= curr_start && prev_end <= curr_end {
+                    // Previous range is fully contained within current range
+                    ranges.swap(write_idx, read_idx);
+                } else if curr_start < prev_end {
+                    // Handle overlapping ranges - check both previous and next ranges
+                    let mut should_skip = false;
+                    
+                    if read_idx < ranges.len() - 1 {
+                        let next_start = ranges[read_idx + 1].start;
+                        
+                        // Check if current range is significantly overlapped by both neighbors
+                        if curr_start > prev_start && next_start < curr_end {
+                            let overlap_with_prev = prev_end - curr_start;
+                            let overlap_with_next = curr_end - next_start;
+                            let range_length = curr_end - curr_start;
+                            
+                            // Skip if the range is mostly covered by its neighbors
+                            if overlap_with_prev + overlap_with_next > range_length {
+                                should_skip = true;
+                            }
+                        }
+                    }
+                    
+                    if !should_skip {
+                        write_idx += 1;
+                        if write_idx != read_idx {
+                            ranges.swap(write_idx, read_idx);
+                        }
+                    }
+                } else {
+                    // No overlap - keep both ranges
+                    write_idx += 1;
+                    if write_idx != read_idx {
+                        ranges.swap(write_idx, read_idx);
+                    }
                 }
             }
-        }
-        
-        if !should_skip {
-            write_idx += 1;
-            if write_idx != read_idx {
-                ranges.swap(write_idx, read_idx);
-            }
-        }
-    } else {
-        // No overlap - keep both ranges
-        write_idx += 1;
-        if write_idx != read_idx {
-            ranges.swap(write_idx, read_idx);
-        }
-    }
-}
-ranges.truncate(write_idx + 1);
+            ranges.truncate(write_idx + 1);
 
             // Create expected ranges
             let expected: Vec<RangeInfo> = expected_ranges

From c4beda4a5274e39da1919aa0000c2140c46348e4 Mon Sep 17 00:00:00 2001
From: AndreaGuarracino <andreaguarracino@outlook.com>
Date: Sat, 14 Dec 2024 13:16:20 -0600
Subject: [PATCH 31/45] ~2X faster

---
 src/main.rs | 77 ++++++++++++++++++++++++-----------------------------
 1 file changed, 35 insertions(+), 42 deletions(-)

diff --git a/src/main.rs b/src/main.rs
index 12df35f..cd938e3 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -10,9 +10,6 @@ use handlegraph::{
     handlegraph::*,
     mutablehandlegraph::*,
     pathhandlegraph::{
-        IntoPathIds, 
-        GraphPathNames, 
-        GraphPathsRef,
         MutableGraphPaths,
         GraphPaths
     },
@@ -237,47 +234,43 @@ fn main() {
         }
 
         // Process paths and collect ranges with their steps
-        for path_id in block_graph.path_ids() {
-            if let Some(name_iter) = block_graph.get_path_name(path_id) {
-                let path_name = String::from_utf8(name_iter.collect::<Vec<u8>>()).unwrap();
-                
-                if let Some((sample_hap_name, start, end)) = split_path_name(&path_name) {
-                    // Get the path steps and translate their IDs
-                    let mut translated_steps = Vec::new();
-                    let mut step_positions = Vec::new();
-                    let mut step_lengths = Vec::new();
-                    let mut cumulative_pos = start;
-
-                    if let Some(path_ref) = block_graph.get_path_ref(path_id) {
-                        for step in path_ref.nodes.iter() {
-                            let translated_id = id_translation + step.id().into();
-                            let translated_step = Handle::pack(translated_id, step.is_reverse());
-                            translated_steps.push(translated_step);
-
-                            // Get the sequence length of the node
-                            let node_seq = block_graph.sequence(*step).collect::<Vec<_>>();
-                            let node_length = node_seq.len();
-                            step_lengths.push(node_length);
-
-                            // Record the end position of this step
-                            step_positions.push((cumulative_pos, cumulative_pos + node_length));
-                            cumulative_pos = cumulative_pos + node_length;
-                        }
+        for (_path_id, path_ref) in block_graph.paths.iter() {
+            let path_name = String::from_utf8_lossy(&path_ref.name);
+            
+            if let Some((sample_hap_name, start, end)) = split_path_name(&path_name) {
+                // Get the path steps and translate their IDs
+                let mut translated_steps = Vec::new();
+                let mut step_positions = Vec::new();
+                let mut step_lengths = Vec::new();
+                let mut cumulative_pos = start;
+
+                    for step in path_ref.nodes.iter() {
+                        let translated_id = id_translation + step.id().into();
+                        let translated_step = Handle::pack(translated_id, step.is_reverse());
+                        translated_steps.push(translated_step);
+
+                        // Get the sequence length of the node
+                        let node_seq = block_graph.sequence(*step).collect::<Vec<_>>();
+                        let node_length = node_seq.len();
+                        step_lengths.push(node_length);
+
+                        // Record the end position of this step
+                        step_positions.push((cumulative_pos, cumulative_pos + node_length));
+                        cumulative_pos = cumulative_pos + node_length;
                     }
 
-                    if !translated_steps.is_empty() {
-                        path_key_ranges.entry(sample_hap_name)
-                        .or_default()
-                        .push(RangeInfo { 
-                            start, 
-                            end, 
-                            gfa_id,
-                            steps: translated_steps,
-                            step_positions,
-                        });
-                    } else if args.debug {
-                        eprintln!("  Warning: Path '{}' has no steps", path_name);
-                    }
+                if !translated_steps.is_empty() {
+                    path_key_ranges.entry(sample_hap_name)
+                    .or_default()
+                    .push(RangeInfo { 
+                        start, 
+                        end, 
+                        gfa_id,
+                        steps: translated_steps,
+                        step_positions,
+                    });
+                } else if args.debug {
+                    eprintln!("  Warning: Path '{}' has no steps", path_name);
                 }
             }
         }

From 999e2196215c5e0c4e38b30a454cbd03767dff5a Mon Sep 17 00:00:00 2001
From: AndreaGuarracino <andreaguarracino@outlook.com>
Date: Sat, 14 Dec 2024 14:37:02 -0600
Subject: [PATCH 32/45] tweaks

---
 src/main.rs | 52 +++++++++++++++++++++++++---------------------------
 1 file changed, 25 insertions(+), 27 deletions(-)

diff --git a/src/main.rs b/src/main.rs
index cd938e3..aa8a51a 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -165,21 +165,21 @@ fn write_graph_to_gfa(graph: &HashGraph, output_path: &str, nodes_to_remove: &Bi
 
     // Collect and sort paths directly with references to avoid multiple lookups
     let mut path_entries: Vec<_> = graph.paths.iter().collect();
-    path_entries.sort_by_key(|(_, path)| {
+    path_entries.sort_by_key(|(_, path_ref)| {
         // Get name directly from path struct instead of doing lookups
-        path.name.as_slice()
+        path_ref.name.as_slice()
     });
 
     // Write paths with new IDs
-    for (_path_id, path) in path_entries {
+    for (_path_id, path_ref) in path_entries {
         // Convert name bytes to string once
-        let path_name = String::from_utf8_lossy(&path.name);
+        let path_name = String::from_utf8_lossy(&path_ref.name);
         
         // Pre-allocate vector with capacity
-        let mut path_elements = Vec::with_capacity(path.nodes.len());
+        let mut path_elements = Vec::with_capacity(path_ref.nodes.len());
         
         // Process nodes directly from path reference
-        for handle in &path.nodes {
+        for handle in &path_ref.nodes {
             let new_id = id_mapping[u64::from(handle.id()) as usize];
             if new_id > 0 { // Only include nodes that weren't removed
                 let orient = if handle.is_reverse() { "-" } else { "+" };
@@ -241,23 +241,21 @@ fn main() {
                 // Get the path steps and translate their IDs
                 let mut translated_steps = Vec::new();
                 let mut step_positions = Vec::new();
-                let mut step_lengths = Vec::new();
                 let mut cumulative_pos = start;
 
-                    for step in path_ref.nodes.iter() {
-                        let translated_id = id_translation + step.id().into();
-                        let translated_step = Handle::pack(translated_id, step.is_reverse());
-                        translated_steps.push(translated_step);
+                for step in path_ref.nodes.iter() {
+                    let translated_id = id_translation + step.id().into();
+                    let translated_step = Handle::pack(translated_id, step.is_reverse());
+                    translated_steps.push(translated_step);
 
-                        // Get the sequence length of the node
-                        let node_seq = block_graph.sequence(*step).collect::<Vec<_>>();
-                        let node_length = node_seq.len();
-                        step_lengths.push(node_length);
+                    // Get the sequence length of the node
+                    let node_seq = block_graph.sequence(*step).collect::<Vec<_>>();
+                    let node_length = node_seq.len();
 
-                        // Record the end position of this step
-                        step_positions.push((cumulative_pos, cumulative_pos + node_length));
-                        cumulative_pos = cumulative_pos + node_length;
-                    }
+                    // Record the end position of this step
+                    step_positions.push((cumulative_pos, cumulative_pos + node_length));
+                    cumulative_pos = cumulative_pos + node_length;
+                }
 
                 if !translated_steps.is_empty() {
                     path_key_ranges.entry(sample_hap_name)
@@ -394,11 +392,9 @@ fn main() {
                 let overlap_end = std::cmp::min(r1.end, r2.end);
 
                 if args.debug {
-                    let overlap_amount = overlap_end - overlap_start;
-
                     eprintln!(
                         "  Overlap detected: Range1 [start={}, end={}], Range2 [start={}, end={}], Overlap [start={}, end={}], Overlap size={}",
-                        r1.start, r1.end, r2.start, r2.end, overlap_start, overlap_end, overlap_amount
+                        r1.start, r1.end, r2.start, r2.end, overlap_start, overlap_end, overlap_end - overlap_start
                     );
                 }
 
@@ -547,14 +543,17 @@ fn main() {
         let mut all_contiguous = true;
         
         for window in ranges.windows(2) {
-            if has_overlap(&window[0], &window[1]) {
+            let r1 = &window[0];
+            let r2 = &window[1];
+
+            if has_overlap(r1, r2) {
                 if args.debug {
                     eprintln!("Unresolved overlaps detected between ranges: [start={}, end={}] and [start={}, end={}]", 
-                    window[0].start, window[0].end, window[1].start, window[1].end);
+                    r1.start, r1.end, r2.start, r2.end);
                 }
                 panic!("Unresolved overlaps detected in path key '{}'", path_key);
             }
-            if !is_contiguous(&window[0], &window[1]) {
+            if !is_contiguous(r1, r2) {
                 all_contiguous = false;
             }
         }
@@ -718,9 +717,8 @@ mod tests {
             start,
             end,
             gfa_id,
-            steps: vec![],           // Empty steps for testing
+            steps: vec![],            // Empty steps for testing
             step_positions: vec![],   // Empty positions for testing
-            step_lengths: vec![],     // Empty lengths for testing
         }
     }
 

From 7956b68218121775e58cde77a3a492cb5ebf85b7 Mon Sep 17 00:00:00 2001
From: AndreaGuarracino <andreaguarracino@outlook.com>
Date: Sat, 14 Dec 2024 14:44:19 -0600
Subject: [PATCH 33/45] remove unecessary vector

---
 src/main.rs | 28 +++++++++++-----------------
 1 file changed, 11 insertions(+), 17 deletions(-)

diff --git a/src/main.rs b/src/main.rs
index aa8a51a..147f482 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -22,11 +22,11 @@ use tempfile::NamedTempFile;
 #[derive(Parser, Debug)]
 #[clap(author, version, about)]
 struct Args {
-    /// List of GFA file paths.
+    /// List of GFA file paths to combine
     #[clap(short, long, value_parser, num_args = 1.., value_delimiter = ' ')]
     gfa_list: Vec<String>,
 
-    /// Output GFA file path
+    /// Output GFA file path for the combined graph
     #[clap(short, long, value_parser)]
     output: String,
 
@@ -418,7 +418,7 @@ fn main() {
                         // }
                         steps_to_remove.push(idx);
                     } else {
-                        // if args.debug {
+                        // if args.debug && r2.start == 11000 {
                         //     eprintln!("    Step {} [start={}, end={}, len={}] partially overlaps", idx, step_start, step_end, step_end - step_start);
                         // }
                         if step_to_split.is_some() {
@@ -428,10 +428,10 @@ fn main() {
                     }
                 }
 
-                if args.debug && r2.start == 154089 {
-                    eprintln!("    Total steps to remove: {}", steps_to_remove.len());
-                    eprintln!("    Step to split: {:?}", step_to_split);   
-                }
+                // if args.debug && r2.start == 11000 {
+                //     eprintln!("    Total steps to remove: {}", steps_to_remove.len());
+                //     eprintln!("    Step to split: {:?}", step_to_split);   
+                // }
 
                 // Initialize new vectors to store updated steps
                 let mut new_steps = Vec::with_capacity(r2.steps.len() / 2);
@@ -561,17 +561,15 @@ fn main() {
         if !all_contiguous && args.debug {
             let mut current_start = ranges[0].start;
             let mut current_end = ranges[0].end;
-            let mut current_gfa_ids = vec![ranges[0].gfa_id];
             
             for i in 1..ranges.len() {
                 if is_contiguous(&ranges[i-1], &ranges[i]) {
                     // Extend current merged range
                     current_end = ranges[i].end;
-                    current_gfa_ids.push(ranges[i].gfa_id);
                 } else {
                     // Print current merged range
-                    eprintln!("    Merged range: start={}, end={}, gfa_ids={:?}", 
-                        current_start, current_end, current_gfa_ids);
+                    eprintln!("    Merged range: start={}, end={}", 
+                        current_start, current_end);
                     
                     if !has_overlap(&ranges[i-1], &ranges[i]) {
                         // Calculate and print gap
@@ -586,13 +584,12 @@ fn main() {
                     // Start new merged range
                     current_start = ranges[i].start;
                     current_end = ranges[i].end;
-                    current_gfa_ids = vec![ranges[i].gfa_id];
                 }
             }
             
             // Print final merged range
-            eprintln!("    Merged range: start={}, end={}, gfa_ids={:?}", 
-                current_start, current_end, current_gfa_ids);
+            eprintln!("    Merged range: start={}, end={}", 
+                current_start, current_end);
         }
 
         if all_contiguous {
@@ -654,10 +651,7 @@ fn main() {
 
     if args.debug {
         eprintln!("Total paths created: {}", GraphPaths::path_count(&combined_graph));
-    }
 
-    // After all paths are created but before writing the graph
-    if args.debug {
         eprintln!("Total nodes before filtering: {}", combined_graph.node_count());
     }
     

From f82584ec94bc2826fb6edb363a2b4fbc54e7378b Mon Sep 17 00:00:00 2001
From: AndreaGuarracino <andreaguarracino@outlook.com>
Date: Sat, 14 Dec 2024 15:19:34 -0600
Subject: [PATCH 34/45] a bit faster and use less memory

---
 src/main.rs | 49 +++++++++++++++++++++++++++++--------------------
 1 file changed, 29 insertions(+), 20 deletions(-)

diff --git a/src/main.rs b/src/main.rs
index 147f482..6d46809 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -40,8 +40,8 @@ struct RangeInfo {
     start: usize,
     end: usize,
     gfa_id: usize,
-    steps: Vec<Handle>,                    // Path steps for this range
-    step_positions: Vec<(usize, usize)>,   // Start and end positions of each step
+    steps: Vec<Handle>,     // Path steps for this range
+    step_ends: Vec<usize>,  // End positions of each step (start is either the range start (for index 0) or the previous step's end position)
 }
 
 // Helper function to check if two ranges are contiguous
@@ -240,7 +240,7 @@ fn main() {
             if let Some((sample_hap_name, start, end)) = split_path_name(&path_name) {
                 // Get the path steps and translate their IDs
                 let mut translated_steps = Vec::new();
-                let mut step_positions = Vec::new();
+                let mut step_ends = Vec::new();
                 let mut cumulative_pos = start;
 
                 for step in path_ref.nodes.iter() {
@@ -248,13 +248,11 @@ fn main() {
                     let translated_step = Handle::pack(translated_id, step.is_reverse());
                     translated_steps.push(translated_step);
 
-                    // Get the sequence length of the node
+                    // Record the end position of this step
                     let node_seq = block_graph.sequence(*step).collect::<Vec<_>>();
                     let node_length = node_seq.len();
-
-                    // Record the end position of this step
-                    step_positions.push((cumulative_pos, cumulative_pos + node_length));
                     cumulative_pos = cumulative_pos + node_length;
+                    step_ends.push(cumulative_pos);
                 }
 
                 if !translated_steps.is_empty() {
@@ -265,7 +263,7 @@ fn main() {
                         end, 
                         gfa_id,
                         steps: translated_steps,
-                        step_positions,
+                        step_ends,
                     });
                 } else if args.debug {
                     eprintln!("  Warning: Path '{}' has no steps", path_name);
@@ -401,7 +399,8 @@ fn main() {
                 // Adjust r2 to remove the overlap
                 let mut steps_to_remove = Vec::new();
                 let mut step_to_split: Option<usize> = None;
-                for (idx, &(step_start, step_end)) in r2.step_positions.iter().enumerate() {
+                for (idx, &step_end) in r2.step_ends.iter().enumerate() {
+                    let step_start = if idx == 0 { r2.start } else { r2.step_ends[idx - 1] };
                     if step_end <= overlap_start {
                         // if args.debug && r2.start == 11000 {
                         //     eprintln!("    Step {} [start={}, end={}, len={}] before overlap", idx, step_start, step_end, step_end - step_start);
@@ -435,12 +434,14 @@ fn main() {
 
                 // Initialize new vectors to store updated steps
                 let mut new_steps = Vec::with_capacity(r2.steps.len() / 2);
-                let mut new_step_positions = Vec::with_capacity(r2.steps.len() / 2);
+                let mut new_step_ends = Vec::with_capacity(r2.steps.len() / 2);
+                let mut range_new_start = None;
 
                 // Iterate over the original steps
                 for idx in 0..r2.steps.len() {
                     let step_handle = r2.steps[idx];
-                    let (step_start, step_end) = r2.step_positions[idx];
+                    let step_start = if idx == 0 { r2.start } else { r2.step_ends[idx - 1] };
+                    let step_end = r2.step_ends[idx];
 
                     if steps_to_remove.contains(&idx) {
                         // Skip steps to remove
@@ -477,7 +478,10 @@ fn main() {
                             let new_node = combined_graph.create_handle(&new_seq, node_id);
 
                             new_steps.push(new_node);
-                            new_step_positions.push((step_start, overlap_start));
+                            new_step_ends.push(overlap_start);
+                            if range_new_start.is_none() {
+                                range_new_start = Some(step_start);
+                            }
                         } else if step_end > overlap_end {
                             if args.debug {
                                 eprintln!("    Adding right part of step [start={}, end={}]", overlap_within_step_end, step_end);
@@ -492,18 +496,24 @@ fn main() {
                             let new_node = combined_graph.create_handle(&new_seq, node_id);
 
                             new_steps.push(new_node);
-                            new_step_positions.push((overlap_end, step_end));
+                            new_step_ends.push(step_end);
+                            if range_new_start.is_none() {
+                                range_new_start = Some(overlap_end);
+                            }
                         }
                     } else {
                         // Keep steps that are not to be removed or split
                         new_steps.push(step_handle);
-                        new_step_positions.push((step_start, step_end));
+                        new_step_ends.push(step_end);
+                        if range_new_start.is_none() {
+                            range_new_start = Some(step_start);
+                        }
                     }
                 }
 
                 // Update r2 with the new steps
                 r2.steps = new_steps;
-                r2.step_positions = new_step_positions;
+                r2.step_ends = new_step_ends;
 
                 // Update edges for the modified steps
                 for idx in 0..r2.steps.len() {
@@ -516,16 +526,15 @@ fn main() {
                 }
 
                 // Update r2.start and r2.end based on the new step positions
-                if !r2.step_positions.is_empty() {
-                    r2.start = r2.step_positions.first().unwrap().0;
-                    r2.end = r2.step_positions.last().unwrap().1;
+                if !r2.step_ends.is_empty() {
+                    r2.start = range_new_start.unwrap();  // Safe because if we have positions, we must have set range_new_start
+                    r2.end = *r2.step_ends.last().unwrap();
                 } else {
                     // If no steps remain, set start and end to overlap_end to effectively remove this range
                     r2.start = overlap_end;
                     r2.end = overlap_end;
                 }
 
-
                 if args.debug {
                     eprintln!("    Updated overlaps: Range2 [start={}, end={}]", r2.start, r2.end);
                 }
@@ -712,7 +721,7 @@ mod tests {
             end,
             gfa_id,
             steps: vec![],            // Empty steps for testing
-            step_positions: vec![],   // Empty positions for testing
+            step_ends: vec![],   // Empty positions for testing
         }
     }
 

From 1d2bf00387ca578fb438c63fab1be79f82198298 Mon Sep 17 00:00:00 2001
From: AndreaGuarracino <andreaguarracino@outlook.com>
Date: Sat, 14 Dec 2024 16:21:29 -0600
Subject: [PATCH 35/45] tweaks

---
 src/main.rs | 67 +++++++++++++++++++++++------------------------------
 1 file changed, 29 insertions(+), 38 deletions(-)

diff --git a/src/main.rs b/src/main.rs
index 6d46809..907a961 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -43,15 +43,18 @@ struct RangeInfo {
     steps: Vec<Handle>,     // Path steps for this range
     step_ends: Vec<usize>,  // End positions of each step (start is either the range start (for index 0) or the previous step's end position)
 }
+impl RangeInfo {
+    /// Returns true if this range is immediately followed by another range
+    /// with no gap between them
+    fn is_contiguous_with(&self, other: &Self) -> bool {
+        self.end == other.start
+    }
 
-// Helper function to check if two ranges are contiguous
-fn is_contiguous(r1: &RangeInfo, r2: &RangeInfo) -> bool {
-    r1.end == r2.start
-}
-
-// Helper function to check if two ranges overlap
-fn has_overlap(r1: &RangeInfo, r2: &RangeInfo) -> bool {
-    r1.start < r2.end && r2.start < r1.end
+    /// Returns true if this range overlaps with another range
+    /// Two ranges overlap if one starts before the other ends
+    fn overlaps_with(&self, other: &Self) -> bool {
+        self.start < other.end && other.start < self.end
+    }
 }
 
 fn read_gfa(gfa_path: &str, parser: &GFAParser<usize, ()>) -> io::Result<GFA<usize, ()>> {
@@ -120,14 +123,13 @@ fn write_graph_to_gfa(graph: &HashGraph, output_path: &str, nodes_to_remove: &Bi
     writeln!(file, "H\tVN:Z:1.0")?;
     
     // Collect nodes, excluding marked nodes
-    let mut nodes: Vec<Handle> = graph.handles()
+    let nodes: Vec<Handle> = graph.handles()
         .filter(|handle| !nodes_to_remove[u64::from(handle.id()) as usize])
         .collect();
-    nodes.sort_by_key(|handle| handle.id());
     
-    // Create mapping from old IDs to new sequential IDs using a Vec
-    // We allocate a vector with capacity for the max node ID
-    let max_id = u64::from(nodes.last().unwrap().id()) as usize;
+    // Create mapping from old IDs to new sequential IDs
+    // We allocate a vector with capacity for the max node ID + 1
+    let max_id = graph.node_count();
     let mut id_mapping = vec![0; max_id + 1];
     for (new_id, handle) in nodes.iter().enumerate() {
         id_mapping[u64::from(handle.id()) as usize] = new_id + 1; // +1 to start from 1
@@ -136,23 +138,18 @@ fn write_graph_to_gfa(graph: &HashGraph, output_path: &str, nodes_to_remove: &Bi
     // Write nodes with new IDs
     for handle in &nodes {
         let sequence = graph.sequence(*handle).collect::<Vec<_>>();
-        let sequence_str = String::from_utf8(sequence)
-            .unwrap_or_else(|_| String::from("N"));
-        let new_id = id_mapping[u64::from(handle.id()) as usize];
-        writeln!(file, "S\t{}\t{}", new_id, sequence_str)?;
+        let sequence_str = String::from_utf8(sequence).unwrap_or_else(|_| String::from("N"));
+        let node_id = id_mapping[u64::from(handle.id()) as usize];
+        writeln!(file, "S\t{}\t{}", node_id, sequence_str)?;
     }
     
     // Collect and sort edges, excluding those connected to marked nodes
-    let mut edges: Vec<Edge> = graph.edges()
+    let edges: Vec<Edge> = graph.edges()
         .filter(|edge| {
             !nodes_to_remove[u64::from(edge.0.id()) as usize] && 
             !nodes_to_remove[u64::from(edge.1.id()) as usize]
         })
         .collect();
-    edges.sort_by(|a, b| {
-        a.0.id().cmp(&b.0.id())
-            .then(a.1.id().cmp(&b.1.id()))
-    });
     
     // Write edges with new IDs
     for edge in edges {
@@ -172,22 +169,18 @@ fn write_graph_to_gfa(graph: &HashGraph, output_path: &str, nodes_to_remove: &Bi
 
     // Write paths with new IDs
     for (_path_id, path_ref) in path_entries {
-        // Convert name bytes to string once
-        let path_name = String::from_utf8_lossy(&path_ref.name);
-        
         // Pre-allocate vector with capacity
         let mut path_elements = Vec::with_capacity(path_ref.nodes.len());
         
         // Process nodes directly from path reference
         for handle in &path_ref.nodes {
-            let new_id = id_mapping[u64::from(handle.id()) as usize];
-            if new_id > 0 { // Only include nodes that weren't removed
-                let orient = if handle.is_reverse() { "-" } else { "+" };
-                path_elements.push(format!("{}{}", new_id, orient));
-            }
+            let node_id = id_mapping[u64::from(handle.id()) as usize];
+            let orient = if handle.is_reverse() { "-" } else { "+" };
+            path_elements.push(format!("{}{}", node_id, orient));
         }
         
         if !path_elements.is_empty() {
+            let path_name = String::from_utf8_lossy(&path_ref.name);
             writeln!(file, "P\t{}\t{}\t*", path_name, path_elements.join(","))?;
         }
     }
@@ -384,7 +377,7 @@ fn main() {
             let r1 = &mut left[left.len()-1];
             let r2 = &mut right[0];
 
-            if has_overlap(r1, r2) {
+            if r1.overlaps_with(&r2) {
                 // Calculate the overlap region - use max/min to get precise overlap bounds
                 let overlap_start = std::cmp::max(r1.start, r2.start);
                 let overlap_end = std::cmp::min(r1.end, r2.end);
@@ -555,14 +548,14 @@ fn main() {
             let r1 = &window[0];
             let r2 = &window[1];
 
-            if has_overlap(r1, r2) {
+            if r1.overlaps_with(&r2) {
                 if args.debug {
                     eprintln!("Unresolved overlaps detected between ranges: [start={}, end={}] and [start={}, end={}]", 
                     r1.start, r1.end, r2.start, r2.end);
                 }
                 panic!("Unresolved overlaps detected in path key '{}'", path_key);
             }
-            if !is_contiguous(r1, r2) {
+            if !r1.is_contiguous_with(&r2) {
                 all_contiguous = false;
             }
         }
@@ -572,7 +565,7 @@ fn main() {
             let mut current_end = ranges[0].end;
             
             for i in 1..ranges.len() {
-                if is_contiguous(&ranges[i-1], &ranges[i]) {
+                if ranges[i-1].is_contiguous_with(&ranges[i]) {
                     // Extend current merged range
                     current_end = ranges[i].end;
                 } else {
@@ -580,7 +573,7 @@ fn main() {
                     eprintln!("    Merged range: start={}, end={}", 
                         current_start, current_end);
                     
-                    if !has_overlap(&ranges[i-1], &ranges[i]) {
+                    if !ranges[i-1].overlaps_with(&ranges[i]) {
                         // Calculate and print gap
                         let gap = ranges[i].start - current_end;
                         eprintln!("      Gap to next range: {} positions", gap);
@@ -629,7 +622,7 @@ fn main() {
                 let mut end_range = start_range;
                 
                 // Merge contiguous ranges
-                while next_idx < ranges.len() && is_contiguous(&ranges[next_idx - 1], &ranges[next_idx]) {
+                while next_idx < ranges.len() && ranges[next_idx - 1].is_contiguous_with(&ranges[next_idx]) {
                     steps.extend(ranges[next_idx].steps.clone());
                     end_range = &ranges[next_idx];
                     next_idx += 1;
@@ -637,7 +630,6 @@ fn main() {
                 
                 // Create path name with range information
                 let path_name = format!("{}:{}-{}", path_key, start_range.start, end_range.end);
-                println!("Creating path: {} with {} steps", path_name, steps.len());
                 let path_id = combined_graph.create_path(path_name.as_bytes(), false).unwrap();
                 
                 // Add steps to the path
@@ -677,7 +669,6 @@ fn main() {
     }
 }
 
-
 fn mark_nodes_for_removal(graph: &HashGraph) -> BitVec {
     // Create a bitvector with all nodes initially marked for removal
     let max_node_id = u64::from(graph.max_node_id());

From 1dd5a368fbd7db0d324e20c53f45461667a36964 Mon Sep 17 00:00:00 2001
From: AndreaGuarracino <andreaguarracino@outlook.com>
Date: Sat, 14 Dec 2024 18:55:40 -0600
Subject: [PATCH 36/45] a bit of structure in the code

---
 src/main.rs | 999 +++++++++++++++++++++++++++-------------------------
 1 file changed, 517 insertions(+), 482 deletions(-)

diff --git a/src/main.rs b/src/main.rs
index 907a961..bdbeefd 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -35,6 +35,38 @@ struct Args {
     debug: bool,
 }
 
+fn main() {
+    let args = Args::parse();
+
+    // Create a single combined graph without paths and a map of path key to ranges
+    let (mut combined_graph, mut path_key_ranges) = process_gfa_files(&args.gfa_list, args.debug);
+
+    // Sort ranges and create merged paths in the combined graph
+    for (path_key, ranges) in path_key_ranges.iter_mut() {
+        sort_and_filter_ranges(path_key, ranges, args.debug);
+        trim_range_overlaps(path_key, ranges, &mut combined_graph, args.debug);
+        create_paths_from_ranges(path_key, ranges, &mut combined_graph, args.debug);
+    }
+
+    if args.debug {
+        eprintln!("Total paths created: {}", GraphPaths::path_count(&combined_graph));
+
+        eprintln!("Total nodes before filtering: {}", combined_graph.node_count());
+    }
+    
+    let nodes_to_skip = mark_nodes_for_removal(&combined_graph);
+    
+    if args.debug {
+        eprintln!("Nodes to be filtered out: {}", nodes_to_skip.count_ones());
+    }
+
+    // Write the combined graph to GFA file, skipping unused nodes
+    match write_graph_to_gfa(&combined_graph, &args.output, &nodes_to_skip) {
+        Ok(_) => if args.debug {eprintln!("Successfully wrote combined graph to {}", args.output)},
+        Err(e) => eprintln!("Error writing GFA file: {}", e),
+    }
+}
+
 #[derive(Debug, Clone)]
 struct RangeInfo {
     start: usize,
@@ -57,148 +89,17 @@ impl RangeInfo {
     }
 }
 
-fn read_gfa(gfa_path: &str, parser: &GFAParser<usize, ()>) -> io::Result<GFA<usize, ()>> {
-    if gfa_path.ends_with(".gz") {
-        let file = std::fs::File::open(gfa_path).map_err(|e| {
-            io::Error::new(
-                e.kind(),
-                format!("Failed to open gzipped file '{}': {}", gfa_path, e)
-            )
-        })?;
-        
-        let (mut reader, _format) = niffler::get_reader(Box::new(file))
-            .map_err(|e| io::Error::new(io::ErrorKind::Other, e))?;
-        
-        let mut decompressed = Vec::new();
-        reader.read_to_end(&mut decompressed).map_err(|e| {
-            io::Error::new(
-                e.kind(),
-                format!("Failed to decompress file '{}': {}", gfa_path, e)
-            )
-        })?;
-        
-        // Create temporary file in the same directory as the input file for better performance
-        let parent_dir = Path::new(gfa_path).parent().unwrap_or(Path::new("."));
-        let temp_file = NamedTempFile::new_in(parent_dir).map_err(|e| {
-            io::Error::new(
-                e.kind(),
-                format!("Failed to create temporary file: {}", e)
-            )
-        })?;
-        
-        // Write decompressed data
-        temp_file.as_file().write_all(&decompressed).map_err(|e| {
-            io::Error::new(
-                e.kind(),
-                format!("Failed to write to temporary file: {}", e)
-            )
-        })?;
-        
-        // Parse GFA
-        parser.parse_file(temp_file.path().to_str().ok_or_else(|| {
-            io::Error::new(
-                io::ErrorKind::InvalidData,
-                "Invalid temporary file path"
-            )
-        })?).map_err(|e| {
-            io::Error::new(
-                io::ErrorKind::InvalidData,
-                format!("Failed to parse GFA: {}", e)
-            )
-        })
-    } else {
-        parser.parse_file(gfa_path).map_err(|e| {
-            io::Error::new(
-                io::ErrorKind::InvalidData,
-                format!("Failed to parse GFA file '{}': {}", gfa_path, e)
-            )
-        })
-    }
-}
-
-fn write_graph_to_gfa(graph: &HashGraph, output_path: &str, nodes_to_remove: &BitVec) -> std::io::Result<()> {
-    let mut file = File::create(output_path)?;
-    
-    // Write GFA version
-    writeln!(file, "H\tVN:Z:1.0")?;
-    
-    // Collect nodes, excluding marked nodes
-    let nodes: Vec<Handle> = graph.handles()
-        .filter(|handle| !nodes_to_remove[u64::from(handle.id()) as usize])
-        .collect();
-    
-    // Create mapping from old IDs to new sequential IDs
-    // We allocate a vector with capacity for the max node ID + 1
-    let max_id = graph.node_count();
-    let mut id_mapping = vec![0; max_id + 1];
-    for (new_id, handle) in nodes.iter().enumerate() {
-        id_mapping[u64::from(handle.id()) as usize] = new_id + 1; // +1 to start from 1
-    }
-    
-    // Write nodes with new IDs
-    for handle in &nodes {
-        let sequence = graph.sequence(*handle).collect::<Vec<_>>();
-        let sequence_str = String::from_utf8(sequence).unwrap_or_else(|_| String::from("N"));
-        let node_id = id_mapping[u64::from(handle.id()) as usize];
-        writeln!(file, "S\t{}\t{}", node_id, sequence_str)?;
-    }
-    
-    // Collect and sort edges, excluding those connected to marked nodes
-    let edges: Vec<Edge> = graph.edges()
-        .filter(|edge| {
-            !nodes_to_remove[u64::from(edge.0.id()) as usize] && 
-            !nodes_to_remove[u64::from(edge.1.id()) as usize]
-        })
-        .collect();
-    
-    // Write edges with new IDs
-    for edge in edges {
-        let from_id = id_mapping[u64::from(edge.0.id()) as usize];
-        let to_id = id_mapping[u64::from(edge.1.id()) as usize];
-        let from_orient = if edge.0.is_reverse() { "-" } else { "+" };
-        let to_orient = if edge.1.is_reverse() { "-" } else { "+" };
-        writeln!(file, "L\t{}\t{}\t{}\t{}\t0M", from_id, from_orient, to_id, to_orient)?;
-    }
-
-    // Collect and sort paths directly with references to avoid multiple lookups
-    let mut path_entries: Vec<_> = graph.paths.iter().collect();
-    path_entries.sort_by_key(|(_, path_ref)| {
-        // Get name directly from path struct instead of doing lookups
-        path_ref.name.as_slice()
-    });
-
-    // Write paths with new IDs
-    for (_path_id, path_ref) in path_entries {
-        // Pre-allocate vector with capacity
-        let mut path_elements = Vec::with_capacity(path_ref.nodes.len());
-        
-        // Process nodes directly from path reference
-        for handle in &path_ref.nodes {
-            let node_id = id_mapping[u64::from(handle.id()) as usize];
-            let orient = if handle.is_reverse() { "-" } else { "+" };
-            path_elements.push(format!("{}{}", node_id, orient));
-        }
-        
-        if !path_elements.is_empty() {
-            let path_name = String::from_utf8_lossy(&path_ref.name);
-            writeln!(file, "P\t{}\t{}\t*", path_name, path_elements.join(","))?;
-        }
-    }
-        
-    Ok(())
-}
-
-fn main() {
-    let args = Args::parse();
-
-    // Create a single combined graph
+fn process_gfa_files(
+    gfa_list: &[String],
+    debug: bool,
+) -> (HashGraph, FxHashMap<String, Vec<RangeInfo>>) {
     let mut combined_graph = HashGraph::new();
     let mut path_key_ranges: FxHashMap<String, Vec<RangeInfo>> = FxHashMap::default();
     let mut id_translations = Vec::new();
 
     // Process each GFA file
     let parser = GFAParser::new();
-    for (gfa_id, gfa_path) in args.gfa_list.iter().enumerate() {
+    for (gfa_id, gfa_path) in gfa_list.iter().enumerate() {
         let gfa = read_gfa(gfa_path, &parser).unwrap();
         let block_graph = HashGraph::from_gfa(&gfa);
 
@@ -222,7 +123,7 @@ fn main() {
             combined_graph.create_edge(translated_edge);
         }
         
-        if args.debug {
+        if debug {
             eprintln!("GFA file {} ({}) processed: Added {} nodes and {} edges", gfa_id, gfa_path, block_graph.node_count(), block_graph.edge_count());
         }
 
@@ -258,415 +159,494 @@ fn main() {
                         steps: translated_steps,
                         step_ends,
                     });
-                } else if args.debug {
+                } else if debug {
                     eprintln!("  Warning: Path '{}' has no steps", path_name);
                 }
             }
         }
     }
 
-    if args.debug {
+    if debug {
         eprintln!("Collected {} nodes, {} edges, {} paths, and {} path ranges from all GFA files",
             combined_graph.node_count(), combined_graph.edge_count(), path_key_ranges.len(), path_key_ranges.iter().map(|(_, ranges)| ranges.len()).sum::<usize>());
     }
 
-    let mut next_node_id_value = u64::from(combined_graph.max_node_id()) + 1;
+    (combined_graph, path_key_ranges)
+}
 
-    // Sort ranges and create merged paths in the combined graph
-    for (path_key, ranges) in path_key_ranges.iter_mut() {
-        // Sort ranges by start position
-        ranges.sort_by_key(|r| (r.start, r.end));
+fn read_gfa(gfa_path: &str, parser: &GFAParser<usize, ()>) -> io::Result<GFA<usize, ()>> {
+    if gfa_path.ends_with(".gz") {
+        let file = std::fs::File::open(gfa_path).map_err(|e| {
+            io::Error::new(
+                e.kind(),
+                format!("Failed to open gzipped file '{}': {}", gfa_path, e)
+            )
+        })?;
+        
+        let (mut reader, _format) = niffler::get_reader(Box::new(file))
+            .map_err(|e| io::Error::new(io::ErrorKind::Other, e))?;
+        
+        let mut decompressed = Vec::new();
+        reader.read_to_end(&mut decompressed).map_err(|e| {
+            io::Error::new(
+                e.kind(),
+                format!("Failed to decompress file '{}': {}", gfa_path, e)
+            )
+        })?;
+        
+        // Create temporary file in the same directory as the input file for better performance
+        let parent_dir = Path::new(gfa_path).parent().unwrap_or(Path::new("."));
+        let temp_file = NamedTempFile::new_in(parent_dir).map_err(|e| {
+            io::Error::new(
+                e.kind(),
+                format!("Failed to create temporary file: {}", e)
+            )
+        })?;
+        
+        // Write decompressed data
+        temp_file.as_file().write_all(&decompressed).map_err(|e| {
+            io::Error::new(
+                e.kind(),
+                format!("Failed to write to temporary file: {}", e)
+            )
+        })?;
+        
+        // Parse GFA
+        parser.parse_file(temp_file.path().to_str().ok_or_else(|| {
+            io::Error::new(
+                io::ErrorKind::InvalidData,
+                "Invalid temporary file path"
+            )
+        })?).map_err(|e| {
+            io::Error::new(
+                io::ErrorKind::InvalidData,
+                format!("Failed to parse GFA: {}", e)
+            )
+        })
+    } else {
+        parser.parse_file(gfa_path).map_err(|e| {
+            io::Error::new(
+                io::ErrorKind::InvalidData,
+                format!("Failed to parse GFA file '{}': {}", gfa_path, e)
+            )
+        })
+    }
+}
 
-        if args.debug {
-            eprintln!("Processing path key '{}'", path_key);
-            for range in ranges.iter() {
-                eprintln!("  Range: start={}, end={}, num.steps={}, gfa_id={}", range.start, range.end, range.steps.len(), range.gfa_id);
+fn split_path_name(path_name: &str) -> Option<(String, usize, usize)> {
+    // Find the last ':' to split the range from the key
+    if let Some(last_colon) = path_name.rfind(':') {
+        let (key, range_str) = path_name.split_at(last_colon);
+        // Skip the ':' character
+        let range_str = &range_str[1..];
+        
+        // Find the '-' in the range portion
+        if let Some((start_str, end_str)) = range_str.split_once('-') {
+            if let (Ok(start), Ok(end)) = (start_str.parse(), end_str.parse()) {
+                return Some((key.to_string(), start, end));
             }
+        }
+    }
+    None
+}
 
-            eprintln!("Removing redundant ranges");
+fn sort_and_filter_ranges(
+    path_key: &String,
+    ranges: &mut Vec<RangeInfo>,
+    debug: bool
+) {
+    // Sort ranges by start position
+    ranges.sort_by_key(|r| (r.start, r.end));
+
+    if debug {
+        eprintln!("Processing path key '{}'", path_key);
+        for range in ranges.iter() {
+            eprintln!("  Range: start={}, end={}, num.steps={}, gfa_id={}", range.start, range.end, range.steps.len(), range.gfa_id);
         }
 
-        // Remove ranges that are contained within other ranges
-        let mut write_idx = 0;
-        for read_idx in 1..ranges.len() {
-            let (prev_start, prev_end) = (ranges[write_idx].start, ranges[write_idx].end);
-            let (curr_start, curr_end) = (ranges[read_idx].start, ranges[read_idx].end);
-        
-            if curr_start == prev_start && curr_end == prev_end {
-                // Skip duplicate range
-                // Current range is a duplicate of the previous range, skip it
-                if args.debug {
-                    eprintln!(
-                        "  Duplicate range detected: Range [start={}, end={}] is identical to previous range and will be removed.",
-                        curr_start, curr_end
-                    );
-                }
-                continue;
-            } else if curr_start >= prev_start && curr_end <= prev_end {
-                // Skip range that is fully contained within previous range
-                if args.debug {
-                    eprintln!(
-                        "  Contained range detected: Range [start={}, end={}] is fully contained within previous range [start={}, end={}] and will be removed.",
-                        curr_start, curr_end, prev_start, prev_end
-                    );
-                }
-                continue;
-            } else if prev_start >= curr_start && prev_end <= curr_end {
-                // Previous range is fully contained within current range
-                if args.debug {
-                    eprintln!(
-                        "  Containing range detected: Previous range [start={}, end={}] is fully contained within current range [start={}, end={}] and will be removed.",
-                        prev_start, prev_end, curr_start, curr_end
-                    );
-                }
-                ranges.swap(write_idx, read_idx);
-            } else if curr_start < prev_end {
-                // Handle overlapping ranges - check both previous and next ranges
-                let mut should_skip = false;
+        eprintln!("Removing redundant ranges");
+    }
+
+    // Remove ranges that are contained within other ranges
+    let mut write_idx = 0;
+    for read_idx in 1..ranges.len() {
+        let (prev_start, prev_end) = (ranges[write_idx].start, ranges[write_idx].end);
+        let (curr_start, curr_end) = (ranges[read_idx].start, ranges[read_idx].end);
+    
+        if curr_start == prev_start && curr_end == prev_end {
+            // Skip duplicate range
+            // Current range is a duplicate of the previous range, skip it
+            if debug {
+                eprintln!(
+                    "  Duplicate range detected: Range [start={}, end={}] is identical to previous range and will be removed.",
+                    curr_start, curr_end
+                );
+            }
+            continue;
+        } else if curr_start >= prev_start && curr_end <= prev_end {
+            // Skip range that is fully contained within previous range
+            if debug {
+                eprintln!(
+                    "  Contained range detected: Range [start={}, end={}] is fully contained within previous range [start={}, end={}] and will be removed.",
+                    curr_start, curr_end, prev_start, prev_end
+                );
+            }
+            continue;
+        } else if prev_start >= curr_start && prev_end <= curr_end {
+            // Previous range is fully contained within current range
+            if debug {
+                eprintln!(
+                    "  Containing range detected: Previous range [start={}, end={}] is fully contained within current range [start={}, end={}] and will be removed.",
+                    prev_start, prev_end, curr_start, curr_end
+                );
+            }
+            ranges.swap(write_idx, read_idx);
+        } else if curr_start < prev_end {
+            // Handle overlapping ranges - check both previous and next ranges
+            let mut should_skip = false;
+            
+            if read_idx < ranges.len() - 1 {
+                let next_start = ranges[read_idx + 1].start;
                 
-                if read_idx < ranges.len() - 1 {
-                    let next_start = ranges[read_idx + 1].start;
+                // Check if current range is significantly overlapped by both neighbors
+                if curr_start > prev_start && next_start < curr_end {
+                    let overlap_with_prev = prev_end - curr_start;
+                    let overlap_with_next = curr_end - next_start;
+                    let range_length = curr_end - curr_start;
                     
-                    // Check if current range is significantly overlapped by both neighbors
-                    if curr_start > prev_start && next_start < curr_end {
-                        let overlap_with_prev = prev_end - curr_start;
-                        let overlap_with_next = curr_end - next_start;
-                        let range_length = curr_end - curr_start;
-                        
-                        // Skip if the range is mostly covered by its neighbors
-                        if overlap_with_prev + overlap_with_next > range_length {
-                            should_skip = true;
-                        }
+                    // Skip if the range is mostly covered by its neighbors
+                    if overlap_with_prev + overlap_with_next > range_length {
+                        should_skip = true;
                     }
                 }
-                
-                if !should_skip {
-                    if args.debug {
-                        eprintln!(
-                            "  Overlapping range detected: Range [start={}, end={}] overlaps with previous range [start={}, end={}] and will be kept.",
-                            curr_start, curr_end, prev_start, prev_end
-                        );
-                    }
-                    write_idx += 1;
-                    if write_idx != read_idx {
-                        ranges.swap(write_idx, read_idx);
-                    }
+            }
+            
+            if !should_skip {
+                if debug {
+                    eprintln!(
+                        "  Overlapping range detected: Range [start={}, end={}] overlaps with previous range [start={}, end={}] and will be kept.",
+                        curr_start, curr_end, prev_start, prev_end
+                    );
                 }
-            } else {
-                // No overlap - keep both ranges
                 write_idx += 1;
                 if write_idx != read_idx {
                     ranges.swap(write_idx, read_idx);
                 }
             }
-        }
-        ranges.truncate(write_idx + 1);
-        
-        if args.debug {
-            eprintln!("Path key '{}' without redundancy", path_key);
-            for range in ranges.iter() {
-                eprintln!("  Range: start={}, end={}, num.steps={}, gfa_id={}", range.start, range.end, range.steps.len(), range.gfa_id);
+        } else {
+            // No overlap - keep both ranges
+            write_idx += 1;
+            if write_idx != read_idx {
+                ranges.swap(write_idx, read_idx);
             }
-
-            eprintln!("Trimming overlapping ranges");
         }
+    }
+    ranges.truncate(write_idx + 1);
+    
+    if debug {
+        eprintln!("Path key '{}' without redundancy", path_key);
+        for range in ranges.iter() {
+            eprintln!("  Range: start={}, end={}, num.steps={}, gfa_id={}", range.start, range.end, range.steps.len(), range.gfa_id);
+        }
+    }
+}
 
-        // Process overlaps
-        for i in 1..ranges.len() {
-            let (left, right) = ranges.split_at_mut(i);
-            let r1 = &mut left[left.len()-1];
-            let r2 = &mut right[0];
+fn trim_range_overlaps(
+    path_key: &String,
+    ranges: &mut Vec<RangeInfo>, 
+    combined_graph: &mut HashGraph,
+    debug: bool
+) {
+    // Trim overlaps
+    if debug {
+        eprintln!("Trimming overlapping ranges");
+    }
 
-            if r1.overlaps_with(&r2) {
-                // Calculate the overlap region - use max/min to get precise overlap bounds
-                let overlap_start = std::cmp::max(r1.start, r2.start);
-                let overlap_end = std::cmp::min(r1.end, r2.end);
+    let mut next_node_id_value = u64::from(combined_graph.max_node_id()) + 1;
 
-                if args.debug {
-                    eprintln!(
-                        "  Overlap detected: Range1 [start={}, end={}], Range2 [start={}, end={}], Overlap [start={}, end={}], Overlap size={}",
-                        r1.start, r1.end, r2.start, r2.end, overlap_start, overlap_end, overlap_end - overlap_start
-                    );
-                }
+    for i in 1..ranges.len() {
+        let (left, right) = ranges.split_at_mut(i);
+        let r1 = &mut left[left.len()-1];
+        let r2 = &mut right[0];
 
-                // Adjust r2 to remove the overlap
-                let mut steps_to_remove = Vec::new();
-                let mut step_to_split: Option<usize> = None;
-                for (idx, &step_end) in r2.step_ends.iter().enumerate() {
-                    let step_start = if idx == 0 { r2.start } else { r2.step_ends[idx - 1] };
-                    if step_end <= overlap_start {
-                        // if args.debug && r2.start == 11000 {
-                        //     eprintln!("    Step {} [start={}, end={}, len={}] before overlap", idx, step_start, step_end, step_end - step_start);
-                        // }
-                        continue;
-                    } else if step_start >= overlap_end {
-                        // if args.debug && r2.start == 11000 {
-                        //     eprintln!("    Step {} [start={}, end={}, len={}] after overlap", idx, step_start, step_end, step_end - step_start);
-                        // }
-                        break;
-                    } else if step_start >= overlap_start && step_end <= overlap_end {
-                        // if args.debug && r2.start == 11000 {
-                        //     eprintln!("    Step {} [start={}, end={}, len={}] fully overlaps", idx, step_start, step_end, step_end - step_start);
-                        // }
-                        steps_to_remove.push(idx);
-                    } else {
-                        // if args.debug && r2.start == 11000 {
-                        //     eprintln!("    Step {} [start={}, end={}, len={}] partially overlaps", idx, step_start, step_end, step_end - step_start);
-                        // }
-                        if step_to_split.is_some() {
-                            panic!("Error: More than one step is partially overlapping, which is not allowed.");
-                        }
-                        step_to_split = Some(idx);
+        if r1.overlaps_with(&r2) {
+            // Calculate the overlap region - use max/min to get precise overlap bounds
+            let overlap_start = std::cmp::max(r1.start, r2.start);
+            let overlap_end = std::cmp::min(r1.end, r2.end);
+
+            if debug {
+                eprintln!(
+                    "  Overlap detected: Range1 [start={}, end={}], Range2 [start={}, end={}], Overlap [start={}, end={}], Overlap size={}",
+                    r1.start, r1.end, r2.start, r2.end, overlap_start, overlap_end, overlap_end - overlap_start
+                );
+            }
+
+            // Adjust r2 to remove the overlap
+            let mut steps_to_remove = Vec::new();
+            let mut step_to_split: Option<usize> = None;
+            for (idx, &step_end) in r2.step_ends.iter().enumerate() {
+                let step_start = if idx == 0 { r2.start } else { r2.step_ends[idx - 1] };
+                if step_end <= overlap_start {
+                    // if debug && r2.start == 11000 {
+                    //     eprintln!("    Step {} [start={}, end={}, len={}] before overlap", idx, step_start, step_end, step_end - step_start);
+                    // }
+                    continue;
+                } else if step_start >= overlap_end {
+                    // if debug && r2.start == 11000 {
+                    //     eprintln!("    Step {} [start={}, end={}, len={}] after overlap", idx, step_start, step_end, step_end - step_start);
+                    // }
+                    break;
+                } else if step_start >= overlap_start && step_end <= overlap_end {
+                    // if debug && r2.start == 11000 {
+                    //     eprintln!("    Step {} [start={}, end={}, len={}] fully overlaps", idx, step_start, step_end, step_end - step_start);
+                    // }
+                    steps_to_remove.push(idx);
+                } else {
+                    // if debug && r2.start == 11000 {
+                    //     eprintln!("    Step {} [start={}, end={}, len={}] partially overlaps", idx, step_start, step_end, step_end - step_start);
+                    // }
+                    if step_to_split.is_some() {
+                        panic!("Error: More than one step is partially overlapping, which is not allowed.");
                     }
+                    step_to_split = Some(idx);
                 }
+            }
 
-                // if args.debug && r2.start == 11000 {
-                //     eprintln!("    Total steps to remove: {}", steps_to_remove.len());
-                //     eprintln!("    Step to split: {:?}", step_to_split);   
-                // }
-
-                // Initialize new vectors to store updated steps
-                let mut new_steps = Vec::with_capacity(r2.steps.len() / 2);
-                let mut new_step_ends = Vec::with_capacity(r2.steps.len() / 2);
-                let mut range_new_start = None;
-
-                // Iterate over the original steps
-                for idx in 0..r2.steps.len() {
-                    let step_handle = r2.steps[idx];
-                    let step_start = if idx == 0 { r2.start } else { r2.step_ends[idx - 1] };
-                    let step_end = r2.step_ends[idx];
-
-                    if steps_to_remove.contains(&idx) {
-                        // Skip steps to remove
-                        continue;
-                    } else if step_to_split == Some(idx) {
-                        // Split node for the single partially overlapping step
-                        let node_seq = combined_graph.sequence(step_handle).collect::<Vec<_>>();
-                        let overlap_within_step_start = std::cmp::max(step_start, overlap_start);
-                        let overlap_within_step_end = std::cmp::min(step_end, overlap_end);
-                        
-                        // Calculate offsets relative to the node sequence
-                        let node_len = node_seq.len();
-                        // Calculate offsets consistently regardless of strand
-                        let overlap_start_offset = (overlap_within_step_start - step_start).min(node_len);
-                        let overlap_end_offset = (overlap_within_step_end - step_start).min(node_len);
-
-                        if args.debug {
-                            eprintln!("    Splitting step {} [start={}, end={}, len={}] to remove overlap at [start={}, end={}]", 
-                                idx, step_start, step_end, step_end - step_start, overlap_within_step_start, overlap_within_step_end);
-                            eprintln!("    Overlap offsets: start={}, end={}", overlap_start_offset, overlap_end_offset);
-                        }
+            // if debug && r2.start == 11000 {
+            //     eprintln!("    Total steps to remove: {}", steps_to_remove.len());
+            //     eprintln!("    Step to split: {:?}", step_to_split);   
+            // }
 
-                        if step_start < overlap_start {
-                            if args.debug {
-                                eprintln!("    Adding left part of step [start={}, end={}]", step_start, overlap_within_step_start);
-                            }
-                            assert!(overlap_start_offset > 0);
+            // Initialize new vectors to store updated steps
+            let mut new_steps = Vec::with_capacity(r2.steps.len() / 2);
+            let mut new_step_ends = Vec::with_capacity(r2.steps.len() / 2);
+            let mut range_new_start = None;
 
-                            // Keep left part
-                            let new_seq = node_seq[0..overlap_start_offset].to_vec();
+            // Iterate over the original steps
+            for idx in 0..r2.steps.len() {
+                let step_handle = r2.steps[idx];
+                let step_start = if idx == 0 { r2.start } else { r2.step_ends[idx - 1] };
+                let step_end = r2.step_ends[idx];
 
-                            let node_id = NodeId::from(next_node_id_value);
-                            next_node_id_value += 1;
-                            let new_node = combined_graph.create_handle(&new_seq, node_id);
+                if steps_to_remove.contains(&idx) {
+                    // Skip steps to remove
+                    continue;
+                } else if step_to_split == Some(idx) {
+                    // Split node for the single partially overlapping step
+                    let node_seq = combined_graph.sequence(step_handle).collect::<Vec<_>>();
+                    let overlap_within_step_start = std::cmp::max(step_start, overlap_start);
+                    let overlap_within_step_end = std::cmp::min(step_end, overlap_end);
+                    
+                    // Calculate offsets relative to the node sequence
+                    let node_len = node_seq.len();
+                    // Calculate offsets consistently regardless of strand
+                    let overlap_start_offset = (overlap_within_step_start - step_start).min(node_len);
+                    let overlap_end_offset = (overlap_within_step_end - step_start).min(node_len);
+
+                    if debug {
+                        eprintln!("    Splitting step {} [start={}, end={}, len={}] to remove overlap at [start={}, end={}]", 
+                            idx, step_start, step_end, step_end - step_start, overlap_within_step_start, overlap_within_step_end);
+                        eprintln!("    Overlap offsets: start={}, end={}", overlap_start_offset, overlap_end_offset);
+                    }
 
-                            new_steps.push(new_node);
-                            new_step_ends.push(overlap_start);
-                            if range_new_start.is_none() {
-                                range_new_start = Some(step_start);
-                            }
-                        } else if step_end > overlap_end {
-                            if args.debug {
-                                eprintln!("    Adding right part of step [start={}, end={}]", overlap_within_step_end, step_end);
-                            }
-                            assert!(overlap_end_offset < node_len);
+                    if step_start < overlap_start {
+                        if debug {
+                            eprintln!("    Adding left part of step [start={}, end={}]", step_start, overlap_within_step_start);
+                        }
+                        assert!(overlap_start_offset > 0);
 
-                            // Keep right part
-                            let new_seq = node_seq[overlap_end_offset..].to_vec();
+                        // Keep left part
+                        let new_seq = node_seq[0..overlap_start_offset].to_vec();
 
-                            let node_id = NodeId::from(next_node_id_value);
-                            next_node_id_value += 1;
-                            let new_node = combined_graph.create_handle(&new_seq, node_id);
+                        let node_id = NodeId::from(next_node_id_value);
+                        next_node_id_value += 1;
+                        let new_node = combined_graph.create_handle(&new_seq, node_id);
 
-                            new_steps.push(new_node);
-                            new_step_ends.push(step_end);
-                            if range_new_start.is_none() {
-                                range_new_start = Some(overlap_end);
-                            }
+                        new_steps.push(new_node);
+                        new_step_ends.push(overlap_start);
+                        if range_new_start.is_none() {
+                            range_new_start = Some(step_start);
                         }
-                    } else {
-                        // Keep steps that are not to be removed or split
-                        new_steps.push(step_handle);
+                    } else if step_end > overlap_end {
+                        if debug {
+                            eprintln!("    Adding right part of step [start={}, end={}]", overlap_within_step_end, step_end);
+                        }
+                        assert!(overlap_end_offset < node_len);
+
+                        // Keep right part
+                        let new_seq = node_seq[overlap_end_offset..].to_vec();
+
+                        let node_id = NodeId::from(next_node_id_value);
+                        next_node_id_value += 1;
+                        let new_node = combined_graph.create_handle(&new_seq, node_id);
+
+                        new_steps.push(new_node);
                         new_step_ends.push(step_end);
                         if range_new_start.is_none() {
-                            range_new_start = Some(step_start);
+                            range_new_start = Some(overlap_end);
                         }
                     }
+                } else {
+                    // Keep steps that are not to be removed or split
+                    new_steps.push(step_handle);
+                    new_step_ends.push(step_end);
+                    if range_new_start.is_none() {
+                        range_new_start = Some(step_start);
+                    }
                 }
+            }
 
-                // Update r2 with the new steps
-                r2.steps = new_steps;
-                r2.step_ends = new_step_ends;
+            // Update r2 with the new steps
+            r2.steps = new_steps;
+            r2.step_ends = new_step_ends;
 
-                // Update edges for the modified steps
-                for idx in 0..r2.steps.len() {
-                    if idx > 0 {
-                        let prev_step = r2.steps[idx - 1];
-                        if !combined_graph.has_edge(prev_step, r2.steps[idx]) {
-                            combined_graph.create_edge(Edge(prev_step, r2.steps[idx]));
-                        }
+            // Update edges for the modified steps
+            for idx in 0..r2.steps.len() {
+                if idx > 0 {
+                    let prev_step = r2.steps[idx - 1];
+                    if !combined_graph.has_edge(prev_step, r2.steps[idx]) {
+                        combined_graph.create_edge(Edge(prev_step, r2.steps[idx]));
                     }
                 }
+            }
 
-                // Update r2.start and r2.end based on the new step positions
-                if !r2.step_ends.is_empty() {
-                    r2.start = range_new_start.unwrap();  // Safe because if we have positions, we must have set range_new_start
-                    r2.end = *r2.step_ends.last().unwrap();
-                } else {
-                    // If no steps remain, set start and end to overlap_end to effectively remove this range
-                    r2.start = overlap_end;
-                    r2.end = overlap_end;
-                }
+            // Update r2.start and r2.end based on the new step positions
+            if !r2.step_ends.is_empty() {
+                r2.start = range_new_start.unwrap();  // Safe because if we have positions, we must have set range_new_start
+                r2.end = *r2.step_ends.last().unwrap();
+            } else {
+                // If no steps remain, set start and end to overlap_end to effectively remove this range
+                r2.start = overlap_end;
+                r2.end = overlap_end;
+            }
 
-                if args.debug {
-                    eprintln!("    Updated overlaps: Range2 [start={}, end={}]", r2.start, r2.end);
-                }
+            if debug {
+                eprintln!("    Updated overlaps: Range2 [start={}, end={}]", r2.start, r2.end);
             }
         }
+    }
 
-        if args.debug {
-            eprintln!("Path key '{}' without overlaps", path_key);
-            for range in ranges.iter() {
-                eprintln!("  Range: start={}, end={}, num.steps={}, gfa_id={}", range.start, range.end, range.steps.len(), range.gfa_id);
-            }
+    if debug {
+        eprintln!("Path key '{}' without overlaps", path_key);
+        for range in ranges.iter() {
+            eprintln!("  Range: start={}, end={}, num.steps={}, gfa_id={}", range.start, range.end, range.steps.len(), range.gfa_id);
         }
+    }
+}
 
-        // Check for overlaps and contiguity
-        let mut all_contiguous = true;
-        
-        for window in ranges.windows(2) {
-            let r1 = &window[0];
-            let r2 = &window[1];
-
-            if r1.overlaps_with(&r2) {
-                if args.debug {
-                    eprintln!("Unresolved overlaps detected between ranges: [start={}, end={}] and [start={}, end={}]", 
-                    r1.start, r1.end, r2.start, r2.end);
-                }
-                panic!("Unresolved overlaps detected in path key '{}'", path_key);
-            }
-            if !r1.is_contiguous_with(&r2) {
-                all_contiguous = false;
+fn create_paths_from_ranges(
+    path_key: &str,
+    ranges: &[RangeInfo],
+    combined_graph: &mut HashGraph,
+    debug: bool
+) {
+    // Check for overlaps and contiguity
+    let mut all_contiguous = true;
+    
+    for window in ranges.windows(2) {
+        let r1 = &window[0];
+        let r2 = &window[1];
+
+        if r1.overlaps_with(&r2) {
+            if debug {
+                eprintln!("Unresolved overlaps detected between ranges: [start={}, end={}] and [start={}, end={}]", 
+                r1.start, r1.end, r2.start, r2.end);
             }
+            panic!("Unresolved overlaps detected in path key '{}'", path_key);
         }
-                
-        if !all_contiguous && args.debug {
-            let mut current_start = ranges[0].start;
-            let mut current_end = ranges[0].end;
+        if !r1.is_contiguous_with(&r2) {
+            all_contiguous = false;
+        }
+    }
             
-            for i in 1..ranges.len() {
-                if ranges[i-1].is_contiguous_with(&ranges[i]) {
-                    // Extend current merged range
-                    current_end = ranges[i].end;
+    if !all_contiguous && debug {
+        let mut current_start = ranges[0].start;
+        let mut current_end = ranges[0].end;
+        
+        for i in 1..ranges.len() {
+            if ranges[i-1].is_contiguous_with(&ranges[i]) {
+                // Extend current merged range
+                current_end = ranges[i].end;
+            } else {
+                // Print current merged range
+                eprintln!("    Merged range: start={}, end={}", 
+                    current_start, current_end);
+                
+                if !ranges[i-1].overlaps_with(&ranges[i]) {
+                    // Calculate and print gap
+                    let gap = ranges[i].start - current_end;
+                    eprintln!("      Gap to next range: {} positions", gap);
                 } else {
-                    // Print current merged range
-                    eprintln!("    Merged range: start={}, end={}", 
-                        current_start, current_end);
-                    
-                    if !ranges[i-1].overlaps_with(&ranges[i]) {
-                        // Calculate and print gap
-                        let gap = ranges[i].start - current_end;
-                        eprintln!("      Gap to next range: {} positions", gap);
-                    } else {
-                        // Calculate and print overlap
-                        let overlap = current_end - ranges[i].start;
-                        eprintln!("      Overlap with next range: {} positions", overlap);
-                    }
-
-                    // Start new merged range
-                    current_start = ranges[i].start;
-                    current_end = ranges[i].end;
+                    // Calculate and print overlap
+                    let overlap = current_end - ranges[i].start;
+                    eprintln!("      Overlap with next range: {} positions", overlap);
                 }
+
+                // Start new merged range
+                current_start = ranges[i].start;
+                current_end = ranges[i].end;
             }
-            
-            // Print final merged range
-            eprintln!("    Merged range: start={}, end={}", 
-                current_start, current_end);
         }
+        
+        // Print final merged range
+        eprintln!("    Merged range: start={}, end={}", 
+            current_start, current_end);
+    }
 
-        if all_contiguous {
-            // Create a single path with the original key
-            let path_id = combined_graph.create_path(path_key.as_bytes(), false).unwrap();
-            let mut prev_step = None;
-            
-            // Add all steps from all ranges
-            for range in ranges.iter() {
-                for step in &range.steps {
-                    combined_graph.path_append_step(path_id, *step);
-                    
-                    if let Some(prev) = prev_step {
-                        if !combined_graph.has_edge(prev, *step) {
-                            combined_graph.create_edge(Edge(prev, *step));
-                        }
+    if all_contiguous {
+        // Create a single path with the original key
+        let path_id = combined_graph.create_path(path_key.as_bytes(), false).unwrap();
+        let mut prev_step = None;
+        
+        // Add all steps from all ranges
+        for range in ranges.iter() {
+            for step in &range.steps {
+                combined_graph.path_append_step(path_id, *step);
+                
+                if let Some(prev) = prev_step {
+                    if !combined_graph.has_edge(prev, *step) {
+                        combined_graph.create_edge(Edge(prev, *step));
                     }
-                    prev_step = Some(*step);
                 }
+                prev_step = Some(*step);
             }
-        } else {
-            // Handle non-contiguous ranges by creating separate paths for each contiguous group
-            let mut current_range_idx = 0;
-            while current_range_idx < ranges.len() {
-                let start_range = &ranges[current_range_idx];
-                let mut steps = start_range.steps.clone();
-                let mut next_idx = current_range_idx + 1;
-                let mut end_range = start_range;
-                
-                // Merge contiguous ranges
-                while next_idx < ranges.len() && ranges[next_idx - 1].is_contiguous_with(&ranges[next_idx]) {
-                    steps.extend(ranges[next_idx].steps.clone());
-                    end_range = &ranges[next_idx];
-                    next_idx += 1;
-                }
-                
-                // Create path name with range information
-                let path_name = format!("{}:{}-{}", path_key, start_range.start, end_range.end);
-                let path_id = combined_graph.create_path(path_name.as_bytes(), false).unwrap();
+        }
+    } else {
+        // Handle non-contiguous ranges by creating separate paths for each contiguous group
+        let mut current_range_idx = 0;
+        while current_range_idx < ranges.len() {
+            let start_range = &ranges[current_range_idx];
+            let mut steps = start_range.steps.clone();
+            let mut next_idx = current_range_idx + 1;
+            let mut end_range = start_range;
+            
+            // Merge contiguous ranges
+            while next_idx < ranges.len() && ranges[next_idx - 1].is_contiguous_with(&ranges[next_idx]) {
+                steps.extend(ranges[next_idx].steps.clone());
+                end_range = &ranges[next_idx];
+                next_idx += 1;
+            }
+            
+            // Create path name with range information
+            let path_name = format!("{}:{}-{}", path_key, start_range.start, end_range.end);
+            let path_id = combined_graph.create_path(path_name.as_bytes(), false).unwrap();
+            
+            // Add steps to the path
+            let mut prev_step = None;
+            for step in steps {
+                combined_graph.path_append_step(path_id, step);
                 
-                // Add steps to the path
-                let mut prev_step = None;
-                for step in steps {
-                    combined_graph.path_append_step(path_id, step);
-                    
-                    if let Some(prev) = prev_step {
-                        if !combined_graph.has_edge(prev, step) {
-                            combined_graph.create_edge(Edge(prev, step));
-                        }
+                if let Some(prev) = prev_step {
+                    if !combined_graph.has_edge(prev, step) {
+                        combined_graph.create_edge(Edge(prev, step));
                     }
-                    prev_step = Some(step);
                 }
-                
-                current_range_idx = next_idx;
+                prev_step = Some(step);
             }
+            
+            current_range_idx = next_idx;
         }
     }
-
-    if args.debug {
-        eprintln!("Total paths created: {}", GraphPaths::path_count(&combined_graph));
-
-        eprintln!("Total nodes before filtering: {}", combined_graph.node_count());
-    }
-    
-    let nodes_to_skip = mark_nodes_for_removal(&combined_graph);
-    
-    if args.debug {
-        eprintln!("Nodes to be filtered out: {}", nodes_to_skip.count_ones());
-    }
-
-    // Write the combined graph to GFA file, skipping unused nodes
-    match write_graph_to_gfa(&combined_graph, &args.output, &nodes_to_skip) {
-        Ok(_) => if args.debug {eprintln!("Successfully wrote combined graph to {}", args.output)},
-        Err(e) => eprintln!("Error writing GFA file: {}", e),
-    }
 }
 
 fn mark_nodes_for_removal(graph: &HashGraph) -> BitVec {
@@ -684,21 +664,76 @@ fn mark_nodes_for_removal(graph: &HashGraph) -> BitVec {
     nodes_to_remove
 }
 
-fn split_path_name(path_name: &str) -> Option<(String, usize, usize)> {
-    // Find the last ':' to split the range from the key
-    if let Some(last_colon) = path_name.rfind(':') {
-        let (key, range_str) = path_name.split_at(last_colon);
-        // Skip the ':' character
-        let range_str = &range_str[1..];
+fn write_graph_to_gfa(graph: &HashGraph, output_path: &str, nodes_to_remove: &BitVec) -> std::io::Result<()> {
+    let mut file = File::create(output_path)?;
+    
+    // Write GFA version
+    writeln!(file, "H\tVN:Z:1.0")?;
+    
+    // Collect nodes, excluding marked nodes
+    let nodes: Vec<Handle> = graph.handles()
+        .filter(|handle| !nodes_to_remove[u64::from(handle.id()) as usize])
+        .collect();
+    
+    // Create mapping from old IDs to new sequential IDs
+    // We allocate a vector with capacity for the max node ID + 1
+    let max_id = graph.node_count();
+    let mut id_mapping = vec![0; max_id + 1];
+    for (new_id, handle) in nodes.iter().enumerate() {
+        id_mapping[u64::from(handle.id()) as usize] = new_id + 1; // +1 to start from 1
+    }
+    
+    // Write nodes with new IDs
+    for handle in &nodes {
+        let sequence = graph.sequence(*handle).collect::<Vec<_>>();
+        let sequence_str = String::from_utf8(sequence).unwrap_or_else(|_| String::from("N"));
+        let node_id = id_mapping[u64::from(handle.id()) as usize];
+        writeln!(file, "S\t{}\t{}", node_id, sequence_str)?;
+    }
+    
+    // Collect and sort edges, excluding those connected to marked nodes
+    let edges: Vec<Edge> = graph.edges()
+        .filter(|edge| {
+            !nodes_to_remove[u64::from(edge.0.id()) as usize] && 
+            !nodes_to_remove[u64::from(edge.1.id()) as usize]
+        })
+        .collect();
+    
+    // Write edges with new IDs
+    for edge in edges {
+        let from_id = id_mapping[u64::from(edge.0.id()) as usize];
+        let to_id = id_mapping[u64::from(edge.1.id()) as usize];
+        let from_orient = if edge.0.is_reverse() { "-" } else { "+" };
+        let to_orient = if edge.1.is_reverse() { "-" } else { "+" };
+        writeln!(file, "L\t{}\t{}\t{}\t{}\t0M", from_id, from_orient, to_id, to_orient)?;
+    }
+
+    // Collect and sort paths directly with references to avoid multiple lookups
+    let mut path_entries: Vec<_> = graph.paths.iter().collect();
+    path_entries.sort_by_key(|(_, path_ref)| {
+        // Get name directly from path struct instead of doing lookups
+        path_ref.name.as_slice()
+    });
+
+    // Write paths with new IDs
+    for (_path_id, path_ref) in path_entries {
+        // Pre-allocate vector with capacity
+        let mut path_elements = Vec::with_capacity(path_ref.nodes.len());
         
-        // Find the '-' in the range portion
-        if let Some((start_str, end_str)) = range_str.split_once('-') {
-            if let (Ok(start), Ok(end)) = (start_str.parse(), end_str.parse()) {
-                return Some((key.to_string(), start, end));
-            }
+        // Process nodes directly from path reference
+        for handle in &path_ref.nodes {
+            let node_id = id_mapping[u64::from(handle.id()) as usize];
+            let orient = if handle.is_reverse() { "-" } else { "+" };
+            path_elements.push(format!("{}{}", node_id, orient));
+        }
+        
+        if !path_elements.is_empty() {
+            let path_name = String::from_utf8_lossy(&path_ref.name);
+            writeln!(file, "P\t{}\t{}\t*", path_name, path_elements.join(","))?;
         }
     }
-    None
+        
+    Ok(())
 }
 
 #[cfg(test)]

From 9e2de3e7f1bb4a3093ab19d33e22836660ce51b0 Mon Sep 17 00:00:00 2001
From: AndreaGuarracino <andreaguarracino@outlook.com>
Date: Sat, 14 Dec 2024 19:24:27 -0600
Subject: [PATCH 37/45] clippy is happy

---
 src/main.rs | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/main.rs b/src/main.rs
index bdbeefd..d9ee1f7 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -145,7 +145,7 @@ fn process_gfa_files(
                     // Record the end position of this step
                     let node_seq = block_graph.sequence(*step).collect::<Vec<_>>();
                     let node_length = node_seq.len();
-                    cumulative_pos = cumulative_pos + node_length;
+                    cumulative_pos += node_length;
                     step_ends.push(cumulative_pos);
                 }
 
@@ -168,7 +168,7 @@ fn process_gfa_files(
 
     if debug {
         eprintln!("Collected {} nodes, {} edges, {} paths, and {} path ranges from all GFA files",
-            combined_graph.node_count(), combined_graph.edge_count(), path_key_ranges.len(), path_key_ranges.iter().map(|(_, ranges)| ranges.len()).sum::<usize>());
+            combined_graph.node_count(), combined_graph.edge_count(), path_key_ranges.len(), path_key_ranges.values().map(|ranges| ranges.len()).sum::<usize>());
     }
 
     (combined_graph, path_key_ranges)
@@ -251,7 +251,7 @@ fn split_path_name(path_name: &str) -> Option<(String, usize, usize)> {
 }
 
 fn sort_and_filter_ranges(
-    path_key: &String,
+    path_key: &str,
     ranges: &mut Vec<RangeInfo>,
     debug: bool
 ) {
@@ -352,8 +352,8 @@ fn sort_and_filter_ranges(
 }
 
 fn trim_range_overlaps(
-    path_key: &String,
-    ranges: &mut Vec<RangeInfo>, 
+    path_key: &str,
+    ranges: &mut [RangeInfo],
     combined_graph: &mut HashGraph,
     debug: bool
 ) {
@@ -369,7 +369,7 @@ fn trim_range_overlaps(
         let r1 = &mut left[left.len()-1];
         let r2 = &mut right[0];
 
-        if r1.overlaps_with(&r2) {
+        if r1.overlaps_with(r2) {
             // Calculate the overlap region - use max/min to get precise overlap bounds
             let overlap_start = std::cmp::max(r1.start, r2.start);
             let overlap_end = std::cmp::min(r1.end, r2.end);
@@ -547,14 +547,14 @@ fn create_paths_from_ranges(
         let r1 = &window[0];
         let r2 = &window[1];
 
-        if r1.overlaps_with(&r2) {
+        if r1.overlaps_with(r2) {
             if debug {
                 eprintln!("Unresolved overlaps detected between ranges: [start={}, end={}] and [start={}, end={}]", 
                 r1.start, r1.end, r2.start, r2.end);
             }
             panic!("Unresolved overlaps detected in path key '{}'", path_key);
         }
-        if !r1.is_contiguous_with(&r2) {
+        if !r1.is_contiguous_with(r2) {
             all_contiguous = false;
         }
     }

From 64372b0ca04c7154a1e7a04468014ae0d4e92af9 Mon Sep 17 00:00:00 2001
From: AndreaGuarracino <andreaguarracino@outlook.com>
Date: Sat, 14 Dec 2024 22:33:00 -0600
Subject: [PATCH 38/45] logging

---
 Cargo.lock  |  33 ++++++++++++++-
 Cargo.toml  |   2 +
 src/main.rs | 115 +++++++++++++++++++++++++++-------------------------
 3 files changed, 93 insertions(+), 57 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index ea3e602..8f49aaa 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1,6 +1,6 @@
 # This file is automatically @generated by Cargo.
 # It is not intended for manual editing.
-version = 3
+version = 4
 
 [[package]]
 name = "adler2"
@@ -311,6 +311,29 @@ version = "1.13.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0"
 
+[[package]]
+name = "env_filter"
+version = "0.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4f2c92ceda6ceec50f43169f9ee8424fe2db276791afde7b2cd8bc084cb376ab"
+dependencies = [
+ "log",
+ "regex",
+]
+
+[[package]]
+name = "env_logger"
+version = "0.11.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e13fa619b91fb2381732789fc5de83b45675e882f66623b7d8cb4f643017018d"
+dependencies = [
+ "anstream",
+ "anstyle",
+ "env_filter",
+ "humantime",
+ "log",
+]
+
 [[package]]
 name = "errno"
 version = "0.3.10"
@@ -372,8 +395,10 @@ dependencies = [
  "bitvec",
  "bstr 1.11.1",
  "clap",
+ "env_logger",
  "gfa",
  "handlegraph",
+ "log",
  "niffler",
  "rustc-hash",
  "tempfile",
@@ -401,6 +426,12 @@ version = "0.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
 
+[[package]]
+name = "humantime"
+version = "2.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4"
+
 [[package]]
 name = "is_terminal_polyfill"
 version = "1.70.1"
diff --git a/Cargo.toml b/Cargo.toml
index 9110992..756012d 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -14,3 +14,5 @@ gfa = "0.10.1"
 bitvec = "1.0.1"
 tempfile = "3.14.0"
 rustc-hash = "2.1.0"
+log = "0.4.22"
+env_logger = "0.11.5"
diff --git a/src/main.rs b/src/main.rs
index d9ee1f7..098790e 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -18,6 +18,7 @@ use handlegraph::{
 use gfa::{gfa::GFA, parser::GFAParser};
 use bitvec::{bitvec, prelude::BitVec};
 use tempfile::NamedTempFile;
+use log::{debug, info, warn, error};
 
 #[derive(Parser, Debug)]
 #[clap(author, version, about)]
@@ -30,40 +31,45 @@ struct Args {
     #[clap(short, long, value_parser)]
     output: String,
 
-    /// Enable debug output
-    #[clap(short, long)]
-    debug: bool,
+    /// Verbosity level (0 = error, 1 = info, 2 = debug)
+    #[clap(short, long, default_value = "0")]
+    verbose: u8,
 }
 
 fn main() {
     let args = Args::parse();
 
+    // Initialize logger based on verbosity
+    env_logger::Builder::new()
+    .filter_level(match args.verbose {
+        0 => log::LevelFilter::Error,
+        1 => log::LevelFilter::Info,
+        _ => log::LevelFilter::Debug,
+    })
+    .init();
+
     // Create a single combined graph without paths and a map of path key to ranges
-    let (mut combined_graph, mut path_key_ranges) = process_gfa_files(&args.gfa_list, args.debug);
+    let (mut combined_graph, mut path_key_ranges) = process_gfa_files(&args.gfa_list);
 
     // Sort ranges and create merged paths in the combined graph
     for (path_key, ranges) in path_key_ranges.iter_mut() {
-        sort_and_filter_ranges(path_key, ranges, args.debug);
-        trim_range_overlaps(path_key, ranges, &mut combined_graph, args.debug);
-        create_paths_from_ranges(path_key, ranges, &mut combined_graph, args.debug);
+        info!("Processing path key '{}' with {} ranges", path_key, ranges.len());
+        sort_and_filter_ranges(path_key, ranges, args.verbose > 1);
+        trim_range_overlaps(path_key, ranges, &mut combined_graph, args.verbose > 1);
+        create_paths_from_ranges(path_key, ranges, &mut combined_graph, args.verbose > 1);
     }
+    info!("Total paths created: {}", GraphPaths::path_count(&combined_graph));
 
-    if args.debug {
-        eprintln!("Total paths created: {}", GraphPaths::path_count(&combined_graph));
-
-        eprintln!("Total nodes before filtering: {}", combined_graph.node_count());
-    }
+    info!("Total nodes before filtering: {}", combined_graph.node_count());
     
     let nodes_to_skip = mark_nodes_for_removal(&combined_graph);
     
-    if args.debug {
-        eprintln!("Nodes to be filtered out: {}", nodes_to_skip.count_ones());
-    }
-
+    info!("Total nodes to be filtered out: {}", nodes_to_skip.count_ones());
+    
     // Write the combined graph to GFA file, skipping unused nodes
     match write_graph_to_gfa(&combined_graph, &args.output, &nodes_to_skip) {
-        Ok(_) => if args.debug {eprintln!("Successfully wrote combined graph to {}", args.output)},
-        Err(e) => eprintln!("Error writing GFA file: {}", e),
+        Ok(_) => info!("Successfully wrote the combined graph to {}", args.output),
+        Err(e) => error!("Error writing the GFA file: {}", e),
     }
 }
 
@@ -91,12 +97,13 @@ impl RangeInfo {
 
 fn process_gfa_files(
     gfa_list: &[String],
-    debug: bool,
 ) -> (HashGraph, FxHashMap<String, Vec<RangeInfo>>) {
     let mut combined_graph = HashGraph::new();
     let mut path_key_ranges: FxHashMap<String, Vec<RangeInfo>> = FxHashMap::default();
     let mut id_translations = Vec::new();
 
+    info!("Processing {} GFA files", gfa_list.len());
+
     // Process each GFA file
     let parser = GFAParser::new();
     for (gfa_id, gfa_path) in gfa_list.iter().enumerate() {
@@ -123,9 +130,7 @@ fn process_gfa_files(
             combined_graph.create_edge(translated_edge);
         }
         
-        if debug {
-            eprintln!("GFA file {} ({}) processed: Added {} nodes and {} edges", gfa_id, gfa_path, block_graph.node_count(), block_graph.edge_count());
-        }
+        debug!("GFA file {} ({}) processed: Added {} nodes and {} edges", gfa_id, gfa_path, block_graph.node_count(), block_graph.edge_count());
 
         // Process paths and collect ranges with their steps
         for (_path_id, path_ref) in block_graph.paths.iter() {
@@ -159,17 +164,15 @@ fn process_gfa_files(
                         steps: translated_steps,
                         step_ends,
                     });
-                } else if debug {
-                    eprintln!("  Warning: Path '{}' has no steps", path_name);
+                } else {
+                    warn!("  Path '{}' has no steps", path_name);
                 }
             }
         }
     }
 
-    if debug {
-        eprintln!("Collected {} nodes, {} edges, {} paths, and {} path ranges from all GFA files",
+    info!("Collected {} nodes, {} edges, {} paths, and {} path ranges from all GFA files",
             combined_graph.node_count(), combined_graph.edge_count(), path_key_ranges.len(), path_key_ranges.values().map(|ranges| ranges.len()).sum::<usize>());
-    }
 
     (combined_graph, path_key_ranges)
 }
@@ -259,12 +262,12 @@ fn sort_and_filter_ranges(
     ranges.sort_by_key(|r| (r.start, r.end));
 
     if debug {
-        eprintln!("Processing path key '{}'", path_key);
+        debug!("Processing path key '{}'", path_key);
         for range in ranges.iter() {
-            eprintln!("  Range: start={}, end={}, num.steps={}, gfa_id={}", range.start, range.end, range.steps.len(), range.gfa_id);
+            debug!("  Range: start={}, end={}, num.steps={}, gfa_id={}", range.start, range.end, range.steps.len(), range.gfa_id);
         }
 
-        eprintln!("Removing redundant ranges");
+        debug!("Removing redundant ranges");
     }
 
     // Remove ranges that are contained within other ranges
@@ -277,7 +280,7 @@ fn sort_and_filter_ranges(
             // Skip duplicate range
             // Current range is a duplicate of the previous range, skip it
             if debug {
-                eprintln!(
+                debug!(
                     "  Duplicate range detected: Range [start={}, end={}] is identical to previous range and will be removed.",
                     curr_start, curr_end
                 );
@@ -286,7 +289,7 @@ fn sort_and_filter_ranges(
         } else if curr_start >= prev_start && curr_end <= prev_end {
             // Skip range that is fully contained within previous range
             if debug {
-                eprintln!(
+                debug!(
                     "  Contained range detected: Range [start={}, end={}] is fully contained within previous range [start={}, end={}] and will be removed.",
                     curr_start, curr_end, prev_start, prev_end
                 );
@@ -295,7 +298,7 @@ fn sort_and_filter_ranges(
         } else if prev_start >= curr_start && prev_end <= curr_end {
             // Previous range is fully contained within current range
             if debug {
-                eprintln!(
+                debug!(
                     "  Containing range detected: Previous range [start={}, end={}] is fully contained within current range [start={}, end={}] and will be removed.",
                     prev_start, prev_end, curr_start, curr_end
                 );
@@ -323,7 +326,7 @@ fn sort_and_filter_ranges(
             
             if !should_skip {
                 if debug {
-                    eprintln!(
+                    debug!(
                         "  Overlapping range detected: Range [start={}, end={}] overlaps with previous range [start={}, end={}] and will be kept.",
                         curr_start, curr_end, prev_start, prev_end
                     );
@@ -344,9 +347,9 @@ fn sort_and_filter_ranges(
     ranges.truncate(write_idx + 1);
     
     if debug {
-        eprintln!("Path key '{}' without redundancy", path_key);
+        debug!("Path key '{}' without redundancy", path_key);
         for range in ranges.iter() {
-            eprintln!("  Range: start={}, end={}, num.steps={}, gfa_id={}", range.start, range.end, range.steps.len(), range.gfa_id);
+            debug!("  Range: start={}, end={}, num.steps={}, gfa_id={}", range.start, range.end, range.steps.len(), range.gfa_id);
         }
     }
 }
@@ -359,7 +362,7 @@ fn trim_range_overlaps(
 ) {
     // Trim overlaps
     if debug {
-        eprintln!("Trimming overlapping ranges");
+        debug!("Trimming overlapping ranges");
     }
 
     let mut next_node_id_value = u64::from(combined_graph.max_node_id()) + 1;
@@ -375,7 +378,7 @@ fn trim_range_overlaps(
             let overlap_end = std::cmp::min(r1.end, r2.end);
 
             if debug {
-                eprintln!(
+                debug!(
                     "  Overlap detected: Range1 [start={}, end={}], Range2 [start={}, end={}], Overlap [start={}, end={}], Overlap size={}",
                     r1.start, r1.end, r2.start, r2.end, overlap_start, overlap_end, overlap_end - overlap_start
                 );
@@ -388,22 +391,22 @@ fn trim_range_overlaps(
                 let step_start = if idx == 0 { r2.start } else { r2.step_ends[idx - 1] };
                 if step_end <= overlap_start {
                     // if debug && r2.start == 11000 {
-                    //     eprintln!("    Step {} [start={}, end={}, len={}] before overlap", idx, step_start, step_end, step_end - step_start);
+                    //     debug!("    Step {} [start={}, end={}, len={}] before overlap", idx, step_start, step_end, step_end - step_start);
                     // }
                     continue;
                 } else if step_start >= overlap_end {
                     // if debug && r2.start == 11000 {
-                    //     eprintln!("    Step {} [start={}, end={}, len={}] after overlap", idx, step_start, step_end, step_end - step_start);
+                    //     debug!("    Step {} [start={}, end={}, len={}] after overlap", idx, step_start, step_end, step_end - step_start);
                     // }
                     break;
                 } else if step_start >= overlap_start && step_end <= overlap_end {
                     // if debug && r2.start == 11000 {
-                    //     eprintln!("    Step {} [start={}, end={}, len={}] fully overlaps", idx, step_start, step_end, step_end - step_start);
+                    //     debug!("    Step {} [start={}, end={}, len={}] fully overlaps", idx, step_start, step_end, step_end - step_start);
                     // }
                     steps_to_remove.push(idx);
                 } else {
                     // if debug && r2.start == 11000 {
-                    //     eprintln!("    Step {} [start={}, end={}, len={}] partially overlaps", idx, step_start, step_end, step_end - step_start);
+                    //     debug!("    Step {} [start={}, end={}, len={}] partially overlaps", idx, step_start, step_end, step_end - step_start);
                     // }
                     if step_to_split.is_some() {
                         panic!("Error: More than one step is partially overlapping, which is not allowed.");
@@ -413,8 +416,8 @@ fn trim_range_overlaps(
             }
 
             // if debug && r2.start == 11000 {
-            //     eprintln!("    Total steps to remove: {}", steps_to_remove.len());
-            //     eprintln!("    Step to split: {:?}", step_to_split);   
+            //     debug!("    Total steps to remove: {}", steps_to_remove.len());
+            //     debug!("    Step to split: {:?}", step_to_split);   
             // }
 
             // Initialize new vectors to store updated steps
@@ -444,14 +447,14 @@ fn trim_range_overlaps(
                     let overlap_end_offset = (overlap_within_step_end - step_start).min(node_len);
 
                     if debug {
-                        eprintln!("    Splitting step {} [start={}, end={}, len={}] to remove overlap at [start={}, end={}]", 
+                        debug!("    Splitting step {} [start={}, end={}, len={}] to remove overlap at [start={}, end={}]", 
                             idx, step_start, step_end, step_end - step_start, overlap_within_step_start, overlap_within_step_end);
-                        eprintln!("    Overlap offsets: start={}, end={}", overlap_start_offset, overlap_end_offset);
+                            debug!("    Overlap offsets: start={}, end={}", overlap_start_offset, overlap_end_offset);
                     }
 
                     if step_start < overlap_start {
                         if debug {
-                            eprintln!("    Adding left part of step [start={}, end={}]", step_start, overlap_within_step_start);
+                            debug!("    Adding left part of step [start={}, end={}]", step_start, overlap_within_step_start);
                         }
                         assert!(overlap_start_offset > 0);
 
@@ -469,7 +472,7 @@ fn trim_range_overlaps(
                         }
                     } else if step_end > overlap_end {
                         if debug {
-                            eprintln!("    Adding right part of step [start={}, end={}]", overlap_within_step_end, step_end);
+                            debug!("    Adding right part of step [start={}, end={}]", overlap_within_step_end, step_end);
                         }
                         assert!(overlap_end_offset < node_len);
 
@@ -521,15 +524,15 @@ fn trim_range_overlaps(
             }
 
             if debug {
-                eprintln!("    Updated overlaps: Range2 [start={}, end={}]", r2.start, r2.end);
+                debug!("    Updated overlaps: Range2 [start={}, end={}]", r2.start, r2.end);
             }
         }
     }
 
     if debug {
-        eprintln!("Path key '{}' without overlaps", path_key);
+        debug!("Path key '{}' without overlaps", path_key);
         for range in ranges.iter() {
-            eprintln!("  Range: start={}, end={}, num.steps={}, gfa_id={}", range.start, range.end, range.steps.len(), range.gfa_id);
+            debug!("  Range: start={}, end={}, num.steps={}, gfa_id={}", range.start, range.end, range.steps.len(), range.gfa_id);
         }
     }
 }
@@ -549,7 +552,7 @@ fn create_paths_from_ranges(
 
         if r1.overlaps_with(r2) {
             if debug {
-                eprintln!("Unresolved overlaps detected between ranges: [start={}, end={}] and [start={}, end={}]", 
+                debug!("Unresolved overlaps detected between ranges: [start={}, end={}] and [start={}, end={}]", 
                 r1.start, r1.end, r2.start, r2.end);
             }
             panic!("Unresolved overlaps detected in path key '{}'", path_key);
@@ -569,17 +572,17 @@ fn create_paths_from_ranges(
                 current_end = ranges[i].end;
             } else {
                 // Print current merged range
-                eprintln!("    Merged range: start={}, end={}", 
+                debug!("    Merged range: start={}, end={}", 
                     current_start, current_end);
                 
                 if !ranges[i-1].overlaps_with(&ranges[i]) {
                     // Calculate and print gap
                     let gap = ranges[i].start - current_end;
-                    eprintln!("      Gap to next range: {} positions", gap);
+                    debug!("      Gap to next range: {} positions", gap);
                 } else {
                     // Calculate and print overlap
                     let overlap = current_end - ranges[i].start;
-                    eprintln!("      Overlap with next range: {} positions", overlap);
+                    debug!("      Overlap with next range: {} positions", overlap);
                 }
 
                 // Start new merged range
@@ -589,7 +592,7 @@ fn create_paths_from_ranges(
         }
         
         // Print final merged range
-        eprintln!("    Merged range: start={}, end={}", 
+        debug!("    Merged range: start={}, end={}", 
             current_start, current_end);
     }
 

From 5edcf0ce18e6c4b718ce218e87497eac91950835 Mon Sep 17 00:00:00 2001
From: AndreaGuarracino <andreaguarracino@outlook.com>
Date: Sun, 15 Dec 2024 10:03:11 -0600
Subject: [PATCH 39/45] more log

---
 Cargo.lock  | 12 ++++++------
 src/main.rs | 23 ++++++++++++-----------
 2 files changed, 18 insertions(+), 17 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 8f49aaa..80c0fc3 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -273,18 +273,18 @@ dependencies = [
 
 [[package]]
 name = "crossbeam-channel"
-version = "0.5.13"
+version = "0.5.14"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "33480d6946193aa8033910124896ca395333cae7e2d1113d1fef6c3272217df2"
+checksum = "06ba6d68e24814cb8de6bb986db8222d3a027d15872cabc0d18817bc3c0e4471"
 dependencies = [
  "crossbeam-utils",
 ]
 
 [[package]]
 name = "crossbeam-deque"
-version = "0.8.5"
+version = "0.8.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "613f8cc01fe9cf1a3eb3d7f488fd2fa8388403e97039e2f73692932e291a770d"
+checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51"
 dependencies = [
  "crossbeam-epoch",
  "crossbeam-utils",
@@ -301,9 +301,9 @@ dependencies = [
 
 [[package]]
 name = "crossbeam-utils"
-version = "0.8.20"
+version = "0.8.21"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "22ec99545bb0ed0ea7bb9b8e1e9122ea386ff8a48c0922e43f36d45ab09e0e80"
+checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28"
 
 [[package]]
 name = "either"
diff --git a/src/main.rs b/src/main.rs
index 098790e..215e462 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -51,22 +51,23 @@ fn main() {
     // Create a single combined graph without paths and a map of path key to ranges
     let (mut combined_graph, mut path_key_ranges) = process_gfa_files(&args.gfa_list);
 
-    // Sort ranges and create merged paths in the combined graph
+    // Sort, deduplicate, and trim path ranges and create merged paths in the combined graph
+    info!("Processing {} path ranges", path_key_ranges.values().map(|ranges| ranges.len()).sum::<usize>());
     for (path_key, ranges) in path_key_ranges.iter_mut() {
-        info!("Processing path key '{}' with {} ranges", path_key, ranges.len());
+        debug!("Processing path key '{}' with {} ranges", path_key, ranges.len());
+        
         sort_and_filter_ranges(path_key, ranges, args.verbose > 1);
         trim_range_overlaps(path_key, ranges, &mut combined_graph, args.verbose > 1);
         create_paths_from_ranges(path_key, ranges, &mut combined_graph, args.verbose > 1);
     }
-    info!("Total paths created: {}", GraphPaths::path_count(&combined_graph));
+    info!("Created {} nodes, {} edges, and {} paths in the combined graph",
+        combined_graph.node_count(), combined_graph.edge_count(), combined_graph.path_count());
 
-    info!("Total nodes before filtering: {}", combined_graph.node_count());
-    
-    let nodes_to_skip = mark_nodes_for_removal(&combined_graph);
-    
-    info!("Total nodes to be filtered out: {}", nodes_to_skip.count_ones());
+    info!("Marking unused nodes for removal");
+    let nodes_to_skip = mark_nodes_for_removal(&combined_graph);    
+    info!("Total unused nodes to remove: {}", nodes_to_skip.count_ones());
     
-    // Write the combined graph to GFA file, skipping unused nodes
+    info!("Writing the combined graph by removing unused nodes and compacting node IDs");
     match write_graph_to_gfa(&combined_graph, &args.output, &nodes_to_skip) {
         Ok(_) => info!("Successfully wrote the combined graph to {}", args.output),
         Err(e) => error!("Error writing the GFA file: {}", e),
@@ -171,8 +172,8 @@ fn process_gfa_files(
         }
     }
 
-    info!("Collected {} nodes, {} edges, {} paths, and {} path ranges from all GFA files",
-            combined_graph.node_count(), combined_graph.edge_count(), path_key_ranges.len(), path_key_ranges.values().map(|ranges| ranges.len()).sum::<usize>());
+    info!("Collected {} nodes, {} edges, and {} path keys",
+        combined_graph.node_count(), combined_graph.edge_count(), path_key_ranges.len());
 
     (combined_graph, path_key_ranges)
 }

From 4845e06394288d784e4e9b052cfd43792bb67c7f Mon Sep 17 00:00:00 2001
From: AndreaGuarracino <andreaguarracino@outlook.com>
Date: Sun, 15 Dec 2024 11:54:10 -0600
Subject: [PATCH 40/45] tweaks

---
 src/main.rs | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/src/main.rs b/src/main.rs
index 215e462..e58c0e0 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -49,10 +49,10 @@ fn main() {
     .init();
 
     // Create a single combined graph without paths and a map of path key to ranges
-    let (mut combined_graph, mut path_key_ranges) = process_gfa_files(&args.gfa_list);
+    let (mut combined_graph, mut path_key_ranges) = read_gfa_files(&args.gfa_list);
 
     // Sort, deduplicate, and trim path ranges and create merged paths in the combined graph
-    info!("Processing {} path ranges", path_key_ranges.values().map(|ranges| ranges.len()).sum::<usize>());
+    info!("Sorting, deduplicating, and trimming {} path ranges", path_key_ranges.values().map(|ranges| ranges.len()).sum::<usize>());
     for (path_key, ranges) in path_key_ranges.iter_mut() {
         debug!("Processing path key '{}' with {} ranges", path_key, ranges.len());
         
@@ -60,15 +60,11 @@ fn main() {
         trim_range_overlaps(path_key, ranges, &mut combined_graph, args.verbose > 1);
         create_paths_from_ranges(path_key, ranges, &mut combined_graph, args.verbose > 1);
     }
-    info!("Created {} nodes, {} edges, and {} paths in the combined graph",
+    info!("Created {} nodes, {} edges, and {} paths",
         combined_graph.node_count(), combined_graph.edge_count(), combined_graph.path_count());
 
-    info!("Marking unused nodes for removal");
-    let nodes_to_skip = mark_nodes_for_removal(&combined_graph);    
-    info!("Total unused nodes to remove: {}", nodes_to_skip.count_ones());
-    
-    info!("Writing the combined graph by removing unused nodes and compacting node IDs");
-    match write_graph_to_gfa(&combined_graph, &args.output, &nodes_to_skip) {
+    info!("Writing the result by removing unused nodes/edges and compacting node IDs");
+    match write_graph_to_gfa(&combined_graph, &args.output) {
         Ok(_) => info!("Successfully wrote the combined graph to {}", args.output),
         Err(e) => error!("Error writing the GFA file: {}", e),
     }
@@ -96,14 +92,14 @@ impl RangeInfo {
     }
 }
 
-fn process_gfa_files(
+fn read_gfa_files(
     gfa_list: &[String],
 ) -> (HashGraph, FxHashMap<String, Vec<RangeInfo>>) {
     let mut combined_graph = HashGraph::new();
     let mut path_key_ranges: FxHashMap<String, Vec<RangeInfo>> = FxHashMap::default();
     let mut id_translations = Vec::new();
 
-    info!("Processing {} GFA files", gfa_list.len());
+    info!("Reading {} GFA files", gfa_list.len());
 
     // Process each GFA file
     let parser = GFAParser::new();
@@ -668,7 +664,11 @@ fn mark_nodes_for_removal(graph: &HashGraph) -> BitVec {
     nodes_to_remove
 }
 
-fn write_graph_to_gfa(graph: &HashGraph, output_path: &str, nodes_to_remove: &BitVec) -> std::io::Result<()> {
+fn write_graph_to_gfa(graph: &HashGraph, output_path: &str) -> std::io::Result<()> {
+    debug!("Marking unused nodes for removal");
+    let nodes_to_remove : BitVec = mark_nodes_for_removal(&graph);    
+    debug!("Marked {} nodes", nodes_to_remove.count_ones());
+    
     let mut file = File::create(output_path)?;
     
     // Write GFA version

From 62904bdceff999f4d9dc009d568013ce8e7c618b Mon Sep 17 00:00:00 2001
From: AndreaGuarracino <andreaguarracino@outlook.com>
Date: Sun, 15 Dec 2024 13:37:54 -0600
Subject: [PATCH 41/45] reduce memory usage

---
 src/main.rs | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/src/main.rs b/src/main.rs
index e58c0e0..3db24c6 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -53,12 +53,12 @@ fn main() {
 
     // Sort, deduplicate, and trim path ranges and create merged paths in the combined graph
     info!("Sorting, deduplicating, and trimming {} path ranges", path_key_ranges.values().map(|ranges| ranges.len()).sum::<usize>());
-    for (path_key, ranges) in path_key_ranges.iter_mut() {
+    for (path_key, mut ranges) in path_key_ranges.drain() {
         debug!("Processing path key '{}' with {} ranges", path_key, ranges.len());
-        
-        sort_and_filter_ranges(path_key, ranges, args.verbose > 1);
-        trim_range_overlaps(path_key, ranges, &mut combined_graph, args.verbose > 1);
-        create_paths_from_ranges(path_key, ranges, &mut combined_graph, args.verbose > 1);
+
+        sort_and_filter_ranges(&path_key, &mut ranges, args.verbose > 1);
+        trim_range_overlaps(&path_key, &mut ranges, &mut combined_graph, args.verbose > 1);
+        create_paths_from_ranges(&path_key, &ranges, &mut combined_graph, args.verbose > 1);
     }
     info!("Created {} nodes, {} edges, and {} paths",
         combined_graph.node_count(), combined_graph.edge_count(), combined_graph.path_count());
@@ -68,6 +68,7 @@ fn main() {
         Ok(_) => info!("Successfully wrote the combined graph to {}", args.output),
         Err(e) => error!("Error writing the GFA file: {}", e),
     }
+
 }
 
 #[derive(Debug, Clone)]
@@ -446,7 +447,7 @@ fn trim_range_overlaps(
                     if debug {
                         debug!("    Splitting step {} [start={}, end={}, len={}] to remove overlap at [start={}, end={}]", 
                             idx, step_start, step_end, step_end - step_start, overlap_within_step_start, overlap_within_step_end);
-                            debug!("    Overlap offsets: start={}, end={}", overlap_start_offset, overlap_end_offset);
+                        debug!("    Overlap offsets: start={}, end={}", overlap_start_offset, overlap_end_offset);
                     }
 
                     if step_start < overlap_start {

From 2be3e3e2ba91993ef3fde6724d1dbcb1d709566a Mon Sep 17 00:00:00 2001
From: AndreaGuarracino <andreaguarracino@outlook.com>
Date: Sun, 15 Dec 2024 14:06:31 -0600
Subject: [PATCH 42/45] tiny improvements; profile memory

---
 src/main.rs | 91 ++++++++++++++++++++++++++++++++++-------------------
 1 file changed, 59 insertions(+), 32 deletions(-)

diff --git a/src/main.rs b/src/main.rs
index 3db24c6..3c11f3a 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -20,6 +20,24 @@ use bitvec::{bitvec, prelude::BitVec};
 use tempfile::NamedTempFile;
 use log::{debug, info, warn, error};
 
+use std::process::Command;
+
+#[cfg(not(debug_assertions))]
+fn log_memory_usage(stage: &str) {
+    let output = Command::new("ps")
+        .args(&["-o", "rss=", "-p", &std::process::id().to_string()])
+        .output()
+        .expect("Failed to execute ps command");
+    
+    let memory_kb = String::from_utf8_lossy(&output.stdout)
+        .trim()
+        .parse::<u64>()
+        .unwrap_or(0);
+    
+    let memory_mb = memory_kb as f64 / 1024.0;
+    info!("Memory usage at {}: {:.2} MB", stage, memory_mb);
+}
+
 #[derive(Parser, Debug)]
 #[clap(author, version, about)]
 struct Args {
@@ -48,9 +66,16 @@ fn main() {
     })
     .init();
 
+    log_memory_usage("start");
+
     // Create a single combined graph without paths and a map of path key to ranges
     let (mut combined_graph, mut path_key_ranges) = read_gfa_files(&args.gfa_list);
 
+    log_memory_usage("after_reading_files");
+
+    let total_path_keys = path_key_ranges.len();
+    let mut processed_path_keys = 0;
+
     // Sort, deduplicate, and trim path ranges and create merged paths in the combined graph
     info!("Sorting, deduplicating, and trimming {} path ranges", path_key_ranges.values().map(|ranges| ranges.len()).sum::<usize>());
     for (path_key, mut ranges) in path_key_ranges.drain() {
@@ -59,16 +84,25 @@ fn main() {
         sort_and_filter_ranges(&path_key, &mut ranges, args.verbose > 1);
         trim_range_overlaps(&path_key, &mut ranges, &mut combined_graph, args.verbose > 1);
         create_paths_from_ranges(&path_key, &ranges, &mut combined_graph, args.verbose > 1);
+
+        processed_path_keys += 1;
+        if processed_path_keys % 10 == 0 {
+            info!("Processed {}/{} path keys", processed_path_keys, total_path_keys);
+            log_memory_usage(&format!("Processed {} path keys", processed_path_keys));
+        }
     }
     info!("Created {} nodes, {} edges, and {} paths",
         combined_graph.node_count(), combined_graph.edge_count(), combined_graph.path_count());
 
+    log_memory_usage("before_writing");
+
     info!("Writing the result by removing unused nodes/edges and compacting node IDs");
     match write_graph_to_gfa(&combined_graph, &args.output) {
         Ok(_) => info!("Successfully wrote the combined graph to {}", args.output),
         Err(e) => error!("Error writing the GFA file: {}", e),
     }
 
+    log_memory_usage("end");
 }
 
 #[derive(Debug, Clone)]
@@ -675,42 +709,35 @@ fn write_graph_to_gfa(graph: &HashGraph, output_path: &str) -> std::io::Result<(
     // Write GFA version
     writeln!(file, "H\tVN:Z:1.0")?;
     
-    // Collect nodes, excluding marked nodes
-    let nodes: Vec<Handle> = graph.handles()
-        .filter(|handle| !nodes_to_remove[u64::from(handle.id()) as usize])
-        .collect();
-    
-    // Create mapping from old IDs to new sequential IDs
-    // We allocate a vector with capacity for the max node ID + 1
+
+    // Write nodes by exluding marked ones and create the id_mapping
     let max_id = graph.node_count();
     let mut id_mapping = vec![0; max_id + 1];
-    for (new_id, handle) in nodes.iter().enumerate() {
-        id_mapping[u64::from(handle.id()) as usize] = new_id + 1; // +1 to start from 1
-    }
-    
-    // Write nodes with new IDs
-    for handle in &nodes {
-        let sequence = graph.sequence(*handle).collect::<Vec<_>>();
-        let sequence_str = String::from_utf8(sequence).unwrap_or_else(|_| String::from("N"));
-        let node_id = id_mapping[u64::from(handle.id()) as usize];
-        writeln!(file, "S\t{}\t{}", node_id, sequence_str)?;
+    let mut new_id = 1; // Start from 1
+    for handle in graph.handles() {
+        let node_id = usize::from(handle.id());
+        if !nodes_to_remove[node_id] {
+            id_mapping[node_id] = new_id;
+            
+            let sequence = graph.sequence(handle).collect::<Vec<_>>();
+            let sequence_str = String::from_utf8(sequence).unwrap_or_else(|_| String::from("N"));
+            writeln!(file, "S\t{}\t{}", new_id, sequence_str)?;
+            
+            new_id += 1;
+        }
     }
     
-    // Collect and sort edges, excluding those connected to marked nodes
-    let edges: Vec<Edge> = graph.edges()
-        .filter(|edge| {
-            !nodes_to_remove[u64::from(edge.0.id()) as usize] && 
-            !nodes_to_remove[u64::from(edge.1.id()) as usize]
-        })
-        .collect();
-    
-    // Write edges with new IDs
-    for edge in edges {
-        let from_id = id_mapping[u64::from(edge.0.id()) as usize];
-        let to_id = id_mapping[u64::from(edge.1.id()) as usize];
-        let from_orient = if edge.0.is_reverse() { "-" } else { "+" };
-        let to_orient = if edge.1.is_reverse() { "-" } else { "+" };
-        writeln!(file, "L\t{}\t{}\t{}\t{}\t0M", from_id, from_orient, to_id, to_orient)?;
+    // Write edges by excluding those connected to marked nodes
+    for edge in graph.edges() {
+        if !nodes_to_remove[u64::from(edge.0.id()) as usize] && 
+           !nodes_to_remove[u64::from(edge.1.id()) as usize] 
+        {
+            let from_id = id_mapping[u64::from(edge.0.id()) as usize];
+            let to_id = id_mapping[u64::from(edge.1.id()) as usize];
+            let from_orient = if edge.0.is_reverse() { "-" } else { "+" };
+            let to_orient = if edge.1.is_reverse() { "-" } else { "+" };
+            writeln!(file, "L\t{}\t{}\t{}\t{}\t0M", from_id, from_orient, to_id, to_orient)?;
+        }
     }
 
     // Collect and sort paths directly with references to avoid multiple lookups

From c9e482d28b00c8f98a0cc0509998f07f3e8b915d Mon Sep 17 00:00:00 2001
From: AndreaGuarracino <andreaguarracino@outlook.com>
Date: Sun, 15 Dec 2024 15:06:01 -0600
Subject: [PATCH 43/45] faster and 2X less memory

---
 src/main.rs | 458 +++++++++++++++++++++++++++++++---------------------
 1 file changed, 274 insertions(+), 184 deletions(-)

diff --git a/src/main.rs b/src/main.rs
index 3c11f3a..c4f719d 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -9,10 +9,7 @@ use handlegraph::{
     handle::{Handle, NodeId, Edge},
     handlegraph::*,
     mutablehandlegraph::*,
-    pathhandlegraph::{
-        MutableGraphPaths,
-        GraphPaths
-    },
+    pathhandlegraph::GraphPaths,
     hashgraph::HashGraph,
 };
 use gfa::{gfa::GFA, parser::GFAParser};
@@ -20,23 +17,23 @@ use bitvec::{bitvec, prelude::BitVec};
 use tempfile::NamedTempFile;
 use log::{debug, info, warn, error};
 
-use std::process::Command;
+// use std::process::Command;
 
-#[cfg(not(debug_assertions))]
-fn log_memory_usage(stage: &str) {
-    let output = Command::new("ps")
-        .args(&["-o", "rss=", "-p", &std::process::id().to_string()])
-        .output()
-        .expect("Failed to execute ps command");
+// #[cfg(not(debug_assertions))]
+// fn log_memory_usage(stage: &str) {
+//     let output = Command::new("ps")
+//         .args(&["-o", "rss=", "-p", &std::process::id().to_string()])
+//         .output()
+//         .expect("Failed to execute ps command");
     
-    let memory_kb = String::from_utf8_lossy(&output.stdout)
-        .trim()
-        .parse::<u64>()
-        .unwrap_or(0);
+//     let memory_kb = String::from_utf8_lossy(&output.stdout)
+//         .trim()
+//         .parse::<u64>()
+//         .unwrap_or(0);
     
-    let memory_mb = memory_kb as f64 / 1024.0;
-    info!("Memory usage at {}: {:.2} MB", stage, memory_mb);
-}
+//     let memory_mb = memory_kb as f64 / 1024.0;
+//     info!("Memory usage at {}: {:.2} MB", stage, memory_mb);
+// }
 
 #[derive(Parser, Debug)]
 #[clap(author, version, about)]
@@ -66,43 +63,34 @@ fn main() {
     })
     .init();
 
-    log_memory_usage("start");
+    // log_memory_usage("start");
 
     // Create a single combined graph without paths and a map of path key to ranges
     let (mut combined_graph, mut path_key_ranges) = read_gfa_files(&args.gfa_list);
 
-    log_memory_usage("after_reading_files");
-
-    let total_path_keys = path_key_ranges.len();
-    let mut processed_path_keys = 0;
+    // log_memory_usage("after_reading_files");
 
-    // Sort, deduplicate, and trim path ranges and create merged paths in the combined graph
+    // Sort, deduplicate, trim, and link path ranges
     info!("Sorting, deduplicating, and trimming {} path ranges", path_key_ranges.values().map(|ranges| ranges.len()).sum::<usize>());
-    for (path_key, mut ranges) in path_key_ranges.drain() {
+    for (path_key, ranges) in path_key_ranges.iter_mut() {
         debug!("Processing path key '{}' with {} ranges", path_key, ranges.len());
 
-        sort_and_filter_ranges(&path_key, &mut ranges, args.verbose > 1);
-        trim_range_overlaps(&path_key, &mut ranges, &mut combined_graph, args.verbose > 1);
-        create_paths_from_ranges(&path_key, &ranges, &mut combined_graph, args.verbose > 1);
-
-        processed_path_keys += 1;
-        if processed_path_keys % 10 == 0 {
-            info!("Processed {}/{} path keys", processed_path_keys, total_path_keys);
-            log_memory_usage(&format!("Processed {} path keys", processed_path_keys));
-        }
+        sort_and_filter_ranges(path_key, ranges, args.verbose > 1);
+        trim_range_overlaps(path_key, ranges, &mut combined_graph, args.verbose > 1);
+        link_contiguous_ranges(path_key, ranges, &mut combined_graph, args.verbose > 1);
     }
     info!("Created {} nodes, {} edges, and {} paths",
         combined_graph.node_count(), combined_graph.edge_count(), combined_graph.path_count());
 
-    log_memory_usage("before_writing");
+    // log_memory_usage("before_writing");
 
-    info!("Writing the result by removing unused nodes/edges and compacting node IDs");
-    match write_graph_to_gfa(&combined_graph, &args.output) {
+    info!("Writing the result by adding paths, removing unused nodes/edges, and compacting node IDs");
+    match write_graph_to_gfa(&combined_graph, &path_key_ranges, &args.output, args.verbose > 1) {
         Ok(_) => info!("Successfully wrote the combined graph to {}", args.output),
         Err(e) => error!("Error writing the GFA file: {}", e),
     }
 
-    log_memory_usage("end");
+    // log_memory_usage("end");
 }
 
 #[derive(Debug, Clone)]
@@ -393,9 +381,7 @@ fn trim_range_overlaps(
     debug: bool
 ) {
     // Trim overlaps
-    if debug {
-        debug!("Trimming overlapping ranges");
-    }
+    debug!("Trimming overlapping ranges");
 
     let mut next_node_id_value = u64::from(combined_graph.max_node_id()) + 1;
 
@@ -409,12 +395,10 @@ fn trim_range_overlaps(
             let overlap_start = std::cmp::max(r1.start, r2.start);
             let overlap_end = std::cmp::min(r1.end, r2.end);
 
-            if debug {
-                debug!(
-                    "  Overlap detected: Range1 [start={}, end={}], Range2 [start={}, end={}], Overlap [start={}, end={}], Overlap size={}",
-                    r1.start, r1.end, r2.start, r2.end, overlap_start, overlap_end, overlap_end - overlap_start
-                );
-            }
+            debug!(
+                "  Overlap detected: Range1 [start={}, end={}], Range2 [start={}, end={}], Overlap [start={}, end={}], Overlap size={}",
+                r1.start, r1.end, r2.start, r2.end, overlap_start, overlap_end, overlap_end - overlap_start
+            );
 
             // Adjust r2 to remove the overlap
             let mut steps_to_remove = Vec::new();
@@ -478,16 +462,11 @@ fn trim_range_overlaps(
                     let overlap_start_offset = (overlap_within_step_start - step_start).min(node_len);
                     let overlap_end_offset = (overlap_within_step_end - step_start).min(node_len);
 
-                    if debug {
-                        debug!("    Splitting step {} [start={}, end={}, len={}] to remove overlap at [start={}, end={}]", 
-                            idx, step_start, step_end, step_end - step_start, overlap_within_step_start, overlap_within_step_end);
-                        debug!("    Overlap offsets: start={}, end={}", overlap_start_offset, overlap_end_offset);
-                    }
+                    debug!("    Splitting step {} [start={}, end={}, len={}] to remove overlap at [start={}, end={}]\n    Overlap offsets: start={}, end={}",
+                        idx, step_start, step_end, step_end - step_start, overlap_within_step_start, overlap_within_step_end, overlap_start_offset, overlap_end_offset);
 
                     if step_start < overlap_start {
-                        if debug {
-                            debug!("    Adding left part of step [start={}, end={}]", step_start, overlap_within_step_start);
-                        }
+                        debug!("    Adding left part of step [start={}, end={}]", step_start, overlap_within_step_start);
                         assert!(overlap_start_offset > 0);
 
                         // Keep left part
@@ -503,9 +482,7 @@ fn trim_range_overlaps(
                             range_new_start = Some(step_start);
                         }
                     } else if step_end > overlap_end {
-                        if debug {
-                            debug!("    Adding right part of step [start={}, end={}]", overlap_within_step_end, step_end);
-                        }
+                        debug!("    Adding right part of step [start={}, end={}]", overlap_within_step_end, step_end);
                         assert!(overlap_end_offset < node_len);
 
                         // Keep right part
@@ -555,9 +532,7 @@ fn trim_range_overlaps(
                 r2.end = overlap_end;
             }
 
-            if debug {
-                debug!("    Updated overlaps: Range2 [start={}, end={}]", r2.start, r2.end);
-            }
+            debug!("    Updated overlaps: Range2 [start={}, end={}]", r2.start, r2.end);
         }
     }
 
@@ -569,139 +544,184 @@ fn trim_range_overlaps(
     }
 }
 
-fn create_paths_from_ranges(
+fn link_contiguous_ranges(
     path_key: &str,
-    ranges: &[RangeInfo],
+    ranges: &mut [RangeInfo],
     combined_graph: &mut HashGraph,
     debug: bool
 ) {
-    // Check for overlaps and contiguity
-    let mut all_contiguous = true;
-    
-    for window in ranges.windows(2) {
-        let r1 = &window[0];
-        let r2 = &window[1];
+    // Trim overlaps
+    debug!("Linking overlapping ranges");
 
-        if r1.overlaps_with(r2) {
-            if debug {
-                debug!("Unresolved overlaps detected between ranges: [start={}, end={}] and [start={}, end={}]", 
-                r1.start, r1.end, r2.start, r2.end);
+    for i in 1..ranges.len() {
+        let (left, right) = ranges.split_at_mut(i);
+        let r1 = &mut left[left.len()-1];
+        let r2 = &mut right[0];
+
+        // Check if ranges are contiguous
+        if r1.is_contiguous_with(r2) {
+            // Get last handle from previous range and first handle from current range
+            if let (Some(&last_handle), Some(&first_handle)) = (r1.steps.last(), r2.steps.first()) {
+                // Create edge if it doesn't exist
+                if !combined_graph.has_edge(last_handle, first_handle) {
+                    combined_graph.create_edge(Edge(last_handle, first_handle));
+                    debug!("  Created edge between contiguous ranges at position {}", r1.end);
+                }
             }
-            panic!("Unresolved overlaps detected in path key '{}'", path_key);
         }
-        if !r1.is_contiguous_with(r2) {
-            all_contiguous = false;
+    }
+
+    if debug {
+        debug!("Path key '{}' without overlaps", path_key);
+        for range in ranges.iter() {
+            debug!("  Range: start={}, end={}, num.steps={}, gfa_id={}", range.start, range.end, range.steps.len(), range.gfa_id);
         }
     }
+}
+
+// fn create_paths_from_ranges(
+//     path_key: &str,
+//     ranges: &[RangeInfo],
+//     combined_graph: &mut HashGraph,
+//     debug: bool
+// ) {
+//     // Check for overlaps and contiguity
+//     let mut all_contiguous = true;
+    
+//     for window in ranges.windows(2) {
+//         let r1 = &window[0];
+//         let r2 = &window[1];
+
+//         if r1.overlaps_with(r2) {
+//             if debug {
+//                 debug!("Unresolved overlaps detected between ranges: [start={}, end={}] and [start={}, end={}]", 
+//                 r1.start, r1.end, r2.start, r2.end);
+//             }
+//             panic!("Unresolved overlaps detected in path key '{}'", path_key);
+//         }
+//         if !r1.is_contiguous_with(r2) {
+//             all_contiguous = false;
+//         }
+//     }
             
-    if !all_contiguous && debug {
-        let mut current_start = ranges[0].start;
-        let mut current_end = ranges[0].end;
+//     if !all_contiguous && debug {
+//         let mut current_start = ranges[0].start;
+//         let mut current_end = ranges[0].end;
         
-        for i in 1..ranges.len() {
-            if ranges[i-1].is_contiguous_with(&ranges[i]) {
-                // Extend current merged range
-                current_end = ranges[i].end;
-            } else {
-                // Print current merged range
-                debug!("    Merged range: start={}, end={}", 
-                    current_start, current_end);
+//         for i in 1..ranges.len() {
+//             if ranges[i-1].is_contiguous_with(&ranges[i]) {
+//                 // Extend current merged range
+//                 current_end = ranges[i].end;
+//             } else {
+//                 // Print current merged range
+//                 debug!("    Merged range: start={}, end={}", 
+//                     current_start, current_end);
                 
-                if !ranges[i-1].overlaps_with(&ranges[i]) {
-                    // Calculate and print gap
-                    let gap = ranges[i].start - current_end;
-                    debug!("      Gap to next range: {} positions", gap);
-                } else {
-                    // Calculate and print overlap
-                    let overlap = current_end - ranges[i].start;
-                    debug!("      Overlap with next range: {} positions", overlap);
-                }
-
-                // Start new merged range
-                current_start = ranges[i].start;
-                current_end = ranges[i].end;
-            }
-        }
+//                 if !ranges[i-1].overlaps_with(&ranges[i]) {
+//                     // Calculate and print gap
+//                     let gap = ranges[i].start - current_end;
+//                     debug!("      Gap to next range: {} positions", gap);
+//                 } else {
+//                     // Calculate and print overlap
+//                     let overlap = current_end - ranges[i].start;
+//                     debug!("      Overlap with next range: {} positions", overlap);
+//                 }
+
+//                 // Start new merged range
+//                 current_start = ranges[i].start;
+//                 current_end = ranges[i].end;
+//             }
+//         }
         
-        // Print final merged range
-        debug!("    Merged range: start={}, end={}", 
-            current_start, current_end);
-    }
-
-    if all_contiguous {
-        // Create a single path with the original key
-        let path_id = combined_graph.create_path(path_key.as_bytes(), false).unwrap();
-        let mut prev_step = None;
+//         // Print final merged range
+//         debug!("    Merged range: start={}, end={}", 
+//             current_start, current_end);
+//     }
+
+//     if all_contiguous {
+//         // Create a single path with the original key
+//         let path_id = combined_graph.create_path(path_key.as_bytes(), false).unwrap();
+//         let mut prev_step = None;
         
-        // Add all steps from all ranges
-        for range in ranges.iter() {
-            for step in &range.steps {
-                combined_graph.path_append_step(path_id, *step);
+//         // Add all steps from all ranges
+//         for range in ranges.iter() {
+//             for step in &range.steps {
+//                 combined_graph.path_append_step(path_id, *step);
                 
-                if let Some(prev) = prev_step {
-                    if !combined_graph.has_edge(prev, *step) {
-                        combined_graph.create_edge(Edge(prev, *step));
-                    }
-                }
-                prev_step = Some(*step);
-            }
-        }
-    } else {
-        // Handle non-contiguous ranges by creating separate paths for each contiguous group
-        let mut current_range_idx = 0;
-        while current_range_idx < ranges.len() {
-            let start_range = &ranges[current_range_idx];
-            let mut steps = start_range.steps.clone();
-            let mut next_idx = current_range_idx + 1;
-            let mut end_range = start_range;
+//                 if let Some(prev) = prev_step {
+//                     if !combined_graph.has_edge(prev, *step) {
+//                         combined_graph.create_edge(Edge(prev, *step));
+//                     }
+//                 }
+//                 prev_step = Some(*step);
+//             }
+//         }
+//     } else {
+//         // Handle non-contiguous ranges by creating separate paths for each contiguous group
+//         let mut current_range_idx = 0;
+//         while current_range_idx < ranges.len() {
+//             let start_range = &ranges[current_range_idx];
+//             let mut steps = start_range.steps.clone();
+//             let mut next_idx = current_range_idx + 1;
+//             let mut end_range = start_range;
             
-            // Merge contiguous ranges
-            while next_idx < ranges.len() && ranges[next_idx - 1].is_contiguous_with(&ranges[next_idx]) {
-                steps.extend(ranges[next_idx].steps.clone());
-                end_range = &ranges[next_idx];
-                next_idx += 1;
-            }
+//             // Merge contiguous ranges
+//             while next_idx < ranges.len() && ranges[next_idx - 1].is_contiguous_with(&ranges[next_idx]) {
+//                 steps.extend(ranges[next_idx].steps.clone());
+//                 end_range = &ranges[next_idx];
+//                 next_idx += 1;
+//             }
             
-            // Create path name with range information
-            let path_name = format!("{}:{}-{}", path_key, start_range.start, end_range.end);
-            let path_id = combined_graph.create_path(path_name.as_bytes(), false).unwrap();
+//             // Create path name with range information
+//             let path_name = format!("{}:{}-{}", path_key, start_range.start, end_range.end);
+//             let path_id = combined_graph.create_path(path_name.as_bytes(), false).unwrap();
             
-            // Add steps to the path
-            let mut prev_step = None;
-            for step in steps {
-                combined_graph.path_append_step(path_id, step);
+//             // Add steps to the path
+//             let mut prev_step = None;
+//             for step in steps {
+//                 combined_graph.path_append_step(path_id, step);
                 
-                if let Some(prev) = prev_step {
-                    if !combined_graph.has_edge(prev, step) {
-                        combined_graph.create_edge(Edge(prev, step));
-                    }
-                }
-                prev_step = Some(step);
-            }
+//                 if let Some(prev) = prev_step {
+//                     if !combined_graph.has_edge(prev, step) {
+//                         combined_graph.create_edge(Edge(prev, step));
+//                     }
+//                 }
+//                 prev_step = Some(step);
+//             }
             
-            current_range_idx = next_idx;
-        }
-    }
-}
-
-fn mark_nodes_for_removal(graph: &HashGraph) -> BitVec {
+//             current_range_idx = next_idx;
+//         }
+//     }
+// }
+
+fn mark_nodes_for_removal(
+    graph: &HashGraph,
+    path_key_ranges: &FxHashMap<String, Vec<RangeInfo>>
+) -> BitVec {
     // Create a bitvector with all nodes initially marked for removal
     let max_node_id = u64::from(graph.max_node_id());
     let mut nodes_to_remove = bitvec![1; max_node_id as usize + 1];
     
-    // Mark nodes used in paths as not to be removed (set bit to 0)
-    for (_path_id, path_ref) in graph.paths.iter() {
-        for handle in &path_ref.nodes {
-            nodes_to_remove.set(u64::from(handle.id()) as usize, false);
+    // Mark nodes used in path ranges as not to be removed (set bit to 0)
+    for ranges in path_key_ranges.values() {
+        for range in ranges {
+            for handle in &range.steps {
+                nodes_to_remove.set(u64::from(handle.id()) as usize, false);
+            }
         }
     }
     
     nodes_to_remove
 }
 
-fn write_graph_to_gfa(graph: &HashGraph, output_path: &str) -> std::io::Result<()> {
+fn write_graph_to_gfa(
+    graph: &HashGraph, 
+    path_key_ranges: &FxHashMap<String, Vec<RangeInfo>>,
+    output_path: &str,
+    debug: bool
+) -> std::io::Result<()> {
     debug!("Marking unused nodes for removal");
-    let nodes_to_remove : BitVec = mark_nodes_for_removal(&graph);    
+    let nodes_to_remove : BitVec = mark_nodes_for_removal(&graph, path_key_ranges);    
     debug!("Marked {} nodes", nodes_to_remove.count_ones());
     
     let mut file = File::create(output_path)?;
@@ -709,7 +729,6 @@ fn write_graph_to_gfa(graph: &HashGraph, output_path: &str) -> std::io::Result<(
     // Write GFA version
     writeln!(file, "H\tVN:Z:1.0")?;
     
-
     // Write nodes by exluding marked ones and create the id_mapping
     let max_id = graph.node_count();
     let mut id_mapping = vec![0; max_id + 1];
@@ -740,28 +759,99 @@ fn write_graph_to_gfa(graph: &HashGraph, output_path: &str) -> std::io::Result<(
         }
     }
 
-    // Collect and sort paths directly with references to avoid multiple lookups
-    let mut path_entries: Vec<_> = graph.paths.iter().collect();
-    path_entries.sort_by_key(|(_, path_ref)| {
-        // Get name directly from path struct instead of doing lookups
-        path_ref.name.as_slice()
-    });
-
-    // Write paths with new IDs
-    for (_path_id, path_ref) in path_entries {
-        // Pre-allocate vector with capacity
-        let mut path_elements = Vec::with_capacity(path_ref.nodes.len());
+    // Write paths by processing ranges directly
+    let mut path_key_vec: Vec<_> = path_key_ranges.keys().collect();
+    path_key_vec.sort(); // Sort path keys for consistent output
+
+    for path_key in path_key_vec {
+        let ranges = &path_key_ranges[path_key];
+
+        // Check for overlaps and contiguity
+        let mut all_contiguous = true;
         
-        // Process nodes directly from path reference
-        for handle in &path_ref.nodes {
-            let node_id = id_mapping[u64::from(handle.id()) as usize];
-            let orient = if handle.is_reverse() { "-" } else { "+" };
-            path_elements.push(format!("{}{}", node_id, orient));
+        for window in ranges.windows(2) {
+            let r1 = &window[0];
+            let r2 = &window[1];
+
+            if r1.overlaps_with(r2) {
+                panic!("Unresolved overlaps detected in path key '{}': [start={}, end={}] and [start={}, end={}]", path_key, 
+                r1.start, r1.end, r2.start, r2.end);
+            }
+            if !r1.is_contiguous_with(r2) {
+                all_contiguous = false;
+            }
         }
-        
-        if !path_elements.is_empty() {
-            let path_name = String::from_utf8_lossy(&path_ref.name);
-            writeln!(file, "P\t{}\t{}\t*", path_name, path_elements.join(","))?;
+
+        if !all_contiguous && debug {
+            let mut current_start = ranges[0].start;
+            let mut current_end = ranges[0].end;
+            
+            for i in 1..ranges.len() {
+                if ranges[i-1].is_contiguous_with(&ranges[i]) {
+                    // Extend current merged range
+                    current_end = ranges[i].end;
+                } else {
+                    // Print current merged range
+                    debug!("    Merged range: start={}, end={}", 
+                        current_start, current_end);
+                    
+                    if !ranges[i-1].overlaps_with(&ranges[i]) {
+                        // Calculate and print gap
+                        let gap = ranges[i].start - current_end;
+                        debug!("      Gap to next range: {} positions", gap);
+                    } else {
+                        // Calculate and print overlap
+                        let overlap = current_end - ranges[i].start;
+                        debug!("      Overlap with next range: {} positions", overlap);
+                    }
+    
+                    // Start new merged range
+                    current_start = ranges[i].start;
+                    current_end = ranges[i].end;
+                }
+            }
+            
+            // Print final merged range
+            debug!("    Merged range: start={}, end={}", 
+                current_start, current_end);
+        }
+
+        let mut current_range_idx = 0;
+
+        while current_range_idx < ranges.len() {
+            let start_range = &ranges[current_range_idx];
+            let mut next_idx = current_range_idx + 1;
+            let mut end_range = start_range;
+            
+            // Find contiguous ranges
+            while next_idx < ranges.len() && ranges[next_idx - 1].is_contiguous_with(&ranges[next_idx]) {
+                end_range = &ranges[next_idx];
+                next_idx += 1;
+            }
+
+            // Create path elements for contiguous range group
+            let mut path_elements = Vec::new();
+            for idx in current_range_idx..next_idx {
+                for handle in &ranges[idx].steps {
+                    let node_id = id_mapping[u64::from(handle.id()) as usize];
+                    let orient = if handle.is_reverse() { "-" } else { "+" };
+                    path_elements.push(format!("{}{}", node_id, orient));
+                }
+            }
+
+            if !path_elements.is_empty() {
+                let path_name = if next_idx - current_range_idx == ranges.len() {
+                    // All ranges are contiguous - use original path key
+                    path_key.to_string()
+                } else {
+                    // Create path name with range information
+                    format!("{}:{}-{}", path_key, start_range.start, end_range.end)
+                };
+                
+                writeln!(file, "P\t{}\t{}\t*", path_name, path_elements.join(","))?;
+            }
+            
+            current_range_idx = next_idx;
         }
     }
         

From ed5306f986d4a700a95081c79ba08f2826976daf Mon Sep 17 00:00:00 2001
From: AndreaGuarracino <andreaguarracino@outlook.com>
Date: Sun, 15 Dec 2024 15:19:29 -0600
Subject: [PATCH 44/45] tweak log

---
 src/main.rs | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/main.rs b/src/main.rs
index c4f719d..bf9a40c 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -84,7 +84,6 @@ fn main() {
 
     // log_memory_usage("before_writing");
 
-    info!("Writing the result by adding paths, removing unused nodes/edges, and compacting node IDs");
     match write_graph_to_gfa(&combined_graph, &path_key_ranges, &args.output, args.verbose > 1) {
         Ok(_) => info!("Successfully wrote the combined graph to {}", args.output),
         Err(e) => error!("Error writing the GFA file: {}", e),
@@ -720,7 +719,7 @@ fn write_graph_to_gfa(
     output_path: &str,
     debug: bool
 ) -> std::io::Result<()> {
-    debug!("Marking unused nodes for removal");
+    info!("Marking unused nodes");
     let nodes_to_remove : BitVec = mark_nodes_for_removal(&graph, path_key_ranges);    
     debug!("Marked {} nodes", nodes_to_remove.count_ones());
     
@@ -730,6 +729,7 @@ fn write_graph_to_gfa(
     writeln!(file, "H\tVN:Z:1.0")?;
     
     // Write nodes by exluding marked ones and create the id_mapping
+    info!("Writing used nodes by compacting their IDs");
     let max_id = graph.node_count();
     let mut id_mapping = vec![0; max_id + 1];
     let mut new_id = 1; // Start from 1
@@ -747,6 +747,7 @@ fn write_graph_to_gfa(
     }
     
     // Write edges by excluding those connected to marked nodes
+    info!("Writing edges connecting used nodes");
     for edge in graph.edges() {
         if !nodes_to_remove[u64::from(edge.0.id()) as usize] && 
            !nodes_to_remove[u64::from(edge.1.id()) as usize] 
@@ -760,6 +761,7 @@ fn write_graph_to_gfa(
     }
 
     // Write paths by processing ranges directly
+    info!("Writing paths by merging contiguous path ranges");
     let mut path_key_vec: Vec<_> = path_key_ranges.keys().collect();
     path_key_vec.sort(); // Sort path keys for consistent output
 

From 2445a8ec84c41680d5fde022548bad2c732d5eae Mon Sep 17 00:00:00 2001
From: AndreaGuarracino <andreaguarracino@outlook.com>
Date: Sun, 15 Dec 2024 15:38:20 -0600
Subject: [PATCH 45/45] typo

---
 src/main.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/main.rs b/src/main.rs
index bf9a40c..6136085 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -71,7 +71,7 @@ fn main() {
     // log_memory_usage("after_reading_files");
 
     // Sort, deduplicate, trim, and link path ranges
-    info!("Sorting, deduplicating, and trimming {} path ranges", path_key_ranges.values().map(|ranges| ranges.len()).sum::<usize>());
+    info!("Sorting, deduplicating, trimming, and linking {} path ranges", path_key_ranges.values().map(|ranges| ranges.len()).sum::<usize>());
     for (path_key, ranges) in path_key_ranges.iter_mut() {
         debug!("Processing path key '{}' with {} ranges", path_key, ranges.len());