Skip to content

Commit

Permalink
FlatGFA: A prototype odgi extract reimplementation (#165)
Browse files Browse the repository at this point in the history
  • Loading branch information
sampsyo authored Apr 5, 2024
2 parents aa55cd9 + 6933ed7 commit b120a64
Show file tree
Hide file tree
Showing 6 changed files with 232 additions and 10 deletions.
4 changes: 2 additions & 2 deletions mygfa/docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,5 +13,5 @@

html_theme = "alabaster"

autodoc_member_order = 'bysource'
autodoc_typehints_format = 'short'
autodoc_member_order = "bysource"
autodoc_typehints_format = "short"
154 changes: 152 additions & 2 deletions polbin/src/cmds.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
use crate::flatgfa;
use crate::pool::Index;
use crate::flatgfa::{self, GFABuilder};
use crate::pool::{self, Index, Pool};
use argh::FromArgs;
use std::collections::HashMap;

Expand Down Expand Up @@ -72,3 +72,153 @@ pub fn stats(gfa: &flatgfa::FlatGFA, args: Stats) {
println!("unique\t{}", counts.len());
}
}

// Command-line arguments for the `extract` subcommand.
//
// NOTE(review): argh normally turns the `///` comments below into `--help` text, so treat them
// as user-facing strings and keep maintainer notes in plain `//` comments like this one.
/// create a subset graph
#[derive(FromArgs, PartialEq, Debug)]
#[argh(subcommand, name = "extract")]
pub struct Extract {
/// segment to extract around
// Passed to `FlatGFA::find_seg` by `extract` to locate the origin segment.
#[argh(option, short = 'n')]
seg_name: usize,

/// number of edges "away" from the node to include
// Only the value 1 works for now; `SubgraphBuilder::extract` asserts on anything else.
#[argh(option, short = 'c')]
link_distance: usize,
}

/// Extract the neighborhood of one segment as a new, standalone graph.
///
/// Looks up the segment named `args.seg_name` in `gfa` and builds a fresh store containing
/// that segment, the segments linked to it, the links among them, and the subpaths that
/// cross them.
///
/// # Errors
///
/// Returns an error if no segment has the requested name, or if `args.link_distance` is
/// anything other than 1 (the only distance implemented so far).
pub fn extract(gfa: &flatgfa::FlatGFA, args: Extract) -> Result<flatgfa::HeapStore, &'static str> {
    let origin_seg = gfa.find_seg(args.seg_name).ok_or("segment not found")?;

    // The distance comes straight from the command line, so report an unsupported value as
    // an ordinary error rather than tripping the assertion in `SubgraphBuilder::extract`.
    if args.link_distance != 1 {
        return Err("only `-c 1` is implemented so far");
    }

    let mut subgraph = SubgraphBuilder::new(gfa);
    subgraph.extract(origin_seg, args.link_distance);
    Ok(subgraph.store)
}

/// A helper to construct a new graph that includes part of an old graph.
struct SubgraphBuilder<'a> {
/// The source graph that segments, links, and paths are copied from.
old: &'a flatgfa::FlatGFA<'a>,
/// The new graph under construction; the `extract` command returns it to its caller.
store: flatgfa::HeapStore,
/// Maps a segment's id in `old` to its id in `store`, for every segment included so far.
seg_map: HashMap<Index, Index>,
}

/// Where a subpath that is still being collected began.
struct SubpathStart {
step: Index, // The id, in the new graph's step pool, of the first step in the subpath.
pos: usize, // The bp position within the original path at the start of the subpath.
}

impl<'a> SubgraphBuilder<'a> {
    /// Start an empty subgraph that will be populated from `old`.
    fn new(old: &'a flatgfa::FlatGFA) -> Self {
        let store = flatgfa::HeapStore::default();
        let seg_map = HashMap::new();
        Self {
            old,
            store,
            seg_map,
        }
    }

    /// Copy one segment of the source graph into the subgraph and remember its new id.
    fn include_seg(&mut self, seg_id: Index) {
        let old = self.old;
        let seg = &old.segs[seg_id as usize];
        let seq = old.get_seq(seg);
        let optional = old.get_optional_data(seg);
        let new_id = self.store.add_seg(seg.name, seq, optional);
        self.seg_map.insert(seg_id, new_id);
    }

    /// Copy one link of the source graph into the subgraph.
    ///
    /// Both endpoints must already have been included, because their handles are
    /// translated through the segment map.
    fn include_link(&mut self, link: &flatgfa::Link) {
        let new_from = self.tr_handle(link.from);
        let new_to = self.tr_handle(link.to);
        let alignment = self.old.get_alignment(&link.overlap);
        self.store.add_link(new_from, new_to, alignment.ops.into());
    }

    /// Record a subpath of `path` in the subgraph.
    ///
    /// The subpath covers every step added to the new graph since `start.step`, and it is
    /// named `<path name>:<start bp>-<end bp>`.
    fn include_subpath(&mut self, path: &flatgfa::Path, start: &SubpathStart, end_pos: usize) {
        let name = format!("{}:{}-{}", self.old.get_path_name(path), start.pos, end_pos);
        let steps = pool::Span {
            start: start.step,
            end: self.store.steps.next_id(),
        };
        self.store
            .add_path(name.as_bytes(), steps, std::iter::empty());
    }

    /// Walk a path of the source graph and add, as separate subpaths, every maximal run of
    /// steps whose segments belong to the subgraph.
    fn find_subpaths(&mut self, path: &flatgfa::Path) {
        let old = self.old;
        let mut open: Option<SubpathStart> = None;
        let mut bp = 0;

        for &handle in old.get_steps(path) {
            if self.contains(handle.segment()) {
                // Entering the neighborhood opens a new subpath at the next free step id.
                if open.is_none() {
                    open = Some(SubpathStart {
                        step: self.store.steps.next_id(),
                        pos: bp,
                    });
                }

                // Every in-neighborhood step is copied (translated) into the new graph.
                let new_handle = self.tr_handle(handle);
                self.store.add_step(new_handle);
            } else if let Some(start) = open.take() {
                // Leaving the neighborhood closes the subpath that was in progress.
                self.include_subpath(path, &start, bp);
            }

            // Advance the running bp offset past this step's segment.
            bp += old.get_handle_seg(handle).len();
        }

        // A subpath still open here runs through to the end of the path.
        if let Some(start) = open {
            self.include_subpath(path, &start, bp);
        }
    }

    /// Map a handle in the source graph to the matching handle in the subgraph, keeping
    /// its orientation. Panics if the handle's segment has not been included.
    fn tr_handle(&self, old_handle: flatgfa::Handle) -> flatgfa::Handle {
        let new_seg = self.seg_map[&old_handle.segment()];
        flatgfa::Handle::new(new_seg, old_handle.orient())
    }

    /// Has this segment of the source graph already been included in the subgraph?
    fn contains(&self, old_seg_id: Index) -> bool {
        self.seg_map.contains_key(&old_seg_id)
    }

    /// Build the subgraph around `origin`: the segments up to `dist` links away from it,
    /// the links between those segments, and the subpaths that pass through them.
    ///
    /// Only `dist == 1` is supported; any other value panics.
    fn extract(&mut self, origin: Index, dist: usize) {
        let old = self.old;
        self.include_seg(origin);

        // Gather every segment that shares a link with the origin.
        assert_eq!(dist, 1, "only `-c 1` is implemented so far");
        for link in old.links.iter() {
            match link.incident_seg(origin) {
                Some(neighbor) if !self.contains(neighbor) => self.include_seg(neighbor),
                _ => {}
            }
        }

        // Keep each link whose two endpoints both landed in the subgraph.
        for link in old.links.iter() {
            let both_inside =
                self.contains(link.from.segment()) && self.contains(link.to.segment());
            if both_inside {
                self.include_link(link);
            }
        }

        // Carve out the pieces of every path that cross the subgraph.
        for path in old.paths.iter() {
            self.find_subpaths(path);
        }
    }
}
36 changes: 36 additions & 0 deletions polbin/src/flatgfa.rs
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,12 @@ pub struct Segment {
pub optional: Span,
}

impl Segment {
    /// The length of the segment's sequence, as reported by its `seq` span.
    pub fn len(&self) -> usize {
        self.seq.len()
    }

    /// Whether the segment's sequence has zero length.
    ///
    /// Provided so that the public `len` has its conventional `is_empty` companion.
    pub fn is_empty(&self) -> bool {
        self.len() == 0
    }
}

/// A path is a sequence of oriented references to segments.
#[derive(Debug, FromZeroes, FromBytes, AsBytes, Clone, Copy)]
#[repr(packed)]
Expand Down Expand Up @@ -106,6 +112,19 @@ pub struct Link {
pub overlap: Span,
}

impl Link {
    /// If the link touches `seg_id` at either end, return the segment at the opposite end;
    /// otherwise return `None`.
    ///
    /// The `from` end is checked first, so a link from a segment to itself yields that
    /// same segment.
    pub fn incident_seg(&self, seg_id: Index) -> Option<Index> {
        let (from, to) = (self.from.segment(), self.to.segment());
        match (from == seg_id, to == seg_id) {
            (true, _) => Some(to),
            (false, true) => Some(from),
            (false, false) => None,
        }
    }
}

/// A forward or backward direction.
#[derive(Debug, PartialEq, IntoPrimitive, TryFromPrimitive)]
#[repr(u8)]
Expand Down Expand Up @@ -205,6 +224,16 @@ impl<'a> FlatGFA<'a> {
self.seq_data[seg.seq.range()].as_ref()
}

/// Find the id of the first segment whose name equals `name`, or `None` if there is none.
pub fn find_seg(&self, name: usize) -> Option<Index> {
    // TODO Make this more efficient by maintaining the name index? This would not be
    // too hard; we already have the machinery in `parse.rs`...
    let pos = self.segs.iter().position(|seg| seg.name == name)?;
    Some(pos as Index)
}

/// Get all the steps for a path.
pub fn get_steps(&self, path: &Path) -> &[Handle] {
&self.steps[path.steps.range()]
Expand Down Expand Up @@ -277,6 +306,9 @@ pub trait GFABuilder {
/// Add a sequence of steps.
fn add_steps(&mut self, steps: impl Iterator<Item = Handle>) -> Span;

/// Add a single step.
///
/// Returns an `Index`; presumably the id of the newly added step (confirm against the
/// implementation's step pool).
fn add_step(&mut self, step: Handle) -> Index;

/// Add a link between two (oriented) segments.
///
/// `from` and `to` are the two ends of the link and `overlap` is its list of alignment
/// operations. Returns an `Index`, presumably the id of the new link (TODO confirm).
fn add_link(&mut self, from: Handle, to: Handle, overlap: Vec<AlignOp>) -> Index;

Expand Down Expand Up @@ -324,6 +356,10 @@ impl<'a, P: PoolFamily<'a>> GFABuilder for Store<'a, P> {
self.steps.add_iter(steps)
}

// Appends the step to `self.steps` and passes that call's return value straight through.
fn add_step(&mut self, step: Handle) -> Index {
self.steps.add(step)
}

fn add_link(&mut self, from: Handle, to: Handle, overlap: Vec<AlignOp>) -> Index {
self.links.add(Link {
from,
Expand Down
15 changes: 11 additions & 4 deletions polbin/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -68,16 +68,17 @@ enum Command {
Toc(cmds::Toc),
Paths(cmds::Paths),
Stats(cmds::Stats),
Extract(cmds::Extract),
}

fn main() {
fn main() -> Result<(), &'static str> {
let args: PolBin = argh::from_env();

// A special case for converting from GFA text to an in-place FlatGFA binary.
if args.mutate {
if let (None, None, Some(out_name)) = (&args.command, &args.input, &args.output) {
prealloc_translate(args.input_gfa.as_deref(), out_name, args.prealloc_factor);
return;
return Ok(());
}
}

Expand Down Expand Up @@ -120,14 +121,20 @@ fn main() {
Some(Command::Paths(_)) => {
cmds::paths(&gfa);
}
Some(Command::Stats(args)) => {
cmds::stats(&gfa, args);
Some(Command::Stats(sub_args)) => {
cmds::stats(&gfa, sub_args);
}
Some(Command::Extract(sub_args)) => {
let store = cmds::extract(&gfa, sub_args)?;
dump(&store.view(), &args.output);
}
None => {
// Just emit the GFA or FlatGFA file.
dump(&gfa, &args.output);
}
}

Ok(())
}

/// Write a FlatGFA either to a GFA text file to stdout or a binary FlatGFA file given
Expand Down
4 changes: 4 additions & 0 deletions polbin/src/pool.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,10 @@ impl Span {
pub fn range(&self) -> std::ops::Range<usize> {
// Copy the span out of the reference and let its conversion into `Range<usize>` do the work.
(*self).into()
}

/// The number of items the span covers: the distance from `start` to `end`.
pub fn len(&self) -> usize {
    let count = self.end - self.start;
    count as usize
}
}

pub trait Pool<T: Clone> {
Expand Down
29 changes: 27 additions & 2 deletions polbin/src/print.rs
Original file line number Diff line number Diff line change
Expand Up @@ -81,8 +81,8 @@ fn print_seg(gfa: &flatgfa::FlatGFA, seg: &flatgfa::Segment) {
println!();
}

/// Print our flat representation as a GFA text file to stdout.
pub fn print(gfa: &flatgfa::FlatGFA) {
/// Print a graph in the order preserved from an original GFA file.
fn print_preserved(gfa: &flatgfa::FlatGFA) {
let mut seg_iter = gfa.segs.iter();
let mut path_iter = gfa.paths.iter();
let mut link_iter = gfa.links.iter();
Expand All @@ -105,3 +105,28 @@ pub fn print(gfa: &flatgfa::FlatGFA) {
}
}
}

/// Write the graph to stdout as GFA text in a fixed order, regardless of the line order of
/// any original file: the header (only when it is non-empty), then all segments, then all
/// paths, then all links.
pub fn print_normalized(gfa: &flatgfa::FlatGFA) {
    if !gfa.header.is_empty() {
        let header = bstr::BStr::new(gfa.header);
        println!("H\t{}", header);
    }
    gfa.segs.iter().for_each(|seg| print_seg(gfa, seg));
    gfa.paths.iter().for_each(|path| print_path(gfa, path));
    gfa.links.iter().for_each(|link| print_link(gfa, link));
}

/// Write the graph to stdout as a GFA text file.
///
/// When the graph carries a recorded line order, that order is reproduced; otherwise the
/// normalized order is used.
pub fn print(gfa: &flatgfa::FlatGFA) {
    if !gfa.line_order.is_empty() {
        print_preserved(gfa);
        return;
    }
    print_normalized(gfa);
}

0 comments on commit b120a64

Please sign in to comment.