From 1f7aecfcdabf93e0a5946f2fd941a2dbc5f8e3eb Mon Sep 17 00:00:00 2001 From: susan-garry Date: Wed, 25 Sep 2024 01:04:14 -0400 Subject: [PATCH 1/8] move flatgfa data structure definition and internal operations to its own module --- .gitignore | 3 + flatgfa-py/Cargo.lock | 11 +- flatgfa/Cargo.lock | 11 +- flatgfa/Cargo.toml | 11 +- flatgfa/src/cmds.rs | 6 +- flatgfa/src/file.rs | 336 -------------------------------- flatgfa/src/flatgfa.rs | 430 ----------------------------------------- flatgfa/src/gfaline.rs | 272 -------------------------- flatgfa/src/lib.rs | 9 - flatgfa/src/main.rs | 12 +- flatgfa/src/parse.rs | 283 --------------------------- flatgfa/src/pool.rs | 299 ---------------------------- flatgfa/src/print.rs | 153 --------------- 13 files changed, 35 insertions(+), 1801 deletions(-) delete mode 100644 flatgfa/src/file.rs delete mode 100644 flatgfa/src/flatgfa.rs delete mode 100644 flatgfa/src/gfaline.rs delete mode 100644 flatgfa/src/lib.rs delete mode 100644 flatgfa/src/parse.rs delete mode 100644 flatgfa/src/pool.rs delete mode 100644 flatgfa/src/print.rs diff --git a/.gitignore b/.gitignore index 2c542cdd..f4e54928 100644 --- a/.gitignore +++ b/.gitignore @@ -22,4 +22,7 @@ pollen/target polbin/target pollen/*.rlib +flatgfa/target +flatgfa/**/target + slow_odgi/dist/ diff --git a/flatgfa-py/Cargo.lock b/flatgfa-py/Cargo.lock index 3b85ab62..dcb76f0d 100644 --- a/flatgfa-py/Cargo.lock +++ b/flatgfa-py/Cargo.lock @@ -84,10 +84,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" [[package]] -name = "flatgfa" +name = "fgfa_ds" version = "0.1.0" dependencies = [ - "argh", "atoi", "bstr", "memchr", @@ -97,6 +96,14 @@ dependencies = [ "zerocopy", ] +[[package]] +name = "flatgfa" +version = "0.1.0" +dependencies = [ + "argh", + "fgfa_ds", +] + [[package]] name = "flatgfa-py" version = "0.1.0" diff --git a/flatgfa/Cargo.lock b/flatgfa/Cargo.lock index 
d09b04b8..bf18eb98 100644 --- a/flatgfa/Cargo.lock +++ b/flatgfa/Cargo.lock @@ -72,10 +72,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" [[package]] -name = "flatgfa" +name = "fgfa_ds" version = "0.1.0" dependencies = [ - "argh", "atoi", "bstr", "memchr", @@ -85,6 +84,14 @@ dependencies = [ "zerocopy", ] +[[package]] +name = "flatgfa" +version = "0.1.0" +dependencies = [ + "argh", + "fgfa_ds", +] + [[package]] name = "hashbrown" version = "0.14.3" diff --git a/flatgfa/Cargo.toml b/flatgfa/Cargo.toml index 2426a543..6178f607 100644 --- a/flatgfa/Cargo.toml +++ b/flatgfa/Cargo.toml @@ -1,3 +1,6 @@ +[workspace] +members = ["fgfa_ds"] + [package] name = "flatgfa" version = "0.1.0" @@ -9,13 +12,7 @@ path = "src/main.rs" [dependencies] argh = "0.1.12" -atoi = "2.0.0" -bstr = "1.9.1" -memchr = "2.7.1" -memmap = "0.7.0" -num_enum = "0.7.2" -tinyvec = "1.6.0" -zerocopy = { version = "0.7.32", features = ["derive"] } +fgfa_ds = { path = "fgfa_ds" } [profile.profiling] inherits = "release" diff --git a/flatgfa/src/cmds.rs b/flatgfa/src/cmds.rs index 48437e4c..3e56d721 100644 --- a/flatgfa/src/cmds.rs +++ b/flatgfa/src/cmds.rs @@ -1,6 +1,6 @@ -use crate::flatgfa::{self, Handle, Link, Orientation, Path, Segment}; -use crate::pool::{self, Id, Span, Store}; -use crate::{GFAStore, HeapFamily}; +use fgfa_ds::flatgfa::{self, Handle, Link, Orientation, Path, Segment}; +use fgfa_ds::pool::{self, Id, Span, Store}; +use fgfa_ds::{GFAStore, HeapFamily}; use argh::FromArgs; use std::collections::{HashMap, HashSet}; diff --git a/flatgfa/src/file.rs b/flatgfa/src/file.rs deleted file mode 100644 index 77bf7bf0..00000000 --- a/flatgfa/src/file.rs +++ /dev/null @@ -1,336 +0,0 @@ -use crate::flatgfa; -use crate::pool::{FixedStore, Pool, Span, Store}; -use memmap::{Mmap, MmapMut}; -use std::mem::{size_of, size_of_val}; -use tinyvec::SliceVec; -use zerocopy::{AsBytes, FromBytes, FromZeroes}; - 
-const MAGIC_NUMBER: u64 = 0xB101_1054; - -/// A table of contents for the FlatGFA file. -#[derive(FromBytes, FromZeroes, AsBytes, Debug)] -#[repr(packed)] -pub struct Toc { - magic: u64, - header: Size, - segs: Size, - paths: Size, - links: Size, - steps: Size, - seq_data: Size, - overlaps: Size, - alignment: Size, - name_data: Size, - optional_data: Size, - line_order: Size, -} - -/// A table-of-contents entry for a pool in the FlatGFA file. -#[derive(FromBytes, FromZeroes, AsBytes, Clone, Copy, Debug)] -#[repr(packed)] -struct Size { - /// The number of actual elements in the pool. - len: usize, - - // The allocated space for the pool. `capacity - len` slots are "empty." - capacity: usize, -} - -impl Size { - fn of_pool(pool: Pool) -> Self { - Size { - len: pool.len(), - capacity: pool.len(), - } - } - - fn of_store(store: &FixedStore<'_, T>) -> Self { - Size { - len: store.len(), - capacity: store.capacity(), - } - } - - fn bytes(&self) -> usize { - self.capacity * size_of::() - } - - fn empty(capacity: usize) -> Self { - Size { len: 0, capacity } - } -} - -impl Toc { - /// Get the total size in bytes of the file described. - pub fn size(&self) -> usize { - size_of::() - + self.header.bytes::() - + self.segs.bytes::() - + self.paths.bytes::() - + self.links.bytes::() - + self.steps.bytes::() - + self.seq_data.bytes::() - + self.overlaps.bytes::>() - + self.alignment.bytes::() - + self.name_data.bytes::() - + self.optional_data.bytes::() - + self.line_order.bytes::() - } - - /// Get a table of contents that fits a FlatGFA with no spare space. 
- fn full(gfa: &flatgfa::FlatGFA) -> Self { - Self { - magic: MAGIC_NUMBER, - header: Size::of_pool(gfa.header), - segs: Size::of_pool(gfa.segs), - paths: Size::of_pool(gfa.paths), - links: Size::of_pool(gfa.links), - steps: Size::of_pool(gfa.steps), - seq_data: Size::of_pool(gfa.seq_data), - overlaps: Size::of_pool(gfa.overlaps), - alignment: Size::of_pool(gfa.alignment), - name_data: Size::of_pool(gfa.name_data), - optional_data: Size::of_pool(gfa.optional_data), - line_order: Size::of_pool(gfa.line_order), - } - } - - pub fn for_fixed_store(store: &flatgfa::FixedGFAStore) -> Self { - Self { - magic: MAGIC_NUMBER, - header: Size::of_store(&store.header), - segs: Size::of_store(&store.segs), - paths: Size::of_store(&store.paths), - links: Size::of_store(&store.links), - steps: Size::of_store(&store.steps), - seq_data: Size::of_store(&store.seq_data), - overlaps: Size::of_store(&store.overlaps), - alignment: Size::of_store(&store.alignment), - name_data: Size::of_store(&store.name_data), - optional_data: Size::of_store(&store.optional_data), - line_order: Size::of_store(&store.line_order), - } - } - - /// Guess a reasonable set of capacities for a fresh file. - pub fn guess(factor: usize) -> Self { - Self { - magic: MAGIC_NUMBER, - header: Size::empty(128), - segs: Size::empty(32 * factor * factor), - paths: Size::empty(factor), - links: Size::empty(32 * factor * factor), - steps: Size::empty(1024 * factor * factor), - seq_data: Size::empty(512 * factor * factor), - overlaps: Size::empty(256 * factor), - alignment: Size::empty(64 * factor * factor), - name_data: Size::empty(64 * factor), - optional_data: Size::empty(512 * factor * factor), - line_order: Size::empty(64 * factor * factor), - } - } - - /// Estimate a reasonable set of capacities for a fresh file based on some - /// measurements of the GFA text. 
- pub fn estimate( - segs: usize, - links: usize, - paths: usize, - header_bytes: usize, - seg_bytes: usize, - path_bytes: usize, - ) -> Self { - Self { - magic: MAGIC_NUMBER, - header: Size::empty(header_bytes), - segs: Size::empty(segs), - paths: Size::empty(paths), - links: Size::empty(links), - steps: Size::empty(path_bytes / 3), - seq_data: Size::empty(seg_bytes), - overlaps: Size::empty((links + paths) * 2), - alignment: Size::empty(links * 2 + paths * 4), - name_data: Size::empty(paths * 512), - optional_data: Size::empty(links * 16), - line_order: Size::empty(segs + links + paths + 8), - } - } -} - -/// Consume `size.len` items from a byte slice, skip the remainder of `size.capacity` -/// elements, and return the items and the rest of the slice. -fn slice_prefix(data: &[u8], size: Size) -> (&[T], &[u8]) { - let (prefix, rest) = T::slice_from_prefix(data, size.len).unwrap(); - let pad = size_of::() * (size.capacity - size.len); - (prefix, &rest[pad..]) -} - -/// Read the table of contents from a prefix of the byte buffer. -fn read_toc(data: &[u8]) -> (&Toc, &[u8]) { - let toc = Toc::ref_from_prefix(data).unwrap(); - let rest = &data[size_of::()..]; - let magic = toc.magic; - assert_eq!(magic, MAGIC_NUMBER); - (toc, rest) -} - -fn read_toc_mut(data: &mut [u8]) -> (&mut Toc, &mut [u8]) { - let (toc_slice, rest) = Toc::mut_slice_from_prefix(data, 1).unwrap(); - let toc = &mut toc_slice[0]; - let magic = toc.magic; - assert_eq!(magic, MAGIC_NUMBER); - (toc, rest) -} - -/// Get a FlatGFA backed by the data in a byte buffer. 
-pub fn view(data: &[u8]) -> flatgfa::FlatGFA { - let (toc, rest) = read_toc(data); - - let (header, rest) = slice_prefix(rest, toc.header); - let (segs, rest) = slice_prefix(rest, toc.segs); - let (paths, rest) = slice_prefix(rest, toc.paths); - let (links, rest) = slice_prefix(rest, toc.links); - let (steps, rest) = slice_prefix(rest, toc.steps); - let (seq_data, rest) = slice_prefix(rest, toc.seq_data); - let (overlaps, rest) = slice_prefix(rest, toc.overlaps); - let (alignment, rest) = slice_prefix(rest, toc.alignment); - let (name_data, rest) = slice_prefix(rest, toc.name_data); - let (optional_data, rest) = slice_prefix(rest, toc.optional_data); - let (line_order, _) = slice_prefix(rest, toc.line_order); - - flatgfa::FlatGFA { - header: header.into(), - segs: segs.into(), - paths: paths.into(), - links: links.into(), - steps: steps.into(), - seq_data: seq_data.into(), - overlaps: overlaps.into(), - alignment: alignment.into(), - name_data: name_data.into(), - optional_data: optional_data.into(), - line_order: line_order.into(), - } -} - -/// Like `slice_prefix`, but produce a `SliceVec`. -fn slice_vec_prefix( - data: &mut [u8], - size: Size, -) -> (SliceVec, &mut [u8]) { - let (prefix, rest) = T::mut_slice_from_prefix(data, size.capacity).unwrap(); - let vec = SliceVec::from_slice_len(prefix, size.len); - (vec, rest) -} - -/// Get a FlatGFA `SliceStore` from the suffix of a file just following the table of contents. 
-fn slice_store<'a>(data: &'a mut [u8], toc: &Toc) -> flatgfa::FixedGFAStore<'a> { - let (header, rest) = slice_vec_prefix(data, toc.header); - let (segs, rest) = slice_vec_prefix(rest, toc.segs); - let (paths, rest) = slice_vec_prefix(rest, toc.paths); - let (links, rest) = slice_vec_prefix(rest, toc.links); - let (steps, rest) = slice_vec_prefix(rest, toc.steps); - let (seq_data, rest) = slice_vec_prefix(rest, toc.seq_data); - let (overlaps, rest) = slice_vec_prefix(rest, toc.overlaps); - let (alignment, rest) = slice_vec_prefix(rest, toc.alignment); - let (name_data, rest) = slice_vec_prefix(rest, toc.name_data); - let (optional_data, rest) = slice_vec_prefix(rest, toc.optional_data); - let (line_order, _) = slice_vec_prefix(rest, toc.line_order); - - flatgfa::FixedGFAStore { - header: header.into(), - segs: segs.into(), - paths: paths.into(), - links: links.into(), - steps: steps.into(), - seq_data: seq_data.into(), - overlaps: overlaps.into(), - alignment: alignment.into(), - name_data: name_data.into(), - optional_data: optional_data.into(), - line_order: line_order.into(), - } -} - -/// Get a mutable FlatGFA `SliceStore` backed by a byte buffer. -pub fn view_store(data: &mut [u8]) -> flatgfa::FixedGFAStore { - let (toc, rest) = read_toc_mut(data); - slice_store(rest, toc) -} - -/// Initialize a buffer with an empty FlatGFA store. -pub fn init(data: &mut [u8], toc: Toc) -> (&mut Toc, flatgfa::FixedGFAStore) { - // Write the table of contents. - assert!(data.len() == toc.size()); - toc.write_to_prefix(data).unwrap(); - - // Get a mutable reference to the embedded TOC. - let (toc_bytes, rest) = data.split_at_mut(size_of::()); - let toc_mut = Toc::mut_from(toc_bytes).unwrap(); - - // Extract a store from the remaining bytes. 
- (toc_mut, slice_store(rest, &toc)) -} - -fn write_bump<'a, T: AsBytes + ?Sized>(buf: &'a mut [u8], data: &T) -> Option<&'a mut [u8]> { - let len = size_of_val(data); - data.write_to_prefix(buf)?; - Some(&mut buf[len..]) -} - -fn write_bytes<'a>(buf: &'a mut [u8], data: &[u8]) -> Option<&'a mut [u8]> { - let len = data.len(); - buf[0..len].copy_from_slice(data); - Some(&mut buf[len..]) -} - -/// Copy a FlatGFA into a byte buffer. -pub fn dump(gfa: &flatgfa::FlatGFA, buf: &mut [u8]) { - // Table of contents. - let toc = Toc::full(gfa); - let rest = write_bump(buf, &toc).unwrap(); - - // All the slices. - let rest = write_bytes(rest, gfa.header.all()).unwrap(); - let rest = write_bump(rest, gfa.segs.all()).unwrap(); - let rest = write_bump(rest, gfa.paths.all()).unwrap(); - let rest = write_bump(rest, gfa.links.all()).unwrap(); - let rest = write_bump(rest, gfa.steps.all()).unwrap(); - let rest = write_bytes(rest, gfa.seq_data.all()).unwrap(); - let rest = write_bump(rest, gfa.overlaps.all()).unwrap(); - let rest = write_bump(rest, gfa.alignment.all()).unwrap(); - let rest = write_bytes(rest, gfa.name_data.all()).unwrap(); - let rest = write_bytes(rest, gfa.optional_data.all()).unwrap(); - write_bytes(rest, gfa.line_order.all()).unwrap(); -} - -/// Get the total size in bytes of a FlatGFA structure. This should result in a big -/// enough buffer to write the entire FlatGFA into with `dump`. 
-pub fn size(gfa: &flatgfa::FlatGFA) -> usize { - Toc::full(gfa).size() -} - -pub fn map_file(name: &str) -> Mmap { - let file = std::fs::File::open(name).unwrap(); - unsafe { Mmap::map(&file) }.unwrap() -} - -pub fn map_new_file(name: &str, size: u64) -> MmapMut { - let file = std::fs::OpenOptions::new() - .read(true) - .write(true) - .create(true) - .open(name) - .unwrap(); - file.set_len(size).unwrap(); - unsafe { MmapMut::map_mut(&file) }.unwrap() -} - -pub fn map_file_mut(name: &str) -> MmapMut { - let file = std::fs::OpenOptions::new() - .read(true) - .write(true) - .open(name) - .unwrap(); - unsafe { MmapMut::map_mut(&file) }.unwrap() -} diff --git a/flatgfa/src/flatgfa.rs b/flatgfa/src/flatgfa.rs deleted file mode 100644 index a7f0e5dd..00000000 --- a/flatgfa/src/flatgfa.rs +++ /dev/null @@ -1,430 +0,0 @@ -use std::str::FromStr; - -use crate::pool::{self, Id, Pool, Span, Store}; -use bstr::BStr; -use num_enum::{IntoPrimitive, TryFromPrimitive}; -use zerocopy::{AsBytes, FromBytes, FromZeroes}; - -/// An efficient flattened representation of a GFA file. -/// -/// This struct *borrows* the underlying data from some other data store. Namely, the -/// `GFAStore` structs contain `Vec`s or `Vec`-like arenas as backing stores for each -/// of the slices in this struct. `FlatGFA` itself provides access to the GFA data -/// structure that is agnostic to the location of the underlying bytes. However, all -/// its components have a fixed size; unlike the underlying `GFAStore`, it is not -/// possible to add new objects. -pub struct FlatGFA<'a> { - /// A GFA may optionally have a single header line, with a version number. - /// If this is empty, there is no header line. - pub header: Pool<'a, u8>, - - /// The segment (S) lines in the GFA file. - pub segs: Pool<'a, Segment>, - - /// The path (P) lines. - pub paths: Pool<'a, Path>, - - /// The link (L) lines. - pub links: Pool<'a, Link>, - - /// Paths consist of steps. 
This is a flat pool of steps, chunks of which are - /// associated with each path. - pub steps: Pool<'a, Handle>, - - /// The actual base-pair sequences for the segments. This is a pool of - /// base-pair symbols, chunks of which are associated with each segment. - /// - /// TODO: This could certainly use a smaller representation than `u8` - /// (since we care only about 4 base pairs). If we want to pay the cost - /// of bit-packing. - pub seq_data: Pool<'a, u8>, - - /// Both paths and links can have overlaps, which are CIGAR sequences. They - /// are all stored together here in a flat pool, elements of which point - /// to chunks of `alignment`. - pub overlaps: Pool<'a, Span>, - - /// The CIGAR aligment operations that make up the overlaps. `overlaps` - /// contains range of indices in this pool. - pub alignment: Pool<'a, AlignOp>, - - /// The string names: currenly, just of paths. (We assume segments have integer - /// names, so they don't need to be stored separately.) - pub name_data: Pool<'a, u8>, - - /// Segments can come with optional extra fields, which we store in a flat pool - /// as raw characters because we don't currently care about them. - pub optional_data: Pool<'a, u8>, - - /// An "interleaving" order of GFA lines. This is to preserve perfect round-trip - /// fidelity: we record the order of lines as we saw them when parsing a GFA file - /// so we can emit them again in that order. Elements should be `LineKind` values - /// (but they are checked before we use them). - pub line_order: Pool<'a, u8>, -} - -/// GFA graphs consist of "segment" nodes, which are fragments of base-pair sequences -/// that can be strung together into paths. -#[derive(Debug, FromZeroes, FromBytes, AsBytes, Clone, Copy)] -#[repr(packed)] -pub struct Segment { - /// The segment's name. We assume all names are just plain numbers. - pub name: usize, - - /// The base-pair sequence for the segment. This is a range in the `seq_data` pool. 
- pub seq: Span, - - /// Segments can have optional fields. This is a range in the `optional_data` pool. - pub optional: Span, -} - -impl Segment { - #[allow(clippy::len_without_is_empty)] - pub fn len(&self) -> usize { - self.seq.len() - } -} - -/// A path is a sequence of oriented references to segments. -#[derive(Debug, FromZeroes, FromBytes, AsBytes, Clone, Copy)] -#[repr(packed)] -pub struct Path { - /// The path's name. This can be an arbitrary string. It is a range in the - /// `name_data` pool. - pub name: Span, - - /// The sequence of path steps. This is a range in the `steps` pool. - pub steps: Span, - - /// The CIGAR overlaps for each step on the path. This is a range in the - /// `overlaps` pool. - pub overlaps: Span>, -} - -impl Path { - pub fn step_count(&self) -> usize { - self.steps.end.index() - self.steps.start.index() - } -} - -/// An allowed edge between two oriented segments. -#[derive(Debug, FromBytes, FromZeroes, AsBytes, Clone, Copy)] -#[repr(packed)] -pub struct Link { - /// The source of the edge. - pub from: Handle, - - // The destination of the edge. - pub to: Handle, - - /// The CIGAR overlap between the segments. This is a range in the - /// `alignment` pool. - pub overlap: Span, -} - -impl Link { - /// Is either end of the link the given segment? If so, return the other end. - pub fn incident_seg(&self, seg_id: Id) -> Option> { - if self.from.segment() == seg_id { - Some(self.to.segment()) - } else if self.to.segment() == seg_id { - Some(self.from.segment()) - } else { - None - } - } -} - -/// A forward or backward direction. -#[derive(Debug, PartialEq, IntoPrimitive, TryFromPrimitive)] -#[repr(u8)] -pub enum Orientation { - Forward, // + - Backward, // - -} - -impl FromStr for Orientation { - type Err = (); - - fn from_str(s: &str) -> Result { - if s == "+" { - Ok(Orientation::Forward) - } else if s == "-" { - Ok(Orientation::Backward) - } else { - Err(()) - } - } -} - -/// An oriented reference to a segment. 
-/// -/// A Handle refers to the forward (+) or backward (-) orientation for a given segment. -/// So, logically, it consists of a pair of a segment reference (usize) and an -/// orientation (1 bit). We pack the two values into a single word. -#[derive(Debug, FromBytes, FromZeroes, AsBytes, Clone, Copy, PartialEq, Eq, Hash)] -#[repr(packed)] -pub struct Handle(u32); - -impl Handle { - /// Create a new handle referring to a segment ID and an orientation. - pub fn new(segment: Id, orient: Orientation) -> Self { - let seg_num: u32 = segment.into(); - assert!(seg_num & (1 << (u32::BITS - 1)) == 0, "index too large"); - let orient_bit: u8 = orient.into(); - assert!(orient_bit & !1 == 0, "invalid orientation"); - Self(seg_num << 1 | (orient_bit as u32)) - } - - /// Get the segment ID. This is an index in the `segs` pool. - pub fn segment(&self) -> Id { - (self.0 >> 1).into() - } - - /// Get the orientation (+ or -) for the handle. - pub fn orient(&self) -> Orientation { - ((self.0 & 1) as u8).try_into().unwrap() - } -} - -/// The kind of each operation in a CIGAR alignment. -#[derive(Debug, IntoPrimitive, TryFromPrimitive, Clone, Copy)] -#[repr(u8)] -pub enum AlignOpcode { - Match, // M - Gap, // N - Insertion, // D - Deletion, // I -} - -/// A single operation in a CIGAR alignment, like "3M" or "1D". -/// -/// Logically, this is a pair of a number and an `AlignOpcode`. We pack the two -/// into a single u32. -#[derive(Debug, FromZeroes, FromBytes, AsBytes, Clone, Copy)] -#[repr(packed)] -pub struct AlignOp(u32); - -impl AlignOp { - /// Create a new alignment operation from an opcode and count. - pub fn new(op: AlignOpcode, len: u32) -> Self { - let op_byte: u8 = op.into(); - assert!(len & !0xff == 0, "length too large"); - Self((len << 8) | (op_byte as u32)) - } - - /// Get the operation (M, I, etc.) for this operation. - pub fn op(&self) -> AlignOpcode { - ((self.0 & 0xff) as u8).try_into().unwrap() - } - - /// Get the length of the operation. 
- pub fn len(&self) -> u32 { - self.0 >> 8 - } - - /// Check whether there are zero operations in this alignment. - pub fn is_empty(&self) -> bool { - self.len() == 0 - } -} - -/// An entire CIGAR alignment string, like "3M1D2M". -#[derive(Debug)] -#[repr(transparent)] -pub struct Alignment<'a> { - /// The sequence of operations that make up the alignment. - pub ops: &'a [AlignOp], -} - -/// A kind of GFA line. We use this in `line_order` to preserve the textual order -/// in a GFA file for round-tripping. -#[derive(Debug, IntoPrimitive, TryFromPrimitive)] -#[repr(u8)] -pub enum LineKind { - Header, - Segment, - Path, - Link, -} - -impl<'a> FlatGFA<'a> { - /// Get the base-pair sequence for a segment. - pub fn get_seq(&self, seg: &Segment) -> &BStr { - self.seq_data[seg.seq].as_ref() - } - - /// Look up a segment by its name. - pub fn find_seg(&self, name: usize) -> Option> { - // TODO Make this more efficient by maintaining the name index? This would not be - // too hard; we already have the machinery in `parse.rs`... - self.segs.search(|seg| seg.name == name) - } - - /// Look up a path by its name. - pub fn find_path(&self, name: &BStr) -> Option> { - self.paths.search(|path| self.get_path_name(path) == name) - } - - /// Get the string name of a path. - pub fn get_path_name(&self, path: &Path) -> &BStr { - self.name_data[path.name].as_ref() - } - - pub fn get_path_steps(&self, path: &Path) -> impl Iterator { - self.steps[path.steps].iter() - } - - /// Get a handle's associated segment. - pub fn get_handle_seg(&self, handle: Handle) -> &Segment { - &self.segs[handle.segment()] - } - - /// Get the optional data for a segment, as a tab-separated string. - pub fn get_optional_data(&self, seg: &Segment) -> &BStr { - self.optional_data[seg.optional].as_ref() - } - - /// Look up a CIGAR alignment. - pub fn get_alignment(&self, overlap: Span) -> Alignment { - Alignment { - ops: &self.alignment[overlap] - } - } - - /// Get the recorded order of line kinds. 
- pub fn get_line_order(&self) -> impl Iterator + 'a { - self.line_order - .all() - .iter() - .map(|b| (*b).try_into().unwrap()) - } -} - -/// The data storage pools for a `FlatGFA`. -#[derive(Default)] -pub struct GFAStore<'a, P: StoreFamily<'a>> { - pub header: P::Store, - pub segs: P::Store, - pub paths: P::Store, - pub links: P::Store, - pub steps: P::Store, - pub seq_data: P::Store, - pub overlaps: P::Store>, - pub alignment: P::Store, - pub name_data: P::Store, - pub optional_data: P::Store, - pub line_order: P::Store, -} - -impl<'a, P: StoreFamily<'a>> GFAStore<'a, P> { - /// Add a header line for the GFA file. This may only be added once. - pub fn add_header(&mut self, version: &[u8]) { - assert!(self.header.as_ref().is_empty()); - self.header.add_slice(version); - } - - /// Add a new segment to the GFA file. - pub fn add_seg(&mut self, name: usize, seq: &[u8], optional: &[u8]) -> Id { - self.segs.add(Segment { - name, - seq: self.seq_data.add_slice(seq), - optional: self.optional_data.add_slice(optional), - }) - } - - /// Add a new path. - pub fn add_path( - &mut self, - name: &[u8], - steps: Span, - overlaps: impl Iterator>, - ) -> Id { - let overlaps = self.overlaps.add_iter( - overlaps - .into_iter() - .map(|align| self.alignment.add_iter(align)), - ); - let name = self.name_data.add_slice(name); - self.paths.add(Path { - name, - steps, - overlaps, - }) - } - - /// Add a sequence of steps. - pub fn add_steps(&mut self, steps: impl Iterator) -> Span { - self.steps.add_iter(steps) - } - - /// Add a single step. - pub fn add_step(&mut self, step: Handle) -> Id { - self.steps.add(step) - } - - /// Add a sequence of links. - pub fn add_links(&mut self, links: impl Iterator) -> Span { - self.links.add_iter(links) - } - - /// Add a link between two (oriented) segments. 
- pub fn add_link(&mut self, from: Handle, to: Handle, overlap: Vec) -> Id { - self.links.add(Link { - from, - to, - overlap: self.alignment.add_iter(overlap), - }) - } - - /// Record a line type to preserve the line order. - pub fn record_line(&mut self, kind: LineKind) { - self.line_order.add(kind.into()); - } - - /// Borrow a FlatGFA view of this data store. - pub fn as_ref(&self) -> FlatGFA { - FlatGFA { - header: self.header.as_ref(), - segs: self.segs.as_ref(), - paths: self.paths.as_ref(), - links: self.links.as_ref(), - name_data: self.name_data.as_ref(), - seq_data: self.seq_data.as_ref(), - steps: self.steps.as_ref(), - overlaps: self.overlaps.as_ref(), - alignment: self.alignment.as_ref(), - optional_data: self.optional_data.as_ref(), - line_order: self.line_order.as_ref(), - } - } -} - -pub trait StoreFamily<'a> { - type Store: pool::Store; -} - -#[derive(Default)] -pub struct HeapFamily; -impl<'a> StoreFamily<'a> for HeapFamily { - type Store = pool::HeapStore; -} - -pub struct FixedFamily; -impl<'a> StoreFamily<'a> for FixedFamily { - type Store = pool::FixedStore<'a, T>; -} - -/// A store for `FlatGFA` data backed by fixed-size slices. -/// -/// This store contains `SliceVec`s, which act like `Vec`s but are allocated within -/// a fixed region. This means they have a maximum size, but they can directly map -/// onto the contents of a file. -pub type FixedGFAStore<'a> = GFAStore<'a, FixedFamily>; - -/// A mutable, in-memory data store for `FlatGFA`. -/// -/// This store contains a bunch of `Vec`s: one per array required to implement a -/// `FlatGFA`. It exposes an API for building up a GFA data structure, so it is -/// useful for creating new ones from scratch. 
-pub type HeapGFAStore = GFAStore<'static, HeapFamily>; diff --git a/flatgfa/src/gfaline.rs b/flatgfa/src/gfaline.rs deleted file mode 100644 index 36408d42..00000000 --- a/flatgfa/src/gfaline.rs +++ /dev/null @@ -1,272 +0,0 @@ -use crate::flatgfa::{AlignOp, Orientation}; -use atoi::FromRadix10; - -type ParseResult = Result; -type LineResult<'a> = ParseResult>; -type PartialParseResult<'a, T> = ParseResult<(T, &'a [u8])>; - -/// A parsed GFA file line. -pub enum Line<'a> { - Header(&'a [u8]), - Segment(Segment<'a>), - Link(Link), - Path(Path<'a>), -} - -pub struct Segment<'a> { - pub name: usize, - pub seq: &'a [u8], - pub data: &'a [u8], -} - -pub struct Link { - pub from_seg: usize, - pub from_orient: Orientation, - pub to_seg: usize, - pub to_orient: Orientation, - pub overlap: Vec, -} - -pub struct Path<'a> { - pub name: &'a [u8], - pub steps: &'a [u8], - pub overlaps: Vec>, -} - -/// Parse a single line of a GFA file. -pub fn parse_line(line: &[u8]) -> LineResult { - if line.len() < 2 || line[1] != b'\t' { - return Err("expected marker and tab"); - } - let rest = &line[2..]; - match line[0] { - b'H' => parse_header(rest), - b'S' => parse_seg(rest), - b'L' => parse_link(rest), - b'P' => parse_path(rest), - _ => Err("unhandled line kind"), - } -} - -/// Parse a header line, which looks like `H `. -fn parse_header(line: &[u8]) -> LineResult { - Ok(Line::Header(line)) -} - -/// Parse a segment line, which looks like `S `. -fn parse_seg(line: &[u8]) -> LineResult { - let (name, rest) = parse_num(line)?; - let rest = parse_byte(rest, b'\t')?; - let (seq, data) = parse_field(rest)?; - Ok(Line::Segment(Segment { name, seq, data })) -} - -/// Parse a link line, which looks like `L <+-> <+-> `. 
-fn parse_link(line: &[u8]) -> LineResult { - let (from_seg, rest) = parse_num(line)?; - let rest = parse_byte(rest, b'\t')?; - let (from_orient, rest) = parse_orient(rest)?; - let rest = parse_byte(rest, b'\t')?; - let (to_seg, rest) = parse_num(rest)?; - let rest = parse_byte(rest, b'\t')?; - let (to_orient, rest) = parse_orient(rest)?; - let rest = parse_byte(rest, b'\t')?; - let (overlap, rest) = parse_align(rest)?; - if !rest.is_empty() { - return Err("expected end of line"); - } - Ok(Line::Link(Link { - from_seg, - from_orient, - to_seg, - to_orient, - overlap, - })) -} - -/// Parse a path line, which looks like `P <*|CIGARs>`. -fn parse_path(line: &[u8]) -> LineResult { - let (name, rest) = parse_field(line)?; - let (steps, rest) = parse_field(rest)?; - let (overlaps, rest) = parse_maybe_overlap_list(rest)?; - if !rest.is_empty() { - return Err("expected end of line"); - } - Ok(Line::Path(Path { - name, - steps, - overlaps, - })) -} - -/// Parse a *possible* overlap list, which may be `*` (empty). -pub fn parse_maybe_overlap_list(s: &[u8]) -> PartialParseResult>> { - if s == b"*" { - Ok((vec![], &s[1..])) - } else { - parse_overlap_list(s) - } -} - -/// Parse a comma-separated list of CIGAR strings. -/// -/// TODO: This could be optimized to avoid accumulating into a vector. -fn parse_overlap_list(s: &[u8]) -> PartialParseResult>> { - let mut rest = s; - let mut overlaps = vec![]; - while !rest.is_empty() { - let overlap; - (overlap, rest) = parse_align(rest)?; - overlaps.push(overlap); - if !rest.is_empty() { - rest = parse_byte(rest, b',')?; - } - } - Ok((overlaps, rest)) -} - -/// Consume a chunk of a string up to a given marker byte. -fn parse_until(line: &[u8], marker: u8) -> PartialParseResult<&[u8]> { - let end = memchr::memchr(marker, line).unwrap_or(line.len()); - let rest = if end == line.len() { - &[] - } else { - &line[end + 1..] - }; - Ok((&line[..end], rest)) -} - -/// Consume a string from the line, until a tab (or the end of the line). 
-pub fn parse_field(line: &[u8]) -> PartialParseResult<&[u8]> { - parse_until(line, b'\t') -} - -/// Consume a specific byte. -fn parse_byte(s: &[u8], byte: u8) -> ParseResult<&[u8]> { - if s.is_empty() || s[0] != byte { - return Err("expected byte"); - } - Ok(&s[1..]) -} - -/// Parse a single integer. -fn parse_num(s: &[u8]) -> PartialParseResult { - match T::from_radix_10(s) { - (_, 0) => Err("expected number"), - (num, used) => Ok((num, &s[used..])), - } -} - -/// Parse a segment orientation (+ or -). -fn parse_orient(line: &[u8]) -> PartialParseResult { - if line.is_empty() { - return Err("expected orientation"); - } - let orient = match line[0] { - b'+' => Orientation::Forward, - b'-' => Orientation::Backward, - _ => return Err("expected orient"), - }; - Ok((orient, &line[1..])) -} - -/// Parse a single CIGAR alignment operation (like `4D`). -fn parse_align_op(s: &[u8]) -> PartialParseResult { - let (len, rest) = parse_num::(s)?; - let op = match rest[0] { - b'M' => crate::flatgfa::AlignOpcode::Match, - b'N' => crate::flatgfa::AlignOpcode::Gap, - b'D' => crate::flatgfa::AlignOpcode::Deletion, - b'I' => crate::flatgfa::AlignOpcode::Insertion, - _ => return Err("expected align op"), - }; - Ok((AlignOp::new(op, len), &rest[1..])) -} - -/// Parse a complete CIGAR alignment string (like `3M2I`). -/// -/// TODO This could be optimized to avoid collecting into a vector. -fn parse_align(s: &[u8]) -> PartialParseResult> { - let mut rest = s; - let mut align = vec![]; - while !rest.is_empty() && rest[0].is_ascii_digit() { - let op; - (op, rest) = parse_align_op(rest)?; - align.push(op); - } - Ok((align, rest)) -} - -/// Parse GFA paths' segment lists. These look like `1+,2-,3+`. -pub struct StepsParser<'a> { - str: &'a [u8], - index: usize, - state: StepsParseState, - seg: usize, -} - -/// The parser state: we're either looking for a segment name (or a +/- terminator), -/// or we're expecting a comma (or end of string). 
-enum StepsParseState { - Seg, - Comma, -} - -impl<'a> StepsParser<'a> { - pub fn new(str: &'a [u8]) -> Self { - StepsParser { - str, - index: 0, - state: StepsParseState::Seg, - seg: 0, - } - } - - pub fn rest(&self) -> &[u8] { - &self.str[self.index..] - } -} - -impl<'a> Iterator for StepsParser<'a> { - type Item = (usize, bool); - fn next(&mut self) -> Option<(usize, bool)> { - while self.index < self.str.len() { - // Consume one byte. - let byte = self.str[self.index]; - self.index += 1; - - match self.state { - StepsParseState::Seg => { - if byte == b'+' || byte == b'-' { - self.state = StepsParseState::Comma; - return Some((self.seg, byte == b'+')); - } else if byte.is_ascii_digit() { - self.seg *= 10; - self.seg += (byte - b'0') as usize; - } else { - return None; - } - } - StepsParseState::Comma => { - if byte == b',' { - self.state = StepsParseState::Seg; - self.seg = 0; - } else { - return None; - } - } - } - } - - None - } -} - -#[test] -fn test_parse_steps() { - let s = b"1+,23-,4+ suffix"; - let mut parser = StepsParser::new(s); - let path: Vec<_> = (&mut parser).collect(); - assert_eq!(path, vec![(1, true), (23, false), (4, true)]); - assert_eq!(parser.rest(), b"suffix"); -} diff --git a/flatgfa/src/lib.rs b/flatgfa/src/lib.rs deleted file mode 100644 index d6ec729e..00000000 --- a/flatgfa/src/lib.rs +++ /dev/null @@ -1,9 +0,0 @@ -pub mod cmds; -pub mod file; -pub mod flatgfa; -pub mod gfaline; -pub mod parse; -pub mod pool; -pub mod print; - -pub use flatgfa::*; diff --git a/flatgfa/src/main.rs b/flatgfa/src/main.rs index 376404c4..53968e99 100644 --- a/flatgfa/src/main.rs +++ b/flatgfa/src/main.rs @@ -1,8 +1,10 @@ use argh::FromArgs; -use flatgfa::flatgfa::FlatGFA; -use flatgfa::parse::Parser; -use flatgfa::pool::Store; -use flatgfa::{cmds, file, parse}; // TODO: hopefully remove at some point, this breaks a lot of principles +use fgfa_ds::flatgfa::FlatGFA; +use fgfa_ds::parse::Parser; +use fgfa_ds::pool::Store; +use fgfa_ds::{file, parse}; // TODO: 
hopefully remove at some point, this breaks a lot of principles + +mod cmds; #[derive(FromArgs)] /// Convert between GFA text and FlatGFA binary formats. @@ -112,7 +114,7 @@ fn main() -> Result<(), &'static str> { // defining here which values from out input `gfa` are needed by our final `flat` gfa. // Here we are reference values in two different Stores to create this Flatgfa, and // have not yet found a good rust-safe way to do this - let flat = flatgfa::FlatGFA { + let flat = FlatGFA { header: gfa.header, seq_data: gfa.seq_data, name_data: gfa.name_data, diff --git a/flatgfa/src/parse.rs b/flatgfa/src/parse.rs deleted file mode 100644 index 0685d3f5..00000000 --- a/flatgfa/src/parse.rs +++ /dev/null @@ -1,283 +0,0 @@ -use crate::flatgfa::{self, Handle, LineKind, Orientation}; -use crate::gfaline; -use std::collections::HashMap; -use std::io::BufRead; - -pub struct Parser<'a, P: flatgfa::StoreFamily<'a>> { - /// The flat representation we're building. - flat: flatgfa::GFAStore<'a, P>, - - /// All segment IDs, indexed by their names, which we need to refer to segments in paths. - seg_ids: NameMap, -} - -impl<'a, P: flatgfa::StoreFamily<'a>> Parser<'a, P> { - pub fn new(builder: flatgfa::GFAStore<'a, P>) -> Self { - Self { - flat: builder, - seg_ids: NameMap::default(), - } - } - - /// Parse a GFA text file from an I/O stream. - pub fn parse_stream(mut self, stream: R) -> flatgfa::GFAStore<'a, P> { - // We can parse segments immediately, but we need to defer links and paths until we have all - // the segment names that they might refer to. - let mut deferred_links = Vec::new(); - let mut deferred_paths = Vec::new(); - - // Parse or defer each line. - for line in stream.split(b'\n') { - let line = line.unwrap(); - - // Avoid parsing paths entirely for now; just preserve the entire line for later. - if line[0] == b'P' { - self.flat.record_line(LineKind::Path); - deferred_paths.push(line); - continue; - } - - // Parse other kinds of lines. 
- let gfa_line = gfaline::parse_line(line.as_ref()).unwrap(); - self.record_line(&gfa_line); - - match gfa_line { - gfaline::Line::Header(data) => { - self.flat.add_header(data); - } - gfaline::Line::Segment(seg) => { - self.add_seg(seg); - } - gfaline::Line::Link(link) => { - deferred_links.push(link); - } - gfaline::Line::Path(_) => { - unreachable!("paths handled separately") - } - } - } - - // "Unwind" the deferred links and paths. - for link in deferred_links { - self.add_link(link); - } - for line in deferred_paths { - self.add_path(&line); - } - - self.flat - } - - /// Parse a GFA text file from an in-memory buffer. - pub fn parse_mem(mut self, buf: &[u8]) -> flatgfa::GFAStore<'a, P> { - let mut deferred_lines = Vec::new(); - - for line in MemchrSplit::new(b'\n', buf) { - // When parsing from memory, it's easy to entirely defer parsing of any line: we just keep - // pointers to them. So we defer both paths and links. - if line[0] == b'P' || line[0] == b'L' { - self.flat.record_line(if line[0] == b'P' { - LineKind::Path - } else { - LineKind::Link - }); - deferred_lines.push(line); - continue; - } - - // Actually parse other lines. - let gfa_line = gfaline::parse_line(line).unwrap(); - self.record_line(&gfa_line); - match gfa_line { - gfaline::Line::Header(data) => { - self.flat.add_header(data); - } - gfaline::Line::Segment(seg) => { - self.add_seg(seg); - } - gfaline::Line::Link(_) | gfaline::Line::Path(_) => { - unreachable!("paths and links handled separately") - } - } - } - - // "Unwind" the deferred lines. - for line in deferred_lines { - if line[0] == b'P' { - self.add_path(line); - } else { - let gfa_line = gfaline::parse_line(line).unwrap(); - if let gfaline::Line::Link(link) = gfa_line { - self.add_link(link); - } else { - unreachable!("unexpected deferred line") - } - } - } - - self.flat - } - - /// Record a marker that captures the original GFA line ordering. 
- fn record_line(&mut self, line: &gfaline::Line) { - match line { - gfaline::Line::Header(_) => self.flat.record_line(LineKind::Header), - gfaline::Line::Segment(_) => self.flat.record_line(LineKind::Segment), - gfaline::Line::Link(_) => self.flat.record_line(LineKind::Link), - gfaline::Line::Path(_) => self.flat.record_line(LineKind::Path), - } - } - - fn add_seg(&mut self, seg: gfaline::Segment) { - let seg_id = self.flat.add_seg(seg.name, seg.seq, seg.data); - self.seg_ids.insert(seg.name, seg_id.into()); - } - - fn add_link(&mut self, link: gfaline::Link) { - let from = Handle::new(self.seg_ids.get(link.from_seg).into(), link.from_orient); - let to = Handle::new(self.seg_ids.get(link.to_seg).into(), link.to_orient); - self.flat.add_link(from, to, link.overlap); - } - - fn add_path(&mut self, line: &[u8]) { - // This must be a path line. - assert_eq!(&line[..2], b"P\t"); - let line = &line[2..]; - - // Parse the name. - let (name, rest) = gfaline::parse_field(line).unwrap(); - - // Parse the steps. - let mut step_parser = gfaline::StepsParser::new(rest); - let steps = self.flat.add_steps((&mut step_parser).map(|(name, dir)| { - Handle::new( - self.seg_ids.get(name).into(), - if dir { - Orientation::Forward - } else { - Orientation::Backward - }, - ) - })); - let rest = step_parser.rest(); - - // Parse the overlaps. - let (overlaps, rest) = gfaline::parse_maybe_overlap_list(rest).unwrap(); - - assert!(rest.is_empty()); - self.flat.add_path(name, steps, overlaps.into_iter()); - } -} - -impl Parser<'static, flatgfa::HeapFamily> { - pub fn for_heap() -> Self { - Self::new(flatgfa::HeapGFAStore::default()) - } -} - -impl<'a> Parser<'a, flatgfa::FixedFamily> { - pub fn for_slice(store: flatgfa::FixedGFAStore<'a>) -> Self { - Self::new(store) - } -} - -#[derive(Default)] -struct NameMap { - /// Names at most this are assigned *sequential* IDs, i.e., the ID is just the name - /// minus one. - sequential_max: usize, - - /// Non-sequential names go here. 
- others: HashMap, -} - -impl NameMap { - fn insert(&mut self, name: usize, id: u32) { - // Is this the next sequential name? If so, no need to record it in our hash table; - // just bump the number of sequential names we've seen. - if (name - 1) == self.sequential_max && (name - 1) == (id as usize) { - self.sequential_max += 1; - } else { - self.others.insert(name, id); - } - } - - fn get(&self, name: usize) -> u32 { - if name <= self.sequential_max { - (name - 1) as u32 - } else { - self.others[&name] - } - } -} - -/// Scan a GFA text file to count the number of each type of line and measure some sizes -/// that are useful in estimating the final size of the FlatGFA file. -pub fn estimate_toc(buf: &[u8]) -> crate::file::Toc { - let mut segs = 0; - let mut links = 0; - let mut paths = 0; - let mut header_bytes = 0; - let mut seg_bytes = 0; - let mut path_bytes = 0; - - let mut rest = buf; - while !rest.is_empty() { - let marker = rest[0]; - let next = memchr::memchr(b'\n', rest).unwrap_or(rest.len() + 1); - - match marker { - b'H' => { - header_bytes += next; - } - b'S' => { - segs += 1; - seg_bytes += next; - } - b'L' => { - links += 1; - } - b'P' => { - paths += 1; - path_bytes += next; - } - _ => { - panic!("unknown line type") - } - } - - if next >= rest.len() { - break; - } - rest = &rest[next + 1..]; - } - - crate::file::Toc::estimate(segs, links, paths, header_bytes, seg_bytes, path_bytes) -} - -struct MemchrSplit<'a> { - haystack: &'a [u8], - memchr: memchr::Memchr<'a>, - pos: usize, -} - -impl<'a> Iterator for MemchrSplit<'a> { - type Item = &'a [u8]; - - fn next(&mut self) -> Option { - let start = self.pos; - let end = self.memchr.next()?; - self.pos = end + 1; - Some(&self.haystack[start..end]) - } -} - -impl MemchrSplit<'_> { - fn new(needle: u8, haystack: &[u8]) -> MemchrSplit { - MemchrSplit { - haystack, - memchr: memchr::memchr_iter(needle, haystack), - pos: 0, - } - } -} diff --git a/flatgfa/src/pool.rs b/flatgfa/src/pool.rs deleted file mode 
100644 index 2872388a..00000000 --- a/flatgfa/src/pool.rs +++ /dev/null @@ -1,299 +0,0 @@ -use std::ops::{Index, Add, Sub}; -use std::{hash::Hash, marker::PhantomData}; -use tinyvec::SliceVec; -use zerocopy::{AsBytes, FromBytes, FromZeroes}; - -/// An index into a pool. -#[derive(Debug, FromZeroes, FromBytes, AsBytes, Clone, Copy)] -#[repr(transparent)] -pub struct Id(u32, PhantomData); - -impl PartialEq for Id { - fn eq(&self, other: &Self) -> bool { - self.0 == other.0 - } -} - -impl Eq for Id {} - -impl Hash for Id { - fn hash(&self, state: &mut H) { - self.0.hash(state) - } -} - -impl Add for Id { - type Output = Self; - - #[inline] - fn add(self, rhs: u32) -> Self::Output { - Self(self.0 + rhs, PhantomData) - } -} - -impl Sub for Id { - type Output = Self; - #[inline] - fn sub(self, rhs:u32) -> Self::Output { - Self(self.0 - rhs, PhantomData) - } -} - -impl Id { - pub fn index(self) -> usize { - self.0 as usize - } - - pub fn new(index: usize) -> Self { - Self(index.try_into().expect("id too large"), PhantomData) - } -} - -impl From for Id { - fn from(v: u32) -> Self { - Self(v, PhantomData) - } -} - -impl From> for u32 { - fn from(v: Id) -> Self { - v.0 - } -} - -/// A range of indices into a pool. -/// -/// TODO: Consider smaller indices for this, and possibly base/offset instead -/// of start/end. 
-#[derive(Debug, FromZeroes, FromBytes, AsBytes, Clone, Copy, PartialEq, Eq, Hash)] -#[repr(packed)] -pub struct Span { - pub start: Id, - pub end: Id, - _marker: PhantomData, -} - -impl From> for std::ops::Range { - fn from(span: Span) -> std::ops::Range { - (span.start.index())..(span.end.index()) - } -} - -impl From<&Span> for std::ops::Range { - fn from(span: &Span) -> std::ops::Range { - (span.start.0 as usize)..(span.end.0 as usize) - } -} - -impl Span { - pub fn is_empty(&self) -> bool { - self.start.0 == self.end.0 - } - - pub fn len(&self) -> usize { - (self.end.0 - self.start.0) as usize - } - - pub fn contains(&self, id: Id) -> bool { - self.start.0 <= id.0 && id.0 < self.end.0 - } - - pub fn new(start: Id, end: Id) -> Self { - Self { - start, - end, - _marker: PhantomData, - } - } - - pub fn new_empty() -> Self { - Span::new(Id::new(0), Id::new(0)) - } -} - -/// A simple arena for objects of a single type. -/// -/// This trait provides convenient accessors for treating Vec and Vec-like objects -/// as allocation arenas. This trait supports adding to the pool (i.e., growing the -/// arena). Pools also `Deref` to slices, which are `&Pool`s and support convenient -/// access to the current set of objects (but not addition of new objects). -pub trait Store { - /// Get a fixed-size view of the arena. - fn as_ref(&self) -> Pool; - - /// Add an item to the pool and get the new id. - fn add(&mut self, item: T) -> Id; - - /// Add an entire sequence of items to a "pool" vector and return the - /// range of new indices (IDs). - fn add_iter(&mut self, iter: impl IntoIterator) -> Span; - - /// Like `add_iter`, but for slices. - fn add_slice(&mut self, slice: &[T]) -> Span; - - /// Get the number of items in the pool. - fn len(&self) -> usize; - - /// Check whether the pool is empty. - fn is_empty(&self) -> bool { - self.len() == 0 - } - - /// Get the next available ID. 
- fn next_id(&self) -> Id { - Id::new(self.len()) - } -} - -/// A store that uses a `Vec` to allocate objects on the heap. -/// -/// This is a "normal" arena that can freely grow to fill available memory. -#[repr(transparent)] -pub struct HeapStore(Vec); - -impl Store for HeapStore { - fn as_ref(&self) -> Pool { - Pool(&self.0) - } - - fn add(&mut self, item: T) -> Id { - let id = self.as_ref().next_id(); - self.0.push(item); - id - } - - fn add_iter(&mut self, iter: impl IntoIterator) -> Span { - let start = self.as_ref().next_id(); - self.0.extend(iter); - Span::new(start, self.as_ref().next_id()) - } - - fn add_slice(&mut self, slice: &[T]) -> Span { - let start = self.as_ref().next_id(); - self.0.extend_from_slice(slice); - Span::new(start, self.as_ref().next_id()) - } - - fn len(&self) -> usize { - self.0.len() - } -} - -impl Default for HeapStore { - fn default() -> Self { - Self(Vec::new()) - } -} - -/// A store that keeps its data in fixed locations in memory. -/// -/// This is a funkier kind of arena that uses memory that has already been pre-allocated -/// somewhere else, such as in a memory-mapped file. A consequence is that there is a -/// fixed maximum size for the arena; it's possible to add objects only until it fills up. 
-#[repr(transparent)] -pub struct FixedStore<'a, T>(SliceVec<'a, T>); - -impl<'a, T: Clone> Store for FixedStore<'a, T> { - fn as_ref(&self) -> Pool { - Pool(&self.0) - } - - fn add(&mut self, item: T) -> Id { - let id = self.next_id(); - self.0.push(item); - id - } - - fn add_iter(&mut self, iter: impl IntoIterator) -> Span { - let start = self.next_id(); - self.0.extend(iter); - Span::new(start, self.next_id()) - } - - fn add_slice(&mut self, slice: &[T]) -> Span { - let start = self.next_id(); - self.0.extend_from_slice(slice); - Span::new(start, self.next_id()) - } - - fn len(&self) -> usize { - self.0.len() - } -} - -impl<'a, T> FixedStore<'a, T> { - pub fn capacity(&self) -> usize { - self.0.capacity() - } -} - -impl<'a, T> From> for FixedStore<'a, T> { - fn from(slice: SliceVec<'a, T>) -> Self { - Self(slice) - } -} - -/// A fixed-sized arena. -/// -/// This trait allows id-based access to a fixed-size chunk of objects reflecting -/// a `Store`. Unlike `Store`, it does not support adding new objects. -#[repr(transparent)] -#[derive(Clone, Copy)] -pub struct Pool<'a, T>(&'a [T]); - -impl<'a, T> Pool<'a, T> { - /// Get the number of items in the pool. - pub fn len(&self) -> usize { - self.0.len() - } - - /// Check if the pool is empty. - pub fn is_empty(&self) -> bool { - self.0.is_empty() - } - - /// Get the next available ID. - pub fn next_id(&self) -> Id { - Id::new(self.len()) - } - - /// Get the entire pool as a slice. - pub fn all(&self) -> &'a [T] { - self.0 - } - - /// Find the first item in the pool that satisfies a predicate. - pub fn search(&self, pred: impl Fn(&T) -> bool) -> Option> { - self.0.iter().position(pred).map(|i| Id::new(i)) - } - - /// Iterate over id/item pairs in the pool. 
- pub fn items(&self) -> impl Iterator, &T)> { - self.0 - .iter() - .enumerate() - .map(|(i, item)| (Id::new(i), item)) - } -} - -impl Index> for Pool<'_, T> { - type Output = T; - - fn index(&self, id: Id) -> &T { - &self.0[id.index()] - } -} - -impl Index> for Pool<'_, T> { - type Output = [T]; - - fn index(&self, span: Span) -> &[T] { - &self.0[std::ops::Range::from(span)] - } -} - -impl<'a, T> From<&'a [T]> for Pool<'a, T> { - fn from(slice: &'a [T]) -> Self { - Self(slice) - } -} diff --git a/flatgfa/src/print.rs b/flatgfa/src/print.rs deleted file mode 100644 index b6d28502..00000000 --- a/flatgfa/src/print.rs +++ /dev/null @@ -1,153 +0,0 @@ -use crate::flatgfa; -use std::fmt; - -impl fmt::Display for flatgfa::Orientation { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - match self { - flatgfa::Orientation::Forward => write!(f, "+"), - flatgfa::Orientation::Backward => write!(f, "-"), - } - } -} - -impl fmt::Display for flatgfa::AlignOpcode { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - match self { - flatgfa::AlignOpcode::Match => write!(f, "M"), - flatgfa::AlignOpcode::Gap => write!(f, "N"), - flatgfa::AlignOpcode::Insertion => write!(f, "D"), - flatgfa::AlignOpcode::Deletion => write!(f, "I"), - } - } -} - -impl<'a> fmt::Display for flatgfa::Alignment<'a> { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - if self.ops.len() == 0 { - write!(f, "0M")?; - } - for op in self.ops { - write!(f, "{}{}", op.len(), op.op())?; - } - Ok(()) - } -} - -/// A wrapper for displaying components from FlatGFA. 
-pub struct Display<'a, T>(pub &'a flatgfa::FlatGFA<'a>, pub T); - -impl<'a> fmt::Display for Display<'a, flatgfa::Handle> { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - let seg = self.0.get_handle_seg(self.1); - let name = seg.name; - write!(f, "{}{}", name, self.1.orient()) - } -} - -impl<'a> fmt::Display for Display<'a, &flatgfa::Path> { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!(f, "P\t{}\t", self.0.get_path_name(&self.1))?; - let steps = &self.0.steps[self.1.steps]; - write!(f, "{}", Display(self.0, steps[0]))?; - for step in steps[1..].iter() { - write!(f, ",{}", Display(self.0, *step))?; - } - write!(f, "\t")?; - let overlaps = &self.0.overlaps[self.1.overlaps]; - if overlaps.is_empty() { - write!(f, "*")?; - } else { - write!(f, "{}", self.0.get_alignment(overlaps[0]))?; - for overlap in overlaps[1..].iter() { - write!(f, ",{}", self.0.get_alignment(*overlap))?; - } - } - Ok(()) - } -} - -impl<'a> fmt::Display for Display<'a, &flatgfa::Link> { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - let from = self.1.from; - let from_name = self.0.get_handle_seg(from).name; - let to = self.1.to; - let to_name = self.0.get_handle_seg(to).name; - write!( - f, - "L\t{}\t{}\t{}\t{}\t{}", - from_name, - from.orient(), - to_name, - to.orient(), - self.0.get_alignment(self.1.overlap) - ) - } -} - -impl<'a> fmt::Display for Display<'a, &flatgfa::Segment> { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - let name = self.1.name; - write!(f, "S\t{}\t{}", name, self.0.get_seq(self.1))?; - if !self.1.optional.is_empty() { - write!(f, "\t{}", self.0.get_optional_data(self.1))?; - } - Ok(()) - } -} - -/// Print a graph in the order preserved from an original GFA file. 
-fn write_preserved(gfa: &flatgfa::FlatGFA, f: &mut fmt::Formatter<'_>) -> fmt::Result { - let mut seg_iter = gfa.segs.all().iter(); - let mut path_iter = gfa.paths.all().iter(); - let mut link_iter = gfa.links.all().iter(); - for kind in gfa.get_line_order() { - match kind { - flatgfa::LineKind::Header => { - let version = gfa.header; - assert!(!version.is_empty()); - writeln!(f, "H\t{}", bstr::BStr::new(version.all()))?; - } - flatgfa::LineKind::Segment => { - let seg = seg_iter.next().expect("too few segments"); - writeln!(f, "{}", Display(gfa, seg))?; - } - flatgfa::LineKind::Path => { - let path = path_iter.next().expect("too few paths"); - writeln!(f, "{}", Display(gfa, path))?; - } - flatgfa::LineKind::Link => { - let link = link_iter.next().expect("too few links"); - writeln!(f, "{}", Display(gfa, link))?; - } - } - } - Ok(()) -} - -/// Print a graph in a normalized order, ignoring the original GFA line order. -pub fn write_normalized(gfa: &flatgfa::FlatGFA, f: &mut fmt::Formatter<'_>) -> fmt::Result { - if !gfa.header.is_empty() { - writeln!(f, "H\t{}", bstr::BStr::new(gfa.header.all()))?; - } - for seg in gfa.segs.all().iter() { - writeln!(f, "{}", Display(gfa, seg))?; - } - for path in gfa.paths.all().iter() { - writeln!(f, "{}", Display(gfa, path))?; - } - for link in gfa.links.all().iter() { - writeln!(f, "{}", Display(gfa, link))?; - } - Ok(()) -} - -/// Print our flat representation as in GFA text format. 
-impl<'a> fmt::Display for &'a flatgfa::FlatGFA<'a> { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - if self.line_order.is_empty() { - write_normalized(self, f) - } else { - write_preserved(self, f) - } - } -} From 9dc781ba792998f47f0c10ad82e1708bce1a0e9c Mon Sep 17 00:00:00 2001 From: susan-garry Date: Fri, 27 Sep 2024 12:18:13 -0400 Subject: [PATCH 2/8] move fgfa_ds and commands into submodules in flatgfa/src --- flatgfa-py/Cargo.lock | 65 ++-- flatgfa-py/src/lib.rs | 2 +- flatgfa/Cargo.lock | 79 ++--- flatgfa/Cargo.toml | 17 +- flatgfa/src/cmds.rs | 547 ----------------------------- flatgfa/src/commands/basic_cmds.rs | 136 +++++++ flatgfa/src/commands/chop.rs | 160 +++++++++ flatgfa/src/commands/depth.rs | 37 ++ flatgfa/src/commands/extract.rs | 224 ++++++++++++ flatgfa/src/commands/mod.rs | 4 + flatgfa/src/fgfa_ds/file.rs | 336 ++++++++++++++++++ flatgfa/src/fgfa_ds/flatgfa.rs | 430 +++++++++++++++++++++++ flatgfa/src/fgfa_ds/gfaline.rs | 272 ++++++++++++++ flatgfa/src/fgfa_ds/mod.rs | 7 + flatgfa/src/fgfa_ds/parse.rs | 284 +++++++++++++++ flatgfa/src/fgfa_ds/pool.rs | 299 ++++++++++++++++ flatgfa/src/fgfa_ds/print.rs | 153 ++++++++ flatgfa/src/lib.rs | 1 + flatgfa/src/main.rs | 37 +- 19 files changed, 2447 insertions(+), 643 deletions(-) delete mode 100644 flatgfa/src/cmds.rs create mode 100644 flatgfa/src/commands/basic_cmds.rs create mode 100644 flatgfa/src/commands/chop.rs create mode 100644 flatgfa/src/commands/depth.rs create mode 100644 flatgfa/src/commands/extract.rs create mode 100644 flatgfa/src/commands/mod.rs create mode 100644 flatgfa/src/fgfa_ds/file.rs create mode 100644 flatgfa/src/fgfa_ds/flatgfa.rs create mode 100644 flatgfa/src/fgfa_ds/gfaline.rs create mode 100644 flatgfa/src/fgfa_ds/mod.rs create mode 100644 flatgfa/src/fgfa_ds/parse.rs create mode 100644 flatgfa/src/fgfa_ds/pool.rs create mode 100644 flatgfa/src/fgfa_ds/print.rs create mode 100644 flatgfa/src/lib.rs diff --git a/flatgfa-py/Cargo.lock b/flatgfa-py/Cargo.lock 
index dcb76f0d..f8f3d345 100644 --- a/flatgfa-py/Cargo.lock +++ b/flatgfa-py/Cargo.lock @@ -56,9 +56,9 @@ checksum = "cf4b9d6a944f767f8e5e0db018570623c85f3d925ac718db4e06d0187adb21c1" [[package]] name = "bstr" -version = "1.9.1" +version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05efc5cfd9110c8416e471df0e96702d58690178e206e61b7173706673c93706" +checksum = "40723b8fb387abc38f4f4a37c09073622e41dd12327033091ef8950659e6dc0c" dependencies = [ "memchr", "regex-automata", @@ -77,6 +77,14 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" +[[package]] +name = "commands" +version = "0.1.0" +dependencies = [ + "argh", + "fgfa_ds", +] + [[package]] name = "equivalent" version = "1.0.1" @@ -101,6 +109,7 @@ name = "flatgfa" version = "0.1.0" dependencies = [ "argh", + "commands", "fgfa_ds", ] @@ -127,9 +136,9 @@ checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" [[package]] name = "indexmap" -version = "2.2.6" +version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "168fb715dda47215e360912c096649d23d58bf392ac62f73919e831745e40f26" +checksum = "68b900aa2f7301e21c36462b170ee99994de34dff39a4a6a528e80e7376d07e5" dependencies = [ "equivalent", "hashbrown", @@ -165,9 +174,9 @@ dependencies = [ [[package]] name = "memchr" -version = "2.7.2" +version = "2.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c8640c5d730cb13ebd907d8d04b52f55ac9a2eec55b440c8892f40d56c76c1d" +checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" [[package]] name = "memmap" @@ -190,27 +199,27 @@ dependencies = [ [[package]] name = "num-traits" -version = "0.2.18" +version = "0.2.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da0df0e5185db44f69b44f26786fe401b6c293d1907744beaa7fa62b2e5a517a" 
+checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" dependencies = [ "autocfg", ] [[package]] name = "num_enum" -version = "0.7.2" +version = "0.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "02339744ee7253741199f897151b38e72257d13802d4ee837285cc2990a90845" +checksum = "4e613fc340b2220f734a8595782c551f1250e969d87d3be1ae0579e8d4065179" dependencies = [ "num_enum_derive", ] [[package]] name = "num_enum_derive" -version = "0.7.2" +version = "0.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "681030a937600a36906c185595136d26abfebb4aa9c65701cefcaf8578bb982b" +checksum = "af1844ef2428cc3e1cb900be36181049ef3d3193c63e43026cfe202983b27a56" dependencies = [ "proc-macro-crate", "proc-macro2", @@ -255,9 +264,9 @@ checksum = "7170ef9988bc169ba16dd36a7fa041e5c4cbeb6a35b76d4c03daded371eae7c0" [[package]] name = "proc-macro-crate" -version = "3.1.0" +version = "3.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6d37c51ca738a55da99dc0c4a34860fd675453b8b36209178c2249bb13651284" +checksum = "8ecf48c7ca261d60b74ab1a7b20da18bede46776b2e55535cb958eb595c5fa7b" dependencies = [ "toml_edit", ] @@ -355,9 +364,9 @@ dependencies = [ [[package]] name = "regex-automata" -version = "0.4.6" +version = "0.4.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "86b83b8b9847f9bf95ef68afb0b8e6cdb80f498442f5179a29fad448fcc1eaea" +checksum = "38caf58cc5ef2fed281f89292ef23f6365465ed9a41b7a7754eb4e26496c92df" [[package]] name = "scopeguard" @@ -410,21 +419,21 @@ checksum = "e1fc403891a21bcfb7c37834ba66a547a8f402146eba7265b5a6d88059c9ff2f" [[package]] name = "tinyvec" -version = "1.6.0" +version = "1.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87cc5ceb3875bb20c2890005a4e226a4651264a5c75edb2421b52861a0a0cb50" +checksum = "445e881f4f6d382d5f27c034e25eb92edd7c784ceab92a0937db7f2e9471b938" [[package]] name = 
"toml_datetime" -version = "0.6.5" +version = "0.6.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3550f4e9685620ac18a50ed434eb3aec30db8ba93b0287467bca5826ea25baf1" +checksum = "0dd7358ecb8fc2f8d014bf86f6f638ce72ba252a2c3a2572f2a795f1d23efb41" [[package]] name = "toml_edit" -version = "0.21.1" +version = "0.22.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a8534fd7f78b5405e860340ad6575217ce99f38d4d5c8f2442cb5ecb50090e1" +checksum = "4ae48d6208a266e853d946088ed816055e556cc6028c5e8e2b84d9fa5dd7c7f5" dependencies = [ "indexmap", "toml_datetime", @@ -531,18 +540,18 @@ checksum = "bec47e5bfd1bff0eeaf6d8b485cc1074891a197ab4225d504cb7a1ab88b02bf0" [[package]] name = "winnow" -version = "0.5.40" +version = "0.6.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f593a95398737aeed53e489c785df13f3618e41dbcd6718c6addbf1395aa6876" +checksum = "36c1fec1a2bb5866f07c25f68c26e565c4c200aebb96d7e55710c19d3e8ac49b" dependencies = [ "memchr", ] [[package]] name = "zerocopy" -version = "0.7.32" +version = "0.7.35" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "74d4d3961e53fa4c9a25a8637fc2bfaf2595b3d3ae34875568a5cf64787716be" +checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0" dependencies = [ "byteorder", "zerocopy-derive", @@ -550,9 +559,9 @@ dependencies = [ [[package]] name = "zerocopy-derive" -version = "0.7.32" +version = "0.7.35" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ce1b18ccd8e73a9321186f97e46f9f04b778851177567b1975109d26a08d2a6" +checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" dependencies = [ "proc-macro2", "quote", diff --git a/flatgfa-py/src/lib.rs b/flatgfa-py/src/lib.rs index 85452d48..a0593909 100644 --- a/flatgfa-py/src/lib.rs +++ b/flatgfa-py/src/lib.rs @@ -1,4 +1,4 @@ -use flatgfa::pool::Id; +use flatgfa::fgfa_ds::pool::Id; use flatgfa::{self, 
file, print, FlatGFA, HeapGFAStore}; use pyo3::exceptions::PyIndexError; use pyo3::prelude::*; diff --git a/flatgfa/Cargo.lock b/flatgfa/Cargo.lock index bf18eb98..8f3f20c6 100644 --- a/flatgfa/Cargo.lock +++ b/flatgfa/Cargo.lock @@ -44,15 +44,15 @@ dependencies = [ [[package]] name = "autocfg" -version = "1.1.0" +version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" +checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" [[package]] name = "bstr" -version = "1.9.1" +version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05efc5cfd9110c8416e471df0e96702d58690178e206e61b7173706673c93706" +checksum = "40723b8fb387abc38f4f4a37c09073622e41dd12327033091ef8950659e6dc0c" dependencies = [ "memchr", "regex-automata", @@ -72,9 +72,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" [[package]] -name = "fgfa_ds" +name = "fgfa" version = "0.1.0" dependencies = [ + "argh", "atoi", "bstr", "memchr", @@ -84,25 +85,17 @@ dependencies = [ "zerocopy", ] -[[package]] -name = "flatgfa" -version = "0.1.0" -dependencies = [ - "argh", - "fgfa_ds", -] - [[package]] name = "hashbrown" -version = "0.14.3" +version = "0.14.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "290f1a1d9242c78d09ce40a5e87e7554ee637af1351968159f4952f028f75604" +checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" [[package]] name = "indexmap" -version = "2.2.5" +version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7b0b929d511467233429c45a44ac1dcaa21ba0f5ba11e4879e6ed28ddb4f9df4" +checksum = "68b900aa2f7301e21c36462b170ee99994de34dff39a4a6a528e80e7376d07e5" dependencies = [ "equivalent", "hashbrown", @@ -110,15 +103,15 @@ dependencies = [ [[package]] name 
= "libc" -version = "0.2.153" +version = "0.2.159" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c198f91728a82281a64e1f4f9eeb25d82cb32a5de251c6bd1b5154d63a8e7bd" +checksum = "561d97a539a36e26a9a5fad1ea11a3039a67714694aaa379433e580854bc3dc5" [[package]] name = "memchr" -version = "2.7.1" +version = "2.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "523dc4f511e55ab87b694dc30d0f820d60906ef06413f93d4d7a1385599cc149" +checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" [[package]] name = "memmap" @@ -132,27 +125,27 @@ dependencies = [ [[package]] name = "num-traits" -version = "0.2.18" +version = "0.2.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da0df0e5185db44f69b44f26786fe401b6c293d1907744beaa7fa62b2e5a517a" +checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" dependencies = [ "autocfg", ] [[package]] name = "num_enum" -version = "0.7.2" +version = "0.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "02339744ee7253741199f897151b38e72257d13802d4ee837285cc2990a90845" +checksum = "4e613fc340b2220f734a8595782c551f1250e969d87d3be1ae0579e8d4065179" dependencies = [ "num_enum_derive", ] [[package]] name = "num_enum_derive" -version = "0.7.2" +version = "0.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "681030a937600a36906c185595136d26abfebb4aa9c65701cefcaf8578bb982b" +checksum = "af1844ef2428cc3e1cb900be36181049ef3d3193c63e43026cfe202983b27a56" dependencies = [ "proc-macro-crate", "proc-macro2", @@ -162,9 +155,9 @@ dependencies = [ [[package]] name = "proc-macro-crate" -version = "3.1.0" +version = "3.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6d37c51ca738a55da99dc0c4a34860fd675453b8b36209178c2249bb13651284" +checksum = "8ecf48c7ca261d60b74ab1a7b20da18bede46776b2e55535cb958eb595c5fa7b" dependencies = [ "toml_edit", 
] @@ -189,9 +182,9 @@ dependencies = [ [[package]] name = "regex-automata" -version = "0.4.6" +version = "0.4.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "86b83b8b9847f9bf95ef68afb0b8e6cdb80f498442f5179a29fad448fcc1eaea" +checksum = "38caf58cc5ef2fed281f89292ef23f6365465ed9a41b7a7754eb4e26496c92df" [[package]] name = "serde" @@ -226,21 +219,21 @@ dependencies = [ [[package]] name = "tinyvec" -version = "1.6.0" +version = "1.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87cc5ceb3875bb20c2890005a4e226a4651264a5c75edb2421b52861a0a0cb50" +checksum = "445e881f4f6d382d5f27c034e25eb92edd7c784ceab92a0937db7f2e9471b938" [[package]] name = "toml_datetime" -version = "0.6.5" +version = "0.6.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3550f4e9685620ac18a50ed434eb3aec30db8ba93b0287467bca5826ea25baf1" +checksum = "0dd7358ecb8fc2f8d014bf86f6f638ce72ba252a2c3a2572f2a795f1d23efb41" [[package]] name = "toml_edit" -version = "0.21.1" +version = "0.22.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a8534fd7f78b5405e860340ad6575217ce99f38d4d5c8f2442cb5ecb50090e1" +checksum = "4ae48d6208a266e853d946088ed816055e556cc6028c5e8e2b84d9fa5dd7c7f5" dependencies = [ "indexmap", "toml_datetime", @@ -277,18 +270,18 @@ checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" [[package]] name = "winnow" -version = "0.5.40" +version = "0.6.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f593a95398737aeed53e489c785df13f3618e41dbcd6718c6addbf1395aa6876" +checksum = "36c1fec1a2bb5866f07c25f68c26e565c4c200aebb96d7e55710c19d3e8ac49b" dependencies = [ "memchr", ] [[package]] name = "zerocopy" -version = "0.7.32" +version = "0.7.35" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "74d4d3961e53fa4c9a25a8637fc2bfaf2595b3d3ae34875568a5cf64787716be" +checksum = 
"1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0" dependencies = [ "byteorder", "zerocopy-derive", @@ -296,9 +289,9 @@ dependencies = [ [[package]] name = "zerocopy-derive" -version = "0.7.32" +version = "0.7.35" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ce1b18ccd8e73a9321186f97e46f9f04b778851177567b1975109d26a08d2a6" +checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" dependencies = [ "proc-macro2", "quote", diff --git a/flatgfa/Cargo.toml b/flatgfa/Cargo.toml index 6178f607..f8bad886 100644 --- a/flatgfa/Cargo.toml +++ b/flatgfa/Cargo.toml @@ -1,18 +1,17 @@ -[workspace] -members = ["fgfa_ds"] - [package] -name = "flatgfa" +name = "fgfa" version = "0.1.0" edition = "2021" -[[bin]] -name = "fgfa" -path = "src/main.rs" - [dependencies] argh = "0.1.12" -fgfa_ds = { path = "fgfa_ds" } +atoi = "2.0.0" +bstr = "1.9.1" +memchr = "2.7.1" +memmap = "0.7.0" +num_enum = "0.7.2" +tinyvec = "1.6.0" +zerocopy = { version = "0.7.32", features = ["derive"] } [profile.profiling] inherits = "release" diff --git a/flatgfa/src/cmds.rs b/flatgfa/src/cmds.rs deleted file mode 100644 index 3e56d721..00000000 --- a/flatgfa/src/cmds.rs +++ /dev/null @@ -1,547 +0,0 @@ -use fgfa_ds::flatgfa::{self, Handle, Link, Orientation, Path, Segment}; -use fgfa_ds::pool::{self, Id, Span, Store}; -use fgfa_ds::{GFAStore, HeapFamily}; -use argh::FromArgs; -use std::collections::{HashMap, HashSet}; - -/// print the FlatGFA table of contents -#[derive(FromArgs, PartialEq, Debug)] -#[argh(subcommand, name = "toc")] -pub struct Toc {} - -pub fn toc(gfa: &flatgfa::FlatGFA) { - eprintln!("header: {}", gfa.header.len()); - eprintln!("segs: {}", gfa.segs.len()); - eprintln!("paths: {}", gfa.paths.len()); - eprintln!("links: {}", gfa.links.len()); - eprintln!("steps: {}", gfa.steps.len()); - eprintln!("seq_data: {}", gfa.seq_data.len()); - eprintln!("overlaps: {}", gfa.overlaps.len()); - eprintln!("alignment: {}", 
gfa.alignment.len()); - eprintln!("name_data: {}", gfa.name_data.len()); - eprintln!("optional_data: {}", gfa.optional_data.len()); - eprintln!("line_order: {}", gfa.line_order.len()); -} - -/// list the paths -#[derive(FromArgs, PartialEq, Debug)] -#[argh(subcommand, name = "paths")] -pub struct Paths {} - -pub fn paths(gfa: &flatgfa::FlatGFA) { - for path in gfa.paths.all().iter() { - println!("{}", gfa.get_path_name(path)); - } -} - -/// calculate graph statistics -#[derive(FromArgs, PartialEq, Debug)] -#[argh(subcommand, name = "stats")] -pub struct Stats { - /// show basic metrics - #[argh(switch, short = 'S')] - summarize: bool, - - /// number of segments with at least one self-loop link - #[argh(switch, short = 'L')] - self_loops: bool, -} - -pub fn stats(gfa: &flatgfa::FlatGFA, args: Stats) { - if args.summarize { - println!("#length\tnodes\tedges\tpaths\tsteps"); - println!( - "{}\t{}\t{}\t{}\t{}", - gfa.seq_data.len(), - gfa.segs.len(), - gfa.links.len(), - gfa.paths.len(), - gfa.steps.len() - ); - } else if args.self_loops { - let mut counts: HashMap, usize> = HashMap::new(); - let mut total: usize = 0; - for link in gfa.links.all().iter() { - if link.from.segment() == link.to.segment() { - let count = counts.entry(link.from.segment()).or_insert(0); - *count += 1; - total += 1; - } - } - println!("#type\tnum"); - println!("total\t{}", total); - println!("unique\t{}", counts.len()); - } -} - -/// find a nucleotide position within a path -#[derive(FromArgs, PartialEq, Debug)] -#[argh(subcommand, name = "position")] -pub struct Position { - /// path_name,offset,orientation - #[argh(option, short = 'p')] - path_pos: String, -} - -pub fn position(gfa: &flatgfa::FlatGFA, args: Position) -> Result<(), &'static str> { - // Parse the position triple, which looks like `path,42,+`. 
- let (path_name, offset, orientation) = { - let parts: Vec<_> = args.path_pos.split(',').collect(); - if parts.len() != 3 { - return Err("position must be path_name,offset,orientation"); - } - let off: usize = parts[1].parse().or(Err("offset must be a number"))?; - let ori: flatgfa::Orientation = parts[2].parse().or(Err("orientation must be + or -"))?; - (parts[0], off, ori) - }; - - let path_id = gfa.find_path(path_name.into()).ok_or("path not found")?; - let path = &gfa.paths[path_id]; - assert_eq!( - orientation, - flatgfa::Orientation::Forward, - "only + is implemented so far" - ); - - // Traverse the path until we reach the position. - let mut cur_pos = 0; - let mut found = None; - for step in &gfa.steps[path.steps] { - let seg = gfa.get_handle_seg(*step); - let end_pos = cur_pos + seg.len(); - if offset < end_pos { - // Found it! - found = Some((*step, offset - cur_pos)); - break; - } - cur_pos = end_pos; - } - - // Print the match. - if let Some((handle, seg_off)) = found { - let seg = gfa.get_handle_seg(handle); - let seg_name = seg.name; - println!("#source.path.pos\ttarget.graph.pos"); - println!( - "{},{},{}\t{},{},{}", - path_name, - offset, - orientation, - seg_name, - seg_off, - handle.orient() - ); - } - - Ok(()) -} - -/// create a subset graph -#[derive(FromArgs, PartialEq, Debug)] -#[argh(subcommand, name = "extract")] -pub struct Extract { - /// segment to extract around - #[argh(option, short = 'n')] - seg_name: usize, - - /// number of edges "away" from the node to include - #[argh(option, short = 'c')] - link_distance: usize, - - /// maximum number of basepairs allowed between subpaths s.t. 
the subpaths are merged together - #[argh(option, short = 'd', long = "max-distance-subpaths", default = "300000")] - max_distance_subpaths: usize, // TODO: possibly make this bigger - - /// maximum number of iterations before we stop merging subpaths - #[argh(option, short = 'e', long = "max-merging-iterations", default = "6")] - num_iterations: usize // TODO: probably make this smaller -} - -pub fn extract( - gfa: &flatgfa::FlatGFA, - args: Extract, -) -> Result { - let origin_seg = gfa.find_seg(args.seg_name).ok_or("segment not found")?; - - let mut subgraph = SubgraphBuilder::new(gfa); - subgraph.add_header(); - subgraph.extract(origin_seg, args.link_distance, args.max_distance_subpaths, args.num_iterations); - Ok(subgraph.store) -} - -/// A helper to construct a new graph that includes part of an old graph. -struct SubgraphBuilder<'a> { - old: &'a flatgfa::FlatGFA<'a>, - store: flatgfa::HeapGFAStore, - seg_map: HashMap, Id>, -} - -struct SubpathStart { - step: Id, // The id of the first step in the subpath. - pos: usize, // The bp position at the start of the subpath. -} - -impl<'a> SubgraphBuilder<'a> { - fn new(old: &'a flatgfa::FlatGFA) -> Self { - Self { - old, - store: flatgfa::HeapGFAStore::default(), - seg_map: HashMap::new(), - } - } - - /// Include the old graph's header - fn add_header(&mut self) { - // pub fn add_header(&mut self, version: &[u8]) { - // assert!(self.header.as_ref().is_empty()); - // self.header.add_slice(version); - // } - assert!(self.store.header.as_ref().is_empty()); - self.store.header.add_slice(self.old.header.all()); - } - - /// Add a segment from the source graph to this subgraph. - fn include_seg(&mut self, seg_id: Id) { - let seg = &self.old.segs[seg_id]; - let new_seg_id = self.store.add_seg( - seg.name, - self.old.get_seq(seg), - self.old.get_optional_data(seg), - ); - self.seg_map.insert(seg_id, new_seg_id); - } - - /// Add a link from the source graph to the subgraph. 
- fn include_link(&mut self, link: &flatgfa::Link) { - let from = self.tr_handle(link.from); - let to = self.tr_handle(link.to); - let overlap = self.old.get_alignment(link.overlap); - self.store.add_link(from, to, overlap.ops.into()); - } - - /// Add a single subpath from the given path to the subgraph. - fn include_subpath(&mut self, path: &flatgfa::Path, start: &SubpathStart, end_pos: usize) { - let steps = pool::Span::new(start.step, self.store.steps.next_id()); // why the next id? - let name = format!("{}:{}-{}", self.old.get_path_name(path), start.pos, end_pos); - self.store - .add_path(name.as_bytes(), steps, std::iter::empty()); - } - - /// Identify all the subpaths in a path from the original graph that cross through - /// segments in this subgraph and merge them if possible. - fn merge_subpaths(&mut self, path: &flatgfa::Path, max_distance_subpaths: usize) { - // these are subpaths which *aren't* already included in the new graph - let mut cur_subpath_start: Option = Some(0); - let mut subpath_length = 0; - let mut ignore_path = true; - - for (idx, step) in self.old.steps[path.steps].iter().enumerate() { - let in_neighb = self.seg_map.contains_key(&step.segment()); - - if let (Some(start), true) = (&cur_subpath_start, in_neighb) { - // We just entered the subgraph. End the current subpath. - if !ignore_path && subpath_length <= max_distance_subpaths { - // TODO: type safety - let subpath_span = Span::new(path.steps.start + *start as u32, path.steps.start + idx as u32); - for step in &self.old.steps[subpath_span] { - if !self.seg_map.contains_key(&step.segment()) { - self.include_seg(step.segment()); - } - } - } - cur_subpath_start = None; - ignore_path = false; - } else if let (None, false) = (&cur_subpath_start, in_neighb) { - // We've exited the current subgraph, start a new subpath - cur_subpath_start = Some(idx); - } - - // Track the current bp position in the path. 
- subpath_length += self.old.get_handle_seg(*step).len(); - } - } - - /// Identify all the subpaths in a path from the original graph that cross through - /// segments in this subgraph and add them. - fn find_subpaths(&mut self, path: &flatgfa::Path) { - let mut cur_subpath_start: Option = None; - let mut path_pos = 0; - - for step in &self.old.steps[path.steps] { - let in_neighb = self.seg_map.contains_key(&step.segment()); - - if let (Some(start), false) = (&cur_subpath_start, in_neighb) { - // End the current subpath. - self.include_subpath(path, start, path_pos); - cur_subpath_start = None; - } else if let (None, true) = (&cur_subpath_start, in_neighb) { - // Start a new subpath. - cur_subpath_start = Some(SubpathStart { - step: self.store.steps.next_id(), - pos: path_pos, - }); - } - - // Add the (translated) step to the new graph. - if in_neighb { - self.store.add_step(self.tr_handle(*step)); - } - - // Track the current bp position in the path. - path_pos += self.old.get_handle_seg(*step).len(); - } - - // Did we reach the end of the path while still in the neighborhood? - if let Some(start) = cur_subpath_start { - self.include_subpath(path, &start, path_pos); - } - } - - /// Translate a handle from the source graph to this subgraph. - fn tr_handle(&self, old_handle: flatgfa::Handle) -> flatgfa::Handle { - // TODO: is this just generating the handle or should we add it to the new graph? - flatgfa::Handle::new(self.seg_map[&old_handle.segment()], old_handle.orient()) - } - - /// Check whether a segment from the old graph is in the subgraph. - fn contains(&self, old_seg_id: Id) -> bool { - self.seg_map.contains_key(&old_seg_id) - } - - /// Extract a subgraph consisting of a neighborhood of segments up to `dist` links away - /// from the given segment in the original graph. - /// - /// Include any links between the segments in the neighborhood and subpaths crossing - /// through the neighborhood. 
- fn extract(&mut self, origin: Id, dist: usize, max_distance_subpaths: usize, num_iterations: usize) { - self.include_seg(origin); - - // Find the set of all segments that are c links away. - let mut frontier: Vec> = Vec::new(); - let mut next_frontier: Vec> = Vec::new(); - frontier.push(origin); - for _ in 0..dist { - while let Some(seg_id) = frontier.pop() { - for link in self.old.links.all().iter() { - if let Some(other_seg) = link.incident_seg(seg_id) { - // Add other_seg to the frontier set if it is not already in the frontier set or the seg_map - if !self.seg_map.contains_key(&other_seg) { - self.include_seg(other_seg); - next_frontier.push(other_seg); - } - } - } - } - (frontier, next_frontier) = (next_frontier, frontier); - } - - // Merge subpaths within max_distance_subpaths bp of each other, num_iterations times - for _ in 0..num_iterations { - for path in self.old.paths.all().iter() { - self.merge_subpaths(path, max_distance_subpaths); - } - } - - // Include all links within the subgraph. - for link in self.old.links.all().iter() { - if self.contains(link.from.segment()) && self.contains(link.to.segment()) { - self.include_link(link); - } - } - - // Find subpaths within the subgraph. 
- for path in self.old.paths.all().iter() { - self.find_subpaths(path); - } - } -} - -/// compute node depth, the number of times paths cross a node -#[derive(FromArgs, PartialEq, Debug)] -#[argh(subcommand, name = "depth")] -pub struct Depth {} - -pub fn depth(gfa: &flatgfa::FlatGFA) { - // Initialize node depth - let mut depths = vec![0; gfa.segs.len()]; - // Initialize uniq_paths - let mut uniq_paths = Vec::>::new(); - uniq_paths.resize(gfa.segs.len(), HashSet::new()); - // do not assume that each handle in `gfa.steps()` is unique - for (idx, path) in gfa.paths.all().iter().enumerate() { - for step in &gfa.steps[path.steps] { - let seg_id = step.segment().index(); - // Increment depths - depths[seg_id] += 1; - // Update uniq_paths - uniq_paths[seg_id].insert(idx); - } - } - // print out depth and depth.uniq - println!("#node.id\tdepth\tdepth.uniq"); - for (id, seg) in gfa.segs.items() { - let name: u32 = seg.name as u32; - println!( - "{}\t{}\t{}", - name, - depths[id.index()], - uniq_paths[id.index()].len() - ); - } -} - -/// chop the segments in a graph into sizes of N or smaller -#[derive(FromArgs, PartialEq, Debug)] -#[argh(subcommand, name = "chop")] -pub struct Chop { - /// maximimum segment size. - // Use c in keeping with odgi convention - #[argh(option, short = 'c')] - c: usize, - - /// compute new links - #[argh(switch, short = 'l')] - l: bool, -} - -/// Chop a graph into segments of size no larger than c -/// By default, compact node ids -/// CIGAR strings, links, and optional Segment data are invalidated by chop -/// Generates a new graph, rather than modifying the old one in place -pub fn chop<'a>( - gfa: &'a flatgfa::FlatGFA<'a>, - args: Chop, -) -> Result { - - let mut flat = flatgfa::HeapGFAStore::default(); - - // when segment S is chopped into segments S1 through S2 (exclusive), - // seg_map[S.name] = Span(Id(S1.name), Id(S2.name)). 
If S is not chopped: S=S1, S2.name = S1.name+1 - let mut seg_map: Vec> = Vec::new(); - // The smallest id (>0) which does not already belong to a segment in `flat` - let mut max_node_id = 1; - - fn link_forward(flat: &mut GFAStore<'static, HeapFamily>, span: &Span) { - // Link segments spanned by `span` from head to tail - let overlap = Span::new_empty(); - flat.add_links((span.start.index()..span.end.index() - 1).map(|idx| Link { - from: Handle::new(Id::new(idx), Orientation::Forward), - to: Handle::new(Id::new(idx + 1), Orientation::Forward), - overlap, - })); - } - - // Add new, chopped segments - for seg in gfa.segs.all().iter() { - let len = seg.len(); - if len <= args.c { - // Leave the segment as is - let id = flat.segs.add(Segment { - name: max_node_id, - seq: seg.seq, - optional: Span::new_empty(), // TODO: Optional data may stay valid when seg not chopped? - }); - max_node_id += 1; - seg_map.push(Span::new(id, flat.segs.next_id())); - } else { - let seq_end = seg.seq.end; - let mut offset = seg.seq.start.index(); - let segs_start = flat.segs.next_id(); - // Could also generate end_id by setting it equal to the start_id and - // updating it for each segment that is added - only benefits us if we - // don't unroll the last iteration of this loop - while offset < seq_end.index() - args.c { - // Generate a new segment of length c - flat.segs.add(Segment { - name: max_node_id, - seq: Span::new(Id::new(offset), Id::new(offset + args.c)), - optional: Span::new_empty() - }); - offset += args.c; - max_node_id += 1; - } - // Generate the last segment - flat.segs.add(Segment { - name: max_node_id, - seq: Span::new(Id::new(offset), seq_end), - optional: Span::new_empty(), - }); - max_node_id += 1; - let new_seg_span = Span::new(segs_start, flat.segs.next_id()); - seg_map.push(new_seg_span); - if args.l { - link_forward(&mut flat, &new_seg_span); - } - } - } - - // For each path, add updated handles. 
Then add the updated path - for path in gfa.paths.all().iter() { - let path_start = flat.steps.next_id(); - let mut path_end = flat.steps.next_id(); - // Generate the new handles - // Tentative to-do: see if it is faster to read Id from segs than to re-generate it? - for step in gfa.get_path_steps(path) { - let range = { - let span = seg_map[step.segment().index()]; - std::ops::Range::from(span) - }; - match step.orient() { - Orientation::Forward => { - // In this builder, Id.index() == seg.name - 1 for all seg - path_end = flat - .add_steps(range.map(|idx| Handle::new(Id::new(idx), Orientation::Forward))) - .end; - } - Orientation::Backward => { - path_end = flat - .add_steps( - range - .rev() - .map(|idx| Handle::new(Id::new(idx), Orientation::Backward)), - ) - .end; - } - } - } - - // Add the updated path - flat.paths.add(Path { - name: path.name, - steps: Span::new(path_start, path_end), - overlaps: Span::new_empty(), - }); - } - - // If the 'l' flag is specified, compute the links in the new graph - if args.l { - // For each link in the old graph, from handle A -> B: - // Add a link from - // (A.forward ? (A.end, forward) : (A.begin, backwards)) - // -> (B.forward ? (B.begin, forward) : (B.end ? 
backwards)) - - for link in gfa.links.all().iter() { - let new_from = { - let old_from = link.from; - let chopped_segs = seg_map[old_from.segment().index()]; - let seg_id = match old_from.orient() { - Orientation::Forward => chopped_segs.end - 1, - Orientation::Backward => chopped_segs.start, - }; - Handle::new(seg_id, old_from.orient()) - }; - let new_to = { - let old_to = link.to; - let chopped_segs = seg_map[old_to.segment().index()]; - let seg_id = match old_to.orient() { - Orientation::Forward => chopped_segs.start, - Orientation::Backward => chopped_segs.end - 1, - }; - Handle::new(seg_id, old_to.orient()) - }; - flat.add_link(new_from, new_to, vec![]); - } - } - - Ok(flat) -} diff --git a/flatgfa/src/commands/basic_cmds.rs b/flatgfa/src/commands/basic_cmds.rs new file mode 100644 index 00000000..537b49e8 --- /dev/null +++ b/flatgfa/src/commands/basic_cmds.rs @@ -0,0 +1,136 @@ +use crate::fgfa_ds::flatgfa::{self, Orientation, Segment}; +use crate::fgfa_ds::pool::Id; +use argh::FromArgs; +use std::collections::HashMap; + +/// print the FlatGFA table of contents +#[derive(FromArgs, PartialEq, Debug)] +#[argh(subcommand, name = "toc")] +pub struct Toc {} + +pub fn toc(gfa: &flatgfa::FlatGFA) { + eprintln!("header: {}", gfa.header.len()); + eprintln!("segs: {}", gfa.segs.len()); + eprintln!("paths: {}", gfa.paths.len()); + eprintln!("links: {}", gfa.links.len()); + eprintln!("steps: {}", gfa.steps.len()); + eprintln!("seq_data: {}", gfa.seq_data.len()); + eprintln!("overlaps: {}", gfa.overlaps.len()); + eprintln!("alignment: {}", gfa.alignment.len()); + eprintln!("name_data: {}", gfa.name_data.len()); + eprintln!("optional_data: {}", gfa.optional_data.len()); + eprintln!("line_order: {}", gfa.line_order.len()); +} + +/// list the paths +#[derive(FromArgs, PartialEq, Debug)] +#[argh(subcommand, name = "paths")] +pub struct Paths {} + +pub fn paths(gfa: &flatgfa::FlatGFA) { + for path in gfa.paths.all().iter() { + println!("{}", gfa.get_path_name(path)); + } +} + 
+/// calculate graph statistics +#[derive(FromArgs, PartialEq, Debug)] +#[argh(subcommand, name = "stats")] +pub struct Stats { + /// show basic metrics + #[argh(switch, short = 'S')] + summarize: bool, + + /// number of segments with at least one self-loop link + #[argh(switch, short = 'L')] + self_loops: bool, +} + +pub fn stats(gfa: &flatgfa::FlatGFA, args: Stats) { + if args.summarize { + println!("#length\tnodes\tedges\tpaths\tsteps"); + println!( + "{}\t{}\t{}\t{}\t{}", + gfa.seq_data.len(), + gfa.segs.len(), + gfa.links.len(), + gfa.paths.len(), + gfa.steps.len() + ); + } else if args.self_loops { + let mut counts: HashMap, usize> = HashMap::new(); + let mut total: usize = 0; + for link in gfa.links.all().iter() { + if link.from.segment() == link.to.segment() { + let count = counts.entry(link.from.segment()).or_insert(0); + *count += 1; + total += 1; + } + } + println!("#type\tnum"); + println!("total\t{}", total); + println!("unique\t{}", counts.len()); + } +} + +/// find a nucleotide position within a path +#[derive(FromArgs, PartialEq, Debug)] +#[argh(subcommand, name = "position")] +pub struct Position { + /// path_name,offset,orientation + #[argh(option, short = 'p')] + path_pos: String, +} + +pub fn position(gfa: &flatgfa::FlatGFA, args: Position) -> Result<(), &'static str> { + // Parse the position triple, which looks like `path,42,+`. 
+ let (path_name, offset, orientation) = { + let parts: Vec<_> = args.path_pos.split(',').collect(); + if parts.len() != 3 { + return Err("position must be path_name,offset,orientation"); + } + let off: usize = parts[1].parse().or(Err("offset must be a number"))?; + let ori: Orientation = parts[2].parse().or(Err("orientation must be + or -"))?; + (parts[0], off, ori) + }; + + let path_id = gfa.find_path(path_name.into()).ok_or("path not found")?; + let path = &gfa.paths[path_id]; + assert_eq!( + orientation, + Orientation::Forward, + "only + is implemented so far" + ); + + // Traverse the path until we reach the position. + let mut cur_pos = 0; + let mut found = None; + for step in &gfa.steps[path.steps] { + let seg = gfa.get_handle_seg(*step); + let end_pos = cur_pos + seg.len(); + if offset < end_pos { + // Found it! + found = Some((*step, offset - cur_pos)); + break; + } + cur_pos = end_pos; + } + + // Print the match. + if let Some((handle, seg_off)) = found { + let seg = gfa.get_handle_seg(handle); + let seg_name = seg.name; + println!("#source.path.pos\ttarget.graph.pos"); + println!( + "{},{},{}\t{},{},{}", + path_name, + offset, + orientation, + seg_name, + seg_off, + handle.orient() + ); + } + + Ok(()) +} \ No newline at end of file diff --git a/flatgfa/src/commands/chop.rs b/flatgfa/src/commands/chop.rs new file mode 100644 index 00000000..695fd52a --- /dev/null +++ b/flatgfa/src/commands/chop.rs @@ -0,0 +1,160 @@ +use crate::fgfa_ds::flatgfa::{self, Handle, Link, Orientation, Path, Segment}; +use crate::fgfa_ds::pool::{Id, Span, Store}; +use crate::fgfa_ds::flatgfa::{GFAStore, HeapFamily}; +use argh::FromArgs; + +/// chop the segments in a graph into sizes of N or smaller +#[derive(FromArgs, PartialEq, Debug)] +#[argh(subcommand, name = "chop")] +pub struct Chop { + /// maximimum segment size. 
+ // Use c in keeping with odgi convention + #[argh(option, short = 'c')] + c: usize, + + /// compute new links + #[argh(switch, short = 'l')] + l: bool, +} + +/// Chop a graph into segments of size no larger than c +/// By default, compact node ids +/// CIGAR strings, links, and optional Segment data are invalidated by chop +/// Generates a new graph, rather than modifying the old one in place +pub fn chop<'a>( + gfa: &'a flatgfa::FlatGFA<'a>, + args: Chop, +) -> Result { + + let mut flat = flatgfa::HeapGFAStore::default(); + + // when segment S is chopped into segments S1 through S2 (exclusive), + // seg_map[S.name] = Span(Id(S1.name), Id(S2.name)). If S is not chopped: S=S1, S2.name = S1.name+1 + let mut seg_map: Vec> = Vec::new(); + // The smallest id (>0) which does not already belong to a segment in `flat` + let mut max_node_id = 1; + + fn link_forward(flat: &mut GFAStore<'static, HeapFamily>, span: &Span) { + // Link segments spanned by `span` from head to tail + let overlap = Span::new_empty(); + flat.add_links((span.start.index()..span.end.index() - 1).map(|idx| Link { + from: Handle::new(Id::new(idx), Orientation::Forward), + to: Handle::new(Id::new(idx + 1), Orientation::Forward), + overlap, + })); + } + + // Add new, chopped segments + for seg in gfa.segs.all().iter() { + let len = seg.len(); + if len <= args.c { + // Leave the segment as is + let id = flat.segs.add(Segment { + name: max_node_id, + seq: seg.seq, + optional: Span::new_empty(), // TODO: Optional data may stay valid when seg not chopped? 
+ }); + max_node_id += 1; + seg_map.push(Span::new(id, flat.segs.next_id())); + } else { + let seq_end = seg.seq.end; + let mut offset = seg.seq.start.index(); + let segs_start = flat.segs.next_id(); + // Could also generate end_id by setting it equal to the start_id and + // updating it for each segment that is added - only benefits us if we + // don't unroll the last iteration of this loop + while offset < seq_end.index() - args.c { + // Generate a new segment of length c + flat.segs.add(Segment { + name: max_node_id, + seq: Span::new(Id::new(offset), Id::new(offset + args.c)), + optional: Span::new_empty() + }); + offset += args.c; + max_node_id += 1; + } + // Generate the last segment + flat.segs.add(Segment { + name: max_node_id, + seq: Span::new(Id::new(offset), seq_end), + optional: Span::new_empty(), + }); + max_node_id += 1; + let new_seg_span = Span::new(segs_start, flat.segs.next_id()); + seg_map.push(new_seg_span); + if args.l { + link_forward(&mut flat, &new_seg_span); + } + } + } + + // For each path, add updated handles. Then add the updated path + for path in gfa.paths.all().iter() { + let path_start = flat.steps.next_id(); + let mut path_end = flat.steps.next_id(); + // Generate the new handles + // Tentative to-do: see if it is faster to read Id from segs than to re-generate it? 
+ for step in gfa.get_path_steps(path) { + let range = { + let span = seg_map[step.segment().index()]; + std::ops::Range::from(span) + }; + match step.orient() { + Orientation::Forward => { + // In this builder, Id.index() == seg.name - 1 for all seg + path_end = flat + .add_steps(range.map(|idx| Handle::new(Id::new(idx), Orientation::Forward))) + .end; + } + Orientation::Backward => { + path_end = flat + .add_steps( + range + .rev() + .map(|idx| Handle::new(Id::new(idx), Orientation::Backward)), + ) + .end; + } + } + } + + // Add the updated path + flat.paths.add(Path { + name: path.name, + steps: Span::new(path_start, path_end), + overlaps: Span::new_empty(), + }); + } + + // If the 'l' flag is specified, compute the links in the new graph + if args.l { + // For each link in the old graph, from handle A -> B: + // Add a link from + // (A.forward ? (A.end, forward) : (A.begin, backwards)) + // -> (B.forward ? (B.begin, forward) : (B.end ? backwards)) + + for link in gfa.links.all().iter() { + let new_from = { + let old_from = link.from; + let chopped_segs = seg_map[old_from.segment().index()]; + let seg_id = match old_from.orient() { + Orientation::Forward => chopped_segs.end - 1, + Orientation::Backward => chopped_segs.start, + }; + Handle::new(seg_id, old_from.orient()) + }; + let new_to = { + let old_to = link.to; + let chopped_segs = seg_map[old_to.segment().index()]; + let seg_id = match old_to.orient() { + Orientation::Forward => chopped_segs.start, + Orientation::Backward => chopped_segs.end - 1, + }; + Handle::new(seg_id, old_to.orient()) + }; + flat.add_link(new_from, new_to, vec![]); + } + } + + Ok(flat) +} \ No newline at end of file diff --git a/flatgfa/src/commands/depth.rs b/flatgfa/src/commands/depth.rs new file mode 100644 index 00000000..ef0b97ca --- /dev/null +++ b/flatgfa/src/commands/depth.rs @@ -0,0 +1,37 @@ +use crate::fgfa_ds::flatgfa::FlatGFA; +use argh::FromArgs; +use std::collections::HashSet; + +/// compute node depth, the number of 
times paths cross a node +#[derive(FromArgs, PartialEq, Debug)] +#[argh(subcommand, name = "depth")] +pub struct Depth {} + +pub fn depth(gfa: &FlatGFA) { + // Initialize node depth + let mut depths = vec![0; gfa.segs.len()]; + // Initialize uniq_paths + let mut uniq_paths = Vec::>::new(); + uniq_paths.resize(gfa.segs.len(), HashSet::new()); + // do not assume that each handle in `gfa.steps()` is unique + for (idx, path) in gfa.paths.all().iter().enumerate() { + for step in &gfa.steps[path.steps] { + let seg_id = step.segment().index(); + // Increment depths + depths[seg_id] += 1; + // Update uniq_paths + uniq_paths[seg_id].insert(idx); + } + } + // print out depth and depth.uniq + println!("#node.id\tdepth\tdepth.uniq"); + for (id, seg) in gfa.segs.items() { + let name: u32 = seg.name as u32; + println!( + "{}\t{}\t{}", + name, + depths[id.index()], + uniq_paths[id.index()].len() + ); + } +} \ No newline at end of file diff --git a/flatgfa/src/commands/extract.rs b/flatgfa/src/commands/extract.rs new file mode 100644 index 00000000..1a0ada61 --- /dev/null +++ b/flatgfa/src/commands/extract.rs @@ -0,0 +1,224 @@ +use crate::fgfa_ds::flatgfa::{self, Handle, Link, Path, Segment}; +use crate::fgfa_ds::pool::{self, Id, Span, Store}; +use argh::FromArgs; +use std::collections::HashMap; + +/// create a subset graph +#[derive(FromArgs, PartialEq, Debug)] +#[argh(subcommand, name = "extract")] +pub struct Extract { + /// segment to extract around + #[argh(option, short = 'n')] + seg_name: usize, + + /// number of edges "away" from the node to include + #[argh(option, short = 'c')] + link_distance: usize, + + /// maximum number of basepairs allowed between subpaths s.t. 
the subpaths are merged together + #[argh(option, short = 'd', long = "max-distance-subpaths", default = "300000")] + max_distance_subpaths: usize, // TODO: possibly make this bigger + + /// maximum number of iterations before we stop merging subpaths + #[argh(option, short = 'e', long = "max-merging-iterations", default = "6")] + num_iterations: usize // TODO: probably make this smaller +} + +pub fn extract( + gfa: &flatgfa::FlatGFA, + args: Extract, +) -> Result { + let origin_seg = gfa.find_seg(args.seg_name).ok_or("segment not found")?; + + let mut subgraph = SubgraphBuilder::new(gfa); + subgraph.add_header(); + subgraph.extract(origin_seg, args.link_distance, args.max_distance_subpaths, args.num_iterations); + Ok(subgraph.store) +} + +/// A helper to construct a new graph that includes part of an old graph. +struct SubgraphBuilder<'a> { + old: &'a flatgfa::FlatGFA<'a>, + store: flatgfa::HeapGFAStore, + seg_map: HashMap, Id>, +} + +struct SubpathStart { + step: Id, // The id of the first step in the subpath. + pos: usize, // The bp position at the start of the subpath. +} + +impl<'a> SubgraphBuilder<'a> { + fn new(old: &'a flatgfa::FlatGFA) -> Self { + Self { + old, + store: flatgfa::HeapGFAStore::default(), + seg_map: HashMap::new(), + } + } + + /// Include the old graph's header + fn add_header(&mut self) { + // pub fn add_header(&mut self, version: &[u8]) { + // assert!(self.header.as_ref().is_empty()); + // self.header.add_slice(version); + // } + assert!(self.store.header.as_ref().is_empty()); + self.store.header.add_slice(self.old.header.all()); + } + + /// Add a segment from the source graph to this subgraph. + fn include_seg(&mut self, seg_id: Id) { + let seg = &self.old.segs[seg_id]; + let new_seg_id = self.store.add_seg( + seg.name, + self.old.get_seq(seg), + self.old.get_optional_data(seg), + ); + self.seg_map.insert(seg_id, new_seg_id); + } + + /// Add a link from the source graph to the subgraph. 
+ fn include_link(&mut self, link: &Link) { + let from = self.tr_handle(link.from); + let to = self.tr_handle(link.to); + let overlap = self.old.get_alignment(link.overlap); + self.store.add_link(from, to, overlap.ops.into()); + } + + /// Add a single subpath from the given path to the subgraph. + fn include_subpath(&mut self, path: &Path, start: &SubpathStart, end_pos: usize) { + let steps = pool::Span::new(start.step, self.store.steps.next_id()); // why the next id? + let name = format!("{}:{}-{}", self.old.get_path_name(path), start.pos, end_pos); + self.store + .add_path(name.as_bytes(), steps, std::iter::empty()); + } + + /// Identify all the subpaths in a path from the original graph that cross through + /// segments in this subgraph and merge them if possible. + fn merge_subpaths(&mut self, path: &Path, max_distance_subpaths: usize) { + // these are subpaths which *aren't* already included in the new graph + let mut cur_subpath_start: Option = Some(0); + let mut subpath_length = 0; + let mut ignore_path = true; + + for (idx, step) in self.old.steps[path.steps].iter().enumerate() { + let in_neighb = self.seg_map.contains_key(&step.segment()); + + if let (Some(start), true) = (&cur_subpath_start, in_neighb) { + // We just entered the subgraph. End the current subpath. + if !ignore_path && subpath_length <= max_distance_subpaths { + // TODO: type safety + let subpath_span = Span::new(path.steps.start + *start as u32, path.steps.start + idx as u32); + for step in &self.old.steps[subpath_span] { + if !self.seg_map.contains_key(&step.segment()) { + self.include_seg(step.segment()); + } + } + } + cur_subpath_start = None; + ignore_path = false; + } else if let (None, false) = (&cur_subpath_start, in_neighb) { + // We've exited the current subgraph, start a new subpath + cur_subpath_start = Some(idx); + } + + // Track the current bp position in the path. 
+ subpath_length += self.old.get_handle_seg(*step).len(); + } + } + + /// Identify all the subpaths in a path from the original graph that cross through + /// segments in this subgraph and add them. + fn find_subpaths(&mut self, path: &Path) { + let mut cur_subpath_start: Option = None; + let mut path_pos = 0; + + for step in &self.old.steps[path.steps] { + let in_neighb = self.seg_map.contains_key(&step.segment()); + + if let (Some(start), false) = (&cur_subpath_start, in_neighb) { + // End the current subpath. + self.include_subpath(path, start, path_pos); + cur_subpath_start = None; + } else if let (None, true) = (&cur_subpath_start, in_neighb) { + // Start a new subpath. + cur_subpath_start = Some(SubpathStart { + step: self.store.steps.next_id(), + pos: path_pos, + }); + } + + // Add the (translated) step to the new graph. + if in_neighb { + self.store.add_step(self.tr_handle(*step)); + } + + // Track the current bp position in the path. + path_pos += self.old.get_handle_seg(*step).len(); + } + + // Did we reach the end of the path while still in the neighborhood? + if let Some(start) = cur_subpath_start { + self.include_subpath(path, &start, path_pos); + } + } + + /// Translate a handle from the source graph to this subgraph. + fn tr_handle(&self, old_handle: Handle) -> Handle { + // TODO: is this just generating the handle or should we add it to the new graph? + Handle::new(self.seg_map[&old_handle.segment()], old_handle.orient()) + } + + /// Check whether a segment from the old graph is in the subgraph. + fn contains(&self, old_seg_id: Id) -> bool { + self.seg_map.contains_key(&old_seg_id) + } + + /// Extract a subgraph consisting of a neighborhood of segments up to `dist` links away + /// from the given segment in the original graph. + /// + /// Include any links between the segments in the neighborhood and subpaths crossing + /// through the neighborhood. 
+ fn extract(&mut self, origin: Id, dist: usize, max_distance_subpaths: usize, num_iterations: usize) { + self.include_seg(origin); + + // Find the set of all segments that are c links away. + let mut frontier: Vec> = Vec::new(); + let mut next_frontier: Vec> = Vec::new(); + frontier.push(origin); + for _ in 0..dist { + while let Some(seg_id) = frontier.pop() { + for link in self.old.links.all().iter() { + if let Some(other_seg) = link.incident_seg(seg_id) { + // Add other_seg to the frontier set if it is not already in the frontier set or the seg_map + if !self.seg_map.contains_key(&other_seg) { + self.include_seg(other_seg); + next_frontier.push(other_seg); + } + } + } + } + (frontier, next_frontier) = (next_frontier, frontier); + } + + // Merge subpaths within max_distance_subpaths bp of each other, num_iterations times + for _ in 0..num_iterations { + for path in self.old.paths.all().iter() { + self.merge_subpaths(path, max_distance_subpaths); + } + } + + // Include all links within the subgraph. + for link in self.old.links.all().iter() { + if self.contains(link.from.segment()) && self.contains(link.to.segment()) { + self.include_link(link); + } + } + + // Find subpaths within the subgraph. 
+ for path in self.old.paths.all().iter() { + self.find_subpaths(path); + } + } +} \ No newline at end of file diff --git a/flatgfa/src/commands/mod.rs b/flatgfa/src/commands/mod.rs new file mode 100644 index 00000000..b4801f6b --- /dev/null +++ b/flatgfa/src/commands/mod.rs @@ -0,0 +1,4 @@ +pub mod basic_cmds; +pub mod chop; +pub mod depth; +pub mod extract; \ No newline at end of file diff --git a/flatgfa/src/fgfa_ds/file.rs b/flatgfa/src/fgfa_ds/file.rs new file mode 100644 index 00000000..a0a55515 --- /dev/null +++ b/flatgfa/src/fgfa_ds/file.rs @@ -0,0 +1,336 @@ +use super::pool::{FixedStore, Pool, Span, Store}; +use super::flatgfa::{AlignOp, FlatGFA, FixedGFAStore, Handle, Link, Path, Segment}; +use memmap::{Mmap, MmapMut}; +use std::mem::{size_of, size_of_val}; +use tinyvec::SliceVec; +use zerocopy::{AsBytes, FromBytes, FromZeroes}; + +const MAGIC_NUMBER: u64 = 0xB101_1054; + +/// A table of contents for the FlatGFA file. +#[derive(FromBytes, FromZeroes, AsBytes, Debug)] +#[repr(packed)] +pub struct Toc { + magic: u64, + header: Size, + segs: Size, + paths: Size, + links: Size, + steps: Size, + seq_data: Size, + overlaps: Size, + alignment: Size, + name_data: Size, + optional_data: Size, + line_order: Size, +} + +/// A table-of-contents entry for a pool in the FlatGFA file. +#[derive(FromBytes, FromZeroes, AsBytes, Clone, Copy, Debug)] +#[repr(packed)] +struct Size { + /// The number of actual elements in the pool. + len: usize, + + // The allocated space for the pool. `capacity - len` slots are "empty." + capacity: usize, +} + +impl Size { + fn of_pool(pool: Pool) -> Self { + Size { + len: pool.len(), + capacity: pool.len(), + } + } + + fn of_store(store: &FixedStore<'_, T>) -> Self { + Size { + len: store.len(), + capacity: store.capacity(), + } + } + + fn bytes(&self) -> usize { + self.capacity * size_of::() + } + + fn empty(capacity: usize) -> Self { + Size { len: 0, capacity } + } +} + +impl Toc { + /// Get the total size in bytes of the file described. 
+ pub fn size(&self) -> usize { + size_of::() + + self.header.bytes::() + + self.segs.bytes::() + + self.paths.bytes::() + + self.links.bytes::() + + self.steps.bytes::() + + self.seq_data.bytes::() + + self.overlaps.bytes::>() + + self.alignment.bytes::() + + self.name_data.bytes::() + + self.optional_data.bytes::() + + self.line_order.bytes::() + } + + /// Get a table of contents that fits a FlatGFA with no spare space. + fn full(gfa: &FlatGFA) -> Self { + Self { + magic: MAGIC_NUMBER, + header: Size::of_pool(gfa.header), + segs: Size::of_pool(gfa.segs), + paths: Size::of_pool(gfa.paths), + links: Size::of_pool(gfa.links), + steps: Size::of_pool(gfa.steps), + seq_data: Size::of_pool(gfa.seq_data), + overlaps: Size::of_pool(gfa.overlaps), + alignment: Size::of_pool(gfa.alignment), + name_data: Size::of_pool(gfa.name_data), + optional_data: Size::of_pool(gfa.optional_data), + line_order: Size::of_pool(gfa.line_order), + } + } + + pub fn for_fixed_store(store: &FixedGFAStore) -> Self { + Self { + magic: MAGIC_NUMBER, + header: Size::of_store(&store.header), + segs: Size::of_store(&store.segs), + paths: Size::of_store(&store.paths), + links: Size::of_store(&store.links), + steps: Size::of_store(&store.steps), + seq_data: Size::of_store(&store.seq_data), + overlaps: Size::of_store(&store.overlaps), + alignment: Size::of_store(&store.alignment), + name_data: Size::of_store(&store.name_data), + optional_data: Size::of_store(&store.optional_data), + line_order: Size::of_store(&store.line_order), + } + } + + /// Guess a reasonable set of capacities for a fresh file. 
+ pub fn guess(factor: usize) -> Self { + Self { + magic: MAGIC_NUMBER, + header: Size::empty(128), + segs: Size::empty(32 * factor * factor), + paths: Size::empty(factor), + links: Size::empty(32 * factor * factor), + steps: Size::empty(1024 * factor * factor), + seq_data: Size::empty(512 * factor * factor), + overlaps: Size::empty(256 * factor), + alignment: Size::empty(64 * factor * factor), + name_data: Size::empty(64 * factor), + optional_data: Size::empty(512 * factor * factor), + line_order: Size::empty(64 * factor * factor), + } + } + + /// Estimate a reasonable set of capacities for a fresh file based on some + /// measurements of the GFA text. + pub fn estimate( + segs: usize, + links: usize, + paths: usize, + header_bytes: usize, + seg_bytes: usize, + path_bytes: usize, + ) -> Self { + Self { + magic: MAGIC_NUMBER, + header: Size::empty(header_bytes), + segs: Size::empty(segs), + paths: Size::empty(paths), + links: Size::empty(links), + steps: Size::empty(path_bytes / 3), + seq_data: Size::empty(seg_bytes), + overlaps: Size::empty((links + paths) * 2), + alignment: Size::empty(links * 2 + paths * 4), + name_data: Size::empty(paths * 512), + optional_data: Size::empty(links * 16), + line_order: Size::empty(segs + links + paths + 8), + } + } +} + +/// Consume `size.len` items from a byte slice, skip the remainder of `size.capacity` +/// elements, and return the items and the rest of the slice. +fn slice_prefix(data: &[u8], size: Size) -> (&[T], &[u8]) { + let (prefix, rest) = T::slice_from_prefix(data, size.len).unwrap(); + let pad = size_of::() * (size.capacity - size.len); + (prefix, &rest[pad..]) +} + +/// Read the table of contents from a prefix of the byte buffer. 
+fn read_toc(data: &[u8]) -> (&Toc, &[u8]) { + let toc = Toc::ref_from_prefix(data).unwrap(); + let rest = &data[size_of::()..]; + let magic = toc.magic; + assert_eq!(magic, MAGIC_NUMBER); + (toc, rest) +} + +fn read_toc_mut(data: &mut [u8]) -> (&mut Toc, &mut [u8]) { + let (toc_slice, rest) = Toc::mut_slice_from_prefix(data, 1).unwrap(); + let toc = &mut toc_slice[0]; + let magic = toc.magic; + assert_eq!(magic, MAGIC_NUMBER); + (toc, rest) +} + +/// Get a FlatGFA backed by the data in a byte buffer. +pub fn view(data: &[u8]) -> FlatGFA { + let (toc, rest) = read_toc(data); + + let (header, rest) = slice_prefix(rest, toc.header); + let (segs, rest) = slice_prefix(rest, toc.segs); + let (paths, rest) = slice_prefix(rest, toc.paths); + let (links, rest) = slice_prefix(rest, toc.links); + let (steps, rest) = slice_prefix(rest, toc.steps); + let (seq_data, rest) = slice_prefix(rest, toc.seq_data); + let (overlaps, rest) = slice_prefix(rest, toc.overlaps); + let (alignment, rest) = slice_prefix(rest, toc.alignment); + let (name_data, rest) = slice_prefix(rest, toc.name_data); + let (optional_data, rest) = slice_prefix(rest, toc.optional_data); + let (line_order, _) = slice_prefix(rest, toc.line_order); + + FlatGFA { + header: header.into(), + segs: segs.into(), + paths: paths.into(), + links: links.into(), + steps: steps.into(), + seq_data: seq_data.into(), + overlaps: overlaps.into(), + alignment: alignment.into(), + name_data: name_data.into(), + optional_data: optional_data.into(), + line_order: line_order.into(), + } +} + +/// Like `slice_prefix`, but produce a `SliceVec`. +fn slice_vec_prefix( + data: &mut [u8], + size: Size, +) -> (SliceVec, &mut [u8]) { + let (prefix, rest) = T::mut_slice_from_prefix(data, size.capacity).unwrap(); + let vec = SliceVec::from_slice_len(prefix, size.len); + (vec, rest) +} + +/// Get a FlatGFA `SliceStore` from the suffix of a file just following the table of contents. 
+fn slice_store<'a>(data: &'a mut [u8], toc: &Toc) -> FixedGFAStore<'a> { + let (header, rest) = slice_vec_prefix(data, toc.header); + let (segs, rest) = slice_vec_prefix(rest, toc.segs); + let (paths, rest) = slice_vec_prefix(rest, toc.paths); + let (links, rest) = slice_vec_prefix(rest, toc.links); + let (steps, rest) = slice_vec_prefix(rest, toc.steps); + let (seq_data, rest) = slice_vec_prefix(rest, toc.seq_data); + let (overlaps, rest) = slice_vec_prefix(rest, toc.overlaps); + let (alignment, rest) = slice_vec_prefix(rest, toc.alignment); + let (name_data, rest) = slice_vec_prefix(rest, toc.name_data); + let (optional_data, rest) = slice_vec_prefix(rest, toc.optional_data); + let (line_order, _) = slice_vec_prefix(rest, toc.line_order); + + FixedGFAStore { + header: header.into(), + segs: segs.into(), + paths: paths.into(), + links: links.into(), + steps: steps.into(), + seq_data: seq_data.into(), + overlaps: overlaps.into(), + alignment: alignment.into(), + name_data: name_data.into(), + optional_data: optional_data.into(), + line_order: line_order.into(), + } +} + +/// Get a mutable FlatGFA `SliceStore` backed by a byte buffer. +pub fn view_store(data: &mut [u8]) -> FixedGFAStore { + let (toc, rest) = read_toc_mut(data); + slice_store(rest, toc) +} + +/// Initialize a buffer with an empty FlatGFA store. +pub fn init(data: &mut [u8], toc: Toc) -> (&mut Toc, FixedGFAStore) { + // Write the table of contents. + assert!(data.len() == toc.size()); + toc.write_to_prefix(data).unwrap(); + + // Get a mutable reference to the embedded TOC. + let (toc_bytes, rest) = data.split_at_mut(size_of::()); + let toc_mut = Toc::mut_from(toc_bytes).unwrap(); + + // Extract a store from the remaining bytes. 
+ (toc_mut, slice_store(rest, &toc)) +} + +fn write_bump<'a, T: AsBytes + ?Sized>(buf: &'a mut [u8], data: &T) -> Option<&'a mut [u8]> { + let len = size_of_val(data); + data.write_to_prefix(buf)?; + Some(&mut buf[len..]) +} + +fn write_bytes<'a>(buf: &'a mut [u8], data: &[u8]) -> Option<&'a mut [u8]> { + let len = data.len(); + buf[0..len].copy_from_slice(data); + Some(&mut buf[len..]) +} + +/// Copy a FlatGFA into a byte buffer. +pub fn dump(gfa: &FlatGFA, buf: &mut [u8]) { + // Table of contents. + let toc = Toc::full(gfa); + let rest = write_bump(buf, &toc).unwrap(); + + // All the slices. + let rest = write_bytes(rest, gfa.header.all()).unwrap(); + let rest = write_bump(rest, gfa.segs.all()).unwrap(); + let rest = write_bump(rest, gfa.paths.all()).unwrap(); + let rest = write_bump(rest, gfa.links.all()).unwrap(); + let rest = write_bump(rest, gfa.steps.all()).unwrap(); + let rest = write_bytes(rest, gfa.seq_data.all()).unwrap(); + let rest = write_bump(rest, gfa.overlaps.all()).unwrap(); + let rest = write_bump(rest, gfa.alignment.all()).unwrap(); + let rest = write_bytes(rest, gfa.name_data.all()).unwrap(); + let rest = write_bytes(rest, gfa.optional_data.all()).unwrap(); + write_bytes(rest, gfa.line_order.all()).unwrap(); +} + +/// Get the total size in bytes of a FlatGFA structure. This should result in a big +/// enough buffer to write the entire FlatGFA into with `dump`. 
+pub fn size(gfa: &FlatGFA) -> usize { + Toc::full(gfa).size() +} + +pub fn map_file(name: &str) -> Mmap { + let file = std::fs::File::open(name).unwrap(); + unsafe { Mmap::map(&file) }.unwrap() +} + +pub fn map_new_file(name: &str, size: u64) -> MmapMut { + let file = std::fs::OpenOptions::new() + .read(true) + .write(true) + .create(true) + .open(name) + .unwrap(); + file.set_len(size).unwrap(); + unsafe { MmapMut::map_mut(&file) }.unwrap() +} + +pub fn map_file_mut(name: &str) -> MmapMut { + let file = std::fs::OpenOptions::new() + .read(true) + .write(true) + .open(name) + .unwrap(); + unsafe { MmapMut::map_mut(&file) }.unwrap() +} diff --git a/flatgfa/src/fgfa_ds/flatgfa.rs b/flatgfa/src/fgfa_ds/flatgfa.rs new file mode 100644 index 00000000..9a9e53d4 --- /dev/null +++ b/flatgfa/src/fgfa_ds/flatgfa.rs @@ -0,0 +1,430 @@ +use std::str::FromStr; + +use super::pool::{self, Id, Pool, Span, Store}; +use bstr::BStr; +use num_enum::{IntoPrimitive, TryFromPrimitive}; +use zerocopy::{AsBytes, FromBytes, FromZeroes}; + +/// An efficient flattened representation of a GFA file. +/// +/// This struct *borrows* the underlying data from some other data store. Namely, the +/// `GFAStore` structs contain `Vec`s or `Vec`-like arenas as backing stores for each +/// of the slices in this struct. `FlatGFA` itself provides access to the GFA data +/// structure that is agnostic to the location of the underlying bytes. However, all +/// its components have a fixed size; unlike the underlying `GFAStore`, it is not +/// possible to add new objects. +pub struct FlatGFA<'a> { + /// A GFA may optionally have a single header line, with a version number. + /// If this is empty, there is no header line. + pub header: Pool<'a, u8>, + + /// The segment (S) lines in the GFA file. + pub segs: Pool<'a, Segment>, + + /// The path (P) lines. + pub paths: Pool<'a, Path>, + + /// The link (L) lines. + pub links: Pool<'a, Link>, + + /// Paths consist of steps. 
This is a flat pool of steps, chunks of which are + /// associated with each path. + pub steps: Pool<'a, Handle>, + + /// The actual base-pair sequences for the segments. This is a pool of + /// base-pair symbols, chunks of which are associated with each segment. + /// + /// TODO: This could certainly use a smaller representation than `u8` + /// (since we care only about 4 base pairs). If we want to pay the cost + /// of bit-packing. + pub seq_data: Pool<'a, u8>, + + /// Both paths and links can have overlaps, which are CIGAR sequences. They + /// are all stored together here in a flat pool, elements of which point + /// to chunks of `alignment`. + pub overlaps: Pool<'a, Span>, + + /// The CIGAR alignment operations that make up the overlaps. `overlaps` + /// contains ranges of indices in this pool. + pub alignment: Pool<'a, AlignOp>, + + /// The string names: currently, just of paths. (We assume segments have integer + /// names, so they don't need to be stored separately.) + pub name_data: Pool<'a, u8>, + + /// Segments can come with optional extra fields, which we store in a flat pool + /// as raw characters because we don't currently care about them. + pub optional_data: Pool<'a, u8>, + + /// An "interleaving" order of GFA lines. This is to preserve perfect round-trip + /// fidelity: we record the order of lines as we saw them when parsing a GFA file + /// so we can emit them again in that order. Elements should be `LineKind` values + /// (but they are checked before we use them). + pub line_order: Pool<'a, u8>, +} + +/// GFA graphs consist of "segment" nodes, which are fragments of base-pair sequences +/// that can be strung together into paths. +#[derive(Debug, FromZeroes, FromBytes, AsBytes, Clone, Copy)] +#[repr(packed)] +pub struct Segment { + /// The segment's name. We assume all names are just plain numbers. + pub name: usize, + + /// The base-pair sequence for the segment. This is a range in the `seq_data` pool.
+ pub seq: Span, + + /// Segments can have optional fields. This is a range in the `optional_data` pool. + pub optional: Span, +} + +impl Segment { + #[allow(clippy::len_without_is_empty)] + pub fn len(&self) -> usize { + self.seq.len() + } +} + +/// A path is a sequence of oriented references to segments. +#[derive(Debug, FromZeroes, FromBytes, AsBytes, Clone, Copy)] +#[repr(packed)] +pub struct Path { + /// The path's name. This can be an arbitrary string. It is a range in the + /// `name_data` pool. + pub name: Span, + + /// The sequence of path steps. This is a range in the `steps` pool. + pub steps: Span, + + /// The CIGAR overlaps for each step on the path. This is a range in the + /// `overlaps` pool. + pub overlaps: Span>, +} + +impl Path { + pub fn step_count(&self) -> usize { + self.steps.end.index() - self.steps.start.index() + } +} + +/// An allowed edge between two oriented segments. +#[derive(Debug, FromBytes, FromZeroes, AsBytes, Clone, Copy)] +#[repr(packed)] +pub struct Link { + /// The source of the edge. + pub from: Handle, + + // The destination of the edge. + pub to: Handle, + + /// The CIGAR overlap between the segments. This is a range in the + /// `alignment` pool. + pub overlap: Span, +} + +impl Link { + /// Is either end of the link the given segment? If so, return the other end. + pub fn incident_seg(&self, seg_id: Id) -> Option> { + if self.from.segment() == seg_id { + Some(self.to.segment()) + } else if self.to.segment() == seg_id { + Some(self.from.segment()) + } else { + None + } + } +} + +/// A forward or backward direction. +#[derive(Debug, PartialEq, IntoPrimitive, TryFromPrimitive)] +#[repr(u8)] +pub enum Orientation { + Forward, // + + Backward, // - +} + +impl FromStr for Orientation { + type Err = (); + + fn from_str(s: &str) -> Result { + if s == "+" { + Ok(Orientation::Forward) + } else if s == "-" { + Ok(Orientation::Backward) + } else { + Err(()) + } + } +} + +/// An oriented reference to a segment. 
+/// +/// A Handle refers to the forward (+) or backward (-) orientation for a given segment. +/// So, logically, it consists of a pair of a segment reference (usize) and an +/// orientation (1 bit). We pack the two values into a single word. +#[derive(Debug, FromBytes, FromZeroes, AsBytes, Clone, Copy, PartialEq, Eq, Hash)] +#[repr(packed)] +pub struct Handle(u32); + +impl Handle { + /// Create a new handle referring to a segment ID and an orientation. + pub fn new(segment: Id, orient: Orientation) -> Self { + let seg_num: u32 = segment.into(); + assert!(seg_num & (1 << (u32::BITS - 1)) == 0, "index too large"); + let orient_bit: u8 = orient.into(); + assert!(orient_bit & !1 == 0, "invalid orientation"); + Self(seg_num << 1 | (orient_bit as u32)) + } + + /// Get the segment ID. This is an index in the `segs` pool. + pub fn segment(&self) -> Id { + (self.0 >> 1).into() + } + + /// Get the orientation (+ or -) for the handle. + pub fn orient(&self) -> Orientation { + ((self.0 & 1) as u8).try_into().unwrap() + } +} + +/// The kind of each operation in a CIGAR alignment. +#[derive(Debug, IntoPrimitive, TryFromPrimitive, Clone, Copy)] +#[repr(u8)] +pub enum AlignOpcode { + Match, // M + Gap, // N + Insertion, // I + Deletion, // D +} + +/// A single operation in a CIGAR alignment, like "3M" or "1D". +/// +/// Logically, this is a pair of a number and an `AlignOpcode`. We pack the two +/// into a single u32. +#[derive(Debug, FromZeroes, FromBytes, AsBytes, Clone, Copy)] +#[repr(packed)] +pub struct AlignOp(u32); + +impl AlignOp { + /// Create a new alignment operation from an opcode and count. + pub fn new(op: AlignOpcode, len: u32) -> Self { + let op_byte: u8 = op.into(); + assert!(len & !0xff == 0, "length too large"); + Self((len << 8) | (op_byte as u32)) + } + + /// Get the operation (M, I, etc.) for this operation. + pub fn op(&self) -> AlignOpcode { + ((self.0 & 0xff) as u8).try_into().unwrap() + } + + /// Get the length of the operation.
+ pub fn len(&self) -> u32 { + self.0 >> 8 + } + + /// Check whether there are zero operations in this alignment. + pub fn is_empty(&self) -> bool { + self.len() == 0 + } +} + +/// An entire CIGAR alignment string, like "3M1D2M". +#[derive(Debug)] +#[repr(transparent)] +pub struct Alignment<'a> { + /// The sequence of operations that make up the alignment. + pub ops: &'a [AlignOp], +} + +/// A kind of GFA line. We use this in `line_order` to preserve the textual order +/// in a GFA file for round-tripping. +#[derive(Debug, IntoPrimitive, TryFromPrimitive)] +#[repr(u8)] +pub enum LineKind { + Header, + Segment, + Path, + Link, +} + +impl<'a> FlatGFA<'a> { + /// Get the base-pair sequence for a segment. + pub fn get_seq(&self, seg: &Segment) -> &BStr { + self.seq_data[seg.seq].as_ref() + } + + /// Look up a segment by its name. + pub fn find_seg(&self, name: usize) -> Option> { + // TODO Make this more efficient by maintaining the name index? This would not be + // too hard; we already have the machinery in `parse.rs`... + self.segs.search(|seg| seg.name == name) + } + + /// Look up a path by its name. + pub fn find_path(&self, name: &BStr) -> Option> { + self.paths.search(|path| self.get_path_name(path) == name) + } + + /// Get the string name of a path. + pub fn get_path_name(&self, path: &Path) -> &BStr { + self.name_data[path.name].as_ref() + } + + pub fn get_path_steps(&self, path: &Path) -> impl Iterator { + self.steps[path.steps].iter() + } + + /// Get a handle's associated segment. + pub fn get_handle_seg(&self, handle: Handle) -> &Segment { + &self.segs[handle.segment()] + } + + /// Get the optional data for a segment, as a tab-separated string. + pub fn get_optional_data(&self, seg: &Segment) -> &BStr { + self.optional_data[seg.optional].as_ref() + } + + /// Look up a CIGAR alignment. + pub fn get_alignment(&self, overlap: Span) -> Alignment { + Alignment { + ops: &self.alignment[overlap], + } + } + + /// Get the recorded order of line kinds. 
+ pub fn get_line_order(&self) -> impl Iterator + 'a { + self.line_order + .all() + .iter() + .map(|b| (*b).try_into().unwrap()) + } +} + +/// The data storage pools for a `FlatGFA`. +#[derive(Default)] +pub struct GFAStore<'a, P: StoreFamily<'a>> { + pub header: P::Store, + pub segs: P::Store, + pub paths: P::Store, + pub links: P::Store, + pub steps: P::Store, + pub seq_data: P::Store, + pub overlaps: P::Store>, + pub alignment: P::Store, + pub name_data: P::Store, + pub optional_data: P::Store, + pub line_order: P::Store, +} + +impl<'a, P: StoreFamily<'a>> GFAStore<'a, P> { + /// Add a header line for the GFA file. This may only be added once. + pub fn add_header(&mut self, version: &[u8]) { + assert!(self.header.as_ref().is_empty()); + self.header.add_slice(version); + } + + /// Add a new segment to the GFA file. + pub fn add_seg(&mut self, name: usize, seq: &[u8], optional: &[u8]) -> Id { + self.segs.add(Segment { + name, + seq: self.seq_data.add_slice(seq), + optional: self.optional_data.add_slice(optional), + }) + } + + /// Add a new path. + pub fn add_path( + &mut self, + name: &[u8], + steps: Span, + overlaps: impl Iterator>, + ) -> Id { + let overlaps = self.overlaps.add_iter( + overlaps + .into_iter() + .map(|align| self.alignment.add_iter(align)), + ); + let name = self.name_data.add_slice(name); + self.paths.add(Path { + name, + steps, + overlaps, + }) + } + + /// Add a sequence of steps. + pub fn add_steps(&mut self, steps: impl Iterator) -> Span { + self.steps.add_iter(steps) + } + + /// Add a single step. + pub fn add_step(&mut self, step: Handle) -> Id { + self.steps.add(step) + } + + /// Add a sequence of links. + pub fn add_links(&mut self, links: impl Iterator) -> Span { + self.links.add_iter(links) + } + + /// Add a link between two (oriented) segments. 
+ pub fn add_link(&mut self, from: Handle, to: Handle, overlap: Vec) -> Id { + self.links.add(Link { + from, + to, + overlap: self.alignment.add_iter(overlap), + }) + } + + /// Record a line type to preserve the line order. + pub fn record_line(&mut self, kind: LineKind) { + self.line_order.add(kind.into()); + } + + /// Borrow a FlatGFA view of this data store. + pub fn as_ref(&self) -> FlatGFA { + FlatGFA { + header: self.header.as_ref(), + segs: self.segs.as_ref(), + paths: self.paths.as_ref(), + links: self.links.as_ref(), + name_data: self.name_data.as_ref(), + seq_data: self.seq_data.as_ref(), + steps: self.steps.as_ref(), + overlaps: self.overlaps.as_ref(), + alignment: self.alignment.as_ref(), + optional_data: self.optional_data.as_ref(), + line_order: self.line_order.as_ref(), + } + } +} + +pub trait StoreFamily<'a> { + type Store: pool::Store; +} + +#[derive(Default)] +pub struct HeapFamily; +impl<'a> StoreFamily<'a> for HeapFamily { + type Store = pool::HeapStore; +} + +pub struct FixedFamily; +impl<'a> StoreFamily<'a> for FixedFamily { + type Store = pool::FixedStore<'a, T>; +} + +/// A store for `FlatGFA` data backed by fixed-size slices. +/// +/// This store contains `SliceVec`s, which act like `Vec`s but are allocated within +/// a fixed region. This means they have a maximum size, but they can directly map +/// onto the contents of a file. +pub type FixedGFAStore<'a> = GFAStore<'a, FixedFamily>; + +/// A mutable, in-memory data store for `FlatGFA`. +/// +/// This store contains a bunch of `Vec`s: one per array required to implement a +/// `FlatGFA`. It exposes an API for building up a GFA data structure, so it is +/// useful for creating new ones from scratch. 
+pub type HeapGFAStore = GFAStore<'static, HeapFamily>; diff --git a/flatgfa/src/fgfa_ds/gfaline.rs b/flatgfa/src/fgfa_ds/gfaline.rs new file mode 100644 index 00000000..87178bbf --- /dev/null +++ b/flatgfa/src/fgfa_ds/gfaline.rs @@ -0,0 +1,272 @@ +use super::flatgfa::{AlignOp, AlignOpcode, Orientation}; +use atoi::FromRadix10; + +type ParseResult = Result; +type LineResult<'a> = ParseResult>; +type PartialParseResult<'a, T> = ParseResult<(T, &'a [u8])>; + +/// A parsed GFA file line. +pub enum Line<'a> { + Header(&'a [u8]), + Segment(Segment<'a>), + Link(Link), + Path(Path<'a>), +} + +pub struct Segment<'a> { + pub name: usize, + pub seq: &'a [u8], + pub data: &'a [u8], +} + +pub struct Link { + pub from_seg: usize, + pub from_orient: Orientation, + pub to_seg: usize, + pub to_orient: Orientation, + pub overlap: Vec, +} + +pub struct Path<'a> { + pub name: &'a [u8], + pub steps: &'a [u8], + pub overlaps: Vec>, +} + +/// Parse a single line of a GFA file. +pub fn parse_line(line: &[u8]) -> LineResult { + if line.len() < 2 || line[1] != b'\t' { + return Err("expected marker and tab"); + } + let rest = &line[2..]; + match line[0] { + b'H' => parse_header(rest), + b'S' => parse_seg(rest), + b'L' => parse_link(rest), + b'P' => parse_path(rest), + _ => Err("unhandled line kind"), + } +} + +/// Parse a header line, which looks like `H `. +fn parse_header(line: &[u8]) -> LineResult { + Ok(Line::Header(line)) +} + +/// Parse a segment line, which looks like `S `. +fn parse_seg(line: &[u8]) -> LineResult { + let (name, rest) = parse_num(line)?; + let rest = parse_byte(rest, b'\t')?; + let (seq, data) = parse_field(rest)?; + Ok(Line::Segment(Segment { name, seq, data })) +} + +/// Parse a link line, which looks like `L <+-> <+-> `. 
+fn parse_link(line: &[u8]) -> LineResult { + let (from_seg, rest) = parse_num(line)?; + let rest = parse_byte(rest, b'\t')?; + let (from_orient, rest) = parse_orient(rest)?; + let rest = parse_byte(rest, b'\t')?; + let (to_seg, rest) = parse_num(rest)?; + let rest = parse_byte(rest, b'\t')?; + let (to_orient, rest) = parse_orient(rest)?; + let rest = parse_byte(rest, b'\t')?; + let (overlap, rest) = parse_align(rest)?; + if !rest.is_empty() { + return Err("expected end of line"); + } + Ok(Line::Link(Link { + from_seg, + from_orient, + to_seg, + to_orient, + overlap, + })) +} + +/// Parse a path line, which looks like `P <*|CIGARs>`. +fn parse_path(line: &[u8]) -> LineResult { + let (name, rest) = parse_field(line)?; + let (steps, rest) = parse_field(rest)?; + let (overlaps, rest) = parse_maybe_overlap_list(rest)?; + if !rest.is_empty() { + return Err("expected end of line"); + } + Ok(Line::Path(Path { + name, + steps, + overlaps, + })) +} + +/// Parse a *possible* overlap list, which may be `*` (empty). +pub fn parse_maybe_overlap_list(s: &[u8]) -> PartialParseResult>> { + if s == b"*" { + Ok((vec![], &s[1..])) + } else { + parse_overlap_list(s) + } +} + +/// Parse a comma-separated list of CIGAR strings. +/// +/// TODO: This could be optimized to avoid accumulating into a vector. +fn parse_overlap_list(s: &[u8]) -> PartialParseResult>> { + let mut rest = s; + let mut overlaps = vec![]; + while !rest.is_empty() { + let overlap; + (overlap, rest) = parse_align(rest)?; + overlaps.push(overlap); + if !rest.is_empty() { + rest = parse_byte(rest, b',')?; + } + } + Ok((overlaps, rest)) +} + +/// Consume a chunk of a string up to a given marker byte. +fn parse_until(line: &[u8], marker: u8) -> PartialParseResult<&[u8]> { + let end = memchr::memchr(marker, line).unwrap_or(line.len()); + let rest = if end == line.len() { + &[] + } else { + &line[end + 1..] + }; + Ok((&line[..end], rest)) +} + +/// Consume a string from the line, until a tab (or the end of the line). 
+pub fn parse_field(line: &[u8]) -> PartialParseResult<&[u8]> { + parse_until(line, b'\t') +} + +/// Consume a specific byte. +fn parse_byte(s: &[u8], byte: u8) -> ParseResult<&[u8]> { + if s.is_empty() || s[0] != byte { + return Err("expected byte"); + } + Ok(&s[1..]) +} + +/// Parse a single integer. +fn parse_num(s: &[u8]) -> PartialParseResult { + match T::from_radix_10(s) { + (_, 0) => Err("expected number"), + (num, used) => Ok((num, &s[used..])), + } +} + +/// Parse a segment orientation (+ or -). +fn parse_orient(line: &[u8]) -> PartialParseResult { + if line.is_empty() { + return Err("expected orientation"); + } + let orient = match line[0] { + b'+' => Orientation::Forward, + b'-' => Orientation::Backward, + _ => return Err("expected orient"), + }; + Ok((orient, &line[1..])) +} + +/// Parse a single CIGAR alignment operation (like `4D`). +fn parse_align_op(s: &[u8]) -> PartialParseResult { + let (len, rest) = parse_num::(s)?; + let op = match rest[0] { + b'M' => AlignOpcode::Match, + b'N' => AlignOpcode::Gap, + b'D' => AlignOpcode::Deletion, + b'I' => AlignOpcode::Insertion, + _ => return Err("expected align op"), + }; + Ok((AlignOp::new(op, len), &rest[1..])) +} + +/// Parse a complete CIGAR alignment string (like `3M2I`). +/// +/// TODO This could be optimized to avoid collecting into a vector. +fn parse_align(s: &[u8]) -> PartialParseResult> { + let mut rest = s; + let mut align = vec![]; + while !rest.is_empty() && rest[0].is_ascii_digit() { + let op; + (op, rest) = parse_align_op(rest)?; + align.push(op); + } + Ok((align, rest)) +} + +/// Parse GFA paths' segment lists. These look like `1+,2-,3+`. +pub struct StepsParser<'a> { + str: &'a [u8], + index: usize, + state: StepsParseState, + seg: usize, +} + +/// The parser state: we're either looking for a segment name (or a +/- terminator), +/// or we're expecting a comma (or end of string). 
+enum StepsParseState { + Seg, + Comma, +} + +impl<'a> StepsParser<'a> { + pub fn new(str: &'a [u8]) -> Self { + StepsParser { + str, + index: 0, + state: StepsParseState::Seg, + seg: 0, + } + } + + pub fn rest(&self) -> &[u8] { + &self.str[self.index..] + } +} + +impl<'a> Iterator for StepsParser<'a> { + type Item = (usize, bool); + fn next(&mut self) -> Option<(usize, bool)> { + while self.index < self.str.len() { + // Consume one byte. + let byte = self.str[self.index]; + self.index += 1; + + match self.state { + StepsParseState::Seg => { + if byte == b'+' || byte == b'-' { + self.state = StepsParseState::Comma; + return Some((self.seg, byte == b'+')); + } else if byte.is_ascii_digit() { + self.seg *= 10; + self.seg += (byte - b'0') as usize; + } else { + return None; + } + } + StepsParseState::Comma => { + if byte == b',' { + self.state = StepsParseState::Seg; + self.seg = 0; + } else { + return None; + } + } + } + } + + None + } +} + +#[test] +fn test_parse_steps() { + let s = b"1+,23-,4+ suffix"; + let mut parser = StepsParser::new(s); + let path: Vec<_> = (&mut parser).collect(); + assert_eq!(path, vec![(1, true), (23, false), (4, true)]); + assert_eq!(parser.rest(), b"suffix"); +} diff --git a/flatgfa/src/fgfa_ds/mod.rs b/flatgfa/src/fgfa_ds/mod.rs new file mode 100644 index 00000000..32fd106b --- /dev/null +++ b/flatgfa/src/fgfa_ds/mod.rs @@ -0,0 +1,7 @@ +pub mod file; +pub mod flatgfa; +pub mod gfaline; +pub mod parse; +pub mod pool; +pub mod print; + diff --git a/flatgfa/src/fgfa_ds/parse.rs b/flatgfa/src/fgfa_ds/parse.rs new file mode 100644 index 00000000..76ae4d90 --- /dev/null +++ b/flatgfa/src/fgfa_ds/parse.rs @@ -0,0 +1,284 @@ +use super::file::Toc; +use super::flatgfa::{self, Handle, LineKind, Orientation}; +use super::gfaline; +use std::collections::HashMap; +use std::io::BufRead; + +pub struct Parser<'a, P: flatgfa::StoreFamily<'a>> { + /// The flat representation we're building. 
+ flat: flatgfa::GFAStore<'a, P>, + + /// All segment IDs, indexed by their names, which we need to refer to segments in paths. + seg_ids: NameMap, +} + +impl<'a, P: flatgfa::StoreFamily<'a>> Parser<'a, P> { + pub fn new(builder: flatgfa::GFAStore<'a, P>) -> Self { + Self { + flat: builder, + seg_ids: NameMap::default(), + } + } + + /// Parse a GFA text file from an I/O stream. + pub fn parse_stream(mut self, stream: R) -> flatgfa::GFAStore<'a, P> { + // We can parse segments immediately, but we need to defer links and paths until we have all + // the segment names that they might refer to. + let mut deferred_links = Vec::new(); + let mut deferred_paths = Vec::new(); + + // Parse or defer each line. + for line in stream.split(b'\n') { + let line = line.unwrap(); + + // Avoid parsing paths entirely for now; just preserve the entire line for later. + if line[0] == b'P' { + self.flat.record_line(LineKind::Path); + deferred_paths.push(line); + continue; + } + + // Parse other kinds of lines. + let gfa_line = gfaline::parse_line(line.as_ref()).unwrap(); + self.record_line(&gfa_line); + + match gfa_line { + gfaline::Line::Header(data) => { + self.flat.add_header(data); + } + gfaline::Line::Segment(seg) => { + self.add_seg(seg); + } + gfaline::Line::Link(link) => { + deferred_links.push(link); + } + gfaline::Line::Path(_) => { + unreachable!("paths handled separately") + } + } + } + + // "Unwind" the deferred links and paths. + for link in deferred_links { + self.add_link(link); + } + for line in deferred_paths { + self.add_path(&line); + } + + self.flat + } + + /// Parse a GFA text file from an in-memory buffer. + pub fn parse_mem(mut self, buf: &[u8]) -> flatgfa::GFAStore<'a, P> { + let mut deferred_lines = Vec::new(); + + for line in MemchrSplit::new(b'\n', buf) { + // When parsing from memory, it's easy to entirely defer parsing of any line: we just keep + // pointers to them. So we defer both paths and links. 
+ if line[0] == b'P' || line[0] == b'L' { + self.flat.record_line(if line[0] == b'P' { + LineKind::Path + } else { + LineKind::Link + }); + deferred_lines.push(line); + continue; + } + + // Actually parse other lines. + let gfa_line = gfaline::parse_line(line).unwrap(); + self.record_line(&gfa_line); + match gfa_line { + gfaline::Line::Header(data) => { + self.flat.add_header(data); + } + gfaline::Line::Segment(seg) => { + self.add_seg(seg); + } + gfaline::Line::Link(_) | gfaline::Line::Path(_) => { + unreachable!("paths and links handled separately") + } + } + } + + // "Unwind" the deferred lines. + for line in deferred_lines { + if line[0] == b'P' { + self.add_path(line); + } else { + let gfa_line = gfaline::parse_line(line).unwrap(); + if let gfaline::Line::Link(link) = gfa_line { + self.add_link(link); + } else { + unreachable!("unexpected deferred line") + } + } + } + + self.flat + } + + /// Record a marker that captures the original GFA line ordering. + fn record_line(&mut self, line: &gfaline::Line) { + match line { + gfaline::Line::Header(_) => self.flat.record_line(LineKind::Header), + gfaline::Line::Segment(_) => self.flat.record_line(LineKind::Segment), + gfaline::Line::Link(_) => self.flat.record_line(LineKind::Link), + gfaline::Line::Path(_) => self.flat.record_line(LineKind::Path), + } + } + + fn add_seg(&mut self, seg: gfaline::Segment) { + let seg_id = self.flat.add_seg(seg.name, seg.seq, seg.data); + self.seg_ids.insert(seg.name, seg_id.into()); + } + + fn add_link(&mut self, link: gfaline::Link) { + let from = Handle::new(self.seg_ids.get(link.from_seg).into(), link.from_orient); + let to = Handle::new(self.seg_ids.get(link.to_seg).into(), link.to_orient); + self.flat.add_link(from, to, link.overlap); + } + + fn add_path(&mut self, line: &[u8]) { + // This must be a path line. + assert_eq!(&line[..2], b"P\t"); + let line = &line[2..]; + + // Parse the name. + let (name, rest) = gfaline::parse_field(line).unwrap(); + + // Parse the steps. 
+ let mut step_parser = gfaline::StepsParser::new(rest); + let steps = self.flat.add_steps((&mut step_parser).map(|(name, dir)| { + Handle::new( + self.seg_ids.get(name).into(), + if dir { + Orientation::Forward + } else { + Orientation::Backward + }, + ) + })); + let rest = step_parser.rest(); + + // Parse the overlaps. + let (overlaps, rest) = gfaline::parse_maybe_overlap_list(rest).unwrap(); + + assert!(rest.is_empty()); + self.flat.add_path(name, steps, overlaps.into_iter()); + } +} + +impl Parser<'static, flatgfa::HeapFamily> { + pub fn for_heap() -> Self { + Self::new(flatgfa::HeapGFAStore::default()) + } +} + +impl<'a> Parser<'a, flatgfa::FixedFamily> { + pub fn for_slice(store: flatgfa::FixedGFAStore<'a>) -> Self { + Self::new(store) + } +} + +#[derive(Default)] +struct NameMap { + /// Names at most this are assigned *sequential* IDs, i.e., the ID is just the name + /// minus one. + sequential_max: usize, + + /// Non-sequential names go here. + others: HashMap, +} + +impl NameMap { + fn insert(&mut self, name: usize, id: u32) { + // Is this the next sequential name? If so, no need to record it in our hash table; + // just bump the number of sequential names we've seen. + if (name - 1) == self.sequential_max && (name - 1) == (id as usize) { + self.sequential_max += 1; + } else { + self.others.insert(name, id); + } + } + + fn get(&self, name: usize) -> u32 { + if name <= self.sequential_max { + (name - 1) as u32 + } else { + self.others[&name] + } + } +} + +/// Scan a GFA text file to count the number of each type of line and measure some sizes +/// that are useful in estimating the final size of the FlatGFA file. 
+pub fn estimate_toc(buf: &[u8]) -> Toc { + let mut segs = 0; + let mut links = 0; + let mut paths = 0; + let mut header_bytes = 0; + let mut seg_bytes = 0; + let mut path_bytes = 0; + + let mut rest = buf; + while !rest.is_empty() { + let marker = rest[0]; + let next = memchr::memchr(b'\n', rest).unwrap_or(rest.len() + 1); + + match marker { + b'H' => { + header_bytes += next; + } + b'S' => { + segs += 1; + seg_bytes += next; + } + b'L' => { + links += 1; + } + b'P' => { + paths += 1; + path_bytes += next; + } + _ => { + panic!("unknown line type") + } + } + + if next >= rest.len() { + break; + } + rest = &rest[next + 1..]; + } + + Toc::estimate(segs, links, paths, header_bytes, seg_bytes, path_bytes) +} + +struct MemchrSplit<'a> { + haystack: &'a [u8], + memchr: memchr::Memchr<'a>, + pos: usize, +} + +impl<'a> Iterator for MemchrSplit<'a> { + type Item = &'a [u8]; + + fn next(&mut self) -> Option { + let start = self.pos; + let end = self.memchr.next()?; + self.pos = end + 1; + Some(&self.haystack[start..end]) + } +} + +impl MemchrSplit<'_> { + fn new(needle: u8, haystack: &[u8]) -> MemchrSplit { + MemchrSplit { + haystack, + memchr: memchr::memchr_iter(needle, haystack), + pos: 0, + } + } +} diff --git a/flatgfa/src/fgfa_ds/pool.rs b/flatgfa/src/fgfa_ds/pool.rs new file mode 100644 index 00000000..2872388a --- /dev/null +++ b/flatgfa/src/fgfa_ds/pool.rs @@ -0,0 +1,299 @@ +use std::ops::{Index, Add, Sub}; +use std::{hash::Hash, marker::PhantomData}; +use tinyvec::SliceVec; +use zerocopy::{AsBytes, FromBytes, FromZeroes}; + +/// An index into a pool. 
+#[derive(Debug, FromZeroes, FromBytes, AsBytes, Clone, Copy)] +#[repr(transparent)] +pub struct Id(u32, PhantomData); + +impl PartialEq for Id { + fn eq(&self, other: &Self) -> bool { + self.0 == other.0 + } +} + +impl Eq for Id {} + +impl Hash for Id { + fn hash(&self, state: &mut H) { + self.0.hash(state) + } +} + +impl Add for Id { + type Output = Self; + + #[inline] + fn add(self, rhs: u32) -> Self::Output { + Self(self.0 + rhs, PhantomData) + } +} + +impl Sub for Id { + type Output = Self; + #[inline] + fn sub(self, rhs:u32) -> Self::Output { + Self(self.0 - rhs, PhantomData) + } +} + +impl Id { + pub fn index(self) -> usize { + self.0 as usize + } + + pub fn new(index: usize) -> Self { + Self(index.try_into().expect("id too large"), PhantomData) + } +} + +impl From for Id { + fn from(v: u32) -> Self { + Self(v, PhantomData) + } +} + +impl From> for u32 { + fn from(v: Id) -> Self { + v.0 + } +} + +/// A range of indices into a pool. +/// +/// TODO: Consider smaller indices for this, and possibly base/offset instead +/// of start/end. +#[derive(Debug, FromZeroes, FromBytes, AsBytes, Clone, Copy, PartialEq, Eq, Hash)] +#[repr(packed)] +pub struct Span { + pub start: Id, + pub end: Id, + _marker: PhantomData, +} + +impl From> for std::ops::Range { + fn from(span: Span) -> std::ops::Range { + (span.start.index())..(span.end.index()) + } +} + +impl From<&Span> for std::ops::Range { + fn from(span: &Span) -> std::ops::Range { + (span.start.0 as usize)..(span.end.0 as usize) + } +} + +impl Span { + pub fn is_empty(&self) -> bool { + self.start.0 == self.end.0 + } + + pub fn len(&self) -> usize { + (self.end.0 - self.start.0) as usize + } + + pub fn contains(&self, id: Id) -> bool { + self.start.0 <= id.0 && id.0 < self.end.0 + } + + pub fn new(start: Id, end: Id) -> Self { + Self { + start, + end, + _marker: PhantomData, + } + } + + pub fn new_empty() -> Self { + Span::new(Id::new(0), Id::new(0)) + } +} + +/// A simple arena for objects of a single type. 
+/// +/// This trait provides convenient accessors for treating Vec and Vec-like objects +/// as allocation arenas. This trait supports adding to the pool (i.e., growing the +/// arena). Pools also `Deref` to slices, which are `&Pool`s and support convenient +/// access to the current set of objects (but not addition of new objects). +pub trait Store { + /// Get a fixed-size view of the arena. + fn as_ref(&self) -> Pool; + + /// Add an item to the pool and get the new id. + fn add(&mut self, item: T) -> Id; + + /// Add an entire sequence of items to a "pool" vector and return the + /// range of new indices (IDs). + fn add_iter(&mut self, iter: impl IntoIterator) -> Span; + + /// Like `add_iter`, but for slices. + fn add_slice(&mut self, slice: &[T]) -> Span; + + /// Get the number of items in the pool. + fn len(&self) -> usize; + + /// Check whether the pool is empty. + fn is_empty(&self) -> bool { + self.len() == 0 + } + + /// Get the next available ID. + fn next_id(&self) -> Id { + Id::new(self.len()) + } +} + +/// A store that uses a `Vec` to allocate objects on the heap. +/// +/// This is a "normal" arena that can freely grow to fill available memory. +#[repr(transparent)] +pub struct HeapStore(Vec); + +impl Store for HeapStore { + fn as_ref(&self) -> Pool { + Pool(&self.0) + } + + fn add(&mut self, item: T) -> Id { + let id = self.as_ref().next_id(); + self.0.push(item); + id + } + + fn add_iter(&mut self, iter: impl IntoIterator) -> Span { + let start = self.as_ref().next_id(); + self.0.extend(iter); + Span::new(start, self.as_ref().next_id()) + } + + fn add_slice(&mut self, slice: &[T]) -> Span { + let start = self.as_ref().next_id(); + self.0.extend_from_slice(slice); + Span::new(start, self.as_ref().next_id()) + } + + fn len(&self) -> usize { + self.0.len() + } +} + +impl Default for HeapStore { + fn default() -> Self { + Self(Vec::new()) + } +} + +/// A store that keeps its data in fixed locations in memory. 
+/// +/// This is a funkier kind of arena that uses memory that has already been pre-allocated +/// somewhere else, such as in a memory-mapped file. A consequence is that there is a +/// fixed maximum size for the arena; it's possible to add objects only until it fills up. +#[repr(transparent)] +pub struct FixedStore<'a, T>(SliceVec<'a, T>); + +impl<'a, T: Clone> Store for FixedStore<'a, T> { + fn as_ref(&self) -> Pool { + Pool(&self.0) + } + + fn add(&mut self, item: T) -> Id { + let id = self.next_id(); + self.0.push(item); + id + } + + fn add_iter(&mut self, iter: impl IntoIterator) -> Span { + let start = self.next_id(); + self.0.extend(iter); + Span::new(start, self.next_id()) + } + + fn add_slice(&mut self, slice: &[T]) -> Span { + let start = self.next_id(); + self.0.extend_from_slice(slice); + Span::new(start, self.next_id()) + } + + fn len(&self) -> usize { + self.0.len() + } +} + +impl<'a, T> FixedStore<'a, T> { + pub fn capacity(&self) -> usize { + self.0.capacity() + } +} + +impl<'a, T> From> for FixedStore<'a, T> { + fn from(slice: SliceVec<'a, T>) -> Self { + Self(slice) + } +} + +/// A fixed-sized arena. +/// +/// This trait allows id-based access to a fixed-size chunk of objects reflecting +/// a `Store`. Unlike `Store`, it does not support adding new objects. +#[repr(transparent)] +#[derive(Clone, Copy)] +pub struct Pool<'a, T>(&'a [T]); + +impl<'a, T> Pool<'a, T> { + /// Get the number of items in the pool. + pub fn len(&self) -> usize { + self.0.len() + } + + /// Check if the pool is empty. + pub fn is_empty(&self) -> bool { + self.0.is_empty() + } + + /// Get the next available ID. + pub fn next_id(&self) -> Id { + Id::new(self.len()) + } + + /// Get the entire pool as a slice. + pub fn all(&self) -> &'a [T] { + self.0 + } + + /// Find the first item in the pool that satisfies a predicate. 
+ pub fn search(&self, pred: impl Fn(&T) -> bool) -> Option> { + self.0.iter().position(pred).map(|i| Id::new(i)) + } + + /// Iterate over id/item pairs in the pool. + pub fn items(&self) -> impl Iterator, &T)> { + self.0 + .iter() + .enumerate() + .map(|(i, item)| (Id::new(i), item)) + } +} + +impl Index> for Pool<'_, T> { + type Output = T; + + fn index(&self, id: Id) -> &T { + &self.0[id.index()] + } +} + +impl Index> for Pool<'_, T> { + type Output = [T]; + + fn index(&self, span: Span) -> &[T] { + &self.0[std::ops::Range::from(span)] + } +} + +impl<'a, T> From<&'a [T]> for Pool<'a, T> { + fn from(slice: &'a [T]) -> Self { + Self(slice) + } +} diff --git a/flatgfa/src/fgfa_ds/print.rs b/flatgfa/src/fgfa_ds/print.rs new file mode 100644 index 00000000..09532389 --- /dev/null +++ b/flatgfa/src/fgfa_ds/print.rs @@ -0,0 +1,153 @@ +use super::flatgfa; +use std::fmt; + +impl fmt::Display for flatgfa::Orientation { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + flatgfa::Orientation::Forward => write!(f, "+"), + flatgfa::Orientation::Backward => write!(f, "-"), + } + } +} + +impl fmt::Display for flatgfa::AlignOpcode { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + flatgfa::AlignOpcode::Match => write!(f, "M"), + flatgfa::AlignOpcode::Gap => write!(f, "N"), + flatgfa::AlignOpcode::Insertion => write!(f, "D"), + flatgfa::AlignOpcode::Deletion => write!(f, "I"), + } + } +} + +impl<'a> fmt::Display for flatgfa::Alignment<'a> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + if self.ops.len() == 0 { + write!(f, "0M")?; + } + for op in self.ops { + write!(f, "{}{}", op.len(), op.op())?; + } + Ok(()) + } +} + +/// A wrapper for displaying components from FlatGFA. 
+pub struct Display<'a, T>(pub &'a flatgfa::FlatGFA<'a>, pub T); + +impl<'a> fmt::Display for Display<'a, flatgfa::Handle> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let seg = self.0.get_handle_seg(self.1); + let name = seg.name; + write!(f, "{}{}", name, self.1.orient()) + } +} + +impl<'a> fmt::Display for Display<'a, &flatgfa::Path> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "P\t{}\t", self.0.get_path_name(&self.1))?; + let steps = &self.0.steps[self.1.steps]; + write!(f, "{}", Display(self.0, steps[0]))?; + for step in steps[1..].iter() { + write!(f, ",{}", Display(self.0, *step))?; + } + write!(f, "\t")?; + let overlaps = &self.0.overlaps[self.1.overlaps]; + if overlaps.is_empty() { + write!(f, "*")?; + } else { + write!(f, "{}", self.0.get_alignment(overlaps[0]))?; + for overlap in overlaps[1..].iter() { + write!(f, ",{}", self.0.get_alignment(*overlap))?; + } + } + Ok(()) + } +} + +impl<'a> fmt::Display for Display<'a, &flatgfa::Link> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let from = self.1.from; + let from_name = self.0.get_handle_seg(from).name; + let to = self.1.to; + let to_name = self.0.get_handle_seg(to).name; + write!( + f, + "L\t{}\t{}\t{}\t{}\t{}", + from_name, + from.orient(), + to_name, + to.orient(), + self.0.get_alignment(self.1.overlap) + ) + } +} + +impl<'a> fmt::Display for Display<'a, &flatgfa::Segment> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let name = self.1.name; + write!(f, "S\t{}\t{}", name, self.0.get_seq(self.1))?; + if !self.1.optional.is_empty() { + write!(f, "\t{}", self.0.get_optional_data(self.1))?; + } + Ok(()) + } +} + +/// Print a graph in the order preserved from an original GFA file. 
+fn write_preserved(gfa: &flatgfa::FlatGFA, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let mut seg_iter = gfa.segs.all().iter(); + let mut path_iter = gfa.paths.all().iter(); + let mut link_iter = gfa.links.all().iter(); + for kind in gfa.get_line_order() { + match kind { + flatgfa::LineKind::Header => { + let version = gfa.header; + assert!(!version.is_empty()); + writeln!(f, "H\t{}", bstr::BStr::new(version.all()))?; + } + flatgfa::LineKind::Segment => { + let seg = seg_iter.next().expect("too few segments"); + writeln!(f, "{}", Display(gfa, seg))?; + } + flatgfa::LineKind::Path => { + let path = path_iter.next().expect("too few paths"); + writeln!(f, "{}", Display(gfa, path))?; + } + flatgfa::LineKind::Link => { + let link = link_iter.next().expect("too few links"); + writeln!(f, "{}", Display(gfa, link))?; + } + } + } + Ok(()) +} + +/// Print a graph in a normalized order, ignoring the original GFA line order. +pub fn write_normalized(gfa: &flatgfa::FlatGFA, f: &mut fmt::Formatter<'_>) -> fmt::Result { + if !gfa.header.is_empty() { + writeln!(f, "H\t{}", bstr::BStr::new(gfa.header.all()))?; + } + for seg in gfa.segs.all().iter() { + writeln!(f, "{}", Display(gfa, seg))?; + } + for path in gfa.paths.all().iter() { + writeln!(f, "{}", Display(gfa, path))?; + } + for link in gfa.links.all().iter() { + writeln!(f, "{}", Display(gfa, link))?; + } + Ok(()) +} + +/// Print our flat representation as in GFA text format. 
+impl<'a> fmt::Display for &'a flatgfa::FlatGFA<'a> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + if self.line_order.is_empty() { + write_normalized(self, f) + } else { + write_preserved(self, f) + } + } +} diff --git a/flatgfa/src/lib.rs b/flatgfa/src/lib.rs new file mode 100644 index 00000000..9b7dc559 --- /dev/null +++ b/flatgfa/src/lib.rs @@ -0,0 +1 @@ +pub mod fgfa_ds; \ No newline at end of file diff --git a/flatgfa/src/main.rs b/flatgfa/src/main.rs index 53968e99..3541b49a 100644 --- a/flatgfa/src/main.rs +++ b/flatgfa/src/main.rs @@ -1,10 +1,17 @@ use argh::FromArgs; + +mod fgfa_ds; use fgfa_ds::flatgfa::FlatGFA; use fgfa_ds::parse::Parser; use fgfa_ds::pool::Store; use fgfa_ds::{file, parse}; // TODO: hopefully remove at some point, this breaks a lot of principles -mod cmds; +mod commands; +use commands::basic_cmds::{Toc, Paths, Stats, Position}; +use commands::{chop::Chop, depth::Depth, extract::Extract}; + +use commands::basic_cmds::{toc, paths, stats, position}; +use commands::{chop::chop, depth::depth, extract::extract}; #[derive(FromArgs)] /// Convert between GFA text and FlatGFA binary formats. 
@@ -36,13 +43,13 @@ struct PolBin { #[derive(FromArgs, PartialEq, Debug)] #[argh(subcommand)] enum Command { - Toc(cmds::Toc), - Paths(cmds::Paths), - Stats(cmds::Stats), - Position(cmds::Position), - Extract(cmds::Extract), - Depth(cmds::Depth), - Chop(cmds::Chop), + Toc(Toc), + Paths(Paths), + Stats(Stats), + Position(Position), + Extract(Extract), + Depth(Depth), + Chop(Chop), } fn main() -> Result<(), &'static str> { @@ -90,26 +97,26 @@ fn main() -> Result<(), &'static str> { match args.command { Some(Command::Toc(_)) => { - cmds::toc(&gfa); + toc(&gfa); } Some(Command::Paths(_)) => { - cmds::paths(&gfa); + paths(&gfa); } Some(Command::Stats(sub_args)) => { - cmds::stats(&gfa, sub_args); + stats(&gfa, sub_args); } Some(Command::Position(sub_args)) => { - cmds::position(&gfa, sub_args)?; + position(&gfa, sub_args)?; } Some(Command::Extract(sub_args)) => { - let store = cmds::extract(&gfa, sub_args)?; + let store = extract(&gfa, sub_args)?; dump(&store.as_ref(), &args.output); } Some(Command::Depth(_)) => { - cmds::depth(&gfa); + depth(&gfa); } Some(Command::Chop(sub_args)) => { - let store = cmds::chop(&gfa, sub_args)?; + let store = chop(&gfa, sub_args)?; // TODO: Ideally, find a way to encapsulate the logic of chop in `cmd.rs`, instead of // defining here which values from out input `gfa` are needed by our final `flat` gfa. 
// Here we are reference values in two different Stores to create this Flatgfa, and From 29e8a5f4737ea96d479a26cfe37a6d344b1ff11f Mon Sep 17 00:00:00 2001 From: susan-garry Date: Mon, 30 Sep 2024 11:24:50 -0400 Subject: [PATCH 3/8] comment out unused code, could be useful in the future if we want to use spans as ranges --- flatgfa/src/fgfa_ds/pool.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/flatgfa/src/fgfa_ds/pool.rs b/flatgfa/src/fgfa_ds/pool.rs index 2872388a..2080ad13 100644 --- a/flatgfa/src/fgfa_ds/pool.rs +++ b/flatgfa/src/fgfa_ds/pool.rs @@ -86,9 +86,9 @@ impl From<&Span> for std::ops::Range { } impl Span { - pub fn is_empty(&self) -> bool { - self.start.0 == self.end.0 - } + // pub fn is_empty(&self) -> bool { + // self.start.0 == self.end.0 + // } pub fn len(&self) -> usize { (self.end.0 - self.start.0) as usize From 7a839a7d3ed4c55e6bcf6b924f8abd73d84accfb Mon Sep 17 00:00:00 2001 From: susan-garry Date: Mon, 30 Sep 2024 11:25:50 -0400 Subject: [PATCH 4/8] nevermind, commit out the actual unused code which may or may not ever be useful --- flatgfa/src/fgfa_ds/pool.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/flatgfa/src/fgfa_ds/pool.rs b/flatgfa/src/fgfa_ds/pool.rs index 2080ad13..94376abb 100644 --- a/flatgfa/src/fgfa_ds/pool.rs +++ b/flatgfa/src/fgfa_ds/pool.rs @@ -86,9 +86,9 @@ impl From<&Span> for std::ops::Range { } impl Span { - // pub fn is_empty(&self) -> bool { - // self.start.0 == self.end.0 - // } + pub fn is_empty(&self) -> bool { + self.start.0 == self.end.0 + } pub fn len(&self) -> usize { (self.end.0 - self.start.0) as usize @@ -135,9 +135,9 @@ pub trait Store { fn len(&self) -> usize; /// Check whether the pool is empty. - fn is_empty(&self) -> bool { - self.len() == 0 - } + // fn is_empty(&self) -> bool { + // self.len() == 0 + // } /// Get the next available ID. 
fn next_id(&self) -> Id { From 18eeb0df6b071af920b903ae70caf61ee9a2bd7f Mon Sep 17 00:00:00 2001 From: susan-garry Date: Mon, 30 Sep 2024 12:16:19 -0400 Subject: [PATCH 5/8] comment out docs for unused function --- flatgfa/src/fgfa_ds/pool.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flatgfa/src/fgfa_ds/pool.rs b/flatgfa/src/fgfa_ds/pool.rs index 94376abb..6055c166 100644 --- a/flatgfa/src/fgfa_ds/pool.rs +++ b/flatgfa/src/fgfa_ds/pool.rs @@ -134,7 +134,7 @@ pub trait Store { /// Get the number of items in the pool. fn len(&self) -> usize; - /// Check whether the pool is empty. + // /// Check whether the pool is empty. // fn is_empty(&self) -> bool { // self.len() == 0 // } From eaa48d75046c93c08245f9f115bc90c7f8913c8d Mon Sep 17 00:00:00 2001 From: susan-garry Date: Wed, 2 Oct 2024 12:47:15 -0400 Subject: [PATCH 6/8] fix flatgfa imports for flatgfa-py --- flatgfa-py/Cargo.lock | 20 ++------------------ flatgfa-py/src/lib.rs | 4 ++-- flatgfa/Cargo.lock | 2 +- flatgfa/Cargo.toml | 2 +- flatgfa/src/lib.rs | 5 ++++- 5 files changed, 10 insertions(+), 23 deletions(-) diff --git a/flatgfa-py/Cargo.lock b/flatgfa-py/Cargo.lock index f8f3d345..bb352e08 100644 --- a/flatgfa-py/Cargo.lock +++ b/flatgfa-py/Cargo.lock @@ -77,14 +77,6 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" -[[package]] -name = "commands" -version = "0.1.0" -dependencies = [ - "argh", - "fgfa_ds", -] - [[package]] name = "equivalent" version = "1.0.1" @@ -92,9 +84,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" [[package]] -name = "fgfa_ds" +name = "flatgfa" version = "0.1.0" dependencies = [ + "argh", "atoi", "bstr", "memchr", @@ -104,15 +97,6 @@ dependencies = [ "zerocopy", ] -[[package]] -name = "flatgfa" -version = "0.1.0" -dependencies = [ - "argh", - 
"commands", - "fgfa_ds", -] - [[package]] name = "flatgfa-py" version = "0.1.0" diff --git a/flatgfa-py/src/lib.rs b/flatgfa-py/src/lib.rs index a0593909..8a3a4a5f 100644 --- a/flatgfa-py/src/lib.rs +++ b/flatgfa-py/src/lib.rs @@ -1,5 +1,5 @@ -use flatgfa::fgfa_ds::pool::Id; -use flatgfa::{self, file, print, FlatGFA, HeapGFAStore}; +use flatgfa::fgfa_ds::{file, print, pool::Id}; +use flatgfa::{self, FlatGFA, HeapGFAStore}; use pyo3::exceptions::PyIndexError; use pyo3::prelude::*; use pyo3::types::{PyBytes, PySlice}; diff --git a/flatgfa/Cargo.lock b/flatgfa/Cargo.lock index 8f3f20c6..7a0c0020 100644 --- a/flatgfa/Cargo.lock +++ b/flatgfa/Cargo.lock @@ -72,7 +72,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" [[package]] -name = "fgfa" +name = "flatgfa" version = "0.1.0" dependencies = [ "argh", diff --git a/flatgfa/Cargo.toml b/flatgfa/Cargo.toml index f8bad886..89c08188 100644 --- a/flatgfa/Cargo.toml +++ b/flatgfa/Cargo.toml @@ -1,5 +1,5 @@ [package] -name = "fgfa" +name = "flatgfa" version = "0.1.0" edition = "2021" diff --git a/flatgfa/src/lib.rs b/flatgfa/src/lib.rs index 9b7dc559..b0f1f4bc 100644 --- a/flatgfa/src/lib.rs +++ b/flatgfa/src/lib.rs @@ -1 +1,4 @@ -pub mod fgfa_ds; \ No newline at end of file +pub mod fgfa_ds; + +pub use fgfa_ds::*; +pub use fgfa_ds::flatgfa::*; \ No newline at end of file From 8da1f54dc461fc576f268bde9f99a20656da784d Mon Sep 17 00:00:00 2001 From: susan-garry Date: Wed, 2 Oct 2024 16:46:19 -0400 Subject: [PATCH 7/8] turnt calls are verbose --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 15b77ce4..e2c374df 100644 --- a/Makefile +++ b/Makefile @@ -25,7 +25,7 @@ test-slow-odgi: fetch test-flatgfa: fetch cd flatgfa ; cargo build - turnt -e flatgfa_mem -e flatgfa_file -e flatgfa_file_inplace tests/*.gfa + turnt -v -e flatgfa_mem -e flatgfa_file -e flatgfa_file_inplace tests/*.gfa 
-turnt --save -v -e chop_oracle_fgfa tests/*.gfa turnt -v -e flatgfa_chop tests/*.gfa From d6f99e40a840313bd03ee372cfab892cc3cc2215 Mon Sep 17 00:00:00 2001 From: susan-garry Date: Wed, 2 Oct 2024 17:02:51 -0400 Subject: [PATCH 8/8] fixy fixy --- flatgfa/Cargo.toml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/flatgfa/Cargo.toml b/flatgfa/Cargo.toml index 89c08188..2426a543 100644 --- a/flatgfa/Cargo.toml +++ b/flatgfa/Cargo.toml @@ -3,6 +3,10 @@ name = "flatgfa" version = "0.1.0" edition = "2021" +[[bin]] +name = "fgfa" +path = "src/main.rs" + [dependencies] argh = "0.1.12" atoi = "2.0.0"