From 406d372a0688302bd9c98f306ae2a520979fdb5c Mon Sep 17 00:00:00 2001 From: Adrian Sampson Date: Mon, 18 Mar 2024 09:58:29 -0400 Subject: [PATCH 01/12] Centralize some size logic --- polbin/src/file.rs | 69 +++++++++++++++++++++++++++++----------------- 1 file changed, 43 insertions(+), 26 deletions(-) diff --git a/polbin/src/file.rs b/polbin/src/file.rs index d0bc7ecc..0e03965f 100644 --- a/polbin/src/file.rs +++ b/polbin/src/file.rs @@ -1,4 +1,5 @@ use crate::flatgfa; +use crate::pool::Span; use std::mem::{size_of, size_of_val}; use tinyvec::SliceVec; use zerocopy::{AsBytes, FromBytes, FromZeroes}; @@ -41,6 +42,46 @@ impl Size { capacity: slice.len(), } } + + fn bytes(&self) -> usize { + self.capacity * size_of::() + } +} + +impl Toc { + /// Get the total size in bytes of the file described. + fn size(&self) -> usize { + size_of::() + + self.header.bytes::() + + self.segs.bytes::() + + self.paths.bytes::() + + self.links.bytes::() + + self.steps.bytes::() + + self.seq_data.bytes::() + + self.overlaps.bytes::() + + self.alignment.bytes::() + + self.name_data.bytes::() + + self.optional_data.bytes::() + + self.line_order.bytes::() + } + + /// Get a table of contents that fits a FlatGFA with no spare space. + fn full(gfa: &flatgfa::FlatGFA) -> Self { + Toc { + magic: MAGIC_NUMBER, + header: Size::of_slice(gfa.header), + segs: Size::of_slice(gfa.segs), + paths: Size::of_slice(gfa.paths), + links: Size::of_slice(gfa.links), + steps: Size::of_slice(gfa.steps), + seq_data: Size::of_slice(gfa.seq_data), + overlaps: Size::of_slice(gfa.overlaps), + alignment: Size::of_slice(gfa.alignment), + name_data: Size::of_slice(gfa.name_data), + optional_data: Size::of_slice(gfa.optional_data), + line_order: Size::of_slice(gfa.line_order), + } + } } /// Consume `size.len` items from a byte slice, skip the remainder of `size.capacity` @@ -156,20 +197,7 @@ fn write_bytes<'a>(buf: &'a mut [u8], data: &[u8]) -> Option<&'a mut [u8]> { /// Copy a FlatGFA into a byte buffer. pub fn dump(gfa: &flatgfa::FlatGFA, buf: &mut [u8]) { // Table of contents. - let toc = Toc { - magic: MAGIC_NUMBER, - header: Size::of_slice(gfa.header), - segs: Size::of_slice(gfa.segs), - paths: Size::of_slice(gfa.paths), - links: Size::of_slice(gfa.links), - steps: Size::of_slice(gfa.steps), - seq_data: Size::of_slice(gfa.seq_data), - overlaps: Size::of_slice(gfa.overlaps), - alignment: Size::of_slice(gfa.alignment), - name_data: Size::of_slice(gfa.name_data), - optional_data: Size::of_slice(gfa.optional_data), - line_order: Size::of_slice(gfa.line_order), - }; + let toc = Toc::full(gfa); let rest = write_bump(buf, &toc).unwrap(); // All the slices. @@ -189,16 +217,5 @@ pub fn dump(gfa: &flatgfa::FlatGFA, buf: &mut [u8]) { /// Get the total size in bytes of a FlatGFA structure. This should result in a big /// enough buffer to write the entire FlatGFA into with `dump`. pub fn size(gfa: &flatgfa::FlatGFA) -> usize { - size_of::() - + gfa.header.len() - + size_of_val(gfa.segs) - + size_of_val(gfa.paths) - + size_of_val(gfa.links) - + size_of_val(gfa.steps) - + size_of_val(gfa.seq_data) - + size_of_val(gfa.overlaps) - + size_of_val(gfa.alignment) - + gfa.name_data.len() - + gfa.optional_data.len() - + gfa.line_order.len() + Toc::full(gfa).size() } From 3b54a34b70fed7b14479fac9f7d4aacee32b2384 Mon Sep 17 00:00:00 2001 From: Adrian Sampson Date: Mon, 18 Mar 2024 13:36:06 -0400 Subject: [PATCH 02/12] Initialize buffer --- polbin/src/file.rs | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/polbin/src/file.rs b/polbin/src/file.rs index 0e03965f..9f3fa8d7 100644 --- a/polbin/src/file.rs +++ b/polbin/src/file.rs @@ -9,7 +9,7 @@ const MAGIC_NUMBER: u64 = 0xB101_1054; /// A table of contents for the FlatGFA file. #[derive(FromBytes, FromZeroes, AsBytes)] #[repr(packed)] -struct Toc { +pub struct Toc { magic: u64, header: Size, segs: Size, @@ -150,12 +150,9 @@ fn slice_vec_prefix( (vec, rest) } -/// Get a mutable FlatGFA `SliceStore` backed by a byte buffer. -pub fn view_store(data: &mut [u8]) -> flatgfa::SliceStore { - let (toc, rest) = read_toc_mut(data); - - // Get slices for each chunk. - let (header, rest) = slice_vec_prefix(rest, toc.header); +/// Get a FlatGFA `SliceStore` from the suffix of a file just following the table of contents. +fn slice_store<'a>(data: &'a mut [u8], toc: &Toc) -> flatgfa::SliceStore<'a> { + let (header, rest) = slice_vec_prefix(data, toc.header); let (segs, rest) = slice_vec_prefix(rest, toc.segs); let (paths, rest) = slice_vec_prefix(rest, toc.paths); let (links, rest) = slice_vec_prefix(rest, toc.links); @@ -182,6 +179,23 @@ pub fn view_store(data: &mut [u8]) -> flatgfa::SliceStore { } } +/// Get a mutable FlatGFA `SliceStore` backed by a byte buffer. +pub fn view_store(data: &mut [u8]) -> flatgfa::SliceStore { + let (toc, rest) = read_toc_mut(data); + slice_store(rest, toc) +} + +/// Initialize a buffer with an empty FlatGFA store. +pub fn init(data: &mut [u8], toc: Toc) -> flatgfa::SliceStore { + // Write the table of contents. + assert!(data.len() == toc.size()); + toc.write_to_prefix(data).unwrap(); + let rest = &mut data[size_of::()..]; + + // Extract a store from the remaining bytes. + slice_store(rest, &toc) +} + fn write_bump<'a, T: AsBytes + ?Sized>(buf: &'a mut [u8], data: &T) -> Option<&'a mut [u8]> { let len = size_of_val(data); data.write_to_prefix(buf)?; From 3f36cf6a0a0dfd6138193929136e939ca1dfdd88 Mon Sep 17 00:00:00 2001 From: Adrian Sampson Date: Mon, 18 Mar 2024 13:43:25 -0400 Subject: [PATCH 03/12] Guess some capacities To be refined... --- polbin/src/file.rs | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/polbin/src/file.rs b/polbin/src/file.rs index 9f3fa8d7..30a590fb 100644 --- a/polbin/src/file.rs +++ b/polbin/src/file.rs @@ -46,6 +46,10 @@ impl Size { fn bytes(&self) -> usize { self.capacity * size_of::() } + + fn empty(capacity: usize) -> Self { + Size { len: 0, capacity } + } } impl Toc { @@ -67,7 +71,7 @@ impl Toc { /// Get a table of contents that fits a FlatGFA with no spare space. fn full(gfa: &flatgfa::FlatGFA) -> Self { - Toc { + Self { magic: MAGIC_NUMBER, header: Size::of_slice(gfa.header), segs: Size::of_slice(gfa.segs), @@ -82,6 +86,24 @@ impl Toc { line_order: Size::of_slice(gfa.line_order), } } + + /// Guess a reasonable set of capacities for a fresh file. + fn guess(factor: usize) -> Self { + Self { + magic: MAGIC_NUMBER, + header: Size::empty(128), + segs: Size::empty(512 * factor), + paths: Size::empty(16 * factor), + links: Size::empty(1024 * factor), + steps: Size::empty(512 * factor), + seq_data: Size::empty(4096 * factor), + overlaps: Size::empty(512 * factor), + alignment: Size::empty(1024 * factor), + name_data: Size::empty(64 * factor), + optional_data: Size::empty(64 * factor), + line_order: Size::empty(2048 * factor), + } + } } /// Consume `size.len` items from a byte slice, skip the remainder of `size.capacity` From fffb15521f2f69af9f82424d7e50902c001704e9 Mon Sep 17 00:00:00 2001 From: Adrian Sampson Date: Mon, 18 Mar 2024 13:56:18 -0400 Subject: [PATCH 04/12] Make the parser parametric over builders --- polbin/src/flatgfa.rs | 49 ++++++++++++++++++++++++++++++------------- polbin/src/main.rs | 4 +++- polbin/src/parse.rs | 23 ++++++++++++-------- 3 files changed, 52 insertions(+), 24 deletions(-) diff --git a/polbin/src/flatgfa.rs b/polbin/src/flatgfa.rs index dc8c4632..7733eabe 100644 --- a/polbin/src/flatgfa.rs +++ b/polbin/src/flatgfa.rs @@ -259,15 +259,41 @@ pub struct Store<'a, P: PoolFamily<'a>> { pub line_order: P::Pool, } -impl<'a, P: PoolFamily<'a>> Store<'a, P> { +pub trait GFABuilder { /// Add a header line for the GFA file. This may only be added once. - pub fn add_header(&mut self, version: &[u8]) { + fn add_header(&mut self, version: &[u8]); + + /// Add a new segment to the GFA file. + fn add_seg(&mut self, name: usize, seq: &[u8], optional: &[u8]) -> Index; + + /// Add a new path. + fn add_path( + &mut self, + name: &[u8], + steps: Span, + overlaps: impl Iterator>, + ) -> Index; + + /// Add a sequence of steps. + fn add_steps(&mut self, steps: impl Iterator) -> Span; + + /// Add a link between two (oriented) segments. + fn add_link(&mut self, from: Handle, to: Handle, overlap: Vec) -> Index; + + /// Record a line type to preserve the line order. + fn record_line(&mut self, kind: LineKind); + + /// Borrow a FlatGFA view of this data store. + fn view(&self) -> FlatGFA; +} + +impl<'a, P: PoolFamily<'a>> GFABuilder for Store<'a, P> { + fn add_header(&mut self, version: &[u8]) { assert!(self.header.count() == 0); self.header.add_slice(version); } - /// Add a new segment to the GFA file. - pub fn add_seg(&mut self, name: usize, seq: &[u8], optional: &[u8]) -> Index { + fn add_seg(&mut self, name: usize, seq: &[u8], optional: &[u8]) -> Index { self.segs.add(Segment { name, seq: self.seq_data.add_slice(seq), @@ -275,8 +301,7 @@ impl<'a, P: PoolFamily<'a>> Store<'a, P> { }) } - /// Add a new path. - pub fn add_path( + fn add_path( &mut self, name: &[u8], steps: Span, @@ -295,13 +320,11 @@ impl<'a, P: PoolFamily<'a>> Store<'a, P> { }) } - /// Add a sequence of steps. - pub fn add_steps(&mut self, steps: impl Iterator) -> Span { + fn add_steps(&mut self, steps: impl Iterator) -> Span { self.steps.add_iter(steps) } - /// Add a link between two (oriented) segments. - pub fn add_link(&mut self, from: Handle, to: Handle, overlap: Vec) -> Index { + fn add_link(&mut self, from: Handle, to: Handle, overlap: Vec) -> Index { self.links.add(Link { from, to, @@ -309,13 +332,11 @@ impl<'a, P: PoolFamily<'a>> Store<'a, P> { }) } - /// Record a line type to preserve the line order. - pub fn record_line(&mut self, kind: LineKind) { + fn record_line(&mut self, kind: LineKind) { self.line_order.add(kind.into()); } - /// Borrow a FlatGFA view of this data store. - pub fn view(&self) -> FlatGFA { + fn view(&self) -> FlatGFA { FlatGFA { header: self.header.all(), segs: self.segs.all(), diff --git a/polbin/src/main.rs b/polbin/src/main.rs index 9ad0f314..3ed13200 100644 --- a/polbin/src/main.rs +++ b/polbin/src/main.rs @@ -5,6 +5,7 @@ mod parse; mod pool; mod print; use argh::FromArgs; +use flatgfa::GFABuilder; use memmap::{Mmap, MmapMut}; fn map_file(name: &str) -> Mmap { @@ -69,7 +70,8 @@ fn main() { } None => { let stdin = std::io::stdin(); - store = parse::Parser::parse(stdin.lock()); + let parser = parse::Parser::new(flatgfa::HeapStore::default()); + store = parser.parse(stdin.lock()); store.view() } }; diff --git a/polbin/src/parse.rs b/polbin/src/parse.rs index 0f7ddf65..42b8d81e 100644 --- a/polbin/src/parse.rs +++ b/polbin/src/parse.rs @@ -2,10 +2,9 @@ use crate::flatgfa::{self, Handle, LineKind, Orientation}; use crate::gfaline; use std::collections::HashMap; -#[derive(Default)] -pub struct Parser { +pub struct Parser { /// The flat representation we're building. - flat: flatgfa::HeapStore, + flat: B, /// All segment IDs, indexed by their names, which we need to refer to segments in paths. seg_ids: NameMap, @@ -17,19 +16,25 @@ struct Deferred { paths: Vec>, } -impl Parser { +impl Parser { + pub fn new(builder: B) -> Self { + Self { + flat: builder, + seg_ids: NameMap::default(), + } + } + /// Parse a GFA text file. - pub fn parse(stream: R) -> flatgfa::HeapStore { - let mut parser = Self::default(); + pub fn parse(mut self, stream: R) -> B { let mut deferred = Deferred { links: Vec::new(), paths: Vec::new(), }; for line in stream.split(b'\n') { let line = line.unwrap(); - parser.parse_line(line, &mut deferred); + self.parse_line(line, &mut deferred); } - parser.finish(deferred) + self.finish(deferred) } /// Parse a single GFA line. @@ -105,7 +110,7 @@ impl Parser { /// /// We "unwind" the buffers of links and paths, now that we have all /// the segments. - fn finish(mut self, deferred: Deferred) -> flatgfa::HeapStore { + fn finish(mut self, deferred: Deferred) -> B { for link in deferred.links { self.add_link(link); } From c62cd040372597e9418852e4be3444384f0ad101 Mon Sep 17 00:00:00 2001 From: Adrian Sampson Date: Mon, 18 Mar 2024 13:59:05 -0400 Subject: [PATCH 05/12] A little utility --- polbin/src/main.rs | 2 +- polbin/src/parse.rs | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/polbin/src/main.rs b/polbin/src/main.rs index 3ed13200..bfadb84f 100644 --- a/polbin/src/main.rs +++ b/polbin/src/main.rs @@ -70,7 +70,7 @@ fn main() { } None => { let stdin = std::io::stdin(); - let parser = parse::Parser::new(flatgfa::HeapStore::default()); + let parser = parse::heap_parser(); store = parser.parse(stdin.lock()); store.view() } diff --git a/polbin/src/parse.rs b/polbin/src/parse.rs index 42b8d81e..139a3d52 100644 --- a/polbin/src/parse.rs +++ b/polbin/src/parse.rs @@ -16,6 +16,10 @@ struct Deferred { paths: Vec>, } +pub fn heap_parser() -> Parser { + Parser::::new(flatgfa::HeapStore::default()) +} + impl Parser { pub fn new(builder: B) -> Self { Self { From 69802b2b38a5f5695224417cc322b016e81872af Mon Sep 17 00:00:00 2001 From: Adrian Sampson Date: Mon, 18 Mar 2024 15:44:46 -0400 Subject: [PATCH 06/12] Special case for in-place parsing --- polbin/src/file.rs | 4 ++-- polbin/src/main.rs | 14 ++++++++++++++ 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/polbin/src/file.rs b/polbin/src/file.rs index 30a590fb..6e4cf9b6 100644 --- a/polbin/src/file.rs +++ b/polbin/src/file.rs @@ -54,7 +54,7 @@ impl Size { impl Toc { /// Get the total size in bytes of the file described. - fn size(&self) -> usize { + pub fn size(&self) -> usize { size_of::() + self.header.bytes::() + self.segs.bytes::() @@ -88,7 +88,7 @@ impl Toc { } /// Guess a reasonable set of capacities for a fresh file. - fn guess(factor: usize) -> Self { + pub fn guess(factor: usize) -> Self { Self { magic: MAGIC_NUMBER, header: Size::empty(128), diff --git a/polbin/src/main.rs b/polbin/src/main.rs index bfadb84f..40d7b7f5 100644 --- a/polbin/src/main.rs +++ b/polbin/src/main.rs @@ -52,6 +52,20 @@ struct PolBin { fn main() { let args: PolBin = argh::from_env(); + // A special case for converting from GFA text to an in-place FlatGFA binary. + if args.mutate { + if let (None, Some(out_name)) = (&args.input, &args.output) { + let stdin = std::io::stdin(); + let toc = file::Toc::guess(5); + let mut mmap = map_new_file(out_name, toc.size() as u64); + let store = file::init(&mut mmap, toc); + let parser = parse::Parser::new(store); + parser.parse(stdin.lock()); + mmap.flush().unwrap(); + return; + } + } + // Load the input from a file (binary) or stdin (text). let mmap; let mut mmap_mut; From ba4d94073a12b5a46b4f0abaf17d550c5bc31701 Mon Sep 17 00:00:00 2001 From: Adrian Sampson Date: Mon, 18 Mar 2024 18:39:19 -0400 Subject: [PATCH 07/12] Actually correct in-place file write --- polbin/src/file.rs | 37 ++++++++++++++++++++++++++++++++----- polbin/src/main.rs | 20 +++++++++++++++----- 2 files changed, 47 insertions(+), 10 deletions(-) diff --git a/polbin/src/file.rs b/polbin/src/file.rs index 6e4cf9b6..68c5a2a0 100644 --- a/polbin/src/file.rs +++ b/polbin/src/file.rs @@ -7,7 +7,7 @@ use zerocopy::{AsBytes, FromBytes, FromZeroes}; const MAGIC_NUMBER: u64 = 0xB101_1054; /// A table of contents for the FlatGFA file. -#[derive(FromBytes, FromZeroes, AsBytes)] +#[derive(FromBytes, FromZeroes, AsBytes, Debug)] #[repr(packed)] pub struct Toc { magic: u64, @@ -25,7 +25,7 @@ pub struct Toc { } /// A table-of-contents entry for a pool in the FlatGFA file. -#[derive(FromBytes, FromZeroes, AsBytes, Clone, Copy)] +#[derive(FromBytes, FromZeroes, AsBytes, Clone, Copy, Debug)] #[repr(packed)] struct Size { /// The number of actual elements in the pool. @@ -43,6 +43,13 @@ impl Size { } } + fn of_slice_vec(slice_vec: &SliceVec<'_, T>) -> Self { + Size { + len: slice_vec.len(), + capacity: slice_vec.capacity(), + } + } + fn bytes(&self) -> usize { self.capacity * size_of::() } @@ -87,6 +94,23 @@ impl Toc { } } + pub fn for_slice_store(store: &flatgfa::SliceStore) -> Self { + Self { + magic: MAGIC_NUMBER, + header: Size::of_slice_vec(&store.header), + segs: Size::of_slice_vec(&store.segs), + paths: Size::of_slice_vec(&store.paths), + links: Size::of_slice_vec(&store.links), + steps: Size::of_slice_vec(&store.steps), + seq_data: Size::of_slice_vec(&store.seq_data), + overlaps: Size::of_slice_vec(&store.overlaps), + alignment: Size::of_slice_vec(&store.alignment), + name_data: Size::of_slice_vec(&store.name_data), + optional_data: Size::of_slice_vec(&store.optional_data), + line_order: Size::of_slice_vec(&store.line_order), + } + } + /// Guess a reasonable set of capacities for a fresh file. pub fn guess(factor: usize) -> Self { Self { @@ -208,14 +232,17 @@ pub fn view_store(data: &mut [u8]) -> flatgfa::SliceStore { } /// Initialize a buffer with an empty FlatGFA store. -pub fn init(data: &mut [u8], toc: Toc) -> flatgfa::SliceStore { +pub fn init(data: &mut [u8], toc: Toc) -> (&mut Toc, flatgfa::SliceStore) { // Write the table of contents. assert!(data.len() == toc.size()); toc.write_to_prefix(data).unwrap(); - let rest = &mut data[size_of::()..]; + + // Get a mutable reference to the embedded TOC. + let (toc_bytes, rest) = data.split_at_mut(size_of::()); + let toc_mut = Toc::mut_from(toc_bytes).unwrap(); // Extract a store from the remaining bytes. - slice_store(rest, &toc) + (toc_mut, slice_store(rest, &toc)) } fn write_bump<'a, T: AsBytes + ?Sized>(buf: &'a mut [u8], data: &T) -> Option<&'a mut [u8]> { diff --git a/polbin/src/main.rs b/polbin/src/main.rs index 40d7b7f5..761982e9 100644 --- a/polbin/src/main.rs +++ b/polbin/src/main.rs @@ -56,11 +56,21 @@ fn main() { if args.mutate { if let (None, Some(out_name)) = (&args.input, &args.output) { let stdin = std::io::stdin(); - let toc = file::Toc::guess(5); - let mut mmap = map_new_file(out_name, toc.size() as u64); - let store = file::init(&mut mmap, toc); - let parser = parse::Parser::new(store); - parser.parse(stdin.lock()); + + // Create a file with an empty table of contents. + let empty_toc = file::Toc::guess(5); + let mut mmap = map_new_file(out_name, empty_toc.size() as u64); + let (toc, store) = file::init(&mut mmap, empty_toc); + + // Parse the input. + let store = { + let parser = parse::Parser::new(store); + parser.parse(stdin.lock()) + }; + + // Update the table of contents. + *toc = file::Toc::for_slice_store(&store); + mmap.flush().unwrap(); return; } From 9bc43fe3416f58a86892d8928952897c74f2be0b Mon Sep 17 00:00:00 2001 From: Adrian Sampson Date: Mon, 18 Mar 2024 18:41:00 -0400 Subject: [PATCH 08/12] Slightly clearer Turnt lines --- tests/turnt.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/turnt.toml b/tests/turnt.toml index 11d99459..bb595974 100644 --- a/tests/turnt.toml +++ b/tests/turnt.toml @@ -164,9 +164,9 @@ command = "../polbin/target/debug/polbin < {filename}" output.gfa = "-" [envs.polbin_file] -command = "../polbin/target/debug/polbin < {filename} -o {base}.flatgfa ; ../polbin/target/debug/polbin -i {base}.flatgfa" +command = "../polbin/target/debug/polbin -o {base}.flatgfa < {filename} ; ../polbin/target/debug/polbin -i {base}.flatgfa" output.gfa = "-" [envs.polbin_file_inplace] -command = "../polbin/target/debug/polbin < {filename} -o {base}.flatgfa ; ../polbin/target/debug/polbin -m -i {base}.flatgfa" +command = "../polbin/target/debug/polbin -o {base}.flatgfa < {filename} ; ../polbin/target/debug/polbin -m -i {base}.flatgfa" output.gfa = "-" From b70eab106b50864cbcb21c07f780ab1815036ee7 Mon Sep 17 00:00:00 2001 From: Adrian Sampson Date: Mon, 18 Mar 2024 21:37:43 -0400 Subject: [PATCH 09/12] Refactor --- polbin/src/main.rs | 17 ++++------------- polbin/src/parse.rs | 20 +++++++++++++++++--- 2 files changed, 21 insertions(+), 16 deletions(-) diff --git a/polbin/src/main.rs b/polbin/src/main.rs index 761982e9..48863941 100644 --- a/polbin/src/main.rs +++ b/polbin/src/main.rs @@ -55,22 +55,14 @@ fn main() { // A special case for converting from GFA text to an in-place FlatGFA binary. if args.mutate { if let (None, Some(out_name)) = (&args.input, &args.output) { - let stdin = std::io::stdin(); - // Create a file with an empty table of contents. let empty_toc = file::Toc::guess(5); let mut mmap = map_new_file(out_name, empty_toc.size() as u64); let (toc, store) = file::init(&mut mmap, empty_toc); - // Parse the input. - let store = { - let parser = parse::Parser::new(store); - parser.parse(stdin.lock()) - }; - - // Update the table of contents. - *toc = file::Toc::for_slice_store(&store); - + // Parse the input into the file. + let stdin = std::io::stdin(); + parse::buf_parse(store, toc, stdin.lock()); mmap.flush().unwrap(); return; } @@ -94,8 +86,7 @@ fn main() { } None => { let stdin = std::io::stdin(); - let parser = parse::heap_parser(); - store = parser.parse(stdin.lock()); + store = parse::heap_parse(stdin.lock()); store.view() } }; diff --git a/polbin/src/parse.rs b/polbin/src/parse.rs index 139a3d52..ab569a20 100644 --- a/polbin/src/parse.rs +++ b/polbin/src/parse.rs @@ -1,6 +1,8 @@ +use crate::file; use crate::flatgfa::{self, Handle, LineKind, Orientation}; use crate::gfaline; use std::collections::HashMap; +use std::io::BufRead; pub struct Parser { /// The flat representation we're building. @@ -16,8 +18,20 @@ struct Deferred { paths: Vec>, } -pub fn heap_parser() -> Parser { - Parser::::new(flatgfa::HeapStore::default()) +pub fn heap_parse(stream: R) -> flatgfa::HeapStore { + let parser = Parser::::new(flatgfa::HeapStore::default()); + parser.parse(stream) +} + +pub fn buf_parse<'a, R: BufRead>( + store: flatgfa::SliceStore<'a>, + toc: &mut file::Toc, + stream: R, +) -> flatgfa::SliceStore<'a> { + let parser = Parser::::new(store); + let store = parser.parse(stream); + *toc = file::Toc::for_slice_store(&store); + store } impl Parser { @@ -29,7 +43,7 @@ impl Parser { } /// Parse a GFA text file. - pub fn parse(mut self, stream: R) -> B { + pub fn parse(mut self, stream: R) -> B { let mut deferred = Deferred { links: Vec::new(), paths: Vec::new(), From b7b6d134ca36c986acbb12aabfe9f4414e268bbf Mon Sep 17 00:00:00 2001 From: Adrian Sampson Date: Tue, 19 Mar 2024 07:09:52 -0400 Subject: [PATCH 10/12] Stats mode --- polbin/src/main.rs | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/polbin/src/main.rs b/polbin/src/main.rs index 48863941..6a4463a1 100644 --- a/polbin/src/main.rs +++ b/polbin/src/main.rs @@ -33,6 +33,20 @@ fn map_file_mut(name: &str) -> MmapMut { unsafe { MmapMut::map_mut(&file) }.unwrap() } +fn print_stats(gfa: &flatgfa::FlatGFA) { + eprintln!("header: {}", gfa.header.len()); + eprintln!("segs: {}", gfa.segs.len()); + eprintln!("paths: {}", gfa.paths.len()); + eprintln!("links: {}", gfa.links.len()); + eprintln!("steps: {}", gfa.steps.len()); + eprintln!("seq_data: {}", gfa.seq_data.len()); + eprintln!("overlaps: {}", gfa.overlaps.len()); + eprintln!("alignment: {}", gfa.alignment.len()); + eprintln!("name_data: {}", gfa.name_data.len()); + eprintln!("optional_data: {}", gfa.optional_data.len()); + eprintln!("line_order: {}", gfa.line_order.len()); +} + #[derive(FromArgs)] /// Convert between GFA text and FlatGFA binary formats. struct PolBin { @@ -47,6 +61,10 @@ struct PolBin { /// mutate the input file in place #[argh(switch, short = 'm')] mutate: bool, + + /// print statistics about the graph + #[argh(switch, short = 's')] + stats: bool, } fn main() { @@ -62,7 +80,10 @@ fn main() { // Parse the input into the file. let stdin = std::io::stdin(); - parse::buf_parse(store, toc, stdin.lock()); + let store = parse::buf_parse(store, toc, stdin.lock()); + if args.stats { + print_stats(&store.view()); + } mmap.flush().unwrap(); return; } @@ -91,6 +112,11 @@ fn main() { } }; + // Perhaps print some statistics. + if args.stats { + print_stats(&gfa); + } + // Write the output to a file (binary) or stdout (text). match args.output { Some(name) => { From 9c23064839ba5021b199069364dc2001d25d1b0c Mon Sep 17 00:00:00 2001 From: Adrian Sampson Date: Tue, 19 Mar 2024 07:36:25 -0400 Subject: [PATCH 11/12] Controllable preallocation --- polbin/src/file.rs | 18 +++++++++--------- polbin/src/main.rs | 6 +++++- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/polbin/src/file.rs b/polbin/src/file.rs index 68c5a2a0..07b8023a 100644 --- a/polbin/src/file.rs +++ b/polbin/src/file.rs @@ -116,16 +116,16 @@ impl Toc { Self { magic: MAGIC_NUMBER, header: Size::empty(128), - segs: Size::empty(512 * factor), - paths: Size::empty(16 * factor), - links: Size::empty(1024 * factor), - steps: Size::empty(512 * factor), - seq_data: Size::empty(4096 * factor), - overlaps: Size::empty(512 * factor), - alignment: Size::empty(1024 * factor), + segs: Size::empty(32 * factor * factor), + paths: Size::empty(factor), + links: Size::empty(32 * factor * factor), + steps: Size::empty(1024 * factor * factor), + seq_data: Size::empty(512 * factor * factor), + overlaps: Size::empty(256 * factor), + alignment: Size::empty(64 * factor * factor), name_data: Size::empty(64 * factor), - optional_data: Size::empty(64 * factor), - line_order: Size::empty(2048 * factor), + optional_data: Size::empty(512 * factor * factor), + line_order: Size::empty(64 * factor * factor), } } } diff --git a/polbin/src/main.rs b/polbin/src/main.rs index 6a4463a1..d14f182b 100644 --- a/polbin/src/main.rs +++ b/polbin/src/main.rs @@ -65,6 +65,10 @@ struct PolBin { /// print statistics about the graph #[argh(switch, short = 's')] stats: bool, + + /// preallocation size factor + #[argh(option, short = 'p', default = "32")] + prealloc_factor: usize, } fn main() { @@ -74,7 +78,7 @@ fn main() { if args.mutate { if let (None, Some(out_name)) = (&args.input, &args.output) { // Create a file with an empty table of contents. - let empty_toc = file::Toc::guess(5); + let empty_toc = file::Toc::guess(args.prealloc_factor); let mut mmap = map_new_file(out_name, empty_toc.size() as u64); let (toc, store) = file::init(&mut mmap, empty_toc); From dc7024d15ebddf1aea8e16a3b712f15cf747fcc7 Mon Sep 17 00:00:00 2001 From: Adrian Sampson Date: Tue, 19 Mar 2024 07:40:14 -0400 Subject: [PATCH 12/12] Add in-place emission to tests --- tests/turnt.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/turnt.toml b/tests/turnt.toml index bb595974..5e23e6c1 100644 --- a/tests/turnt.toml +++ b/tests/turnt.toml @@ -168,5 +168,5 @@ command = "../polbin/target/debug/polbin -o {base}.flatgfa < {filename} ; ../pol output.gfa = "-" [envs.polbin_file_inplace] -command = "../polbin/target/debug/polbin -o {base}.flatgfa < {filename} ; ../polbin/target/debug/polbin -m -i {base}.flatgfa" +command = "../polbin/target/debug/polbin -m -p 128 -o {base}.inplace.flatgfa < {filename} ; ../polbin/target/debug/polbin -m -i {base}.inplace.flatgfa" output.gfa = "-"