Skip to content

Commit

Permalink
FlatGFA: Pre-allocated writes (#156)
Browse files Browse the repository at this point in the history
  • Loading branch information
sampsyo authored Mar 19, 2024
2 parents f73db4c + dc7024d commit d9009c8
Show file tree
Hide file tree
Showing 5 changed files with 232 additions and 61 deletions.
150 changes: 115 additions & 35 deletions polbin/src/file.rs
Original file line number Diff line number Diff line change
@@ -1,14 +1,15 @@
use crate::flatgfa;
use crate::pool::Span;
use std::mem::{size_of, size_of_val};
use tinyvec::SliceVec;
use zerocopy::{AsBytes, FromBytes, FromZeroes};

const MAGIC_NUMBER: u64 = 0xB101_1054;

/// A table of contents for the FlatGFA file.
#[derive(FromBytes, FromZeroes, AsBytes)]
#[derive(FromBytes, FromZeroes, AsBytes, Debug)]
#[repr(packed)]
struct Toc {
pub struct Toc {
magic: u64,
header: Size,
segs: Size,
Expand All @@ -24,7 +25,7 @@ struct Toc {
}

/// A table-of-contents entry for a pool in the FlatGFA file.
#[derive(FromBytes, FromZeroes, AsBytes, Clone, Copy)]
#[derive(FromBytes, FromZeroes, AsBytes, Clone, Copy, Debug)]
#[repr(packed)]
struct Size {
/// The number of actual elements in the pool.
Expand All @@ -41,6 +42,92 @@ impl Size {
capacity: slice.len(),
}
}

/// Describe a `SliceVec`-backed pool: `len` is the number of elements it
/// currently holds and `capacity` is the total number it has room for.
fn of_slice_vec<T>(slice_vec: &SliceVec<'_, T>) -> Self {
    let len = slice_vec.len();
    let capacity = slice_vec.capacity();
    Size { len, capacity }
}

/// Number of bytes taken up by this pool's full `capacity` (not just its
/// `len`) when every element is a `T`.
fn bytes<T>(&self) -> usize {
    let slots = self.capacity;
    slots * size_of::<T>()
}

/// Build the entry for a pool that holds no elements yet but reserves
/// room for `capacity` of them.
fn empty(capacity: usize) -> Self {
    Size {
        len: 0,
        capacity,
    }
}
}

impl Toc {
/// Get the total size in bytes of the file described.
pub fn size(&self) -> usize {
size_of::<Self>()
+ self.header.bytes::<u8>()
+ self.segs.bytes::<flatgfa::Segment>()
+ self.paths.bytes::<flatgfa::Path>()
+ self.links.bytes::<flatgfa::Link>()
+ self.steps.bytes::<flatgfa::Handle>()
+ self.seq_data.bytes::<u8>()
+ self.overlaps.bytes::<Span>()
+ self.alignment.bytes::<flatgfa::AlignOp>()
+ self.name_data.bytes::<u8>()
+ self.optional_data.bytes::<u8>()
+ self.line_order.bytes::<u8>()
}

/// Get a table of contents that fits a FlatGFA with no spare space.
fn full(gfa: &flatgfa::FlatGFA) -> Self {
Self {
magic: MAGIC_NUMBER,
header: Size::of_slice(gfa.header),
segs: Size::of_slice(gfa.segs),
paths: Size::of_slice(gfa.paths),
links: Size::of_slice(gfa.links),
steps: Size::of_slice(gfa.steps),
seq_data: Size::of_slice(gfa.seq_data),
overlaps: Size::of_slice(gfa.overlaps),
alignment: Size::of_slice(gfa.alignment),
name_data: Size::of_slice(gfa.name_data),
optional_data: Size::of_slice(gfa.optional_data),
line_order: Size::of_slice(gfa.line_order),
}
}

pub fn for_slice_store(store: &flatgfa::SliceStore) -> Self {
Self {
magic: MAGIC_NUMBER,
header: Size::of_slice_vec(&store.header),
segs: Size::of_slice_vec(&store.segs),
paths: Size::of_slice_vec(&store.paths),
links: Size::of_slice_vec(&store.links),
steps: Size::of_slice_vec(&store.steps),
seq_data: Size::of_slice_vec(&store.seq_data),
overlaps: Size::of_slice_vec(&store.overlaps),
alignment: Size::of_slice_vec(&store.alignment),
name_data: Size::of_slice_vec(&store.name_data),
optional_data: Size::of_slice_vec(&store.optional_data),
line_order: Size::of_slice_vec(&store.line_order),
}
}

/// Guess a reasonable set of capacities for a fresh file.
pub fn guess(factor: usize) -> Self {
Self {
magic: MAGIC_NUMBER,
header: Size::empty(128),
segs: Size::empty(32 * factor * factor),
paths: Size::empty(factor),
links: Size::empty(32 * factor * factor),
steps: Size::empty(1024 * factor * factor),
seq_data: Size::empty(512 * factor * factor),
overlaps: Size::empty(256 * factor),
alignment: Size::empty(64 * factor * factor),
name_data: Size::empty(64 * factor),
optional_data: Size::empty(512 * factor * factor),
line_order: Size::empty(64 * factor * factor),
}
}
}

/// Consume `size.len` items from a byte slice, skip the remainder of `size.capacity`
Expand Down Expand Up @@ -109,12 +196,9 @@ fn slice_vec_prefix<T: FromBytes + AsBytes>(
(vec, rest)
}

/// Get a mutable FlatGFA `SliceStore` backed by a byte buffer.
pub fn view_store(data: &mut [u8]) -> flatgfa::SliceStore {
let (toc, rest) = read_toc_mut(data);

// Get slices for each chunk.
let (header, rest) = slice_vec_prefix(rest, toc.header);
/// Get a FlatGFA `SliceStore` from the suffix of a file just following the table of contents.
fn slice_store<'a>(data: &'a mut [u8], toc: &Toc) -> flatgfa::SliceStore<'a> {
let (header, rest) = slice_vec_prefix(data, toc.header);
let (segs, rest) = slice_vec_prefix(rest, toc.segs);
let (paths, rest) = slice_vec_prefix(rest, toc.paths);
let (links, rest) = slice_vec_prefix(rest, toc.links);
Expand All @@ -141,6 +225,26 @@ pub fn view_store(data: &mut [u8]) -> flatgfa::SliceStore {
}
}

/// View a byte buffer as a mutable FlatGFA `SliceStore`.
///
/// The table of contents is split off the buffer with `read_toc_mut`, and the
/// bytes that remain are handed to `slice_store` to be divided into pools.
pub fn view_store(data: &mut [u8]) -> flatgfa::SliceStore {
    let (contents, pool_bytes) = read_toc_mut(data);
    slice_store(pool_bytes, contents)
}

/// Lay out a fresh FlatGFA store in `data` according to `toc`.
///
/// Returns a mutable reference to the table of contents now embedded at the
/// front of the buffer, together with a `SliceStore` over the bytes after it.
///
/// # Panics
///
/// Panics if `data.len()` differs from `toc.size()`, or if the table of
/// contents cannot be written to or re-read from the buffer.
pub fn init(data: &mut [u8], toc: Toc) -> (&mut Toc, flatgfa::SliceStore) {
    // The buffer has to be exactly as large as the layout `toc` describes.
    assert!(data.len() == toc.size());

    // Copy the table of contents into the front of the buffer.
    toc.write_to_prefix(data).unwrap();

    // Separate the TOC bytes from the pool bytes that follow them.
    let (head, body) = data.split_at_mut(size_of::<Toc>());

    // Reinterpret the leading bytes as the in-buffer TOC.
    let embedded = Toc::mut_from(head).unwrap();

    // Divide the remaining bytes into pools using the caller's copy of the sizes.
    let store = slice_store(body, &toc);
    (embedded, store)
}

fn write_bump<'a, T: AsBytes + ?Sized>(buf: &'a mut [u8], data: &T) -> Option<&'a mut [u8]> {
let len = size_of_val(data);
data.write_to_prefix(buf)?;
Expand All @@ -156,20 +260,7 @@ fn write_bytes<'a>(buf: &'a mut [u8], data: &[u8]) -> Option<&'a mut [u8]> {
/// Copy a FlatGFA into a byte buffer.
pub fn dump(gfa: &flatgfa::FlatGFA, buf: &mut [u8]) {
// Table of contents.
let toc = Toc {
magic: MAGIC_NUMBER,
header: Size::of_slice(gfa.header),
segs: Size::of_slice(gfa.segs),
paths: Size::of_slice(gfa.paths),
links: Size::of_slice(gfa.links),
steps: Size::of_slice(gfa.steps),
seq_data: Size::of_slice(gfa.seq_data),
overlaps: Size::of_slice(gfa.overlaps),
alignment: Size::of_slice(gfa.alignment),
name_data: Size::of_slice(gfa.name_data),
optional_data: Size::of_slice(gfa.optional_data),
line_order: Size::of_slice(gfa.line_order),
};
let toc = Toc::full(gfa);
let rest = write_bump(buf, &toc).unwrap();

// All the slices.
Expand All @@ -189,16 +280,5 @@ pub fn dump(gfa: &flatgfa::FlatGFA, buf: &mut [u8]) {
/// Get the total size in bytes of a FlatGFA structure. This should result in a big
/// enough buffer to write the entire FlatGFA into with `dump`.
pub fn size(gfa: &flatgfa::FlatGFA) -> usize {
size_of::<Toc>()
+ gfa.header.len()
+ size_of_val(gfa.segs)
+ size_of_val(gfa.paths)
+ size_of_val(gfa.links)
+ size_of_val(gfa.steps)
+ size_of_val(gfa.seq_data)
+ size_of_val(gfa.overlaps)
+ size_of_val(gfa.alignment)
+ gfa.name_data.len()
+ gfa.optional_data.len()
+ gfa.line_order.len()
Toc::full(gfa).size()
}
49 changes: 35 additions & 14 deletions polbin/src/flatgfa.rs
Original file line number Diff line number Diff line change
Expand Up @@ -259,24 +259,49 @@ pub struct Store<'a, P: PoolFamily<'a>> {
pub line_order: P::Pool<u8>,
}

impl<'a, P: PoolFamily<'a>> Store<'a, P> {
pub trait GFABuilder {
/// Add a header line for the GFA file. This may only be added once.
pub fn add_header(&mut self, version: &[u8]) {
fn add_header(&mut self, version: &[u8]);

/// Add a new segment to the GFA file.
fn add_seg(&mut self, name: usize, seq: &[u8], optional: &[u8]) -> Index;

/// Add a new path.
fn add_path(
&mut self,
name: &[u8],
steps: Span,
overlaps: impl Iterator<Item = Vec<AlignOp>>,
) -> Index;

/// Add a sequence of steps.
fn add_steps(&mut self, steps: impl Iterator<Item = Handle>) -> Span;

/// Add a link between two (oriented) segments.
fn add_link(&mut self, from: Handle, to: Handle, overlap: Vec<AlignOp>) -> Index;

/// Record a line type to preserve the line order.
fn record_line(&mut self, kind: LineKind);

/// Borrow a FlatGFA view of this data store.
fn view(&self) -> FlatGFA;
}

impl<'a, P: PoolFamily<'a>> GFABuilder for Store<'a, P> {
fn add_header(&mut self, version: &[u8]) {
assert!(self.header.count() == 0);
self.header.add_slice(version);
}

/// Add a new segment to the GFA file.
pub fn add_seg(&mut self, name: usize, seq: &[u8], optional: &[u8]) -> Index {
fn add_seg(&mut self, name: usize, seq: &[u8], optional: &[u8]) -> Index {
self.segs.add(Segment {
name,
seq: self.seq_data.add_slice(seq),
optional: self.optional_data.add_slice(optional),
})
}

/// Add a new path.
pub fn add_path(
fn add_path(
&mut self,
name: &[u8],
steps: Span,
Expand All @@ -295,27 +320,23 @@ impl<'a, P: PoolFamily<'a>> Store<'a, P> {
})
}

/// Add a sequence of steps.
pub fn add_steps(&mut self, steps: impl Iterator<Item = Handle>) -> Span {
fn add_steps(&mut self, steps: impl Iterator<Item = Handle>) -> Span {
self.steps.add_iter(steps)
}

/// Add a link between two (oriented) segments.
pub fn add_link(&mut self, from: Handle, to: Handle, overlap: Vec<AlignOp>) -> Index {
fn add_link(&mut self, from: Handle, to: Handle, overlap: Vec<AlignOp>) -> Index {
self.links.add(Link {
from,
to,
overlap: self.alignment.add_iter(overlap),
})
}

/// Record a line type to preserve the line order.
pub fn record_line(&mut self, kind: LineKind) {
fn record_line(&mut self, kind: LineKind) {
self.line_order.add(kind.into());
}

/// Borrow a FlatGFA view of this data store.
pub fn view(&self) -> FlatGFA {
fn view(&self) -> FlatGFA {
FlatGFA {
header: self.header.all(),
segs: self.segs.all(),
Expand Down
49 changes: 48 additions & 1 deletion polbin/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ mod parse;
mod pool;
mod print;
use argh::FromArgs;
use flatgfa::GFABuilder;
use memmap::{Mmap, MmapMut};

fn map_file(name: &str) -> Mmap {
Expand Down Expand Up @@ -32,6 +33,20 @@ fn map_file_mut(name: &str) -> MmapMut {
unsafe { MmapMut::map_mut(&file) }.unwrap()
}

/// Print the length of each slice in a FlatGFA view to stderr, one
/// `name: count` line per slice.
fn print_stats(gfa: &flatgfa::FlatGFA) {
    // Label / length pairs, listed in the order they are reported.
    let counts = [
        ("header", gfa.header.len()),
        ("segs", gfa.segs.len()),
        ("paths", gfa.paths.len()),
        ("links", gfa.links.len()),
        ("steps", gfa.steps.len()),
        ("seq_data", gfa.seq_data.len()),
        ("overlaps", gfa.overlaps.len()),
        ("alignment", gfa.alignment.len()),
        ("name_data", gfa.name_data.len()),
        ("optional_data", gfa.optional_data.len()),
        ("line_order", gfa.line_order.len()),
    ];

    for (label, count) in counts.iter() {
        eprintln!("{}: {}", label, count);
    }
}

#[derive(FromArgs)]
/// Convert between GFA text and FlatGFA binary formats.
struct PolBin {
Expand All @@ -46,11 +61,38 @@ struct PolBin {
/// mutate the input file in place
#[argh(switch, short = 'm')]
mutate: bool,

/// print statistics about the graph
#[argh(switch, short = 's')]
stats: bool,

/// preallocation size factor
#[argh(option, short = 'p', default = "32")]
prealloc_factor: usize,
}

fn main() {
let args: PolBin = argh::from_env();

// A special case for converting from GFA text to an in-place FlatGFA binary.
if args.mutate {
if let (None, Some(out_name)) = (&args.input, &args.output) {
// Create a file with an empty table of contents.
let empty_toc = file::Toc::guess(args.prealloc_factor);
let mut mmap = map_new_file(out_name, empty_toc.size() as u64);
let (toc, store) = file::init(&mut mmap, empty_toc);

// Parse the input into the file.
let stdin = std::io::stdin();
let store = parse::buf_parse(store, toc, stdin.lock());
if args.stats {
print_stats(&store.view());
}
mmap.flush().unwrap();
return;
}
}

// Load the input from a file (binary) or stdin (text).
let mmap;
let mut mmap_mut;
Expand All @@ -69,11 +111,16 @@ fn main() {
}
None => {
let stdin = std::io::stdin();
store = parse::Parser::parse(stdin.lock());
store = parse::heap_parse(stdin.lock());
store.view()
}
};

// Perhaps print some statistics.
if args.stats {
print_stats(&gfa);
}

// Write the output to a file (binary) or stdout (text).
match args.output {
Some(name) => {
Expand Down
Loading

0 comments on commit d9009c8

Please sign in to comment.