Skip to content

Commit

Permalink
FlatGFA: Preprocess GFA files to estimate pre-allocation size (#159)
Browse files Browse the repository at this point in the history
  • Loading branch information
sampsyo authored Mar 20, 2024
2 parents 104a873 + d279dc0 commit 4478339
Show file tree
Hide file tree
Showing 3 changed files with 87 additions and 6 deletions.
26 changes: 26 additions & 0 deletions polbin/src/file.rs
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,32 @@ impl Toc {
line_order: Size::empty(64 * factor * factor),
}
}

/// Build a table of contents whose capacities are derived from measurements
/// of a GFA text file.
///
/// `segs`, `links`, and `paths` are counts of the corresponding GFA lines;
/// `header_bytes`, `seg_bytes`, and `path_bytes` are byte totals measured
/// over the header, segment, and path lines. Every section is created with
/// `Size::empty`, using either a measurement directly or a simple multiple
/// of one as its capacity.
pub fn estimate(
    segs: usize,
    links: usize,
    paths: usize,
    header_bytes: usize,
    seg_bytes: usize,
    path_bytes: usize,
) -> Self {
    // Capacities that are derived from the measurements by a heuristic
    // rather than taken verbatim.
    let step_cap = path_bytes / 3;
    let overlap_cap = (links + paths) * 2;
    let alignment_cap = links * 2 + paths * 4;
    let name_cap = paths * 512;
    let optional_cap = links * 16;
    let line_cap = segs + links + paths + 8;

    Self {
        magic: MAGIC_NUMBER,
        // Sections sized directly from a measurement.
        header: Size::empty(header_bytes),
        segs: Size::empty(segs),
        paths: Size::empty(paths),
        links: Size::empty(links),
        // Sections sized from the heuristics computed above.
        steps: Size::empty(step_cap),
        seq_data: Size::empty(seg_bytes),
        overlaps: Size::empty(overlap_cap),
        alignment: Size::empty(alignment_cap),
        name_data: Size::empty(name_cap),
        optional_data: Size::empty(optional_cap),
        line_order: Size::empty(line_cap),
    }
}
}

/// Consume `size.len` items from a byte slice, skip the remainder of `size.capacity`
Expand Down
21 changes: 16 additions & 5 deletions polbin/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -82,16 +82,27 @@ fn main() {
// A special case for converting from GFA text to an in-place FlatGFA binary.
if args.mutate {
if let (None, Some(out_name)) = (&args.input, &args.output) {
let file;
let (input_buf, empty_toc) = match args.input_gfa {
// If we have an input GFA file, we can estimate its sizes for the TOC.
Some(name) => {
file = map_file(&name);
let toc = parse::estimate_toc(file.as_ref());
(Some(file.as_ref()), toc)
}

// Otherwise, we need to guess.
None => (None, file::Toc::guess(args.prealloc_factor)),
};

// Create a file with an empty table of contents.
let empty_toc = file::Toc::guess(args.prealloc_factor);
let mut mmap = map_new_file(out_name, empty_toc.size() as u64);
let (toc, store) = file::init(&mut mmap, empty_toc);

// Parse the input into the file.
let store = match args.input_gfa {
Some(name) => {
let file = map_file(&name);
let store = Parser::for_slice(store).parse_mem(file.as_ref());
let store = match input_buf {
Some(buf) => {
let store = Parser::for_slice(store).parse_mem(buf);
*toc = file::Toc::for_slice_store(&store);
store
}
Expand Down
46 changes: 45 additions & 1 deletion polbin/src/parse.rs
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ impl<B: flatgfa::GFABuilder> Parser<B> {
}

// Actually parse other lines.
let gfa_line = gfaline::parse_line(line).unwrap();
let gfa_line = gfaline::parse_line(line.as_ref()).unwrap();
self.record_line(&gfa_line);
match gfa_line {
gfaline::Line::Header(data) => {
Expand Down Expand Up @@ -211,6 +211,50 @@ impl NameMap {
}
}

/// Make a single pass over GFA text, tallying how many lines of each type
/// appear and how many bytes the header, segment, and path lines occupy, and
/// turn those tallies into a pre-sized table of contents via
/// `Toc::estimate`.
///
/// Lines are classified solely by their first byte (`H`, `S`, `L`, or `P`).
///
/// # Panics
///
/// Panics with "unknown line type" if any line begins with another byte.
pub fn estimate_toc(buf: &[u8]) -> crate::file::Toc {
    let (mut seg_count, mut link_count, mut path_count) = (0, 0, 0);
    let (mut header_len, mut seg_len, mut path_len) = (0, 0, 0);

    let mut remaining = buf;
    while let Some(&kind) = remaining.first() {
        // Offset of the newline that ends the current line. An unterminated
        // final line yields a value one past the end of the slice, which
        // also serves as the "no more lines" signal below.
        let line_end = match memchr::memchr(b'\n', remaining) {
            Some(pos) => pos,
            None => remaining.len() + 1,
        };

        match kind {
            b'H' => header_len += line_end,
            b'S' => {
                seg_count += 1;
                seg_len += line_end;
            }
            b'L' => link_count += 1,
            b'P' => {
                path_count += 1;
                path_len += line_end;
            }
            _ => panic!("unknown line type"),
        }

        // Step past the newline; an out-of-range start means the input
        // has been exhausted.
        match remaining.get(line_end + 1..) {
            Some(tail) => remaining = tail,
            None => break,
        }
    }

    crate::file::Toc::estimate(
        seg_count,
        link_count,
        path_count,
        header_len,
        seg_len,
        path_len,
    )
}

struct MemchrSplit<'a> {
haystack: &'a [u8],
memchr: memchr::Memchr<'a>,
Expand Down

0 comments on commit 4478339

Please sign in to comment.