Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

polbin: Actually dump FlatGFA binary files #152

Merged
merged 7 commits into from
Mar 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 32 additions & 0 deletions polbin/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions polbin/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ version = "0.1.0"
edition = "2021"

[dependencies]
argh = "0.1.12"
bstr = "1.9.1"
gfa = "0.10.1"
memmap = "0.7.0"
Expand Down
78 changes: 72 additions & 6 deletions polbin/src/file.rs
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
use crate::flatgfa;
use zerocopy::{FromBytes, FromZeroes};
use std::mem::{size_of, size_of_val};
use zerocopy::{AsBytes, FromBytes, FromZeroes};

const MAGIC_NUMBER: usize = 0x1337_4915;
const MAGIC_NUMBER: u64 = 0xB101_1054;

#[derive(FromBytes, FromZeroes)]
#[derive(FromBytes, FromZeroes, AsBytes)]
#[repr(packed)]
struct TOC {
magic: usize,
magic: u64,
header_len: usize,
segs_count: usize,
paths_count: usize,
Expand All @@ -25,11 +27,13 @@ fn get_prefix(data: &[u8], len: usize) -> (&[u8], &[u8]) {
(&data[0..len], &data[len..])
}

/// Get a FlatGFA backed by the data in a byte buffer.
pub fn view(data: &[u8]) -> flatgfa::FlatGFA {
// Table of contents.
let toc = TOC::ref_from_prefix(data).unwrap();
let rest = &data[std::mem::size_of::<TOC>()..];
assert_eq!(toc.magic, MAGIC_NUMBER);
let rest = &data[size_of::<TOC>()..];
let magic = toc.magic;
assert_eq!(magic, MAGIC_NUMBER);

// Get slices for each chunk.
let (header, rest) = get_prefix(rest, toc.header_len);
Expand Down Expand Up @@ -58,3 +62,65 @@ pub fn view(data: &[u8]) -> flatgfa::FlatGFA {
line_order,
}
}

/// Copy the raw bytes of `data` to the front of `buf` and return the unwritten tail.
///
/// Returns `None` when `write_to_prefix` reports failure; nothing is sliced in
/// that case. On success the returned slice starts right after the bytes that
/// were just written, so calls can be chained to lay values out back to back.
fn write_bump<'a, 'b, T: AsBytes + ?Sized>(buf: &'a mut [u8], data: &'b T) -> Option<&'a mut [u8]> {
    // Number of bytes `data` occupies in memory; this is how far to advance.
    let consumed = size_of_val(data);
    data.write_to_prefix(buf)?;
    let (_, tail) = buf.split_at_mut(consumed);
    Some(tail)
}

/// Copy `data` to the front of `buf` and return the unwritten tail.
///
/// Returns `None` (leaving `buf` untouched) when `buf` is too short to hold
/// `data`, rather than panicking inside the slice index. On success the
/// returned slice begins immediately after the copied bytes, so calls can be
/// chained to lay several byte strings out back to back.
fn write_bytes<'a>(buf: &'a mut [u8], data: &[u8]) -> Option<&'a mut [u8]> {
    // Check up front: `split_at_mut` would panic on an out-of-range split point.
    if buf.len() < data.len() {
        return None;
    }
    let (dest, rest) = buf.split_at_mut(data.len());
    dest.copy_from_slice(data);
    Some(rest)
}

/// Serialize a FlatGFA into `buf`: a table of contents followed by every pool.
///
/// The pools are written back to back in a fixed order, and the table of
/// contents records the length of each one. The order of the writes below
/// is the file layout, so it must not be rearranged.
///
/// # Panics
///
/// Every write is unwrapped, so a write that fails aborts with a panic.
/// Presumably that only happens when `buf` is too short; `size` is the
/// function documented as producing a large enough length.
pub fn dump(gfa: &flatgfa::FlatGFA, buf: &mut [u8]) {
    // Table of contents: the magic number plus one length per pool.
    let table = TOC {
        magic: MAGIC_NUMBER,
        header_len: gfa.header.len(),
        segs_count: gfa.segs.len(),
        paths_count: gfa.paths.len(),
        links_count: gfa.links.len(),
        steps_count: gfa.steps.len(),
        seq_data_len: gfa.seq_data.len(),
        overlaps_count: gfa.overlaps.len(),
        alignment_count: gfa.alignment.len(),
        name_data_len: gfa.name_data.len(),
        optional_data_len: gfa.optional_data.len(),
        line_order_len: gfa.line_order.len(),
    };
    let cursor = write_bump(buf, &table).unwrap();

    // Each write returns the remaining tail, which becomes the next cursor.
    // Byte pools go through `write_bytes`; record pools through `write_bump`.
    let cursor = write_bytes(cursor, gfa.header).unwrap();
    let cursor = write_bump(cursor, gfa.segs).unwrap();
    let cursor = write_bump(cursor, gfa.paths).unwrap();
    let cursor = write_bump(cursor, gfa.links).unwrap();
    let cursor = write_bump(cursor, gfa.steps).unwrap();
    let cursor = write_bytes(cursor, gfa.seq_data).unwrap();
    let cursor = write_bump(cursor, gfa.overlaps).unwrap();
    let cursor = write_bump(cursor, gfa.alignment).unwrap();
    let cursor = write_bytes(cursor, gfa.name_data).unwrap();
    let cursor = write_bytes(cursor, gfa.optional_data).unwrap();
    // Last pool: the leftover tail is not needed any more.
    write_bytes(cursor, gfa.line_order).unwrap();
}

/// Compute the number of bytes a FlatGFA occupies once serialized.
///
/// The result is the size of the table of contents plus the size of every
/// pool, and is meant to be the buffer length to allocate before calling
/// `dump`.
pub fn size(gfa: &flatgfa::FlatGFA) -> usize {
    // Pools of fixed-size records: element count times element width.
    let record_bytes = gfa.segs.len() * size_of::<flatgfa::Segment>()
        + gfa.paths.len() * size_of::<flatgfa::Path>()
        + gfa.links.len() * size_of::<flatgfa::Link>()
        + gfa.steps.len() * size_of::<flatgfa::Handle>()
        + gfa.overlaps.len() * size_of::<flatgfa::Span>()
        + gfa.alignment.len() * size_of::<flatgfa::AlignOp>();

    // Pools that `dump` writes as plain bytes: the length is the byte count.
    let blob_bytes = gfa.header.len()
        + gfa.seq_data.len()
        + gfa.name_data.len()
        + gfa.optional_data.len()
        + gfa.line_order.len();

    size_of::<TOC>() + record_bytes + blob_bytes
}
42 changes: 24 additions & 18 deletions polbin/src/flatgfa.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
use bstr::{BStr, BString};
use num_enum::{IntoPrimitive, TryFromPrimitive};
use zerocopy::{FromBytes, FromZeroes};
use zerocopy::{AsBytes, FromBytes, FromZeroes};

/// An efficient flattened representation of a GFA file.
///
Expand Down Expand Up @@ -65,22 +65,23 @@ pub struct FlatGFA<'a> {
/// useful for creating new ones from scratch.
#[derive(Default)]
pub struct FlatGFAStore {
header: BString,
segs: Vec<Segment>,
paths: Vec<Path>,
links: Vec<Link>,
steps: Vec<Handle>,
seq_data: Vec<u8>,
overlaps: Vec<Span>,
alignment: Vec<AlignOp>,
name_data: BString,
optional_data: BString,
line_order: Vec<u8>,
pub header: BString,
pub segs: Vec<Segment>,
pub paths: Vec<Path>,
pub links: Vec<Link>,
pub steps: Vec<Handle>,
pub seq_data: Vec<u8>,
pub overlaps: Vec<Span>,
pub alignment: Vec<AlignOp>,
pub name_data: BString,
pub optional_data: BString,
pub line_order: Vec<u8>,
}

/// GFA graphs consist of "segment" nodes, which are fragments of base-pair sequences
/// that can be strung together into paths.
#[derive(Debug, FromZeroes, FromBytes)]
#[derive(Debug, FromZeroes, FromBytes, AsBytes, Clone, Copy)]
#[repr(packed)]
pub struct Segment {
/// The segment's name. We assume all names are just plain numbers.
pub name: usize,
Expand All @@ -93,7 +94,8 @@ pub struct Segment {
}

/// A path is a sequence of oriented references to segments.
#[derive(Debug, FromZeroes, FromBytes)]
#[derive(Debug, FromZeroes, FromBytes, AsBytes, Clone, Copy)]
#[repr(packed)]
pub struct Path {
/// The path's name. This can be an arbitrary string. It is a range in the
/// `name_data` pool.
Expand All @@ -108,7 +110,8 @@ pub struct Path {
}

/// An allowed edge between two oriented segments.
#[derive(Debug, FromBytes, FromZeroes)]
#[derive(Debug, FromBytes, FromZeroes, AsBytes, Clone, Copy)]
#[repr(packed)]
pub struct Link {
/// The source of the edge.
pub from: Handle,
Expand All @@ -134,7 +137,8 @@ pub enum Orientation {
/// A Handle refers to the forward (+) or backward (-) orientation for a given segment.
/// So, logically, it consists of a pair of a segment reference (usize) and an
/// orientation (1 bit). We pack the two values into a single word.
#[derive(Debug, FromBytes, FromZeroes)]
#[derive(Debug, FromBytes, FromZeroes, AsBytes, Clone, Copy)]
#[repr(packed)]
pub struct Handle(usize);

impl Handle {
Expand Down Expand Up @@ -171,7 +175,8 @@ pub enum AlignOpcode {
///
/// Logically, this is a pair of a number and an `AlignOpcode`. We pack the two
/// into a single u32.
#[derive(Debug, FromZeroes, FromBytes)]
#[derive(Debug, FromZeroes, FromBytes, AsBytes)]
#[repr(packed)]
pub struct AlignOp(u32);

impl AlignOp {
Expand Down Expand Up @@ -216,7 +221,8 @@ pub enum LineKind {
///
/// TODO: Consider smaller indices for this, and possibly base/offset instead
/// of start/end.
#[derive(Debug, FromZeroes, FromBytes)]
#[derive(Debug, FromZeroes, FromBytes, AsBytes, Clone, Copy)]
#[repr(packed)]
pub struct Span {
pub start: usize,
pub end: usize,
Expand Down
61 changes: 50 additions & 11 deletions polbin/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,23 +2,62 @@ mod file;
mod flatgfa;
mod parse;
mod print;
use memmap::Mmap;
use argh::FromArgs;
use memmap::{Mmap, MmapMut};

/// Open the file called `name` and memory-map its contents.
///
/// # Panics
///
/// Panics if the file cannot be opened or the mapping cannot be created.
fn map_file(name: &str) -> Mmap {
    let handle = std::fs::File::open(name).unwrap();
    // SAFETY (assumed, TODO confirm): relies on nothing else modifying or
    // truncating the file while the mapping is alive.
    let mapping = unsafe { Mmap::map(&handle) };
    mapping.unwrap()
}

/// Open (creating it if needed) the file called `name`, set its length to
/// `size` bytes, and return a writable memory map over it.
///
/// # Panics
///
/// Panics if the file cannot be opened, resized, or mapped.
fn map_file_mut(name: &str, size: u64) -> MmapMut {
    let mut options = std::fs::OpenOptions::new();
    options.read(true).write(true).create(true);
    let handle = options.open(name).unwrap();

    // Set the length before mapping so the map spans exactly `size` bytes.
    handle.set_len(size).unwrap();

    // SAFETY (assumed, TODO confirm): relies on nothing else modifying or
    // truncating the file while the mapping is alive.
    let mapping = unsafe { MmapMut::map_mut(&handle) };
    mapping.unwrap()
}

// Command-line arguments, parsed in `main` with `argh::from_env()`.
// NOTE(review): the `///` comments below are presumably consumed by argh as
// the `--help` text, so treat them as user-facing strings — confirm before
// rewording them.
#[derive(FromArgs)]
/// Convert between GFA text and FlatGFA binary formats.
struct PolBin {
// When absent, `main` parses GFA text from stdin instead.
/// read from a binary FlatGFA file
#[argh(option, short = 'i')]
input: Option<String>,

// When absent, `main` prints the graph via `print::print` instead.
/// write to a binary FlatGFA file
#[argh(option, short = 'o')]
output: Option<String>,
}

fn main() {
// Read either GFA text from stdin or a binary file from the first argument.
if let Some(name) = std::env::args().nth(1) {
let mmap = map_file(&name);
let gfa = file::view(&mmap);
print::print(&gfa);
} else {
let stdin = std::io::stdin();
let store = parse::Parser::parse(stdin.lock());
let gfa = store.view();
print::print(&gfa);
let args: PolBin = argh::from_env();

// Load the input from a file (binary) or stdin (text).
let mmap;
let store;
let gfa = match args.input {
Some(name) => {
mmap = map_file(&name);
file::view(&mmap)
}
None => {
let stdin = std::io::stdin();
store = parse::Parser::parse(stdin.lock());
store.view()
}
};

// Write the output to a file (binary) or stdout (text).
match args.output {
Some(name) => {
let mut mmap = map_file_mut(&name, file::size(&gfa) as u64);
file::dump(&gfa, &mut mmap);
mmap.flush().unwrap();
}
None => print::print(&gfa),
}
}
20 changes: 13 additions & 7 deletions polbin/src/print.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,9 @@ impl<'a> fmt::Display for flatgfa::Alignment<'a> {
}

fn print_step(gfa: &flatgfa::FlatGFA, handle: &flatgfa::Handle) {
let seg = &gfa.segs[handle.segment()];
print!("{}{}", seg.name, handle.orient());
let seg = gfa.segs[handle.segment()];
let name = seg.name;
print!("{}{}", name, handle.orient());
}

fn print_path(gfa: &flatgfa::FlatGFA, path: &flatgfa::Path) {
Expand All @@ -57,18 +58,23 @@ fn print_path(gfa: &flatgfa::FlatGFA, path: &flatgfa::Path) {
}

fn print_link(gfa: &flatgfa::FlatGFA, link: &flatgfa::Link) {
let from = link.from;
let from_name = gfa.segs[from.segment()].name;
let to = link.to;
let to_name = gfa.segs[to.segment()].name;
println!(
"L\t{}\t{}\t{}\t{}\t{}",
gfa.segs[link.from.segment()].name,
link.from.orient(),
gfa.segs[link.to.segment()].name,
link.to.orient(),
from_name,
from.orient(),
to_name,
to.orient(),
gfa.get_alignment(&link.overlap)
);
}

fn print_seg(gfa: &flatgfa::FlatGFA, seg: &flatgfa::Segment) {
print!("S\t{}\t{}", seg.name, gfa.get_seq(seg));
let name = seg.name;
print!("S\t{}\t{}", name, gfa.get_seq(seg));
if !seg.optional.is_empty() {
print!("\t{}", gfa.get_optional_data(seg));
}
Expand Down
6 changes: 5 additions & 1 deletion tests/turnt.toml
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,10 @@ binary = true
command = "pollen_data_gen simple {filename} | jq .depth"
output.json = "-"

[envs.polbin_roundtrip]
[envs.polbin_mem]
command = "../polbin/target/debug/polbin < {filename}"
output.gfa = "-"

[envs.polbin_file]
command = "../polbin/target/debug/polbin < {filename} -o {base}.flatgfa ; ../polbin/target/debug/polbin -i {base}.flatgfa"
output.gfa = "-"
Loading