From 1f294c6f76fd4b17a694735601ef183b2c16d517 Mon Sep 17 00:00:00 2001 From: Adrian Sampson Date: Mon, 11 Mar 2024 15:59:33 -0400 Subject: [PATCH 1/7] Unalign all the things?? Maybe I went too far. But it does work! --- polbin/src/flatgfa.rs | 42 ++++++++++++++++++++++++------------------ polbin/src/print.rs | 20 +++++++++++++------- 2 files changed, 37 insertions(+), 25 deletions(-) diff --git a/polbin/src/flatgfa.rs b/polbin/src/flatgfa.rs index 0120686b..eebd7687 100644 --- a/polbin/src/flatgfa.rs +++ b/polbin/src/flatgfa.rs @@ -1,6 +1,6 @@ use bstr::{BStr, BString}; use num_enum::{IntoPrimitive, TryFromPrimitive}; -use zerocopy::{FromBytes, FromZeroes}; +use zerocopy::{AsBytes, FromBytes, FromZeroes}; /// An efficient flattened representation of a GFA file. /// @@ -65,22 +65,23 @@ pub struct FlatGFA<'a> { /// useful for creating new ones from scratch. #[derive(Default)] pub struct FlatGFAStore { - header: BString, - segs: Vec, - paths: Vec, - links: Vec, - steps: Vec, - seq_data: Vec, - overlaps: Vec, - alignment: Vec, - name_data: BString, - optional_data: BString, - line_order: Vec, + pub header: BString, + pub segs: Vec, + pub paths: Vec, + pub links: Vec, + pub steps: Vec, + pub seq_data: Vec, + pub overlaps: Vec, + pub alignment: Vec, + pub name_data: BString, + pub optional_data: BString, + pub line_order: Vec, } /// GFA graphs consist of "segment" nodes, which are fragments of base-pair sequences /// that can be strung together into paths. -#[derive(Debug, FromZeroes, FromBytes)] +#[derive(Debug, FromZeroes, FromBytes, AsBytes, Clone, Copy)] +#[repr(packed)] pub struct Segment { /// The segment's name. We assume all names are just plain numbers. pub name: usize, @@ -93,7 +94,8 @@ pub struct Segment { } /// A path is a sequence of oriented references to segments. -#[derive(Debug, FromZeroes, FromBytes)] +#[derive(Debug, FromZeroes, FromBytes, AsBytes, Clone, Copy)] +#[repr(packed)] pub struct Path { /// The path's name. This can be an arbitrary string. It is a renge in the /// `name_data` pool. @@ -108,7 +110,8 @@ pub struct Path { } /// An allowed edge between two oriented segments. -#[derive(Debug, FromBytes, FromZeroes)] +#[derive(Debug, FromBytes, FromZeroes, AsBytes, Clone, Copy)] +#[repr(packed)] pub struct Link { /// The source of the edge. pub from: Handle, @@ -134,7 +137,8 @@ pub enum Orientation { /// A Handle refers to the forward (+) or backward (-) orientation for a given segment. /// So, logically, it consists of a pair of a segment reference (usize) and an /// orientation (1 bit). We pack the two values into a single word. -#[derive(Debug, FromBytes, FromZeroes)] +#[derive(Debug, FromBytes, FromZeroes, AsBytes, Clone, Copy)] +#[repr(transparent)] pub struct Handle(usize); impl Handle { @@ -171,7 +175,8 @@ pub enum AlignOpcode { /// /// Logically, this is a pair of a number and an `AlignOpcode`. We pack the two /// into a single u32. -#[derive(Debug, FromZeroes, FromBytes)] +#[derive(Debug, FromZeroes, FromBytes, AsBytes)] +#[repr(transparent)] pub struct AlignOp(u32); impl AlignOp { @@ -216,7 +221,8 @@ pub enum LineKind { /// /// TODO: Consider smaller indices for this, and possibly base/offset instead /// of start/end. -#[derive(Debug, FromZeroes, FromBytes)] +#[derive(Debug, FromZeroes, FromBytes, AsBytes, Clone, Copy)] +#[repr(packed)] pub struct Span { pub start: usize, pub end: usize, diff --git a/polbin/src/print.rs b/polbin/src/print.rs index 1056b198..96a0103b 100644 --- a/polbin/src/print.rs +++ b/polbin/src/print.rs @@ -31,8 +31,9 @@ impl<'a> fmt::Display for flatgfa::Alignment<'a> { } fn print_step(gfa: &flatgfa::FlatGFA, handle: &flatgfa::Handle) { - let seg = &gfa.segs[handle.segment()]; - print!("{}{}", seg.name, handle.orient()); + let seg = gfa.segs[handle.segment()]; + let name = seg.name; + print!("{}{}", name, handle.orient()); } fn print_path(gfa: &flatgfa::FlatGFA, path: &flatgfa::Path) { @@ -57,18 +58,23 @@ fn print_path(gfa: &flatgfa::FlatGFA, path: &flatgfa::Path) { } fn print_link(gfa: &flatgfa::FlatGFA, link: &flatgfa::Link) { + let from = link.from; + let from_name = gfa.segs[from.segment()].name; + let to = link.to; + let to_name = gfa.segs[to.segment()].name; println!( "L\t{}\t{}\t{}\t{}\t{}", - gfa.segs[link.from.segment()].name, - link.from.orient(), - gfa.segs[link.to.segment()].name, - link.to.orient(), + from_name, + from.orient(), + to_name, + to.orient(), gfa.get_alignment(&link.overlap) ); } fn print_seg(gfa: &flatgfa::FlatGFA, seg: &flatgfa::Segment) { - print!("S\t{}\t{}", seg.name, gfa.get_seq(seg)); + let name = seg.name; + print!("S\t{}\t{}", name, gfa.get_seq(seg)); if !seg.optional.is_empty() { print!("\t{}", gfa.get_optional_data(seg)); } From b2906b5059a63ab157bddefb3ed3dafa8de22fe6 Mon Sep 17 00:00:00 2001 From: Adrian Sampson Date: Mon, 11 Mar 2024 16:28:19 -0400 Subject: [PATCH 2/7] Actually try writing a file --- polbin/src/file.rs | 54 +++++++++++++++++++++++++++++++++++++++++++--- polbin/src/main.rs | 17 ++++++++++++++- 2 files changed, 67 insertions(+), 4 deletions(-) diff --git a/polbin/src/file.rs b/polbin/src/file.rs index 761b402d..9f346fb1 100644 --- a/polbin/src/file.rs +++ b/polbin/src/file.rs @@ -1,9 +1,10 @@ use crate::flatgfa; -use zerocopy::{FromBytes, FromZeroes}; +use zerocopy::{AsBytes, FromBytes, FromZeroes}; const MAGIC_NUMBER: usize = 0x1337_4915; -#[derive(FromBytes, FromZeroes)] +#[derive(FromBytes, FromZeroes, AsBytes)] +#[repr(packed)] struct TOC { magic: usize, header_len: usize, @@ -25,11 +26,13 @@ fn get_prefix(data: &[u8], len: usize) -> (&[u8], &[u8]) { (&data[0..len], &data[len..]) } +/// Get a FlatGFA backed by the data in a byte buffer. pub fn view(data: &[u8]) -> flatgfa::FlatGFA { // Table of contents. let toc = TOC::ref_from_prefix(data).unwrap(); let rest = &data[std::mem::size_of::()..]; - assert_eq!(toc.magic, MAGIC_NUMBER); + let magic = toc.magic; + assert_eq!(magic, MAGIC_NUMBER); // Get slices for each chunk. let (header, rest) = get_prefix(rest, toc.header_len); @@ -58,3 +61,48 @@ pub fn view(data: &[u8]) -> flatgfa::FlatGFA { line_order, } } + +fn write_bump<'a, 'b, T: AsBytes + ?Sized>(buf: &'a mut [u8], data: &'b T) -> Option<&'a mut [u8]> { + let len = std::mem::size_of_val(data); + data.write_to_prefix(buf)?; + Some(&mut buf[len..]) +} + +fn write_bytes<'a, 'b>(buf: &'a mut [u8], data: &'b [u8]) -> Option<&'a mut [u8]> { + let len = data.len(); + buf[0..len].copy_from_slice(data); + Some(&mut buf[len..]) +} + +/// Copy a FlatGFA into a byte buffer. +pub fn dump(gfa: &flatgfa::FlatGFA, buf: &mut [u8]) { + // Table of contents. + let toc = TOC { + magic: MAGIC_NUMBER, + header_len: gfa.header.len(), + segs_count: gfa.segs.len(), + paths_count: gfa.paths.len(), + links_count: gfa.links.len(), + steps_count: gfa.steps.len(), + seq_data_len: gfa.seq_data.len(), + overlaps_count: gfa.overlaps.len(), + alignment_count: gfa.alignment.len(), + name_data_len: gfa.name_data.len(), + optional_data_len: gfa.optional_data.len(), + line_order_len: gfa.line_order.len(), + }; + let rest = write_bump(buf, &toc).unwrap(); + + // All the slices. + let rest = write_bytes(rest, gfa.header).unwrap(); + let rest = write_bump(rest, gfa.segs).unwrap(); + let rest = write_bump(rest, gfa.paths).unwrap(); + let rest = write_bump(rest, gfa.links).unwrap(); + let rest = write_bump(rest, gfa.steps).unwrap(); + let rest = write_bytes(rest, gfa.seq_data).unwrap(); + let rest = write_bump(rest, gfa.overlaps).unwrap(); + let rest = write_bump(rest, gfa.alignment).unwrap(); + let rest = write_bytes(rest, gfa.name_data).unwrap(); + let rest = write_bytes(rest, gfa.optional_data).unwrap(); + write_bytes(rest, gfa.line_order).unwrap(); +} diff --git a/polbin/src/main.rs b/polbin/src/main.rs index c4428348..904de44b 100644 --- a/polbin/src/main.rs +++ b/polbin/src/main.rs @@ -2,13 +2,24 @@ mod file; mod flatgfa; mod parse; mod print; -use memmap::Mmap; +use memmap::{Mmap, MmapMut}; fn map_file(name: &str) -> Mmap { let file = std::fs::File::open(name).unwrap(); unsafe { Mmap::map(&file) }.unwrap() } +fn map_file_mut(name: &str) -> MmapMut { + let file = std::fs::OpenOptions::new() + .read(true) + .write(true) + .create(true) + .open(name) + .unwrap(); + file.set_len(8092).unwrap(); // TODO Estimate the size? + unsafe { MmapMut::map_mut(&file) }.unwrap() +} + fn main() { // Read either GFA text from stdin or a binary file from the first argument. if let Some(name) = std::env::args().nth(1) { @@ -20,5 +31,9 @@ fn main() { let store = parse::Parser::parse(stdin.lock()); let gfa = store.view(); print::print(&gfa); + + // TODO Just try dumping to a file. + let mut mmap = map_file_mut("hello.flatgfa"); + file::dump(&gfa, &mut mmap); } } From 7233dc81bfb1e4e055736778fa86f5087175a2d5 Mon Sep 17 00:00:00 2001 From: Adrian Sampson Date: Mon, 11 Mar 2024 18:52:12 -0400 Subject: [PATCH 3/7] Proper CLI option parsing --- polbin/Cargo.lock | 32 ++++++++++++++++++++++++++++++ polbin/Cargo.toml | 1 + polbin/src/main.rs | 49 ++++++++++++++++++++++++++++++++++------------ 3 files changed, 69 insertions(+), 13 deletions(-) diff --git a/polbin/Cargo.lock b/polbin/Cargo.lock index b4b3a24f..a2620a1e 100644 --- a/polbin/Cargo.lock +++ b/polbin/Cargo.lock @@ -17,6 +17,37 @@ version = "1.0.80" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5ad32ce52e4161730f7098c077cd2ed6229b5804ccf99e5366be1ab72a98b4e1" +[[package]] +name = "argh" +version = "0.1.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7af5ba06967ff7214ce4c7419c7d185be7ecd6cc4965a8f6e1d8ce0398aad219" +dependencies = [ + "argh_derive", + "argh_shared", +] + +[[package]] +name = "argh_derive" +version = "0.1.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "56df0aeedf6b7a2fc67d06db35b09684c3e8da0c95f8f27685cb17e08413d87a" +dependencies = [ + "argh_shared", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "argh_shared" +version = "0.1.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5693f39141bda5760ecc4111ab08da40565d1771038c4a0250f03457ec707531" +dependencies = [ + "serde", +] + [[package]] name = "arrayvec" version = "0.5.2" @@ -202,6 +233,7 @@ dependencies = [ name = "polbin" version = "0.1.0" dependencies = [ + "argh", "bstr 1.9.1", "gfa", "memmap", diff --git a/polbin/Cargo.toml b/polbin/Cargo.toml index 60aae942..74cb68af 100644 --- a/polbin/Cargo.toml +++ b/polbin/Cargo.toml @@ -4,6 +4,7 @@ version = "0.1.0" edition = "2021" [dependencies] +argh = "0.1.12" bstr = "1.9.1" gfa = "0.10.1" memmap = "0.7.0" diff --git a/polbin/src/main.rs b/polbin/src/main.rs index 904de44b..a3b8350a 100644 --- a/polbin/src/main.rs +++ b/polbin/src/main.rs @@ -2,6 +2,7 @@ mod file; mod flatgfa; mod parse; mod print; +use argh::FromArgs; use memmap::{Mmap, MmapMut}; fn map_file(name: &str) -> Mmap { @@ -20,20 +21,42 @@ fn map_file_mut(name: &str) -> MmapMut { unsafe { MmapMut::map_mut(&file) }.unwrap() } +#[derive(FromArgs)] +/// Convert between GFA text and FlatGFA binary formats. +struct PolBin { + /// read from a binary FlatGFA file + #[argh(option, short = 'i')] + input: Option, + + /// write to a binary FlatGFA file + #[argh(option, short = 'o')] + output: Option, +} + fn main() { - // Read either GFA text from stdin or a binary file from the first argument. - if let Some(name) = std::env::args().nth(1) { - let mmap = map_file(&name); - let gfa = file::view(&mmap); - print::print(&gfa); - } else { - let stdin = std::io::stdin(); - let store = parse::Parser::parse(stdin.lock()); - let gfa = store.view(); - print::print(&gfa); + let args: PolBin = argh::from_env(); + + // Load the input from a file (binary) or stdin (text). + let mmap; + let store; + let gfa = match args.input { + Some(name) => { + mmap = map_file(&name); + file::view(&mmap) + } + None => { + let stdin = std::io::stdin(); + store = parse::Parser::parse(stdin.lock()); + store.view() + } + }; - // TODO Just try dumping to a file. - let mut mmap = map_file_mut("hello.flatgfa"); - file::dump(&gfa, &mut mmap); + // Write the output to a file (binary) or stdout (text). + match args.output { + Some(name) => { + let mut mmap = map_file_mut(&name); + file::dump(&gfa, &mut mmap); + } + None => print::print(&gfa), } } From 9de963074fced5975b3bdda4d87932a7312d7570 Mon Sep 17 00:00:00 2001 From: Adrian Sampson Date: Mon, 11 Mar 2024 18:58:51 -0400 Subject: [PATCH 4/7] Compute size for output files --- polbin/src/file.rs | 22 ++++++++++++++++++++-- polbin/src/main.rs | 7 ++++--- 2 files changed, 24 insertions(+), 5 deletions(-) diff --git a/polbin/src/file.rs b/polbin/src/file.rs index 9f346fb1..2f7e8a2e 100644 --- a/polbin/src/file.rs +++ b/polbin/src/file.rs @@ -1,4 +1,5 @@ use crate::flatgfa; +use std::mem::{size_of, size_of_val}; use zerocopy::{AsBytes, FromBytes, FromZeroes}; const MAGIC_NUMBER: usize = 0x1337_4915; @@ -30,7 +31,7 @@ fn get_prefix(data: &[u8], len: usize) -> (&[u8], &[u8]) { pub fn view(data: &[u8]) -> flatgfa::FlatGFA { // Table of contents. let toc = TOC::ref_from_prefix(data).unwrap(); - let rest = &data[std::mem::size_of::()..]; + let rest = &data[size_of::()..]; let magic = toc.magic; assert_eq!(magic, MAGIC_NUMBER); @@ -63,7 +64,7 @@ pub fn view(data: &[u8]) -> flatgfa::FlatGFA { } fn write_bump<'a, 'b, T: AsBytes + ?Sized>(buf: &'a mut [u8], data: &'b T) -> Option<&'a mut [u8]> { - let len = std::mem::size_of_val(data); + let len = size_of_val(data); data.write_to_prefix(buf)?; Some(&mut buf[len..]) } @@ -106,3 +107,20 @@ pub fn dump(gfa: &flatgfa::FlatGFA, buf: &mut [u8]) { let rest = write_bytes(rest, gfa.optional_data).unwrap(); write_bytes(rest, gfa.line_order).unwrap(); } + +/// Get the total size in bytes of a FlatGFA structure. This should result in a big +/// enough buffer to write the entire FlatGFA into with `dump`. +pub fn size(gfa: &flatgfa::FlatGFA) -> usize { + size_of::() + + gfa.header.len() + + gfa.segs.len() * size_of::() + + gfa.paths.len() * size_of::() + + gfa.links.len() * size_of::() + + gfa.steps.len() * size_of::() + + gfa.seq_data.len() + + gfa.overlaps.len() * size_of::() + + gfa.alignment.len() * size_of::() + + gfa.name_data.len() + + gfa.optional_data.len() + + gfa.line_order.len() +} diff --git a/polbin/src/main.rs b/polbin/src/main.rs index a3b8350a..ff1652f4 100644 --- a/polbin/src/main.rs +++ b/polbin/src/main.rs @@ -10,14 +10,14 @@ fn map_file(name: &str) -> Mmap { unsafe { Mmap::map(&file) }.unwrap() } -fn map_file_mut(name: &str) -> MmapMut { +fn map_file_mut(name: &str, size: u64) -> MmapMut { let file = std::fs::OpenOptions::new() .read(true) .write(true) .create(true) .open(name) .unwrap(); - file.set_len(8092).unwrap(); // TODO Estimate the size? + file.set_len(size).unwrap(); unsafe { MmapMut::map_mut(&file) }.unwrap() } @@ -54,8 +54,9 @@ fn main() { // Write the output to a file (binary) or stdout (text). match args.output { Some(name) => { - let mut mmap = map_file_mut(&name); + let mut mmap = map_file_mut(&name, file::size(&gfa) as u64); file::dump(&gfa, &mut mmap); + mmap.flush().unwrap(); } None => print::print(&gfa), } From 7a289827afcbe57cf00134a353966806a203474a Mon Sep 17 00:00:00 2001 From: Adrian Sampson Date: Mon, 11 Mar 2024 19:01:18 -0400 Subject: [PATCH 5/7] Pick a new magic number --- polbin/src/file.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/polbin/src/file.rs b/polbin/src/file.rs index 2f7e8a2e..253666a1 100644 --- a/polbin/src/file.rs +++ b/polbin/src/file.rs @@ -2,12 +2,12 @@ use crate::flatgfa; use std::mem::{size_of, size_of_val}; use zerocopy::{AsBytes, FromBytes, FromZeroes}; -const MAGIC_NUMBER: usize = 0x1337_4915; +const MAGIC_NUMBER: u64 = 0xB101_1054; #[derive(FromBytes, FromZeroes, AsBytes)] #[repr(packed)] struct TOC { - magic: usize, + magic: u64, header_len: usize, segs_count: usize, paths_count: usize, From 468e3d32ca8a004461c684d512f52181d91f8f96 Mon Sep 17 00:00:00 2001 From: Adrian Sampson Date: Mon, 11 Mar 2024 19:09:56 -0400 Subject: [PATCH 6/7] No alignment for packed words `slice_from_prefix` returns None when the bytes are not aligned. Maybe we should consider padding things in the binary format so that these things can be aligned in the file!!! --- polbin/src/flatgfa.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/polbin/src/flatgfa.rs b/polbin/src/flatgfa.rs index eebd7687..c33b3a81 100644 --- a/polbin/src/flatgfa.rs +++ b/polbin/src/flatgfa.rs @@ -138,7 +138,7 @@ pub enum Orientation { /// So, logically, it consists of a pair of a segment reference (usize) and an /// orientation (1 bit). We pack the two values into a single word. #[derive(Debug, FromBytes, FromZeroes, AsBytes, Clone, Copy)] -#[repr(transparent)] +#[repr(packed)] pub struct Handle(usize); impl Handle { @@ -176,7 +176,7 @@ pub enum AlignOpcode { /// Logically, this is a pair of a number and an `AlignOpcode`. We pack the two /// into a single u32. #[derive(Debug, FromZeroes, FromBytes, AsBytes)] -#[repr(transparent)] +#[repr(packed)] pub struct AlignOp(u32); impl AlignOp { From 1793f8def7fbf51d5cf20510cfb2005afe460f08 Mon Sep 17 00:00:00 2001 From: Adrian Sampson Date: Mon, 11 Mar 2024 19:12:22 -0400 Subject: [PATCH 7/7] Test round-trip through file --- tests/turnt.toml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/turnt.toml b/tests/turnt.toml index 5450ad4f..c501b3eb 100644 --- a/tests/turnt.toml +++ b/tests/turnt.toml @@ -159,6 +159,10 @@ binary = true command = "pollen_data_gen simple {filename} | jq .depth" output.json = "-" -[envs.polbin_roundtrip] +[envs.polbin_mem] command = "../polbin/target/debug/polbin < {filename}" output.gfa = "-" + +[envs.polbin_file] +command = "../polbin/target/debug/polbin < {filename} -o {base}.flatgfa ; ../polbin/target/debug/polbin -i {base}.flatgfa" +output.gfa = "-"