Skip to content

Commit

Permalink
Experimental Flatbuffer index
Browse files Browse the repository at this point in the history
  • Loading branch information
sourcefrog committed Feb 6, 2023
1 parent 3fa1459 commit 3eabe34
Show file tree
Hide file tree
Showing 9 changed files with 854 additions and 3 deletions.
11 changes: 11 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ tracing-appender = "0.2"
unix_mode = "0.1"
url = "2.2.2"
indoc = "1.0.8"
flatbuffers = "23.1.21"

[target.'cfg(unix)'.dependencies]
users = "0.11"
Expand Down
33 changes: 33 additions & 0 deletions flatbuffers/index.fbs
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
// Flatbuffers IDL for Conserve index.
//
// NOTE(review): field order and defaults are wire-format-significant in
// flatbuffers; do not reorder or renumber existing fields.

namespace Conserve.Index;

// The kind of filesystem object an Entry describes.
enum Kind:byte {
    File,
    Dir,
    Symlink,
}

// Address of one extent of file content: a block (by hash) plus the
// byte range within that block.
table Addr {
    hash: [ubyte];  // raw block hash bytes
    start: uint64;  // offset of this extent within the block
    len: uint64;    // length of this extent in bytes
}

// One entry (file, directory, or symlink) in the index.
table Entry {
    apath:string;   // archive-relative path
    kind:Kind;
    target:string; // only for kind == Symlink
    mtime:int64;        // seconds since the Unix epoch
    mtime_nanos:uint32; // fractional part of mtime
    unix_mode:uint32 = 0xffffffff; // default = unset
    addrs: [Addr];  // file content extents, in order; empty for non-files
    user: string;   // owning user name, if known
    group: string;  // owning group name, if known
}

// Root table: the whole index is a flat list of entries.
table Index {
    entries:[Entry];
}

root_type Index;
27 changes: 24 additions & 3 deletions src/bin/conserve.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
//! Command-line entry point for Conserve backups.
use std::error::Error;
use std::fs::OpenOptions;
use std::io::{BufWriter, Write};
use std::path::PathBuf;
use std::process::ExitCode;
Expand Down Expand Up @@ -246,14 +247,25 @@ enum Debug {
backup: Option<BandId>,
},

WriteIndexFlatbuf {
archive: String,
out: PathBuf,
},

/// List all blocks.
Blocks { archive: String },
Blocks {
archive: String,
},

/// List all blocks referenced by any band.
Referenced { archive: String },
Referenced {
archive: String,
},

/// List garbage blocks referenced by no band.
Unreferenced { archive: String },
Unreferenced {
archive: String,
},
}

impl Command {
Expand Down Expand Up @@ -294,6 +306,15 @@ impl Command {
let st = stored_tree_from_opt(archive, backup)?;
show::show_index_json(st.band(), &mut stdout)?;
}
Command::Debug(Debug::WriteIndexFlatbuf { archive, out }) => {
let st = stored_tree_from_opt(archive, &None)?;
let out_file = OpenOptions::new()
.write(true)
.create(true)
.truncate(true)
.open(out)?;
conserve::fbs::write_index(&st, out_file)?;
}
Command::Debug(Debug::Referenced { archive }) => {
let mut bw = BufWriter::new(stdout);
let archive = Archive::open(open_transport(archive)?)?;
Expand Down
6 changes: 6 additions & 0 deletions src/blockhash.rs
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,12 @@ pub struct BlockHash {
bin: [u8; BLAKE_HASH_SIZE_BYTES],
}

impl BlockHash {
    /// Borrow the raw hash bytes backing this block hash.
    pub fn as_slice(&self) -> &[u8] {
        &self.bin[..]
    }
}

#[derive(Debug)]
pub struct BlockHashParseError {
rejected_string: String,
Expand Down
120 changes: 120 additions & 0 deletions src/fbs.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
//! Experimental support for storing indexes as flatbuffers.
#[allow(dead_code, unused_imports, clippy::all)]
pub(crate) mod index_generated;

use std::collections::HashMap;
use std::{fs::File, io::Write};

use tracing::{debug, trace};

use crate::*;

use index_generated::conserve::index as gen;

/// Serialize the index of a stored tree into `out_file` as a flatbuffer.
///
/// Block hashes and user/group names are interned, so each distinct value
/// is written to the buffer only once and referenced by offset thereafter.
///
/// # Errors
///
/// Returns an error if the tree's entries cannot be iterated or if writing
/// the serialized buffer to `out_file` fails.
pub fn write_index(st: &StoredTree, mut out_file: File) -> Result<()> {
    let all_entries: Vec<_> = st
        .iter_entries(Apath::root(), Exclude::nothing())?
        .collect();
    debug!("Loaded {} entries", all_entries.len());

    // Map from hash to serialized location, so that hashes are stored only once.
    let mut hash_to_fb: HashMap<BlockHash, _> = HashMap::new();
    // Likewise, user/group names are stored only once each.
    let mut name_to_fb: HashMap<String, _> = HashMap::new();

    // TODO: Possibly, we should have the serialized layout have all the apaths together,
    // all the hashes, all the user/group names, and then all the structs. That seems
    // possible and would probably help bytewise compression.

    let mut builder = flatbuffers::FlatBufferBuilder::with_capacity(200 * all_entries.len());
    trace!("Allocated builder");
    let fb_entries: Vec<_> = all_entries
        .into_iter()
        .map(|entry| {
            let addrs = entry
                .addrs
                .iter()
                .map(|addr| {
                    // Reuse the offset of an already-serialized hash if we have one.
                    let hash = *hash_to_fb
                        .entry(addr.hash.clone())
                        .or_insert_with(|| builder.create_vector(addr.hash.as_slice()));
                    gen::Addr::create(
                        &mut builder,
                        &gen::AddrArgs {
                            hash: Some(hash),
                            start: addr.start,
                            len: addr.len,
                        },
                    )
                })
                .collect::<Vec<_>>();
            // Omit the vector entirely for entries with no content (dirs, symlinks).
            let addrs = if addrs.is_empty() {
                None
            } else {
                Some(builder.create_vector(&addrs))
            };
            let user = entry.owner.user.as_ref().map(|user| {
                *name_to_fb
                    .entry(user.to_owned())
                    .or_insert_with(|| builder.create_string(user))
            });
            let group = entry.owner.group.as_ref().map(|group| {
                *name_to_fb
                    .entry(group.to_owned())
                    .or_insert_with(|| builder.create_string(group))
            });
            let apath = Some(builder.create_string(entry.apath()));
            let target = entry
                .target
                .as_ref()
                .map(|target| builder.create_string(target));
            gen::Entry::create(
                &mut builder,
                &gen::EntryArgs {
                    apath,
                    addrs,
                    kind: entry.kind().into(),
                    // u32::MAX is the schema's sentinel for "mode unset".
                    unix_mode: entry.unix_mode.as_u32().unwrap_or(u32::MAX),
                    target,
                    mtime: entry.mtime,
                    mtime_nanos: entry.mtime_nanos,
                    user,
                    group,
                },
            )
        })
        .collect();
    let n_entries = fb_entries.len();
    let fb_entries = builder.create_vector(&fb_entries);

    let index = gen::Index::create(
        &mut builder,
        &gen::IndexArgs {
            entries: Some(fb_entries),
        },
    );
    builder.finish(index, None);

    let buf = builder.finished_data();
    // Guard against division by zero when the tree produced no entries.
    let mean_size = buf.len() / n_entries.max(1);
    debug!(
        serialized_len = buf.len(),
        n_entries, mean_size, "serialized index to flatbuf"
    );
    out_file.write_all(buf)?;
    debug!("wrote to out file");
    Ok(())
}

impl From<Kind> for gen::Kind {
    /// Translate a Conserve entry kind into the generated flatbuffer enum.
    ///
    /// # Panics
    ///
    /// Panics for kinds that have no flatbuffer representation.
    fn from(value: Kind) -> Self {
        match value {
            Kind::File => gen::Kind::File,
            Kind::Dir => gen::Kind::Dir,
            Kind::Symlink => gen::Kind::Symlink,
            _ => panic!("Can't serialize kind {value:?} to flatbuffers"),
        }
    }
}
Loading

0 comments on commit 3eabe34

Please sign in to comment.