From 8ed97d41ee47467a4d7ecc17886a33aaf024a1cb Mon Sep 17 00:00:00 2001 From: Ruediger Klaehn Date: Fri, 9 Feb 2024 11:01:17 +0200 Subject: [PATCH 1/3] Add fn to validate a data/outboard pair completely This produces a stream since it is a costly op, so we use genawaiter --- Cargo.lock | 110 ++++++++++++++++++++++++++++++++++++++----- Cargo.toml | 4 +- src/io/fsm.rs | 126 ++++++++++++++++++++++++++++++++++++++++++++++++++ src/tests2.rs | 20 ++++++++ 4 files changed, 247 insertions(+), 13 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ac6dd56..92b8733 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -184,6 +184,7 @@ dependencies = [ "clap 4.4.5", "criterion", "futures", + "genawaiter", "hex", "iroh-blake3", "iroh-io", @@ -376,7 +377,7 @@ dependencies = [ "heck", "proc-macro2", "quote", - "syn", + "syn 2.0.32", ] [[package]] @@ -657,7 +658,7 @@ checksum = "89ca545a94061b6365f2c7355b4b32bd20df3ff95f02da9329b34ccc3bd6ee72" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.32", ] [[package]] @@ -690,6 +691,37 @@ dependencies = [ "slab", ] +[[package]] +name = "genawaiter" +version = "0.99.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c86bd0361bcbde39b13475e6e36cb24c329964aa2611be285289d1e4b751c1a0" +dependencies = [ + "futures-core", + "genawaiter-macro", + "genawaiter-proc-macro", + "proc-macro-hack", +] + +[[package]] +name = "genawaiter-macro" +version = "0.99.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b32dfe1fdfc0bbde1f22a5da25355514b5e450c33a6af6770884c8750aedfbc" + +[[package]] +name = "genawaiter-proc-macro" +version = "0.99.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "784f84eebc366e15251c4a8c3acee82a6a6f427949776ecb88377362a9621738" +dependencies = [ + "proc-macro-error", + "proc-macro-hack", + "proc-macro2", + "quote", + "syn 1.0.109", +] + [[package]] name = "generic-array" version = "0.14.7" @@ -1162,7 +1194,7 @@ checksum = "ec2e072ecce94ec471b13398d5402c188e76ac03cf74dd1a975161b23a3f6d9c" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.32", ] [[package]] @@ -1233,6 +1265,38 @@ version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" +[[package]] +name = "proc-macro-error" +version = "0.4.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "18f33027081eba0a6d8aba6d1b1c3a3be58cbb12106341c2d5759fcd9b5277e7" +dependencies = [ + "proc-macro-error-attr", + "proc-macro2", + "quote", + "syn 1.0.109", + "version_check", +] + +[[package]] +name = "proc-macro-error-attr" +version = "0.4.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a5b4b77fdb63c1eca72173d68d24501c54ab1269409f6b672c85deb18af69de" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", + "syn-mid", + "version_check", +] + +[[package]] +name = "proc-macro-hack" +version = "0.5.20+deprecated" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc375e1527247fe1a97d8b7156678dfe7c1af2fc075c9a4db3690ecd2a148068" + [[package]] name = "proc-macro2" version = "1.0.66" @@ -1376,7 +1440,7 @@ checksum = "7f7473c2cfcf90008193dd0e3e16599455cb601a9fce322b5bb55de799664925" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.32", ] [[package]] @@ -1519,7 +1583,7 @@ checksum = "4eca7ac642d82aa35b60049a6eccb4be6be75e599bd2e9adb5f875a737654af2" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.32", ] [[package]] @@ -1620,7 +1684,7 @@ dependencies = [ "proc-macro2", "quote", "structmeta-derive", - "syn", + "syn 2.0.32", ] [[package]] @@ -1631,7 +1695,7 @@ checksum = "a60bcaff7397072dca0017d1db428e30d5002e00b6847703e2e42005c95fbe00" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.32", ] [[package]] @@ -1640,6 +1704,17 @@ version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc" +[[package]] +name = "syn" +version = "1.0.109" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + [[package]] name = "syn" version = "2.0.32" @@ -1651,6 +1726,17 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "syn-mid" +version = "0.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fea305d57546cc8cd04feb14b62ec84bf17f50e3f7b12560d7bfa9265f39d9ed" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", +] + [[package]] name = "tempfile" version = "3.7.0" @@ -1673,7 +1759,7 @@ dependencies = [ "proc-macro2", "quote", "structmeta", - "syn", + "syn 2.0.32", ] [[package]] @@ -1699,7 +1785,7 @@ checksum = "090198534930841fab3a5d1bb637cde49e339654e606195f8d9c76eeb081dc96" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.32", ] [[package]] @@ -1755,7 +1841,7 @@ checksum = "630bdcf245f78637c13ec01ffae6187cca34625e8c63150d424b59e55af2675e" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.32", ] [[package]] @@ -2004,7 +2090,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn", + "syn 2.0.32", "wasm-bindgen-shared", ] @@ -2026,7 +2112,7 @@ checksum = "54681b18a46765f095758388f2d0cf16eb8d4169b639ab575a8f5693af210c7b" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.32", "wasm-bindgen-backend", "wasm-bindgen-shared", ] diff --git a/Cargo.toml b/Cargo.toml index 42e6be2..fc6622c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -22,10 +22,12 @@ futures = { version = "0.3", optional = true } self_cell = { version = "1" } iroh-io = { version = "0.3.0", features = ["tokio-io"], default_features = false, optional = true } positioned-io = { version = "0.3.1", default_features = false } +genawaiter = { version = "0.99.1", features = ["futures03"], optional = true } [features] tokio_fsm = ["tokio", "futures", "iroh-io"] -default = ["tokio_fsm"] +validate = ["genawaiter"] +default = ["tokio_fsm", "validate"] [dev-dependencies] hex = "0.4.3" diff --git a/src/io/fsm.rs b/src/io/fsm.rs index 0f4b9e6..42d2c35 100644 --- a/src/io/fsm.rs +++ b/src/io/fsm.rs @@ -738,3 +738,129 @@ where .await?; Ok(validator.res) } + +#[cfg(feature = "validate")] +mod validate { + use std::{io, ops::Range}; + + use futures::{future::LocalBoxFuture, FutureExt, Stream}; + use genawaiter::sync::{Co, Gen}; + use iroh_io::AsyncSliceReader; + + use crate::{ + blake3, hash_subtree, rec::truncate_ranges, split, BaoTree, ChunkNum, ChunkRangesRef, + TreeNode, + }; + + use super::Outboard; + + /// Given a data file and an outboard, compute all valid ranges. + /// + /// This is not cheap since it recomputes the hashes for all chunks. + /// + /// To reduce the amount of work, you can specify a range you are interested in. + pub fn valid_file_ranges( + outboard: O, + data: D, + ranges: &ChunkRangesRef, + ) -> impl Stream>> + '_ + where + O: Outboard + 'static, + D: AsyncSliceReader + 'static, + { + Gen::new(move |co| async move { + let tree = outboard.tree(); + let ranges = truncate_ranges(ranges, tree.size()); + let root_hash = outboard.root(); + let (shifted_root, shifted_filled_size) = tree.shifted(); + let mut validator = RecursiveDataValidator { + tree, + shifted_filled_size, + outboard, + data, + co: &co, + }; + + if let Err(cause) = validator + .validate_rec(&root_hash, shifted_root, true, ranges) + .await + { + co.yield_(Err(cause)).await; + } + }) + } + + struct RecursiveDataValidator<'a, O: Outboard, D: AsyncSliceReader> { + tree: BaoTree, + shifted_filled_size: TreeNode, + outboard: O, + data: D, + co: &'a Co>>, + } + + impl<'a, O: Outboard, D: AsyncSliceReader> RecursiveDataValidator<'a, O, D> { + fn validate_rec<'b>( + &'b mut self, + parent_hash: &'b blake3::Hash, + shifted: TreeNode, + is_root: bool, + ranges: &'b ChunkRangesRef, + ) -> LocalBoxFuture<'b, io::Result<()>> { + async move { + if ranges.is_empty() { + // this part of the tree is not of interest, so we can skip it + return Ok(()); + } + let node = shifted.subtract_block_size(self.tree.block_size.0); + let Some((l_hash, r_hash)) = self.outboard.load(node).await? else { + // outboard is incomplete, we can't validate + return Ok(()); + }; + let actual = blake3::guts::parent_cv(&l_hash, &r_hash, is_root); + if &actual != parent_hash { + // hash mismatch, we can't validate + return Ok(()); + }; + let (l_ranges, r_ranges) = split(ranges, node.mid()); + if shifted.is_leaf() { + if !l_ranges.is_empty() { + let l = node.left_child().unwrap(); + let l_range = self.tree.byte_range(l); + let l_len = (l_range.end - l_range.start).to_usize(); + let data = self.data.read_at(l_range.start.0, l_len).await?; + let actual = hash_subtree(l_range.start.full_chunks().0, &data, is_root); + if actual == l_hash { + // yield the left range + self.co + .yield_(Ok(l_range.start.full_chunks()..l_range.end.full_chunks())) + .await; + } + } + if !r_ranges.is_empty() { + let r = node.right_descendant(self.tree.filled_size()).unwrap(); + let r_range = self.tree.byte_range(r); + let r_len = (r_range.end - r_range.start).to_usize(); + let data = self.data.read_at(r_range.start.0, r_len).await?; + let actual = hash_subtree(r_range.start.full_chunks().0, &data, is_root); + if actual == r_hash { + // yield the right range + self.co + .yield_(Ok(r_range.start.full_chunks()..r_range.end.chunks())) + .await; + } + } + } else { + // recurse (we are in the domain of the shifted tree) + let left = shifted.left_child().unwrap(); + self.validate_rec(&l_hash, left, false, l_ranges).await?; + let right = shifted.right_descendant(self.shifted_filled_size).unwrap(); + self.validate_rec(&r_hash, right, false, r_ranges).await?; + } + Ok(()) + } + .boxed_local() + } + } +} +#[cfg(feature = "validate")] +pub use validate::valid_file_ranges; diff --git a/src/tests2.rs b/src/tests2.rs index f910d2d..e4ab490 100644 --- a/src/tests2.rs +++ b/src/tests2.rs @@ -7,6 +7,7 @@ //! There is a test called _cases that calls the test with a few hardcoded values, either //! handcrafted or from a previous failure of a proptest. use bytes::{Bytes, BytesMut}; +use futures::StreamExt; use proptest::prelude::*; use proptest::strategy::{Just, Strategy}; use range_collections::{RangeSet2, RangeSetRef}; @@ -254,6 +255,25 @@ fn validate_outboard_sync_pos_impl(tree: BaoTree) { assert_eq!(expected, actual) } +async fn valid_file_ranges_test_impl() { + let mut data = make_test_data(1000000); + let outboard = PostOrderMemOutboard::create(&data, BlockSize(4)); + let ranges = ChunkRanges::from(ChunkNum(0)..ChunkNum(120)); + data[32768] = 0; + let data = Bytes::from(data); + let mut stream = crate::io::fsm::valid_file_ranges(outboard, data, &ranges); + while let Some(item) = stream.next().await { + let item = item.unwrap(); + println!("{:?}", item); + } +} + +/// range is a range of chunks. Just using u64 for convenience in tests +#[test] +fn valid_file_ranges_fsm() { + futures::executor::block_on(valid_file_ranges_test_impl()) +} + #[proptest] fn validate_outboard_sync_pos_proptest(#[strategy(tree())] tree: BaoTree) { validate_outboard_sync_pos_impl(tree); From f409d052809a67bad18843661662a605c380ed79 Mon Sep 17 00:00:00 2001 From: Ruediger Klaehn Date: Fri, 9 Feb 2024 16:41:19 +0200 Subject: [PATCH 2/3] Add handling of trees < 1 chunk group --- src/io/fsm.rs | 53 ++++++++++++++++++++++++++++++++++----------------- src/tests2.rs | 10 ++++++++-- 2 files changed, 44 insertions(+), 19 deletions(-) diff --git a/src/io/fsm.rs b/src/io/fsm.rs index 42d2c35..b64ad29 100644 --- a/src/io/fsm.rs +++ b/src/io/fsm.rs @@ -769,21 +769,7 @@ mod validate { D: AsyncSliceReader + 'static, { Gen::new(move |co| async move { - let tree = outboard.tree(); - let ranges = truncate_ranges(ranges, tree.size()); - let root_hash = outboard.root(); - let (shifted_root, shifted_filled_size) = tree.shifted(); - let mut validator = RecursiveDataValidator { - tree, - shifted_filled_size, - outboard, - data, - co: &co, - }; - - if let Err(cause) = validator - .validate_rec(&root_hash, shifted_root, true, ranges) - .await + if let Err(cause) = RecursiveDataValidator::validate(outboard, data, ranges, &co).await { co.yield_(Err(cause)).await; } @@ -799,6 +785,38 @@ mod validate { } impl<'a, O: Outboard, D: AsyncSliceReader> RecursiveDataValidator<'a, O, D> { + async fn validate( + outboard: O, + data: D, + ranges: &ChunkRangesRef, + co: &Co>>, + ) -> io::Result<()> { + let tree = outboard.tree(); + if tree.blocks().0 == 1 { + // special case for a tree that fits in one block / chunk group + let mut data = data; + let data = data.read_at(0, tree.size().to_usize()).await?; + let actual = hash_subtree(0, &data, true); + if actual == outboard.root() { + co.yield_(Ok(ChunkNum(0)..tree.chunks())).await; + } + return Ok(()); + } + let ranges = truncate_ranges(ranges, tree.size()); + let root_hash = outboard.root(); + let (shifted_root, shifted_filled_size) = tree.shifted(); + let mut validator = RecursiveDataValidator { + tree, + shifted_filled_size, + outboard, + data, + co: &co, + }; + validator + .validate_rec(&root_hash, shifted_root, true, ranges) + .await + } + fn validate_rec<'b>( &'b mut self, parent_hash: &'b blake3::Hash, @@ -828,7 +846,8 @@ mod validate { let l_range = self.tree.byte_range(l); let l_len = (l_range.end - l_range.start).to_usize(); let data = self.data.read_at(l_range.start.0, l_len).await?; - let actual = hash_subtree(l_range.start.full_chunks().0, &data, is_root); + // is_root is always false because the case of a single chunk group is handled before calling this function + let actual = hash_subtree(l_range.start.full_chunks().0, &data, false); if actual == l_hash { // yield the left range self.co @@ -841,7 +860,7 @@ mod validate { let r_range = self.tree.byte_range(r); let r_len = (r_range.end - r_range.start).to_usize(); let data = self.data.read_at(r_range.start.0, r_len).await?; - let actual = hash_subtree(r_range.start.full_chunks().0, &data, is_root); + let actual = hash_subtree(r_range.start.full_chunks().0, &data, false); if actual == r_hash { // yield the right range self.co diff --git a/src/tests2.rs b/src/tests2.rs index e4ab490..07acde3 100644 --- a/src/tests2.rs +++ b/src/tests2.rs @@ -256,10 +256,16 @@ fn validate_outboard_sync_pos_impl(tree: BaoTree) { } async fn valid_file_ranges_test_impl() { - let mut data = make_test_data(1000000); + // interesting cases: + // below 16 chunks + // exactly 16 chunks + // 16 chunks + 1 + // 32 chunks + // 32 chunks + 1 < seems to fail! + let mut data = make_test_data(1024 * 16 * 2 + 1024 * 15); let outboard = PostOrderMemOutboard::create(&data, BlockSize(4)); let ranges = ChunkRanges::from(ChunkNum(0)..ChunkNum(120)); - data[32768] = 0; + // data[32768] = 0; let data = Bytes::from(data); let mut stream = crate::io::fsm::valid_file_ranges(outboard, data, &ranges); while let Some(item) = stream.next().await { From 97b74db9f6ce340142c89d20487d46af08c08413 Mon Sep 17 00:00:00 2001 From: Ruediger Klaehn Date: Wed, 20 Mar 2024 11:28:52 +0200 Subject: [PATCH 3/3] disable minimal crates test --- .github/workflows/ci.yml | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 5c01d32..67210b2 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -100,13 +100,13 @@ jobs: - name: cargo check run: cargo check --workspace --all-features --lib --bins - minimal-crates: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v2 - - uses: dtolnay/rust-toolchain@nightly - - uses: swatinem/rust-cache@v2 - - name: cargo check - run: | - rm -f Cargo.lock - cargo +nightly check -Z minimal-versions --workspace --all-features --lib --bins +# minimal-crates: +# runs-on: ubuntu-latest +# steps: +# - uses: actions/checkout@v2 +# - uses: dtolnay/rust-toolchain@nightly +# - uses: swatinem/rust-cache@v2 +# - name: cargo check +# run: | +# rm -f Cargo.lock +# cargo +nightly check -Z minimal-versions --workspace --all-features --lib --bins