diff --git a/trust-quorum/src/alarm.rs b/trust-quorum/src/alarm.rs new file mode 100644 index 00000000000..e89ddc22ff8 --- /dev/null +++ b/trust-quorum/src/alarm.rs @@ -0,0 +1,80 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! A mechanism for reporting protocol invariant violations +//! +//! Invariant violations should _never_ occur. They represent a critical bug in +//! the implementation of the system. In certain scenarios we can detect these +//! invariant violations and record them. This allows reporting them to higher +//! levels of control plane software so that we can debug them and fix them in +//! future releases, as well as rectify outstanding issues on systems where such +//! an alarm arose. + +use crate::{Epoch, PlatformId}; +use omicron_uuid_kinds::RackUuid; +use serde::{Deserialize, Serialize}; + +/// A critical invariant violation that should never occur. +/// +/// Many invariant violations are only possible on receipt of peer messages, +/// and are _not_ a result of API calls. This means that there isn't a good +/// way to directly inform the rest of the control plane. Instead we provide a +/// queryable API for `crate::Node` status that includes alerts. +/// +/// If an `Alarm` is ever seen by an operator then support should be contacted +/// immediately. +#[derive( + Debug, Clone, thiserror::Error, PartialEq, Eq, Serialize, Deserialize, +)] +pub enum Alarm { + #[error( + "TQ Alarm: commit attempted with invalid rack_id. Expected {expected}, got {got}." + )] + CommitWithInvalidRackId { expected: RackUuid, got: RackUuid }, + #[error( + "TQ Alarm: prepare for a later configuration exists: \ + last_prepared_epoch = {last_prepared_epoch:?}, \ + commit_epoch = {commit_epoch}" + )] + OutOfOrderCommit { last_prepared_epoch: Epoch, commit_epoch: Epoch }, + + #[error( + "TQ Alarm: commit attempted, but missing prepare message: \ + epoch = {epoch}. Latest seen epoch = {latest_seen_epoch:?}." + )] + MissingPrepare { epoch: Epoch, latest_seen_epoch: Option }, + + #[error( + "TQ Alarm: prepare received from {from} with mismatched \ + last_committed_epoch: prepare's last committed epoch = \ + {prepare_last_committed_epoch:?}, \ + persisted prepare's last_committed_epoch = \ + {persisted_prepare_last_committed_epoch:?}" + )] + PrepareLastCommittedEpochMismatch { + from: PlatformId, + prepare_last_committed_epoch: Option, + persisted_prepare_last_committed_epoch: Option, + }, + + #[error( + "TQ Alarm: prepare received with invalid rack_id from {from}. \ + Expected {expected}, got {got}." + )] + PrepareWithInvalidRackId { + from: PlatformId, + expected: RackUuid, + got: RackUuid, + }, + + #[error( + "TQ Alarm: different nodes coordinating same epoch = {epoch}: \ + them = {them}, us = {us}" + )] + DifferentNodesCoordinatingSameEpoch { + epoch: Epoch, + them: PlatformId, + us: PlatformId, + }, +} diff --git a/trust-quorum/src/coordinator_state.rs b/trust-quorum/src/coordinator_state.rs index 9707968035a..72d8b868040 100644 --- a/trust-quorum/src/coordinator_state.rs +++ b/trust-quorum/src/coordinator_state.rs @@ -211,7 +211,7 @@ impl CoordinatorState { /// Record a `PrepareAck` from another node as part of tracking /// quorum for the prepare phase of the trust quorum protocol. 
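An aside on the `Alarm` enum introduced in `alarm.rs` above: the `#[error(...)]` attributes are what produce the operator-facing strings that get logged when an alarm fires. Below is a minimal, self-contained sketch of the same `thiserror` pattern, using simplified stand-in field types (`u64`, `String`) rather than the crate's `Epoch` and `RackUuid`; it is an illustration, not the crate's actual code.

```rust
// Stand-in alarm enum: two variants mirroring the shape of the real ones,
// with u64/String fields instead of the crate's Epoch and RackUuid types.
#[derive(Debug, Clone, PartialEq, Eq, thiserror::Error)]
enum DemoAlarm {
    #[error("TQ Alarm: commit attempted with invalid rack_id. Expected {expected}, got {got}.")]
    CommitWithInvalidRackId { expected: String, got: String },

    #[error(
        "TQ Alarm: commit attempted, but missing prepare message: \
         epoch = {epoch}. Latest seen epoch = {latest_seen_epoch:?}."
    )]
    MissingPrepare { epoch: u64, latest_seen_epoch: Option<u64> },
}

fn main() {
    let alarm =
        DemoAlarm::MissingPrepare { epoch: 3, latest_seen_epoch: Some(2) };

    // The derive provides `Display`, so the alarm can be logged directly,
    // much like `error!(self.log, "{alarm}")` in the node code below.
    println!("{alarm}");

    // It also implements `std::error::Error`, so it can flow through `Result`.
    let _as_error: &dyn std::error::Error = &alarm;
}
```

Because the derive supplies both `Display` and `std::error::Error`, an `Alarm` can be returned through `Result` and formatted inline in log macros, which is exactly how the `node.rs` changes below use it.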
- pub fn ack_prepare(&mut self, from: PlatformId) { + pub fn record_prepare_ack(&mut self, from: PlatformId) { match &mut self.op { CoordinatorOperation::Prepare { prepares, prepare_acks, .. diff --git a/trust-quorum/src/crypto.rs b/trust-quorum/src/crypto.rs index c53ba8021a6..24ad2dd32dd 100644 --- a/trust-quorum/src/crypto.rs +++ b/trust-quorum/src/crypto.rs @@ -29,8 +29,8 @@ use zeroize::{Zeroize, ZeroizeOnDrop, Zeroizing}; const LRTQ_SHARE_SIZE: usize = 33; /// We don't distinguish whether this is an Ed25519 Scalar or set of GF(256) -/// polynomials points with an x-coordinate of 0. Both can be treated as 32 byte -/// blobs when decrypted, as they are immediately fed into HKDF. +/// polynomials' points with an x-coordinate of 0. Both can be treated as 32 +/// byte blobs when decrypted, as they are immediately fed into HKDF. #[derive( Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize, )] diff --git a/trust-quorum/src/errors.rs b/trust-quorum/src/errors.rs index 0a30fc81f15..1c8a3d6800a 100644 --- a/trust-quorum/src/errors.rs +++ b/trust-quorum/src/errors.rs @@ -8,22 +8,6 @@ use crate::configuration::ConfigurationError; use crate::{Epoch, PlatformId, Threshold}; use omicron_uuid_kinds::RackUuid; -#[derive(Debug, Clone, thiserror::Error, PartialEq, Eq)] -pub enum CommitError { - #[error("invalid rack id")] - InvalidRackId( - #[from] - #[source] - MismatchedRackIdError, - ), - - #[error("missing prepare msg")] - MissingPrepare, - - #[error("prepare for a later configuration exists")] - OutOfOrderCommit, -} - #[derive(Debug, Clone, thiserror::Error, PartialEq, Eq)] #[error( "sled was decommissioned on msg from {from:?} at epoch {epoch:?}: last prepared epoch = {last_prepared_epoch:?}" diff --git a/trust-quorum/src/lib.rs b/trust-quorum/src/lib.rs index d1c72215980..40b1610a8c6 100644 --- a/trust-quorum/src/lib.rs +++ b/trust-quorum/src/lib.rs @@ -12,6 +12,7 @@ use derive_more::Display; use serde::{Deserialize, Serialize}; +mod alarm; mod configuration; mod coordinator_state; pub(crate) mod crypto; @@ -20,9 +21,10 @@ mod messages; mod node; mod persistent_state; mod validators; -pub use configuration::Configuration; +pub use alarm::Alarm; +pub use configuration::{Configuration, PreviousConfiguration}; pub(crate) use coordinator_state::CoordinatorState; -pub use crypto::RackSecret; +pub use crypto::{EncryptedRackSecret, RackSecret, Salt, Sha3_256Digest}; pub use messages::*; pub use node::Node; pub use persistent_state::{PersistentState, PersistentStateSummary}; diff --git a/trust-quorum/src/node.rs b/trust-quorum/src/node.rs index daf60d0f0db..ba558800375 100644 --- a/trust-quorum/src/node.rs +++ b/trust-quorum/src/node.rs @@ -4,10 +4,11 @@ //! A trust quorum node that implements the trust quorum protocol -use crate::errors::{CommitError, MismatchedRackIdError, ReconfigurationError}; +use crate::errors::ReconfigurationError; use crate::validators::ValidatedReconfigureMsg; use crate::{ - CoordinatorState, Envelope, Epoch, PersistentState, PlatformId, messages::*, + Alarm, CoordinatorState, Envelope, Epoch, PersistentState, PlatformId, + messages::*, }; use gfss::shamir::Share; use omicron_uuid_kinds::RackUuid; @@ -84,70 +85,82 @@ impl Node { &mut self, epoch: Epoch, rack_id: RackUuid, - ) -> Result, CommitError> { - if self.persistent_state.last_committed_epoch() == Some(epoch) { - // Idempotent request - return Ok(None); - } - - // Only commit if we have a `PrepareMsg` and it's the latest `PrepareMsg`. 
- // - // This forces a global ordering of `PrepareMsg`s, because it's only - // possible to re-derive a key share in a `PrepareMsg` for the current - // configuration. - // - // In practice this check will always succeed if we have the - // `PrepareMsg` for this epoch, because later `Prepare` messages would - // not have been accepted before this commit arrived. This is because - // each `PrepareMsg` contains the `last_committed_epoch` that must have - // been seen in order to be accepted. If this commit hadn't occurred, - // then it wasn't part of the chain of `last_committed_epoch`s, and was - // abandonded/canceled. In that case, if we ended up getting a commit, - // then it would not inductively have been part of the existing chain - // and so would be a bug in the protocol execution. Because of this we - // check and error on this condition. - let last_prepared_epoch = self.persistent_state.last_prepared_epoch(); - if last_prepared_epoch != Some(epoch) { - error!( - self.log, - "Commit message occurred out of order"; - "epoch" => %epoch, - "last_prepared_epoch" => ?last_prepared_epoch - ); - return Err(CommitError::OutOfOrderCommit); - } - if let Some(prepare) = self.persistent_state.prepares.get(&epoch) { - if prepare.config.rack_id != rack_id { - error!( - self.log, - "Commit attempted with invalid rack_id"; - "expected" => %prepare.config.rack_id, - "got" => %rack_id - ); - return Err(CommitError::InvalidRackId( - MismatchedRackIdError { - expected: prepare.config.rack_id, - got: rack_id, - }, - )); - } - } else { - // This is an erroneous commit attempt from nexus. Log it. + ) -> Result, Alarm> { + let Some(latest_prepare) = self.persistent_state.latest_prepare() + else { + // This is an erroneous commit attempt from nexus. We don't have + // any prepares yet, but for some reason nexus thinks we do. // // Nexus should instead tell this node to retrieve a `Prepare` // from another node that has already committed. - error!( + // + // This is a less serious error than other invariant violations + // since it can be recovered from. However, it is still worthy of an + // alarm, as the most likely case is a disk/ ledger failure. + let latest_seen_epoch = None; + let alarm = Alarm::MissingPrepare { epoch, latest_seen_epoch }; + error!(self.log, "{alarm}"); + return Err(alarm); + }; + + if latest_prepare.config.epoch < epoch { + // We haven't seen this prepare yet, but Nexus thinks we have. + // This is essentially the same case as above. + let latest_seen_epoch = Some(latest_prepare.config.epoch); + let alarm = Alarm::MissingPrepare { epoch, latest_seen_epoch }; + error!(self.log, "{alarm}"); + return Err(alarm); + } + + if latest_prepare.config.epoch > epoch { + // Only commit if we have a `PrepareMsg` and it's the latest + // `PrepareMsg`. + // + // This forces a global ordering of `PrepareMsg`s, because it's + // only possible to re-derive a key share in a `PrepareMsg` for the + // current configuration. + // + // If we get a commit for an earlier prepare message, then we + // shouldn't have been able to accept any later prepare messages, + // because this commit hadn't been seen by this node yet. Prepare + // messages are only accepted if the last committed epoch in the + // message matches what the node has seen. + // + // If we get here, then it means that there is a bug in this code + // that allowed the later prepare to be accepted, and therefore we + // must raise an alarm. 
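The three-way comparison performed by the commit path above can be summarized as: no prepare at all, or a prepare older than the commit epoch, means the prepare is missing; a prepare newer than the commit epoch means commits arrived out of order; equal epochs fall through to the rack_id check. A small, self-contained sketch of that decision with plain `u64` epochs follows (a hypothetical helper for illustration, not the crate's code).

```rust
// Simplified outcome of comparing the epoch Nexus asked us to commit against
// the latest prepare we have persisted.
#[derive(Debug, PartialEq)]
enum CommitCheck {
    // We have never seen a prepare for this epoch (or any later one).
    MissingPrepare { latest_seen: Option<u64> },
    // A prepare for a *later* epoch exists: a protocol invariant was violated.
    OutOfOrderCommit { last_prepared: u64 },
    // Epochs match; the caller still has to verify the rack_id.
    EpochMatches,
}

fn check_commit(latest_prepared_epoch: Option<u64>, commit_epoch: u64) -> CommitCheck {
    match latest_prepared_epoch {
        None => CommitCheck::MissingPrepare { latest_seen: None },
        Some(latest) if latest < commit_epoch => {
            CommitCheck::MissingPrepare { latest_seen: Some(latest) }
        }
        Some(latest) if latest > commit_epoch => {
            CommitCheck::OutOfOrderCommit { last_prepared: latest }
        }
        Some(_) => CommitCheck::EpochMatches,
    }
}

fn main() {
    assert_eq!(check_commit(None, 2), CommitCheck::MissingPrepare { latest_seen: None });
    assert_eq!(check_commit(Some(1), 2), CommitCheck::MissingPrepare { latest_seen: Some(1) });
    assert_eq!(check_commit(Some(3), 2), CommitCheck::OutOfOrderCommit { last_prepared: 3 });
    assert_eq!(check_commit(Some(2), 2), CommitCheck::EpochMatches);
}
```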
+ let alarm = Alarm::OutOfOrderCommit { + last_prepared_epoch: latest_prepare.config.epoch, + commit_epoch: epoch, + }; + error!(self.log, "{alarm}"); + return Err(alarm); + } + + // The epoch of the latest prepare matches the commit. + // Do the rack_ids match up? + if latest_prepare.config.rack_id != rack_id { + let alarm = Alarm::CommitWithInvalidRackId { + expected: latest_prepare.config.rack_id, + got: rack_id, + }; + error!(self.log, "{alarm}"); + return Err(alarm); + } + + if self.persistent_state.last_committed_epoch() == Some(epoch) { + info!( self.log, - "tried to commit a configuration, but missing prepare msg"; + "Idempotent configuration - already committed"; "epoch" => %epoch ); - return Err(CommitError::MissingPrepare); + return Ok(None); } + // Success! info!(self.log, "Committed configuration"; "epoch" => %epoch); - // Are we currently coordinating for this epoch? - // Stop coordinating if we are + // Are we currently coordinating for this epoch? Stop if so. if self.coordinator_state.is_some() { info!( self.log, @@ -176,14 +189,15 @@ impl Node { outbox: &mut Vec, from: PlatformId, msg: PeerMsg, - ) -> Option { + ) -> Result, Alarm> { match msg { + PeerMsg::Prepare(msg) => self.handle_prepare(outbox, from, msg), PeerMsg::PrepareAck(epoch) => { self.handle_prepare_ack(from, epoch); - None + Ok(None) } PeerMsg::Share { epoch, share } => { - self.handle_share(now, outbox, from, epoch, share) + Ok(self.handle_share(now, outbox, from, epoch, share)) } _ => todo!( "cannot handle message variant yet - not implemented: {msg:?}" @@ -196,21 +210,133 @@ impl Node { self.coordinator_state.as_ref() } + // Handle a `PrepareMsg` from a coordinator + fn handle_prepare( + &mut self, + outbox: &mut Vec, + from: PlatformId, + msg: PrepareMsg, + ) -> Result, Alarm> { + if let Some(rack_id) = self.persistent_state.rack_id() { + if rack_id != msg.config.rack_id { + let alarm = Alarm::PrepareWithInvalidRackId { + from, + expected: rack_id, + got: msg.config.rack_id, + }; + error!(self.log, "{alarm}"); + return Err(alarm); + } + } + if let Some(latest_prepare) = self.persistent_state.latest_prepare() { + // Is this an old request? + if msg.config.epoch < latest_prepare.config.epoch { + warn!(self.log, "Received stale prepare"; + "from" => %from, + "msg_epoch" => %msg.config.epoch, + "last_prepared_epoch" => %latest_prepare.config.epoch + ); + return Ok(None); + } + + // Idempotent request + if msg.config == latest_prepare.config { + return Ok(None); + } + + // Does the last committed epoch match what we have? + let msg_last_committed_epoch = + msg.config.previous_configuration.as_ref().map(|p| p.epoch); + let last_committed_epoch = + self.persistent_state.last_committed_epoch(); + if msg_last_committed_epoch != last_committed_epoch { + // If the msg contains an older last_committed_epoch than what + // we have, then out of order commits have occurred, as we know + // this prepare is later than what we've seen. This is a critical + // protocol invariant that has been violated. + // + // If the msg contains a newer last_committed_epoch than what + // we have, then we have likely missed a commit and are behind + // by more than one reconfiguration. The protocol currently does + // not allow this. Future protocol implementations may provide a + // capability to "jump" configurations. + // + // If there is a `None`/`Some` mismatch, then again, an invariant + // has been violated somewhere. 
The coordinator should know + // whether this is a "new" node or not, which would have a "None" + // configuration. + let alarm = Alarm::PrepareLastCommittedEpochMismatch { + from, + prepare_last_committed_epoch: msg_last_committed_epoch, + persisted_prepare_last_committed_epoch: + last_committed_epoch, + }; + error!(self.log, "{alarm}"); + return Err(alarm); + } + + // The prepare is up-to-date with respect to our persistent state + }; + + // Are we currently trying to coordinate? + if let Some(cs) = &self.coordinator_state { + let coordinating_epoch = cs.reconfigure_msg().epoch(); + if coordinating_epoch > msg.config.epoch { + warn!(self.log, "Received stale prepare while coordinating"; + "from" => %from, + "msg_epoch" => %msg.config.epoch, + "epoch" => %cs.reconfigure_msg().epoch() + ); + return Ok(None); + } + if coordinating_epoch == msg.config.epoch { + let alarm = Alarm::DifferentNodesCoordinatingSameEpoch { + epoch: coordinating_epoch, + them: from, + us: self.platform_id.clone(), + }; + error!(self.log, "{alarm}"); + return Err(alarm); + } + + // This prepare is for a newer configuration than the one we are + // currently coordinating. We must cancel our coordination as Nexus + // has moved on. + let cancel_msg = concat!( + "Received a prepare for newer configuration.", + "Cancelling our coordination." + ); + info!(self.log, "{cancel_msg}"; + "msg_epoch" => %msg.config.epoch, + "epoch" => %coordinating_epoch, + "from" => %from + ); + self.coordinator_state = None; + } + + // Acknowledge the `PrepareMsg` + self.send(from, outbox, PeerMsg::PrepareAck(msg.config.epoch)); + + // Add the prepare to our `PersistentState` + self.persistent_state.prepares.insert(msg.config.epoch, msg); + + Ok(Some(self.persistent_state.clone())) + } + // Handle receipt of a `PrepareAck` message fn handle_prepare_ack(&mut self, from: PlatformId, epoch: Epoch) { // Are we coordinating for this epoch? if let Some(cs) = &mut self.coordinator_state { let current_epoch = cs.reconfigure_msg().epoch(); if current_epoch == epoch { - // Store the ack in the coordinator state - cs.ack_prepare(from); + cs.record_prepare_ack(from); } else { - // Log and drop message warn!(self.log, "Received prepare ack for wrong epoch"; "from" => %from, "current_epoch" => %current_epoch, "acked_epoch" => %epoch ); + // Ack is intentionally dropped here } } else { warn!( @@ -219,6 +345,7 @@ impl Node { "from" => %from, "acked_epoch" => %epoch ); + // Ack is intentionally dropped here } } @@ -269,6 +396,11 @@ impl Node { } } + // Package a message in an envelope and put it in the outbox + fn send(&self, to: PlatformId, outbox: &mut Vec, msg: PeerMsg) { + outbox.push(Envelope { to, from: self.platform_id.clone(), msg }); + } + /// Set the coordinator state and conditionally set and return the /// persistent state depending upon whether the node is currently /// coordinating and what its persistent state is. 
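A quick aside on the `send` helper added above: the node never performs I/O itself. Every outgoing message is wrapped in an `Envelope` and pushed into a caller-supplied outbox, and the caller is responsible for delivering it. Below is a self-contained sketch of that pattern with simplified stand-in types (strings instead of `PlatformId` and `PeerMsg`); it is not the crate's actual definitions.

```rust
// Cut-down stand-ins for the outbox pattern: protocol code only queues
// addressed envelopes; the caller drains the buffer and does the real sends.
#[derive(Debug)]
struct Envelope {
    to: String,
    from: String,
    msg: String,
}

struct Node {
    platform_id: String,
}

impl Node {
    fn send(&self, to: String, outbox: &mut Vec<Envelope>, msg: String) {
        outbox.push(Envelope { to, from: self.platform_id.clone(), msg });
    }
}

fn main() {
    let node = Node { platform_id: "sled-0".to_string() };
    let mut outbox = Vec::new();

    // Protocol logic queues a message...
    node.send("sled-1".to_string(), &mut outbox, "PrepareAck(1)".to_string());

    // ...and the caller decides how and when to deliver it.
    for env in outbox.drain(..) {
        println!("would send {:?} from {} to {}", env.msg, env.from, env.to);
    }
}
```

Keeping sends out of the protocol state machine is also what lets the property-based tests below assert directly on the outbox contents.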
@@ -319,10 +451,11 @@ impl Node { mod tests { use std::time::Duration; - use crate::{Epoch, Threshold}; + use crate::{Configuration, Epoch, Sha3_256Digest, Threshold}; use super::*; use assert_matches::assert_matches; + use gfss::gf256::Gf256; use omicron_test_utils::dev::test_setup_log; use omicron_uuid_kinds::RackUuid; use proptest::prelude::*; @@ -418,4 +551,125 @@ mod tests { logctx.cleanup_successful(); } + + #[test] + pub fn handle_alarms() { + let logctx = test_setup_log("handle_prepare_alarms"); + // Initial config + let reconfig_msg = ReconfigureMsg { + rack_id: RackUuid::new_v4(), + epoch: Epoch(1), + last_committed_epoch: None, + members: (0..=5u8) + .map(|serial| { + PlatformId::new("test".into(), serial.to_string()) + }) + .collect(), + threshold: Threshold(3), + retry_timeout: Duration::from_millis(100), + }; + + let my_platform_id = reconfig_msg.members.first().unwrap().clone(); + let mut node = Node::new( + logctx.log.clone(), + my_platform_id.clone(), + PersistentState::empty(), + ); + + let mut outbox = Vec::new(); + let _ = node + .coordinate_reconfiguration( + Instant::now(), + &mut outbox, + reconfig_msg.clone(), + ) + .expect("success") + .expect("persistent state"); + + // Configuration for our `PrepareMsg` + let config = Configuration { + rack_id: RackUuid::new_v4(), + epoch: Epoch(1), + coordinator: PlatformId::new("test".to_string(), "999".to_string()), + members: (0..=5) + .map(|serial| { + ( + PlatformId::new("test".into(), serial.to_string()), + // Nonsense share + Sha3_256Digest([0u8; 32]), + ) + }) + .collect(), + threshold: Threshold(3), + previous_configuration: None, + }; + let mut prepare_msg = PrepareMsg { + // Generate a nonsense share + share: Share { + x_coordinate: Gf256::new(1), + y_coordinates: (0..32_u8).map(Gf256::new).collect(), + }, + config: config.clone(), + }; + + let alarm = node + .handle( + Instant::now(), + &mut outbox, + prepare_msg.config.coordinator.clone(), + PeerMsg::Prepare(prepare_msg.clone()), + ) + .unwrap_err(); + + assert_matches!( + alarm, + Alarm::DifferentNodesCoordinatingSameEpoch { .. } + ); + + // Commit the initial configuration to better trigger alarms. + node.commit_reconfiguration(reconfig_msg.epoch, reconfig_msg.rack_id) + .unwrap(); + + let alarm = node + .handle( + Instant::now(), + &mut outbox, + prepare_msg.config.coordinator.clone(), + PeerMsg::Prepare(prepare_msg.clone()), + ) + .unwrap_err(); + + assert_matches!(alarm, Alarm::PrepareWithInvalidRackId { .. }); + + // Set the rack_id to the correct one to allow other alarms to be + // triggered. + prepare_msg.config.rack_id = reconfig_msg.rack_id; + let alarm = node + .handle( + Instant::now(), + &mut outbox, + prepare_msg.config.coordinator.clone(), + PeerMsg::Prepare(prepare_msg.clone()), + ) + .unwrap_err(); + + assert_matches!(alarm, Alarm::PrepareLastCommittedEpochMismatch { .. }); + + let alarm = node + .commit_reconfiguration(Epoch(1), RackUuid::new_v4()) + .unwrap_err(); + assert_matches!(alarm, Alarm::CommitWithInvalidRackId { .. }); + + let alarm = node + .commit_reconfiguration(Epoch(3), reconfig_msg.rack_id) + .unwrap_err(); + assert_matches!(alarm, Alarm::MissingPrepare { .. }); + + let alarm = node + .commit_reconfiguration(Epoch(0), reconfig_msg.rack_id) + .unwrap_err(); + assert_matches!(alarm, Alarm::OutOfOrderCommit { .. 
}); + + logctx.cleanup_successful(); + } } diff --git a/trust-quorum/src/persistent_state.rs b/trust-quorum/src/persistent_state.rs index a3d1461454c..4261a3b1aca 100644 --- a/trust-quorum/src/persistent_state.rs +++ b/trust-quorum/src/persistent_state.rs @@ -65,6 +65,11 @@ impl PersistentState { self.prepares.keys().last().map(|epoch| *epoch) } + /// Return the prepare with the highest epoch + pub fn latest_prepare(&self) -> Option<&PrepareMsg> { + self.prepares.last_key_value().map(|(_, v)| v) + } + pub fn last_committed_epoch(&self) -> Option { self.commits.last().map(|epoch| *epoch) } diff --git a/trust-quorum/src/validators.rs b/trust-quorum/src/validators.rs index 8ec4a5d5a55..0ecc237a672 100644 --- a/trust-quorum/src/validators.rs +++ b/trust-quorum/src/validators.rs @@ -268,10 +268,17 @@ impl ValidatedReconfigureMsg { if current_epoch == new_msg.epoch { if existing_msg != new_msg { + // TODO: This should be an `Alarm`, but I"m not sure how + // to handle that. We could include an `Alarm` variant inside + // `ReconfigurationError`, but that would require a match/method + // to check for an alarm and pull it out. We could also return either + // an `Alarm` or a `ReconfigurationError` inside `Result::Err`. This is + // probably the best approach, but I'm open to other structures. error!( log, concat!( - "Reconfiguration in progress for same epoch, ", + "Protocol invariant violation: ", + "reconfiguration in progress for same epoch, ", "but messages differ"); "epoch" => new_msg.epoch.to_string(), ); diff --git a/trust-quorum/tests/coordinator.rs b/trust-quorum/tests/coordinator.rs index ff9467c6655..fee539b0705 100644 --- a/trust-quorum/tests/coordinator.rs +++ b/trust-quorum/tests/coordinator.rs @@ -6,6 +6,7 @@ use assert_matches::assert_matches; use bcs::Result; +use gfss::gf256::Gf256; use gfss::shamir::Share; use omicron_test_utils::dev::test_setup_log; use omicron_uuid_kinds::RackUuid; @@ -17,8 +18,9 @@ use std::collections::{BTreeMap, BTreeSet}; use std::time::{Duration, Instant}; use test_strategy::{Arbitrary, proptest}; use trust_quorum::{ - Envelope, Epoch, Node, PeerMsg, PersistentState, PlatformId, PrepareMsg, - ReconfigureMsg, Threshold, + Configuration, EncryptedRackSecret, Envelope, Epoch, Node, PeerMsg, + PersistentState, PlatformId, PrepareMsg, PreviousConfiguration, + ReconfigureMsg, Salt, Sha3_256Digest, Threshold, }; /// The system under test @@ -122,6 +124,11 @@ impl Model { } } + // Delete the coordinator state + pub fn stop_coordinating(&mut self) { + self.coordinator_state = None; + } + /// If we are currently waiting for shares in this epoch then record the /// responder. /// @@ -448,6 +455,9 @@ impl TestState { Action::CoordinateReconfiguration(generated_config) => { self.action_coordinate_reconfiguration(generated_config)?; } + Action::SendPrepareToSut(generated_config) => { + self.action_send_prepare_to_sut(generated_config)?; + } } } Ok(()) @@ -531,6 +541,98 @@ impl TestState { Ok(()) } + fn action_send_prepare_to_sut( + &mut self, + generated_config: GeneratedConfiguration, + ) -> Result<(), TestCaseError> { + // It doesn't really matter what the configuration looks like, as long + // as the last committed epoch matches what the SUT has, and the epoch + // of the `Prepare` is greater than the largest seen epoch at the SUT. + // Therefore, we are able to use our existing config generation mechanism, + // as we do with the SUT. + let msg = self.generated_config_to_reconfigure_msg(generated_config); + + // Choose a node to send to the SUT. 
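Aside on the `latest_prepare()` helper added to `PersistentState` above: it relies on `BTreeMap` keeping its keys in sorted order, so the last entry is the prepare with the highest epoch. A minimal illustration with simplified types (plain `u64` epochs and string payloads, not the crate's `PrepareMsg`):

```rust
use std::collections::BTreeMap;

// Stand-in for `PersistentState::latest_prepare`: the map is keyed by epoch,
// and `last_key_value` returns the entry with the greatest key.
fn latest(prepares: &BTreeMap<u64, String>) -> Option<&String> {
    prepares.last_key_value().map(|(_, v)| v)
}

fn main() {
    let mut prepares = BTreeMap::new();
    prepares.insert(1, "prepare for epoch 1".to_string());
    prepares.insert(3, "prepare for epoch 3".to_string());
    prepares.insert(2, "prepare for epoch 2".to_string());

    // Insertion order doesn't matter; the highest epoch is always returned.
    assert_eq!(latest(&prepares).map(String::as_str), Some("prepare for epoch 3"));
    assert!(latest(&BTreeMap::new()).is_none());
}
```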
+ // Skip over the SUT which sorts first. + let coordinator = msg.members.iter().nth(1).cloned().unwrap(); + + // Since the test is never going to commit this Prepare, we just use + // dummy values where necessary (primarily when crypto is involved). + let config = Configuration { + rack_id: msg.rack_id, + epoch: msg.epoch, + coordinator: coordinator.clone(), + members: msg + .members + .into_iter() + .map(|id| (id, Sha3_256Digest([0u8; 32]))) + .collect(), + threshold: msg.threshold, + previous_configuration: msg.last_committed_epoch.map(|epoch| { + PreviousConfiguration { + epoch, + is_lrtq: false, + encrypted_last_committed_rack_secret: EncryptedRackSecret( + vec![0u8; 32], + ), + encrypted_last_committed_rack_secret_salt: Salt::new(), + } + }), + }; + // Generate a nonsense share for the SUT + let share = Share { + x_coordinate: Gf256::new(1), + y_coordinates: (0..32_u8).map(Gf256::new).collect(), + }; + + let prepare_msg = PrepareMsg { config, share }; + + // Put the model in the correct state + self.model.stop_coordinating(); + + // Handle the `PrepareMsg` at the SUT + let mut outbox = Vec::new(); + let persistent_state = self + .sut + .node + .handle( + self.model.now, + &mut outbox, + coordinator.clone(), + PeerMsg::Prepare(prepare_msg.clone()), + ) + .expect("no alarm") + .expect("persistent state"); + + // We should have gotten back a persistent state including the prepare + // received by the SUT. + prop_assert_eq!( + self.sut.persistent_state.prepares.len() + 1, + persistent_state.prepares.len() + ); + prop_assert_eq!( + &prepare_msg.config, + &persistent_state.prepares.get(&msg.epoch).unwrap().config, + ); + + // The SUT should have replied with an ack + prop_assert_eq!(outbox.len(), 1); + let envelope = outbox.pop().expect("prepare ack"); + prop_assert_eq!(envelope.to, coordinator); + prop_assert_eq!(&envelope.from, self.sut.node.platform_id()); + assert_matches!(envelope.msg, PeerMsg::PrepareAck(epoch) => { + prop_assert_eq!(epoch, msg.epoch); + }); + + // The SUT should no longer be coordinating + prop_assert!(self.sut.node.get_coordinator_state().is_none()); + + // All our assertions passed. Update the SUT's persistent state. + self.sut.persistent_state = persistent_state.clone(); + + Ok(()) + } + fn action_tick( &mut self, time_jump: Duration, @@ -694,12 +796,12 @@ impl TestState { // to check this. let reply = PeerMsg::PrepareAck(msg.config.epoch); let mut outbox = Vec::new(); - let output = self.sut.node.handle( - self.model.now, - &mut outbox, - from.clone(), - reply, - ); + let output = self + .sut + .node + .handle(self.model.now, &mut outbox, from.clone(), reply) + .expect("no alarm"); + prop_assert!(output.is_none()); prop_assert!(outbox.is_empty()); @@ -732,12 +834,11 @@ impl TestState { }; let reply = PeerMsg::Share { epoch, share: share.clone() }; let mut outbox = Vec::new(); - let output = self.sut.node.handle( - self.model.now, - &mut outbox, - from.clone(), - reply, - ); + let output = self + .sut + .node + .handle(self.model.now, &mut outbox, from.clone(), reply) + .expect("no alarm"); // If we just received a threshold number of shares, we expect // reconstruction of the rack secret for the last committed @@ -972,7 +1073,17 @@ pub enum Action { // described in RFD 238. // // `Index` is used to compute Z here. + #[weight(1)] Commit(Index), + + // Send a `PrepareMsg` from a fake test node, indicating that Nexus has told + // that node to start coordinating a new reconfiguration . 
This should cause + // the `SUT` to stop coordinating if it is currently coordinating. + // + // We always send a `PrepareMsg` with an epoch that is newer than the latest + // configuration, as the goal is for the request to be accepted by the SUT. + #[weight(1)] + SendPrepareToSut(GeneratedConfiguration), } /// Information about configurations used at test generation time
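The `SendPrepareToSut` action above follows the same model-based pattern as the rest of this test: each generated action is applied to both a simple in-memory model and the real `Node` (the SUT), and invariants are checked after every step. Below is a heavily simplified, self-contained sketch of that pattern using a toy counter instead of trust quorum state and no proptest; the types and actions are hypothetical stand-ins, not the actual harness.

```rust
// Toy stand-ins: the "model" tracks a running total directly, while the "SUT"
// keeps a log of operations and derives its total, mimicking how the real
// test compares a simple Model against the Node's richer internal state.
#[derive(Debug, Default)]
struct Model {
    total: u64,
}

#[derive(Debug, Default)]
struct Sut {
    ops: Vec<u64>,
}

#[derive(Debug, Clone, Copy)]
enum Action {
    Add(u64),
    Reset,
}

fn apply(model: &mut Model, sut: &mut Sut, action: Action) {
    match action {
        Action::Add(n) => {
            model.total += n;
            sut.ops.push(n);
        }
        Action::Reset => {
            model.total = 0;
            sut.ops.clear();
        }
    }
    // Invariant checked after every action. The real harness asserts much
    // richer properties (persistent state, outbox contents, coordinator
    // state) using proptest's `prop_assert!` family instead of `assert_eq!`.
    assert_eq!(model.total, sut.ops.iter().sum::<u64>());
}

fn main() {
    let mut model = Model::default();
    let mut sut = Sut::default();
    // In the real test the action sequence is generated by proptest; here it
    // is just a fixed list.
    for action in [Action::Add(3), Action::Add(5), Action::Reset, Action::Add(1)] {
        apply(&mut model, &mut sut, action);
    }
}
```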