From 8b7de61e49ba08c5768f9a7037b6e1659b1eb8c4 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Thu, 19 Dec 2024 08:56:36 +0100 Subject: [PATCH] Add safekeeper membership conf to control file. --- Cargo.lock | 2 + libs/safekeeper_api/Cargo.toml | 2 + libs/safekeeper_api/src/lib.rs | 1 + libs/safekeeper_api/src/membership.rs | 163 ++++++++++++++++++ libs/safekeeper_api/src/models.rs | 4 +- safekeeper/src/control_file.rs | 20 ++- safekeeper/src/control_file_upgrade.rs | 145 ++++++++++++++-- safekeeper/src/copy_timeline.rs | 3 +- safekeeper/src/http/routes.rs | 8 +- safekeeper/src/json_ctrl.rs | 2 + safekeeper/src/receive_wal.rs | 9 +- safekeeper/src/safekeeper.rs | 113 +++--------- safekeeper/src/state.rs | 33 ++-- safekeeper/src/timelines_global_map.rs | 4 +- .../tests/walproposer_sim/safekeeper.rs | 11 +- 15 files changed, 380 insertions(+), 140 deletions(-) create mode 100644 libs/safekeeper_api/src/membership.rs diff --git a/Cargo.lock b/Cargo.lock index d1f77469699e6..73aa216fe1a0f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5565,10 +5565,12 @@ dependencies = [ name = "safekeeper_api" version = "0.1.0" dependencies = [ + "anyhow", "const_format", "postgres_ffi", "pq_proto", "serde", + "serde_json", "tokio", "utils", ] diff --git a/libs/safekeeper_api/Cargo.toml b/libs/safekeeper_api/Cargo.toml index 4234ec6779a20..7652c3d4137a9 100644 --- a/libs/safekeeper_api/Cargo.toml +++ b/libs/safekeeper_api/Cargo.toml @@ -5,8 +5,10 @@ edition.workspace = true license.workspace = true [dependencies] +anyhow.workspace = true const_format.workspace = true serde.workspace = true +serde_json.workspace = true postgres_ffi.workspace = true pq_proto.workspace = true tokio.workspace = true diff --git a/libs/safekeeper_api/src/lib.rs b/libs/safekeeper_api/src/lib.rs index be6923aca9028..76d46f038dfec 100644 --- a/libs/safekeeper_api/src/lib.rs +++ b/libs/safekeeper_api/src/lib.rs @@ -4,6 +4,7 @@ use const_format::formatcp; use pq_proto::SystemId; use serde::{Deserialize, Serialize}; +pub mod membership; /// Public API types pub mod models; diff --git a/libs/safekeeper_api/src/membership.rs b/libs/safekeeper_api/src/membership.rs new file mode 100644 index 0000000000000..863393347d0b1 --- /dev/null +++ b/libs/safekeeper_api/src/membership.rs @@ -0,0 +1,163 @@ +//! Types defining safekeeper membership, see +//! rfcs/035-safekeeper-dynamic-membership-change.md +//! for details. + +use std::{collections::HashSet, fmt::Display}; + +use anyhow; +use anyhow::bail; +use serde::{Deserialize, Serialize}; +use utils::id::NodeId; + +/// Number uniquely identifying safekeeper configuration. +/// Note: it is a part of sk control file. +pub type Generation = u32; +/// 1 is the first valid generation, 0 is used as +/// a placeholder before we fully migrate to generations. +pub const INVALID_GENERATION: Generation = 0; + +/// Membership is defined by ids so e.g. walproposer uses them to figure out +/// quorums, but we also carry host and port to give wp idea where to connect. +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct SafekeeperId { + pub id: NodeId, + pub host: String, + pub pg_port: u16, +} + +impl Display for SafekeeperId { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "[id={}, ep={}:{}]", self.id, self.host, self.pg_port) + } +} + +/// Set of safekeepers. +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +#[serde(transparent)] +pub struct MemberSet { + pub members: Vec, +} + +impl MemberSet { + pub fn empty() -> Self { + MemberSet { + members: Vec::new(), + } + } + + pub fn new(members: Vec) -> anyhow::Result { + let hs: HashSet = HashSet::from_iter(members.iter().map(|sk| sk.id)); + if hs.len() != members.len() { + bail!("duplicate safekeeper id in the set {:?}", members); + } + Ok(MemberSet { members }) + } + + pub fn contains(&self, sk: &SafekeeperId) -> bool { + self.members.iter().any(|m| m.id == sk.id) + } + + pub fn add(&mut self, sk: SafekeeperId) -> anyhow::Result<()> { + if self.contains(&sk) { + bail!(format!( + "sk {} is already member of the set {}", + sk.id, self + )); + } + self.members.push(sk); + Ok(()) + } +} + +impl Display for MemberSet { + /// Display as a comma separated list of members. + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let sks_str = self + .members + .iter() + .map(|m| m.to_string()) + .collect::>(); + write!(f, "({})", sks_str.join(", ")) + } +} + +/// Safekeeper membership configuration. +/// Note: it is a part of both control file and http API. +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct Configuration { + /// Unique id. + pub generation: Generation, + /// Current members of the configuration. + pub members: MemberSet, + /// Some means it is a joint conf. + pub new_members: Option, +} + +impl Configuration { + /// Used for pre-generations timelines, will be removed eventually. + pub fn empty() -> Self { + Configuration { + generation: INVALID_GENERATION, + members: MemberSet::empty(), + new_members: None, + } + } +} + +impl Display for Configuration { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "gen={}, members={}, new_members={}", + self.generation, + self.members, + self.new_members + .as_ref() + .map(ToString::to_string) + .unwrap_or(String::from("none")) + ) + } +} + +#[cfg(test)] +mod tests { + use super::{MemberSet, SafekeeperId}; + use utils::id::NodeId; + + #[test] + fn test_member_set() { + let mut members = MemberSet::empty(); + members + .add(SafekeeperId { + id: NodeId(42), + host: String::from("lala.org"), + pg_port: 5432, + }) + .unwrap(); + + members + .add(SafekeeperId { + id: NodeId(42), + host: String::from("lala.org"), + pg_port: 5432, + }) + .expect_err("duplicate must not be allowed"); + + members + .add(SafekeeperId { + id: NodeId(43), + host: String::from("bubu.org"), + pg_port: 5432, + }) + .unwrap(); + + println!("members: {}", members); + + let j = serde_json::to_string(&members).expect("failed to serialize"); + println!("members json: {}", j); + assert_eq!( + j, + r#"[{"id":42,"host":"lala.org","pg_port":5432},{"id":43,"host":"bubu.org","pg_port":5432}]"# + ); + } +} diff --git a/libs/safekeeper_api/src/models.rs b/libs/safekeeper_api/src/models.rs index 3e424a792c7f0..f5a6c83f71bb5 100644 --- a/libs/safekeeper_api/src/models.rs +++ b/libs/safekeeper_api/src/models.rs @@ -11,7 +11,7 @@ use utils::{ pageserver_feedback::PageserverFeedback, }; -use crate::{ServerInfo, Term}; +use crate::{membership::Configuration, ServerInfo, Term}; #[derive(Debug, Serialize)] pub struct SafekeeperStatus { @@ -22,7 +22,7 @@ pub struct SafekeeperStatus { pub struct TimelineCreateRequest { pub tenant_id: TenantId, pub timeline_id: TimelineId, - pub peer_ids: Option>, + pub mconf: Configuration, pub pg_version: u32, pub system_id: Option, pub wal_seg_size: Option, diff --git a/safekeeper/src/control_file.rs b/safekeeper/src/control_file.rs index 06e5afbf74ecb..e92ca881e15e9 100644 --- a/safekeeper/src/control_file.rs +++ b/safekeeper/src/control_file.rs @@ -3,6 +3,7 @@ use anyhow::{bail, ensure, Context, Result}; use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt}; use camino::{Utf8Path, Utf8PathBuf}; +use safekeeper_api::membership::INVALID_GENERATION; use tokio::fs::File; use tokio::io::AsyncWriteExt; use utils::crashsafe::durable_rename; @@ -13,14 +14,14 @@ use std::ops::Deref; use std::path::Path; use std::time::Instant; -use crate::control_file_upgrade::downgrade_v9_to_v8; +use crate::control_file_upgrade::downgrade_v10_to_v9; use crate::control_file_upgrade::upgrade_control_file; use crate::metrics::PERSIST_CONTROL_FILE_SECONDS; use crate::state::{EvictionState, TimelinePersistentState}; use utils::bin_ser::LeSer; pub const SK_MAGIC: u32 = 0xcafeceefu32; -pub const SK_FORMAT_VERSION: u32 = 9; +pub const SK_FORMAT_VERSION: u32 = 10; // contains persistent metadata for safekeeper pub const CONTROL_FILE_NAME: &str = "safekeeper.control"; @@ -169,10 +170,11 @@ impl TimelinePersistentState { let mut buf: Vec = Vec::new(); WriteBytesExt::write_u32::(&mut buf, SK_MAGIC)?; - if self.eviction_state == EvictionState::Present { - // temp hack for forward compatibility - const PREV_FORMAT_VERSION: u32 = 8; - let prev = downgrade_v9_to_v8(self); + if self.mconf.generation == INVALID_GENERATION { + // Temp hack for forward compatibility test: in case of none + // configuration save cfile in previous v9 format. + const PREV_FORMAT_VERSION: u32 = 9; + let prev = downgrade_v10_to_v9(self); WriteBytesExt::write_u32::(&mut buf, PREV_FORMAT_VERSION)?; prev.ser_into(&mut buf)?; } else { @@ -233,6 +235,7 @@ impl Storage for FileStorage { #[cfg(test)] mod test { use super::*; + use safekeeper_api::membership::{Configuration, MemberSet}; use tokio::fs; use utils::lsn::Lsn; @@ -242,6 +245,11 @@ mod test { async fn test_read_write_safekeeper_state() -> anyhow::Result<()> { let tempdir = camino_tempfile::tempdir()?; let mut state = TimelinePersistentState::empty(); + state.mconf = Configuration { + generation: 42, + members: MemberSet::empty(), + new_members: None, + }; let mut storage = FileStorage::create_new(tempdir.path(), state.clone(), NO_SYNC).await?; // Make a change. diff --git a/safekeeper/src/control_file_upgrade.rs b/safekeeper/src/control_file_upgrade.rs index dd152fd4cce88..11ef328d8047f 100644 --- a/safekeeper/src/control_file_upgrade.rs +++ b/safekeeper/src/control_file_upgrade.rs @@ -1,17 +1,22 @@ //! Code to deal with safekeeper control file upgrades +use std::vec; + use crate::{ safekeeper::{AcceptorState, PgUuid, TermHistory, TermLsn}, - state::{EvictionState, PersistedPeers, TimelinePersistentState}, + state::{EvictionState, TimelinePersistentState}, wal_backup_partial, }; use anyhow::{bail, Result}; use pq_proto::SystemId; -use safekeeper_api::{ServerInfo, Term}; +use safekeeper_api::{ + membership::{Configuration, INVALID_GENERATION}, + ServerInfo, Term, +}; use serde::{Deserialize, Serialize}; use tracing::*; use utils::{ bin_ser::LeSer, - id::{TenantId, TimelineId}, + id::{NodeId, TenantId, TimelineId}, lsn::Lsn, }; @@ -233,6 +238,90 @@ pub struct SafeKeeperStateV8 { pub partial_backup: wal_backup_partial::State, } +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct PersistedPeers(pub Vec<(NodeId, PersistedPeerInfo)>); + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct PersistedPeerInfo { + /// LSN up to which safekeeper offloaded WAL to s3. + pub backup_lsn: Lsn, + /// Term of the last entry. + pub term: Term, + /// LSN of the last record. + pub flush_lsn: Lsn, + /// Up to which LSN safekeeper regards its WAL as committed. + pub commit_lsn: Lsn, +} + +impl PersistedPeerInfo { + pub fn new() -> Self { + Self { + backup_lsn: Lsn::INVALID, + term: safekeeper_api::INVALID_TERM, + flush_lsn: Lsn(0), + commit_lsn: Lsn(0), + } + } +} + +// make clippy happy +impl Default for PersistedPeerInfo { + fn default() -> Self { + Self::new() + } +} + +/// Note: SafekeeperStateVn is old name for TimelinePersistentStateVn. +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct TimelinePersistentStateV9 { + #[serde(with = "hex")] + pub tenant_id: TenantId, + #[serde(with = "hex")] + pub timeline_id: TimelineId, + /// persistent acceptor state + pub acceptor_state: AcceptorState, + /// information about server + pub server: ServerInfo, + /// Unique id of the last *elected* proposer we dealt with. Not needed + /// for correctness, exists for monitoring purposes. + #[serde(with = "hex")] + pub proposer_uuid: PgUuid, + /// Since which LSN this timeline generally starts. Safekeeper might have + /// joined later. + pub timeline_start_lsn: Lsn, + /// Since which LSN safekeeper has (had) WAL for this timeline. + /// All WAL segments next to one containing local_start_lsn are + /// filled with data from the beginning. + pub local_start_lsn: Lsn, + /// Part of WAL acknowledged by quorum *and available locally*. Always points + /// to record boundary. + pub commit_lsn: Lsn, + /// LSN that points to the end of the last backed up segment. Useful to + /// persist to avoid finding out offloading progress on boot. + pub backup_lsn: Lsn, + /// Minimal LSN which may be needed for recovery of some safekeeper (end_lsn + /// of last record streamed to everyone). Persisting it helps skipping + /// recovery in walproposer, generally we compute it from peers. In + /// walproposer proto called 'truncate_lsn'. Updates are currently drived + /// only by walproposer. + pub peer_horizon_lsn: Lsn, + /// LSN of the oldest known checkpoint made by pageserver and successfully + /// pushed to s3. We don't remove WAL beyond it. Persisted only for + /// informational purposes, we receive it from pageserver (or broker). + pub remote_consistent_lsn: Lsn, + /// Peers and their state as we remember it. Knowing peers themselves is + /// fundamental; but state is saved here only for informational purposes and + /// obviously can be stale. (Currently not saved at all, but let's provision + /// place to have less file version upgrades). + pub peers: PersistedPeers, + /// Holds names of partial segments uploaded to remote storage. Used to + /// clean up old objects without leaving garbage in remote storage. + pub partial_backup: wal_backup_partial::State, + /// Eviction state of the timeline. If it's Offloaded, we should download + /// WAL files from remote storage to serve the timeline. + pub eviction_state: EvictionState, +} + pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result { // migrate to storing full term history if version == 1 { @@ -248,6 +337,7 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result Result Result Result Result Result Result Result Result Result Result Result Result SafeKeeperStateV8 { - assert!(state.eviction_state == EvictionState::Present); - SafeKeeperStateV8 { +// Used as a temp hack to make forward compatibility test work. Should be +// removed after PR adding v10 is merged. +pub fn downgrade_v10_to_v9(state: &TimelinePersistentState) -> TimelinePersistentStateV9 { + assert!(state.mconf.generation == INVALID_GENERATION); + TimelinePersistentStateV9 { tenant_id: state.tenant_id, timeline_id: state.timeline_id, acceptor_state: state.acceptor_state.clone(), @@ -426,8 +542,9 @@ pub fn downgrade_v9_to_v8(state: &TimelinePersistentState) -> SafeKeeperStateV8 backup_lsn: state.backup_lsn, peer_horizon_lsn: state.peer_horizon_lsn, remote_consistent_lsn: state.remote_consistent_lsn, - peers: state.peers.clone(), + peers: PersistedPeers(vec![]), partial_backup: state.partial_backup.clone(), + eviction_state: state.eviction_state, } } @@ -437,7 +554,7 @@ mod tests { use utils::{id::NodeId, Hex}; - use crate::safekeeper::PersistedPeerInfo; + use crate::control_file_upgrade::PersistedPeerInfo; use super::*; diff --git a/safekeeper/src/copy_timeline.rs b/safekeeper/src/copy_timeline.rs index 28ef2b1d23f1e..d0333bb402330 100644 --- a/safekeeper/src/copy_timeline.rs +++ b/safekeeper/src/copy_timeline.rs @@ -1,6 +1,7 @@ use anyhow::{bail, Result}; use camino::Utf8PathBuf; use postgres_ffi::{MAX_SEND_SIZE, WAL_SEGMENT_SIZE}; +use safekeeper_api::membership::Configuration; use std::sync::Arc; use tokio::{ fs::OpenOptions, @@ -147,8 +148,8 @@ pub async fn handle_request( let mut new_state = TimelinePersistentState::new( &request.destination_ttid, + Configuration::empty(), state.server.clone(), - vec![], request.until_lsn, start_lsn, )?; diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index 9bc1bf340919b..2a62db410052f 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -118,7 +118,13 @@ async fn timeline_create_handler(mut request: Request) -> Result NetworkReader<'_, IO> { }; let tli = self .global_timelines - .create(self.ttid, server_info, Lsn::INVALID, Lsn::INVALID) + .create( + self.ttid, + Configuration::empty(), + server_info, + Lsn::INVALID, + Lsn::INVALID, + ) .await .context("create timeline")?; tli.wal_residence_guard().await? diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index 6ceaf325b049d..f1e48a2fac2cc 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -193,36 +193,6 @@ impl AcceptorState { } } -#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] -pub struct PersistedPeerInfo { - /// LSN up to which safekeeper offloaded WAL to s3. - pub backup_lsn: Lsn, - /// Term of the last entry. - pub term: Term, - /// LSN of the last record. - pub flush_lsn: Lsn, - /// Up to which LSN safekeeper regards its WAL as committed. - pub commit_lsn: Lsn, -} - -impl PersistedPeerInfo { - pub fn new() -> Self { - Self { - backup_lsn: Lsn::INVALID, - term: INVALID_TERM, - flush_lsn: Lsn(0), - commit_lsn: Lsn(0), - } - } -} - -// make clippy happy -impl Default for PersistedPeerInfo { - fn default() -> Self { - Self::new() - } -} - // protocol messages /// Initial Proposer -> Acceptor message @@ -1025,12 +995,20 @@ where #[cfg(test)] mod tests { use futures::future::BoxFuture; + use postgres_ffi::{XLogSegNo, WAL_SEGMENT_SIZE}; - use safekeeper_api::ServerInfo; + use safekeeper_api::{ + membership::{Configuration, MemberSet, SafekeeperId}, + ServerInfo, + }; use super::*; - use crate::state::{EvictionState, PersistedPeers, TimelinePersistentState}; - use std::{ops::Deref, str::FromStr, time::Instant}; + use crate::state::{EvictionState, TimelinePersistentState}; + use std::{ + ops::Deref, + str::FromStr, + time::{Instant, UNIX_EPOCH}, + }; // fake storage for tests struct InMemoryState { @@ -1319,6 +1297,16 @@ mod tests { let state = TimelinePersistentState { tenant_id, timeline_id, + mconf: Configuration { + generation: 42, + members: MemberSet::new(vec![SafekeeperId { + id: NodeId(1), + host: "hehe.org".to_owned(), + pg_port: 5432, + }]) + .expect("duplicate member"), + new_members: None, + }, acceptor_state: AcceptorState { term: 42, term_history: TermHistory(vec![TermLsn { @@ -1342,70 +1330,13 @@ mod tests { backup_lsn: Lsn(1234567300), peer_horizon_lsn: Lsn(9999999), remote_consistent_lsn: Lsn(1234560000), - peers: PersistedPeers(vec![( - NodeId(1), - PersistedPeerInfo { - backup_lsn: Lsn(1234567000), - term: 42, - flush_lsn: Lsn(1234567800 - 8), - commit_lsn: Lsn(1234567600), - }, - )]), partial_backup: crate::wal_backup_partial::State::default(), eviction_state: EvictionState::Present, + creation_ts: UNIX_EPOCH, }; let ser = state.ser().unwrap(); - #[rustfmt::skip] - let expected = [ - // tenant_id as length prefixed hex - 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x63, 0x66, 0x30, 0x34, 0x38, 0x30, 0x39, 0x32, 0x39, 0x37, 0x30, 0x37, 0x65, 0x65, 0x37, 0x35, 0x33, 0x37, 0x32, 0x33, 0x33, 0x37, 0x65, 0x66, 0x61, 0x61, 0x35, 0x65, 0x63, 0x66, 0x39, 0x36, - // timeline_id as length prefixed hex - 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x31, 0x31, 0x32, 0x64, 0x65, 0x64, 0x36, 0x36, 0x34, 0x32, 0x32, 0x61, 0x61, 0x35, 0x65, 0x39, 0x35, 0x33, 0x65, 0x35, 0x34, 0x34, 0x30, 0x66, 0x61, 0x35, 0x34, 0x32, 0x37, 0x61, 0x63, 0x34, - // term - 0x2a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - // length prefix - 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - // unsure why this order is swapped - 0x29, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - // pg_version - 0x0e, 0x00, 0x00, 0x00, - // systemid - 0x21, 0x43, 0x65, 0x87, 0x78, 0x56, 0x34, 0x12, - // wal_seg_size - 0x78, 0x56, 0x34, 0x12, - // pguuid as length prefixed hex - 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x63, 0x34, 0x37, 0x61, 0x34, 0x32, 0x61, 0x35, 0x30, 0x66, 0x34, 0x34, 0x65, 0x35, 0x35, 0x33, 0x65, 0x39, 0x61, 0x35, 0x32, 0x61, 0x34, 0x32, 0x36, 0x36, 0x65, 0x64, 0x32, 0x64, 0x31, 0x31, - - // timeline_start_lsn - 0x00, 0x56, 0x34, 0x12, 0x00, 0x00, 0x00, 0x00, - 0x12, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x78, 0x02, 0x96, 0x49, 0x00, 0x00, 0x00, 0x00, - 0x84, 0x00, 0x96, 0x49, 0x00, 0x00, 0x00, 0x00, - 0x7f, 0x96, 0x98, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0xe4, 0x95, 0x49, 0x00, 0x00, 0x00, 0x00, - // length prefix for persistentpeers - 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - // nodeid - 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - // backuplsn - 0x58, 0xff, 0x95, 0x49, 0x00, 0x00, 0x00, 0x00, - 0x2a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x70, 0x02, 0x96, 0x49, 0x00, 0x00, 0x00, 0x00, - 0xb0, 0x01, 0x96, 0x49, 0x00, 0x00, 0x00, 0x00, - // partial_backup - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - // eviction_state - 0x00, 0x00, 0x00, 0x00, - ]; - - assert_eq!(Hex(&ser), Hex(&expected)); - let deser = TimelinePersistentState::des(&ser).unwrap(); assert_eq!(deser, state); diff --git a/safekeeper/src/state.rs b/safekeeper/src/state.rs index c6ae6c1d2b0ef..aa8517070212c 100644 --- a/safekeeper/src/state.rs +++ b/safekeeper/src/state.rs @@ -1,20 +1,22 @@ //! Defines per timeline data stored persistently (SafeKeeperPersistentState) //! and its wrapper with in memory layer (SafekeeperState). -use std::{cmp::max, ops::Deref}; +use std::{cmp::max, ops::Deref, time::SystemTime}; use anyhow::{bail, Result}; use postgres_ffi::WAL_SEGMENT_SIZE; -use safekeeper_api::{models::TimelineTermBumpResponse, ServerInfo, Term}; +use safekeeper_api::{ + membership::Configuration, models::TimelineTermBumpResponse, ServerInfo, Term, +}; use serde::{Deserialize, Serialize}; use utils::{ - id::{NodeId, TenantId, TenantTimelineId, TimelineId}, + id::{TenantId, TenantTimelineId, TimelineId}, lsn::Lsn, }; use crate::{ control_file, - safekeeper::{AcceptorState, PersistedPeerInfo, PgUuid, TermHistory, UNKNOWN_SERVER_VERSION}, + safekeeper::{AcceptorState, PgUuid, TermHistory, UNKNOWN_SERVER_VERSION}, timeline::TimelineError, wal_backup_partial::{self}, }; @@ -27,6 +29,8 @@ pub struct TimelinePersistentState { pub tenant_id: TenantId, #[serde(with = "hex")] pub timeline_id: TimelineId, + /// Membership configuration. + pub mconf: Configuration, /// persistent acceptor state pub acceptor_state: AcceptorState, /// information about server @@ -58,22 +62,15 @@ pub struct TimelinePersistentState { /// pushed to s3. We don't remove WAL beyond it. Persisted only for /// informational purposes, we receive it from pageserver (or broker). pub remote_consistent_lsn: Lsn, - /// Peers and their state as we remember it. Knowing peers themselves is - /// fundamental; but state is saved here only for informational purposes and - /// obviously can be stale. (Currently not saved at all, but let's provision - /// place to have less file version upgrades). - pub peers: PersistedPeers, /// Holds names of partial segments uploaded to remote storage. Used to /// clean up old objects without leaving garbage in remote storage. pub partial_backup: wal_backup_partial::State, /// Eviction state of the timeline. If it's Offloaded, we should download /// WAL files from remote storage to serve the timeline. pub eviction_state: EvictionState, + pub creation_ts: SystemTime, } -#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] -pub struct PersistedPeers(pub Vec<(NodeId, PersistedPeerInfo)>); - /// State of the local WAL files. Used to track current timeline state, /// that can be either WAL files are present on disk or last partial segment /// is offloaded to remote storage. @@ -89,8 +86,8 @@ pub enum EvictionState { impl TimelinePersistentState { pub fn new( ttid: &TenantTimelineId, + mconf: Configuration, server_info: ServerInfo, - peers: Vec, commit_lsn: Lsn, local_start_lsn: Lsn, ) -> anyhow::Result { @@ -113,6 +110,7 @@ impl TimelinePersistentState { Ok(TimelinePersistentState { tenant_id: ttid.tenant_id, timeline_id: ttid.timeline_id, + mconf, acceptor_state: AcceptorState { term: 0, term_history: TermHistory::empty(), @@ -125,26 +123,21 @@ impl TimelinePersistentState { backup_lsn: local_start_lsn, peer_horizon_lsn: local_start_lsn, remote_consistent_lsn: Lsn(0), - peers: PersistedPeers( - peers - .iter() - .map(|p| (*p, PersistedPeerInfo::new())) - .collect(), - ), partial_backup: wal_backup_partial::State::default(), eviction_state: EvictionState::Present, + creation_ts: SystemTime::now(), }) } pub fn empty() -> Self { TimelinePersistentState::new( &TenantTimelineId::empty(), + Configuration::empty(), ServerInfo { pg_version: 170000, /* Postgres server version (major * 10000) */ system_id: 0, /* Postgres system identifier */ wal_seg_size: WAL_SEGMENT_SIZE as u32, }, - vec![], Lsn::INVALID, Lsn::INVALID, ) diff --git a/safekeeper/src/timelines_global_map.rs b/safekeeper/src/timelines_global_map.rs index ad29c9f66c2c1..3e0af03621b46 100644 --- a/safekeeper/src/timelines_global_map.rs +++ b/safekeeper/src/timelines_global_map.rs @@ -12,6 +12,7 @@ use crate::{control_file, wal_storage, SafeKeeperConf}; use anyhow::{bail, Context, Result}; use camino::Utf8PathBuf; use camino_tempfile::Utf8TempDir; +use safekeeper_api::membership::Configuration; use safekeeper_api::ServerInfo; use serde::Serialize; use std::collections::HashMap; @@ -214,6 +215,7 @@ impl GlobalTimelines { pub(crate) async fn create( &self, ttid: TenantTimelineId, + mconf: Configuration, server_info: ServerInfo, commit_lsn: Lsn, local_start_lsn: Lsn, @@ -240,7 +242,7 @@ impl GlobalTimelines { // TODO: currently we create only cfile. It would be reasonable to // immediately initialize first WAL segment as well. let state = - TimelinePersistentState::new(&ttid, server_info, vec![], commit_lsn, local_start_lsn)?; + TimelinePersistentState::new(&ttid, mconf, server_info, commit_lsn, local_start_lsn)?; control_file::FileStorage::create_new(&tmp_dir_path, state, conf.no_sync).await?; let timeline = self.load_temp_timeline(ttid, &tmp_dir_path, true).await?; Ok(timeline) diff --git a/safekeeper/tests/walproposer_sim/safekeeper.rs b/safekeeper/tests/walproposer_sim/safekeeper.rs index efcdd89e7da75..a99de71a041b8 100644 --- a/safekeeper/tests/walproposer_sim/safekeeper.rs +++ b/safekeeper/tests/walproposer_sim/safekeeper.rs @@ -21,7 +21,7 @@ use safekeeper::{ wal_storage::Storage, SafeKeeperConf, }; -use safekeeper_api::ServerInfo; +use safekeeper_api::{membership::Configuration, ServerInfo}; use tracing::{debug, info_span, warn}; use utils::{ id::{NodeId, TenantId, TenantTimelineId, TimelineId}, @@ -96,8 +96,13 @@ impl GlobalMap { let commit_lsn = Lsn::INVALID; let local_start_lsn = Lsn::INVALID; - let state = - TimelinePersistentState::new(&ttid, server_info, vec![], commit_lsn, local_start_lsn)?; + let state = TimelinePersistentState::new( + &ttid, + Configuration::empty(), + server_info, + commit_lsn, + local_start_lsn, + )?; let disk_timeline = self.disk.put_state(&ttid, state); let control_store = DiskStateStorage::new(disk_timeline.clone());