Skip to content

Commit

Permalink
Clean up old logs at the beginning of baseline resync.
Browse files Browse the repository at this point in the history
If the follower restarts during baseline resync, it will replay the remaining logs first.
However, we have already cleared the shard info at the beginning of resync, so we cannot get the shard while replaying logs,
which will raise errors.
This change cleans up old logs in the log store to avoid this situation.
  • Loading branch information
yawzhang committed Feb 11, 2025
1 parent 3e38fa8 commit d642083
Show file tree
Hide file tree
Showing 5 changed files with 26 additions and 37 deletions.
2 changes: 1 addition & 1 deletion conanfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

class HomestoreConan(ConanFile):
name = "homestore"
version = "6.6.17"
version = "6.6.18"

homepage = "https://github.com/eBay/Homestore"
description = "HomeStore Storage Engine"
Expand Down
42 changes: 14 additions & 28 deletions src/lib/replication/log_store/home_raft_log_store.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,40 +49,26 @@ static uint64_t extract_term(const log_buffer& log_bytes) {
return (*r_cast< uint64_t const* >(raw_ptr));
}

#if 0
// Since truncate_lsn cannot cross compact_lsn passed down by raft server
// and compact will truncate logs up to compact_lsn, we don't need to re-truncate in this function now.
void HomeRaftLogStore::truncate(uint32_t num_reserved_cnt, repl_lsn_t compact_lsn) {
auto const last_lsn = last_index();
auto const start_lsn = start_index();

// compact_lsn will be zero on first time boot, so we should not truncate in that case.
if (compact_lsn == 0 || (start_lsn + num_reserved_cnt >= last_lsn)) {
void HomeRaftLogStore::truncate(uint32_t num_reserved_cnt, repl_lsn_t compact_lsn, bool force) {
auto const last_lsn = to_store_lsn(last_index());
auto const start_lsn = to_store_lsn(start_index());
auto const calculated_truncate_lsn = last_lsn - num_reserved_cnt;

// If force is true, we use the calculated truncate lsn directly and trigger the truncation both in memory and on device.
// If force is false, we use compact_lsn as a barrier to find the minimum lsn to truncate.
auto const truncate_lsn = force ? calculated_truncate_lsn : std::min(calculated_truncate_lsn, to_store_lsn(compact_lsn));
if (truncate_lsn <= start_lsn) {
REPL_STORE_LOG(DEBUG,
"Store={} LogDev={}: Skipping truncating because of reserved logs entries is not enough or "
"compact_lsn is zero. "
"Store={} LogDev={}: Skipping truncating because of reserved logs entries is not enough."
"start_lsn={}, resv_cnt={}, last_lsn={}, compact_lsn={}",
m_logstore_id, m_logdev_id, start_lsn, num_reserved_cnt, last_lsn, compact_lsn);
return;
} else {
//
// truncate_lsn cannot cross compact_lsn passed down by raft server;
//
// When will it happen:
// compact_lsn can be smaller than last_lsn - num_reserved_cnt, when raft is configured with
// snapshot_distance of a large value, and dynamic config "resvered log entries" a smaller value.
//
auto truncate_lsn = std::min(last_lsn - num_reserved_cnt, (ulong)to_store_lsn(compact_lsn));

REPL_STORE_LOG(INFO, "LogDev={}: Truncating log entries from {} to {}, compact_lsn={}, last_lsn={}",
m_logdev_id, start_lsn, truncate_lsn, compact_lsn, last_lsn);
// this will only truncate in memory.
// we rely on the resource mgr timer to trigger the real truncate for all log stores in the system;
// this is friendly to multiple logstores on the same logdev;
m_log_store->truncate(truncate_lsn);
}

REPL_STORE_LOG(INFO, "Store={} LogDev={}: Truncating log entries from {} to {}, last_lsn={}, compact_lsn={}, force={}",
m_logstore_id, m_logdev_id, start_lsn, truncate_lsn, last_lsn, compact_lsn, force);
m_log_store->truncate(truncate_lsn, !force /* in_memory_truncate_only */);
}
#endif

HomeRaftLogStore::HomeRaftLogStore(logdev_id_t logdev_id, logstore_id_t logstore_id, log_found_cb_t const& log_found_cb,
log_replay_done_cb_t const& log_replay_done_cb) :
Expand Down
8 changes: 4 additions & 4 deletions src/lib/replication/log_store/home_raft_log_store.h
Original file line number Diff line number Diff line change
Expand Up @@ -204,16 +204,16 @@ class HomeRaftLogStore : public nuraft::log_store {
*/
ulong last_index() const;

#if 0
/**
* Truncates the log store
*
* @param num_reserved_cnt The number of log entries to be reserved.
* @param compact_lsn This is the truncation barrier passed down by raft server. Truncation should not cross this
* LSN;
* LSN except in baseline resync case;
* @param force If true, the truncation will be processed both in memory and device. It is a dangerous operation
* and should be used only in baseline resync case (cleanup logs and restore by snapshot).
*/
void truncate(uint32_t num_reserved_cnt, repl_lsn_t compact_lsn);
#endif
void truncate(uint32_t num_reserved_cnt, repl_lsn_t compact_lsn, bool force);

void wait_for_log_store_ready();
void set_last_durable_lsn(repl_lsn_t lsn);
Expand Down
7 changes: 3 additions & 4 deletions src/lib/replication/repl_dev/raft_repl_dev.h
Original file line number Diff line number Diff line change
Expand Up @@ -250,16 +250,15 @@ class RaftReplDev : public ReplDev,
*/
void on_create_snapshot(nuraft::snapshot& s, nuraft::async_result< bool >::handler_type& when_done);

#if 0
/**
* Truncates the replication log by providing a specified number of reserved entries.
*
* @param num_reserved_entries The number of reserved entries of the replication log.
* @param force If true, the truncation will be processed both in memory and device.
*/
void truncate(uint32_t num_reserved_entries) {
m_data_journal->truncate(num_reserved_entries, m_compact_lsn.load());
void truncate(uint32_t num_reserved_entries, bool force = false) {
m_data_journal->truncate(num_reserved_entries, m_compact_lsn.load(), force);
}
#endif

void wait_for_logstore_ready() { m_data_journal->wait_for_log_store_ready(); }

Expand Down
4 changes: 4 additions & 0 deletions src/lib/replication/repl_dev/raft_state_machine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -355,6 +355,10 @@ void RaftStateMachine::save_logical_snp_obj(nuraft::snapshot& s, ulong& obj_id,
if (is_hs_snp_obj(obj_id)) {
// Homestore preserved msg
if (m_rd.save_snp_resync_data(data)) {
// If the follower restarts during baseline resync, it will replay the remaining logs first. However, we have
// already cleared the shard info at the beginning of resync, so we cannot get the shard while replaying logs,
// which will raise errors. Therefore, we need to clean up the old logs to avoid this situation.
m_rd.truncate(0, true /* force */);
obj_id = snp_obj_id_type_app;
LOGDEBUG("save_snp_resync_data success, next obj_id={}", obj_id);
}
Expand Down

0 comments on commit d642083

Please sign in to comment.