From 38d9cf4ca760c667d105435a714f76dbff926960 Mon Sep 17 00:00:00 2001 From: Ronen Friedman Date: Mon, 23 Sep 2024 01:59:30 -0500 Subject: [PATCH 1/8] osd/scrub: introduce ScrubStore::at_level_t to hold the caching and backend details related to the representation of scrub-detected errors as OMap entries of a uniquely-named object. In a followup commit - the ScrubStore is modified to hold two of these objects, one for the shallow errors and one for the deep errors. Signed-off-by: Ronen Friedman --- src/osd/scrubber/ScrubStore.cc | 49 +++++++++++++++++++++---------- src/osd/scrubber/ScrubStore.h | 53 +++++++++++++++++++++++++++++----- 2 files changed, 79 insertions(+), 23 deletions(-) diff --git a/src/osd/scrubber/ScrubStore.cc b/src/osd/scrubber/ScrubStore.cc index a00ab2caecee6..af223cb5cdc09 100644 --- a/src/osd/scrubber/ScrubStore.cc +++ b/src/osd/scrubber/ScrubStore.cc @@ -109,19 +109,29 @@ Store::create(ObjectStore* store, ceph_assert(t); ghobject_t oid = make_scrub_object(pgid); t->touch(coll, oid); - return new Store{coll, oid, store}; + return new Store{*store, t, pgid, coll}; +} + + +Store::Store( + ObjectStore& osd_store, + ObjectStore::Transaction* t, + const spg_t& pgid, + const coll_t& coll) + : object_store{osd_store} + , coll{coll} +{ + ceph_assert(t); + + const auto err_obj = pgid.make_temp_ghobject(fmt::format("scrub_{}", pgid)); + t->touch(coll, err_obj); + errors_db.emplace(pgid, err_obj, OSDriver{&object_store, coll, err_obj}); } -Store::Store(const coll_t& coll, const ghobject_t& oid, ObjectStore* store) - : coll(coll), - hoid(oid), - driver(store, coll, hoid), - backend(&driver) -{} Store::~Store() { - ceph_assert(results.empty()); + ceph_assert(!errors_db || errors_db->results.empty()); } void Store::add_error(int64_t pool, const inconsistent_obj_wrapper& e) @@ -131,11 +141,13 @@ void Store::add_error(int64_t pool, const inconsistent_obj_wrapper& e) void Store::add_object_error(int64_t pool, const inconsistent_obj_wrapper& e) { + const auto key = to_object_key(pool, e.object); bufferlist bl; e.encode(bl); - results[to_object_key(pool, e.object)] = bl; + errors_db->results[key] = bl; } + void Store::add_error(int64_t pool, const inconsistent_snapset_wrapper& e) { add_snap_error(pool, e); @@ -145,26 +157,28 @@ void Store::add_snap_error(int64_t pool, const inconsistent_snapset_wrapper& e) { bufferlist bl; e.encode(bl); - results[to_snap_key(pool, e.object)] = bl; + errors_db->results[to_snap_key(pool, e.object)] = bl; } bool Store::empty() const { - return results.empty(); + return errors_db->results.empty(); } void Store::flush(ObjectStore::Transaction* t) { if (t) { - OSDriver::OSTransaction txn = driver.get_transaction(t); - backend.set_keys(results, &txn); + OSDriver::OSTransaction txn = errors_db->driver.get_transaction(t); + errors_db->backend.set_keys(errors_db->results, &txn); } - results.clear(); + errors_db->results.clear(); } void Store::cleanup(ObjectStore::Transaction* t) { - t->remove(coll, hoid); + ceph_assert(t); + if (errors_db) + t->remove(coll, errors_db->errors_hoid); } std::vector @@ -195,8 +209,11 @@ Store::get_errors(const string& begin, uint64_t max_return) const { vector errors; + if (!errors_db) + return errors; + auto next = std::make_pair(begin, bufferlist{}); - while (max_return && !backend.get_next(next.first, &next)) { + while (max_return && !errors_db->backend.get_next(next.first, &next)) { if (next.first >= end) break; errors.push_back(next.second); diff --git a/src/osd/scrubber/ScrubStore.h b/src/osd/scrubber/ScrubStore.h index 567badf608b6c..949a976051e67 100644 --- a/src/osd/scrubber/ScrubStore.h +++ b/src/osd/scrubber/ScrubStore.h @@ -5,6 +5,7 @@ #define CEPH_SCRUB_RESULT_H #include "common/map_cacher.hpp" +#include "osd/osd_types_fmt.h" #include "osd/SnapMapper.h" // for OSDriver namespace librados { @@ -45,18 +46,56 @@ class Store { uint64_t max_return) const; private: - Store(const coll_t& coll, const ghobject_t& oid, ObjectStore* store); + /** + * at_level_t + * + * The machinery for caching and storing errors at a specific scrub level. + */ + struct at_level_t { + at_level_t(const spg_t& pgid, const ghobject_t& err_obj, OSDriver&& drvr) + : errors_hoid{err_obj} + , driver{std::move(drvr)} + , backend{&driver} + {} + + /// the object in the PG store, where the errors are stored + ghobject_t errors_hoid; + + /// abstracted key fetching + OSDriver driver; + + /// a K,V cache for the errors that are detected during the scrub + /// session. The errors marked for a specific object are stored as + /// an OMap entry with the object's name as the key. + MapCacher::MapCacher backend; + + /// a temp object mapping seq-id to inconsistencies + std::map results; + }; + + Store(ObjectStore& osd_store, + ObjectStore::Transaction* t, + const spg_t& pgid, + const coll_t& coll); + std::vector get_errors(const std::string& start, const std::string& end, uint64_t max_return) const; private: + /// the OSD's storage backend + ObjectStore& object_store; + + /// the collection (i.e. - the PG store) in which the errors are stored const coll_t coll; - const ghobject_t hoid; - // a temp object holding mappings from seq-id to inconsistencies found in - // scrubbing - OSDriver driver; - mutable MapCacher::MapCacher backend; - std::map results; + + /** + * the machinery (backend details, cache, etc.) for storing both levels + * of errors (note: 'optional' to allow delayed creation w/o dynamic + * allocations; and 'mutable', as the caching mechanism is used in const + * methods) + */ + mutable std::optional errors_db; + // not yet: mutable std::optional deep_db; }; } // namespace Scrub From 571e2f3c193fc0d117cfd577fe90798fc75e98fa Mon Sep 17 00:00:00 2001 From: Ronen Friedman Date: Mon, 23 Sep 2024 03:58:59 -0500 Subject: [PATCH 2/8] osd/scrub: directly create or reinit the ScrubStore The ScrubStore is now directly created or reinitialized by the Scrubber. Note that the store object is not identical to the errors DB: the errors DB is an entity in the OSD store (a collection of OMap entries in a uniquely-named object(s)), while the ScrubSTore object is a cacher and accessor for that entity. That one can be recreated or disposed of at will. We now do not recreate the ScrubStore object for every scrub. Signed-off-by: Ronen Friedman --- src/osd/scrubber/ScrubStore.cc | 41 ++++++++++++++++++++----------- src/osd/scrubber/ScrubStore.h | 42 ++++++++++++++++++++++++-------- src/osd/scrubber/pg_scrubber.cc | 43 +++++++++++++++++++++++++++------ src/osd/scrubber/pg_scrubber.h | 10 ++++++++ 4 files changed, 104 insertions(+), 32 deletions(-) diff --git a/src/osd/scrubber/ScrubStore.cc b/src/osd/scrubber/ScrubStore.cc index af223cb5cdc09..0c36be6b66b02 100644 --- a/src/osd/scrubber/ScrubStore.cc +++ b/src/osd/scrubber/ScrubStore.cc @@ -99,20 +99,6 @@ string last_snap_key(int64_t pool) namespace Scrub { -Store* -Store::create(ObjectStore* store, - ObjectStore::Transaction* t, - const spg_t& pgid, - const coll_t& coll) -{ - ceph_assert(store); - ceph_assert(t); - ghobject_t oid = make_scrub_object(pgid); - t->touch(coll, oid); - return new Store{*store, t, pgid, coll}; -} - - Store::Store( ObjectStore& osd_store, ObjectStore::Transaction* t, @@ -174,6 +160,33 @@ void Store::flush(ObjectStore::Transaction* t) errors_db->results.clear(); } + +void Store::clear_level_db( + ObjectStore::Transaction* t, + at_level_t& db) +{ + // easiest way to guarantee that the object representing the DB exists + t->touch(coll, db.errors_hoid); + + // remove all the keys in the DB + t->omap_clear(coll, db.errors_hoid); + + // restart the 'in progress' part of the MapCacher + db.backend.reset(); +} + + +void Store::reinit(ObjectStore::Transaction* t, [[maybe_unused]] scrub_level_t level) +{ + // Note: only one caller, and it creates the transaction passed to reinit(). + // No need to assert on 't' + + if (errors_db) { + clear_level_db(t, *errors_db); + } +} + + void Store::cleanup(ObjectStore::Transaction* t) { ceph_assert(t); diff --git a/src/osd/scrubber/ScrubStore.h b/src/osd/scrubber/ScrubStore.h index 949a976051e67..600905e85e8a2 100644 --- a/src/osd/scrubber/ScrubStore.h +++ b/src/osd/scrubber/ScrubStore.h @@ -20,11 +20,16 @@ namespace Scrub { class Store { public: ~Store(); - static Store* create(ObjectStore* store, - ObjectStore::Transaction* t, - const spg_t& pgid, - const coll_t& coll); + + Store(ObjectStore& osd_store, + ObjectStore::Transaction* t, + const spg_t& pgid, + const coll_t& coll); + + + /// mark down detected errors, either shallow or deep void add_object_error(int64_t pool, const inconsistent_obj_wrapper& e); + void add_snap_error(int64_t pool, const inconsistent_snapset_wrapper& e); // and a variant-friendly interface: @@ -33,8 +38,22 @@ class Store { bool empty() const; void flush(ObjectStore::Transaction*); + + /// remove both shallow and deep errors DBs. Called on interval. void cleanup(ObjectStore::Transaction*); + /** + * prepare the Store object for a new scrub session. + * This involves clearing one or both of the errors DBs, and resetting + * the cache. + * + * @param level: the scrub level to prepare for. Whenever a deep scrub + * is requested, both the shallow and deep errors DBs are cleared. + * If, on the other hand, a shallow scrub is requested, only the shallow + * errors DB is cleared. + */ + void reinit(ObjectStore::Transaction* t, scrub_level_t level); + std::vector get_snap_errors( int64_t pool, const librados::object_id_t& start, @@ -73,15 +92,9 @@ class Store { std::map results; }; - Store(ObjectStore& osd_store, - ObjectStore::Transaction* t, - const spg_t& pgid, - const coll_t& coll); - std::vector get_errors(const std::string& start, const std::string& end, uint64_t max_return) const; - private: /// the OSD's storage backend ObjectStore& object_store; @@ -96,6 +109,15 @@ class Store { */ mutable std::optional errors_db; // not yet: mutable std::optional deep_db; + + /** + * Clear the DB of errors at a specific scrub level by performing an + * omap_clear() on the DB object, and resetting the MapCacher. + */ + void clear_level_db( + ObjectStore::Transaction* t, + at_level_t& db); + }; } // namespace Scrub diff --git a/src/osd/scrubber/pg_scrubber.cc b/src/osd/scrubber/pg_scrubber.cc index 555d13ba72b2b..a085481f477ac 100644 --- a/src/osd/scrubber/pg_scrubber.cc +++ b/src/osd/scrubber/pg_scrubber.cc @@ -1183,6 +1183,7 @@ void PgScrubber::_request_scrub_map(pg_shard_t replica, m_osds->send_message_osd_cluster(replica.osd, repscrubop, get_osdmap_epoch()); } +// only called on interval change. Both DBs are to be removed. void PgScrubber::cleanup_store(ObjectStore::Transaction* t) { if (!m_store) @@ -1200,6 +1201,38 @@ void PgScrubber::cleanup_store(ObjectStore::Transaction* t) ceph_assert(!m_store); } + +void PgScrubber::reinit_scrub_store() +{ + // Entering, 0 to 3 of the following objects(*) may exist: + // ((*)'objects' here: both code objects (the ScrubStore object) and + // actual Object Store objects). + // 1. The ScrubStore object itself. + // 2,3. The two special hobjects in the coll (the PG data) holding the last + // scrub's results. <> + // + // The Store object can be deleted and recreated, as a way to guarantee + // no junk is left. We won't do it here, but we will clear the at_level_t + // structures. + // The hobjects: possibly. The shallow DB object is always cleared. The + // deep one - only if running a deep scrub. + ObjectStore::Transaction t; + if (m_store) { + dout(10) << __func__ << " reusing existing store" << dendl; + m_store->flush(&t); + } else { + dout(10) << __func__ << " creating new store" << dendl; + m_store = std::make_unique( + *m_pg->osd->store, &t, m_pg->info.pgid, m_pg->coll); + } + + // regardless of whether the ScrubStore object was recreated or reused, we need to + // (possibly) clear the actual DB objects in the Object Store. + m_store->reinit(&t, m_active_target->level()); + m_pg->osd->store->queue_transaction(m_pg->ch, std::move(t), nullptr); +} + + void PgScrubber::on_init() { // going upwards from 'inactive' @@ -1217,14 +1250,8 @@ void PgScrubber::on_init() m_is_deep ? scrub_level_t::deep : scrub_level_t::shallow, m_pg->get_actingset()); - // create a new store - { - ObjectStore::Transaction t; - cleanup_store(&t); - m_store.reset( - Scrub::Store::create(m_pg->osd->store, &t, m_pg->info.pgid, m_pg->coll)); - m_pg->osd->store->queue_transaction(m_pg->ch, std::move(t), nullptr); - } + // create or reuse the 'known errors' store + reinit_scrub_store(); m_start = m_pg->info.pgid.pgid.get_hobj_start(); m_active = true; diff --git a/src/osd/scrubber/pg_scrubber.h b/src/osd/scrubber/pg_scrubber.h index ff8c98d387ea2..1a5813bd9235c 100644 --- a/src/osd/scrubber/pg_scrubber.h +++ b/src/osd/scrubber/pg_scrubber.h @@ -771,6 +771,16 @@ class PgScrubber : public ScrubPgIF, std::unique_ptr m_store; + /** + * the ScrubStore sub-object caches and manages the database of known + * scrub errors. reinit_scrub_store() clears the database and re-initializes + * the ScrubStore object. + * + * in the next iteration - reinit_..() potentially deletes only the + * shallow errors part of the database. + */ + void reinit_scrub_store(); + int num_digest_updates_pending{0}; hobject_t m_start, m_end; ///< note: half-closed: [start,end) From ce58c88158381e252ffa432ff855a01570cc98dd Mon Sep 17 00:00:00 2001 From: Ronen Friedman Date: Mon, 23 Sep 2024 05:15:57 -0500 Subject: [PATCH 3/8] osd/scrub: add dout() capability to the ScrubStore now that the ScrubSTore object is directly created by the scrubber, (and has a lifetime that does not extend beyond the scrubber object), we can add the same dout() mechanism used by the other scrubber sub-objects. Note: that mechanism will be changed shortly, so that the sub-objects would use one prefix() creator supplied by the Scrubber object. Signed-off-by: Ronen Friedman --- src/osd/scrubber/ScrubStore.cc | 50 +++++++++++++++++++++++++++++---- src/osd/scrubber/ScrubStore.h | 20 +++++++++---- src/osd/scrubber/pg_scrubber.cc | 2 +- 3 files changed, 61 insertions(+), 11 deletions(-) diff --git a/src/osd/scrubber/ScrubStore.cc b/src/osd/scrubber/ScrubStore.cc index 0c36be6b66b02..dd141d1c38ca4 100644 --- a/src/osd/scrubber/ScrubStore.cc +++ b/src/osd/scrubber/ScrubStore.cc @@ -1,11 +1,13 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab -#include "ScrubStore.h" +#include "./ScrubStore.h" #include "osd/osd_types.h" #include "common/scrub_types.h" #include "include/rados/rados_types.hpp" +#include "pg_scrubber.h" + using std::ostringstream; using std::string; using std::vector; @@ -95,16 +97,31 @@ string last_snap_key(int64_t pool) hoid.build_hash_cache(); return "SCRUB_SS_" + hoid.to_str(); } + +} // namespace + +#undef dout_context +#define dout_context (m_scrubber.get_pg_cct()) +#define dout_subsys ceph_subsys_osd +#undef dout_prefix +#define dout_prefix _prefix_fn(_dout, this, __func__) + +template +static std::ostream& _prefix_fn(std::ostream* _dout, T* t, std::string fn = "") +{ + return t->gen_prefix(*_dout, fn); } namespace Scrub { Store::Store( + PgScrubber& scrubber, ObjectStore& osd_store, ObjectStore::Transaction* t, const spg_t& pgid, const coll_t& coll) - : object_store{osd_store} + : m_scrubber{scrubber} + , object_store{osd_store} , coll{coll} { ceph_assert(t); @@ -120,6 +137,18 @@ Store::~Store() ceph_assert(!errors_db || errors_db->results.empty()); } + +std::ostream& Store::gen_prefix(std::ostream& out, std::string_view fn) const +{ + if (fn.starts_with("operator")) { + // it's a lambda, and __func__ is not available + return m_scrubber.gen_prefix(out) << "Store::"; + } else { + return m_scrubber.gen_prefix(out) << "Store::" << fn << ": "; + } +} + + void Store::add_error(int64_t pool, const inconsistent_obj_wrapper& e) { add_object_error(pool, e); @@ -163,8 +192,11 @@ void Store::flush(ObjectStore::Transaction* t) void Store::clear_level_db( ObjectStore::Transaction* t, - at_level_t& db) + at_level_t& db, + std::string_view db_name) { + dout(20) << fmt::format("removing (omap) entries for {} error DB", db_name) + << dendl; // easiest way to guarantee that the object representing the DB exists t->touch(coll, db.errors_hoid); @@ -176,19 +208,27 @@ void Store::clear_level_db( } -void Store::reinit(ObjectStore::Transaction* t, [[maybe_unused]] scrub_level_t level) +void Store::reinit( + ObjectStore::Transaction* t, + [[maybe_unused]] scrub_level_t level) { + dout(20) << fmt::format( + "re-initializing the Scrub::Store (for {} scrub)", + (level == scrub_level_t::deep ? "deep" : "shallow")) + << dendl; + // Note: only one caller, and it creates the transaction passed to reinit(). // No need to assert on 't' if (errors_db) { - clear_level_db(t, *errors_db); + clear_level_db(t, *errors_db, "scrub"); } } void Store::cleanup(ObjectStore::Transaction* t) { + dout(20) << "discarding error DBs" << dendl; ceph_assert(t); if (errors_db) t->remove(coll, errors_db->errors_hoid); diff --git a/src/osd/scrubber/ScrubStore.h b/src/osd/scrubber/ScrubStore.h index 600905e85e8a2..a83841e2cfbb2 100644 --- a/src/osd/scrubber/ScrubStore.h +++ b/src/osd/scrubber/ScrubStore.h @@ -14,6 +14,7 @@ struct object_id_t; struct inconsistent_obj_wrapper; struct inconsistent_snapset_wrapper; +class PgScrubber; namespace Scrub { @@ -21,10 +22,12 @@ class Store { public: ~Store(); - Store(ObjectStore& osd_store, - ObjectStore::Transaction* t, - const spg_t& pgid, - const coll_t& coll); + Store( + PgScrubber& scrubber, + ObjectStore& osd_store, + ObjectStore::Transaction* t, + const spg_t& pgid, + const coll_t& coll); /// mark down detected errors, either shallow or deep @@ -64,6 +67,8 @@ class Store { const librados::object_id_t& start, uint64_t max_return) const; + std::ostream& gen_prefix(std::ostream& out, std::string_view fn) const; + private: /** * at_level_t @@ -95,6 +100,10 @@ class Store { std::vector get_errors(const std::string& start, const std::string& end, uint64_t max_return) const; + + /// access to the owning Scrubber object, for logging mostly + PgScrubber& m_scrubber; + /// the OSD's storage backend ObjectStore& object_store; @@ -116,7 +125,8 @@ class Store { */ void clear_level_db( ObjectStore::Transaction* t, - at_level_t& db); + at_level_t& db, + std::string_view db_name); }; } // namespace Scrub diff --git a/src/osd/scrubber/pg_scrubber.cc b/src/osd/scrubber/pg_scrubber.cc index a085481f477ac..81093666f91c8 100644 --- a/src/osd/scrubber/pg_scrubber.cc +++ b/src/osd/scrubber/pg_scrubber.cc @@ -1223,7 +1223,7 @@ void PgScrubber::reinit_scrub_store() } else { dout(10) << __func__ << " creating new store" << dendl; m_store = std::make_unique( - *m_pg->osd->store, &t, m_pg->info.pgid, m_pg->coll); + *this, *m_pg->osd->store, &t, m_pg->info.pgid, m_pg->coll); } // regardless of whether the ScrubStore object was recreated or reused, we need to From 283f4c258641f86d3de3431bfdfba31856387ea6 Mon Sep 17 00:00:00 2001 From: Ronen Friedman Date: Mon, 23 Sep 2024 05:25:05 -0500 Subject: [PATCH 4/8] common: extend MapCacher API to include 'no out' version of get_next() Signed-off-by: Ronen Friedman --- src/common/map_cacher.hpp | 45 +++++++++++++++++++++++++++++++++++ src/osd/scrubber/ScrubStore.h | 5 ++++ 2 files changed, 50 insertions(+) diff --git a/src/common/map_cacher.hpp b/src/common/map_cacher.hpp index 4d843be75dc64..95353425de9e6 100644 --- a/src/common/map_cacher.hpp +++ b/src/common/map_cacher.hpp @@ -16,6 +16,7 @@ #define MAPCACHER_H #include "include/Context.h" +#include "include/expected.hpp" #include "common/sharedptr_registry.hpp" namespace MapCacher { @@ -130,6 +131,50 @@ class MapCacher { return -EINVAL; } ///< @return error value, 0 on success, -ENOENT if no more entries + /// Fetch first key/value std::pair after specified key + struct PosAndData { + K last_key; + V data; + }; + using MaybePosAndData = tl::expected; + + MaybePosAndData get_1st_after_key( + K key ///< [in] key after which to get next + ) + { + ceph_assert(driver); + while (true) { + std::pair> cached; + bool got_cached = in_progress.get_next(key, &cached); + + ///\todo a driver->get_next() that returns an expected would be nice + bool got_store{false}; + std::pair store; + int r = driver->get_next(key, &store); + if (r < 0 && r != -ENOENT) { + return tl::unexpected(r); + } else if (r == 0) { + got_store = true; + } + + if (!got_cached && !got_store) { + return tl::unexpected(-ENOENT); + } else if (got_cached && (!got_store || store.first >= cached.first)) { + if (cached.second) { + return PosAndData{cached.first, *cached.second}; + } else { + key = cached.first; + continue; // value was cached as removed, recurse + } + } else { + return PosAndData{store.first, store.second}; + } + } + ceph_abort(); // not reachable + return tl::unexpected(-EINVAL); + } + + /// Adds operation setting keys to Transaction void set_keys( const std::map &keys, ///< [in] keys/values to std::set diff --git a/src/osd/scrubber/ScrubStore.h b/src/osd/scrubber/ScrubStore.h index a83841e2cfbb2..7d590d2d1915e 100644 --- a/src/osd/scrubber/ScrubStore.h +++ b/src/osd/scrubber/ScrubStore.h @@ -97,6 +97,11 @@ class Store { std::map results; }; + using CacherPosData = + MapCacher::MapCacher::PosAndData; + using ExpCacherPosData = tl::expected; + + std::vector get_errors(const std::string& start, const std::string& end, uint64_t max_return) const; From 031580fb662f35daacf61a8aa2a4b4f3b32b7b6b Mon Sep 17 00:00:00 2001 From: Ronen Friedman Date: Mon, 23 Sep 2024 08:51:22 -0500 Subject: [PATCH 5/8] common/scrub,osd/scrub: minor cleanups to ScrubStore Including: - introducing 'no out param' encode() for the inconsistent wrappers; - renaming the ambiguous 'empty()' to 'is_empty()'; - removing unused code; - a few other minor cleanups. Signed-off-by: Ronen Friedman --- src/common/scrub_types.cc | 14 ++++++++ src/common/scrub_types.h | 2 ++ src/osd/scrubber/ScrubStore.cc | 61 +++++++++++----------------------- src/osd/scrubber/ScrubStore.h | 18 ++++------ 4 files changed, 42 insertions(+), 53 deletions(-) diff --git a/src/common/scrub_types.cc b/src/common/scrub_types.cc index b03a3cab70c84..4b4d191e09c39 100644 --- a/src/common/scrub_types.cc +++ b/src/common/scrub_types.cc @@ -161,6 +161,13 @@ void inconsistent_obj_wrapper::encode(bufferlist& bl) const ENCODE_FINISH(bl); } +bufferlist inconsistent_obj_wrapper::encode() const +{ + bufferlist bl; + encode(bl); + return bl; +} + void inconsistent_obj_wrapper::decode(bufferlist::const_iterator& bp) { DECODE_START(2, bp); @@ -240,6 +247,13 @@ void inconsistent_snapset_wrapper::encode(bufferlist& bl) const ENCODE_FINISH(bl); } +bufferlist inconsistent_snapset_wrapper::encode() const +{ + bufferlist bl; + encode(bl); + return bl; +} + void inconsistent_snapset_wrapper::decode(bufferlist::const_iterator& bp) { DECODE_START(2, bp); diff --git a/src/common/scrub_types.h b/src/common/scrub_types.h index dd206f56f6035..d86fc12b6c8cf 100644 --- a/src/common/scrub_types.h +++ b/src/common/scrub_types.h @@ -152,6 +152,7 @@ struct inconsistent_obj_wrapper : librados::inconsistent_obj_t { const pg_shard_t &primary); void set_version(uint64_t ver) { version = ver; } void encode(ceph::buffer::list& bl) const; + ceph::buffer::list encode() const; void decode(ceph::buffer::list::const_iterator& bp); }; @@ -181,6 +182,7 @@ struct inconsistent_snapset_wrapper : public librados::inconsistent_snapset_t { void set_size_mismatch(); void encode(ceph::buffer::list& bl) const; + ceph::buffer::list encode() const; void decode(ceph::buffer::list::const_iterator& bp); }; diff --git a/src/osd/scrubber/ScrubStore.cc b/src/osd/scrubber/ScrubStore.cc index dd141d1c38ca4..033ea6b24dfd4 100644 --- a/src/osd/scrubber/ScrubStore.cc +++ b/src/osd/scrubber/ScrubStore.cc @@ -15,21 +15,9 @@ using std::vector; using ceph::bufferlist; namespace { -ghobject_t make_scrub_object(const spg_t& pgid) -{ - ostringstream ss; - ss << "scrub_" << pgid; - return pgid.make_temp_ghobject(ss.str()); -} - string first_object_key(int64_t pool) { - auto hoid = hobject_t(object_t(), - "", - 0, - 0x00000000, - pool, - ""); + auto hoid = hobject_t(object_t(), "", CEPH_NOSNAP, 0x00000000, pool, ""); hoid.build_hash_cache(); return "SCRUB_OBJ_" + hoid.to_str(); } @@ -49,12 +37,7 @@ string to_object_key(int64_t pool, const librados::object_id_t& oid) string last_object_key(int64_t pool) { - auto hoid = hobject_t(object_t(), - "", - 0, - 0xffffffff, - pool, - ""); + auto hoid = hobject_t(object_t(), "", CEPH_NOSNAP, 0xffffffff, pool, ""); hoid.build_hash_cache(); return "SCRUB_OBJ_" + hoid.to_str(); } @@ -62,14 +45,9 @@ string last_object_key(int64_t pool) string first_snap_key(int64_t pool) { // scrub object is per spg_t object, so we can misuse the hash (pg.seed) for - // the representing the minimal and maximum keys. and this relies on how + // representing the minimal and maximum keys. and this relies on how // hobject_t::to_str() works: hex(pool).hex(revhash). - auto hoid = hobject_t(object_t(), - "", - 0, - 0x00000000, - pool, - ""); + auto hoid = hobject_t(object_t(), "", 0, 0x00000000, pool, ""); hoid.build_hash_cache(); return "SCRUB_SS_" + hoid.to_str(); } @@ -88,12 +66,7 @@ string to_snap_key(int64_t pool, const librados::object_id_t& oid) string last_snap_key(int64_t pool) { - auto hoid = hobject_t(object_t(), - "", - 0, - 0xffffffff, - pool, - ""); + auto hoid = hobject_t(object_t(), "", 0, 0xffffffff, pool, ""); hoid.build_hash_cache(); return "SCRUB_SS_" + hoid.to_str(); } @@ -168,22 +141,23 @@ void Store::add_error(int64_t pool, const inconsistent_snapset_wrapper& e) add_snap_error(pool, e); } + void Store::add_snap_error(int64_t pool, const inconsistent_snapset_wrapper& e) { - bufferlist bl; - e.encode(bl); - errors_db->results[to_snap_key(pool, e.object)] = bl; + errors_db->results[to_snap_key(pool, e.object)] = e.encode(); } -bool Store::empty() const + +bool Store::is_empty() const { - return errors_db->results.empty(); + return !errors_db || errors_db->results.empty(); } + void Store::flush(ObjectStore::Transaction* t) { if (t) { - OSDriver::OSTransaction txn = errors_db->driver.get_transaction(t); + auto txn = errors_db->driver.get_transaction(t); errors_db->backend.set_keys(errors_db->results, &txn); } errors_db->results.clear(); @@ -234,10 +208,11 @@ void Store::cleanup(ObjectStore::Transaction* t) t->remove(coll, errors_db->errors_hoid); } -std::vector -Store::get_snap_errors(int64_t pool, - const librados::object_id_t& start, - uint64_t max_return) const + +std::vector Store::get_snap_errors( + int64_t pool, + const librados::object_id_t& start, + uint64_t max_return) const { const string begin = (start.name.empty() ? first_snap_key(pool) : to_snap_key(pool, start)); @@ -272,6 +247,8 @@ Store::get_errors(const string& begin, errors.push_back(next.second); max_return--; } + + dout(10) << fmt::format("{} errors reported", errors.size()) << dendl; return errors; } diff --git a/src/osd/scrubber/ScrubStore.h b/src/osd/scrubber/ScrubStore.h index 7d590d2d1915e..9eb77ab667db7 100644 --- a/src/osd/scrubber/ScrubStore.h +++ b/src/osd/scrubber/ScrubStore.h @@ -1,8 +1,6 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab - -#ifndef CEPH_SCRUB_RESULT_H -#define CEPH_SCRUB_RESULT_H +#pragma once #include "common/map_cacher.hpp" #include "osd/osd_types_fmt.h" @@ -39,7 +37,7 @@ class Store { void add_error(int64_t pool, const inconsistent_obj_wrapper& e); void add_error(int64_t pool, const inconsistent_snapset_wrapper& e); - bool empty() const; + [[nodiscard]] bool is_empty() const; void flush(ObjectStore::Transaction*); /// remove both shallow and deep errors DBs. Called on interval. @@ -101,11 +99,6 @@ class Store { MapCacher::MapCacher::PosAndData; using ExpCacherPosData = tl::expected; - - std::vector get_errors(const std::string& start, - const std::string& end, - uint64_t max_return) const; - /// access to the owning Scrubber object, for logging mostly PgScrubber& m_scrubber; @@ -124,6 +117,11 @@ class Store { mutable std::optional errors_db; // not yet: mutable std::optional deep_db; + std::vector get_errors( + const std::string& start, + const std::string& end, + uint64_t max_return) const; + /** * Clear the DB of errors at a specific scrub level by performing an * omap_clear() on the DB object, and resetting the MapCacher. @@ -135,5 +133,3 @@ class Store { }; } // namespace Scrub - -#endif // CEPH_SCRUB_RESULT_H From daf848fa5afcf4ad86388eade472d2c3a4873826 Mon Sep 17 00:00:00 2001 From: Ronen Friedman Date: Mon, 23 Sep 2024 23:09:51 -0500 Subject: [PATCH 6/8] osd/scrub: separate shallow vs deep errors storage The ScrubStore now holds two ScrubStore::at_level_t objects, one for the shallow errors and one for the deep errors. The shallow errors DB is recreated at the start of every scrub, while the deep errors DB is only recreated at the start of a deep scrub. When queried by the operator for known scrub errors, the ScrubStore will return the union of the errors from both DBs. Signed-off-by: Ronen Friedman --- src/osd/scrubber/ScrubStore.cc | 285 +++++++++++++++++++++++++++----- src/osd/scrubber/ScrubStore.h | 42 ++++- src/osd/scrubber/pg_scrubber.cc | 2 +- 3 files changed, 285 insertions(+), 44 deletions(-) diff --git a/src/osd/scrubber/ScrubStore.cc b/src/osd/scrubber/ScrubStore.cc index 033ea6b24dfd4..9c680da0de16f 100644 --- a/src/osd/scrubber/ScrubStore.cc +++ b/src/osd/scrubber/ScrubStore.cc @@ -99,15 +99,30 @@ Store::Store( { ceph_assert(t); - const auto err_obj = pgid.make_temp_ghobject(fmt::format("scrub_{}", pgid)); - t->touch(coll, err_obj); - errors_db.emplace(pgid, err_obj, OSDriver{&object_store, coll, err_obj}); + // shallow errors DB object + const auto sh_err_obj = + pgid.make_temp_ghobject(fmt::format("scrub_{}", pgid)); + t->touch(coll, sh_err_obj); + shallow_db.emplace( + pgid, sh_err_obj, OSDriver{&object_store, coll, sh_err_obj}); + + // and the DB for deep errors + const auto dp_err_obj = + pgid.make_temp_ghobject(fmt::format("deep_scrub_{}", pgid)); + t->touch(coll, dp_err_obj); + deep_db.emplace(pgid, dp_err_obj, OSDriver{&object_store, coll, dp_err_obj}); + + dout(20) << fmt::format( + "created Scrub::Store for pg[{}], shallow: {}, deep: {}", + pgid, sh_err_obj, dp_err_obj) + << dendl; } Store::~Store() { - ceph_assert(!errors_db || errors_db->results.empty()); + ceph_assert(!shallow_db || shallow_db->results.empty()); + ceph_assert(!deep_db || deep_db->results.empty()); } @@ -127,12 +142,49 @@ void Store::add_error(int64_t pool, const inconsistent_obj_wrapper& e) add_object_error(pool, e); } +namespace { + +inconsistent_obj_wrapper create_filtered_copy( + const inconsistent_obj_wrapper& obj, + uint64_t obj_err_mask, + uint64_t shard_err_mask) +{ + inconsistent_obj_wrapper dup = obj; + dup.errors &= obj_err_mask; + for (auto& [shard, si] : dup.shards) { + si.errors &= shard_err_mask; + } + return dup; +} + +} // namespace + + void Store::add_object_error(int64_t pool, const inconsistent_obj_wrapper& e) { const auto key = to_object_key(pool, e.object); - bufferlist bl; - e.encode(bl); - errors_db->results[key] = bl; + dout(20) << fmt::format( + "adding error for object {} ({}). Errors: {} ({}/{}) wr:{}", + e.object, key, librados::err_t{e.errors}, + librados::err_t{e.errors & librados::err_t::SHALLOW_ERRORS}, + librados::err_t{e.errors & librados::err_t::DEEP_ERRORS}, e) + << dendl; + + // divide the errors & shard errors into shallow and deep. + { + bufferlist bl; + create_filtered_copy( + e, librados::obj_err_t::SHALLOW_ERRORS, librados::err_t::SHALLOW_ERRORS) + .encode(bl); + shallow_db->results[key] = bl; + } + { + bufferlist bl; + create_filtered_copy( + e, librados::obj_err_t::DEEP_ERRORS, librados::err_t::DEEP_ERRORS) + .encode(bl); + deep_db->results[key] = bl; + } } @@ -144,23 +196,29 @@ void Store::add_error(int64_t pool, const inconsistent_snapset_wrapper& e) void Store::add_snap_error(int64_t pool, const inconsistent_snapset_wrapper& e) { - errors_db->results[to_snap_key(pool, e.object)] = e.encode(); + // note: snap errors are only placed in the shallow store + shallow_db->results[to_snap_key(pool, e.object)] = e.encode(); } bool Store::is_empty() const { - return !errors_db || errors_db->results.empty(); + return (!shallow_db || shallow_db->results.empty()) && + (!deep_db || deep_db->results.empty()); } void Store::flush(ObjectStore::Transaction* t) { if (t) { - auto txn = errors_db->driver.get_transaction(t); - errors_db->backend.set_keys(errors_db->results, &txn); + auto txn = shallow_db->driver.get_transaction(t); + shallow_db->backend.set_keys(shallow_db->results, &txn); + txn = deep_db->driver.get_transaction(t); + deep_db->backend.set_keys(deep_db->results, &txn); } - errors_db->results.clear(); + + shallow_db->results.clear(); + deep_db->results.clear(); } @@ -184,18 +242,23 @@ void Store::clear_level_db( void Store::reinit( ObjectStore::Transaction* t, - [[maybe_unused]] scrub_level_t level) + scrub_level_t level) { + // Note: only one caller, and it creates the transaction passed to reinit(). + // No need to assert on 't' dout(20) << fmt::format( "re-initializing the Scrub::Store (for {} scrub)", (level == scrub_level_t::deep ? "deep" : "shallow")) << dendl; - // Note: only one caller, and it creates the transaction passed to reinit(). - // No need to assert on 't' - - if (errors_db) { - clear_level_db(t, *errors_db, "scrub"); + // always clear the known shallow errors DB (as both shallow and deep scrubs + // would recreate it) + if (shallow_db) { + clear_level_db(t, *shallow_db, "shallow"); + } + // only a deep scrub recreates the deep errors DB + if (level == scrub_level_t::deep && deep_db) { + clear_level_db(t, *deep_db, "deep"); } } @@ -204,8 +267,10 @@ void Store::cleanup(ObjectStore::Transaction* t) { dout(20) << "discarding error DBs" << dendl; ceph_assert(t); - if (errors_db) - t->remove(coll, errors_db->errors_hoid); + if (shallow_db) + t->remove(coll, shallow_db->errors_hoid); + if (deep_db) + t->remove(coll, deep_db->errors_hoid); } @@ -214,42 +279,180 @@ std::vector Store::get_snap_errors( const librados::object_id_t& start, uint64_t max_return) const { - const string begin = (start.name.empty() ? - first_snap_key(pool) : to_snap_key(pool, start)); + vector errors; + const string begin = + (start.name.empty() ? first_snap_key(pool) : to_snap_key(pool, start)); const string end = last_snap_key(pool); - return get_errors(begin, end, max_return); + + // the snap errors are stored only in the shallow store + ExpCacherPosData latest_sh = shallow_db->backend.get_1st_after_key(begin); + + while (max_return-- && latest_sh.has_value() && latest_sh->last_key < end) { + errors.push_back(latest_sh->data); + latest_sh = shallow_db->backend.get_1st_after_key(latest_sh->last_key); + } + + return errors; } -std::vector -Store::get_object_errors(int64_t pool, - const librados::object_id_t& start, - uint64_t max_return) const + +std::vector Store::get_object_errors( + int64_t pool, + const librados::object_id_t& start, + uint64_t max_return) const { - const string begin = (start.name.empty() ? - first_object_key(pool) : to_object_key(pool, start)); + const string begin = + (start.name.empty() ? first_object_key(pool) + : to_object_key(pool, start)); const string end = last_object_key(pool); + dout(20) << fmt::format("fetching errors, from {} to {}", begin, end) + << dendl; return get_errors(begin, end, max_return); } -std::vector -Store::get_errors(const string& begin, - const string& end, - uint64_t max_return) const + +inline void decode( + librados::inconsistent_obj_t& obj, + ceph::buffer::list::const_iterator& bp) { + reinterpret_cast(obj).decode(bp); +} + + +inconsistent_obj_wrapper decode_wrapper( + hobject_t obj, + ceph::buffer::list::const_iterator bp) +{ + inconsistent_obj_wrapper iow{obj}; + iow.decode(bp); + return iow; +} + + +void Store::collect_specific_store( + MapCacher::MapCacher& backend, + Store::ExpCacherPosData& latest, + std::vector& errors, + std::string_view end_key, + uint64_t max_return) const +{ + while (max_return-- && latest.has_value() && + latest.value().last_key < end_key) { + errors.push_back(latest->data); + latest = backend.get_1st_after_key(latest->last_key); + } +} + + +bufferlist Store::merge_encoded_error_wrappers( + hobject_t obj, + ExpCacherPosData& latest_sh, + ExpCacherPosData& latest_dp) const +{ + // decode both error wrappers + auto sh_wrap = decode_wrapper(obj, latest_sh->data.cbegin()); + auto dp_wrap = decode_wrapper(obj, latest_dp->data.cbegin()); + dout(20) << fmt::format( + "merging errors {}. Shallow: {}-({}), Deep: {}-({})", + sh_wrap.object, sh_wrap.errors, dp_wrap.errors, sh_wrap, + dp_wrap) + << dendl; + + // merge the object errors (a simple OR of the two error bit-sets) + sh_wrap.errors |= dp_wrap.errors; + + // merge the two shard error maps + for (const auto& [shard, si] : dp_wrap.shards) { + dout(20) << fmt::format( + "shard {} dp-errors: {} sh-errors:{}", shard, si.errors, + sh_wrap.shards[shard].errors) + << dendl; + // note: we may be creating the shallow shard entry here. This is OK + sh_wrap.shards[shard].errors |= si.errors; + } + + return sh_wrap.encode(); +} + + +// a better way to implement get_errors(): use two generators, one for each store. +// and sort-merge the results. Almost like a merge-sort, but with equal +// keys combined. 'todo' once 'ranges' are really working. + +std::vector Store::get_errors( + const std::string& from_key, + const std::string& end_key, + uint64_t max_return) const +{ + // merge the input from the two sorted DBs into 'errors' (until + // enough errors are collected) vector errors; - if (!errors_db) - return errors; + dout(20) << fmt::format("getting errors from {} to {}", from_key, end_key) + << dendl; - auto next = std::make_pair(begin, bufferlist{}); - while (max_return && !errors_db->backend.get_next(next.first, &next)) { - if (next.first >= end) + ceph_assert(shallow_db); + ceph_assert(deep_db); + ExpCacherPosData latest_sh = shallow_db->backend.get_1st_after_key(from_key); + ExpCacherPosData latest_dp = deep_db->backend.get_1st_after_key(from_key); + + while (max_return) { + dout(20) << fmt::format( + "n:{} latest_sh: {}, latest_dp: {}", max_return, + (latest_sh ? latest_sh->last_key : "(none)"), + (latest_dp ? latest_dp->last_key : "(none)")) + << dendl; + + // keys not smaller than end_key are not interesting + if (latest_sh.has_value() && latest_sh->last_key >= end_key) { + latest_sh = tl::unexpected(-EINVAL); + } + if (latest_dp.has_value() && latest_dp->last_key >= end_key) { + latest_dp = tl::unexpected(-EINVAL); + } + + if (!latest_sh && !latest_dp) { + // both stores are exhausted + break; + } + if (!latest_sh.has_value()) { + // continue with the deep store + dout(10) << fmt::format("collecting from deep store") << dendl; + collect_specific_store( + deep_db->backend, latest_dp, errors, end_key, max_return); break; - errors.push_back(next.second); + } + if (!latest_dp.has_value()) { + // continue with the shallow store + dout(10) << fmt::format("collecting from shallow store") << dendl; + collect_specific_store( + shallow_db->backend, latest_sh, errors, end_key, max_return); + break; + } + + // we have results from both stores. Select the one with a lower key. + // If the keys are equal, combine the errors. + if (latest_sh->last_key == latest_dp->last_key) { + auto bl = merge_encoded_error_wrappers( + shallow_db->errors_hoid.hobj, latest_sh, latest_dp); + errors.push_back(bl); + latest_sh = shallow_db->backend.get_1st_after_key(latest_sh->last_key); + latest_dp = deep_db->backend.get_1st_after_key(latest_dp->last_key); + + } else if (latest_sh->last_key < latest_dp->last_key) { + dout(20) << fmt::format("shallow store element ({})", latest_sh->last_key) + << dendl; + errors.push_back(latest_sh->data); + latest_sh = shallow_db->backend.get_1st_after_key(latest_sh->last_key); + } else { + dout(20) << fmt::format("deep store element ({})", latest_dp->last_key) + << dendl; + errors.push_back(latest_dp->data); + latest_dp = deep_db->backend.get_1st_after_key(latest_dp->last_key); + } max_return--; } dout(10) << fmt::format("{} errors reported", errors.size()) << dendl; return errors; } - -} // namespace Scrub +} // namespace Scrub diff --git a/src/osd/scrubber/ScrubStore.h b/src/osd/scrubber/ScrubStore.h index 9eb77ab667db7..8a30e8daf8569 100644 --- a/src/osd/scrubber/ScrubStore.h +++ b/src/osd/scrubber/ScrubStore.h @@ -16,6 +16,28 @@ class PgScrubber; namespace Scrub { +/** + * Storing errors detected during scrubbing. + * + * From both functional and internal perspectives, the store is a pair of key-value + * databases: one maps objects to shallow errors detected during their scrubbing, + * and other stores deep errors. + * Note that the first store is updated in both shallow and in deep scrubs. The + * second - only while deep scrubbing. + * + * The DBs can be consulted by the operator, when trying to list 'errors known + * at this point in time'. Whenever a scrub starts - the relevant entries in the + * DBs are removed. Specifically - the shallow errors DB is recreated each scrub, + * while the deep errors DB is recreated only when a deep scrub starts. + * + * When queried - the data from both DBs is merged for each named object, and + * returned to the operator. + * + * Implementation: + * Each of the two DBs is implemented as OMAP entries of a single, uniquely named, + * object. Both DBs are cached using the general KV Cache mechanism. + */ + class Store { public: ~Store(); @@ -114,14 +136,21 @@ class Store { * allocations; and 'mutable', as the caching mechanism is used in const * methods) */ - mutable std::optional errors_db; - // not yet: mutable std::optional deep_db; + mutable std::optional shallow_db; + mutable std::optional deep_db; std::vector get_errors( const std::string& start, const std::string& end, uint64_t max_return) const; + void collect_specific_store( + MapCacher::MapCacher& backend, + ExpCacherPosData& latest, + std::vector& errors, + std::string_view end_key, + uint64_t max_return) const; + /** * Clear the DB of errors at a specific scrub level by performing an * omap_clear() on the DB object, and resetting the MapCacher. @@ -131,5 +160,14 @@ class Store { at_level_t& db, std::string_view db_name); + /** + * merge the two error wrappers - fetched from both DBs for the same object. + * Specifically, the object errors are or'ed, and so are the per-shard + * entries. + */ + bufferlist merge_encoded_error_wrappers( + hobject_t obj, + ExpCacherPosData& latest_sh, + ExpCacherPosData& latest_dp) const; }; } // namespace Scrub diff --git a/src/osd/scrubber/pg_scrubber.cc b/src/osd/scrubber/pg_scrubber.cc index 81093666f91c8..594ffb15e2b5b 100644 --- a/src/osd/scrubber/pg_scrubber.cc +++ b/src/osd/scrubber/pg_scrubber.cc @@ -1209,7 +1209,7 @@ void PgScrubber::reinit_scrub_store() // actual Object Store objects). // 1. The ScrubStore object itself. // 2,3. The two special hobjects in the coll (the PG data) holding the last - // scrub's results. <> + // scrub's results. // // The Store object can be deleted and recreated, as a way to guarantee // no junk is left. We won't do it here, but we will clear the at_level_t From 47ef574bee6fc43850e9da9c0b9b6c4a34d58dae Mon Sep 17 00:00:00 2001 From: Ronen Friedman Date: Mon, 7 Oct 2024 01:49:18 -0500 Subject: [PATCH 7/8] qa/standalone/scrub: test new ScrubStore implementation The ScrubStore is now comprised of two separate data structures, one for shallow errors and one for deep. A new test is added to verify the main objective of that design change: shallow scrubs should not overwrite deep scrub data. Signed-off-by: Ronen Friedman --- qa/standalone/scrub/osd-scrub-repair.sh | 249 +++++++++++++++++++++++- 1 file changed, 248 insertions(+), 1 deletion(-) diff --git a/qa/standalone/scrub/osd-scrub-repair.sh b/qa/standalone/scrub/osd-scrub-repair.sh index 59564f7e37e28..491e46603f72e 100755 --- a/qa/standalone/scrub/osd-scrub-repair.sh +++ b/qa/standalone/scrub/osd-scrub-repair.sh @@ -442,7 +442,6 @@ function TEST_auto_repair_bluestore_basic() { ['pool_name']="testpool" ['extras']=" --osd_scrub_auto_repair=true" ) - local extr_dbg=3 standard_scrub_cluster $dir cluster_conf local poolid=${cluster_conf['pool_id']} local poolname=${cluster_conf['pool_name']} @@ -6252,6 +6251,254 @@ function TEST_request_scrub_priority() { grep "log_channel.*scrub ok" $dir/osd.${primary}.log | grep -v purged_snaps | head -1 | sed 's/.*[[]DBG[]]//' | grep -q $pg || return 1 } +# +# Testing the "split scrub store" feature: shallow scrubs do not +# purge deep errors from the store. +# +# Corrupt one copy of a replicated pool, creating both shallow and deep errors. +# Then shallow-scrub the pool and verify that the deep errors are still present. +# +function TEST_dual_store_replicated_cluster() { + local dir=$1 + local poolname=csr_pool + local total_objs=19 + local extr_dbg=1 # note: 3 and above leave some temp files around + + run_mon $dir a --osd_pool_default_size=2 || return 1 + run_mgr $dir x --mgr_stats_period=1 || return 1 + local ceph_osd_args="--osd-scrub-interval-randomize-ratio=0 --osd-deep-scrub-randomize-ratio=0 " + ceph_osd_args+="--osd_scrub_backoff_ratio=0 --osd_stats_update_period_not_scrubbing=3 " + ceph_osd_args+="--osd_stats_update_period_scrubbing=2 --osd_op_queue=wpq --osd_scrub_auto_repair=0 " + for osd in $(seq 0 1) + do + run_osd $dir $osd $ceph_osd_args || return 1 + done + + create_rbd_pool || return 1 + wait_for_clean || return 1 + + create_pool foo 1 || return 1 + create_pool $poolname 1 1 || return 1 + wait_for_clean || return 1 + + ceph osd pool set $poolname noscrub 1 + ceph osd pool set $poolname nodeep-scrub 1 + + for i in $(seq 1 $total_objs) ; do + objname=ROBJ${i} + add_something $dir $poolname $objname || return 1 + + rados --pool $poolname setomapheader $objname hdr-$objname || return 1 + rados --pool $poolname setomapval $objname key-$objname val-$objname || return 1 + done + + # Increase file 1 MB + 1KB + dd if=/dev/zero of=$dir/new.ROBJ19 bs=1024 count=1025 + rados --pool $poolname put $objname $dir/new.ROBJ19 || return 1 + rm -f $dir/new.ROBJ19 + + local pg=$(get_pg $poolname ROBJ0) + local primary=$(get_primary $poolname ROBJ0) + + # Compute an old omap digest and save oi + CEPH_ARGS='' ceph daemon $(get_asok_path osd.0) \ + config set osd_deep_scrub_update_digest_min_age 0 + CEPH_ARGS='' ceph daemon $(get_asok_path osd.1) \ + config set osd_deep_scrub_update_digest_min_age 0 + pg_deep_scrub $pg + + for i in $(seq 1 $total_objs) ; do + objname=ROBJ${i} + + # Alternate corruption between osd.0 and osd.1 + local osd=$(expr $i % 2) + + case $i in + 1) + # Size (deep scrub data_digest too) + local payload=UVWXYZZZ + echo $payload > $dir/CORRUPT + objectstore_tool $dir $osd $objname set-bytes $dir/CORRUPT || return 1 + ;; + + 2) + # digest (deep scrub only) + local payload=UVWXYZ + echo $payload > $dir/CORRUPT + objectstore_tool $dir $osd $objname set-bytes $dir/CORRUPT || return 1 + ;; + + 3) + # missing + objectstore_tool $dir $osd $objname remove || return 1 + ;; + + 4) + # Modify omap value (deep scrub only) + objectstore_tool $dir $osd $objname set-omap key-$objname $dir/CORRUPT || return 1 + ;; + + 5) + # Delete omap key (deep scrub only) + objectstore_tool $dir $osd $objname rm-omap key-$objname || return 1 + ;; + + 6) + # Add extra omap key (deep scrub only) + echo extra > $dir/extra-val + objectstore_tool $dir $osd $objname set-omap key2-$objname $dir/extra-val || return 1 + rm $dir/extra-val + ;; + + 7) + # Modify omap header (deep scrub only) + echo -n newheader > $dir/hdr + objectstore_tool $dir $osd $objname set-omaphdr $dir/hdr || return 1 + rm $dir/hdr + ;; + + 8) + rados --pool $poolname setxattr $objname key1-$objname val1-$objname || return 1 + rados --pool $poolname setxattr $objname key2-$objname val2-$objname || return 1 + + # Break xattrs + echo -n bad-val > $dir/bad-val + objectstore_tool $dir $osd $objname set-attr _key1-$objname $dir/bad-val || return 1 + objectstore_tool $dir $osd $objname rm-attr _key2-$objname || return 1 + echo -n val3-$objname > $dir/newval + objectstore_tool $dir $osd $objname set-attr _key3-$objname $dir/newval || return 1 + rm $dir/bad-val $dir/newval + ;; + + 9) + objectstore_tool $dir $osd $objname get-attr _ > $dir/robj9-oi + echo -n D > $dir/change + rados --pool $poolname put $objname $dir/change + objectstore_tool $dir $osd $objname set-attr _ $dir/robj9-oi + rm $dir/oi $dir/change + ;; + + # ROBJ10 must be handled after digests are re-computed by a deep scrub below + # ROBJ11 must be handled with config change before deep scrub + # ROBJ12 must be handled with config change before scrubs + # ROBJ13 must be handled before scrubs + + 14) + echo -n bad-val > $dir/bad-val + objectstore_tool $dir 0 $objname set-attr _ $dir/bad-val || return 1 + objectstore_tool $dir 1 $objname rm-attr _ || return 1 + rm $dir/bad-val + ;; + + 15) + objectstore_tool $dir $osd $objname rm-attr _ || return 1 + ;; + + 16) + objectstore_tool $dir 0 $objname rm-attr snapset || return 1 + echo -n bad-val > $dir/bad-val + objectstore_tool $dir 1 $objname set-attr snapset $dir/bad-val || return 1 + ;; + + 17) + # Deep-scrub only (all replicas are diffent than the object info + local payload=ROBJ17 + echo $payload > $dir/new.ROBJ17 + objectstore_tool $dir 0 $objname set-bytes $dir/new.ROBJ17 || return 1 + objectstore_tool $dir 1 $objname set-bytes $dir/new.ROBJ17 || return 1 + ;; + + 18) + # Deep-scrub only (all replicas are diffent than the object info + local payload=ROBJ18 + echo $payload > $dir/new.ROBJ18 + objectstore_tool $dir 0 $objname set-bytes $dir/new.ROBJ18 || return 1 + objectstore_tool $dir 1 $objname set-bytes $dir/new.ROBJ18 || return 1 + # Make one replica have a different object info, so a full repair must happen too + objectstore_tool $dir $osd $objname corrupt-info || return 1 + ;; + + 19) + # Set osd-max-object-size smaller than this object's size + + esac + done + + local pg=$(get_pg $poolname ROBJ0) + + ceph tell osd.\* injectargs -- --osd-max-object-size=1048576 + + inject_eio rep data $poolname ROBJ11 $dir 0 || return 1 # shard 0 of [1, 0], osd.1 + inject_eio rep mdata $poolname ROBJ12 $dir 1 || return 1 # shard 1 of [1, 0], osd.0 + inject_eio rep data $poolname ROBJ13 $dir 0 || return 1 # shard 0 of [1, 0], osd.1 + + # first sequence: the final shallow scrub should not override any of the deep errors + pg_scrub $pg + (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | python3 -c "$sortkeys" | jq '.' > /tmp/WQR_1.json + pg_scrub $pg + (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | python3 -c "$sortkeys" | jq '.' > /tmp/WQR_1b.json + rados list-inconsistent-obj $pg | jq "$jqfilter" | jq '.inconsistents' | python3 -c "$sortkeys" > $dir/sh1_results.json + (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | jq "$jqfilter" | jq '.inconsistents' | \ + python3 -c "$sortkeys" > /tmp/WQR_1b_s.json + + pg_deep_scrub $pg + (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | python3 -c "$sortkeys" | jq '.' > /tmp/WQR_2.json + rados list-inconsistent-obj $pg | jq "$jqfilter" | jq '.inconsistents' | python3 -c "$sortkeys" > $dir/dp_results.json + (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | jq "$jqfilter" | jq '.inconsistents' | \ + python3 -c "$sortkeys" > /tmp/WQR_2s.json + + pg_scrub $pg + (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | python3 -c "$sortkeys" | jq '.' > /tmp/WQR_3.json + rados list-inconsistent-obj $pg | jq "$jqfilter" | jq '.inconsistents' | python3 -c "$sortkeys" > $dir/sh2_results.json + (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | jq "$jqfilter" | jq '.inconsistents' | \ + python3 -c "$sortkeys" > /tmp/WQR_3s.json + + diff -u $dir/dp_results.json $dir/sh2_results.json || return 1 + + # inject a read error, which is a special case: the scrub encountering the read error + # would override the previously collected shard info. + inject_eio rep mdata $poolname ROBJ13 $dir 1 || return 1 # shard 1 of [1, 0], osd.0 + + pg_deep_scrub $pg + + (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | python3 -c "$sortkeys" | jq '.' > /tmp/WQR_4.json + (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | jq "$jqfilter" | jq '.inconsistents' | \ + python3 -c "$sortkeys" > /tmp/WQR_4s_w13.json + (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | jq "$jqfilter" | \ + jq 'del(.inconsistents[] | select(.object.name == "ROBJ13"))' | \ + jq '.inconsistents' | python3 -c "$sortkeys" > /tmp/WQR_4s_wo13.json + + rados list-inconsistent-obj $pg | jq "$jqfilter" | jq '.inconsistents' | \ + python3 -c "$sortkeys" > $dir/dpPart2_w13_results.json + # Remove the entry with "name":"ROBJ13" from the $dir/d*_results.json + rados list-inconsistent-obj $pg | jq "$jqfilter" | jq 'del(.inconsistents[] | select(.object.name == "ROBJ13"))' | \ + jq '.inconsistents' | python3 -c "$sortkeys" > $dir/dpPart2_wo13_results.json + (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | jq "$jqfilter" | jq '.inconsistents' | \ + python3 -c "$sortkeys" > /tmp/WQR_4s.json + + pg_scrub $pg + + (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | python3 -c "$sortkeys" | jq '.' > /tmp/WQR_5.json + (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | jq "$jqfilter" | jq '.inconsistents' | \ + python3 -c "$sortkeys" > /tmp/WQR_5s_w13.json + (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | jq "$jqfilter" | \ + jq 'del(.inconsistents[] | select(.object.name == "ROBJ13"))' |\ + jq '.inconsistents' | python3 -c "$sortkeys" > /tmp/WQR_5s_wo13.json + + rados list-inconsistent-obj $pg | jq "$jqfilter" | jq '.inconsistents' | python3 -c "$sortkeys" > \ + $dir/sh2Part2_w13_results.json + rados list-inconsistent-obj $pg | jq "$jqfilter" | jq 'del(.inconsistents[] | select(.object.name == "ROBJ13"))' |\ + jq '.inconsistents' | python3 -c "$sortkeys" > $dir/shPart2_wo13_results.json + + # the shallow scrub results should differ from the results of the deep + # scrub preceding it, but the difference should be limited to ROBJ13 + diff -u $dir/dpPart2_w13_results.json $dir/sh2Part2_w13_results.json && return 1 + diff -u $dir/dpPart2_wo13_results.json $dir/shPart2_wo13_results.json || return 1 + + ceph osd pool rm $poolname $poolname --yes-i-really-really-mean-it + return 0 +} + main osd-scrub-repair "$@" From 4f1ef85c7204dec9df9853597024637ac2873762 Mon Sep 17 00:00:00 2001 From: Ronen Friedman Date: Sat, 5 Oct 2024 07:33:49 -0500 Subject: [PATCH 8/8] osd/scrub: modify ScrubStore contents retrieval A separate commit added a simple test to verify the new store implementation (creating both shallow & deep errors), scrubbing (step 1), deep scrubbing (step 2), then shallow scrubbing again (step 3). The test verifies that the results after step 2 include all shallow errors data (*), and that the results after step 3 include all deep errors data. The test highlighted the need to correctly partition and retrieve the "shards inconsistencies" and the "selected shard" data, which was not fully implemented in the previous commit. Thus, this commit adds the following: - add_object_error() no longer filters out data saved during deep scrubbing; it also filters less of the shallow scrubs "shards inconsistencies" data; - merge_encoded_error_wrappers() now merges the "shards inconsistencies" data correctly, handling the multiple scenarios possible. (*) note the special case of not being able to read the object's version during deep scrubbing (due to a read error). In this case - the data collected during the shallow scrub will not be reported. Signed-off-by: Ronen Friedman --- src/osd/scrubber/ScrubStore.cc | 154 ++++++++++++++++++++++----------- src/osd/scrubber/ScrubStore.h | 2 + 2 files changed, 106 insertions(+), 50 deletions(-) diff --git a/src/osd/scrubber/ScrubStore.cc b/src/osd/scrubber/ScrubStore.cc index 9c680da0de16f..7f28ca2d642a8 100644 --- a/src/osd/scrubber/ScrubStore.cc +++ b/src/osd/scrubber/ScrubStore.cc @@ -142,49 +142,30 @@ void Store::add_error(int64_t pool, const inconsistent_obj_wrapper& e) add_object_error(pool, e); } -namespace { - -inconsistent_obj_wrapper create_filtered_copy( - const inconsistent_obj_wrapper& obj, - uint64_t obj_err_mask, - uint64_t shard_err_mask) -{ - inconsistent_obj_wrapper dup = obj; - dup.errors &= obj_err_mask; - for (auto& [shard, si] : dup.shards) { - si.errors &= shard_err_mask; - } - return dup; -} - -} // namespace - void Store::add_object_error(int64_t pool, const inconsistent_obj_wrapper& e) { + using librados::obj_err_t; const auto key = to_object_key(pool, e.object); dout(20) << fmt::format( - "adding error for object {} ({}). Errors: {} ({}/{}) wr:{}", - e.object, key, librados::err_t{e.errors}, - librados::err_t{e.errors & librados::err_t::SHALLOW_ERRORS}, - librados::err_t{e.errors & librados::err_t::DEEP_ERRORS}, e) + "{}: adding error for object {} ({}). Errors: {} ({}/{}) " + "unfiltered:{}", + (current_level == scrub_level_t::deep ? "deep" : "shallow"), + e.object, key, obj_err_t{e.errors}, + obj_err_t{e.errors & obj_err_t::SHALLOW_ERRORS}, + obj_err_t{e.errors & obj_err_t::DEEP_ERRORS}, e) << dendl; - // divide the errors & shard errors into shallow and deep. - { - bufferlist bl; - create_filtered_copy( - e, librados::obj_err_t::SHALLOW_ERRORS, librados::err_t::SHALLOW_ERRORS) - .encode(bl); - shallow_db->results[key] = bl; - } - { - bufferlist bl; - create_filtered_copy( - e, librados::obj_err_t::DEEP_ERRORS, librados::err_t::DEEP_ERRORS) - .encode(bl); - deep_db->results[key] = bl; + if (current_level == scrub_level_t::deep) { + // not overriding the deep errors DB during shallow scrubs + deep_db->results[key] = e.encode(); } + + // only shallow errors are stored in the shallow DB + auto e_copy = e; + e_copy.errors &= librados::obj_err_t::SHALLOW_ERRORS; + e_copy.union_shards.errors &= librados::err_t::SHALLOW_ERRORS; + shallow_db->results[key] = e_copy.encode(); } @@ -251,6 +232,8 @@ void Store::reinit( (level == scrub_level_t::deep ? "deep" : "shallow")) << dendl; + current_level = level; + // always clear the known shallow errors DB (as both shallow and deep scrubs // would recreate it) if (shallow_db) { @@ -344,6 +327,15 @@ void Store::collect_specific_store( } +/* + * Implementation notes: + * - see https://github.com/ceph/ceph/commit/df3ff6dafeadb3822b35c424a890db9a14d7f60f + * for why we encode the shard_info_t in the store. + * - to maintain known shard_info-s created during a deep scrub (but only when + * needed), we use our knowledge of the level of the last scrub performed + * (current_level), and the object user version as encoded in the error + * structure. + */ bufferlist Store::merge_encoded_error_wrappers( hobject_t obj, ExpCacherPosData& latest_sh, @@ -352,26 +344,88 @@ bufferlist Store::merge_encoded_error_wrappers( // decode both error wrappers auto sh_wrap = decode_wrapper(obj, latest_sh->data.cbegin()); auto dp_wrap = decode_wrapper(obj, latest_dp->data.cbegin()); - dout(20) << fmt::format( - "merging errors {}. Shallow: {}-({}), Deep: {}-({})", - sh_wrap.object, sh_wrap.errors, dp_wrap.errors, sh_wrap, - dp_wrap) - << dendl; - // merge the object errors (a simple OR of the two error bit-sets) - sh_wrap.errors |= dp_wrap.errors; - - // merge the two shard error maps - for (const auto& [shard, si] : dp_wrap.shards) { + // note: the '20' level is just until we're sure the merging works as + // expected + if (g_conf()->subsys.should_gather()) { + dout(20) << fmt::format( + "merging errors {}. Deep: {:#x}-({})", sh_wrap.object, + dp_wrap.errors, dp_wrap) + << dendl; dout(20) << fmt::format( - "shard {} dp-errors: {} sh-errors:{}", shard, si.errors, - sh_wrap.shards[shard].errors) + "merging errors {}. Shallow: {:#x}-({})", sh_wrap.object, + sh_wrap.errors, sh_wrap) << dendl; - // note: we may be creating the shallow shard entry here. This is OK - sh_wrap.shards[shard].errors |= si.errors; + // dev: list the attributes: + for (const auto& [shard, si] : sh_wrap.shards) { + for (const auto& [attr, bl] : si.attrs) { + dout(20) << fmt::format(" shallow: shard {} attr: {}", shard, attr) + << dendl; + } + } + for (const auto& [shard, si] : dp_wrap.shards) { + for (const auto& [attr, bl] : si.attrs) { + dout(20) << fmt::format(" deep: shard {} attr: {}", shard, attr) + << dendl; + } + } + } + + // Actual merging of the shard map entries is only performed if the + // latest version is from the shallow scrub. + // Otherwise, the deep scrub, which (for the shards info) contains all data, + // and the shallow scrub is ignored. + if (current_level == scrub_level_t::shallow) { + // is the object data related to the same object version? + if (sh_wrap.version == dp_wrap.version) { + // combine the error information + dp_wrap.errors |= sh_wrap.errors; + for (const auto& [shard, si] : sh_wrap.shards) { + if (dp_wrap.shards.contains(shard)) { + dout(20) << fmt::format( + "-----> {}-{} combining: sh-errors: {} dp-errors:{}", + sh_wrap.object, shard, si, dp_wrap.shards[shard]) + << dendl; + const auto saved_er = dp_wrap.shards[shard].errors; + dp_wrap.shards[shard].selected_oi = si.selected_oi; + dp_wrap.shards[shard].primary = si.primary; + dp_wrap.shards[shard].errors |= saved_er; + + // the attributes: + for (const auto& [attr, bl] : si.attrs) { + if (!dp_wrap.shards[shard].attrs.contains(attr)) { + dout(20) << fmt::format( + "-----> {}-{} copying shallow attr: attr: {}", + sh_wrap.object, shard, attr) + << dendl; + dp_wrap.shards[shard].attrs[attr] = bl; + } + // otherwise - we'll ignore the shallow attr buffer + } + } else { + // the deep scrub data for this shard is missing. We take the shallow + // scrub data. + dp_wrap.shards[shard] = si; + } + } + } else if (sh_wrap.version > dp_wrap.version) { + if (false && dp_wrap.version == 0) { + // there was a read error in the deep scrub. The deep version + // shows as '0'. That's severe enough for us to ignore the shallow. + dout(10) << fmt::format("{} ignoring deep after read failure", + sh_wrap.object) + << dendl; + } else { + // There is a new shallow version of the object results. + // The deep data is for an older version of that object. + // There are multiple possibilities here, but for now we ignore the + // deep data. + dp_wrap = sh_wrap; + } + } } - return sh_wrap.encode(); + return dp_wrap.encode(); } diff --git a/src/osd/scrubber/ScrubStore.h b/src/osd/scrubber/ScrubStore.h index 8a30e8daf8569..0955654d78e91 100644 --- a/src/osd/scrubber/ScrubStore.h +++ b/src/osd/scrubber/ScrubStore.h @@ -130,6 +130,8 @@ class Store { /// the collection (i.e. - the PG store) in which the errors are stored const coll_t coll; + scrub_level_t current_level; + /** * the machinery (backend details, cache, etc.) for storing both levels * of errors (note: 'optional' to allow delayed creation w/o dynamic