diff --git a/qa/tasks/cephfs/test_quiesce.py b/qa/tasks/cephfs/test_quiesce.py
index a7bf65b36b129b..7e9f2dfbfb55a9 100644
--- a/qa/tasks/cephfs/test_quiesce.py
+++ b/qa/tasks/cephfs/test_quiesce.py
@@ -207,11 +207,6 @@ def _verify_quiesce(self, rank=0, root=None, splitauth=False):
                 self.assertEqual(lock['flags'], 4)
                 self.assertEqual(lock['lock']['state'], 'lock')
                 self.assertEqual(lock['lock']['num_xlocks'], 1)
-            elif lock_type == "iversion":
-                # only regular files acquire xlock (on ifile)
-                self.assertEqual((mode & S_IFMT), S_IFREG)
-                self.assertEqual(lock['flags'], 2)
-                self.assertEqual(lock['lock']['state'][:4], 'lock')
             else:
                 # no iflock
                 self.assertFalse(lock_type.startswith("i"))
diff --git a/src/mds/CInode.cc b/src/mds/CInode.cc
index 8edf33c89596a2..ee70078706646b 100644
--- a/src/mds/CInode.cc
+++ b/src/mds/CInode.cc
@@ -3538,7 +3538,8 @@ int CInode::get_caps_quiesce_mask() const
   if (quiescelock.can_wrlock()) {
     return CEPH_CAP_ANY;
   } else {
-    return CEPH_CAP_ANY_SHARED | CEPH_CAP_PIN /*?*/;
+    // what we allow for a quiesced node: all shared caps plus Fc/Fb and the pin
+    return CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER | CEPH_CAP_PIN;
   }
 }
 
@@ -3546,9 +3547,9 @@ int CInode::get_caps_quiesce_mask() const
 int CInode::get_caps_liked() const
 {
   if (is_dir())
-    return CEPH_CAP_PIN | CEPH_CAP_ANY_EXCL | CEPH_CAP_ANY_SHARED; // but not, say, FILE_RD|WR|WRBUFFER
+    return get_caps_quiesce_mask() & (CEPH_CAP_PIN | CEPH_CAP_ANY_EXCL | CEPH_CAP_ANY_SHARED); // but not, say, FILE_RD|WR|WRBUFFER
   else
-    return CEPH_CAP_ANY & ~CEPH_CAP_FILE_LAZYIO;
+    return get_caps_quiesce_mask() & (CEPH_CAP_ANY & ~CEPH_CAP_FILE_LAZYIO);
 }
 
 int CInode::get_caps_allowed_ever() const
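Note on the new mask: the file caps that stay grantable while quiesced have to be
spelled in their shifted, file-suite form (CEPH_CAP_FILE_CACHE / CEPH_CAP_FILE_BUFFER,
i.e. Fc/Fb). The unshifted generic bits CEPH_CAP_GRD and CEPH_CAP_GBUFFER alias other
locks' caps in the full cap word, so they cannot be OR-ed into this mask directly. A
minimal standalone sketch of the bit arithmetic, using the cap values from
include/ceph_fs.h:

    // cap-bit sketch; constants mirror include/ceph_fs.h
    #include <cassert>
    #include <cstdio>

    // generic cap bits, meaningful only after shifting into a lock's slot
    enum : unsigned {
      CEPH_CAP_PIN     = 1,
      CEPH_CAP_GSHARED = 1,
      CEPH_CAP_GEXCL   = 2,
      CEPH_CAP_GCACHE  = 4,
      CEPH_CAP_GRD     = 8,
      CEPH_CAP_GWR     = 16,
      CEPH_CAP_GBUFFER = 32,
    };

    // per-lock shifts into the full cap word
    enum : unsigned { CEPH_CAP_SAUTH = 2, CEPH_CAP_SLINK = 4, CEPH_CAP_SFILE = 8 };

    constexpr unsigned CEPH_CAP_AUTH_EXCL   = CEPH_CAP_GEXCL   << CEPH_CAP_SAUTH;
    constexpr unsigned CEPH_CAP_LINK_EXCL   = CEPH_CAP_GEXCL   << CEPH_CAP_SLINK;
    constexpr unsigned CEPH_CAP_FILE_CACHE  = CEPH_CAP_GCACHE  << CEPH_CAP_SFILE; // Fc
    constexpr unsigned CEPH_CAP_FILE_BUFFER = CEPH_CAP_GBUFFER << CEPH_CAP_SFILE; // Fb

    int main() {
      // unshifted generic bits collide with unrelated exclusive caps
      assert(CEPH_CAP_GRD == CEPH_CAP_AUTH_EXCL);      // 8 == 2 << 2
      assert(CEPH_CAP_GBUFFER == CEPH_CAP_LINK_EXCL);  // 32 == 2 << 4
      // hence the quiesce mask names the shifted file-suite bits explicitly
      std::printf("Fc=0x%x Fb=0x%x\n", CEPH_CAP_FILE_CACHE, CEPH_CAP_FILE_BUFFER);
    }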
diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
index 5abd989a3d580f..06ed79d4b09636 100644
--- a/src/mds/MDCache.cc
+++ b/src/mds/MDCache.cc
@@ -16,6 +16,8 @@
 #include
 #include
 #include
+#include <algorithm>
+#include <iterator>
 #include
 
 #include "MDCache.h"
@@ -13526,7 +13528,6 @@ void MDCache::dispatch_quiesce_inode(const MDRequestRef& mdr)
     mds->server->respond_to_request(mdr, -CEPHFS_ENOENT);
     return;
   }
-  const bool is_root = (mdr->get_filepath().get_ino() == mdr->get_filepath2().get_ino());
 
   dout(20) << __func__ << " " << *mdr << " quiescing " << *in << dendl;
 
@@ -13565,7 +13566,7 @@ void MDCache::dispatch_quiesce_inode(const MDRequestRef& mdr)
   }
 
   // now we can test if this inode should be skipped
-  if (in->get_projected_inode()->get_quiesce_block()) {
+  if (in->get_inode()->get_quiesce_block()) {
     dout(10) << __func__ << " quiesce is blocked for this inode; dropping locks!" << dendl;
     mdr->mark_event("quiesce blocked");
     mds->locker->drop_locks(mdr.get());
@@ -13576,17 +13577,14 @@ void MDCache::dispatch_quiesce_inode(const MDRequestRef& mdr)
     return;
   }
 
-  if (!in->is_dir()) {
-    /*
-     * Because files are treated specially allowing multiple reader/writers, we
-     * need an xlock here to recall all write caps. This unfortunately means
-     * there can be no readers.
-     */
-    lov.add_xlock(&in->filelock);
-  } else {
-    lov.add_rdlock(&in->filelock);
-  }
+  // we are adding the rest of the rdlocks
+  // this is safe because we only take rdlocks for all four locks:
+  // if any of them were attempted for wr or x, `acquire_locks` would have
+  // injected the quiescelock automatically, and we already hold that as
+  // xlock, so nobody can be holding any of the write states below.
+  // rdlock on the filelock is sufficient to recall all write caps.
+  lov.add_rdlock(&in->filelock);
 
   lov.add_rdlock(&in->authlock);
   lov.add_rdlock(&in->linklock);
@@ -13600,16 +13598,49 @@ void MDCache::dispatch_quiesce_inode(const MDRequestRef& mdr)
     return;
   }
 
-  // since we're holding onto the quiesce lock,
   // we don't need the other capabilities related locks anymore
   // as the quiesce mask will prevent any non-shared capabilities from being issued
   // see CInode::get_caps_quiesce_mask()
+  // we actually don't need to hold any other lock but the quiesce lock
+
+  MutationImpl::LockOpVec drop_ops;
+  std::ranges::copy_if(
+    mdr->locks,
+    std::back_inserter(drop_ops),
+    [&in](auto &&op) { return op.lock != &in->quiescelock; }
+  );
+
+  dout(20) << "dropping locks: " << drop_ops << dendl;
+
+  for (const auto& op : drop_ops) {
+    mds->locker->drop_lock(mdr.get(), op.lock);
+  }
+
+  // use the lambda for completion so that we don't have to go
+  // through the whole lock/drop story above
+  auto did_quiesce = new LambdaContext([in, mdr = mdr, &qs, mds = mds, func = __func__](int rc) {
+    /* do not respond/complete so locks are not lost, parent request will complete */
+    if (mdr->killed) {
+      dout(20) << func << " " << *mdr << " not dispatching killed " << *mdr << dendl;
+      return;
+    } else if (mdr->internal_op_finish == nullptr) {
+      dout(20) << func << " " << *mdr << " already finished quiesce" << dendl;
+      return;
+    }
+
+    if (in->is_auth()) {
+      dout(10) << func << " " << *mdr << " quiesce complete of " << *in << dendl;
+      mdr->mark_event("quiesce complete");
+    } else {
+      dout(10) << func << " " << *mdr << " non-auth quiesce complete of " << *in << dendl;
+      mdr->mark_event("quiesce complete for non-auth inode");
+    }
 
-  mds->locker->drop_lock(mdr.get(), &in->filelock);
-  mds->locker->drop_lock(mdr.get(), &in->authlock);
-  mds->locker->drop_lock(mdr.get(), &in->linklock);
-  mds->locker->drop_lock(mdr.get(), &in->xattrlock);
+    qs.inc_inodes_quiesced();
+    mdr->internal_op_finish->complete(rc);
+    mdr->internal_op_finish = nullptr;
+  });
 
   if (in->is_dir()) {
     for (auto& dir : in->get_dirfrags()) {
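The lock drop above first copies the matching ops out of mdr->locks and only then
calls drop_lock on each, because dropping a lock mutates the very set being
iterated. A standalone sketch of the same copy-then-drop pattern; Lock and LockOp
here are simplified stand-ins, not the real MDS types:

    #include <algorithm>
    #include <cstdio>
    #include <iterator>
    #include <set>
    #include <vector>

    struct Lock { const char* name; };
    struct LockOp {
      Lock* lock;
      bool operator<(const LockOp& o) const { return lock < o.lock; }
    };

    int main() {
      Lock quiescelock{"iquiesce"}, filelock{"ifile"}, authlock{"iauth"};
      std::set<LockOp> held{{&quiescelock}, {&filelock}, {&authlock}};

      // copy every op except the quiescelock's aside...
      std::vector<LockOp> drop_ops;
      std::ranges::copy_if(held, std::back_inserter(drop_ops),
                           [&](const LockOp& op) { return op.lock != &quiescelock; });

      // ...then drop them; erasing from `held` is safe because we
      // iterate the copy, not the container being modified
      for (const auto& op : drop_ops) {
        std::printf("dropping %s\n", op.lock->name);
        held.erase(op);
      }
      // only the quiescelock op remains in `held`
    }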
@@ -13621,48 +13652,41 @@ void MDCache::dispatch_quiesce_inode(const MDRequestRef& mdr)
       }
     }
     uint64_t count = 0;
-    MDSGatherBuilder gather(g_ceph_context, new C_MDS_RetryRequest(this, mdr));
+    MDSGatherBuilder gather(g_ceph_context, new MDSInternalContextWrapper(mds, did_quiesce));
     auto& qops = qrmdr->more()->quiesce_ops;
     for (auto& dir : in->get_dirfrags()) {
       for (auto& [dnk, dn] : *dir) {
-        auto recurse = [this, mds=mds, &qops, &count, delay, &qfinisher, &qs, &gather, func=__func__](CInode* in) {
-          if (auto it = qops.find(in); it != qops.end()) {
-            dout(25) << func << ": existing quiesce metareqid: " << it->second << dendl;
-            if (auto reqit = active_requests.find(it->second); reqit != active_requests.end()) {
-              auto& qimdr = reqit->second;
-              dout(25) << func << ": found in-progress " << qimdr << dendl;
-              return;
-            }
-          }
-          dout(10) << func << ": scheduling op to quiesce " << *in << dendl;
-
-          MDRequestRef qimdr = request_start_internal(CEPH_MDS_OP_QUIESCE_INODE);
-          qimdr->set_filepath(filepath(in->ino()));
-          qimdr->internal_op_finish = gather.new_sub();
-          qimdr->internal_op_private = qfinisher;
-          qops[in] = qimdr->reqid;
-          qs.inc_inodes();
-          if (delay > 0ms) {
-            mds->timer.add_event_after(delay, new LambdaContext([cache=this,qimdr](int r) {
-              cache->dispatch_request(qimdr);
-            }));
-          } else {
-            dispatch_request(qimdr);
-          }
-          if (!(++count % mds->heartbeat_reset_grace())) {
-            mds->heartbeat_reset();
+        // don't take the projected linkage because
+        // we should only care about the committed state
+        auto in = dn->get_linkage()->inode;
+        if (!in) {
+          continue;
+        }
+        if (auto it = qops.find(in); it != qops.end()) {
+          dout(25) << __func__ << ": existing quiesce metareqid: " << it->second << dendl;
+          if (auto reqit = active_requests.find(it->second); reqit != active_requests.end()) {
+            auto& qimdr = reqit->second;
+            dout(25) << __func__ << ": found in-progress " << qimdr << dendl;
+            continue;
           }
-        };
-
-
-        auto inode = dn->get_linkage()->inode;
-        if (inode) {
-          recurse(inode);
         }
-        auto pinode = dn->get_projected_inode();
-        if (pinode && pinode != inode) {
-          dout(10) << __func__ << " descending into a projected dentry" << dendl;
-          recurse(inode);
+        dout(10) << __func__ << ": scheduling op to quiesce " << *in << dendl;
+
+        MDRequestRef qimdr = request_start_internal(CEPH_MDS_OP_QUIESCE_INODE);
+        qimdr->set_filepath(filepath(in->ino()));
+        qimdr->internal_op_finish = gather.new_sub();
+        qimdr->internal_op_private = qfinisher;
+        qops[in] = qimdr->reqid;
+        qs.inc_inodes();
+        if (delay > 0ms) {
+          mds->timer.add_event_after(delay, new LambdaContext([cache=this,qimdr](int r) {
+            cache->dispatch_request(qimdr);
+          }));
+        } else {
+          dispatch_request(qimdr);
+        }
+        if (!(++count % mds->heartbeat_reset_grace())) {
+          mds->heartbeat_reset();
         }
       }
     }
@@ -13673,19 +13697,7 @@ void MDCache::dispatch_quiesce_inode(const MDRequestRef& mdr)
     }
   }
 
-  if (in->is_auth()) {
-    dout(10) << __func__ << " " << *mdr << " quiesce complete of " << *in << dendl;
-    mdr->mark_event("quiesce complete");
-  } else {
-    dout(10) << __func__ << " " << *mdr << " non-auth quiesce complete of " << *in << dendl;
-    mdr->mark_event("quiesce complete for non-auth inode");
-  }
-
-  qs.inc_inodes_quiesced();
-  mdr->internal_op_finish->complete(0);
-  mdr->internal_op_finish = nullptr;
-
-  /* do not respond/complete so locks are not lost, parent request will complete */
+  did_quiesce->complete(0);
 }
 
 void MDCache::dispatch_quiesce_path(const MDRequestRef& mdr)
@@ -13755,7 +13767,6 @@ void MDCache::dispatch_quiesce_path(const MDRequestRef& mdr)
   } else if (auto& qops = mdr->more()->quiesce_ops; qops.count(diri) == 0) {
     MDRequestRef qimdr = request_start_internal(CEPH_MDS_OP_QUIESCE_INODE);
     qimdr->set_filepath(filepath(diri->ino()));
-    qimdr->set_filepath2(filepath(diri->ino())); /* is_root! */
     qimdr->internal_op_finish = new C_MDS_RetryRequest(this, mdr);
     qimdr->internal_op_private = qfinisher;
     qops[diri] = qimdr->reqid;
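did_quiesce is shared between the two completion paths: when child QUIESCE_INODE
ops were scheduled, it fires via the MDSInternalContextWrapper once the gather's
last sub completes; when there were none, the function completes it directly at
the end. A standalone sketch of that gather pattern, with simplified stand-ins
for MDSGatherBuilder and its sub contexts (not the real API):

    #include <cstdio>
    #include <functional>
    #include <vector>

    // simplified gather: subs are handed out, then the gather is activated;
    // the finisher runs exactly once, when the last sub has completed
    struct Gather {
      std::function<void(int)> onfinish;
      int created = 0;
      int pending = 0;
      bool activated = false;

      struct Sub {
        Gather* g;
        void complete(int r) {
          if (--g->pending == 0 && g->activated) g->onfinish(r);
        }
      };

      Sub new_sub() { ++created; ++pending; return Sub{this}; }
      bool has_subs() const { return created > 0; }
      void activate() {
        activated = true;
        if (pending == 0) onfinish(0);  // all subs already finished
      }
    };

    int main() {
      auto did_quiesce = [](int rc) { std::printf("quiesce complete, rc=%d\n", rc); };

      Gather gather{did_quiesce};
      int children = 2;                      // pretend two child inodes need quiescing
      std::vector<Gather::Sub> subs;
      for (int i = 0; i < children; i++)
        subs.push_back(gather.new_sub());    // one sub per child quiesce op

      if (gather.has_subs()) {
        gather.activate();
        for (auto& s : subs) s.complete(0);  // last completion fires did_quiesce
      } else {
        did_quiesce(0);                      // no children: complete directly
      }
    }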