From 34b5654d9eb1999704e75d964645e3aa9b78e249 Mon Sep 17 00:00:00 2001
From: Kent Overstreet
Date: Fri, 28 Jun 2024 18:11:46 -0400
Subject: [PATCH] Update bcachefs sources to 9404a01d3dc5 bcachefs: Make
 read_only a mount option again, but hidden

Signed-off-by: Kent Overstreet
---
 .bcachefs_revision                  |   2 +-
 include/linux/atomic.h              |   7 +
 include/linux/closure.h             |  23 ++
 include/linux/workqueue.h           |   2 +-
 libbcachefs/alloc_background.c      | 277 +++++++-------
 libbcachefs/alloc_background.h      |   9 +-
 libbcachefs/alloc_foreground.c      |   4 +-
 libbcachefs/bcachefs.h              |  20 +-
 libbcachefs/bcachefs_format.h       |  18 +-
 libbcachefs/btree_gc.c              |   7 +-
 libbcachefs/btree_iter.c            |  33 +-
 libbcachefs/btree_key_cache.c       | 324 ++++++----------
 libbcachefs/btree_trans_commit.c    |  11 +-
 libbcachefs/btree_types.h           |   1 -
 libbcachefs/buckets.c               |   4 +-
 libbcachefs/buckets.h               |   4 +-
 libbcachefs/chardev.c               |  23 +-
 libbcachefs/checksum.c              |   5 +-
 libbcachefs/debug.c                 | 109 ++++--
 libbcachefs/disk_accounting.c       |  14 +-
 libbcachefs/ec.c                    |   2 +-
 libbcachefs/error.c                 |  19 +-
 libbcachefs/error.h                 |   7 -
 libbcachefs/extents.c               |  21 +-
 libbcachefs/extents.h               |   2 +
 libbcachefs/fs-io-buffered.c        |  41 +-
 libbcachefs/fs-io-direct.c          |   4 +-
 libbcachefs/fs-io-pagecache.c       |  37 +-
 libbcachefs/fs-io-pagecache.h       |   7 +-
 libbcachefs/fs.c                    |  34 +-
 libbcachefs/inode.c                 |   3 +-
 libbcachefs/io_read.c               |   1 -
 libbcachefs/io_write.c              |   5 +-
 libbcachefs/journal.c               |   5 +
 libbcachefs/journal_io.c            |  24 +-
 libbcachefs/journal_seq_blacklist.c |   2 +-
 libbcachefs/lru.h                   |  12 -
 libbcachefs/lru_format.h            |  25 ++
 libbcachefs/opts.h                  |   5 +-
 libbcachefs/recovery.c              |   4 +-
 libbcachefs/recovery_passes.c       |   4 +
 libbcachefs/sb-downgrade.c          |   1 +
 libbcachefs/sb-errors.c             |  14 +-
 libbcachefs/sb-errors_format.h      | 564 ++++++++++++++--------------
 libbcachefs/seqmutex.h              |  11 +-
 libbcachefs/snapshot.c              |   5 +
 libbcachefs/super.c                 |  13 +-
 libbcachefs/util.h                  |  17 +-
 linux/closure.c                     |  54 ++-
 49 files changed, 1003 insertions(+), 837 deletions(-)
 create mode 100644 libbcachefs/lru_format.h

diff --git a/.bcachefs_revision b/.bcachefs_revision
index 9c418ae26..50da14dd6 100644
--- a/.bcachefs_revision
+++ b/.bcachefs_revision
@@ -1 +1 @@
-792ca5ba3c9a07d762d9c1a440e31c0520f37de0
+9404a01d3dc5553b106fa590602f4771b8e0b8ae

diff --git a/include/linux/atomic.h b/include/linux/atomic.h
index 73024023e..dcc6e6441 100644
--- a/include/linux/atomic.h
+++ b/include/linux/atomic.h
@@ -26,6 +26,7 @@ typedef struct {
 
 #define __ATOMIC_READ(p)		uatomic_read(p)
 #define __ATOMIC_SET(p, v)		uatomic_set(p, v)
+#define __ATOMIC_SET_RELEASE(p, v)	uatomic_set(p, v)
 #define __ATOMIC_ADD_RETURN(v, p)	uatomic_add_return(p, v)
 #define __ATOMIC_SUB_RETURN(v, p)	uatomic_sub_return(p, v)
 #define __ATOMIC_ADD(v, p)		uatomic_add(p, v)
@@ -64,6 +65,7 @@ typedef struct {
 
 #define __ATOMIC_READ(p)		__atomic_load_n(p, __ATOMIC_RELAXED)
 #define __ATOMIC_SET(p, v)		__atomic_store_n(p, v, __ATOMIC_RELAXED)
+#define __ATOMIC_SET_RELEASE(p, v)	__atomic_store_n(p, v, __ATOMIC_RELEASE)
 #define __ATOMIC_ADD_RETURN(v, p)	__atomic_add_fetch(p, v, __ATOMIC_RELAXED)
 #define __ATOMIC_ADD_RETURN_RELEASE(v, p)				\
 	__atomic_add_fetch(p, v, __ATOMIC_RELEASE)
@@ -189,6 +191,11 @@ static inline void a_type##_set(a_type##_t *v, i_type i)	\
 	return __ATOMIC_SET(&v->counter, i);				\
 }									\
 									\
+static inline void a_type##_set_release(a_type##_t *v, i_type i)	\
+{									\
+	return __ATOMIC_SET_RELEASE(&v->counter, i);			\
+}									\
+									\
 static inline i_type a_type##_add_return(i_type i, a_type##_t *v)	\
 {									\
 	return __ATOMIC_ADD_RETURN(i, &v->counter);			\
diff --git a/include/linux/closure.h b/include/linux/closure.h
index 99155df16..59b8c06b1 100644
--- a/include/linux/closure.h
+++ b/include/linux/closure.h
@@ -284,6 +284,21 @@ static inline void closure_get(struct closure *cl)
 #endif
 }
 
+/**
+ * closure_get_not_zero - take a ref on a closure iff its refcount is nonzero
+ */
+static inline bool closure_get_not_zero(struct closure *cl)
+{
+	unsigned old = atomic_read(&cl->remaining);
+	do {
+		if (!(old & CLOSURE_REMAINING_MASK))
+			return false;
+
+	} while (!atomic_try_cmpxchg_acquire(&cl->remaining, &old, old + 1));
+
+	return true;
+}
+
 /**
  * closure_init - Initialize a closure, setting the refcount to 1
  * @cl: closure to initialize
@@ -310,6 +325,12 @@ static inline void closure_init_stack(struct closure *cl)
 	atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER);
 }
 
+static inline void closure_init_stack_release(struct closure *cl)
+{
+	memset(cl, 0, sizeof(struct closure));
+	atomic_set_release(&cl->remaining, CLOSURE_REMAINING_INITIALIZER);
+}
+
 /**
  * closure_wake_up - wake up all closures on a wait list,
  * with memory barrier
@@ -355,6 +376,8 @@ do {									\
  */
 #define closure_return(_cl)	continue_at((_cl), NULL, NULL)
 
+void closure_return_sync(struct closure *cl);
+
 /**
  * continue_at_nobarrier - jump to another function without barrier
  *
diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 1406c9582..5d2ca5f8c 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -151,7 +151,7 @@ extern void workqueue_set_max_active(struct workqueue_struct *wq,
 extern bool current_is_workqueue_rescuer(void);
 extern bool workqueue_congested(int cpu, struct workqueue_struct *wq);
 extern unsigned int work_busy(struct work_struct *work);
-extern __printf(1, 2) void set_worker_desc(const char *fmt, ...);
+static inline __printf(1, 2) void set_worker_desc(const char *fmt, ...) {}
 extern void print_worker_info(const char *log_lvl, struct task_struct *task);
 extern void show_workqueue_state(void);
 
diff --git a/libbcachefs/alloc_background.c b/libbcachefs/alloc_background.c
index 77aa85b98..8e8aed2a5 100644
--- a/libbcachefs/alloc_background.c
+++ b/libbcachefs/alloc_background.c
@@ -30,7 +30,7 @@
 #include
 #include
 
-static void bch2_discard_one_bucket_fast(struct bch_fs *c, struct bpos bucket);
+static void bch2_discard_one_bucket_fast(struct bch_dev *, u64);
 
 /* Persistent alloc info: */
 
@@ -476,7 +476,8 @@ bch2_trans_start_alloc_update_noupdate(struct btree_trans *trans, struct btree_i
 }
 
 __flatten
-struct bkey_i_alloc_v4 *bch2_trans_start_alloc_update(struct btree_trans *trans, struct bpos pos)
+struct bkey_i_alloc_v4 *bch2_trans_start_alloc_update(struct btree_trans *trans, struct bpos pos,
+						      enum btree_iter_update_trigger_flags flags)
 {
 	struct btree_iter iter;
 	struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update_noupdate(trans, &iter, pos);
@@ -484,7 +485,7 @@ struct bkey_i_alloc_v4 *bch2_trans_start_alloc_update(struct btree_trans *trans,
 	if (ret)
 		return ERR_PTR(ret);
 
-	ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
+	ret = bch2_trans_update(trans, &iter, &a->k_i, flags);
 	bch2_trans_iter_exit(trans, &iter);
 	return unlikely(ret) ?
ERR_PTR(ret) : a; } @@ -595,8 +596,6 @@ int bch2_alloc_read(struct bch_fs *c) struct bch_dev *ca = NULL; int ret; - down_read(&c->gc_lock); - if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_bucket_gens) { ret = for_each_btree_key(trans, iter, BTREE_ID_bucket_gens, POS_MIN, BTREE_ITER_prefetch, k, ({ @@ -645,7 +644,6 @@ int bch2_alloc_read(struct bch_fs *c) bch2_dev_put(ca); bch2_trans_put(trans); - up_read(&c->gc_lock); bch_err_fn(c, ret); return ret; @@ -847,6 +845,7 @@ int bch2_trigger_alloc(struct btree_trans *trans, !bch2_bucket_is_open_safe(c, new.k->p.inode, new.k->p.offset)) { new_a->gen++; SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, false); + alloc_data_type_set(new_a, new_a->data_type); } if (old_a->data_type != new_a->data_type || @@ -958,12 +957,12 @@ int bch2_trigger_alloc(struct btree_trans *trans, if (statechange(a->data_type == BCH_DATA_need_discard) && !bch2_bucket_is_open_safe(c, new.k->p.inode, new.k->p.offset) && bucket_flushed(new_a)) - bch2_discard_one_bucket_fast(c, new.k->p); + bch2_discard_one_bucket_fast(ca, new.k->p.offset); if (statechange(a->data_type == BCH_DATA_cached) && !bch2_bucket_is_open(c, new.k->p.inode, new.k->p.offset) && should_invalidate_buckets(ca, bch2_dev_usage_read(ca))) - bch2_do_invalidates(c); + bch2_dev_do_invalidates(ca); if (statechange(a->data_type == BCH_DATA_need_gc_gens)) bch2_gc_gens_async(c); @@ -1684,34 +1683,38 @@ int bch2_check_alloc_to_lru_refs(struct bch_fs *c) return ret; } -static int discard_in_flight_add(struct bch_fs *c, struct bpos bucket) +static int discard_in_flight_add(struct bch_dev *ca, u64 bucket, bool in_progress) { int ret; - mutex_lock(&c->discard_buckets_in_flight_lock); - darray_for_each(c->discard_buckets_in_flight, i) - if (bkey_eq(*i, bucket)) { + mutex_lock(&ca->discard_buckets_in_flight_lock); + darray_for_each(ca->discard_buckets_in_flight, i) + if (i->bucket == bucket) { ret = -BCH_ERR_EEXIST_discard_in_flight_add; goto out; } - ret = darray_push(&c->discard_buckets_in_flight, bucket); + ret = darray_push(&ca->discard_buckets_in_flight, ((struct discard_in_flight) { + .in_progress = in_progress, + .bucket = bucket, + })); out: - mutex_unlock(&c->discard_buckets_in_flight_lock); + mutex_unlock(&ca->discard_buckets_in_flight_lock); return ret; } -static void discard_in_flight_remove(struct bch_fs *c, struct bpos bucket) +static void discard_in_flight_remove(struct bch_dev *ca, u64 bucket) { - mutex_lock(&c->discard_buckets_in_flight_lock); - darray_for_each(c->discard_buckets_in_flight, i) - if (bkey_eq(*i, bucket)) { - darray_remove_item(&c->discard_buckets_in_flight, i); + mutex_lock(&ca->discard_buckets_in_flight_lock); + darray_for_each(ca->discard_buckets_in_flight, i) + if (i->bucket == bucket) { + BUG_ON(!i->in_progress); + darray_remove_item(&ca->discard_buckets_in_flight, i); goto found; } BUG(); found: - mutex_unlock(&c->discard_buckets_in_flight_lock); + mutex_unlock(&ca->discard_buckets_in_flight_lock); } struct discard_buckets_state { @@ -1719,26 +1722,11 @@ struct discard_buckets_state { u64 open; u64 need_journal_commit; u64 discarded; - struct bch_dev *ca; u64 need_journal_commit_this_dev; }; -static void discard_buckets_next_dev(struct bch_fs *c, struct discard_buckets_state *s, struct bch_dev *ca) -{ - if (s->ca == ca) - return; - - if (s->ca && s->need_journal_commit_this_dev > - bch2_dev_usage_read(s->ca).d[BCH_DATA_free].buckets) - bch2_journal_flush_async(&c->journal, NULL); - - if (s->ca) - percpu_ref_put(&s->ca->io_ref); - s->ca = ca; - s->need_journal_commit_this_dev = 0; 
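/*
 * [Editor's sketch, not part of the patch] The alloc_background.c hunks
 * around here move discard/invalidate work from the filesystem (bch_fs) to
 * the member device (bch_dev). Every new bch2_dev_do_*() entry point below
 * follows the same take-refs-then-queue shape; distilled here under the
 * hypothetical name dev_queue_bg_work():
 */
static void dev_queue_bg_work(struct bch_dev *ca, enum bch_write_ref ref,
			      struct work_struct *work)
{
	struct bch_fs *c = ca->fs;

	/* keep the device's write path alive while the work item runs */
	if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE))
		return;

	/* and hold off filesystem shutdown */
	if (!bch2_write_ref_tryget(c, ref))
		goto put_ioref;

	if (queue_work(c->write_ref_wq, work))
		return;	/* on success, the work fn drops both refs */

	bch2_write_ref_put(c, ref);
put_ioref:
	percpu_ref_put(&ca->io_ref);
}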
-} - static int bch2_discard_one_bucket(struct btree_trans *trans, + struct bch_dev *ca, struct btree_iter *need_discard_iter, struct bpos *discard_pos_done, struct discard_buckets_state *s) @@ -1752,16 +1740,6 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, bool discard_locked = false; int ret = 0; - struct bch_dev *ca = s->ca && s->ca->dev_idx == pos.inode - ? s->ca - : bch2_dev_get_ioref(c, pos.inode, WRITE); - if (!ca) { - bch2_btree_iter_set_pos(need_discard_iter, POS(pos.inode + 1, 0)); - return 0; - } - - discard_buckets_next_dev(c, s, ca); - if (bch2_bucket_is_open_safe(c, pos.inode, pos.offset)) { s->open++; goto out; @@ -1821,7 +1799,7 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, goto out; } - if (discard_in_flight_add(c, SPOS(iter.pos.inode, iter.pos.offset, true))) + if (discard_in_flight_add(ca, iter.pos.offset, true)) goto out; discard_locked = true; @@ -1845,8 +1823,9 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, } SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false); - alloc_data_type_set(&a->v, a->v.data_type); write: + alloc_data_type_set(&a->v, a->v.data_type); + ret = bch2_trans_update(trans, &iter, &a->k_i, 0) ?: bch2_trans_commit(trans, NULL, NULL, BCH_WATERMARK_btree| @@ -1858,7 +1837,7 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, s->discarded++; out: if (discard_locked) - discard_in_flight_remove(c, iter.pos); + discard_in_flight_remove(ca, iter.pos.offset); s->seen++; bch2_trans_iter_exit(trans, &iter); printbuf_exit(&buf); @@ -1867,7 +1846,8 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, static void bch2_do_discards_work(struct work_struct *work) { - struct bch_fs *c = container_of(work, struct bch_fs, discard_work); + struct bch_dev *ca = container_of(work, struct bch_dev, discard_work); + struct bch_fs *c = ca->fs; struct discard_buckets_state s = {}; struct bpos discard_pos_done = POS_MAX; int ret; @@ -1878,23 +1858,41 @@ static void bch2_do_discards_work(struct work_struct *work) * successful commit: */ ret = bch2_trans_run(c, - for_each_btree_key(trans, iter, - BTREE_ID_need_discard, POS_MIN, 0, k, - bch2_discard_one_bucket(trans, &iter, &discard_pos_done, &s))); - - discard_buckets_next_dev(c, &s, NULL); + for_each_btree_key_upto(trans, iter, + BTREE_ID_need_discard, + POS(ca->dev_idx, 0), + POS(ca->dev_idx, U64_MAX), 0, k, + bch2_discard_one_bucket(trans, ca, &iter, &discard_pos_done, &s))); trace_discard_buckets(c, s.seen, s.open, s.need_journal_commit, s.discarded, bch2_err_str(ret)); bch2_write_ref_put(c, BCH_WRITE_REF_discard); + percpu_ref_put(&ca->io_ref); +} + +void bch2_dev_do_discards(struct bch_dev *ca) +{ + struct bch_fs *c = ca->fs; + + if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE)) + return; + + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_discard)) + goto put_ioref; + + if (queue_work(c->write_ref_wq, &ca->discard_work)) + return; + + bch2_write_ref_put(c, BCH_WRITE_REF_discard); +put_ioref: + percpu_ref_put(&ca->io_ref); } void bch2_do_discards(struct bch_fs *c) { - if (bch2_write_ref_tryget(c, BCH_WRITE_REF_discard) && - !queue_work(c->write_ref_wq, &c->discard_work)) - bch2_write_ref_put(c, BCH_WRITE_REF_discard); + for_each_member_device(c, ca) + bch2_dev_do_discards(ca); } static int bch2_clear_bucket_needs_discard(struct btree_trans *trans, struct bpos bucket) @@ -1923,68 +1921,69 @@ static int bch2_clear_bucket_needs_discard(struct btree_trans *trans, struct bpo static void bch2_do_discards_fast_work(struct work_struct *work) { - struct bch_fs *c = 
container_of(work, struct bch_fs, discard_fast_work); + struct bch_dev *ca = container_of(work, struct bch_dev, discard_fast_work); + struct bch_fs *c = ca->fs; while (1) { bool got_bucket = false; - struct bpos bucket; - struct bch_dev *ca; + u64 bucket; - mutex_lock(&c->discard_buckets_in_flight_lock); - darray_for_each(c->discard_buckets_in_flight, i) { - if (i->snapshot) + mutex_lock(&ca->discard_buckets_in_flight_lock); + darray_for_each(ca->discard_buckets_in_flight, i) { + if (i->in_progress) continue; - ca = bch2_dev_get_ioref(c, i->inode, WRITE); - if (!ca) { - darray_remove_item(&c->discard_buckets_in_flight, i); - continue; - } - got_bucket = true; - bucket = *i; - i->snapshot = true; + bucket = i->bucket; + i->in_progress = true; break; } - mutex_unlock(&c->discard_buckets_in_flight_lock); + mutex_unlock(&ca->discard_buckets_in_flight_lock); if (!got_bucket) break; if (ca->mi.discard && !c->opts.nochanges) blkdev_issue_discard(ca->disk_sb.bdev, - bucket.offset * ca->mi.bucket_size, + bucket_to_sector(ca, bucket), ca->mi.bucket_size, GFP_KERNEL); int ret = bch2_trans_do(c, NULL, NULL, - BCH_WATERMARK_btree| - BCH_TRANS_COMMIT_no_enospc, - bch2_clear_bucket_needs_discard(trans, bucket)); + BCH_WATERMARK_btree| + BCH_TRANS_COMMIT_no_enospc, + bch2_clear_bucket_needs_discard(trans, POS(ca->dev_idx, bucket))); bch_err_fn(c, ret); - percpu_ref_put(&ca->io_ref); - discard_in_flight_remove(c, bucket); + discard_in_flight_remove(ca, bucket); if (ret) break; } bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast); + percpu_ref_put(&ca->io_ref); } -static void bch2_discard_one_bucket_fast(struct bch_fs *c, struct bpos bucket) +static void bch2_discard_one_bucket_fast(struct bch_dev *ca, u64 bucket) { - rcu_read_lock(); - struct bch_dev *ca = bch2_dev_rcu(c, bucket.inode); - bool dead = !ca || percpu_ref_is_dying(&ca->io_ref); - rcu_read_unlock(); + struct bch_fs *c = ca->fs; + + if (discard_in_flight_add(ca, bucket, false)) + return; + + if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE)) + return; - if (!dead && - !discard_in_flight_add(c, bucket) && - bch2_write_ref_tryget(c, BCH_WRITE_REF_discard_fast) && - !queue_work(c->write_ref_wq, &c->discard_fast_work)) - bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast); + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_discard_fast)) + goto put_ioref; + + if (queue_work(c->write_ref_wq, &ca->discard_fast_work)) + return; + + bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast); +put_ioref: + percpu_ref_put(&ca->io_ref); } static int invalidate_one_bucket(struct btree_trans *trans, @@ -2010,7 +2009,7 @@ static int invalidate_one_bucket(struct btree_trans *trans, if (bch2_bucket_is_open_safe(c, bucket.inode, bucket.offset)) return 0; - a = bch2_trans_start_alloc_update(trans, bucket); + a = bch2_trans_start_alloc_update(trans, bucket, BTREE_TRIGGER_bucket_invalidate); ret = PTR_ERR_OR_ZERO(a); if (ret) goto out; @@ -2086,7 +2085,8 @@ static struct bkey_s_c next_lru_key(struct btree_trans *trans, struct btree_iter static void bch2_do_invalidates_work(struct work_struct *work) { - struct bch_fs *c = container_of(work, struct bch_fs, invalidate_work); + struct bch_dev *ca = container_of(work, struct bch_dev, invalidate_work); + struct bch_fs *c = ca->fs; struct btree_trans *trans = bch2_trans_get(c); int ret = 0; @@ -2094,50 +2094,63 @@ static void bch2_do_invalidates_work(struct work_struct *work) if (ret) goto err; - for_each_member_device(c, ca) { - s64 nr_to_invalidate = - should_invalidate_buckets(ca, bch2_dev_usage_read(ca)); - struct btree_iter iter; - bool 
wrapped = false; - - bch2_trans_iter_init(trans, &iter, BTREE_ID_lru, - lru_pos(ca->dev_idx, 0, - ((bch2_current_io_time(c, READ) + U32_MAX) & - LRU_TIME_MAX)), 0); - - while (true) { - bch2_trans_begin(trans); - - struct bkey_s_c k = next_lru_key(trans, &iter, ca, &wrapped); - ret = bkey_err(k); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - continue; - if (ret) - break; - if (!k.k) - break; + s64 nr_to_invalidate = + should_invalidate_buckets(ca, bch2_dev_usage_read(ca)); + struct btree_iter iter; + bool wrapped = false; - ret = invalidate_one_bucket(trans, &iter, k, &nr_to_invalidate); - if (ret) - break; - } - bch2_trans_iter_exit(trans, &iter); + bch2_trans_iter_init(trans, &iter, BTREE_ID_lru, + lru_pos(ca->dev_idx, 0, + ((bch2_current_io_time(c, READ) + U32_MAX) & + LRU_TIME_MAX)), 0); - if (ret < 0) { - bch2_dev_put(ca); + while (true) { + bch2_trans_begin(trans); + + struct bkey_s_c k = next_lru_key(trans, &iter, ca, &wrapped); + ret = bkey_err(k); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + continue; + if (ret) break; - } + if (!k.k) + break; + + ret = invalidate_one_bucket(trans, &iter, k, &nr_to_invalidate); + if (ret) + break; + + bch2_btree_iter_advance(&iter); } + bch2_trans_iter_exit(trans, &iter); err: bch2_trans_put(trans); bch2_write_ref_put(c, BCH_WRITE_REF_invalidate); + percpu_ref_put(&ca->io_ref); +} + +void bch2_dev_do_invalidates(struct bch_dev *ca) +{ + struct bch_fs *c = ca->fs; + + if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE)) + return; + + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_invalidate)) + goto put_ioref; + + if (queue_work(c->write_ref_wq, &ca->invalidate_work)) + return; + + bch2_write_ref_put(c, BCH_WRITE_REF_invalidate); +put_ioref: + percpu_ref_put(&ca->io_ref); } void bch2_do_invalidates(struct bch_fs *c) { - if (bch2_write_ref_tryget(c, BCH_WRITE_REF_invalidate) && - !queue_work(c->write_ref_wq, &c->invalidate_work)) - bch2_write_ref_put(c, BCH_WRITE_REF_invalidate); + for_each_member_device(c, ca) + bch2_dev_do_invalidates(ca); } int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca, @@ -2453,16 +2466,20 @@ void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca) set_bit(ca->dev_idx, c->rw_devs[i].d); } -void bch2_fs_allocator_background_exit(struct bch_fs *c) +void bch2_dev_allocator_background_exit(struct bch_dev *ca) +{ + darray_exit(&ca->discard_buckets_in_flight); +} + +void bch2_dev_allocator_background_init(struct bch_dev *ca) { - darray_exit(&c->discard_buckets_in_flight); + mutex_init(&ca->discard_buckets_in_flight_lock); + INIT_WORK(&ca->discard_work, bch2_do_discards_work); + INIT_WORK(&ca->discard_fast_work, bch2_do_discards_fast_work); + INIT_WORK(&ca->invalidate_work, bch2_do_invalidates_work); } void bch2_fs_allocator_background_init(struct bch_fs *c) { spin_lock_init(&c->freelist_lock); - mutex_init(&c->discard_buckets_in_flight_lock); - INIT_WORK(&c->discard_work, bch2_do_discards_work); - INIT_WORK(&c->discard_fast_work, bch2_do_discards_fast_work); - INIT_WORK(&c->invalidate_work, bch2_do_invalidates_work); } diff --git a/libbcachefs/alloc_background.h b/libbcachefs/alloc_background.h index dcf58c38d..8d2b62c95 100644 --- a/libbcachefs/alloc_background.h +++ b/libbcachefs/alloc_background.h @@ -206,7 +206,8 @@ static inline void set_alloc_v4_u64s(struct bkey_i_alloc_v4 *a) struct bkey_i_alloc_v4 * bch2_trans_start_alloc_update_noupdate(struct btree_trans *, struct btree_iter *, struct bpos); struct bkey_i_alloc_v4 * -bch2_trans_start_alloc_update(struct btree_trans *, struct 
bpos); +bch2_trans_start_alloc_update(struct btree_trans *, struct bpos, + enum btree_iter_update_trigger_flags); void __bch2_alloc_to_v4(struct bkey_s_c, struct bch_alloc_v4 *); @@ -299,6 +300,7 @@ int bch2_trigger_alloc(struct btree_trans *, enum btree_id, unsigned, enum btree_iter_update_trigger_flags); int bch2_check_alloc_info(struct bch_fs *); int bch2_check_alloc_to_lru_refs(struct bch_fs *); +void bch2_dev_do_discards(struct bch_dev *); void bch2_do_discards(struct bch_fs *); static inline u64 should_invalidate_buckets(struct bch_dev *ca, @@ -313,6 +315,7 @@ static inline u64 should_invalidate_buckets(struct bch_dev *ca, return clamp_t(s64, want_free - free, 0, u.d[BCH_DATA_cached].buckets); } +void bch2_dev_do_invalidates(struct bch_dev *); void bch2_do_invalidates(struct bch_fs *); static inline struct bch_backpointer *alloc_v4_backpointers(struct bch_alloc_v4 *a) @@ -336,7 +339,9 @@ u64 bch2_min_rw_member_capacity(struct bch_fs *); void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *); void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *); -void bch2_fs_allocator_background_exit(struct bch_fs *); +void bch2_dev_allocator_background_exit(struct bch_dev *); +void bch2_dev_allocator_background_init(struct bch_dev *); + void bch2_fs_allocator_background_init(struct bch_fs *); #endif /* _BCACHEFS_ALLOC_BACKGROUND_H */ diff --git a/libbcachefs/alloc_foreground.c b/libbcachefs/alloc_foreground.c index a352a671b..73228b25d 100644 --- a/libbcachefs/alloc_foreground.c +++ b/libbcachefs/alloc_foreground.c @@ -621,13 +621,13 @@ static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans, avail = dev_buckets_free(ca, *usage, watermark); if (usage->d[BCH_DATA_need_discard].buckets > avail) - bch2_do_discards(c); + bch2_dev_do_discards(ca); if (usage->d[BCH_DATA_need_gc_gens].buckets > avail) bch2_gc_gens_async(c); if (should_invalidate_buckets(ca, *usage)) - bch2_do_invalidates(c); + bch2_dev_do_invalidates(ca); if (!avail) { if (cl && !waiting) { diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h index 116766789..372bc339c 100644 --- a/libbcachefs/bcachefs.h +++ b/libbcachefs/bcachefs.h @@ -496,6 +496,11 @@ struct io_count { u64 sectors[2][BCH_DATA_NR]; }; +struct discard_in_flight { + bool in_progress:1; + u64 bucket:63; +}; + struct bch_dev { struct kobject kobj; #ifdef CONFIG_BCACHEFS_DEBUG @@ -533,8 +538,8 @@ struct bch_dev { /* * Buckets: * Per-bucket arrays are protected by c->mark_lock, bucket_lock and - * gc_lock, for device resize - holding any is sufficient for access: - * Or rcu_read_lock(), but only for dev_ptr_stale(): + * gc_gens_lock, for device resize - holding any is sufficient for + * access: Or rcu_read_lock(), but only for dev_ptr_stale(): */ struct bucket_array __rcu *buckets_gc; struct bucket_gens __rcu *bucket_gens; @@ -555,6 +560,12 @@ struct bch_dev { size_t inc_gen_really_needs_gc; size_t buckets_waiting_on_journal; + struct work_struct invalidate_work; + struct work_struct discard_work; + struct mutex discard_buckets_in_flight_lock; + DARRAY(struct discard_in_flight) discard_buckets_in_flight; + struct work_struct discard_fast_work; + atomic64_t rebalance_work; struct journal_device journal; @@ -909,11 +920,6 @@ struct bch_fs { unsigned write_points_nr; struct buckets_waiting_for_journal buckets_waiting_for_journal; - struct work_struct invalidate_work; - struct work_struct discard_work; - struct mutex discard_buckets_in_flight_lock; - DARRAY(struct bpos) discard_buckets_in_flight; - struct work_struct 
discard_fast_work; /* GARBAGE COLLECTION */ struct work_struct gc_gens_work; diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h index 66ba8fb44..74a60b1a4 100644 --- a/libbcachefs/bcachefs_format.h +++ b/libbcachefs/bcachefs_format.h @@ -468,18 +468,6 @@ struct bch_backpointer { struct bpos pos; } __packed __aligned(8); -/* LRU btree: */ - -struct bch_lru { - struct bch_val v; - __le64 idx; -} __packed __aligned(8); - -#define LRU_ID_STRIPES (1U << 16) - -#define LRU_TIME_BITS 48 -#define LRU_TIME_MAX ((1ULL << LRU_TIME_BITS) - 1) - /* Optional/variable size superblock sections: */ struct bch_sb_field { @@ -516,6 +504,7 @@ struct bch_sb_field { #include "inode_format.h" #include "journal_seq_blacklist_format.h" #include "logged_ops_format.h" +#include "lru_format.h" #include "quota_format.h" #include "reflink_format.h" #include "replicas_format.h" @@ -954,8 +943,9 @@ enum bch_version_upgrade_opts { #define BCH_ERROR_ACTIONS() \ x(continue, 0) \ - x(ro, 1) \ - x(panic, 2) + x(fix_safe, 1) \ + x(panic, 2) \ + x(ro, 3) enum bch_error_actions { #define x(t, n) BCH_ON_ERROR_##t = n, diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c index 0c2eb7567..2e9ccb207 100644 --- a/libbcachefs/btree_gc.c +++ b/libbcachefs/btree_gc.c @@ -1229,7 +1229,7 @@ int bch2_gc_gens(struct bch_fs *c) int ret; /* - * Ideally we would be using state_lock and not gc_lock here, but that + * Ideally we would be using state_lock and not gc_gens_lock here, but that * introduces a deadlock in the RO path - we currently take the state * lock at the start of going RO, thus the gc thread may get stuck: */ @@ -1237,7 +1237,8 @@ int bch2_gc_gens(struct bch_fs *c) return 0; trace_and_count(c, gc_gens_start, c); - down_read(&c->gc_lock); + + down_read(&c->state_lock); for_each_member_device(c, ca) { struct bucket_gens *gens = bucket_gens(ca); @@ -1306,7 +1307,7 @@ int bch2_gc_gens(struct bch_fs *c) ca->oldest_gen = NULL; } - up_read(&c->gc_lock); + up_read(&c->state_lock); mutex_unlock(&c->gc_gens_lock); if (!bch2_err_matches(ret, EROFS)) bch_err_fn(c, ret); diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c index c68cc7149..80f4a3959 100644 --- a/libbcachefs/btree_iter.c +++ b/libbcachefs/btree_iter.c @@ -1801,13 +1801,12 @@ struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct bkey * goto hole; } else { struct bkey_cached *ck = (void *) path->l[0].b; - - EBUG_ON(ck && - (path->btree_id != ck->key.btree_id || - !bkey_eq(path->pos, ck->key.pos))); - if (!ck || !ck->valid) + if (!ck) return bkey_s_c_null; + EBUG_ON(path->btree_id != ck->key.btree_id || + !bkey_eq(path->pos, ck->key.pos)); + *u = ck->k->k; k = bkey_i_to_s_c(ck->k); } @@ -3131,7 +3130,6 @@ struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx) trans = mempool_alloc(&c->btree_trans_pool, GFP_NOFS); memset(trans, 0, sizeof(*trans)); - closure_init_stack(&trans->ref); seqmutex_lock(&c->btree_trans_lock); if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { @@ -3151,15 +3149,10 @@ struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx) BUG_ON(pos_task && pid == pos_task->pid && pos->locked); - - if (pos_task && pid < pos_task->pid) { - list_add_tail(&trans->list, &pos->list); - goto list_add_done; - } } } - list_add_tail(&trans->list, &c->btree_trans_list); -list_add_done: + + list_add(&trans->list, &c->btree_trans_list); seqmutex_unlock(&c->btree_trans_lock); got_trans: trans->c = c; @@ -3200,6 +3193,8 @@ struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx) 
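/*
 * [Editor's sketch, not part of the patch] The btree_iter.c hunks below
 * depend on ordered publication: a trans sits on c->btree_trans_list before
 * it is fully initialized, so __bch2_trans_get() now publishes the ref last
 * with a release store, pairing with the acquire in closure_get_not_zero()
 * used by the debugfs and deadlock-detector readers:
 */
static void trans_ref_pairing_example(struct btree_trans *trans)
{
	/* writer, end of __bch2_trans_get(): release-publish init'd fields */
	closure_init_stack_release(&trans->ref);

	/* reader, e.g. debug.c: the tryget fails once teardown has begun */
	if (closure_get_not_zero(&trans->ref)) {
		/* acquire pairing: trans fields are guaranteed visible here */
		closure_put(&trans->ref);
	}
}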
trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier); trans->srcu_lock_time = jiffies; trans->srcu_held = true; + + closure_init_stack_release(&trans->ref); return trans; } @@ -3236,7 +3231,6 @@ void bch2_trans_put(struct btree_trans *trans) trans_for_each_update(trans, i) __btree_path_put(trans->paths + i->path, true); trans->nr_updates = 0; - trans->locking_wait.task = NULL; check_btree_paths_leaked(trans); @@ -3248,6 +3242,13 @@ void bch2_trans_put(struct btree_trans *trans) if (unlikely(trans->journal_replay_not_finished)) bch2_journal_keys_put(c); + /* + * trans->ref protects trans->locking_wait.task, btree_paths array; used + * by cycle detector + */ + closure_return_sync(&trans->ref); + trans->locking_wait.task = NULL; + unsigned long *paths_allocated = trans->paths_allocated; trans->paths_allocated = NULL; trans->paths = NULL; @@ -3265,8 +3266,6 @@ void bch2_trans_put(struct btree_trans *trans) trans = this_cpu_xchg(c->btree_trans_bufs->trans, trans); if (trans) { - closure_sync(&trans->ref); - seqmutex_lock(&c->btree_trans_lock); list_del(&trans->list); seqmutex_unlock(&c->btree_trans_lock); @@ -3386,8 +3385,6 @@ void bch2_fs_btree_iter_exit(struct bch_fs *c) per_cpu_ptr(c->btree_trans_bufs, cpu)->trans; if (trans) { - closure_sync(&trans->ref); - seqmutex_lock(&c->btree_trans_lock); list_del(&trans->list); seqmutex_unlock(&c->btree_trans_lock); diff --git a/libbcachefs/btree_key_cache.c b/libbcachefs/btree_key_cache.c index 8b2fd0ae6..f2f2e5254 100644 --- a/libbcachefs/btree_key_cache.c +++ b/libbcachefs/btree_key_cache.c @@ -205,9 +205,22 @@ static void bkey_cached_free_fast(struct btree_key_cache *bc, six_unlock_intent(&ck->c.lock); } +static struct bkey_cached *__bkey_cached_alloc(unsigned key_u64s, gfp_t gfp) +{ + struct bkey_cached *ck = kmem_cache_zalloc(bch2_key_cache, gfp); + if (unlikely(!ck)) + return NULL; + ck->k = kmalloc(key_u64s * sizeof(u64), gfp); + if (unlikely(!ck->k)) { + kmem_cache_free(bch2_key_cache, ck); + return NULL; + } + ck->u64s = key_u64s; + return ck; +} + static struct bkey_cached * -bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, - bool *was_new) +bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, unsigned key_u64s) { struct bch_fs *c = trans->c; struct btree_key_cache *bc = &c->btree_key_cache; @@ -281,8 +294,10 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, } ck = allocate_dropping_locks(trans, ret, - kmem_cache_zalloc(bch2_key_cache, _gfp)); + __bkey_cached_alloc(key_u64s, _gfp)); if (ret) { + if (ck) + kfree(ck->k); kmem_cache_free(bch2_key_cache, ck); return ERR_PTR(ret); } @@ -296,7 +311,6 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, ck->c.cached = true; BUG_ON(!six_trylock_intent(&ck->c.lock)); BUG_ON(!six_trylock_write(&ck->c.lock)); - *was_new = true; return ck; } @@ -326,71 +340,102 @@ bkey_cached_reuse(struct btree_key_cache *c) return ck; } -static struct bkey_cached * -btree_key_cache_create(struct btree_trans *trans, struct btree_path *path) +static int btree_key_cache_create(struct btree_trans *trans, struct btree_path *path, + struct bkey_s_c k) { struct bch_fs *c = trans->c; struct btree_key_cache *bc = &c->btree_key_cache; - struct bkey_cached *ck; - bool was_new = false; - ck = bkey_cached_alloc(trans, path, &was_new); - if (IS_ERR(ck)) - return ck; + /* + * bch2_varint_decode can read past the end of the buffer by at + * most 7 bytes (it won't be used): + */ + unsigned key_u64s = k.k->u64s + 1; + + /* + * Allocate some extra 
space so that the transaction commit path is less + * likely to have to reallocate, since that requires a transaction + * restart: + */ + key_u64s = min(256U, (key_u64s * 3) / 2); + key_u64s = roundup_pow_of_two(key_u64s); + + struct bkey_cached *ck = bkey_cached_alloc(trans, path, key_u64s); + int ret = PTR_ERR_OR_ZERO(ck); + if (ret) + return ret; if (unlikely(!ck)) { ck = bkey_cached_reuse(bc); if (unlikely(!ck)) { bch_err(c, "error allocating memory for key cache item, btree %s", bch2_btree_id_str(path->btree_id)); - return ERR_PTR(-BCH_ERR_ENOMEM_btree_key_cache_create); + return -BCH_ERR_ENOMEM_btree_key_cache_create; } - - mark_btree_node_locked(trans, path, 0, BTREE_NODE_INTENT_LOCKED); } ck->c.level = 0; ck->c.btree_id = path->btree_id; ck->key.btree_id = path->btree_id; ck->key.pos = path->pos; - ck->valid = false; ck->flags = 1U << BKEY_CACHED_ACCESSED; - if (unlikely(rhashtable_lookup_insert_fast(&bc->table, - &ck->hash, - bch2_btree_key_cache_params))) { - /* We raced with another fill: */ - - if (likely(was_new)) { - six_unlock_write(&ck->c.lock); - six_unlock_intent(&ck->c.lock); - kfree(ck); - } else { - bkey_cached_free_fast(bc, ck); + if (unlikely(key_u64s > ck->u64s)) { + mark_btree_node_locked_noreset(path, 0, BTREE_NODE_UNLOCKED); + + struct bkey_i *new_k = allocate_dropping_locks(trans, ret, + kmalloc(key_u64s * sizeof(u64), _gfp)); + if (unlikely(!new_k)) { + bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u", + bch2_btree_id_str(ck->key.btree_id), key_u64s); + ret = -BCH_ERR_ENOMEM_btree_key_cache_fill; + } else if (ret) { + kfree(new_k); + goto err; } - mark_btree_node_locked(trans, path, 0, BTREE_NODE_UNLOCKED); - return NULL; + kfree(ck->k); + ck->k = new_k; + ck->u64s = key_u64s; } - atomic_long_inc(&bc->nr_keys); + bkey_reassemble(ck->k, k); + ret = rhashtable_lookup_insert_fast(&bc->table, &ck->hash, bch2_btree_key_cache_params); + if (unlikely(ret)) /* raced with another fill? 
*/ + goto err; + + atomic_long_inc(&bc->nr_keys); six_unlock_write(&ck->c.lock); - return ck; + enum six_lock_type lock_want = __btree_lock_want(path, 0); + if (lock_want == SIX_LOCK_read) + six_lock_downgrade(&ck->c.lock); + btree_path_cached_set(trans, path, ck, (enum btree_node_locked_type) lock_want); + path->uptodate = BTREE_ITER_UPTODATE; + return 0; +err: + bkey_cached_free_fast(bc, ck); + mark_btree_node_locked_noreset(path, 0, BTREE_NODE_UNLOCKED); + + return ret; } -static int btree_key_cache_fill(struct btree_trans *trans, - struct btree_path *ck_path, - struct bkey_cached *ck) +static noinline int btree_key_cache_fill(struct btree_trans *trans, + struct btree_path *ck_path, + unsigned flags) { + if (flags & BTREE_ITER_cached_nofill) { + ck_path->uptodate = BTREE_ITER_UPTODATE; + return 0; + } + + struct bch_fs *c = trans->c; struct btree_iter iter; struct bkey_s_c k; - unsigned new_u64s = 0; - struct bkey_i *new_k = NULL; int ret; - bch2_trans_iter_init(trans, &iter, ck->key.btree_id, ck->key.pos, + bch2_trans_iter_init(trans, &iter, ck_path->btree_id, ck_path->pos, BTREE_ITER_key_cache_fill| BTREE_ITER_cached_nofill); iter.flags &= ~BTREE_ITER_with_journal; @@ -399,70 +444,15 @@ static int btree_key_cache_fill(struct btree_trans *trans, if (ret) goto err; - if (!bch2_btree_node_relock(trans, ck_path, 0)) { - trace_and_count(trans->c, trans_restart_relock_key_cache_fill, trans, _THIS_IP_, ck_path); - ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_fill); - goto err; - } - - /* - * bch2_varint_decode can read past the end of the buffer by at - * most 7 bytes (it won't be used): - */ - new_u64s = k.k->u64s + 1; - - /* - * Allocate some extra space so that the transaction commit path is less - * likely to have to reallocate, since that requires a transaction - * restart: - */ - new_u64s = min(256U, (new_u64s * 3) / 2); - - if (new_u64s > ck->u64s) { - new_u64s = roundup_pow_of_two(new_u64s); - new_k = kmalloc(new_u64s * sizeof(u64), GFP_NOWAIT|__GFP_NOWARN); - if (!new_k) { - bch2_trans_unlock(trans); - - new_k = kmalloc(new_u64s * sizeof(u64), GFP_KERNEL); - if (!new_k) { - bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u", - bch2_btree_id_str(ck->key.btree_id), new_u64s); - ret = -BCH_ERR_ENOMEM_btree_key_cache_fill; - goto err; - } - - ret = bch2_trans_relock(trans); - if (ret) { - kfree(new_k); - goto err; - } - - if (!bch2_btree_node_relock(trans, ck_path, 0)) { - kfree(new_k); - trace_and_count(trans->c, trans_restart_relock_key_cache_fill, trans, _THIS_IP_, ck_path); - ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_fill); - goto err; - } - } - } + /* Recheck after btree lookup, before allocating: */ + ret = bch2_btree_key_cache_find(c, ck_path->btree_id, ck_path->pos) ? 
-EEXIST : 0; + if (unlikely(ret)) + goto out; - ret = bch2_btree_node_lock_write(trans, ck_path, &ck_path->l[0].b->c); - if (ret) { - kfree(new_k); + ret = btree_key_cache_create(trans, ck_path, k); + if (ret) goto err; - } - - if (new_k) { - kfree(ck->k); - ck->u64s = new_u64s; - ck->k = new_k; - } - - bkey_reassemble(ck->k, k); - ck->valid = true; - bch2_btree_node_unlock_write(trans, ck_path, ck_path->l[0].b); - +out: /* We're not likely to need this iterator again: */ bch2_set_btree_iter_dontneed(&iter); err: @@ -470,128 +460,62 @@ static int btree_key_cache_fill(struct btree_trans *trans, return ret; } -static noinline int -bch2_btree_path_traverse_cached_slowpath(struct btree_trans *trans, struct btree_path *path, - unsigned flags) +static inline int btree_path_traverse_cached_fast(struct btree_trans *trans, + struct btree_path *path) { struct bch_fs *c = trans->c; struct bkey_cached *ck; - int ret = 0; - - BUG_ON(path->level); - - path->l[1].b = NULL; - - if (bch2_btree_node_relock_notrace(trans, path, 0)) { - ck = (void *) path->l[0].b; - goto fill; - } retry: ck = bch2_btree_key_cache_find(c, path->btree_id, path->pos); - if (!ck) { - ck = btree_key_cache_create(trans, path); - ret = PTR_ERR_OR_ZERO(ck); - if (ret) - goto err; - if (!ck) - goto retry; - - btree_path_cached_set(trans, path, ck, BTREE_NODE_INTENT_LOCKED); - path->locks_want = 1; - } else { - enum six_lock_type lock_want = __btree_lock_want(path, 0); - - ret = btree_node_lock(trans, path, (void *) ck, 0, - lock_want, _THIS_IP_); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto err; - - BUG_ON(ret); - - if (ck->key.btree_id != path->btree_id || - !bpos_eq(ck->key.pos, path->pos)) { - six_unlock_type(&ck->c.lock, lock_want); - goto retry; - } + if (!ck) + return -ENOENT; - btree_path_cached_set(trans, path, ck, (enum btree_node_locked_type) lock_want); - } -fill: - path->uptodate = BTREE_ITER_UPTODATE; + enum six_lock_type lock_want = __btree_lock_want(path, 0); - if (!ck->valid && !(flags & BTREE_ITER_cached_nofill)) { - ret = bch2_btree_path_upgrade(trans, path, 1) ?: - btree_key_cache_fill(trans, path, ck) ?: - bch2_btree_path_relock(trans, path, _THIS_IP_); - if (ret) - goto err; + int ret = btree_node_lock(trans, path, (void *) ck, 0, lock_want, _THIS_IP_); + if (ret) + return ret; - path->uptodate = BTREE_ITER_UPTODATE; + if (ck->key.btree_id != path->btree_id || + !bpos_eq(ck->key.pos, path->pos)) { + six_unlock_type(&ck->c.lock, lock_want); + goto retry; } if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags)) set_bit(BKEY_CACHED_ACCESSED, &ck->flags); - BUG_ON(btree_node_locked_type(path, 0) != btree_lock_want(path, 0)); - BUG_ON(path->uptodate); - - return ret; -err: - path->uptodate = BTREE_ITER_NEED_TRAVERSE; - if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) { - btree_node_unlock(trans, path, 0); - path->l[0].b = ERR_PTR(ret); - } - return ret; + btree_path_cached_set(trans, path, ck, (enum btree_node_locked_type) lock_want); + path->uptodate = BTREE_ITER_UPTODATE; + return 0; } int bch2_btree_path_traverse_cached(struct btree_trans *trans, struct btree_path *path, unsigned flags) { - struct bch_fs *c = trans->c; - struct bkey_cached *ck; - int ret = 0; - EBUG_ON(path->level); path->l[1].b = NULL; if (bch2_btree_node_relock_notrace(trans, path, 0)) { - ck = (void *) path->l[0].b; - goto fill; + path->uptodate = BTREE_ITER_UPTODATE; + return 0; } -retry: - ck = bch2_btree_key_cache_find(c, path->btree_id, path->pos); - if (!ck) - return bch2_btree_path_traverse_cached_slowpath(trans, path, 
flags); - - enum six_lock_type lock_want = __btree_lock_want(path, 0); - ret = btree_node_lock(trans, path, (void *) ck, 0, - lock_want, _THIS_IP_); - EBUG_ON(ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)); - - if (ret) - return ret; - - if (ck->key.btree_id != path->btree_id || - !bpos_eq(ck->key.pos, path->pos)) { - six_unlock_type(&ck->c.lock, lock_want); - goto retry; + int ret; + do { + ret = btree_path_traverse_cached_fast(trans, path); + if (unlikely(ret == -ENOENT)) + ret = btree_key_cache_fill(trans, path, flags); + } while (ret == -EEXIST); + + if (unlikely(ret)) { + path->uptodate = BTREE_ITER_NEED_TRAVERSE; + if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) { + btree_node_unlock(trans, path, 0); + path->l[0].b = ERR_PTR(ret); + } } - - btree_path_cached_set(trans, path, ck, (enum btree_node_locked_type) lock_want); -fill: - if (!ck->valid) - return bch2_btree_path_traverse_cached_slowpath(trans, path, flags); - - if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags)) - set_bit(BKEY_CACHED_ACCESSED, &ck->flags); - - path->uptodate = BTREE_ITER_UPTODATE; - EBUG_ON(!ck->valid); - EBUG_ON(btree_node_locked_type(path, 0) != btree_lock_want(path, 0)); - return ret; } @@ -630,8 +554,6 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, goto out; } - BUG_ON(!ck->valid); - if (journal_seq && ck->journal.seq != journal_seq) goto out; @@ -753,7 +675,6 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans, BUG_ON(insert->k.u64s > ck->u64s); bkey_copy(ck->k, insert); - ck->valid = true; if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { EBUG_ON(test_bit(BCH_FS_clean_shutdown, &c->flags)); @@ -792,10 +713,9 @@ void bch2_btree_key_cache_drop(struct btree_trans *trans, struct btree_path *path) { struct bch_fs *c = trans->c; + struct btree_key_cache *bc = &c->btree_key_cache; struct bkey_cached *ck = (void *) path->l[0].b; - BUG_ON(!ck->valid); - /* * We just did an update to the btree, bypassing the key cache: the key * cache key is now stale and must be dropped, even if dirty: @@ -806,7 +726,11 @@ void bch2_btree_key_cache_drop(struct btree_trans *trans, bch2_journal_pin_drop(&c->journal, &ck->journal); } - ck->valid = false; + bkey_cached_evict(bc, ck); + bkey_cached_free_fast(bc, ck); + + mark_btree_node_locked(trans, path, 0, BTREE_NODE_UNLOCKED); + btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); } static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, diff --git a/libbcachefs/btree_trans_commit.c b/libbcachefs/btree_trans_commit.c index 8ab85f212..cca336fe4 100644 --- a/libbcachefs/btree_trans_commit.c +++ b/libbcachefs/btree_trans_commit.c @@ -137,7 +137,8 @@ static inline void bch2_trans_unlock_write(struct btree_trans *trans) { if (likely(trans->write_locked)) { trans_for_each_update(trans, i) - if (!same_leaf_as_prev(trans, i)) + if (btree_node_locked_type(trans->paths + i->path, i->level) == + BTREE_NODE_WRITE_LOCKED) bch2_btree_node_unlock_write_inlined(trans, trans->paths + i->path, insert_l(trans, i)->b); trans->write_locked = false; @@ -777,14 +778,12 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, trans_for_each_update(trans, i) { struct btree_path *path = trans->paths + i->path; - if (!i->cached) { + if (!i->cached) bch2_btree_insert_key_leaf(trans, path, i->k, trans->journal_res.seq); - } else if (!i->key_cache_already_flushed) + else if (!i->key_cache_already_flushed) bch2_btree_insert_key_cached(trans, flags, i); - else { + else bch2_btree_key_cache_drop(trans, path); - 
btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); - } } return 0; diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h index e7a78ef62..c9c9864a8 100644 --- a/libbcachefs/btree_types.h +++ b/libbcachefs/btree_types.h @@ -388,7 +388,6 @@ struct bkey_cached { unsigned long flags; unsigned long btree_trans_barrier_seq; u16 u64s; - bool valid; struct bkey_cached_key key; struct rhash_head hash; diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c index 74c0fce39..25549c231 100644 --- a/libbcachefs/buckets.c +++ b/libbcachefs/buckets.c @@ -569,7 +569,7 @@ static int bch2_trigger_pointer(struct btree_trans *trans, *sectors = insert ? bp.bucket_len : -((s64) bp.bucket_len); if (flags & BTREE_TRIGGER_transactional) { - struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update(trans, bucket); + struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update(trans, bucket, 0); ret = PTR_ERR_OR_ZERO(a) ?: __mark_pointer(trans, ca, k, &p, *sectors, bp.data_type, &a->v); if (ret) @@ -1217,7 +1217,6 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) bucket_gens->nbuckets - bucket_gens->first_bucket; if (resize) { - down_write(&c->gc_lock); down_write(&ca->bucket_lock); percpu_down_write(&c->mark_lock); } @@ -1240,7 +1239,6 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) if (resize) { percpu_up_write(&c->mark_lock); up_write(&ca->bucket_lock); - up_write(&c->gc_lock); } ret = 0; diff --git a/libbcachefs/buckets.h b/libbcachefs/buckets.h index fa5637967..4a14741b8 100644 --- a/libbcachefs/buckets.h +++ b/libbcachefs/buckets.h @@ -85,7 +85,7 @@ static inline struct bucket_array *gc_bucket_array(struct bch_dev *ca) return rcu_dereference_check(ca->buckets_gc, !ca->fs || percpu_rwsem_is_held(&ca->fs->mark_lock) || - lockdep_is_held(&ca->fs->gc_lock) || + lockdep_is_held(&ca->fs->state_lock) || lockdep_is_held(&ca->bucket_lock)); } @@ -103,7 +103,7 @@ static inline struct bucket_gens *bucket_gens(struct bch_dev *ca) return rcu_dereference_check(ca->bucket_gens, !ca->fs || percpu_rwsem_is_held(&ca->fs->mark_lock) || - lockdep_is_held(&ca->fs->gc_lock) || + lockdep_is_held(&ca->fs->state_lock) || lockdep_is_held(&ca->bucket_lock)); } diff --git a/libbcachefs/chardev.c b/libbcachefs/chardev.c index 4248c251f..ef1f74866 100644 --- a/libbcachefs/chardev.c +++ b/libbcachefs/chardev.c @@ -214,22 +214,10 @@ static long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_a if (arg.opts) { char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16); - char *ro, *rest; - - /* - * If passed a "read_only" mount option, remove it because it is - * no longer a valid mount option, and the filesystem will be - * set "read_only" regardless. 
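/*
 * [Editor's sketch, not part of the patch] Context for the chardev.c
 * error-path fixes below: strndup_user() returns an ERR_PTR() on failure,
 * never NULL, so the old unconditional kfree(optstr) could free an error
 * pointer. The corrected shape, as now used by the fsck ioctls:
 */
static int parse_user_opts_example(struct bch_fs *c, struct bch_opts *opts,
				   const char __user *uopts)
{
	char *optstr = strndup_user(uopts, 1 << 16);
	int ret = PTR_ERR_OR_ZERO(optstr) ?:
		bch2_parse_mount_opts(c, opts, NULL, optstr);

	if (!IS_ERR(optstr))
		kfree(optstr);
	return ret;
}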
- */ - ro = strstr(optstr, "read_only"); - if (ro) { - rest = ro + strlen("read_only"); - memmove(ro, rest, strlen(rest) + 1); - } - ret = PTR_ERR_OR_ZERO(optstr) ?: bch2_parse_mount_opts(NULL, &thr->opts, NULL, optstr); - kfree(optstr); + if (!IS_ERR(optstr)) + kfree(optstr); if (ret) goto err; @@ -333,7 +321,8 @@ static long bch2_ioctl_disk_add(struct bch_fs *c, struct bch_ioctl_disk arg) return ret; ret = bch2_dev_add(c, path); - kfree(path); + if (!IS_ERR(path)) + kfree(path); return ret; } @@ -579,7 +568,6 @@ static long bch2_ioctl_query_accounting(struct bch_fs *c, ret = copy_to_user_errcode(user_arg, &arg, sizeof(arg)); err: - bch_err_fn(c, ret); darray_exit(&accounting); return ret; } @@ -861,7 +849,8 @@ static long bch2_ioctl_fsck_online(struct bch_fs *c, ret = PTR_ERR_OR_ZERO(optstr) ?: bch2_parse_mount_opts(c, &thr->opts, NULL, optstr); - kfree(optstr); + if (!IS_ERR(optstr)) + kfree(optstr); if (ret) goto err; diff --git a/libbcachefs/checksum.c b/libbcachefs/checksum.c index 3bd3aba90..e7208bf19 100644 --- a/libbcachefs/checksum.c +++ b/libbcachefs/checksum.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -436,7 +437,7 @@ int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio, if (bch2_crc_cmp(merged, crc_old.csum) && !c->opts.no_data_io) { struct printbuf buf = PRINTBUF; prt_printf(&buf, "checksum error in %s() (memory corruption or bug?)\n" - "expected %0llx:%0llx got %0llx:%0llx (old type ", + " expected %0llx:%0llx got %0llx:%0llx (old type ", __func__, crc_old.csum.hi, crc_old.csum.lo, @@ -446,7 +447,7 @@ int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio, prt_str(&buf, " new type "); bch2_prt_csum_type(&buf, new_csum_type); prt_str(&buf, ")"); - bch_err(c, "%s", buf.buf); + WARN_RATELIMIT(1, "%s", buf.buf); printbuf_exit(&buf); return -EIO; } diff --git a/libbcachefs/debug.c b/libbcachefs/debug.c index 51cbf3928..f0d4727c4 100644 --- a/libbcachefs/debug.c +++ b/libbcachefs/debug.c @@ -568,6 +568,32 @@ static const struct file_operations cached_btree_nodes_ops = { .read = bch2_cached_btree_nodes_read, }; +typedef int (*list_cmp_fn)(const struct list_head *l, const struct list_head *r); + +static void list_sort(struct list_head *head, list_cmp_fn cmp) +{ + struct list_head *pos; + + list_for_each(pos, head) + while (!list_is_last(pos, head) && + cmp(pos, pos->next) > 0) { + struct list_head *pos2, *next = pos->next; + + list_del(next); + list_for_each(pos2, head) + if (cmp(next, pos2) < 0) + goto pos_found; + BUG(); +pos_found: + list_add_tail(next, pos2); + } +} + +static int list_ptr_order_cmp(const struct list_head *l, const struct list_head *r) +{ + return cmp_int(l, r); +} + static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) { @@ -575,41 +601,39 @@ static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf, struct bch_fs *c = i->c; struct btree_trans *trans; ssize_t ret = 0; - u32 seq; i->ubuf = buf; i->size = size; i->ret = 0; restart: seqmutex_lock(&c->btree_trans_lock); - list_for_each_entry(trans, &c->btree_trans_list, list) { - struct task_struct *task = READ_ONCE(trans->locking_wait.task); + list_sort(&c->btree_trans_list, list_ptr_order_cmp); - if (!task || task->pid <= i->iter) + list_for_each_entry(trans, &c->btree_trans_list, list) { + if ((ulong) trans < i->iter) continue; - closure_get(&trans->ref); - seq = seqmutex_seq(&c->btree_trans_lock); - seqmutex_unlock(&c->btree_trans_lock); + i->iter = (ulong) trans; - ret = flush_buf(i); - 
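/*
 * [Editor's sketch, not part of the patch] The transaction-list walk above
 * (and btree_deadlock_to_text() below) share one pattern: hold the seqmutex
 * only long enough to pick a candidate, try-get its ref so teardown can't
 * race, then relock and restart if the lock sequence changed meanwhile:
 */
static void walk_btree_trans_list_example(struct bch_fs *c)
{
	struct btree_trans *trans;
restart:
	seqmutex_lock(&c->btree_trans_lock);
	list_for_each_entry(trans, &c->btree_trans_list, list) {
		if (!closure_get_not_zero(&trans->ref))
			continue;	/* trans is being torn down */

		u32 seq = seqmutex_unlock(&c->btree_trans_lock);

		/* ... inspect trans without the lock held ... */

		closure_put(&trans->ref);
		if (!seqmutex_relock(&c->btree_trans_lock, seq))
			goto restart;
	}
	seqmutex_unlock(&c->btree_trans_lock);
}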
if (ret) { - closure_put(&trans->ref); - goto unlocked; - } + if (!closure_get_not_zero(&trans->ref)) + continue; + + u32 seq = seqmutex_unlock(&c->btree_trans_lock); bch2_btree_trans_to_text(&i->buf, trans); prt_printf(&i->buf, "backtrace:\n"); printbuf_indent_add(&i->buf, 2); - bch2_prt_task_backtrace(&i->buf, task, 0, GFP_KERNEL); + bch2_prt_task_backtrace(&i->buf, trans->locking_wait.task, 0, GFP_KERNEL); printbuf_indent_sub(&i->buf, 2); prt_newline(&i->buf); - i->iter = task->pid; - closure_put(&trans->ref); + ret = flush_buf(i); + if (ret) + goto unlocked; + if (!seqmutex_relock(&c->btree_trans_lock, seq)) goto restart; } @@ -804,50 +828,55 @@ static const struct file_operations btree_transaction_stats_op = { .read = btree_transaction_stats_read, }; -static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf, - size_t size, loff_t *ppos) +/* walk btree transactions until we find a deadlock and print it */ +static void btree_deadlock_to_text(struct printbuf *out, struct bch_fs *c) { - struct dump_iter *i = file->private_data; - struct bch_fs *c = i->c; struct btree_trans *trans; - ssize_t ret = 0; - u32 seq; - - i->ubuf = buf; - i->size = size; - i->ret = 0; - - if (i->iter) - goto out; + pid_t iter = 0; restart: seqmutex_lock(&c->btree_trans_lock); list_for_each_entry(trans, &c->btree_trans_list, list) { struct task_struct *task = READ_ONCE(trans->locking_wait.task); - if (!task || task->pid <= i->iter) + if (!task || task->pid <= iter) continue; - closure_get(&trans->ref); - seq = seqmutex_seq(&c->btree_trans_lock); - seqmutex_unlock(&c->btree_trans_lock); + iter = task->pid; - ret = flush_buf(i); - if (ret) { - closure_put(&trans->ref); - goto out; - } + if (!closure_get_not_zero(&trans->ref)) + continue; - bch2_check_for_deadlock(trans, &i->buf); + u32 seq = seqmutex_unlock(&c->btree_trans_lock); - i->iter = task->pid; + bool found = bch2_check_for_deadlock(trans, out) != 0; closure_put(&trans->ref); + if (found) + return; + if (!seqmutex_relock(&c->btree_trans_lock, seq)) goto restart; } seqmutex_unlock(&c->btree_trans_lock); -out: +} + +static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf, + size_t size, loff_t *ppos) +{ + struct dump_iter *i = file->private_data; + struct bch_fs *c = i->c; + ssize_t ret = 0; + + i->ubuf = buf; + i->size = size; + i->ret = 0; + + if (!i->iter) { + btree_deadlock_to_text(&i->buf, c); + i->iter++; + } + if (i->buf.allocation_failure) ret = -ENOMEM; diff --git a/libbcachefs/disk_accounting.c b/libbcachefs/disk_accounting.c index 370374591..dcdd59249 100644 --- a/libbcachefs/disk_accounting.c +++ b/libbcachefs/disk_accounting.c @@ -521,8 +521,9 @@ int bch2_gc_accounting_done(struct bch_fs *c) return ret; } -static int accounting_read_key(struct bch_fs *c, struct btree_trans *trans, struct bkey_s_c k) +static int accounting_read_key(struct btree_trans *trans, struct bkey_s_c k) { + struct bch_fs *c = trans->c; struct printbuf buf = PRINTBUF; if (k.k->type != KEY_TYPE_accounting) @@ -557,15 +558,15 @@ static int accounting_read_key(struct bch_fs *c, struct btree_trans *trans, stru int bch2_accounting_read(struct bch_fs *c) { struct bch_accounting_mem *acc = &c->accounting; + struct btree_trans *trans = bch2_trans_get(c); - int ret = bch2_trans_run(c, - for_each_btree_key(trans, iter, + int ret = for_each_btree_key(trans, iter, BTREE_ID_accounting, POS_MIN, BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, ({ struct bkey u; struct bkey_s_c k = bch2_btree_path_peek_slot_exact(btree_iter_path(trans, &iter), &u); - 
accounting_read_key(c, trans, k); - }))); + accounting_read_key(trans, k); + })); if (ret) goto err; @@ -598,7 +599,7 @@ int bch2_accounting_read(struct bch_fs *c) continue; } - ret = accounting_read_key(c, NULL, k); + ret = accounting_read_key(trans, k); if (ret) goto err; } @@ -645,6 +646,7 @@ int bch2_accounting_read(struct bch_fs *c) preempt_enable(); percpu_up_read(&c->mark_lock); err: + bch2_trans_put(trans); bch_err_fn(c, ret); return ret; } diff --git a/libbcachefs/ec.c b/libbcachefs/ec.c index 3c3a2a7e8..86948d110 100644 --- a/libbcachefs/ec.c +++ b/libbcachefs/ec.c @@ -283,7 +283,7 @@ static int mark_stripe_bucket(struct btree_trans *trans, if (flags & BTREE_TRIGGER_transactional) { struct bkey_i_alloc_v4 *a = - bch2_trans_start_alloc_update(trans, bucket); + bch2_trans_start_alloc_update(trans, bucket, 0); ret = PTR_ERR_OR_ZERO(a) ?: __mark_stripe_bucket(trans, ca, s, ptr_idx, deleting, bucket, &a->v, flags); } diff --git a/libbcachefs/error.c b/libbcachefs/error.c index 9d7cc79ed..a62b63108 100644 --- a/libbcachefs/error.c +++ b/libbcachefs/error.c @@ -16,6 +16,7 @@ bool bch2_inconsistent_error(struct bch_fs *c) switch (c->opts.errors) { case BCH_ON_ERROR_continue: return false; + case BCH_ON_ERROR_fix_safe: case BCH_ON_ERROR_ro: if (bch2_fs_emergency_read_only(c)) bch_err(c, "inconsistency detected - emergency read only at journal seq %llu", @@ -211,6 +212,12 @@ static void prt_actioning(struct printbuf *out, const char *action) prt_str(out, "ing"); } +static const u8 fsck_flags_extra[] = { +#define x(t, n, flags) [BCH_FSCK_ERR_##t] = flags, + BCH_SB_ERRS() +#undef x +}; + int __bch2_fsck_err(struct bch_fs *c, struct btree_trans *trans, enum bch_fsck_flags flags, @@ -226,6 +233,9 @@ int __bch2_fsck_err(struct bch_fs *c, might_sleep(); + if (!WARN_ON(err >= ARRAY_SIZE(fsck_flags_extra))) + flags |= fsck_flags_extra[err]; + if (!c) c = trans->c; @@ -293,7 +303,14 @@ int __bch2_fsck_err(struct bch_fs *c, prt_printf(out, bch2_log_msg(c, "")); #endif - if (!test_bit(BCH_FS_fsck_running, &c->flags)) { + if ((flags & FSCK_CAN_FIX) && + (flags & FSCK_AUTOFIX) && + (c->opts.errors == BCH_ON_ERROR_continue || + c->opts.errors == BCH_ON_ERROR_fix_safe)) { + prt_str(out, ", "); + prt_actioning(out, action); + ret = -BCH_ERR_fsck_fix; + } else if (!test_bit(BCH_FS_fsck_running, &c->flags)) { if (c->opts.errors != BCH_ON_ERROR_continue || !(flags & (FSCK_CAN_FIX|FSCK_CAN_IGNORE))) { prt_str(out, ", shutting down"); diff --git a/libbcachefs/error.h b/libbcachefs/error.h index ead36936e..995e6bba9 100644 --- a/libbcachefs/error.h +++ b/libbcachefs/error.h @@ -108,13 +108,6 @@ struct fsck_err_state { char *last_msg; }; -enum bch_fsck_flags { - FSCK_CAN_FIX = 1 << 0, - FSCK_CAN_IGNORE = 1 << 1, - FSCK_NEED_FSCK = 1 << 2, - FSCK_NO_RATELIMIT = 1 << 3, -}; - #define fsck_err_count(_c, _err) bch2_sb_err_count(_c, BCH_FSCK_ERR_##_err) __printf(5, 6) __cold diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c index 410b8bd81..057df38fc 100644 --- a/libbcachefs/extents.c +++ b/libbcachefs/extents.c @@ -1034,6 +1034,18 @@ void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *c, const struc --out->atomic; } +void bch2_extent_crc_unpacked_to_text(struct printbuf *out, struct bch_extent_crc_unpacked *crc) +{ + prt_printf(out, "crc: c_size %u size %u offset %u nonce %u csum ", + crc->compressed_size, + crc->uncompressed_size, + crc->offset, crc->nonce); + bch2_prt_csum_type(out, crc->csum_type); + prt_printf(out, " %0llx:%0llx ", crc->csum.hi, crc->csum.lo); + prt_str(out, " compress "); 
+ bch2_prt_compression_type(out, crc->compression_type); +} + void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) { @@ -1059,13 +1071,7 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, struct bch_extent_crc_unpacked crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry)); - prt_printf(out, "crc: c_size %u size %u offset %u nonce %u csum ", - crc.compressed_size, - crc.uncompressed_size, - crc.offset, crc.nonce); - bch2_prt_csum_type(out, crc.csum_type); - prt_str(out, " compress "); - bch2_prt_compression_type(out, crc.compression_type); + bch2_extent_crc_unpacked_to_text(out, &crc); break; } case BCH_EXTENT_ENTRY_stripe_ptr: { @@ -1096,6 +1102,7 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, } } + static int extent_ptr_invalid(struct bch_fs *c, struct bkey_s_c k, enum bch_validate_flags flags, diff --git a/libbcachefs/extents.h b/libbcachefs/extents.h index 1ade95965..530686aa6 100644 --- a/libbcachefs/extents.h +++ b/libbcachefs/extents.h @@ -212,6 +212,8 @@ static inline bool crc_is_encoded(struct bch_extent_crc_unpacked crc) return crc.csum_type != BCH_CSUM_none || crc_is_compressed(crc); } +void bch2_extent_crc_unpacked_to_text(struct printbuf *, struct bch_extent_crc_unpacked *); + /* bkey_ptrs: generically over any key type that has ptrs */ struct bkey_ptrs_c { diff --git a/libbcachefs/fs-io-buffered.c b/libbcachefs/fs-io-buffered.c index 54873ecc6..cc33d763f 100644 --- a/libbcachefs/fs-io-buffered.c +++ b/libbcachefs/fs-io-buffered.c @@ -678,8 +678,8 @@ int bch2_write_begin(struct file *file, struct address_space *mapping, bch2_pagecache_add_get(inode); folio = __filemap_get_folio(mapping, pos >> PAGE_SHIFT, - FGP_LOCK|FGP_WRITE|FGP_CREAT|FGP_STABLE, - mapping_gfp_mask(mapping)); + FGP_WRITEBEGIN | fgf_set_order(len), + mapping_gfp_mask(mapping)); if (IS_ERR_OR_NULL(folio)) goto err_unlock; @@ -820,9 +820,8 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, darray_init(&fs); ret = bch2_filemap_get_contig_folios_d(mapping, pos, end, - FGP_LOCK|FGP_WRITE|FGP_STABLE|FGP_CREAT, - mapping_gfp_mask(mapping), - &fs); + FGP_WRITEBEGIN | fgf_set_order(len), + mapping_gfp_mask(mapping), &fs); if (ret) goto out; @@ -864,24 +863,26 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, f_pos = pos; f_offset = pos - folio_pos(darray_first(fs)); darray_for_each(fs, fi) { + ssize_t f_reserved; + f = *fi; f_len = min(end, folio_end_pos(f)) - f_pos; + f_reserved = bch2_folio_reservation_get_partial(c, inode, f, &res, f_offset, f_len); + + if (unlikely(f_reserved != f_len)) { + if (f_reserved < 0) { + if (f == darray_first(fs)) { + ret = f_reserved; + goto out; + } + + folios_trunc(&fs, fi); + end = min(end, folio_end_pos(darray_last(fs))); + } else { + folios_trunc(&fs, fi + 1); + end = f_pos + f_reserved; + } - /* - * XXX: per POSIX and fstests generic/275, on -ENOSPC we're - * supposed to write as much as we have disk space for. 
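/*
 * [Editor's sketch, not part of the patch] The deleted XXX comment here is
 * what the new bch2_folio_reservation_get_partial() (fs-io-pagecache.c,
 * below) addresses: it retries with successively halved lengths, so on
 * -ENOSPC the buffered write shrinks to the reservable prefix instead of
 * failing outright. Caller contract, distilled:
 */
static ssize_t reserve_prefix_example(struct bch_fs *c, struct bch_inode_info *inode,
				      struct folio *f, struct bch2_folio_reservation *res,
				      size_t f_offset, size_t f_len)
{
	ssize_t f_reserved =
		bch2_folio_reservation_get_partial(c, inode, f, res, f_offset, f_len);

	if (f_reserved < 0)
		return f_reserved;	/* not even one block reservable: hard error */

	/*
	 * 0 < f_reserved <= f_len: per POSIX/generic/275, write only the
	 * first f_reserved bytes; __bch2_buffered_write truncates its folio
	 * array to match
	 */
	return f_reserved;
}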
- * - * On failure here we should still write out a partial page if - * we aren't completely out of disk space - we don't do that - * yet: - */ - ret = bch2_folio_reservation_get(c, inode, f, &res, f_offset, f_len); - if (unlikely(ret)) { - folios_trunc(&fs, fi); - if (!fs.nr) - goto out; - - end = min(end, folio_end_pos(darray_last(fs))); break; } diff --git a/libbcachefs/fs-io-direct.c b/libbcachefs/fs-io-direct.c index 049b61bc9..e246b1e05 100644 --- a/libbcachefs/fs-io-direct.c +++ b/libbcachefs/fs-io-direct.c @@ -179,7 +179,7 @@ ssize_t bch2_read_iter(struct kiocb *iocb, struct iov_iter *iter) struct bch_inode_info *inode = file_bch_inode(file); struct address_space *mapping = file->f_mapping; size_t count = iov_iter_count(iter); - ssize_t ret; + ssize_t ret = 0; if (!count) return 0; /* skip atime */ @@ -205,7 +205,7 @@ ssize_t bch2_read_iter(struct kiocb *iocb, struct iov_iter *iter) iocb->ki_pos += ret; } else { bch2_pagecache_add_get(inode); - ret = generic_file_read_iter(iocb, iter); + ret = filemap_read(iocb, iter, ret); bch2_pagecache_add_put(inode); } out: diff --git a/libbcachefs/fs-io-pagecache.c b/libbcachefs/fs-io-pagecache.c index 872283e5b..a9cc5cad9 100644 --- a/libbcachefs/fs-io-pagecache.c +++ b/libbcachefs/fs-io-pagecache.c @@ -423,7 +423,7 @@ int bch2_folio_reservation_get(struct bch_fs *c, struct bch_inode_info *inode, struct folio *folio, struct bch2_folio_reservation *res, - unsigned offset, unsigned len) + size_t offset, size_t len) { struct bch_folio *s = bch2_folio_create(folio, 0); unsigned i, disk_sectors = 0, quota_sectors = 0; @@ -437,8 +437,7 @@ int bch2_folio_reservation_get(struct bch_fs *c, for (i = round_down(offset, block_bytes(c)) >> 9; i < round_up(offset + len, block_bytes(c)) >> 9; i++) { - disk_sectors += sectors_to_reserve(&s->s[i], - res->disk.nr_replicas); + disk_sectors += sectors_to_reserve(&s->s[i], res->disk.nr_replicas); quota_sectors += s->s[i].state == SECTOR_unallocated; } @@ -449,12 +448,9 @@ int bch2_folio_reservation_get(struct bch_fs *c, } if (quota_sectors) { - ret = bch2_quota_reservation_add(c, inode, &res->quota, - quota_sectors, true); + ret = bch2_quota_reservation_add(c, inode, &res->quota, quota_sectors, true); if (unlikely(ret)) { - struct disk_reservation tmp = { - .sectors = disk_sectors - }; + struct disk_reservation tmp = { .sectors = disk_sectors }; bch2_disk_reservation_put(c, &tmp); res->disk.sectors -= disk_sectors; @@ -465,6 +461,31 @@ int bch2_folio_reservation_get(struct bch_fs *c, return 0; } +ssize_t bch2_folio_reservation_get_partial(struct bch_fs *c, + struct bch_inode_info *inode, + struct folio *folio, + struct bch2_folio_reservation *res, + size_t offset, size_t len) +{ + size_t l, reserved = 0; + int ret; + + while ((l = len - reserved)) { + while ((ret = bch2_folio_reservation_get(c, inode, folio, res, offset, l))) { + if ((offset & (block_bytes(c) - 1)) + l <= block_bytes(c)) + return reserved ?: ret; + + len = reserved + l; + l /= 2; + } + + offset += l; + reserved += l; + } + + return reserved; +} + static void bch2_clear_folio_bits(struct folio *folio) { struct bch_inode_info *inode = to_bch_ei(folio->mapping->host); diff --git a/libbcachefs/fs-io-pagecache.h b/libbcachefs/fs-io-pagecache.h index 828c3d7c8..fd7d692c0 100644 --- a/libbcachefs/fs-io-pagecache.h +++ b/libbcachefs/fs-io-pagecache.h @@ -153,7 +153,12 @@ int bch2_folio_reservation_get(struct bch_fs *, struct bch_inode_info *, struct folio *, struct bch2_folio_reservation *, - unsigned, unsigned); + size_t, size_t); +ssize_t 
bch2_folio_reservation_get_partial(struct bch_fs *, + struct bch_inode_info *, + struct folio *, + struct bch2_folio_reservation *, + size_t, size_t); void bch2_set_folio_dirty(struct bch_fs *, struct bch_inode_info *, diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c index 4a3e9f429..d34d628ff 100644 --- a/libbcachefs/fs.c +++ b/libbcachefs/fs.c @@ -188,6 +188,12 @@ static struct bch_inode_info *bch2_inode_insert(struct bch_fs *c, struct bch_ino BUG_ON(!old); if (unlikely(old != inode)) { + /* + * bcachefs doesn't use I_NEW; we have no use for it since we + * only insert fully created inodes in the inode hash table. But + * discard_new_inode() expects it to be set... + */ + inode->v.i_flags |= I_NEW; discard_new_inode(&inode->v); inode = old; } else { @@ -195,8 +201,10 @@ static struct bch_inode_info *bch2_inode_insert(struct bch_fs *c, struct bch_ino list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list); mutex_unlock(&c->vfs_inodes_lock); /* - * we really don't want insert_inode_locked2() to be setting - * I_NEW... + * Again, I_NEW makes no sense for bcachefs. This is only needed + * for clearing I_NEW, but since the inode was already fully + * created and initialized we didn't actually want + * inode_insert5() to set it for us. */ unlock_new_inode(&inode->v); } @@ -880,6 +888,16 @@ static int bch2_getattr(struct mnt_idmap *idmap, stat->subvol = inode->ei_subvol; stat->result_mask |= STATX_SUBVOL; + if ((request_mask & STATX_DIOALIGN) && S_ISREG(inode->v.i_mode)) { + stat->result_mask |= STATX_DIOALIGN; + /* + * this is incorrect; we should be tracking this in the + * superblock, and checking the alignment of open devices + */ + stat->dio_mem_align = SECTOR_SIZE; + stat->dio_offset_align = block_bytes(c); + } + if (request_mask & STATX_BTIME) { stat->result_mask |= STATX_BTIME; stat->btime = bch2_time_to_timespec(c, inode->ei_inode.bi_otime); @@ -1157,6 +1175,7 @@ static const struct file_operations bch_file_operations = { .read_iter = bch2_read_iter, .write_iter = bch2_write_iter, .mmap = bch2_mmap, + .get_unmapped_area = thp_get_unmapped_area, .fsync = bch2_fsync, .splice_read = filemap_splice_read, .splice_write = iter_file_splice_write, @@ -1488,11 +1507,6 @@ static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum, bch2_iget5_set(&inode->v, &inum); bch2_inode_update_after_write(trans, inode, bi, ~0); - if (BCH_SUBVOLUME_SNAP(subvol)) - set_bit(EI_INODE_SNAPSHOT, &inode->ei_flags); - else - clear_bit(EI_INODE_SNAPSHOT, &inode->ei_flags); - inode->v.i_blocks = bi->bi_sectors; inode->v.i_ino = bi->bi_inum; inode->v.i_rdev = bi->bi_dev; @@ -1504,6 +1518,9 @@ static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum, inode->ei_qid = bch_qid(bi); inode->ei_subvol = inum.subvol; + if (BCH_SUBVOLUME_SNAP(subvol)) + set_bit(EI_INODE_SNAPSHOT, &inode->ei_flags); + inode->v.i_mapping->a_ops = &bch_address_space_operations; switch (inode->v.i_mode & S_IFMT) { @@ -1776,7 +1793,8 @@ static int bch2_show_options(struct seq_file *seq, struct dentry *root) const struct bch_option *opt = &bch2_opt_table[i]; u64 v = bch2_opt_get_by_id(&c->opts, i); - if (!(opt->flags & OPT_MOUNT)) + if ((opt->flags & OPT_HIDDEN) || + !(opt->flags & OPT_MOUNT)) continue; if (v == bch2_opt_get_by_id(&bch2_opts_default, i)) diff --git a/libbcachefs/inode.c b/libbcachefs/inode.c index 6b807ecb5..1e20020ea 100644 --- a/libbcachefs/inode.c +++ b/libbcachefs/inode.c @@ -535,12 +535,13 @@ int bch2_inode_v3_invalid(struct bch_fs *c, struct bkey_s_c k, static void
__bch2_inode_unpacked_to_text(struct printbuf *out, struct bch_inode_unpacked *inode) { + prt_printf(out, "\n"); printbuf_indent_add(out, 2); prt_printf(out, "mode=%o\n", inode->bi_mode); prt_str(out, "flags="); prt_bitflags(out, bch2_inode_flag_strs, inode->bi_flags & ((1U << 20) - 1)); - prt_printf(out, " (%x)\n", inode->bi_flags); + prt_printf(out, "(%x)\n", inode->bi_flags); prt_printf(out, "journal_seq=%llu\n", inode->bi_journal_seq); prt_printf(out, "bi_size=%llu\n", inode->bi_size); diff --git a/libbcachefs/io_read.c b/libbcachefs/io_read.c index c97fa7002..2a5c4371f 100644 --- a/libbcachefs/io_read.c +++ b/libbcachefs/io_read.c @@ -389,7 +389,6 @@ static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio bch2_bkey_buf_reassemble(&sk, c, k); k = bkey_i_to_s_c(sk.k); - bch2_trans_unlock(trans); if (!bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, diff --git a/libbcachefs/io_write.c b/libbcachefs/io_write.c index c6197e6aa..b3b05e939 100644 --- a/libbcachefs/io_write.c +++ b/libbcachefs/io_write.c @@ -1080,7 +1080,10 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, *_dst = dst; return more; csum_err: - bch_err(c, "%s writ error: error verifying existing checksum while rewriting existing data (memory corruption?)", + bch_err_inum_offset_ratelimited(c, + op->pos.inode, + op->pos.offset << 9, + "%s write error: error verifying existing checksum while rewriting existing data (memory corruption?)", op->flags & BCH_WRITE_MOVE ? "move" : "user"); ret = -EIO; err: diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c index d5a9f3ada..6209d7787 100644 --- a/libbcachefs/journal.c +++ b/libbcachefs/journal.c @@ -1520,6 +1520,11 @@ bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64 struct journal_entry_pin *pin; spin_lock(&j->lock); + if (!test_bit(JOURNAL_running, &j->flags)) { + spin_unlock(&j->lock); + return true; + } + *seq = max(*seq, j->pin.front); if (*seq >= j->pin.back) { diff --git a/libbcachefs/journal_io.c b/libbcachefs/journal_io.c index 5fda9a931..ff832d20e 100644 --- a/libbcachefs/journal_io.c +++ b/libbcachefs/journal_io.c @@ -722,13 +722,16 @@ static void journal_entry_dev_usage_to_text(struct printbuf *out, struct bch_fs prt_printf(out, "dev=%u", le32_to_cpu(u->dev)); + printbuf_indent_add(out, 2); for (i = 0; i < nr_types; i++) { + prt_newline(out); bch2_prt_data_type(out, i); prt_printf(out, ": buckets=%llu sectors=%llu fragmented=%llu", le64_to_cpu(u->d[i].buckets), le64_to_cpu(u->d[i].sectors), le64_to_cpu(u->d[i].fragmented)); } + printbuf_indent_sub(out, 2); } static int journal_entry_log_validate(struct bch_fs *c, @@ -1678,6 +1681,13 @@ static CLOSURE_CALLBACK(journal_write_done) mod_delayed_work(j->wq, &j->write_work, max(0L, delta)); } + /* + * We don't typically trigger journal writes from here - the next journal + * write will be triggered immediately after the previous one is + * allocated, in bch2_journal_write() - but the journal write error path + * is special: + */ + bch2_journal_do_writes(j); spin_unlock(&j->lock); } @@ -1974,7 +1984,6 @@ CLOSURE_CALLBACK(bch2_journal_write) struct journal *j = container_of(w, struct journal, buf[w->idx]); struct bch_fs *c = container_of(j, struct bch_fs, journal); struct bch_replicas_padded replicas; - struct printbuf journal_debug_buf = PRINTBUF; unsigned nr_rw_members = 0; int ret; @@ -2018,11 +2027,16 @@ CLOSURE_CALLBACK(bch2_journal_write) } if (ret) { - __bch2_journal_debug_to_text(&journal_debug_buf, j); + struct printbuf buf = PRINTBUF;
+ buf.atomic++; + + prt_printf(&buf, bch2_fmt(c, "Unable to allocate journal write at seq %llu: %s"), + le64_to_cpu(w->data->seq), + bch2_err_str(ret)); + __bch2_journal_debug_to_text(&buf, j); spin_unlock(&j->lock); - bch_err(c, "Unable to allocate journal write:\n%s", - journal_debug_buf.buf); - printbuf_exit(&journal_debug_buf); + bch2_print_string_as_lines(KERN_ERR, buf.buf); + printbuf_exit(&buf); goto err; } diff --git a/libbcachefs/journal_seq_blacklist.c b/libbcachefs/journal_seq_blacklist.c index ed4846709..1f25c111c 100644 --- a/libbcachefs/journal_seq_blacklist.c +++ b/libbcachefs/journal_seq_blacklist.c @@ -232,7 +232,7 @@ bool bch2_blacklist_entries_gc(struct bch_fs *c) BUG_ON(nr != t->nr); unsigned i; - for (src = bl->start, i = eytzinger0_first(t->nr); + for (src = bl->start, i = t->nr == 0 ? 0 : eytzinger0_first(t->nr); src < bl->start + nr; src++, i = eytzinger0_next(i, nr)) { BUG_ON(t->entries[i].start != le64_to_cpu(src->start)); diff --git a/libbcachefs/lru.h b/libbcachefs/lru.h index bd71ba77d..425ba732c 100644 --- a/libbcachefs/lru.h +++ b/libbcachefs/lru.h @@ -24,18 +24,6 @@ static inline struct bpos lru_pos(u16 lru_id, u64 dev_bucket, u64 time) return pos; } -#define BCH_LRU_TYPES() \ - x(read) \ - x(fragmentation) - -enum bch_lru_type { -#define x(n) BCH_LRU_##n, - BCH_LRU_TYPES() -#undef x -}; - -#define BCH_LRU_FRAGMENTATION_START ((1U << 16) - 1) - static inline enum bch_lru_type lru_type(struct bkey_s_c l) { u16 lru_id = l.k->p.inode >> 48; diff --git a/libbcachefs/lru_format.h b/libbcachefs/lru_format.h new file mode 100644 index 000000000..f372cb3b8 --- /dev/null +++ b/libbcachefs/lru_format.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_LRU_FORMAT_H +#define _BCACHEFS_LRU_FORMAT_H + +struct bch_lru { + struct bch_val v; + __le64 idx; +} __packed __aligned(8); + +#define BCH_LRU_TYPES() \ + x(read) \ + x(fragmentation) + +enum bch_lru_type { +#define x(n) BCH_LRU_##n, + BCH_LRU_TYPES() +#undef x +}; + +#define BCH_LRU_FRAGMENTATION_START ((1U << 16) - 1) + +#define LRU_TIME_BITS 48 +#define LRU_TIME_MAX ((1ULL << LRU_TIME_BITS) - 1) + +#endif /* _BCACHEFS_LRU_FORMAT_H */ diff --git a/libbcachefs/opts.h b/libbcachefs/opts.h index cff35845a..60b930185 100644 --- a/libbcachefs/opts.h +++ b/libbcachefs/opts.h @@ -63,6 +63,7 @@ enum opt_flags { OPT_MUST_BE_POW_2 = (1 << 7), /* Must be power of 2 */ OPT_SB_FIELD_SECTORS = (1 << 8),/* Superblock field is >> 9 of actual value */ OPT_SB_FIELD_ILOG2 = (1 << 9), /* Superblock field is ilog2 of actual value */ + OPT_HIDDEN = (1 << 10), }; enum opt_type { @@ -137,7 +138,7 @@ enum fsck_err_opts { x(errors, u8, \ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_STR(bch2_error_actions), \ - BCH_SB_ERROR_ACTION, BCH_ON_ERROR_ro, \ + BCH_SB_ERROR_ACTION, BCH_ON_ERROR_fix_safe, \ NULL, "Action to take on filesystem error") \ x(metadata_replicas, u8, \ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ @@ -406,7 +407,7 @@ enum fsck_err_opts { BCH2_NO_SB_OPT, BCH_SB_SECTOR, \ "offset", "Sector offset of superblock") \ x(read_only, u8, \ - OPT_FS, \ + OPT_FS|OPT_MOUNT|OPT_HIDDEN, \ OPT_BOOL(), \ BCH2_NO_SB_OPT, false, \ NULL, NULL) \ diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c index 097ef7d13..d89eb43c5 100644 --- a/libbcachefs/recovery.c +++ b/libbcachefs/recovery.c @@ -699,10 +699,10 @@ int bch2_fs_recovery(struct bch_fs *c) if (check_version_upgrade(c)) write_sb = true; + c->recovery_passes_explicit |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0])); + if 
(write_sb) bch2_write_super(c); - - c->recovery_passes_explicit |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0])); mutex_unlock(&c->sb_lock); if (c->opts.fsck && IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) diff --git a/libbcachefs/recovery_passes.c b/libbcachefs/recovery_passes.c index 4a59f52f8..73339a0a3 100644 --- a/libbcachefs/recovery_passes.c +++ b/libbcachefs/recovery_passes.c @@ -193,6 +193,8 @@ int bch2_run_online_recovery_passes(struct bch_fs *c) { int ret = 0; + down_read(&c->state_lock); + for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++) { struct recovery_pass_fn *p = recovery_pass_fns + i; @@ -208,6 +210,8 @@ int bch2_run_online_recovery_passes(struct bch_fs *c) break; } + up_read(&c->state_lock); + return ret; } diff --git a/libbcachefs/sb-downgrade.c b/libbcachefs/sb-downgrade.c index be81c8c64..dfbbd33c8 100644 --- a/libbcachefs/sb-downgrade.c +++ b/libbcachefs/sb-downgrade.c @@ -77,6 +77,7 @@ BCH_FSCK_ERR_fs_usage_cached_wrong, \ BCH_FSCK_ERR_fs_usage_reserved_wrong, \ BCH_FSCK_ERR_fs_usage_nr_inodes_wrong, \ + BCH_FSCK_ERR_fs_usage_persistent_reserved_wrong, \ BCH_FSCK_ERR_fs_usage_replicas_wrong, \ BCH_FSCK_ERR_bkey_version_in_future) diff --git a/libbcachefs/sb-errors.c b/libbcachefs/sb-errors.c index bda33e59e..c1270d790 100644 --- a/libbcachefs/sb-errors.c +++ b/libbcachefs/sb-errors.c @@ -110,19 +110,25 @@ void bch2_sb_error_count(struct bch_fs *c, enum bch_sb_error_id err) void bch2_sb_errors_from_cpu(struct bch_fs *c) { bch_sb_errors_cpu *src = &c->fsck_error_counts; - struct bch_sb_field_errors *dst = - bch2_sb_field_resize(&c->disk_sb, errors, - bch2_sb_field_errors_u64s(src->nr)); + struct bch_sb_field_errors *dst; unsigned i; + mutex_lock(&c->fsck_error_counts_lock); + + dst = bch2_sb_field_resize(&c->disk_sb, errors, + bch2_sb_field_errors_u64s(src->nr)); + if (!dst) - return; + goto err; for (i = 0; i < src->nr; i++) { SET_BCH_SB_ERROR_ENTRY_ID(&dst->entries[i], src->data[i].id); SET_BCH_SB_ERROR_ENTRY_NR(&dst->entries[i], src->data[i].nr); dst->entries[i].last_error_time = cpu_to_le64(src->data[i].last_error_time); } + +err: + mutex_unlock(&c->fsck_error_counts_lock); } static int bch2_sb_errors_to_cpu(struct bch_fs *c) diff --git a/libbcachefs/sb-errors_format.h b/libbcachefs/sb-errors_format.h index 9dd2b7ae4..67648b776 100644 --- a/libbcachefs/sb-errors_format.h +++ b/libbcachefs/sb-errors_format.h @@ -2,286 +2,294 @@ #ifndef _BCACHEFS_SB_ERRORS_FORMAT_H #define _BCACHEFS_SB_ERRORS_FORMAT_H -#define BCH_SB_ERRS() \ - x(clean_but_journal_not_empty, 0) \ - x(dirty_but_no_journal_entries, 1) \ - x(dirty_but_no_journal_entries_post_drop_nonflushes, 2) \ - x(sb_clean_journal_seq_mismatch, 3) \ - x(sb_clean_btree_root_mismatch, 4) \ - x(sb_clean_missing, 5) \ - x(jset_unsupported_version, 6) \ - x(jset_unknown_csum, 7) \ - x(jset_last_seq_newer_than_seq, 8) \ - x(jset_past_bucket_end, 9) \ - x(jset_seq_blacklisted, 10) \ - x(journal_entries_missing, 11) \ - x(journal_entry_replicas_not_marked, 12) \ - x(journal_entry_past_jset_end, 13) \ - x(journal_entry_replicas_data_mismatch, 14) \ - x(journal_entry_bkey_u64s_0, 15) \ - x(journal_entry_bkey_past_end, 16) \ - x(journal_entry_bkey_bad_format, 17) \ - x(journal_entry_bkey_invalid, 18) \ - x(journal_entry_btree_root_bad_size, 19) \ - x(journal_entry_blacklist_bad_size, 20) \ - x(journal_entry_blacklist_v2_bad_size, 21) \ - x(journal_entry_blacklist_v2_start_past_end, 22) \ - x(journal_entry_usage_bad_size, 23) \ - x(journal_entry_data_usage_bad_size, 24) \ - 
x(journal_entry_clock_bad_size, 25) \ - x(journal_entry_clock_bad_rw, 26) \ - x(journal_entry_dev_usage_bad_size, 27) \ - x(journal_entry_dev_usage_bad_dev, 28) \ - x(journal_entry_dev_usage_bad_pad, 29) \ - x(btree_node_unreadable, 30) \ - x(btree_node_fault_injected, 31) \ - x(btree_node_bad_magic, 32) \ - x(btree_node_bad_seq, 33) \ - x(btree_node_unsupported_version, 34) \ - x(btree_node_bset_older_than_sb_min, 35) \ - x(btree_node_bset_newer_than_sb, 36) \ - x(btree_node_data_missing, 37) \ - x(btree_node_bset_after_end, 38) \ - x(btree_node_replicas_sectors_written_mismatch, 39) \ - x(btree_node_replicas_data_mismatch, 40) \ - x(bset_unknown_csum, 41) \ - x(bset_bad_csum, 42) \ - x(bset_past_end_of_btree_node, 43) \ - x(bset_wrong_sector_offset, 44) \ - x(bset_empty, 45) \ - x(bset_bad_seq, 46) \ - x(bset_blacklisted_journal_seq, 47) \ - x(first_bset_blacklisted_journal_seq, 48) \ - x(btree_node_bad_btree, 49) \ - x(btree_node_bad_level, 50) \ - x(btree_node_bad_min_key, 51) \ - x(btree_node_bad_max_key, 52) \ - x(btree_node_bad_format, 53) \ - x(btree_node_bkey_past_bset_end, 54) \ - x(btree_node_bkey_bad_format, 55) \ - x(btree_node_bad_bkey, 56) \ - x(btree_node_bkey_out_of_order, 57) \ - x(btree_root_bkey_invalid, 58) \ - x(btree_root_read_error, 59) \ - x(btree_root_bad_min_key, 60) \ - x(btree_root_bad_max_key, 61) \ - x(btree_node_read_error, 62) \ - x(btree_node_topology_bad_min_key, 63) \ - x(btree_node_topology_bad_max_key, 64) \ - x(btree_node_topology_overwritten_by_prev_node, 65) \ - x(btree_node_topology_overwritten_by_next_node, 66) \ - x(btree_node_topology_interior_node_empty, 67) \ - x(fs_usage_hidden_wrong, 68) \ - x(fs_usage_btree_wrong, 69) \ - x(fs_usage_data_wrong, 70) \ - x(fs_usage_cached_wrong, 71) \ - x(fs_usage_reserved_wrong, 72) \ - x(fs_usage_persistent_reserved_wrong, 73) \ - x(fs_usage_nr_inodes_wrong, 74) \ - x(fs_usage_replicas_wrong, 75) \ - x(dev_usage_buckets_wrong, 76) \ - x(dev_usage_sectors_wrong, 77) \ - x(dev_usage_fragmented_wrong, 78) \ - x(dev_usage_buckets_ec_wrong, 79) \ - x(bkey_version_in_future, 80) \ - x(bkey_u64s_too_small, 81) \ - x(bkey_invalid_type_for_btree, 82) \ - x(bkey_extent_size_zero, 83) \ - x(bkey_extent_size_greater_than_offset, 84) \ - x(bkey_size_nonzero, 85) \ - x(bkey_snapshot_nonzero, 86) \ - x(bkey_snapshot_zero, 87) \ - x(bkey_at_pos_max, 88) \ - x(bkey_before_start_of_btree_node, 89) \ - x(bkey_after_end_of_btree_node, 90) \ - x(bkey_val_size_nonzero, 91) \ - x(bkey_val_size_too_small, 92) \ - x(alloc_v1_val_size_bad, 93) \ - x(alloc_v2_unpack_error, 94) \ - x(alloc_v3_unpack_error, 95) \ - x(alloc_v4_val_size_bad, 96) \ - x(alloc_v4_backpointers_start_bad, 97) \ - x(alloc_key_data_type_bad, 98) \ - x(alloc_key_empty_but_have_data, 99) \ - x(alloc_key_dirty_sectors_0, 100) \ - x(alloc_key_data_type_inconsistency, 101) \ - x(alloc_key_to_missing_dev_bucket, 102) \ - x(alloc_key_cached_inconsistency, 103) \ - x(alloc_key_cached_but_read_time_zero, 104) \ - x(alloc_key_to_missing_lru_entry, 105) \ - x(alloc_key_data_type_wrong, 106) \ - x(alloc_key_gen_wrong, 107) \ - x(alloc_key_dirty_sectors_wrong, 108) \ - x(alloc_key_cached_sectors_wrong, 109) \ - x(alloc_key_stripe_wrong, 110) \ - x(alloc_key_stripe_redundancy_wrong, 111) \ - x(bucket_sector_count_overflow, 112) \ - x(bucket_metadata_type_mismatch, 113) \ - x(need_discard_key_wrong, 114) \ - x(freespace_key_wrong, 115) \ - x(freespace_hole_missing, 116) \ - x(bucket_gens_val_size_bad, 117) \ - x(bucket_gens_key_wrong, 118) \ - x(bucket_gens_hole_wrong, 119) \ 
- x(bucket_gens_to_invalid_dev, 120) \ - x(bucket_gens_to_invalid_buckets, 121) \ - x(bucket_gens_nonzero_for_invalid_buckets, 122) \ - x(need_discard_freespace_key_to_invalid_dev_bucket, 123) \ - x(need_discard_freespace_key_bad, 124) \ - x(backpointer_bucket_offset_wrong, 125) \ - x(backpointer_to_missing_device, 126) \ - x(backpointer_to_missing_alloc, 127) \ - x(backpointer_to_missing_ptr, 128) \ - x(lru_entry_at_time_0, 129) \ - x(lru_entry_to_invalid_bucket, 130) \ - x(lru_entry_bad, 131) \ - x(btree_ptr_val_too_big, 132) \ - x(btree_ptr_v2_val_too_big, 133) \ - x(btree_ptr_has_non_ptr, 134) \ - x(extent_ptrs_invalid_entry, 135) \ - x(extent_ptrs_no_ptrs, 136) \ - x(extent_ptrs_too_many_ptrs, 137) \ - x(extent_ptrs_redundant_crc, 138) \ - x(extent_ptrs_redundant_stripe, 139) \ - x(extent_ptrs_unwritten, 140) \ - x(extent_ptrs_written_and_unwritten, 141) \ - x(ptr_to_invalid_device, 142) \ - x(ptr_to_duplicate_device, 143) \ - x(ptr_after_last_bucket, 144) \ - x(ptr_before_first_bucket, 145) \ - x(ptr_spans_multiple_buckets, 146) \ - x(ptr_to_missing_backpointer, 147) \ - x(ptr_to_missing_alloc_key, 148) \ - x(ptr_to_missing_replicas_entry, 149) \ - x(ptr_to_missing_stripe, 150) \ - x(ptr_to_incorrect_stripe, 151) \ - x(ptr_gen_newer_than_bucket_gen, 152) \ - x(ptr_too_stale, 153) \ - x(stale_dirty_ptr, 154) \ - x(ptr_bucket_data_type_mismatch, 155) \ - x(ptr_cached_and_erasure_coded, 156) \ - x(ptr_crc_uncompressed_size_too_small, 157) \ - x(ptr_crc_csum_type_unknown, 158) \ - x(ptr_crc_compression_type_unknown, 159) \ - x(ptr_crc_redundant, 160) \ - x(ptr_crc_uncompressed_size_too_big, 161) \ - x(ptr_crc_nonce_mismatch, 162) \ - x(ptr_stripe_redundant, 163) \ - x(reservation_key_nr_replicas_invalid, 164) \ - x(reflink_v_refcount_wrong, 165) \ - x(reflink_p_to_missing_reflink_v, 166) \ - x(stripe_pos_bad, 167) \ - x(stripe_val_size_bad, 168) \ - x(stripe_sector_count_wrong, 169) \ - x(snapshot_tree_pos_bad, 170) \ - x(snapshot_tree_to_missing_snapshot, 171) \ - x(snapshot_tree_to_missing_subvol, 172) \ - x(snapshot_tree_to_wrong_subvol, 173) \ - x(snapshot_tree_to_snapshot_subvol, 174) \ - x(snapshot_pos_bad, 175) \ - x(snapshot_parent_bad, 176) \ - x(snapshot_children_not_normalized, 177) \ - x(snapshot_child_duplicate, 178) \ - x(snapshot_child_bad, 179) \ - x(snapshot_skiplist_not_normalized, 180) \ - x(snapshot_skiplist_bad, 181) \ - x(snapshot_should_not_have_subvol, 182) \ - x(snapshot_to_bad_snapshot_tree, 183) \ - x(snapshot_bad_depth, 184) \ - x(snapshot_bad_skiplist, 185) \ - x(subvol_pos_bad, 186) \ - x(subvol_not_master_and_not_snapshot, 187) \ - x(subvol_to_missing_root, 188) \ - x(subvol_root_wrong_bi_subvol, 189) \ - x(bkey_in_missing_snapshot, 190) \ - x(inode_pos_inode_nonzero, 191) \ - x(inode_pos_blockdev_range, 192) \ - x(inode_unpack_error, 193) \ - x(inode_str_hash_invalid, 194) \ - x(inode_v3_fields_start_bad, 195) \ - x(inode_snapshot_mismatch, 196) \ - x(inode_unlinked_but_clean, 197) \ - x(inode_unlinked_but_nlink_nonzero, 198) \ - x(inode_checksum_type_invalid, 199) \ - x(inode_compression_type_invalid, 200) \ - x(inode_subvol_root_but_not_dir, 201) \ - x(inode_i_size_dirty_but_clean, 202) \ - x(inode_i_sectors_dirty_but_clean, 203) \ - x(inode_i_sectors_wrong, 204) \ - x(inode_dir_wrong_nlink, 205) \ - x(inode_dir_multiple_links, 206) \ - x(inode_multiple_links_but_nlink_0, 207) \ - x(inode_wrong_backpointer, 208) \ - x(inode_wrong_nlink, 209) \ - x(inode_unreachable, 210) \ - x(deleted_inode_but_clean, 211) \ - x(deleted_inode_missing, 212) \ - 
x(deleted_inode_is_dir, 213) \ - x(deleted_inode_not_unlinked, 214) \ - x(extent_overlapping, 215) \ - x(key_in_missing_inode, 216) \ - x(key_in_wrong_inode_type, 217) \ - x(extent_past_end_of_inode, 218) \ - x(dirent_empty_name, 219) \ - x(dirent_val_too_big, 220) \ - x(dirent_name_too_long, 221) \ - x(dirent_name_embedded_nul, 222) \ - x(dirent_name_dot_or_dotdot, 223) \ - x(dirent_name_has_slash, 224) \ - x(dirent_d_type_wrong, 225) \ - x(inode_bi_parent_wrong, 226) \ - x(dirent_in_missing_dir_inode, 227) \ - x(dirent_in_non_dir_inode, 228) \ - x(dirent_to_missing_inode, 229) \ - x(dirent_to_missing_subvol, 230) \ - x(dirent_to_itself, 231) \ - x(quota_type_invalid, 232) \ - x(xattr_val_size_too_small, 233) \ - x(xattr_val_size_too_big, 234) \ - x(xattr_invalid_type, 235) \ - x(xattr_name_invalid_chars, 236) \ - x(xattr_in_missing_inode, 237) \ - x(root_subvol_missing, 238) \ - x(root_dir_missing, 239) \ - x(root_inode_not_dir, 240) \ - x(dir_loop, 241) \ - x(hash_table_key_duplicate, 242) \ - x(hash_table_key_wrong_offset, 243) \ - x(unlinked_inode_not_on_deleted_list, 244) \ - x(reflink_p_front_pad_bad, 245) \ - x(journal_entry_dup_same_device, 246) \ - x(inode_bi_subvol_missing, 247) \ - x(inode_bi_subvol_wrong, 248) \ - x(inode_points_to_missing_dirent, 249) \ - x(inode_points_to_wrong_dirent, 250) \ - x(inode_bi_parent_nonzero, 251) \ - x(dirent_to_missing_parent_subvol, 252) \ - x(dirent_not_visible_in_parent_subvol, 253) \ - x(subvol_fs_path_parent_wrong, 254) \ - x(subvol_root_fs_path_parent_nonzero, 255) \ - x(subvol_children_not_set, 256) \ - x(subvol_children_bad, 257) \ - x(subvol_loop, 258) \ - x(subvol_unreachable, 259) \ - x(btree_node_bkey_bad_u64s, 260) \ - x(btree_node_topology_empty_interior_node, 261) \ - x(btree_ptr_v2_min_key_bad, 262) \ - x(btree_root_unreadable_and_scan_found_nothing, 263) \ - x(snapshot_node_missing, 264) \ - x(dup_backpointer_to_bad_csum_extent, 265) \ - x(btree_bitmap_not_marked, 266) \ - x(sb_clean_entry_overrun, 267) \ - x(btree_ptr_v2_written_0, 268) \ - x(subvol_snapshot_bad, 269) \ - x(subvol_inode_bad, 270) \ - x(alloc_key_stripe_sectors_wrong, 271) \ - x(accounting_mismatch, 272) \ - x(accounting_replicas_not_marked, 273) \ - x(invalid_btree_id, 274) \ - x(alloc_key_io_time_bad, 275) +enum bch_fsck_flags { + FSCK_CAN_FIX = 1 << 0, + FSCK_CAN_IGNORE = 1 << 1, + FSCK_NEED_FSCK = 1 << 2, + FSCK_NO_RATELIMIT = 1 << 3, + FSCK_AUTOFIX = 1 << 4, +}; + +#define BCH_SB_ERRS() \ + x(clean_but_journal_not_empty, 0, 0) \ + x(dirty_but_no_journal_entries, 1, 0) \ + x(dirty_but_no_journal_entries_post_drop_nonflushes, 2, 0) \ + x(sb_clean_journal_seq_mismatch, 3, 0) \ + x(sb_clean_btree_root_mismatch, 4, 0) \ + x(sb_clean_missing, 5, 0) \ + x(jset_unsupported_version, 6, 0) \ + x(jset_unknown_csum, 7, 0) \ + x(jset_last_seq_newer_than_seq, 8, 0) \ + x(jset_past_bucket_end, 9, 0) \ + x(jset_seq_blacklisted, 10, 0) \ + x(journal_entries_missing, 11, 0) \ + x(journal_entry_replicas_not_marked, 12, 0) \ + x(journal_entry_past_jset_end, 13, 0) \ + x(journal_entry_replicas_data_mismatch, 14, 0) \ + x(journal_entry_bkey_u64s_0, 15, 0) \ + x(journal_entry_bkey_past_end, 16, 0) \ + x(journal_entry_bkey_bad_format, 17, 0) \ + x(journal_entry_bkey_invalid, 18, 0) \ + x(journal_entry_btree_root_bad_size, 19, 0) \ + x(journal_entry_blacklist_bad_size, 20, 0) \ + x(journal_entry_blacklist_v2_bad_size, 21, 0) \ + x(journal_entry_blacklist_v2_start_past_end, 22, 0) \ + x(journal_entry_usage_bad_size, 23, 0) \ + x(journal_entry_data_usage_bad_size, 24, 0) \ + 
x(journal_entry_clock_bad_size, 25, 0) \ + x(journal_entry_clock_bad_rw, 26, 0) \ + x(journal_entry_dev_usage_bad_size, 27, 0) \ + x(journal_entry_dev_usage_bad_dev, 28, 0) \ + x(journal_entry_dev_usage_bad_pad, 29, 0) \ + x(btree_node_unreadable, 30, 0) \ + x(btree_node_fault_injected, 31, 0) \ + x(btree_node_bad_magic, 32, 0) \ + x(btree_node_bad_seq, 33, 0) \ + x(btree_node_unsupported_version, 34, 0) \ + x(btree_node_bset_older_than_sb_min, 35, 0) \ + x(btree_node_bset_newer_than_sb, 36, 0) \ + x(btree_node_data_missing, 37, 0) \ + x(btree_node_bset_after_end, 38, 0) \ + x(btree_node_replicas_sectors_written_mismatch, 39, 0) \ + x(btree_node_replicas_data_mismatch, 40, 0) \ + x(bset_unknown_csum, 41, 0) \ + x(bset_bad_csum, 42, 0) \ + x(bset_past_end_of_btree_node, 43, 0) \ + x(bset_wrong_sector_offset, 44, 0) \ + x(bset_empty, 45, 0) \ + x(bset_bad_seq, 46, 0) \ + x(bset_blacklisted_journal_seq, 47, 0) \ + x(first_bset_blacklisted_journal_seq, 48, 0) \ + x(btree_node_bad_btree, 49, 0) \ + x(btree_node_bad_level, 50, 0) \ + x(btree_node_bad_min_key, 51, 0) \ + x(btree_node_bad_max_key, 52, 0) \ + x(btree_node_bad_format, 53, 0) \ + x(btree_node_bkey_past_bset_end, 54, 0) \ + x(btree_node_bkey_bad_format, 55, 0) \ + x(btree_node_bad_bkey, 56, 0) \ + x(btree_node_bkey_out_of_order, 57, 0) \ + x(btree_root_bkey_invalid, 58, 0) \ + x(btree_root_read_error, 59, 0) \ + x(btree_root_bad_min_key, 60, 0) \ + x(btree_root_bad_max_key, 61, 0) \ + x(btree_node_read_error, 62, 0) \ + x(btree_node_topology_bad_min_key, 63, 0) \ + x(btree_node_topology_bad_max_key, 64, 0) \ + x(btree_node_topology_overwritten_by_prev_node, 65, 0) \ + x(btree_node_topology_overwritten_by_next_node, 66, 0) \ + x(btree_node_topology_interior_node_empty, 67, 0) \ + x(fs_usage_hidden_wrong, 68, FSCK_AUTOFIX) \ + x(fs_usage_btree_wrong, 69, FSCK_AUTOFIX) \ + x(fs_usage_data_wrong, 70, FSCK_AUTOFIX) \ + x(fs_usage_cached_wrong, 71, FSCK_AUTOFIX) \ + x(fs_usage_reserved_wrong, 72, FSCK_AUTOFIX) \ + x(fs_usage_persistent_reserved_wrong, 73, FSCK_AUTOFIX) \ + x(fs_usage_nr_inodes_wrong, 74, FSCK_AUTOFIX) \ + x(fs_usage_replicas_wrong, 75, FSCK_AUTOFIX) \ + x(dev_usage_buckets_wrong, 76, FSCK_AUTOFIX) \ + x(dev_usage_sectors_wrong, 77, FSCK_AUTOFIX) \ + x(dev_usage_fragmented_wrong, 78, FSCK_AUTOFIX) \ + x(dev_usage_buckets_ec_wrong, 79, FSCK_AUTOFIX) \ + x(bkey_version_in_future, 80, 0) \ + x(bkey_u64s_too_small, 81, 0) \ + x(bkey_invalid_type_for_btree, 82, 0) \ + x(bkey_extent_size_zero, 83, 0) \ + x(bkey_extent_size_greater_than_offset, 84, 0) \ + x(bkey_size_nonzero, 85, 0) \ + x(bkey_snapshot_nonzero, 86, 0) \ + x(bkey_snapshot_zero, 87, 0) \ + x(bkey_at_pos_max, 88, 0) \ + x(bkey_before_start_of_btree_node, 89, 0) \ + x(bkey_after_end_of_btree_node, 90, 0) \ + x(bkey_val_size_nonzero, 91, 0) \ + x(bkey_val_size_too_small, 92, 0) \ + x(alloc_v1_val_size_bad, 93, 0) \ + x(alloc_v2_unpack_error, 94, 0) \ + x(alloc_v3_unpack_error, 95, 0) \ + x(alloc_v4_val_size_bad, 96, 0) \ + x(alloc_v4_backpointers_start_bad, 97, 0) \ + x(alloc_key_data_type_bad, 98, 0) \ + x(alloc_key_empty_but_have_data, 99, 0) \ + x(alloc_key_dirty_sectors_0, 100, 0) \ + x(alloc_key_data_type_inconsistency, 101, 0) \ + x(alloc_key_to_missing_dev_bucket, 102, 0) \ + x(alloc_key_cached_inconsistency, 103, 0) \ + x(alloc_key_cached_but_read_time_zero, 104, 0) \ + x(alloc_key_to_missing_lru_entry, 105, 0) \ + x(alloc_key_data_type_wrong, 106, FSCK_AUTOFIX) \ + x(alloc_key_gen_wrong, 107, FSCK_AUTOFIX) \ + x(alloc_key_dirty_sectors_wrong, 108, FSCK_AUTOFIX) 
\ + x(alloc_key_cached_sectors_wrong, 109, FSCK_AUTOFIX) \ + x(alloc_key_stripe_wrong, 110, FSCK_AUTOFIX) \ + x(alloc_key_stripe_redundancy_wrong, 111, FSCK_AUTOFIX) \ + x(bucket_sector_count_overflow, 112, 0) \ + x(bucket_metadata_type_mismatch, 113, 0) \ + x(need_discard_key_wrong, 114, 0) \ + x(freespace_key_wrong, 115, 0) \ + x(freespace_hole_missing, 116, 0) \ + x(bucket_gens_val_size_bad, 117, 0) \ + x(bucket_gens_key_wrong, 118, 0) \ + x(bucket_gens_hole_wrong, 119, 0) \ + x(bucket_gens_to_invalid_dev, 120, 0) \ + x(bucket_gens_to_invalid_buckets, 121, 0) \ + x(bucket_gens_nonzero_for_invalid_buckets, 122, 0) \ + x(need_discard_freespace_key_to_invalid_dev_bucket, 123, 0) \ + x(need_discard_freespace_key_bad, 124, 0) \ + x(backpointer_bucket_offset_wrong, 125, 0) \ + x(backpointer_to_missing_device, 126, 0) \ + x(backpointer_to_missing_alloc, 127, 0) \ + x(backpointer_to_missing_ptr, 128, 0) \ + x(lru_entry_at_time_0, 129, 0) \ + x(lru_entry_to_invalid_bucket, 130, 0) \ + x(lru_entry_bad, 131, 0) \ + x(btree_ptr_val_too_big, 132, 0) \ + x(btree_ptr_v2_val_too_big, 133, 0) \ + x(btree_ptr_has_non_ptr, 134, 0) \ + x(extent_ptrs_invalid_entry, 135, 0) \ + x(extent_ptrs_no_ptrs, 136, 0) \ + x(extent_ptrs_too_many_ptrs, 137, 0) \ + x(extent_ptrs_redundant_crc, 138, 0) \ + x(extent_ptrs_redundant_stripe, 139, 0) \ + x(extent_ptrs_unwritten, 140, 0) \ + x(extent_ptrs_written_and_unwritten, 141, 0) \ + x(ptr_to_invalid_device, 142, 0) \ + x(ptr_to_duplicate_device, 143, 0) \ + x(ptr_after_last_bucket, 144, 0) \ + x(ptr_before_first_bucket, 145, 0) \ + x(ptr_spans_multiple_buckets, 146, 0) \ + x(ptr_to_missing_backpointer, 147, 0) \ + x(ptr_to_missing_alloc_key, 148, 0) \ + x(ptr_to_missing_replicas_entry, 149, 0) \ + x(ptr_to_missing_stripe, 150, 0) \ + x(ptr_to_incorrect_stripe, 151, 0) \ + x(ptr_gen_newer_than_bucket_gen, 152, 0) \ + x(ptr_too_stale, 153, 0) \ + x(stale_dirty_ptr, 154, 0) \ + x(ptr_bucket_data_type_mismatch, 155, 0) \ + x(ptr_cached_and_erasure_coded, 156, 0) \ + x(ptr_crc_uncompressed_size_too_small, 157, 0) \ + x(ptr_crc_csum_type_unknown, 158, 0) \ + x(ptr_crc_compression_type_unknown, 159, 0) \ + x(ptr_crc_redundant, 160, 0) \ + x(ptr_crc_uncompressed_size_too_big, 161, 0) \ + x(ptr_crc_nonce_mismatch, 162, 0) \ + x(ptr_stripe_redundant, 163, 0) \ + x(reservation_key_nr_replicas_invalid, 164, 0) \ + x(reflink_v_refcount_wrong, 165, 0) \ + x(reflink_p_to_missing_reflink_v, 166, 0) \ + x(stripe_pos_bad, 167, 0) \ + x(stripe_val_size_bad, 168, 0) \ + x(stripe_sector_count_wrong, 169, 0) \ + x(snapshot_tree_pos_bad, 170, 0) \ + x(snapshot_tree_to_missing_snapshot, 171, 0) \ + x(snapshot_tree_to_missing_subvol, 172, 0) \ + x(snapshot_tree_to_wrong_subvol, 173, 0) \ + x(snapshot_tree_to_snapshot_subvol, 174, 0) \ + x(snapshot_pos_bad, 175, 0) \ + x(snapshot_parent_bad, 176, 0) \ + x(snapshot_children_not_normalized, 177, 0) \ + x(snapshot_child_duplicate, 178, 0) \ + x(snapshot_child_bad, 179, 0) \ + x(snapshot_skiplist_not_normalized, 180, 0) \ + x(snapshot_skiplist_bad, 181, 0) \ + x(snapshot_should_not_have_subvol, 182, 0) \ + x(snapshot_to_bad_snapshot_tree, 183, 0) \ + x(snapshot_bad_depth, 184, 0) \ + x(snapshot_bad_skiplist, 185, 0) \ + x(subvol_pos_bad, 186, 0) \ + x(subvol_not_master_and_not_snapshot, 187, 0) \ + x(subvol_to_missing_root, 188, 0) \ + x(subvol_root_wrong_bi_subvol, 189, 0) \ + x(bkey_in_missing_snapshot, 190, 0) \ + x(inode_pos_inode_nonzero, 191, 0) \ + x(inode_pos_blockdev_range, 192, 0) \ + x(inode_unpack_error, 193, 0) \ + 
x(inode_str_hash_invalid, 194, 0) \ + x(inode_v3_fields_start_bad, 195, 0) \ + x(inode_snapshot_mismatch, 196, 0) \ + x(inode_unlinked_but_clean, 197, 0) \ + x(inode_unlinked_but_nlink_nonzero, 198, 0) \ + x(inode_checksum_type_invalid, 199, 0) \ + x(inode_compression_type_invalid, 200, 0) \ + x(inode_subvol_root_but_not_dir, 201, 0) \ + x(inode_i_size_dirty_but_clean, 202, 0) \ + x(inode_i_sectors_dirty_but_clean, 203, 0) \ + x(inode_i_sectors_wrong, 204, 0) \ + x(inode_dir_wrong_nlink, 205, 0) \ + x(inode_dir_multiple_links, 206, 0) \ + x(inode_multiple_links_but_nlink_0, 207, 0) \ + x(inode_wrong_backpointer, 208, 0) \ + x(inode_wrong_nlink, 209, 0) \ + x(inode_unreachable, 210, 0) \ + x(deleted_inode_but_clean, 211, 0) \ + x(deleted_inode_missing, 212, 0) \ + x(deleted_inode_is_dir, 213, 0) \ + x(deleted_inode_not_unlinked, 214, 0) \ + x(extent_overlapping, 215, 0) \ + x(key_in_missing_inode, 216, 0) \ + x(key_in_wrong_inode_type, 217, 0) \ + x(extent_past_end_of_inode, 218, 0) \ + x(dirent_empty_name, 219, 0) \ + x(dirent_val_too_big, 220, 0) \ + x(dirent_name_too_long, 221, 0) \ + x(dirent_name_embedded_nul, 222, 0) \ + x(dirent_name_dot_or_dotdot, 223, 0) \ + x(dirent_name_has_slash, 224, 0) \ + x(dirent_d_type_wrong, 225, 0) \ + x(inode_bi_parent_wrong, 226, 0) \ + x(dirent_in_missing_dir_inode, 227, 0) \ + x(dirent_in_non_dir_inode, 228, 0) \ + x(dirent_to_missing_inode, 229, 0) \ + x(dirent_to_missing_subvol, 230, 0) \ + x(dirent_to_itself, 231, 0) \ + x(quota_type_invalid, 232, 0) \ + x(xattr_val_size_too_small, 233, 0) \ + x(xattr_val_size_too_big, 234, 0) \ + x(xattr_invalid_type, 235, 0) \ + x(xattr_name_invalid_chars, 236, 0) \ + x(xattr_in_missing_inode, 237, 0) \ + x(root_subvol_missing, 238, 0) \ + x(root_dir_missing, 239, 0) \ + x(root_inode_not_dir, 240, 0) \ + x(dir_loop, 241, 0) \ + x(hash_table_key_duplicate, 242, 0) \ + x(hash_table_key_wrong_offset, 243, 0) \ + x(unlinked_inode_not_on_deleted_list, 244, 0) \ + x(reflink_p_front_pad_bad, 245, 0) \ + x(journal_entry_dup_same_device, 246, 0) \ + x(inode_bi_subvol_missing, 247, 0) \ + x(inode_bi_subvol_wrong, 248, 0) \ + x(inode_points_to_missing_dirent, 249, 0) \ + x(inode_points_to_wrong_dirent, 250, 0) \ + x(inode_bi_parent_nonzero, 251, 0) \ + x(dirent_to_missing_parent_subvol, 252, 0) \ + x(dirent_not_visible_in_parent_subvol, 253, 0) \ + x(subvol_fs_path_parent_wrong, 254, 0) \ + x(subvol_root_fs_path_parent_nonzero, 255, 0) \ + x(subvol_children_not_set, 256, 0) \ + x(subvol_children_bad, 257, 0) \ + x(subvol_loop, 258, 0) \ + x(subvol_unreachable, 259, 0) \ + x(btree_node_bkey_bad_u64s, 260, 0) \ + x(btree_node_topology_empty_interior_node, 261, 0) \ + x(btree_ptr_v2_min_key_bad, 262, 0) \ + x(btree_root_unreadable_and_scan_found_nothing, 263, 0) \ + x(snapshot_node_missing, 264, 0) \ + x(dup_backpointer_to_bad_csum_extent, 265, 0) \ + x(btree_bitmap_not_marked, 266, 0) \ + x(sb_clean_entry_overrun, 267, 0) \ + x(btree_ptr_v2_written_0, 268, 0) \ + x(subvol_snapshot_bad, 269, 0) \ + x(subvol_inode_bad, 270, 0) \ + x(alloc_key_stripe_sectors_wrong, 271, 0) \ + x(accounting_mismatch, 272, 0) \ + x(accounting_replicas_not_marked, 273, 0) \ + x(invalid_btree_id, 274, 0) \ + x(alloc_key_io_time_bad, 275, 0) enum bch_sb_error_id { -#define x(t, n) BCH_FSCK_ERR_##t = n, +#define x(t, n, ...) 
BCH_FSCK_ERR_##t = n, BCH_SB_ERRS() #undef x BCH_SB_ERR_MAX diff --git a/libbcachefs/seqmutex.h b/libbcachefs/seqmutex.h index c1860d816..c4b3d8d3f 100644 --- a/libbcachefs/seqmutex.h +++ b/libbcachefs/seqmutex.h @@ -19,17 +19,14 @@ static inline bool seqmutex_trylock(struct seqmutex *lock) static inline void seqmutex_lock(struct seqmutex *lock) { mutex_lock(&lock->lock); -} - -static inline void seqmutex_unlock(struct seqmutex *lock) -{ lock->seq++; - mutex_unlock(&lock->lock); } -static inline u32 seqmutex_seq(struct seqmutex *lock) +static inline u32 seqmutex_unlock(struct seqmutex *lock) { - return lock->seq; + u32 seq = lock->seq; + mutex_unlock(&lock->lock); + return seq; } static inline bool seqmutex_relock(struct seqmutex *lock, u32 seq) diff --git a/libbcachefs/snapshot.c b/libbcachefs/snapshot.c index fa7ad5865..96744b1a7 100644 --- a/libbcachefs/snapshot.c +++ b/libbcachefs/snapshot.c @@ -168,6 +168,9 @@ static noinline struct snapshot_t *__snapshot_t_mut(struct bch_fs *c, u32 id) size_t new_bytes = kmalloc_size_roundup(struct_size(new, s, idx + 1)); size_t new_size = (new_bytes - sizeof(*new)) / sizeof(new->s[0]); + if (unlikely(new_bytes > INT_MAX)) + return NULL; + new = kvzalloc(new_bytes, GFP_KERNEL); if (!new) return NULL; @@ -1682,6 +1685,8 @@ void bch2_delete_dead_snapshots_work(struct work_struct *work) { struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work); + set_worker_desc("bcachefs-delete-dead-snapshots/%s", c->name); + bch2_delete_dead_snapshots(c); bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots); } diff --git a/libbcachefs/super.c b/libbcachefs/super.c index bfdec48e3..ced63397d 100644 --- a/libbcachefs/super.c +++ b/libbcachefs/super.c @@ -535,7 +535,6 @@ static void __bch2_fs_free(struct bch_fs *c) bch2_find_btree_nodes_exit(&c->found_btree_nodes); bch2_free_pending_node_rewrites(c); - bch2_fs_allocator_background_exit(c); bch2_fs_accounting_exit(c); bch2_fs_sb_errors_exit(c); bch2_fs_counters_exit(c); @@ -564,8 +563,11 @@ static void __bch2_fs_free(struct bch_fs *c) BUG_ON(atomic_read(&c->journal_keys.ref)); bch2_fs_btree_write_buffer_exit(c); percpu_free_rwsem(&c->mark_lock); - EBUG_ON(c->online_reserved && percpu_u64_get(c->online_reserved)); - free_percpu(c->online_reserved); + if (c->online_reserved) { + u64 v = percpu_u64_get(c->online_reserved); + WARN(v, "online_reserved not 0 at shutdown: %lli", v); + free_percpu(c->online_reserved); + } darray_exit(&c->btree_roots_extra); free_percpu(c->pcpu); @@ -1193,6 +1195,7 @@ static void bch2_dev_free(struct bch_dev *ca) kfree(ca->buckets_nouse); bch2_free_super(&ca->disk_sb); + bch2_dev_allocator_background_exit(ca); bch2_dev_journal_exit(ca); free_percpu(ca->io_done); @@ -1315,6 +1318,8 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c, atomic_long_set(&ca->ref, 1); #endif + bch2_dev_allocator_background_init(ca); + if (percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_complete, PERCPU_REF_INIT_DEAD, GFP_KERNEL) || !(ca->sb_read_scratch = (void *) __get_free_page(GFP_KERNEL)) || @@ -1527,6 +1532,7 @@ static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca) * The allocator thread itself allocates btree nodes, so stop it first: */ bch2_dev_allocator_remove(c, ca); + bch2_recalc_capacity(c); bch2_dev_journal_stop(&c->journal, ca); } @@ -1538,6 +1544,7 @@ static void __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca) bch2_dev_allocator_add(c, ca); bch2_recalc_capacity(c); + bch2_dev_do_discards(ca); } int __bch2_dev_set_state(struct bch_fs *c, struct 
bch_dev *ca, diff --git a/libbcachefs/util.h b/libbcachefs/util.h index f4dd09c41..76ffe08e7 100644 --- a/libbcachefs/util.h +++ b/libbcachefs/util.h @@ -697,14 +697,19 @@ do { \ } \ } while (0) +#define per_cpu_sum(_p) \ +({ \ + typeof(*_p) _ret = 0; \ + \ + int cpu; \ + for_each_possible_cpu(cpu) \ + _ret += *per_cpu_ptr(_p, cpu); \ + _ret; \ +}) + static inline u64 percpu_u64_get(u64 __percpu *src) { - u64 ret = 0; - int cpu; - - for_each_possible_cpu(cpu) - ret += *per_cpu_ptr(src, cpu); - return ret; + return per_cpu_sum(src); } static inline void percpu_u64_set(u64 __percpu *dst, u64 src) diff --git a/linux/closure.c b/linux/closure.c index 07409e9e3..c971216d9 100644 --- a/linux/closure.c +++ b/linux/closure.c @@ -13,14 +13,25 @@ #include #include -static inline void closure_put_after_sub(struct closure *cl, int flags) +static inline void closure_put_after_sub_checks(int flags) { int r = flags & CLOSURE_REMAINING_MASK; - BUG_ON(flags & CLOSURE_GUARD_MASK); - BUG_ON(!r && (flags & ~CLOSURE_DESTRUCTOR)); + if (WARN(flags & CLOSURE_GUARD_MASK, + "closure has guard bits set: %x (%u)", + flags & CLOSURE_GUARD_MASK, (unsigned) __fls(r))) + r &= ~CLOSURE_GUARD_MASK; + + WARN(!r && (flags & ~CLOSURE_DESTRUCTOR), + "closure ref hit 0 with incorrect flags set: %x (%u)", + flags & ~CLOSURE_DESTRUCTOR, (unsigned) __fls(flags)); +} + +static inline void closure_put_after_sub(struct closure *cl, int flags) +{ + closure_put_after_sub_checks(flags); - if (!r) { + if (!(flags & CLOSURE_REMAINING_MASK)) { smp_acquire__after_ctrl_dep(); cl->closure_get_happened = false; @@ -139,6 +150,41 @@ void __sched __closure_sync(struct closure *cl) } EXPORT_SYMBOL(__closure_sync); +/* + * closure_return_sync - finish running a closure, synchronously (i.e. waiting + * for outstanding get()s to finish) and returning once closure refcount is 0. + * + * Unlike closure_sync() this doesn't reinit the ref to 1; subsequent + * closure_get_not_zero() calls will fail. + */ +void __sched closure_return_sync(struct closure *cl) +{ + struct closure_syncer s = { .task = current }; + + cl->s = &s; + set_closure_fn(cl, closure_sync_fn, NULL); + + unsigned flags = atomic_sub_return_release(1 + CLOSURE_RUNNING - CLOSURE_DESTRUCTOR, + &cl->remaining); + + closure_put_after_sub_checks(flags); + + if (unlikely(flags & CLOSURE_REMAINING_MASK)) { + while (1) { + set_current_state(TASK_UNINTERRUPTIBLE); + if (s.done) + break; + schedule(); + } + + __set_current_state(TASK_RUNNING); + } + + if (cl->parent) + closure_put(cl->parent); +} +EXPORT_SYMBOL(closure_return_sync); + int __sched __closure_sync_timeout(struct closure *cl, unsigned long timeout) { struct closure_syncer s = { .task = current };