From 0069b6fe25f4e37a0701949e1d9cf7697fa5d5cc Mon Sep 17 00:00:00 2001
From: Pedro Falcato
Date: Tue, 20 Aug 2024 01:22:02 +0100
Subject: [PATCH] mm/slab: Add TYPESAFE_BY_RCU

Add TYPESAFE_BY_RCU support. These caches guarantee type stability of
objects under the RCU read lock, without the need for kfree_rcu or
similar. They do so by only reclaiming slabs after an RCU quiescent
period has passed. This patch also adds ctor support.

While we're at it, use TYPESAFE_BY_RCU for anon_vma, which was the
original intended usage.

Signed-off-by: Pedro Falcato
---
 kernel/include/onyx/cpumask.h |  2 +-
 kernel/include/onyx/mm/slab.h | 22 ++++++---
 kernel/kernel/mm/rmap.c       | 56 +++++++++++++++-------
 kernel/kernel/mm/slab.c       | 87 +++++++++++++++++++++++++++--------
 4 files changed, 122 insertions(+), 45 deletions(-)

diff --git a/kernel/include/onyx/cpumask.h b/kernel/include/onyx/cpumask.h
index 4040d5772..9959fb874 100644
--- a/kernel/include/onyx/cpumask.h
+++ b/kernel/include/onyx/cpumask.h
@@ -201,7 +201,7 @@ struct cpumask
 static inline struct cpumask cpumask_all_but_one(unsigned long cpu)
 {
     struct cpumask c;
-    memset(&c, 0xff, sizeof(c));
+    memset((void *) &c, 0xff, sizeof(c));
     c.mask[cpu / LONG_SIZE_BITS] &= ~(1UL << (cpu % LONG_SIZE_BITS));
     return c;
 }
diff --git a/kernel/include/onyx/mm/slab.h b/kernel/include/onyx/mm/slab.h
index 3da0f36c3..be5d16be5 100644
--- a/kernel/include/onyx/mm/slab.h
+++ b/kernel/include/onyx/mm/slab.h
@@ -47,9 +47,10 @@ struct slab_cache
     size_t nfullslabs;
     struct list_head cache_list_node;
     unsigned int flags;
+    unsigned int bufctl_off;
     struct spinlock lock;
-    void (*ctor)(void *);
     int mag_limit;
+    void (*ctor)(void *);
     // TODO: This is horrible. We need a way to allocate percpu memory,
     // and then either trim it or grow it when CPUs come online.
     struct slab_cache_percpu_context pcpu[CONFIG_SMP_NR_CPUS] __align_cache;
@@ -57,13 +58,20 @@
 
 __BEGIN_CDECLS
 
-#define KMEM_CACHE_HWALIGN (1 << 0)
-#define KMEM_CACHE_VMALLOC (1 << 1)
-#define KMEM_CACHE_NOPCPU (1 << 2)
+#define KMEM_CACHE_HWALIGN         (1 << 0)
+#define KMEM_CACHE_VMALLOC         (1 << 1)
+#define KMEM_CACHE_NOPCPU          (1 << 2)
 /* Panic if kmem_cache_create fails */
-#define KMEM_CACHE_PANIC (1 << 3)
-
-#define SLAB_PANIC KMEM_CACHE_PANIC
+#define KMEM_CACHE_PANIC           (1 << 3)
+/* TYPESAFE_BY_RCU makes it so objects _won't switch types_ during an RCU read section. That is,
+ * the slab itself will not be freed or reused until the read section ends, so a reference that
+ * was valid during an RCU read section keeps pointing to an object of the same type and remains
+ * "valid", even after getting kfree'd. This flag is most useful together with a ctor that
+ * initializes the objects before kmem_cache_alloc hands them out. */
+#define KMEM_CACHE_TYPESAFE_BY_RCU (1 << 4)
+
+#define SLAB_PANIC           KMEM_CACHE_PANIC
+#define SLAB_TYPESAFE_BY_RCU KMEM_CACHE_TYPESAFE_BY_RCU
 
 /**
  * @brief Create a slab cache
diff --git a/kernel/kernel/mm/rmap.c b/kernel/kernel/mm/rmap.c
index ebc79fdd0..fbf148630 100644
--- a/kernel/kernel/mm/rmap.c
+++ b/kernel/kernel/mm/rmap.c
@@ -16,22 +16,23 @@
 
 static struct slab_cache *anon_vma_cache;
 
+static void anon_vma_ctor(void *obj)
+{
+    struct anon_vma *anon = obj;
+    spinlock_init(&anon->lock);
+    INIT_LIST_HEAD(&anon->vma_list);
+}
+
 void __init anon_vma_init(void)
 {
-    anon_vma_cache = kmem_cache_create("anon_vma", sizeof(struct anon_vma),
-                                       _Alignof(struct anon_vma), KMEM_CACHE_PANIC, NULL);
+    anon_vma_cache =
+        kmem_cache_create("anon_vma", sizeof(struct anon_vma), _Alignof(struct anon_vma),
+                          KMEM_CACHE_PANIC | SLAB_TYPESAFE_BY_RCU, anon_vma_ctor);
 }
 
 struct anon_vma *anon_vma_alloc(void)
 {
-    struct anon_vma *anon = kmem_cache_alloc(anon_vma_cache, GFP_KERNEL);
-    if (anon)
-    {
-        spinlock_init(&anon->lock);
-        INIT_LIST_HEAD(&anon->vma_list);
-    }
-
-    return anon;
+    return kmem_cache_alloc(anon_vma_cache, GFP_KERNEL);
 }
 
 void __anon_vma_unlink(struct anon_vma *anon, struct vm_area_struct *vma)
@@ -108,21 +109,42 @@ struct rmap_walk_info
     void *context;
 };
 
-static int rmap_walk_anon(struct rmap_walk_info *info, struct page *page)
+static struct anon_vma *anon_vma_lock(struct page *page)
 {
-    DCHECK_PAGE(page_flag_set(page, PAGE_FLAG_ANON), page);
-    struct anon_vma *anon_vma = (struct anon_vma *) page->owner;
+    /* We use RCU read lock and TYPESAFE_BY_RCU to get by here. The idea goes like this: We check if
+     * page_mapcount != 0 under the rcu_read_lock; if this is true, the anon_vma struct _must_ be
+     * valid. We then spin_lock the anon_vma (which only works because TYPESAFE_BY_RCU and the read
+     * lock enforce type stability here). We then recheck the mapcount under the lock. */
+    struct anon_vma *anon_vma;
+    rcu_read_lock();
+    anon_vma = (struct anon_vma *) READ_ONCE(page->owner);
+    if (!page_mapcount(page))
+        goto no_anon_vma;
 
-    /* anon_vma doesn't exist or can be stale if the page was unmapped. */
+    spin_lock(&anon_vma->lock);
     if (!page_mapcount(page))
+    {
+        spin_unlock(&anon_vma->lock);
+        goto no_anon_vma;
+    }
+
+    rcu_read_unlock();
+    return anon_vma;
+no_anon_vma:
+    rcu_read_unlock();
+    return NULL;
+}
+
+static int rmap_walk_anon(struct rmap_walk_info *info, struct page *page)
+{
+    DCHECK_PAGE(page_flag_set(page, PAGE_FLAG_ANON), page);
+    struct anon_vma *anon_vma = anon_vma_lock(page);
+    if (!anon_vma)
         return 0;
-    /* TODO: We might need TYPESAFE_BY_RCU for anon_vma? */
 
     unsigned long page_addr = page->pageoff;
     int st = 0;
 
-    spin_lock(&anon_vma->lock);
-
     list_for_every (&anon_vma->vma_list)
     {
         struct vm_area_struct *vma = container_of(l, struct vm_area_struct, anon_vma_node);
diff --git a/kernel/kernel/mm/slab.c b/kernel/kernel/mm/slab.c
index c5b6be2c2..4c980040d 100644
--- a/kernel/kernel/mm/slab.c
+++ b/kernel/kernel/mm/slab.c
@@ -14,6 +14,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -56,7 +57,10 @@ struct slab
 
     };
     size_t size;
-    struct list_head slab_list_node;
+    union {
+        struct list_head slab_list_node;
+        struct rcu_head typesafe_by_rcu;
+    };
     struct bufctl *object_list;
     size_t active_objects;
     size_t nobjects;
@@ -121,6 +125,7 @@ struct slab_cache *kmem_cache_create(const char *name, size_t size, size_t align
     c->redzone = 0;
 #endif
     c->flags = flags | KMEM_CACHE_VMALLOC;
+    c->bufctl_off = 0;
     // Minimum object alignment is 16
     c->alignment = alignment;
 
@@ -137,6 +142,15 @@ struct slab_cache *kmem_cache_create(const char *name, size_t size, size_t align
         c->alignment = ALIGN_TO(c->alignment, 64);
     }
 
+    if (flags & SLAB_TYPESAFE_BY_RCU || ctor)
+    {
+        /* We can't place the bufctl inside the object, because either ctor or TYPESAFE_BY_RCU were
+         * specified, and these are only useful if the allocator _does not_ touch the object. As
+         * such, we place the bufctls right outside the object. */
+        c->bufctl_off = c->objsize;
+        c->objsize += sizeof(struct bufctl);
+    }
+
     c->objsize = ALIGN_TO(c->objsize, c->alignment);
     c->redzone = ALIGN_TO(c->redzone / 2, c->alignment) * 2;
 
@@ -174,6 +188,16 @@ struct slab_cache *kmem_cache_create(const char *name, size_t size, size_t align
 
 #define ALWAYS_INLINE __attribute__((always_inline))
 
+ALWAYS_INLINE static inline void *kmem_bufctl_to_ptr(struct slab_cache *cache, struct bufctl *buf)
+{
+    return ((void *) buf) - cache->bufctl_off;
+}
+
+ALWAYS_INLINE static inline struct bufctl *kmem_bufctl_from_ptr(struct slab_cache *cache, void *ptr)
+{
+    return ptr + cache->bufctl_off;
+}
+
 #ifdef SLAB_DEBUG_COUNTS
 /* Kept here and not in list.h, because this is a horrible pattern that should not be used for
  * !DEBUG */
@@ -307,7 +331,7 @@ static void *kmem_cache_alloc_from_slab(struct slab *s, unsigned int flags)
 
     ret->flags = 0;
 
-    return (void *) ret;
+    return kmem_bufctl_to_ptr(s->cache, ret);
 }
 
 /**
@@ -505,8 +529,9 @@ NO_ASAN static struct slab *kmem_cache_create_slab(struct slab_cache *cache, uns
         asan_poison_shadow((unsigned long) ptr, redzone, KASAN_LEFT_REDZONE);
 #endif
         ptr += redzone;
-
-        struct bufctl *ctl = (struct bufctl *) ptr;
+        if (cache->ctor)
+            cache->ctor(ptr);
+        struct bufctl *ctl = (struct bufctl *) (ptr + cache->bufctl_off);
         ctl->next = NULL;
         ctl->flags = BUFCTL_PATTERN_FREE;
         if (last)
@@ -663,7 +688,7 @@ static void kmem_cache_reload_mag_with_slab(struct slab_cache *cache,
         if (buf->flags != BUFCTL_PATTERN_FREE)
             panic("Bad buf %p, slab %p", buf, slab);
 
-        pcpu->magazine[pcpu->size++] = (void *) buf;
+        pcpu->magazine[pcpu->size++] = kmem_bufctl_to_ptr(cache, buf);
         slab->object_list = (struct bufctl *) buf->next;
 
         if (!buf->next && j + 1 != avail)
@@ -898,7 +923,7 @@ void *kmem_cache_alloc(struct slab_cache *cache, unsigned int flags)
         // If we have objects on our magazine, pop one out and
         // return.
         void *ret = pcpu->magazine[--pcpu->size];
-        ((struct bufctl *) ret)->flags = 0;
+        kmem_bufctl_from_ptr(cache, ret)->flags = 0;
         pcpu->active_objs++;
 
         __atomic_store_n(&pcpu->touched, 0, __ATOMIC_RELEASE);
@@ -974,7 +999,7 @@ size_t kmem_cache_alloc_bulk(struct slab_cache *cache, unsigned int gfp_flags, s
         while (to_take--)
         {
             void *ptr = pcpu->magazine[--pcpu->size];
-            ((struct bufctl *) ptr)->flags = 0;
+            kmem_bufctl_from_ptr(cache, ptr)->flags = 0;
             res[i++] = ptr;
             pcpu->active_objs++;
         }
@@ -1003,7 +1028,7 @@ static void kmem_free_to_slab(struct slab_cache *cache, struct slab *slab, void
     if (unlikely((unsigned long) ptr % cache->alignment))
         panic("slab: Bad pointer %p", ptr);
 
-    struct bufctl *ctl = (struct bufctl *) ptr;
+    struct bufctl *ctl = kmem_bufctl_from_ptr(cache, ptr);
 
     if (ctl->flags == BUFCTL_PATTERN_FREE)
         panic("slab: Double free at %p", ptr);
@@ -1049,9 +1074,13 @@ static void kfree_nopcpu(void *ptr)
     struct slab *slab = kmem_pointer_to_slab(ptr);
     struct slab_cache *cache = slab->cache;
 
+    /* TYPESAFE_BY_RCU cannot participate in typical KASAN lifetime shenanigans. :/ */
+    if (!(cache->flags & SLAB_TYPESAFE_BY_RCU))
+    {
 #ifdef CONFIG_KASAN
-    asan_poison_shadow((unsigned long) ptr, cache->objsize, KASAN_FREED);
+        asan_poison_shadow((unsigned long) ptr, cache->objsize, KASAN_FREED);
 #endif
+    }
 
     spin_lock(&cache->lock);
     kmem_free_to_slab(cache, slab, ptr);
@@ -1071,7 +1100,7 @@ void kmem_cache_return_pcpu_batch(struct slab_cache *cache, struct slab_cache_pe
         if (unlikely(slab->cache != cache))
             panic("slab: Pointer %p was returned to the wrong cache\n", ptr);
 
-        ((struct bufctl *) ptr)->flags = 0;
+        kmem_bufctl_from_ptr(cache, ptr)->flags = 0;
         kmem_free_to_slab(cache, slab, ptr);
         pcpu->size--;
     }
@@ -1086,7 +1115,7 @@ __always_inline void kmem_cache_free_pcpu_single(struct slab_cache *cache,
                                                  struct slab_cache_percpu_context *pcpu, void *ptr)
 {
     DCHECK(pcpu->size < cache->mag_limit);
-    struct bufctl *buf = (struct bufctl *) ptr;
+    struct bufctl *buf = kmem_bufctl_from_ptr(cache, ptr);
 
     if (unlikely((unsigned long) ptr % cache->alignment))
         panic("slab: Bad pointer %p", ptr);
@@ -1118,7 +1147,7 @@ void kasan_kfree(void *ptr, struct slab_cache *cache, size_t chunk_size)
     if (unlikely((unsigned long) ptr % cache->alignment))
         panic("slab: Bad pointer %p", ptr);
 
-    struct bufctl *buf = (struct bufctl *) ptr;
+    struct bufctl *buf = kmem_bufctl_from_ptr(cache, ptr);
 
     if (unlikely(buf->flags == BUFCTL_PATTERN_FREE))
     {
@@ -1126,7 +1155,9 @@ void kasan_kfree(void *ptr, struct slab_cache *cache, size_t chunk_size)
     }
 
     buf->flags = BUFCTL_PATTERN_FREE;
-    asan_poison_shadow((unsigned long) ptr, chunk_size, KASAN_FREED);
+    if (!(cache->flags & SLAB_TYPESAFE_BY_RCU))
+        asan_poison_shadow((unsigned long) ptr, chunk_size, KASAN_FREED);
+
     kasan_register_free(ptr, cache);
 #ifndef NOQUARANTINE
     kasan_quarantine_add_chunk(buf, chunk_size);
@@ -1260,14 +1291,11 @@ void kmem_cache_free_bulk(struct slab_cache *cache, size_t size, void **ptrs)
  *
  * @param slab Slab to free
  */
-static void kmem_cache_free_slab(struct slab *slab)
+static void __kmem_cache_free_slab(struct slab *slab)
 {
     assert(slab->active_objects == 0);
     struct slab_cache *cache = slab->cache;
-    // Free it from the free list
-    list_remove(&slab->slab_list_node);
-
     kmem_slab_unaccount_pages(slab, cache->flags);
     // After freeing the slab we may no longer touch the struct slab
     if (likely(!(cache->flags & KMEM_CACHE_VMALLOC)))
         free_pages(slab->pages);
@@ -1275,6 +1303,25 @@ static void kmem_cache_free_slab(struct slab *slab)
         vfree(slab->start, slab->size >> PAGE_SHIFT);
 }
 
+static void kmem_cache_typesafe_free(struct rcu_head *head)
+{
+    __kmem_cache_free_slab(container_of(head, struct slab, typesafe_by_rcu));
+}
+
+static void kmem_cache_free_slab(struct slab *slab)
+{
+    assert(slab->active_objects == 0);
+    struct slab_cache *cache = slab->cache;
+
+    // Free it from the free list
+    list_remove(&slab->slab_list_node);
+    kmem_slab_unaccount_pages(slab, cache->flags);
+    if (cache->flags & SLAB_TYPESAFE_BY_RCU)
+        call_rcu(&slab->typesafe_by_rcu, kmem_cache_typesafe_free);
+    else
+        __kmem_cache_free_slab(slab);
+}
+
 struct slab_rendezvous
 {
     unsigned int waiting_for_cpus;
@@ -1371,7 +1418,7 @@ static void kmem_cache_shrink_pcpu_all(struct slab_cache *cache)
         if (unlikely(slab->cache != cache))
            panic("slab: Pointer %p was returned to the wrong cache\n", ptr);
 
-        ((struct bufctl *) ptr)->flags = 0;
+        kmem_bufctl_from_ptr(cache, ptr)->flags = 0;
         kmem_free_to_slab(cache, slab, ptr);
     }
 
@@ -1621,7 +1668,7 @@ void kmem_free_kasan(void *ptr)
 {
     struct slab *slab = kmem_pointer_to_slab(ptr);
     assert(slab != NULL);
-    ((struct bufctl *) ptr)->flags = 0;
+    kmem_bufctl_from_ptr(slab->cache, ptr)->flags = 0;
     spin_lock(&slab->cache->lock);
     kmem_free_to_slab(slab->cache, slab, ptr);
     spin_unlock(&slab->cache->lock);
@@ -1633,7 +1680,7 @@ static void stack_trace_print(unsigned long *entries, unsigned long nr)
     for (unsigned long i = 0; i < nr; i++)
     {
         char sym[SYM_SYMBOLIZE_BUFSIZ];
-        int st = sym_symbolize((void *) entries[i], cul::slice{sym, sizeof(sym)});
+        int st = sym_symbolize((void *) entries[i], sym, sizeof(sym), 0);
        if (st < 0)
             break;
         pr_crit("\t%s\n", sym);
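
Usage note (editor's illustration, not part of the patch): the sketch below shows how a
hypothetical "foo" cache might use SLAB_TYPESAFE_BY_RCU. The foo type, foo_find_raw() and the key
field are assumptions made up for the example; only kmem_cache_create(), the ctor hook, the
spinlock helpers and the RCU read-side primitives used elsewhere in the patch are taken from it.
The point illustrated: under rcu_read_lock(), a pointer obtained from a lockless lookup always
refers to *a* foo (so the ctor-initialized lock is safe to take), but the object may have been
freed and reused in the meantime, so it must be revalidated after locking - the same dance
anon_vma_lock() does with page_mapcount().

static struct slab_cache *foo_cache;

struct foo
{
    struct spinlock lock; /* initialized once by foo_ctor, stable across object reuse */
    unsigned long key;    /* assumed to be cleared under the lock before the object is freed */
};

/* Assumed lockless lookup (e.g. an RCU-walked hash table); not defined here. */
static struct foo *foo_find_raw(unsigned long key);

static void foo_ctor(void *obj)
{
    struct foo *foo = obj;
    spinlock_init(&foo->lock);
}

static void foo_cache_init(void)
{
    foo_cache = kmem_cache_create("foo", sizeof(struct foo), _Alignof(struct foo),
                                  KMEM_CACHE_PANIC | SLAB_TYPESAFE_BY_RCU, foo_ctor);
}

/* Returns a locked foo matching key, or NULL. */
static struct foo *foo_lookup(unsigned long key)
{
    struct foo *foo;

    rcu_read_lock();
    foo = foo_find_raw(key);
    if (foo)
    {
        /* Safe even if foo was just kfree'd: TYPESAFE_BY_RCU keeps the slab (and therefore the
         * type and the ctor-set lock) alive until we leave the read section. */
        spin_lock(&foo->lock);
        if (foo->key != key)
        {
            /* Raced with free/reuse - this is not the object we looked up. */
            spin_unlock(&foo->lock);
            foo = NULL;
        }
    }
    rcu_read_unlock();
    return foo;
}

The design mirrors the anon_vma conversion above: the ctor establishes the invariants (here, a
usable spinlock) exactly once per object slot, and every lookup revalidates identity under that
lock rather than trusting the raw pointer.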