diff --git a/kernel/include/onyx/cpumask.h b/kernel/include/onyx/cpumask.h
index 4040d5772..9959fb874 100644
--- a/kernel/include/onyx/cpumask.h
+++ b/kernel/include/onyx/cpumask.h
@@ -201,7 +201,7 @@ struct cpumask
 static inline struct cpumask cpumask_all_but_one(unsigned long cpu)
 {
     struct cpumask c;
-    memset(&c, 0xff, sizeof(c));
+    memset((void*) &c, 0xff, sizeof(c));
     c.mask[cpu / LONG_SIZE_BITS] &= ~(1UL << (cpu % LONG_SIZE_BITS));
     return c;
 }
diff --git a/kernel/include/onyx/mm/slab.h b/kernel/include/onyx/mm/slab.h
index 3da0f36c3..be5d16be5 100644
--- a/kernel/include/onyx/mm/slab.h
+++ b/kernel/include/onyx/mm/slab.h
@@ -47,9 +47,10 @@ struct slab_cache
     size_t nfullslabs;
     struct list_head cache_list_node;
     unsigned int flags;
+    unsigned int bufctl_off;
     struct spinlock lock;
-    void (*ctor)(void *);
     int mag_limit;
+    void (*ctor)(void *);
     // TODO: This is horrible. We need a way to allocate percpu memory,
     // and then either trim it or grow it when CPUs come online.
     struct slab_cache_percpu_context pcpu[CONFIG_SMP_NR_CPUS] __align_cache;
@@ -57,13 +58,20 @@ struct slab_cache
 
 __BEGIN_CDECLS
 
-#define KMEM_CACHE_HWALIGN (1 << 0)
-#define KMEM_CACHE_VMALLOC (1 << 1)
-#define KMEM_CACHE_NOPCPU  (1 << 2)
+#define KMEM_CACHE_HWALIGN         (1 << 0)
+#define KMEM_CACHE_VMALLOC         (1 << 1)
+#define KMEM_CACHE_NOPCPU          (1 << 2)
 /* Panic if kmem_cache_create fails */
-#define KMEM_CACHE_PANIC (1 << 3)
-
-#define SLAB_PANIC KMEM_CACHE_PANIC
+#define KMEM_CACHE_PANIC           (1 << 3)
+/* TYPESAFE_BY_RCU guarantees that objects _won't switch types_ during an RCU read section: the
+ * slab itself will not be freed or reused until the read section ends. A reference that was
+ * valid during an RCU read section will keep pointing to an object of the same type and remain
+ * "valid", even after getting kfree'd. This flag is most useful together with a ctor that
+ * initializes the objects before kmem_cache_alloc hands them out. */
+#define KMEM_CACHE_TYPESAFE_BY_RCU (1 << 4)
+
+#define SLAB_PANIC           KMEM_CACHE_PANIC
+#define SLAB_TYPESAFE_BY_RCU KMEM_CACHE_TYPESAFE_BY_RCU
 
 /**
  * @brief Create a slab cache
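The contract the new flag imposes is easiest to see from the consumer side. Below is a minimal
illustrative sketch (not part of the patch) of how a SLAB_TYPESAFE_BY_RCU cache is meant to be
used: locks are set up in the ctor, and any pointer obtained under rcu_read_lock must be treated
as possibly recycled until it has been re-validated under that lock. struct foo, foo_cache,
foo_lookup_locked and any header beyond slab.h are hypothetical; the rmap.c hunk below is the real
in-tree user of this pattern.

/* Illustrative only - not part of this patch. Headers other than slab.h are assumed. */
#include <onyx/mm/slab.h>

struct foo
{
    struct spinlock lock;
    unsigned long id; /* lets readers re-validate that the object is still "theirs" */
};

static struct slab_cache *foo_cache;

/* Runs when a slab is created, not on every allocation, so whatever it sets up (here, the lock)
 * must stay valid across kfree/re-alloc cycles. */
static void foo_ctor(void *obj)
{
    struct foo *f = obj;
    spinlock_init(&f->lock);
}

static void foo_cache_init(void)
{
    foo_cache = kmem_cache_create("foo", sizeof(struct foo), _Alignof(struct foo),
                                  KMEM_CACHE_PANIC | SLAB_TYPESAFE_BY_RCU, foo_ctor);
}

/* A reader may observe an object that has been freed and re-allocated, but never one whose type
 * changed. So: lock the (type-stable) object, then re-check its identity under the lock. */
static struct foo *foo_lookup_locked(struct foo **slot, unsigned long id)
{
    struct foo *f;

    rcu_read_lock();
    f = READ_ONCE(*slot);
    if (!f)
        goto out;

    spin_lock(&f->lock);
    if (f->id != id)
    {
        /* The object was recycled under us - act as if we never saw it */
        spin_unlock(&f->lock);
        f = NULL;
    }
out:
    rcu_read_unlock();
    return f; /* returned with f->lock held, or NULL */
}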
diff --git a/kernel/kernel/mm/rmap.c b/kernel/kernel/mm/rmap.c
index ebc79fdd0..fbf148630 100644
--- a/kernel/kernel/mm/rmap.c
+++ b/kernel/kernel/mm/rmap.c
@@ -16,22 +16,23 @@
 
 static struct slab_cache *anon_vma_cache;
 
+static void anon_vma_ctor(void *ctor)
+{
+    struct anon_vma *vma = ctor;
+    spinlock_init(&vma->lock);
+    INIT_LIST_HEAD(&vma->vma_list);
+}
+
 void __init anon_vma_init(void)
 {
-    anon_vma_cache = kmem_cache_create("anon_vma", sizeof(struct anon_vma),
-                                       _Alignof(struct anon_vma), KMEM_CACHE_PANIC, NULL);
+    anon_vma_cache =
+        kmem_cache_create("anon_vma", sizeof(struct anon_vma), _Alignof(struct anon_vma),
+                          KMEM_CACHE_PANIC | SLAB_TYPESAFE_BY_RCU, anon_vma_ctor);
 }
 
 struct anon_vma *anon_vma_alloc(void)
 {
-    struct anon_vma *anon = kmem_cache_alloc(anon_vma_cache, GFP_KERNEL);
-    if (anon)
-    {
-        spinlock_init(&anon->lock);
-        INIT_LIST_HEAD(&anon->vma_list);
-    }
-
-    return anon;
+    return kmem_cache_alloc(anon_vma_cache, GFP_KERNEL);
 }
 
 void __anon_vma_unlink(struct anon_vma *anon, struct vm_area_struct *vma)
@@ -108,21 +109,42 @@ struct rmap_walk_info
     void *context;
 };
 
-static int rmap_walk_anon(struct rmap_walk_info *info, struct page *page)
+static struct anon_vma *anon_vma_lock(struct page *page)
 {
-    DCHECK_PAGE(page_flag_set(page, PAGE_FLAG_ANON), page);
-    struct anon_vma *anon_vma = (struct anon_vma *) page->owner;
+    /* We use the RCU read lock and TYPESAFE_BY_RCU to get by here. The idea goes like this: we
+     * check if page_mapcount != 0 under the rcu_read_lock; if that holds, the anon_vma struct
+     * _must_ be valid. We then spin_lock the anon_vma (which only works because TYPESAFE_BY_RCU
+     * and the read lock enforce type stability here) and recheck the mapcount under the lock. */
+    struct anon_vma *anon_vma;
+    rcu_read_lock();
+    anon_vma = (struct anon_vma *) READ_ONCE(page->owner);
+    if (!page_mapcount(page))
+        goto no_anon_vma;
 
-    /* anon_vma doesn't exist or can be stale if the page was unmapped. */
+    spin_lock(&anon_vma->lock);
     if (!page_mapcount(page))
+    {
+        spin_unlock(&anon_vma->lock);
+        goto no_anon_vma;
+    }
+
+    rcu_read_unlock();
+    return anon_vma;
+no_anon_vma:
+    rcu_read_unlock();
+    return NULL;
+}
+
+static int rmap_walk_anon(struct rmap_walk_info *info, struct page *page)
+{
+    DCHECK_PAGE(page_flag_set(page, PAGE_FLAG_ANON), page);
+    struct anon_vma *anon_vma = anon_vma_lock(page);
+    if (!anon_vma)
         return 0;
 
-    /* TODO: We might need TYPESAFE_BY_RCU for anon_vma? */
     unsigned long page_addr = page->pageoff;
     int st = 0;
 
-    spin_lock(&anon_vma->lock);
-
     list_for_every (&anon_vma->vma_list)
     {
         struct vm_area_struct *vma = container_of(l, struct vm_area_struct, anon_vma_node);
diff --git a/kernel/kernel/mm/slab.c b/kernel/kernel/mm/slab.c
index c5b6be2c2..4c980040d 100644
--- a/kernel/kernel/mm/slab.c
+++ b/kernel/kernel/mm/slab.c
@@ -14,6 +14,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -56,7 +57,10 @@ struct slab
     };
 
     size_t size;
-    struct list_head slab_list_node;
+    union {
+        struct list_head slab_list_node;
+        struct rcu_head typesafe_by_rcu;
+    };
     struct bufctl *object_list;
     size_t active_objects;
     size_t nobjects;
@@ -121,6 +125,7 @@ struct slab_cache *kmem_cache_create(const char *name, size_t size, size_t align
     c->redzone = 0;
 #endif
     c->flags = flags | KMEM_CACHE_VMALLOC;
+    c->bufctl_off = 0;
 
     // Minimum object alignment is 16
     c->alignment = alignment;
@@ -137,6 +142,15 @@ struct slab_cache *kmem_cache_create(const char *name, size_t size, size_t align
         c->alignment = ALIGN_TO(c->alignment, 64);
     }
 
+    if (flags & SLAB_TYPESAFE_BY_RCU || ctor)
+    {
+        /* We can't place the bufctl inside the object, because either a ctor or TYPESAFE_BY_RCU
+         * was specified, and those are only useful if the allocator _does not_ touch the object.
+         * As such, we place the bufctls right outside the object. */
+        c->bufctl_off = c->objsize;
+        c->objsize += sizeof(struct bufctl);
+    }
+
     c->objsize = ALIGN_TO(c->objsize, c->alignment);
     c->redzone = ALIGN_TO(c->redzone / 2, c->alignment) * 2;
 
@@ -174,6 +188,16 @@ struct slab_cache *kmem_cache_create(const char *name, size_t size, size_t align
 
 #define ALWAYS_INLINE __attribute__((always_inline))
 
+ALWAYS_INLINE static inline void *kmem_bufctl_to_ptr(struct slab_cache *cache, struct bufctl *buf)
+{
+    return ((void *) buf) - cache->bufctl_off;
+}
+
+ALWAYS_INLINE static inline struct bufctl *kmem_bufctl_from_ptr(struct slab_cache *cache, void *ptr)
+{
+    return ptr + cache->bufctl_off;
+}
+
 #ifdef SLAB_DEBUG_COUNTS
 /* Kept here and not in list.h, because this is a horrible pattern that should not be used for
  * !DEBUG */
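With a ctor or TYPESAFE_BY_RCU, the freelist metadata can no longer overlay the object itself, so
it is appended after the object and reached through bufctl_off. A rough sketch of the resulting
layout and of the helper arithmetic follows; the 40-byte object size and 16-byte alignment are
made-up numbers for illustration.

/* Hypothetical cache: 40-byte objects, 16-byte alignment, created with a ctor.
 *
 *   bufctl_off = 40
 *   objsize    = ALIGN_TO(40 + sizeof(struct bufctl), 16)
 *
 *   |<---------------- objsize ---------------->|
 *   +------------------+---------------+--------+------------------+---------------+---- ...
 *   |  object (40 B)   | struct bufctl | pad    |  object (40 B)   | struct bufctl | ...
 *   +------------------+---------------+--------+------------------+---------------+---- ...
 *   ^ ptr              ^ ptr + bufctl_off
 *
 *   kmem_bufctl_from_ptr(cache, ptr) == ptr + cache->bufctl_off
 *   kmem_bufctl_to_ptr(cache, buf)   == (void *) buf - cache->bufctl_off
 *
 * The allocator now only writes freelist state into the bufctl area, so the object bytes (and
 * anything the ctor initialized in them) are never clobbered while the object sits on a free
 * list - which is exactly what TYPESAFE_BY_RCU relies on. */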
@@ -307,7 +331,7 @@ static void *kmem_cache_alloc_from_slab(struct slab *s, unsigned int flags)
 
     ret->flags = 0;
 
-    return (void *) ret;
+    return kmem_bufctl_to_ptr(s->cache, ret);
 }
 
 /**
@@ -505,8 +529,9 @@ NO_ASAN static struct slab *kmem_cache_create_slab(struct slab_cache *cache, uns
         asan_poison_shadow((unsigned long) ptr, redzone, KASAN_LEFT_REDZONE);
 #endif
         ptr += redzone;
-
-        struct bufctl *ctl = (struct bufctl *) ptr;
+        if (cache->ctor)
+            cache->ctor(ptr);
+        struct bufctl *ctl = (struct bufctl *) (ptr + cache->bufctl_off);
         ctl->next = NULL;
         ctl->flags = BUFCTL_PATTERN_FREE;
         if (last)
@@ -663,7 +688,7 @@ static void kmem_cache_reload_mag_with_slab(struct slab_cache *cache,
         if (buf->flags != BUFCTL_PATTERN_FREE)
             panic("Bad buf %p, slab %p", buf, slab);
 
-        pcpu->magazine[pcpu->size++] = (void *) buf;
+        pcpu->magazine[pcpu->size++] = kmem_bufctl_to_ptr(cache, buf);
         slab->object_list = (struct bufctl *) buf->next;
 
         if (!buf->next && j + 1 != avail)
@@ -898,7 +923,7 @@ void *kmem_cache_alloc(struct slab_cache *cache, unsigned int flags)
         // If we have objects on our magazine, pop one out and
         // return.
         void *ret = pcpu->magazine[--pcpu->size];
-        ((struct bufctl *) ret)->flags = 0;
+        kmem_bufctl_from_ptr(cache, ret)->flags = 0;
         pcpu->active_objs++;
 
         __atomic_store_n(&pcpu->touched, 0, __ATOMIC_RELEASE);
@@ -974,7 +999,7 @@ size_t kmem_cache_alloc_bulk(struct slab_cache *cache, unsigned int gfp_flags, s
         while (to_take--)
         {
             void *ptr = pcpu->magazine[--pcpu->size];
-            ((struct bufctl *) ptr)->flags = 0;
+            kmem_bufctl_from_ptr(cache, ptr)->flags = 0;
             res[i++] = ptr;
             pcpu->active_objs++;
         }
@@ -1003,7 +1028,7 @@ static void kmem_free_to_slab(struct slab_cache *cache, struct slab *slab, void
     if (unlikely((unsigned long) ptr % cache->alignment))
         panic("slab: Bad pointer %p", ptr);
 
-    struct bufctl *ctl = (struct bufctl *) ptr;
+    struct bufctl *ctl = kmem_bufctl_from_ptr(cache, ptr);
     if (ctl->flags == BUFCTL_PATTERN_FREE)
         panic("slab: Double free at %p", ptr);
 
@@ -1049,9 +1074,13 @@ static void kfree_nopcpu(void *ptr)
     struct slab *slab = kmem_pointer_to_slab(ptr);
     struct slab_cache *cache = slab->cache;
 
+    /* TYPESAFE_BY_RCU cannot participate in typical KASAN lifetime shenanigans. :/ */
+    if (!(cache->flags & SLAB_TYPESAFE_BY_RCU))
+    {
 #ifdef CONFIG_KASAN
-    asan_poison_shadow((unsigned long) ptr, cache->objsize, KASAN_FREED);
+        asan_poison_shadow((unsigned long) ptr, cache->objsize, KASAN_FREED);
 #endif
+    }
 
     spin_lock(&cache->lock);
     kmem_free_to_slab(cache, slab, ptr);
@@ -1071,7 +1100,7 @@ void kmem_cache_return_pcpu_batch(struct slab_cache *cache, struct slab_cache_pe
         if (unlikely(slab->cache != cache))
             panic("slab: Pointer %p was returned to the wrong cache\n", ptr);
 
-        ((struct bufctl *) ptr)->flags = 0;
+        kmem_bufctl_from_ptr(cache, ptr)->flags = 0;
         kmem_free_to_slab(cache, slab, ptr);
         pcpu->size--;
     }
@@ -1086,7 +1115,7 @@ __always_inline void kmem_cache_free_pcpu_single(struct slab_cache *cache,
                                                  struct slab_cache_percpu_context *pcpu, void *ptr)
 {
     DCHECK(pcpu->size < cache->mag_limit);
-    struct bufctl *buf = (struct bufctl *) ptr;
+    struct bufctl *buf = kmem_bufctl_from_ptr(cache, ptr);
 
     if (unlikely((unsigned long) ptr % cache->alignment))
         panic("slab: Bad pointer %p", ptr);
@@ -1118,7 +1147,7 @@ void kasan_kfree(void *ptr, struct slab_cache *cache, size_t chunk_size)
     if (unlikely((unsigned long) ptr % cache->alignment))
         panic("slab: Bad pointer %p", ptr);
 
-    struct bufctl *buf = (struct bufctl *) ptr;
+    struct bufctl *buf = kmem_bufctl_from_ptr(cache, ptr);
 
     if (unlikely(buf->flags == BUFCTL_PATTERN_FREE))
     {
@@ -1126,7 +1155,9 @@ void kasan_kfree(void *ptr, struct slab_cache *cache, size_t chunk_size)
     }
 
     buf->flags = BUFCTL_PATTERN_FREE;
-    asan_poison_shadow((unsigned long) ptr, chunk_size, KASAN_FREED);
+    if (!(cache->flags & SLAB_TYPESAFE_BY_RCU))
+        asan_poison_shadow((unsigned long) ptr, chunk_size, KASAN_FREED);
+
     kasan_register_free(ptr, cache);
 #ifndef NOQUARANTINE
     kasan_quarantine_add_chunk(buf, chunk_size);
@@ -1260,14 +1291,11 @@ void kmem_cache_free_bulk(struct slab_cache *cache, size_t size, void **ptrs)
  *
  * @param slab Slab to free
  */
-static void kmem_cache_free_slab(struct slab *slab)
+static void __kmem_cache_free_slab(struct slab *slab)
 {
     assert(slab->active_objects == 0);
     struct slab_cache *cache = slab->cache;
 
-    // Free it from the free list
-    list_remove(&slab->slab_list_node);
-    kmem_slab_unaccount_pages(slab, cache->flags);
     // After freeing the slab we may no longer touch the struct slab
     if (likely(!(cache->flags & KMEM_CACHE_VMALLOC)))
         free_pages(slab->pages);
@@ -1275,6 +1303,25 @@ static void kmem_cache_free_slab(struct slab *slab)
         vfree(slab->start, slab->size >> PAGE_SHIFT);
 }
 
+static void kmem_cache_typesafe_free(struct rcu_head *head)
+{
+    __kmem_cache_free_slab(container_of(head, struct slab, typesafe_by_rcu));
+}
+
+static void kmem_cache_free_slab(struct slab *slab)
+{
+    assert(slab->active_objects == 0);
+    struct slab_cache *cache = slab->cache;
+
+    // Free it from the free list
+    list_remove(&slab->slab_list_node);
+    kmem_slab_unaccount_pages(slab, cache->flags);
+    if (cache->flags & SLAB_TYPESAFE_BY_RCU)
+        call_rcu(&slab->typesafe_by_rcu, kmem_cache_typesafe_free);
+    else
+        __kmem_cache_free_slab(slab);
+}
+
 struct slab_rendezvous
 {
     unsigned int waiting_for_cpus;
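The union of slab_list_node and typesafe_by_rcu above works because the two members are never live
at the same time: the slab is taken off its list before call_rcu() reuses the same storage as an
rcu_head. A generic sketch of that deferred-free shape (illustrative only, with a made-up struct
thing; kfree stands in for the real reclaim step):

struct thing
{
    union {
        struct list_head node; /* valid while the thing is reachable through the list */
        struct rcu_head rcu;   /* only used once the thing has been unlinked */
    };
};

static void thing_reclaim(struct rcu_head *head)
{
    struct thing *t = container_of(head, struct thing, rcu);
    /* A full grace period has elapsed: no RCU reader can still be dereferencing t */
    kfree(t);
}

static void thing_free(struct thing *t)
{
    list_remove(&t->node);            /* unpublish first; the list node is now dead storage... */
    call_rcu(&t->rcu, thing_reclaim); /* ...so it can be reused as the rcu_head */
}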
@@ -1371,7 +1418,7 @@ static void kmem_cache_shrink_pcpu_all(struct slab_cache *cache)
         if (unlikely(slab->cache != cache))
             panic("slab: Pointer %p was returned to the wrong cache\n", ptr);
 
-        ((struct bufctl *) ptr)->flags = 0;
+        kmem_bufctl_from_ptr(cache, ptr)->flags = 0;
         kmem_free_to_slab(cache, slab, ptr);
     }
 
@@ -1621,7 +1668,7 @@ void kmem_free_kasan(void *ptr)
 {
     struct slab *slab = kmem_pointer_to_slab(ptr);
     assert(slab != NULL);
-    ((struct bufctl *) ptr)->flags = 0;
+    kmem_bufctl_from_ptr(slab->cache, ptr)->flags = 0;
     spin_lock(&slab->cache->lock);
     kmem_free_to_slab(slab->cache, slab, ptr);
     spin_unlock(&slab->cache->lock);
@@ -1633,7 +1680,7 @@ static void stack_trace_print(unsigned long *entries, unsigned long nr)
 {
     for (unsigned long i = 0; i < nr; i++)
     {
         char sym[SYM_SYMBOLIZE_BUFSIZ];
-        int st = sym_symbolize((void *) entries[i], cul::slice{sym, sizeof(sym)});
+        int st = sym_symbolize((void *) entries[i], sym, sizeof(sym), 0);
         if (st < 0)
             break;
         pr_crit("\t%s\n", sym);