mm/slab: Add TYPESAFE_BY_RCU
Add TYPESAFE_BY_RCU support. These caches guarantee type stability of
objects under the RCU read lock, without the need for kfree_rcu or
similar. They do so by only reclaiming slabs after an RCU grace period
has passed. This patch also adds ctor support.

While we're at it, use TYPESAFE_BY_RCU for anon_vma, which was the
original intended usage.

Signed-off-by: Pedro Falcato <[email protected]>
heatd committed Aug 20, 2024
1 parent 4341453 commit 0069b6f
Showing 4 changed files with 122 additions and 45 deletions.
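
Before the per-file changes, a minimal sketch of what the new API looks like to a caller. The struct, cache, and function names here are made up for illustration (the real conversion is the anon_vma change in kernel/kernel/mm/rmap.c below); only kmem_cache_create, the KMEM_CACHE_*/SLAB_* flags and the ctor hook come from this commit.

static struct slab_cache *foo_cache;

struct foo
{
    struct spinlock lock;
    struct list_head list;
};

/* The ctor runs when a slab is populated, not on every kmem_cache_alloc(), and the
 * allocator never overwrites the object's contents, so fields initialized here stay
 * valid across free/alloc cycles -- exactly what TYPESAFE_BY_RCU readers rely on. */
static void foo_ctor(void *obj)
{
    struct foo *f = obj;
    spinlock_init(&f->lock);
    INIT_LIST_HEAD(&f->list);
}

void foo_cache_init(void)
{
    foo_cache = kmem_cache_create("foo", sizeof(struct foo), _Alignof(struct foo),
                                  KMEM_CACHE_PANIC | SLAB_TYPESAFE_BY_RCU, foo_ctor);
}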
2 changes: 1 addition & 1 deletion kernel/include/onyx/cpumask.h
@@ -201,7 +201,7 @@ struct cpumask
static inline struct cpumask cpumask_all_but_one(unsigned long cpu)
{
struct cpumask c;
memset(&c, 0xff, sizeof(c));
memset((void*) &c, 0xff, sizeof(c));
c.mask[cpu / LONG_SIZE_BITS] &= ~(1UL << (cpu % LONG_SIZE_BITS));
return c;
}
22 changes: 15 additions & 7 deletions kernel/include/onyx/mm/slab.h
@@ -47,23 +47,31 @@ struct slab_cache
size_t nfullslabs;
struct list_head cache_list_node;
unsigned int flags;
unsigned int bufctl_off;
struct spinlock lock;
void (*ctor)(void *);
int mag_limit;
void (*ctor)(void *);
// TODO: This is horrible. We need a way to allocate percpu memory,
// and then either trim it or grow it when CPUs come online.
struct slab_cache_percpu_context pcpu[CONFIG_SMP_NR_CPUS] __align_cache;
};

__BEGIN_CDECLS

#define KMEM_CACHE_HWALIGN (1 << 0)
#define KMEM_CACHE_VMALLOC (1 << 1)
#define KMEM_CACHE_NOPCPU (1 << 2)
#define KMEM_CACHE_HWALIGN (1 << 0)
#define KMEM_CACHE_VMALLOC (1 << 1)
#define KMEM_CACHE_NOPCPU (1 << 2)
/* Panic if kmem_cache_create fails */
#define KMEM_CACHE_PANIC (1 << 3)

#define SLAB_PANIC KMEM_CACHE_PANIC
#define KMEM_CACHE_PANIC (1 << 3)
/* TYPESAFE_BY_RCU makes it so objects _won't switch types_ during an RCU read section. That is,
* the slab itself will not be freed or reused until the read section ends, so a reference that was
* valid during an RCU read section will keep pointing to an object of the same type and remain
* "valid", even after getting kfree'd. This flag is most useful together with a ctor that
* initializes the objects before kmem_cache_alloc. */
#define KMEM_CACHE_TYPESAFE_BY_RCU (1 << 4)

#define SLAB_PANIC KMEM_CACHE_PANIC
#define SLAB_TYPESAFE_BY_RCU KMEM_CACHE_TYPESAFE_BY_RCU

/**
* @brief Create a slab cache
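
To make the comment above concrete, this is the reader-side pattern TYPESAFE_BY_RCU is meant for, sketched with the hypothetical struct foo from the earlier example. struct bar, owner->foo and the revalidation rule are placeholders; rmap.c below applies the same scheme to anon_vma, using the page mapcount as the validity check.

struct bar
{
    struct foo *foo; /* published pointer, assumed to change under foo->lock before foo is freed */
};

struct foo *foo_lookup_and_lock(struct bar *owner)
{
    struct foo *f;

    rcu_read_lock();
    f = READ_ONCE(owner->foo);
    if (!f)
        goto out;

    /* Even if another thread kfree'd f by now, TYPESAFE_BY_RCU keeps this memory a
     * struct foo (with a ctor-initialized lock) for the rest of our read section,
     * so taking the lock cannot touch freed-and-retyped memory. */
    spin_lock(&f->lock);

    /* f may have been freed and reallocated to a different owner in the meantime;
     * revalidate under the lock before trusting it. */
    if (READ_ONCE(owner->foo) != f)
    {
        spin_unlock(&f->lock);
        f = NULL;
    }
out:
    rcu_read_unlock();
    return f;
}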
56 changes: 39 additions & 17 deletions kernel/kernel/mm/rmap.c
@@ -16,22 +16,23 @@

static struct slab_cache *anon_vma_cache;

static void anon_vma_ctor(void *obj)
{
struct anon_vma *anon_vma = obj;
spinlock_init(&anon_vma->lock);
INIT_LIST_HEAD(&anon_vma->vma_list);
}

void __init anon_vma_init(void)
{
anon_vma_cache = kmem_cache_create("anon_vma", sizeof(struct anon_vma),
_Alignof(struct anon_vma), KMEM_CACHE_PANIC, NULL);
anon_vma_cache =
kmem_cache_create("anon_vma", sizeof(struct anon_vma), _Alignof(struct anon_vma),
KMEM_CACHE_PANIC | SLAB_TYPESAFE_BY_RCU, anon_vma_ctor);
}

struct anon_vma *anon_vma_alloc(void)
{
struct anon_vma *anon = kmem_cache_alloc(anon_vma_cache, GFP_KERNEL);
if (anon)
{
spinlock_init(&anon->lock);
INIT_LIST_HEAD(&anon->vma_list);
}

return anon;
return kmem_cache_alloc(anon_vma_cache, GFP_KERNEL);
}

void __anon_vma_unlink(struct anon_vma *anon, struct vm_area_struct *vma)
@@ -108,21 +109,42 @@ struct rmap_walk_info
void *context;
};

static int rmap_walk_anon(struct rmap_walk_info *info, struct page *page)
static struct anon_vma *anon_vma_lock(struct page *page)
{
DCHECK_PAGE(page_flag_set(page, PAGE_FLAG_ANON), page);
struct anon_vma *anon_vma = (struct anon_vma *) page->owner;
/* We use RCU read lock and TYPESAFE_BY_RCU to get by here. The idea goes like this: We check if
* page_mapcount != 0 under the rcu_read_lock; if this is true, the anon_vma struct _must_ be
* valid. We then spin_lock the anon_vma (which only works because TYPESAFE_BY_RCU and the read
* lock enforce type stability here). We then recheck the mapcount under the lock. */
struct anon_vma *anon_vma;
rcu_read_lock();
anon_vma = (struct anon_vma *) READ_ONCE(page->owner);
if (!page_mapcount(page))
goto no_anon_vma;

/* anon_vma doesn't exist or can be stale if the page was unmapped. */
spin_lock(&anon_vma->lock);
if (!page_mapcount(page))
{
spin_unlock(&anon_vma->lock);
goto no_anon_vma;
}

rcu_read_unlock();
return anon_vma;
no_anon_vma:
rcu_read_unlock();
return NULL;
}

static int rmap_walk_anon(struct rmap_walk_info *info, struct page *page)
{
DCHECK_PAGE(page_flag_set(page, PAGE_FLAG_ANON), page);
struct anon_vma *anon_vma = anon_vma_lock(page);
if (!anon_vma)
return 0;

/* TODO: We might need TYPESAFE_BY_RCU for anon_vma? */
unsigned long page_addr = page->pageoff;
int st = 0;

spin_lock(&anon_vma->lock);

list_for_every (&anon_vma->vma_list)
{
struct vm_area_struct *vma = container_of(l, struct vm_area_struct, anon_vma_node);
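
Spelled out, the interleaving that the second page_mapcount() check in anon_vma_lock() above closes (derived from the comment in the code; the exact unmap path is not shown in this diff):

/*
 * CPU 0: anon_vma_lock(page)             CPU 1: unmap path
 *
 * rcu_read_lock();
 * anon_vma = READ_ONCE(page->owner);
 * page_mapcount(page) != 0 -> proceed
 *                                        unmaps the page, mapcount drops to 0
 *                                        frees the anon_vma; the object may be reused
 *                                        immediately, but TYPESAFE_BY_RCU delays
 *                                        reclaiming the slab until a grace period has
 *                                        elapsed, so the memory stays a struct
 *                                        anon_vma for CPU 0's read section
 * spin_lock(&anon_vma->lock);   <- still a valid spinlock, possibly belonging to an
 *                                  unrelated, recycled anon_vma by now
 * page_mapcount(page) == 0 -> unlock and return NULL, without ever walking vma_list
 */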
87 changes: 67 additions & 20 deletions kernel/kernel/mm/slab.c
@@ -14,6 +14,7 @@
#include <onyx/modules.h>
#include <onyx/page.h>
#include <onyx/perf_probe.h>
#include <onyx/rcupdate.h>
#include <onyx/rwlock.h>
#include <onyx/stackdepot.h>
#include <onyx/vm.h>
@@ -56,7 +57,10 @@ struct slab
};

size_t size;
struct list_head slab_list_node;
union {
struct list_head slab_list_node;
struct rcu_head typesafe_by_rcu;
};
struct bufctl *object_list;
size_t active_objects;
size_t nobjects;
@@ -121,6 +125,7 @@ struct slab_cache *kmem_cache_create(const char *name, size_t size, size_t align
c->redzone = 0;
#endif
c->flags = flags | KMEM_CACHE_VMALLOC;
c->bufctl_off = 0;

// Minimum object alignment is 16
c->alignment = alignment;
@@ -137,6 +142,15 @@ struct slab_cache *kmem_cache_create(const char *name, size_t size, size_t align
c->alignment = ALIGN_TO(c->alignment, 64);
}

if (flags & SLAB_TYPESAFE_BY_RCU || ctor)
{
/* We can't place the bufctl inside the object, because either ctor or TYPESAFE_BY_RCU were
* specified, and these are only useful if the allocator _does not_ touch the object. As
* such, we place the bufctls right outside the object. */
c->bufctl_off = c->objsize;
c->objsize += sizeof(struct bufctl);
}

c->objsize = ALIGN_TO(c->objsize, c->alignment);
c->redzone = ALIGN_TO(c->redzone / 2, c->alignment) * 2;

@@ -174,6 +188,16 @@ struct slab_cache *kmem_cache_create(const char *name, size_t size, size_t align

#define ALWAYS_INLINE __attribute__((always_inline))

ALWAYS_INLINE static inline void *kmem_bufctl_to_ptr(struct slab_cache *cache, struct bufctl *buf)
{
return ((void *) buf) - cache->bufctl_off;
}

ALWAYS_INLINE static inline struct bufctl *kmem_bufctl_from_ptr(struct slab_cache *cache, void *ptr)
{
return ptr + cache->bufctl_off;
}

#ifdef SLAB_DEBUG_COUNTS
/* Kept here and not in list.h, because this is a horrible pattern that should not be used for
* !DEBUG */
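
To illustrate what bufctl_off and the kmem_bufctl_to_ptr/kmem_bufctl_from_ptr helpers above buy: with a ctor or TYPESAFE_BY_RCU the freelist bufctl must not overlay the object (its contents have to stay intact while it sits on the freelist), so the bufctl is appended after the object. A worked example with assumed numbers, ignoring KASAN redzones:

/* Assume size = 40, alignment = 16, sizeof(struct bufctl) = 16 (illustrative values).
 *
 * Plain cache (no ctor, no TYPESAFE_BY_RCU):
 *   bufctl_off = 0                          -- the bufctl reuses the freed object itself
 *   objsize    = ALIGN_TO(40, 16)      = 48
 *
 * Cache with a ctor and/or TYPESAFE_BY_RCU:
 *   bufctl_off = 40                         -- the bufctl sits right past the object
 *   objsize    = ALIGN_TO(40 + 16, 16) = 64
 *
 * kmem_bufctl_from_ptr(cache, obj) == obj + 40
 * kmem_bufctl_to_ptr(cache, buf)   == (void *) buf - 40
 */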
@@ -307,7 +331,7 @@ static void *kmem_cache_alloc_from_slab(struct slab *s, unsigned int flags)

ret->flags = 0;

return (void *) ret;
return kmem_bufctl_to_ptr(s->cache, ret);
}

/**
@@ -505,8 +529,9 @@ NO_ASAN static struct slab *kmem_cache_create_slab(struct slab_cache *cache, uns
asan_poison_shadow((unsigned long) ptr, redzone, KASAN_LEFT_REDZONE);
#endif
ptr += redzone;

struct bufctl *ctl = (struct bufctl *) ptr;
if (cache->ctor)
cache->ctor(ptr);
struct bufctl *ctl = (struct bufctl *) (ptr + cache->bufctl_off);
ctl->next = NULL;
ctl->flags = BUFCTL_PATTERN_FREE;
if (last)
@@ -663,7 +688,7 @@ static void kmem_cache_reload_mag_with_slab(struct slab_cache *cache,
if (buf->flags != BUFCTL_PATTERN_FREE)
panic("Bad buf %p, slab %p", buf, slab);

pcpu->magazine[pcpu->size++] = (void *) buf;
pcpu->magazine[pcpu->size++] = kmem_bufctl_to_ptr(cache, buf);
slab->object_list = (struct bufctl *) buf->next;

if (!buf->next && j + 1 != avail)
@@ -898,7 +923,7 @@ void *kmem_cache_alloc(struct slab_cache *cache, unsigned int flags)
// If we have objects on our magazine, pop one out and
// return.
void *ret = pcpu->magazine[--pcpu->size];
((struct bufctl *) ret)->flags = 0;
kmem_bufctl_from_ptr(cache, ret)->flags = 0;

pcpu->active_objs++;
__atomic_store_n(&pcpu->touched, 0, __ATOMIC_RELEASE);
@@ -974,7 +999,7 @@ size_t kmem_cache_alloc_bulk(struct slab_cache *cache, unsigned int gfp_flags, s
while (to_take--)
{
void *ptr = pcpu->magazine[--pcpu->size];
((struct bufctl *) ptr)->flags = 0;
kmem_bufctl_from_ptr(cache, ptr)->flags = 0;
res[i++] = ptr;
pcpu->active_objs++;
}
@@ -1003,7 +1028,7 @@ static void kmem_free_to_slab(struct slab_cache *cache, struct slab *slab, void
if (unlikely((unsigned long) ptr % cache->alignment))
panic("slab: Bad pointer %p", ptr);

struct bufctl *ctl = (struct bufctl *) ptr;
struct bufctl *ctl = kmem_bufctl_from_ptr(cache, ptr);
if (ctl->flags == BUFCTL_PATTERN_FREE)
panic("slab: Double free at %p", ptr);

@@ -1049,9 +1074,13 @@ static void kfree_nopcpu(void *ptr)
struct slab *slab = kmem_pointer_to_slab(ptr);
struct slab_cache *cache = slab->cache;

/* TYPESAFE_BY_RCU cannot participate in typical KASAN lifetime shenanigans. :/ */
if (!(cache->flags & SLAB_TYPESAFE_BY_RCU))
{
#ifdef CONFIG_KASAN
asan_poison_shadow((unsigned long) ptr, cache->objsize, KASAN_FREED);
asan_poison_shadow((unsigned long) ptr, cache->objsize, KASAN_FREED);
#endif
}

spin_lock(&cache->lock);
kmem_free_to_slab(cache, slab, ptr);
@@ -1071,7 +1100,7 @@ void kmem_cache_return_pcpu_batch(struct slab_cache *cache, struct slab_cache_pe

if (unlikely(slab->cache != cache))
panic("slab: Pointer %p was returned to the wrong cache\n", ptr);
((struct bufctl *) ptr)->flags = 0;
kmem_bufctl_from_ptr(cache, ptr)->flags = 0;
kmem_free_to_slab(cache, slab, ptr);
pcpu->size--;
}
@@ -1086,7 +1115,7 @@ __always_inline void kmem_cache_free_pcpu_single(struct slab_cache *cache,
struct slab_cache_percpu_context *pcpu, void *ptr)
{
DCHECK(pcpu->size < cache->mag_limit);
struct bufctl *buf = (struct bufctl *) ptr;
struct bufctl *buf = kmem_bufctl_from_ptr(cache, ptr);

if (unlikely((unsigned long) ptr % cache->alignment))
panic("slab: Bad pointer %p", ptr);
@@ -1118,15 +1147,17 @@ void kasan_kfree(void *ptr, struct slab_cache *cache, size_t chunk_size)
if (unlikely((unsigned long) ptr % cache->alignment))
panic("slab: Bad pointer %p", ptr);

struct bufctl *buf = (struct bufctl *) ptr;
struct bufctl *buf = kmem_bufctl_from_ptr(cache, ptr);

if (unlikely(buf->flags == BUFCTL_PATTERN_FREE))
{
panic("slab: Double free at %p\n", ptr);
}

buf->flags = BUFCTL_PATTERN_FREE;
asan_poison_shadow((unsigned long) ptr, chunk_size, KASAN_FREED);
if (!(cache->flags & SLAB_TYPESAFE_BY_RCU))
asan_poison_shadow((unsigned long) ptr, chunk_size, KASAN_FREED);

kasan_register_free(ptr, cache);
#ifndef NOQUARANTINE
kasan_quarantine_add_chunk(buf, chunk_size);
@@ -1260,21 +1291,37 @@ void kmem_cache_free_bulk(struct slab_cache *cache, size_t size, void **ptrs)
*
* @param slab Slab to free
*/
static void kmem_cache_free_slab(struct slab *slab)
static void __kmem_cache_free_slab(struct slab *slab)
{
assert(slab->active_objects == 0);
struct slab_cache *cache = slab->cache;

// Free it from the free list
list_remove(&slab->slab_list_node);
kmem_slab_unaccount_pages(slab, cache->flags);
// After freeing the slab we may no longer touch the struct slab
if (likely(!(cache->flags & KMEM_CACHE_VMALLOC)))
free_pages(slab->pages);
else
vfree(slab->start, slab->size >> PAGE_SHIFT);
}

static void kmem_cache_typesafe_free(struct rcu_head *head)
{
__kmem_cache_free_slab(container_of(head, struct slab, typesafe_by_rcu));
}

static void kmem_cache_free_slab(struct slab *slab)
{
assert(slab->active_objects == 0);
struct slab_cache *cache = slab->cache;

// Free it from the free list
list_remove(&slab->slab_list_node);
kmem_slab_unaccount_pages(slab, cache->flags);
if (cache->flags & SLAB_TYPESAFE_BY_RCU)
call_rcu(&slab->typesafe_by_rcu, kmem_cache_typesafe_free);
else
__kmem_cache_free_slab(slab);
}

struct slab_rendezvous
{
unsigned int waiting_for_cpus;
@@ -1371,7 +1418,7 @@ static void kmem_cache_shrink_pcpu_all(struct slab_cache *cache)

if (unlikely(slab->cache != cache))
panic("slab: Pointer %p was returned to the wrong cache\n", ptr);
((struct bufctl *) ptr)->flags = 0;
kmem_bufctl_from_ptr(cache, ptr)->flags = 0;
kmem_free_to_slab(cache, slab, ptr);
}

Expand Down Expand Up @@ -1621,7 +1668,7 @@ void kmem_free_kasan(void *ptr)
{
struct slab *slab = kmem_pointer_to_slab(ptr);
assert(slab != NULL);
((struct bufctl *) ptr)->flags = 0;
kmem_bufctl_from_ptr(slab->cache, ptr)->flags = 0;
spin_lock(&slab->cache->lock);
kmem_free_to_slab(slab->cache, slab, ptr);
spin_unlock(&slab->cache->lock);
@@ -1633,7 +1680,7 @@ static void stack_trace_print(unsigned long *entries, unsigned long nr)
for (unsigned long i = 0; i < nr; i++)
{
char sym[SYM_SYMBOLIZE_BUFSIZ];
int st = sym_symbolize((void *) entries[i], cul::slice<char>{sym, sizeof(sym)});
int st = sym_symbolize((void *) entries[i], sym, sizeof(sym), 0);
if (st < 0)
break;
pr_crit("\t%s\n", sym);
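
Finally, the generic shape of the deferred-free trick that the struct slab union and kmem_cache_free_slab() above rely on, with a made-up widget type standing in for struct slab (the header paths and kfree as the final release step are assumptions, not part of this commit):

#include <onyx/list.h>
#include <onyx/rcupdate.h>

/* The list node and the rcu_head can share storage because the object is taken off
 * every list before call_rcu(), so the node is dead by the time the rcu_head is used. */
struct widget
{
    union {
        struct list_head node; /* valid while the widget is reachable through a list */
        struct rcu_head rcu;   /* reused for the deferred free after list_remove() */
    };
};

static void widget_free_rcu(struct rcu_head *head)
{
    /* Runs after a grace period: no RCU read-side critical section that could still
     * observe the widget is in progress any more. */
    struct widget *w = container_of(head, struct widget, rcu);
    kfree(w);
}

static void widget_destroy(struct widget *w)
{
    list_remove(&w->node);              /* unreachable for new readers from here on */
    call_rcu(&w->rcu, widget_free_rcu); /* actual reclamation is deferred */
}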
