From c1cdf1db8eb115978edc2b9323c0f6a1e75ab5d8 Mon Sep 17 00:00:00 2001
From: Isaac Davis
Date: Wed, 23 Aug 2017 16:32:52 +0000
Subject: [PATCH] OS-6013 want more efficient id_space_t

---
 usr/src/common/idspace/id_space.c    | 1107 ++++++++++++++++++++++++--
 usr/src/uts/common/os/log_sysevent.c |   28 +-
 usr/src/uts/common/sys/id_space.h    |   48 +-
 3 files changed, 1105 insertions(+), 78 deletions(-)

diff --git a/usr/src/common/idspace/id_space.c b/usr/src/common/idspace/id_space.c
index 7d28a8f53331..d4c80de970b9 100644
--- a/usr/src/common/idspace/id_space.c
+++ b/usr/src/common/idspace/id_space.c
@@ -20,67 +20,482 @@
  */
 /*
  * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2017, Joyent, Inc. All rights reserved.
  */
 
-#include
-#include
+#include
 #include
+#include
+#include
+#ifdef _KERNEL
+#include
+#include
+#include
+#include
+#include
+#else
+#include
+#include
+#include
+#include
+#endif
 
 /*
- * ID Spaces
+ * ID Space Big Theory Statement
+ *
+ * 1. Overview
+ *
+ * The id_space_t provides an implementation of a managed range of
+ * integer identifiers. An ID space guarantees that the
+ * next identifier returned by an allocation is larger than the previous one,
+ * unless there are no larger slots remaining in the range. In this case,
+ * the ID space will return the first available slot in the lower part of the
+ * range (viewing the previous identifier as a partitioning element). If no
+ * slots are available, id_alloc()/id_allocff() will sleep until an
+ * identifier becomes available. Accordingly, id_space allocations must be
+ * initiated from contexts where sleeping is acceptable. id_alloc_nosleep()/
+ * id_allocff_nosleep() will return -1 if no slots are available or if the
+ * system is low on memory. If id_alloc_nosleep() fails, callers should
+ * not try to extend the ID space. This is to avoid making a possible
+ * low-memory situation worse.
+ *
+ * As an ID space is designed for representing a range of id_t's, there
+ * is a preexisting maximal range: [0, MAXUID]. ID space requests outside
+ * that range will fail on a DEBUG kernel. The id_allocff*() functions
+ * return the first available id, and should be used when there is benefit
+ * to having a compact allocated range.
+ *
+ * (Presently, the id_space_t abstraction supports only direct allocations; ID
+ * reservation, in which an ID is allocated but placed in an internal
+ * dictionary for later use, should be added when a consuming subsystem
+ * arrives.)
+ *
+ * This code is also shared with userland. In userland, we don't have the same
+ * ability to have sleeping variants, so we effectively turn the normal
+ * versions without _nosleep into _nosleep.
+ *
+ * 2. High-Level Implementation
+ *
+ * The only outward-facing struct is the id_space_t - this is returned by
+ * id_space_create and must be passed to the various id space allocation
+ * functions. The struct contains an AVL tree of id_tree_t structs - each
+ * id_tree_t represents a contiguous range of IDs. A freshly created id_space_t
+ * will have just one id_tree_t in its AVL tree, and an additional id_tree_t
+ * will be added with every call to id_space_extend. The id_tree_t structs are
+ * ordered in the AVL tree by their starting IDs, low-to-high. The ranges of ID
+ * trees do not overlap.
+ *
+ * An id_tree_t is a tree of id_node_t structs. Each id_node_t has at most
+ * ID_BRANCH_FACTOR children.
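+ * (With the values in id_space.h - an ID_BRANCH_SHIFT of 7, so an
+ * ID_BRANCH_FACTOR of 1 << 7 == 128 - each node has up to 128 children.)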
+ * Each id_node_t also has a bitfield, which will
+ * serve two different purposes depending on whether or not the node is a leaf.
+ * If the node is a leaf, zeroes in the bitfield will indicate unallocated IDs,
+ * and ones will indicate allocated IDs. If the node is not a leaf, each bit
+ * will correspond to a child node. A zero will indicate that the corresponding
+ * child (or one of the child's children or its children's children, etc.) has
+ * an unallocated ID. A one will indicate that the child has no children with
+ * unallocated IDs - its branch is completely full. As more IDs are allocated,
+ * the ID tree will grow in both height and number of nodes per level.
+ *
+ * 3. Example
+ *
+ * The tree starts with just the root node and grows (bottom-up!) as
+ * more space is required. Here's an example. For simplicity's sake, we'll use
+ * a branching factor of 2 and assume that the ID range starts at 0.
+ *
+ * In this hacked-together notation, `N` signifies a null pointer.
+ * We start with a single node (labeled node A):
+ *
+ *	A: bitfield:[0 0]
+ *	   children:[N N]
+ *
+ * We allocate our first id, which is 0:
+ *
+ *	A: bitfield:[1 0]
+ *	   children:[N N]
+ *
+ * And another id, 1:
+ *
+ *	A: bitfield:[1 1]
+ *	   children:[N N]
+ *
+ * On the next allocation, we find that the root's bitfield is completely full -
+ * this is how we know we've run out of space. We add _a new root_, node B:
+ *
+ *	B: bitfield:[1 0]
+ *	   children:[A N]
+ *
+ *	  /
+ *
+ *	A: bitfield:[1 1]
+ *	   children:[N N]
+ *
+ * Note that _we didn't have to modify A at all_.
 *
- * The id_space_t provides a simple implementation of a managed range of
- * integer identifiers using a vmem arena. An ID space guarantees that the
- * next identifer returned by an allocation is larger than the previous one,
- * unless there are no larger slots remaining in the range. In this case,
- * the ID space will return the first available slot in the lower part of the
- * range (viewing the previous identifier as a partitioning element). If no
- * slots are available, id_alloc()/id_allocff() will sleep until an
- * identifier becomes available. Accordingly, id_space allocations must be
- * initiated from contexts where sleeping is acceptable. id_alloc_nosleep()/
- * id_allocff_nosleep() will return -1 if no slots are available or if the
- * system is low on memory. If id_alloc_nosleep() fails, callers should
- * not try to extend the ID space. This is to avoid making a possible
- * low-memory situation worse.
+ * On the next allocation, we see that B has a child with space remaining. This
+ * child hasn't actually been allocated yet, so we allocate it:
 *
- * As an ID space is designed for representing a range of id_t's, there
- * is a preexisting maximal range: [0, MAXUID]. ID space requests outside
- * that range will fail on a DEBUG kernel. The id_allocff*() functions
- * return the first available id, and should be used when there is benefit
- * to having a compact allocated range.
+ *	B: bitfield:[1 0]
+ *	   children:[A C]
 *
- * (Presently, the id_space_t abstraction supports only direct allocations; ID
- * reservation, in which an ID is allocated but placed in a internal
- * dictionary for later use, should be added when a consuming subsystem
- * arrives.)
+ *	  / \
 *
- * This code is also shared with userland. In userland, we don't have the same
- * ability to have sleeping variants, so we effectively turn the normal
- * versions without _nosleep into _nosleep.
+ *	A: bitfield:[1 1]	C: bitfield:[1 0]
+ *	   children:[N N]	   children:[N N]
+ *
+ * On the next allocation, we see that we have filled up C. This means that B is
+ * now also full, so we update both nodes. Furthermore, the root is now full, so
+ * we add another layer:
+ *
+ *	D: bitfield:[1 0]
+ *	   children:[B N]
+ *
+ *	  /
+ *
+ *	B: bitfield:[1 1]
+ *	   children:[A C]
+ *
+ *	  / \
+ *
+ *	A: bitfield:[1 1]	C: bitfield:[1 1]
+ *	   children:[N N]	   children:[N N]
+ *
+ * And so on and so on...
+ *
+ * 4. Dealing with Offsets and Ranges
+ *
+ * From an id_tree_t's point of view, its range of IDs always starts at 0. If
+ * the user creates a range of IDs that does not start at 0, the required offset
+ * is added to each ID after the ID is allocated in the tree and before the ID
+ * is returned to the user. If the user places an upper bound on the ID range
+ * that does not align with the number of IDs supported by a full tree,
+ * each bitfield is masked off during tree traversal to artificially impose an
+ * upper bound.
+ *
+ * When the user calls id_space_extend, existing id_tree_t structs are not
+ * affected. Instead, a new id_tree_t is added to the id_space_t's AVL tree to
+ * support the new range. If the user specifies a range of IDs that overlaps
+ * with existing ranges, the actual range of the new id_tree_t is truncated to
+ * include only the truly new IDs. This way, the ranges of the id_tree_t structs
+ * will never overlap.
+ *
+ * 5. Time and Space Complexity
+ *
+ * We grow the tree from the bottom up so the IDs that correspond to specific
+ * slots in each node's bitfield _never change_. For example, in the simple tree
+ * above, the second slot in node C will always refer to ID 3. This allows
+ * constant-time tree growth, because we do not have to iterate over all nodes
+ * to update their bitfields.
+ *
+ * When performing allocation and deletion, we can almost always get away with
+ * performing just one tree traversal of length log_ID_BRANCH_FACTOR(n), where
+ * n is ID_BRANCH_FACTOR^(current tree height). This gives a logarithmic runtime
+ * for allocation and deletion.
+ *
+ * In some cases, we may have to perform more than one traversal - this will
+ * happen if we reach a leaf that has free IDs, but all the free IDs are smaller
+ * than the last allocated ID. In this case, we will have to increase our
+ * desired minimum ID and traverse the tree again. However, this case is rare.
+ *
+ * A tree of height h will support ID_BRANCH_FACTOR^(h+1) IDs and require at
+ * most ID_BRANCH_FACTOR^(h) + ID_BRANCH_FACTOR^(h-1) + ... +
+ * ID_BRANCH_FACTOR^(0) nodes. This does not seem efficient for a low branching
+ * factor (e.g. 2) but becomes increasingly efficient as the branching factor
+ * increases.
+ *
+ * 6. Explanation of Internal Functions
+ *
+ * The lowest-level function related to tree traversal is get_open_slot. This
+ * function takes in a bitfield and high and low indices and returns the index
+ * of the first zero bit between the passed-in indices. It does not perform
+ * any wrap-around logic, and does not know about the structure of the tree.
+ *
+ * One layer of abstraction up, we have id_tree_walk. It traverses a single
+ * id_tree_t and performs the specified action (allocate or free) on either the
+ * exact ID passed in or the first open ID larger than the "start" ID passed in.
+ * It does not perform any wrap-around logic - for example, if the passed-in
+ * tree has no open IDs larger than "start" but does have open IDs smaller than
+ * "start", this function will NOT wrap around and return a smaller ID. This
+ * function does not know about the AVL tree or any id_tree_t structs other than
+ * the one passed in. This function sees ID values from the id_tree_t's point
+ * of view - it does not apply offsets to create an accurate, client-facing ID
+ * value. This function calls expand_tree to grow the height of the tree,
+ * and directly allocates internal nodes if necessary during the tree traversal.
+ *
+ * At the outer layer of abstraction, we have id_get_id. This function traverses
+ * the AVL tree and finds a suitable id_tree_t on which to call id_tree_walk.
+ * This is where all wrap-around logic occurs - this function WILL return an
+ * open ID smaller than the passed-in "start" ID if necessary. This is also
+ * where offsets are applied to the ID returned from id_tree_walk to return
+ * a client-facing ID value.
+ *
+ * The various client-facing ID allocation and freeing functions are thin
+ * wrappers around id_get_id. They SHOULD NOT call any other internal functions,
+ * as this will short-circuit important parts of the logic.
+ *
+ * Sleeping when waiting for an ID to become free is accomplished using a
+ * condvar in the client-facing ID allocation functions. The internal functions
+ * should not touch the condvar.
+ */
+
+#ifndef _KERNEL
+/*
+ * Get index of highest nonzero bit
+ */
+static int
+highbit(ulong_t i)
+{
+	register int h = 1;
+
+	if (i == 0)
+		return (0);
+#ifdef _LP64
+	if (i & 0xffffffff00000000ul) {
+		h += 32; i >>= 32;
+	}
+#endif
+	if (i & 0xffff0000) {
+		h += 16; i >>= 16;
+	}
+	if (i & 0xff00) {
+		h += 8; i >>= 8;
+	}
+	if (i & 0xf0) {
+		h += 4; i >>= 4;
+	}
+	if (i & 0xc) {
+		h += 2; i >>= 2;
+	}
+	if (i & 0x2) {
+		h += 1;
+	}
+	return (h);
+}
+#endif
+
+/*
+ * Free a previously allocated id_tree_t.
+ */
+static void
+id_tree_free(id_tree_t *itp)
+{
+#ifdef _KERNEL
+	kmem_free(itp, sizeof (id_tree_t));
+#else
+	umem_free(itp, sizeof (id_tree_t));
+#endif
+}
+
+/*
+ * Free a previously allocated id_space_t.
+ */
+static void
+id_space_free(id_space_t *isp)
+{
+#ifdef _KERNEL
+	kmem_free(isp, sizeof (id_space_t));
+#else
+	umem_free(isp, sizeof (id_space_t));
+#endif
+}
+
+/*
+ * Allocate and initialize an id_node_t.
+ */
+static id_node_t *
+id_node_create(boolean_t sleep)
+{
+	id_node_t *np;
+
+#ifdef _KERNEL
+	int flag = sleep ? KM_SLEEP : KM_NOSLEEP;
+	np = kmem_alloc(sizeof (id_node_t), flag);
+#else
+	(void) sleep;
+	np = umem_alloc(sizeof (id_node_t), UMEM_DEFAULT);
+#endif
+	if (np == NULL) {
+		return (NULL);
+	}
+	bzero(np, sizeof (id_node_t));
+	return (np);
+}
+
+/*
+ * Destroy an id_node_t previously allocated with id_node_create. Recursively
+ * destroys child nodes.
+ */
+static void
+id_node_destroy(id_node_t *np)
+{
+	uint_t i;
+	id_node_t *childp;
+
+	for (i = 0; i < ID_BRANCH_FACTOR; i++) {
+		if ((childp = np->idn_children[i]) != NULL) {
+			id_node_destroy(childp);
+		}
+	}
+#ifdef _KERNEL
+	kmem_free(np, sizeof (id_node_t));
+#else
+	umem_free(np, sizeof (id_node_t));
+#endif
+}
+
+/*
+ * Allocate and initialize an id_tree_t.
+ */
+static id_tree_t *
+id_tree_create(boolean_t sleep, id_t low, id_t high)
+{
+	id_tree_t *itp;
+
+#ifdef _KERNEL
+	int flag = sleep ? KM_SLEEP : KM_NOSLEEP;
+	itp = kmem_alloc(sizeof (id_tree_t), flag);
+#else
+	(void) sleep;
+	itp = umem_alloc(sizeof (id_tree_t), UMEM_DEFAULT);
+#endif
+	if (itp == NULL) {
+		return (NULL);
+	}
+	if ((itp->idt_root = id_node_create(sleep)) == NULL) {
+		id_tree_free(itp);
+		return (NULL);
+	}
+	itp->idt_height = 0;
+	itp->idt_offset = low;
+	itp->idt_size = 0;
+	/*
+	 * Note that max IS NOT the highest ID contained in the tree - it is
+	 * the maximum number of IDs the tree contains.
+	 */
+	itp->idt_max = high - low;
+	return (itp);
+}
 
-#define	ID_TO_ADDR(id) ((void *)(uintptr_t)(id + 1))
-#define	ADDR_TO_ID(addr) ((id_t)((uintptr_t)addr - 1))
+/*
+ * Destroy an id_tree_t previously allocated with id_tree_create. Kicks off
+ * destruction of all nodes by destroying root.
+ */
+static void
+id_tree_destroy(id_tree_t *tp)
+{
+	id_node_destroy(tp->idt_root);
+	id_tree_free(tp);
+}
+
+/*
+ * Comparator for use in id_space_t's AVL tree of id_tree_t structs. A tree with
+ * a lower offset (e.g. lower starting ID) is "less" than a tree with a higher
+ * offset.
+ */
+static int
+id_tree_compare(const void *first, const void *second)
+{
+	const id_tree_t *tree1 = first;
+	const id_tree_t *tree2 = second;
+
+	if (tree1->idt_offset < tree2->idt_offset) {
+		return (-1);
+	} else if (tree1->idt_offset > tree2->idt_offset) {
+		return (1);
+	}
+
+	return (0);
+}
 
 /*
  * Create an arena to represent the range [low, high).
- * Caller must be in a context in which VM_SLEEP is legal,
- * for the kernel. Always VM_NOSLEEP in userland.
+ * Caller must be in a context in which KM_SLEEP is legal,
+ * for the kernel. Always KM_NOSLEEP in userland.
  */
 id_space_t *
 id_space_create(const char *name, id_t low, id_t high)
 {
+	id_space_t *isp;
+	id_tree_t *itp;
+
+	ASSERT(low >= 0);
+	ASSERT(low < high);
+	ASSERT(high <= MAXUID + 1);
+
+#ifdef _KERNEL
+	isp = kmem_alloc(sizeof (id_space_t), KM_SLEEP);
+	itp = id_tree_create(B_TRUE, low, high);
+#else
+	if ((isp = umem_alloc(sizeof (id_space_t), UMEM_DEFAULT)) == NULL) {
+		return (NULL);
+	}
+	if ((itp = id_tree_create(B_FALSE, low, high)) == NULL) {
+		id_space_free(isp);
+		return (NULL);
+	}
+#endif
+
+	/* Create AVL tree and add id_tree_t */
+	avl_create(&isp->id_trees, id_tree_compare, sizeof (id_tree_t),
+	    offsetof(id_tree_t, idt_avl));
+	avl_add(&isp->id_trees, itp);
+
+	(void) snprintf(isp->id_name, ID_NAMELEN, "%s", name);
+	isp->id_high = high;
+	isp->id_low = low;
+	isp->id_next_free = low;
+
+	/* Initialize mutexes and condvar as appropriate */
 #ifdef _KERNEL
-	int flag = VM_SLEEP;
+	mutex_init(&isp->id_lock, NULL, MUTEX_DEFAULT, NULL);
+	cv_init(&isp->id_cond, NULL, CV_DEFAULT, NULL);
 #else
-	int flag = VM_NOSLEEP;
+	(void) mutex_init(&isp->id_lock, USYNC_THREAD | LOCK_ERRORCHECK, NULL);
 #endif
+	return (isp);
+}
+
+#ifdef _KERNEL
+/*
+ * Create an arena to represent the range [low, high). Do not sleep.
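+ * Returns NULL if the space or its initial tree cannot be allocated.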
+ */
+id_space_t *
+id_space_create_nosleep(const char *name, id_t low, id_t high)
+{
+	id_space_t *isp;
+	id_tree_t *itp;
+
 	ASSERT(low >= 0);
 	ASSERT(low < high);
+	ASSERT(high <= MAXUID + 1);
+
+	isp = kmem_alloc(sizeof (id_space_t), KM_NOSLEEP);
+	if (isp == NULL) {
+		return (NULL);
+	}
+	if ((itp = id_tree_create(B_FALSE, low, high)) == NULL) {
+		id_space_free(isp);
+		return (NULL);
+	}
+
+	/* Create AVL tree and add id_tree_t */
+	avl_create(&isp->id_trees, id_tree_compare, sizeof (id_tree_t),
+	    offsetof(id_tree_t, idt_avl));
+	avl_add(&isp->id_trees, itp);
+
-	return (vmem_create(name, ID_TO_ADDR(low), high - low, 1,
-	    NULL, NULL, NULL, 0, flag | VMC_IDENTIFIER));
+	(void) snprintf(isp->id_name, ID_NAMELEN, "%s", name);
+	isp->id_high = high;
+	isp->id_low = low;
+	isp->id_next_free = low;
+
+	/* Initialize mutex and condvar */
+	mutex_init(&isp->id_lock, NULL, MUTEX_DEFAULT, NULL);
+	cv_init(&isp->id_cond, NULL, CV_DEFAULT, NULL);
+	return (isp);
 }
+#endif
 
 /*
  * Destroy a previously created ID space.
@@ -89,18 +504,534 @@ id_space_create(id_space_t *isp)
  */
 void
 id_space_destroy(id_space_t *isp)
 {
-	vmem_destroy(isp);
+	id_tree_t *itp;
+	void *c = NULL;
+
+	/* Destroy all id_tree_t structs in AVL tree */
+	while ((itp = avl_destroy_nodes(&isp->id_trees, &c)) != NULL) {
+		id_tree_destroy(itp);
+	}
+	avl_destroy(&isp->id_trees);
+#ifdef _KERNEL
+	mutex_destroy(&isp->id_lock);
+	cv_destroy(&isp->id_cond);
+#else
+	(void) mutex_destroy(&isp->id_lock);
+#endif
+	id_space_free(isp);
 }
 
+/*
+ * Add the specified range of IDs to an existing ID space. The new range is
+ * allowed to overlap arbitrarily with existing ranges. This significantly
+ * complicates the logic required of this function.
+ *
+ * As we iterate through the existing ID ranges, the new ID range will fall into
+ * one of four cases, as illustrated below:
+ *
+ * key:
+ *	[   ]	Existing ID range
+ *	*****	Range to be added
+ *
+ * Case 1: [  ******  ]
+ * The range to be added lies completely inside the existing range. We do
+ * nothing.
+ *
+ * Case 2: [  *******]**
+ * The new range overlaps the end of the old range. We truncate the beginning of
+ * the new range.
+ *
+ * Case 3: *[*****  ]
+ * The new range overlaps the beginning of the old range. We truncate the
+ * end of the new range.
+ *
+ * Case 4: *[*************]**
+ * The new range completely contains the old range. We split the new range into
+ * two smaller ranges on either side of the old range.
+ *
+ * The structure, documentation and ASCII art for this function are shamelessly
+ * lifted from the virtual memory implementation in Brown University's Weenix.
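+ *
+ * For example: if the space already covers [0, 10) and the caller extends it
+ * with [5, 15), the loop below hits Case 2, adjusted_low is moved up to 10,
+ * and a single new id_tree_t is created for [10, 15).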
+ */
 void
 id_space_extend(id_space_t *isp, id_t low, id_t high)
 {
-#ifdef _KERNEL
-	int flag = VM_SLEEP;
-#else
-	int flag = VM_NOSLEEP;
-#endif
-	(void) vmem_add(isp, ID_TO_ADDR(low), high - low, flag);
+	id_t adjusted_low = low;
+	id_t adjusted_high = high;
+	id_tree_t *itp;
+	id_tree_t *new_tree;
+
+	ASSERT(isp != NULL);
+	ASSERT(low >= 0);
+	ASSERT(high <= MAXUID + 1);
+
+	mutex_enter(&isp->id_lock);
+	/* Iterate through all existing id_tree_t structs in the AVL tree */
+	for (itp = avl_first(&isp->id_trees); itp != NULL;
+	    itp = AVL_NEXT(&isp->id_trees, itp)) {
+
+		/* Get ID range of current tree */
+		id_t itp_low = itp->idt_offset;
+		id_t itp_high = itp->idt_max + itp_low;
+
+		/* We're past the new range */
+		if (adjusted_high <= itp_low) {
+			break;
+		}
+
+		/* We haven't reached the new range */
+		if (adjusted_low >= itp_high) {
+			continue;
+		}
+
+		/*
+		 * Start of new range is less than start of existing range, but
+		 * end of new range is not less than start of existing range
+		 * --> new range overlaps in SOME way
+		 */
+		if (adjusted_low < itp_low) {
+			if (adjusted_high > itp_high) {
+				/*
+				 * Case 4: old range completely covered - split
+				 * new range, allocate lower part, and keep
+				 * iterating through trees
+				 */
+				new_tree = id_tree_create(B_TRUE, adjusted_low,
+				    itp_low);
+				avl_add(&isp->id_trees, new_tree);
+				if (adjusted_low < isp->id_low) {
+					isp->id_low = adjusted_low;
+				}
+				adjusted_low = itp_high;
+			} else {
+				/* Case 3: truncate end of new range */
+				adjusted_high = itp_low;
+			}
+		} else if (adjusted_high > itp_high) {
+			/* Case 2: truncate beginning of new range */
+			adjusted_low = itp_high;
+		} else {
+			/*
+			 * Case 1: new range completely encompassed by old
+			 * range - do nothing
+			 */
+			mutex_exit(&isp->id_lock);
+			return;
+		}
+	}
+
+	/* Allocate the free range and update space's high and low */
+	new_tree = id_tree_create(B_TRUE, adjusted_low, adjusted_high);
+	avl_add(&isp->id_trees, new_tree);
+	if (adjusted_high > isp->id_high) {
+		isp->id_high = adjusted_high;
+	}
+	if (adjusted_low < isp->id_low) {
+		isp->id_low = adjusted_low;
+	}
+	mutex_exit(&isp->id_lock);
+}
+
+/*
+ * Get index of first 0 in bitfield between [low, high). Returns index in slotp
+ * pointer. Returns B_FALSE if no suitable index exists, B_TRUE otherwise.
+ */
+static boolean_t
+get_open_slot(ulong_t *bitfield, uint32_t low, uint32_t high, uint32_t *slotp)
+{
+	uint32_t long_bits = sizeof (ulong_t) * 8;
+	/* Number of longs in bitfield */
+	uint32_t arr_len = ID_BRANCH_FACTOR / long_bits;
+	uint32_t i;
+	/* Offsets of low and high relative to a single long */
+	uint32_t low_offset = low % long_bits;
+	uint32_t high_offset = high % long_bits;
+	/* Indices of longs for which masks must be applied */
+	uint32_t low_mask_idx = low / long_bits;
+	uint32_t high_mask_idx = high / long_bits;
+
+	ASSERT(low < high);
+
+	for (i = low / long_bits; i < arr_len; i++) {
+		/* Masks for range outside of [low, high) */
+		ulong_t low_mask;
+		ulong_t high_mask;
+
+		if (i > low_mask_idx) {
+			/* Low mask is irrelevant - we're past it */
+			low_mask = 0;
+		} else {
+			/*
+			 * Mask out the start of the long. Avoid shifting by
+			 * long_bits - that is undefined behavior.
+			 */
+			low_mask = (low_offset == 0) ? 0
+			    : -1L << (long_bits - low_offset);
+		}
+
+		if (i < high_mask_idx) {
+			/* High mask is irrelevant - we haven't hit it yet */
+			high_mask = 0;
+		} else if (i > high_mask_idx) {
+			/* We're past the valid range - nothing left here */
+			continue;
+		} else {
+			/*
+			 * Mask out the end of the long. Avoid shifting by
+			 * long_bits - that is undefined behavior.
+			 */
+			high_mask = (high_offset == 0) ? 0
+			    : (1L << (long_bits - high_offset)) - 1;
+		}
+
+		/* Get index of first bit that is a 0, after applying masks */
+		uint32_t result = long_bits
+		    - highbit(~(bitfield[i] | low_mask | high_mask));
+
+		/*
+		 * If the first open index == long_bits, there _is_ no open
+		 * index, so we continue. Otherwise, we return the result,
+		 * adding i * long_bits to account for where we're at in
+		 * the array of longs.
+		 */
+		if (result < long_bits) {
+			ASSERT((i * long_bits) + result >= low);
+			if (slotp != NULL) {
+				*slotp = (i * long_bits) + result;
+			}
+			return (B_TRUE);
+		}
+	}
+	return (B_FALSE);
+}
+
+/*
+ * Sets bit in bitfield to value passed in - 0 or 1.
+ */
+static void
+set_bit(ulong_t *bitfield, uint32_t index, uint32_t value)
+{
+	uint32_t long_bits = sizeof (ulong_t) * 8;
+	ulong_t mask = 1L << (long_bits - (index % long_bits) - 1);
+
+	ASSERT(index < ID_BRANCH_FACTOR);
+	ASSERT((value == 0) || (value == 1));
+
+	if (value) {
+		bitfield[index / long_bits] |= mask;
+	} else {
+		bitfield[index / long_bits] &= ~mask;
+	}
+}
+
+/*
+ * Grows height of tree by one level if tree is currently full but has not
+ * yet reached max number of IDs it is allowed to hold. Returns B_FALSE if the
+ * tree has reached its maximum size or cannot be expanded, B_TRUE otherwise.
+ */
+static boolean_t
+expand_tree(id_tree_t *itp, boolean_t sleep)
+{
+	/* We're not allowed to make the tree bigger */
+	if (itp->idt_size == itp->idt_max) {
+		return (B_FALSE);
+	}
+
+	/*
+	 * Tree is currently full - note that this doesn't cover the case where
+	 * the tree is at its maximum size but the root's bitfield isn't
+	 * totally full - this is handled by the "if" statement above.
+	 */
+	if (!get_open_slot(itp->idt_root->idn_bitfield, 0, ID_BRANCH_FACTOR,
+	    NULL)) {
+		/*
+		 * Make a new root and make the current root the first child of
+		 * the new root.
+		 */
+		id_node_t *new_root = id_node_create(sleep);
+
+		if (new_root == NULL) {
+			return (B_FALSE);
+		}
+		new_root->idn_children[0] = itp->idt_root;
+		set_bit(new_root->idn_bitfield, 0, 1);
+		itp->idt_root = new_root;
+		itp->idt_height++;
+		return (B_TRUE);
+	}
+
+	/* We didn't do anything, but that's ok */
+	return (B_TRUE);
+}
+
+/*
+ * This is where most of the magic happens. Walks the given tree looking for an
+ * ID to allocate or free. Note that the wrap-around behavior DOES NOT occur
+ * in this function - this is because this function only involves a single
+ * id_tree_t, and we must wrap around ALL id_tree_t structs in the AVL tree.
+ * Returns B_TRUE on success, B_FALSE otherwise.
+ *
+ * Arguments:
+ *	itp:	The tree to walk.
+ *	start:	The index of the first ID to consider. For id_alloc, this will
+ *		be continuously incremented, and for id_allocff, it will be 0.
+ *	exact:	Whether or not we're looking for the _exact_ id passed in, or
+ *		just an ID larger than the id passed in. Should be B_TRUE for
+ *		id_alloc_specific and id_free and B_FALSE for everything else.
+ *	alloc:	Whether we're allocating or freeing an ID
+ *	sleep:	Specifies whether we sleep or fail if we can't alloc or free an
+ *		ID.
+ *	idp:	The found ID is returned in this pointer, if we're allocating.
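+ *
+ * A worked example, using the default ID_BRANCH_FACTOR of 128: allocating
+ * exact ID 200 in a tree of height 1 runs two levels. scale_power starts at
+ * 128^2; at level 0 the walk picks slot 200 / 128 == 1 (result += 128), and
+ * at level 1 it picks slot 200 % 128 == 72, for a final result of 200.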
+ */
+static boolean_t
+id_tree_walk(id_tree_t *itp, id_t start, boolean_t exact, boolean_t alloc,
+    boolean_t sleep, id_t *idp)
+{
+	/* Variables for tree traversal */
+	uint32_t idx = start;
+	uint32_t scale_power;
+	uint32_t tree_levels;
+	uint32_t i;
+	id_node_t *curr_node;
+	id_node_t *new_node;
+	/* Variables to keep track of path we've taken */
+	id_node_t *path_nodes[ID_MAX_HEIGHT];
+	uint32_t path_slots[ID_MAX_HEIGHT];
+	boolean_t backtrack = B_FALSE;
+	/* Variables for maintaining next bitfield slot to look for */
+	uint32_t slot;
+	uint32_t maxslot;
+	id_t result = 0;
+
+	/* Add a layer to the tree if we need to */
+	if (alloc && !expand_tree(itp, sleep)) {
+		return (B_FALSE);
+	}
+
+	curr_node = itp->idt_root;
+	tree_levels = itp->idt_height + 1;
+
+	/* scale_power == ID_BRANCH_FACTOR ^ tree_levels */
+	scale_power = 1 << (ID_BRANCH_SHIFT * tree_levels);
+
+	/* Iterate down tree */
+	for (i = 0; i < tree_levels; i++) {
+		/*
+		 * idx is the position of the id we're looking for in the
+		 * subtree whose root is curr_node. curr_node's subtree
+		 * is responsible for scale_power leaves (before scale_power is
+		 * divided by ID_BRANCH_FACTOR).
+		 */
+		idx %= scale_power;
+		scale_power /= ID_BRANCH_FACTOR;
+
+		/*
+		 * maxslot puts an upper limit on any bitfields that should
+		 * never be fully filled because the size of the ID space is
+		 * in-between tree sizes. It will only ever be relevant at the
+		 * right edge of the tree.
+		 */
+		maxslot = min(((itp->idt_max - 1 - result) / scale_power) + 1,
+		    ID_BRANCH_FACTOR);
+
+		if (exact) {
+			/*
+			 * Try to take the exact path down the tree; return
+			 * B_FALSE if we can't.
+			 */
+			slot = idx / scale_power;
+			if (alloc) {
+				uint32_t openslot;
+				boolean_t success =
+				    get_open_slot(curr_node->idn_bitfield,
+				    slot, maxslot, &openslot);
+				if (!success || (openslot != slot)) {
+					return (B_FALSE);
+				}
+			}
+		} else {
+			/* Get next open slot */
+			ASSERT(alloc);	/* Inexact frees are not permitted */
+			boolean_t success =
+			    get_open_slot(curr_node->idn_bitfield,
+			    idx / scale_power, maxslot, &slot);
+			if (success == B_FALSE) {
+				/*
+				 * If we get here, it means we've reached a leaf
+				 * that DOES have open slots, but the open slots
+				 * are all smaller than the "start" ID. This
+				 * means we should check the next leaf over in
+				 * the tree. We do this recursively.
+				 *
+				 * This case should be fairly rare.
+				 */
+				id_t next_start = start + ID_BRANCH_FACTOR -
+				    (start % ID_BRANCH_FACTOR);
+				if (next_start >= itp->idt_max) {
+					return (B_FALSE);
+				}
+				return (id_tree_walk(itp, next_start, exact,
+				    alloc, sleep, idp));
+			}
+			/*
+			 * If the next open slot is larger than the intended
+			 * slot (we didn't get the exact ID we wanted), we
+			 * update idx to correspond to the actual slot that we
+			 * got - this becomes important when idx is scaled on
+			 * the next iteration of the loop.
+			 */
+			if (slot > idx / scale_power) {
+				idx = slot * scale_power;
+			}
+		}
+
+		/* Fill in our path bookkeeping in case we need to backtrack */
+		path_nodes[i] = curr_node;
+		path_slots[i] = slot;
+		/* Accumulate our slot in the final result */
+		result += slot * scale_power;
+
+		/*
+		 * If we're not yet at a leaf, we advance curr_node to the
+		 * child corresponding to slot. If the child hasn't been
+		 * allocated yet, we allocate it.
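+		 * Interior nodes are thus created lazily, only along paths
+		 * that the walk actually takes.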
+		 */
+		if (i != tree_levels - 1) {
+			if (curr_node->idn_children[slot] == NULL) {
+				new_node = id_node_create(sleep);
+				if (!sleep && new_node == NULL) {
+					return (B_FALSE);
+				}
+				curr_node->idn_children[slot] = new_node;
+			}
+			curr_node = curr_node->idn_children[slot];
+		}
+	}
+
+	/* If we're freeing from a fully allocated leaf, we backtrack */
+	if (!alloc &&
+	    !get_open_slot(curr_node->idn_bitfield, 0, maxslot, NULL)) {
+		backtrack = B_TRUE;
+	}
+	/* Update the bit in the leaf */
+	set_bit(curr_node->idn_bitfield, slot, alloc);
+	/* If the leaf has become full after we allocated, we backtrack */
+	if (alloc &&
+	    !get_open_slot(curr_node->idn_bitfield, 0, maxslot, NULL)) {
+		backtrack = B_TRUE;
+	}
+
+	/* Follow our path back up the tree, updating parents as necessary */
+	if (backtrack) {
+		for (i = tree_levels - 1; i >= 1; i--) {
+			curr_node = path_nodes[i];
+			boolean_t child_free =
+			    get_open_slot(curr_node->idn_bitfield, 0, maxslot,
+			    NULL);
+			/*
+			 * If we're allocating and the child is now full, or
+			 * we're freeing and the child is now not full, set
+			 * the bit in the parent accordingly.
+			 */
+			if (alloc != child_free) {
+				slot = path_slots[i - 1];
+				set_bit(path_nodes[i - 1]->idn_bitfield, slot,
+				    alloc);
+			}
+		}
+	}
+
+	/* Update tree size */
+	if (alloc) {
+		itp->idt_size++;
+	} else {
+		itp->idt_size--;
+	}
+	ASSERT(result >= start);	/* No wrapping here */
+	if (alloc && (idp != NULL)) {
+		*idp = result;
+	}
+	return (B_TRUE);
+}
+
+/*
+ * Called by the user-facing ID space functions.
+ * Traverses AVL tree of id_tree_t structs to find suitable tree for ID
+ * allocation, then calls id_tree_walk to allocate an ID. This function is
+ * where the wrap-around from high to low happens. Returns B_TRUE on success,
+ * B_FALSE otherwise.
+ *
+ * Arguments:
+ *	isp:	The ID space to allocate from.
+ *	start:	The index of the first ID to consider. For id_alloc, this will
+ *		be continuously incremented, and for id_allocff, it will be 0.
+ *	exact:	Whether or not we're looking for the _exact_ id passed in, or
+ *		just an ID larger than the id passed in. Should be B_TRUE for
+ *		id_alloc_specific and id_free and B_FALSE for everything else.
+ *		If it's B_FALSE, we'll automatically wrap around to the
+ *		beginning of the AVL tree if there are no IDs larger than start
+ *		that meet our criteria.
+ *	alloc:	Whether we're allocating or freeing an ID
+ *	sleep:	Specifies whether we sleep or fail if we can't alloc or free an
+ *		ID.
+ *	idp:	The found ID is returned in this pointer, if we're allocating.
+ */
+static boolean_t
+id_get_id(id_space_t *isp, id_t start, boolean_t exact, boolean_t alloc,
+    boolean_t sleep, id_t *idp)
+{
+	id_tree_t *itp;
+	uint32_t i;
+	uint32_t start_tree;	/* position of first viable tree */
+	id_t id;
+
+	/* Find first candidate tree */
+	for (itp = avl_first(&isp->id_trees), i = 0; itp != NULL;
+	    itp = AVL_NEXT(&isp->id_trees, itp), i++) {
+
+		start_tree = i;
+		/* This tree has a suitable range - try to allocate an ID */
+		if (itp->idt_offset <= start &&
+		    (itp->idt_max + itp->idt_offset) > start) {
+			/* If this tree is full, move to the next one */
+			if (!id_tree_walk(itp, start - itp->idt_offset, exact,
+			    alloc, sleep, &id)) {
+				break;
+			}
+			/* We found an id! */
+			if (idp != NULL) {
+				*idp = id + itp->idt_offset;
+			}
+			return (B_TRUE);
+		}
+		/*
+		 * We've gone too far, so we should move to the next loop
+		 * because everything is fair game now
+		 */
+		if (itp->idt_offset > start) {
+			break;
+		}
+	}
+
+	/* Only look at other trees if we're not looking for a specific id */
+	if (!exact) {
+		/* First loop through the trees after the starting tree */
+		for (; itp != NULL; itp = AVL_NEXT(&isp->id_trees, itp)) {
+			if (id_tree_walk(itp, 0, B_FALSE, alloc, sleep, &id)) {
+				ASSERT(id + itp->idt_offset != start);
+				if (idp != NULL) {
+					*idp = id + itp->idt_offset;
+				}
+				return (B_TRUE);
+			}
+		}
+
+		/*
+		 * Then loop through the trees BEFORE the starting tree,
+		 * including the starting tree again.
+		 */
+		for (itp = avl_first(&isp->id_trees), i = 0; i <= start_tree;
+		    itp = AVL_NEXT(&isp->id_trees, itp), i++) {
+			if (id_tree_walk(itp, 0, B_FALSE, alloc, sleep, &id)) {
+				ASSERT(id + itp->idt_offset < start);
+				if (idp != NULL) {
+					*idp = id + itp->idt_offset;
+				}
+				return (B_TRUE);
+			}
+		}
+	}
+	/* Everything is totally full */
+	return (B_FALSE);
+}
 
 /*
@@ -110,12 +1041,30 @@ id_space_extend(id_space_t *isp, id_t low, id_t high)
 id_t
 id_alloc(id_space_t *isp)
 {
+	id_t result;
+	boolean_t success;
+
+	mutex_enter(&isp->id_lock);
 #ifdef _KERNEL
-	int flag = VM_SLEEP;
+	/* Try to get an ID until we succeed, sleeping in-between attempts */
+	while (!(success = id_get_id(isp, isp->id_next_free, B_FALSE, B_TRUE,
+	    B_TRUE, &result))) {
+		cv_wait(&isp->id_cond, &isp->id_lock);
+	}
 #else
-	int flag = VM_NOSLEEP;
+	success = id_get_id(isp, isp->id_next_free, B_FALSE, B_TRUE, B_TRUE,
+	    &result);
 #endif
-	return (ADDR_TO_ID(vmem_alloc(isp, 1, flag | VM_NEXTFIT)));
+	if (success) {
+		isp->id_next_free = result + 1;
+		if (isp->id_next_free == isp->id_high) {
+			isp->id_next_free = isp->id_low;
+		}
+	} else {
+		result = -1;
+	}
+	mutex_exit(&isp->id_lock);
+	return (result);
 }
 
 /*
@@ -126,7 +1075,20 @@ id_alloc(id_space_t *isp)
 id_t
 id_alloc_nosleep(id_space_t *isp)
 {
-	return (ADDR_TO_ID(vmem_alloc(isp, 1, VM_NOSLEEP | VM_NEXTFIT)));
+	id_t result;
+
+	mutex_enter(&isp->id_lock);
+	if (id_get_id(isp, isp->id_next_free, B_FALSE, B_TRUE, B_FALSE,
+	    &result)) {
+		isp->id_next_free = result + 1;
+		if (isp->id_next_free == isp->id_high) {
+			isp->id_next_free = isp->id_low;
+		}
+	} else {
+		result = -1;
+	}
+	mutex_exit(&isp->id_lock);
+	return (result);
 }
 
 /*
@@ -136,12 +1098,24 @@ id_alloc_nosleep(id_space_t *isp)
 id_t
 id_allocff(id_space_t *isp)
 {
+	id_t result;
+	boolean_t success;
+
+	mutex_enter(&isp->id_lock);
 #ifdef _KERNEL
-	int flag = VM_SLEEP;
+	/* Try to get an ID until we succeed, sleeping in-between attempts */
+	while (!(success = id_get_id(isp, isp->id_low, B_FALSE, B_TRUE, B_TRUE,
+	    &result))) {
+		cv_wait(&isp->id_cond, &isp->id_lock);
+	}
 #else
-	int flag = VM_NOSLEEP;
+	success = id_get_id(isp, isp->id_low, B_FALSE, B_TRUE, B_TRUE,
+	    &result);
 #endif
-	return (ADDR_TO_ID(vmem_alloc(isp, 1, flag | VM_FIRSTFIT)));
+	if (!success) {
+		result = -1;
+	}
+	mutex_exit(&isp->id_lock);
+	return (result);
 }
 
 /*
@@ -152,7 +1126,15 @@ id_allocff(id_space_t *isp)
 id_t
 id_allocff_nosleep(id_space_t *isp)
 {
-	return (ADDR_TO_ID(vmem_alloc(isp, 1, VM_NOSLEEP | VM_FIRSTFIT)));
+	id_t result;
+
+	mutex_enter(&isp->id_lock);
+	if (!id_get_id(isp, isp->id_low, B_FALSE, B_TRUE, B_FALSE,
+	    &result)) {
+		result = -1;
+	}
+	mutex_exit(&isp->id_lock);
+	return (result);
 }
 
 /*
@@ -162,15 +1144,14 @@ id_allocff_nosleep(id_space_t *isp)
 id_t
 id_alloc_specific_nosleep(id_space_t *isp, id_t id)
 {
-	void *minaddr = ID_TO_ADDR(id);
-	void *maxaddr = ID_TO_ADDR(id + 1);
+	id_t result;
 
-	/*
-	 * Note that even though we're vmem_free()ing this later, it
-	 * should be OK, since there's no quantum cache.
-	 */
-	return (ADDR_TO_ID(vmem_xalloc(isp, 1, 1, 0, 0,
-	    minaddr, maxaddr, VM_NOSLEEP)));
+	mutex_enter(&isp->id_lock);
+	if (!id_get_id(isp, id, B_TRUE, B_TRUE, B_FALSE, &result)) {
+		result = -1;
+	}
+	mutex_exit(&isp->id_lock);
+	return (result);
 }
 
 /*
@@ -180,5 +1161,11 @@ id_alloc_specific_nosleep(id_space_t *isp, id_t id)
 void
 id_free(id_space_t *isp, id_t id)
 {
-	vmem_free(isp, ID_TO_ADDR(id), 1);
+	mutex_enter(&isp->id_lock);
+	(void) id_get_id(isp, id, B_TRUE, B_FALSE, B_FALSE, NULL);
+#ifdef _KERNEL
+	/* Wake up a single waiting thread now that an ID is free */
+	cv_signal(&isp->id_cond);
+#endif
+	mutex_exit(&isp->id_lock);
 }
diff --git a/usr/src/uts/common/os/log_sysevent.c b/usr/src/uts/common/os/log_sysevent.c
index 0b4fe18af321..6fa8670d8c6b 100644
--- a/usr/src/uts/common/os/log_sysevent.c
+++ b/usr/src/uts/common/os/log_sysevent.c
@@ -22,6 +22,7 @@
  * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
  * Copyright 2016 Toomas Soome
+ * Copyright (c) 2017, Joyent, Inc. All rights reserved.
  */
 
 #include
@@ -43,6 +44,7 @@
 #include
 #include
 #include
+#include	/* for doors */
 #include
 
@@ -972,16 +974,14 @@ open_channel(char *channel_name)
 	 * Subscriber id 0 is never allocated, but is used as a reserved id
 	 * by libsysevent
 	 */
-	if ((chan->scd_subscriber_cache = vmem_create(channel_name, (void *)1,
-	    MAX_SUBSCRIBERS + 1, 1, NULL, NULL, NULL, 0,
-	    VM_NOSLEEP | VMC_IDENTIFIER)) == NULL) {
+	if ((chan->scd_subscriber_cache = id_space_create_nosleep(channel_name,
+	    1, MAX_SUBSCRIBERS + 1)) == NULL) {
 		kmem_free(chan, sizeof (sysevent_channel_descriptor_t));
 		return (-1);
 	}
-	if ((chan->scd_publisher_cache = vmem_create(channel_name, (void *)1,
-	    MAX_PUBLISHERS + 1, 1, NULL, NULL, NULL, 0,
-	    VM_NOSLEEP | VMC_IDENTIFIER)) == NULL) {
-		vmem_destroy(chan->scd_subscriber_cache);
+	if ((chan->scd_publisher_cache = id_space_create_nosleep(channel_name,
+	    1, MAX_PUBLISHERS + 1)) == NULL) {
+		id_space_destroy(chan->scd_subscriber_cache);
 		kmem_free(chan, sizeof (sysevent_channel_descriptor_t));
 		return (-1);
 	}
@@ -1027,8 +1027,8 @@ close_channel(char *channel_name)
 		return;
 
 	free_channel_registration(chan);
-	vmem_destroy(chan->scd_subscriber_cache);
-	vmem_destroy(chan->scd_publisher_cache);
+	id_space_destroy(chan->scd_subscriber_cache);
+	id_space_destroy(chan->scd_publisher_cache);
 	kmem_free(chan->scd_channel_name, strlen(chan->scd_channel_name) + 1);
 	if (registered_channels[hash_index] == chan)
@@ -1045,14 +1045,12 @@ bind_common(sysevent_channel_descriptor_t *chan, int type)
 	id_t id;
 
 	if (type == SUBSCRIBER) {
-		id = (id_t)(uintptr_t)vmem_alloc(chan->scd_subscriber_cache, 1,
-		    VM_NOSLEEP | VM_NEXTFIT);
+		id = id_alloc_nosleep(chan->scd_subscriber_cache);
 		if (id <= 0 || id > MAX_SUBSCRIBERS)
 			return (0);
 		chan->scd_subscriber_ids[id] = 1;
 	} else {
-		id = (id_t)(uintptr_t)vmem_alloc(chan->scd_publisher_cache, 1,
-		    VM_NOSLEEP | VM_NEXTFIT);
+		id = id_alloc_nosleep(chan->scd_publisher_cache);
 		if (id <= 0 || id > MAX_PUBLISHERS)
 			return (0);
 		chan->scd_publisher_ids[id] = 1;
@@ -1071,14 +1069,14 @@ unbind_common(sysevent_channel_descriptor_t *chan, int type, id_t id)
 			return (0);
 		(void) remove_all_class(chan, id);
 		chan->scd_subscriber_ids[id] = 0;
-		vmem_free(chan->scd_subscriber_cache, (void *)(uintptr_t)id, 1);
+		id_free(chan->scd_subscriber_cache, id);
 	} else {
 		if (id <= 0 || id > MAX_PUBLISHERS)
 			return (0);
 		if (chan->scd_publisher_ids[id] == 0)
 			return (0);
 		chan->scd_publisher_ids[id] = 0;
-		vmem_free(chan->scd_publisher_cache, (void *)(uintptr_t)id, 1);
+		id_free(chan->scd_publisher_cache, id);
 	}
 
 	return (1);
diff --git a/usr/src/uts/common/sys/id_space.h b/usr/src/uts/common/sys/id_space.h
index 46d25f207fa1..39d872275ea4 100644
--- a/usr/src/uts/common/sys/id_space.h
+++ b/usr/src/uts/common/sys/id_space.h
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2014, Joyent, Inc. All Rights reserved.
+ * Copyright (c) 2017, Joyent, Inc. All rights reserved.
  */
 
 #ifndef _ID_SPACE_H
@@ -32,12 +32,54 @@ extern "C" {
 
 #include
 #include
+#include
+
+#ifdef _KERNEL
 #include
-#include
+#include
+#else
+#include
+#endif
+
+#define	ID_BRANCH_SHIFT	7
+/*
+ * The branch factor MUST BE a power of 2 at least as large as the number of
+ * bits in a ulong_t, so that each node's bitfield fills at least one ulong_t.
+ */
+#define	ID_BRANCH_FACTOR	(1 << ID_BRANCH_SHIFT)
+#define	ID_MAX_HEIGHT	10U
+#define	ID_NAMELEN	30U
+
+typedef struct id_node {
+	ulong_t idn_bitfield[ID_BRANCH_FACTOR / (8 * sizeof (ulong_t))];
+	struct id_node *idn_children[ID_BRANCH_FACTOR];
+} id_node_t;
 
-typedef vmem_t id_space_t;
+typedef struct id_tree {
+	avl_node_t idt_avl;
+	id_node_t *idt_root;
+	size_t idt_height;
+	id_t idt_offset;
+	id_t idt_max;	/* max size, NOT highest value */
+	size_t idt_size;
+} id_tree_t;
+
+typedef struct id_space {
+	char id_name[ID_NAMELEN];
+	avl_tree_t id_trees;	/* an AVL tree of id_tree_t structs */
+	id_t id_next_free;
+	id_t id_high;
+	id_t id_low;
+
+#ifdef _KERNEL
+	kmutex_t id_lock;
+	kcondvar_t id_cond;
+#else
+	mutex_t id_lock;
+#endif
+} id_space_t;
 
 id_space_t *id_space_create(const char *, id_t, id_t);
+#ifdef _KERNEL
+id_space_t *id_space_create_nosleep(const char *, id_t, id_t);
+#endif
 void id_space_destroy(id_space_t *);
 void id_space_extend(id_space_t *, id_t, id_t);
 id_t id_alloc(id_space_t *);
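
For illustration only (not part of the patch): a minimal userland smoke test sketching how a consumer would exercise the interface above. It assumes the userland build of the common id_space.c is linked in with the matching prototypes; the program name and the chosen range are hypothetical, and the expected values follow the next-fit behavior described in the Big Theory Statement.

#include <assert.h>
#include <sys/id_space.h>

int
main(void)
{
	/* Four IDs are available: 100, 101, 102, 103 (high is exclusive). */
	id_space_t *isp = id_space_create("example", 100, 104);

	assert(isp != NULL);
	assert(id_alloc_nosleep(isp) == 100);
	assert(id_alloc_nosleep(isp) == 101);

	id_free(isp, 100);
	/* Next-fit: allocation keeps ascending rather than reusing 100... */
	assert(id_alloc_nosleep(isp) == 102);
	assert(id_alloc_nosleep(isp) == 103);
	/* ...until the top of the range forces a wrap to the freed slot. */
	assert(id_alloc_nosleep(isp) == 100);
	/* The space is now full, so a _nosleep allocation fails with -1. */
	assert(id_alloc_nosleep(isp) == -1);

	id_space_destroy(isp);
	return (0);
}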