diff --git a/_drgn.pyi b/_drgn.pyi
index a8452bb7e..9fcd8e75d 100644
--- a/_drgn.pyi
+++ b/_drgn.pyi
@@ -1898,6 +1898,69 @@ class Symbol:
     kind: Final[SymbolKind]
     """Kind of entity represented by this symbol."""
 
+class SymbolIndex:
+    """
+    A ``SymbolIndex`` contains a static set of symbols and allows efficient
+    lookup by name and address.
+
+    With :meth:`Program.register_symbol_finder()`, you can add a callback to
+    provide custom symbol finding logic. However, in many cases, all that is
+    necessary is to provide drgn with a list of symbols that you know to be part
+    of the program. This object allows you to do that. It efficiently implements
+    the Symbol Finder API given a static set of symbols. For example::
+
+        >>> prog = drgn.Program()
+        >>> symbol = drgn.Symbol("foo", 0x123, 1, drgn.SymbolBinding.GLOBAL, drgn.SymbolKind.OBJECT)
+        >>> finder = drgn.SymbolIndex([symbol])
+        >>> prog.register_symbol_finder("SymbolIndex", finder, enable_index=0)
+        >>> prog.symbols()
+        [Symbol(name='foo', address=0x123, size=0x1, binding=<SymbolBinding.GLOBAL: 2>, kind=<SymbolKind.OBJECT: 1>)]
+        >>> prog.symbol("bar")
+        Traceback (most recent call last):
+          File "<stdin>", line 1, in <module>
+        LookupError: not found
+        >>> prog.symbol("foo")
+        Symbol(name='foo', address=0x123, size=0x1, binding=<SymbolBinding.GLOBAL: 2>, kind=<SymbolKind.OBJECT: 1>)
+        >>> prog.symbol(0x100)
+        Traceback (most recent call last):
+          File "<stdin>", line 1, in <module>
+        LookupError: not found
+        >>> prog.symbol(0x123)
+        Symbol(name='foo', address=0x123, size=0x1, binding=<SymbolBinding.GLOBAL: 2>, kind=<SymbolKind.OBJECT: 1>)
+    """
+
+    def __init__(self, symbols: Iterable[Symbol]) -> None:
+        """
+        Create a ``SymbolIndex`` from an iterable of symbols.
+
+        The returned symbol index satisfies the Symbol Finder API. It supports
+        overlapping symbol address ranges and duplicate symbol names. However,
+        in the case of these sorts of conflicts, it doesn't provide any
+        guarantee on the order of the results, or which result is returned when
+        a single symbol is requested.
+
+        :param symbols: An iterable of symbols
+        The resulting object is callable and is suitable to provide to
+        :meth:`Program.register_symbol_finder()`.
+        """
+
+    def __call__(
+        self,
+        prog: Program,
+        name: Optional[str],
+        address: Optional[int],
+        one: bool,
+    ) -> List[Symbol]:
+        """
+        Look up symbols by name, address, or both.
+
+        :param prog: (unused) the program looking up this symbol
+        :param name: if given, only return symbols with this name
+        :param address: if given, only return symbols spanning this address
+        :param one: if ``True``, limit the result to a single symbol
+        :returns: a list of matching symbols (empty if none are found)
+        """
+
 class SymbolBinding(enum.Enum):
     """
     A ``SymbolBinding`` describes the linkage behavior and visibility of a
@@ -2776,3 +2839,8 @@ def _linux_helper_pid_task(pid: Object, pid_type: IntegerLike) -> Object:
 def _linux_helper_find_task(__ns: Object, __pid: IntegerLike) -> Object: ...
 def _linux_helper_kaslr_offset(__prog: Program) -> int: ...
 def _linux_helper_pgtable_l5_enabled(__prog: Program) -> bool: ...
+def _linux_helper_load_proc_kallsyms(
+    filename: Optional[str] = None,
+    modules: bool = False,
+) -> SymbolIndex: ...
+def _linux_helper_load_builtin_kallsyms(prog: Program) -> SymbolIndex: ...
diff --git a/docs/api_reference.rst b/docs/api_reference.rst
index 2cf3789c7..b3c4d7b22 100644
--- a/docs/api_reference.rst
+++ b/docs/api_reference.rst
@@ -109,6 +109,7 @@ Symbols
 .. drgndoc:: Symbol
 .. drgndoc:: SymbolBinding
 .. drgndoc:: SymbolKind
+.. 
drgndoc:: SymbolIndex Stack Traces ------------ diff --git a/drgn/__init__.py b/drgn/__init__.py index d83c40a8e..5a03f5a30 100644 --- a/drgn/__init__.py +++ b/drgn/__init__.py @@ -70,6 +70,7 @@ StackTrace, Symbol, SymbolBinding, + SymbolIndex, SymbolKind, Thread, Type, @@ -127,6 +128,7 @@ "StackTrace", "Symbol", "SymbolBinding", + "SymbolIndex", "SymbolKind", "Thread", "Type", diff --git a/drgn/helpers/linux/kallsyms.py b/drgn/helpers/linux/kallsyms.py new file mode 100644 index 000000000..448cbe239 --- /dev/null +++ b/drgn/helpers/linux/kallsyms.py @@ -0,0 +1,237 @@ +# Copyright (c) 2024 Oracle and/or its affiliates +# SPDX-License-Identifier: LGPL-2.1-or-later +""" +Kallsyms +-------- + +The ``drgn.helpers.linux.kallsyms`` module contains helpers which allow you to +use the built-in kallsyms symbol table for drgn symbol lookup. Combined with an +alternative type information source, this can enable debugging Linux kernel core +dumps without the corresponding DWARF debuginfo files. Even without type +information, kallsyms can be used to help locate objects, and drgn's low-level +memory reading functions can be used to do basic debugging tasks. 
+"""
+import os
+import re
+from typing import Dict, List, Tuple
+
+from _drgn import (
+    _linux_helper_load_builtin_kallsyms,
+    _linux_helper_load_proc_kallsyms as _load_proc_kallsyms,
+)
+from drgn import (
+    Object,
+    Program,
+    ProgramFlags,
+    Symbol,
+    SymbolBinding,
+    SymbolIndex,
+    SymbolKind,
+)
+from drgn.helpers.linux.module import for_each_module
+
+__all__ = (
+    "load_vmlinux_kallsyms",
+    "load_module_kallsyms",
+)
+
+
+def _vmcoreinfo_symbols(prog: Program) -> Dict[str, int]:
+    vmcoreinfo_data = prog["VMCOREINFO"].string_().decode("ascii")
+    vmcoreinfo_symbols = {}
+    sym_re = re.compile(r"SYMBOL\(([^)]+)\)=([A-Fa-f0-9]+)")
+    for line in vmcoreinfo_data.strip().split("\n"):
+        match = sym_re.fullmatch(line)
+        if match:
+            vmcoreinfo_symbols[match.group(1)] = int(match.group(2), 16)
+    return vmcoreinfo_symbols
+
+
+def _load_builtin_kallsyms(prog: Program) -> SymbolIndex:
+    symbol_reqd = [
+        "kallsyms_names",
+        "kallsyms_token_table",
+        "kallsyms_token_index",
+        "kallsyms_num_syms",
+        "kallsyms_offsets",
+        "kallsyms_relative_base",
+        "kallsyms_addresses",
+        "_stext",
+    ]
+    symbols = _vmcoreinfo_symbols(prog)
+    args = [symbols.get(sym, 0) for sym in symbol_reqd]  # 0 == not in VMCOREINFO
+    return _linux_helper_load_builtin_kallsyms(prog, *args)
+
+
+def load_vmlinux_kallsyms(prog: Program) -> SymbolIndex:
+    """
+    Create a kallsyms index for vmlinux
+
+    This function loads the kallsyms for the core kernel and returns a symbol
+    index. This function does not require that any debuginfo is loaded for the
+    kernel: it either relies on ``/proc/kallsyms`` (which requires running drgn
+    as root) or it parses internal data structures using information found from
+    the VMCOREINFO note (which requires Linux 6.0 or later, or a backport of
+    commit ``f09bddbd86619 ("vmcoreinfo: add kallsyms_num_syms symbol")`` and
+    its dependencies).
+
+    :returns: a symbol index containing kallsyms for the core kernel (vmlinux)
+    """
+    if prog.flags & ProgramFlags.IS_LIVE and os.geteuid() == 0:
+        return _load_proc_kallsyms()
+    else:
+        return _load_builtin_kallsyms(prog)
+
+
+def _nm_type_to_binding_kind(code: str) -> Tuple[SymbolBinding, SymbolKind]:
+    binding = SymbolBinding.UNKNOWN
+    kind = SymbolKind.UNKNOWN
+    if code == "v":
+        binding = SymbolBinding.WEAK
+        kind = SymbolKind.OBJECT
+    elif code == "w":
+        binding = SymbolBinding.WEAK
+    elif code in "tT":
+        kind = SymbolKind.FUNC
+    elif code.lower() in "srbgncd":
+        kind = SymbolKind.OBJECT
+    if binding == SymbolBinding.UNKNOWN and code.isupper():
+        binding = SymbolBinding.GLOBAL
+    return binding, kind
+
+
+def _st_info_to_binding_kind(info: int) -> Tuple[SymbolBinding, SymbolKind]:
+    binding_int = info >> 4
+    STB_WEAK = 2
+    STB_GNU_UNIQUE = 10
+    if binding_int <= STB_WEAK or binding_int == STB_GNU_UNIQUE:
+        binding = SymbolBinding(binding_int + 1)
+    else:
+        binding = SymbolBinding.UNKNOWN
+    type_ = info & 0xF
+    STT_TLS = 6
+    STT_GNU_IFUNC = 10
+    if type_ <= STT_TLS or type_ == STT_GNU_IFUNC:
+        kind = SymbolKind(type_)
+    else:
+        kind = SymbolKind.UNKNOWN
+    return binding, kind
+
+
+def _elf_sym_to_symbol(name: str, obj: Object, has_typetab: bool) -> Symbol:
+    # Linux likes to have the nm(1) character code for its symbols, which it
+    # refers to as the symbol's "type" (this is of course distinct from the ELF
+    # notion of a symbol type, let alone what drgn considers a "type"...).
+    #
+    # Prior to 5439c985c5a8 ("module: Overwrite st_size instead of st_info"),
+    # merged in v5.0, the kernel simply overwrote the "st_info" field with a
+    # single-character code that represents the nm(1) character code for that
+    # symbol. However, starting with that commit, it was switched to overwrite
+    # the "st_size" field instead! This was thankfully fixed in v5.2 with
+    # 1c7651f43777 ("kallsyms: store type information in its own array").
+    #
+    # Unfortunately, this leaves us with three possibilities:
+    # 1. Pre-v5.0: interpret the "st_info" as a character from nm(1) and try to
+    #    infer the kind and bindings.
+    # 2. 5.0-5.2: interpret the "st_info" as normal, but ignore the "st_size"
+    #    field since it is bogus.
+    # 3. 5.2+: both fields are valid, and the nm(1) code is stored in "typetab".
+    #
+    # Case 3 can be determined easily by the presence of "typetab" in "struct
+    # mod_kallsyms". However, cases 1 & 2 are indistinguishable. For our
+    # purposes, it makes more sense to fall back to case 1. After all, neither
+    # 5.0 nor 5.1 were LTS kernels, nor are they actively used by any major
+    # distro. We have no way to deal with 5.0 or 5.1, whereas we can make some
+    # informed guesses for pre-5.0 based on the nm(1) code.
+    if has_typetab:
+        binding, kind = _st_info_to_binding_kind(obj.st_info.value_())
+    else:
+        binding, kind = _nm_type_to_binding_kind(chr(obj.st_info.value_()))
+    return Symbol(  # type: ignore
+        name,
+        obj.st_value.value_(),
+        obj.st_size.value_(),
+        binding,
+        kind,
+    )
+
+
+def _module_kallsyms(module: Object) -> List[Symbol]:
+    """
+    Return a list of symbols for a kernel module
+
+    When compiled with ``CONFIG_KALLSYMS``, the kernel maintains ELF symbol
+    information about each module within ``struct module``. This function
+    accesses this symbol information, and returns a list of drgn :class:`Symbol`
+    objects for the module. Keep in mind that unless ``CONFIG_KALLSYMS_ALL`` is
+    enabled, these symbols are typically only function symbols.
+
+    :param module: :class:`Object` of type ``struct module *``
+    :returns: a list of symbols (empty if the module has no symbol table entries)
+    """
+    try:
+        ks = module.kallsyms
+    except AttributeError:
+        # Prior to 8244062ef1e54 ("modules: fix longstanding /proc/kallsyms vs
+        # module insertion race."), the kallsyms variables were stored directly
+        # on the module object. This commit was introduced in 4.5, but was
+        # backported to some stable kernels too. Fall back to the module object
+        # in cases where kallsyms field isn't available.
+        ks = module
+
+    prog = module.prog_
+    num_symtab = ks.num_symtab.value_()
+    if num_symtab == 0:
+        return []  # nothing to read, and no last entry to index below
+    try:
+        ks.member_("typetab")
+        has_typetab = True
+    except LookupError:
+        has_typetab = False
+
+    # The symtab field is a pointer, but it points at an array of Elf_Sym
+    # objects. Indexing it requires drgn to do pointer arithmetic and issue a
+    # lot of very small /proc/kcore reads, which can be a real performance
+    # issue. So convert it into an object representing a correctly-sized array,
+    # and then read that object all at once. This does one /proc/kcore read,
+    # which is a major improvement!
+    symtab = Object(
+        prog,
+        type=prog.array_type(ks.symtab.type_.type, num_symtab),
+        address=ks.symtab.value_(),
+    ).read_()
+
+    # The strtab is similarly a pointer into a contiguous array of strings packed
+    # next to each other. Reading individual strings from /proc/kcore can be
+    # quite slow. So read the entire array of bytes into a Python bytes value,
+    # and we'll extract the individual symbol strings from there.
+    last_string_start = symtab[num_symtab - 1].st_name.value_()
+    last_string_len = len(ks.strtab[last_string_start].address_of_().string_()) + 1
+    strtab = prog.read(ks.strtab.value_(), last_string_start + last_string_len)
+    syms = []
+    for i in range(num_symtab):
+        elfsym = symtab[i]
+        if not elfsym.st_name:
+            continue
+        str_index = elfsym.st_name.value_()
+        nul_byte = strtab.find(b"\x00", str_index)
+        name = strtab[str_index:nul_byte].decode("ascii")
+        syms.append(_elf_sym_to_symbol(name, elfsym, has_typetab))
+    return syms
+
+
+def load_module_kallsyms(prog: Program) -> SymbolIndex:
+    """
+    Return a symbol index containing all module symbols from kallsyms
+
+    For kernels built with ``CONFIG_KALLSYMS``, loaded kernel modules contain
+    an ELF symbol table in kernel memory. This function can parse those data
+    structures and create a symbol index usable by drgn. However, it requires
+    that you already have debuginfo for the vmlinux image.
+
+    :returns: a symbol index containing all symbols from module kallsyms
+    """
+    all_symbols = []
+    for module in for_each_module(prog):
+        all_symbols.extend(_module_kallsyms(module))
+    return SymbolIndex(all_symbols)
diff --git a/libdrgn/Makefile.am b/libdrgn/Makefile.am
index 5dcb1f964..261d097bf 100644
--- a/libdrgn/Makefile.am
+++ b/libdrgn/Makefile.am
@@ -73,6 +73,8 @@ libdrgnimpl_la_SOURCES = $(ARCH_DEFS_PYS:_defs.py=.c) \
 	helpers.h \
 	io.c \
 	io.h \
+	kallsyms.c \
+	kallsyms.h \
 	language.c \
 	language.h \
 	language_c.c \
@@ -116,6 +118,7 @@ libdrgnimpl_la_SOURCES = $(ARCH_DEFS_PYS:_defs.py=.c) \
 	symbol.h \
 	type.c \
 	type.h \
+	util.c \
 	util.h \
 	vector.h
 
@@ -170,6 +173,7 @@ _drgn_la_SOURCES = python/constants.c \
 	python/program.c \
 	python/stack_trace.c \
 	python/symbol.c \
+	python/symbol_index.c \
 	python/test.c \
 	python/thread.c \
 	python/type.c \
diff --git a/libdrgn/kallsyms.c b/libdrgn/kallsyms.c
new file mode 100644
index 000000000..e7a12fd14
--- /dev/null
+++ b/libdrgn/kallsyms.c
@@ -0,0 +1,661 @@
+// Copyright (c) 2024 Oracle and/or its affiliates
+// SPDX-License-Identifier: LGPL-2.1-or-later
+
+#include
+#include
+
+#include "binary_buffer.h"
+#include "drgn_internal.h"
+#include "kallsyms.h"
+#include "program.h"
+#include "string_builder.h"
+#include "symbol.h"
+
+/**
+ * This struct contains the tables necessary to reconstruct kallsyms names.
+ *
+ * vmlinux (core kernel) kallsyms names are compressed using table compression.
+ * There is some description of it in the kernel's "scripts/kallsyms.c", but
+ * this is a brief overview that should make the code below comprehensible.
+ *
+ * Table compression uses the remaining 128 characters not defined by ASCII and
+ * maps them to common substrings (e.g. the prefix "write_"). Each name is
+ * represented as a sequence of bytes which refers to strings in this table.
+ * The two arrays below comprise this table:
+ *
+ * - token_table: this is one long string with all of the tokens concatenated
+ *   together, e.g. "a\0b\0c\0...z\0write_\0read_\0..."
+ * - token_index: this is a 256-entry long array containing the index into
+ *   token_table where you'll find that token's string.
+ *
+ * To decode a string, for each byte you simply index into token_index, then use
+ * that to index into token_table, and copy that string into your buffer.
+ *
+ * The actual kallsyms symbol names are concatenated into a buffer called
+ * "names". The first byte in a name is the length (in tokens, not decoded
+ * bytes) of the symbol name. The remaining "length" bytes are decoded via the
+ * table as described above. The first decoded byte is a character representing
+ * what type of symbol this is (e.g. text, data structure, etc).
+ */
+struct kallsyms_reader {
+	uint32_t num_syms;		// number of symbols, read from kallsyms_num_syms
+	uint8_t *names;			// malloc'd copy of the kallsyms_names array
+	size_t names_len;		// size of names in bytes
+	char *token_table;		// malloc'd copy of kallsyms_token_table
+	size_t token_table_len;		// size of token_table in bytes
+	uint16_t *token_index;		// malloc'd copy of kallsyms_token_index (256 entries)
+	bool long_names;		// result of guess_long_names(): 2-byte lengths possible
+};
+
+/*
+ * Kallsyms doesn't include symbol length. We determine symbol length by the
+ * start of the subsequent symbol. Unfortunately, there can be large gaps in
+ * the symbol table, for instance on x86_64 the Linux kernel has percpu symbols
+ * near the beginning of the address space, and a large gap before normal kernel
+ * symbols. The result of this is that we can create symbols with incredibly
+ * large sizes, and then drgn's symbolization will print addresses using that
+ * symbol and a very large offset, which is absolutely meaningless.
+ *
+ * To avoid this, we set a cap on the length of a symbol. Unfortunately, this is
+ * a heuristic. It's entirely possible to have very large data symbols. This
+ * value is chosen somewhat arbitrarily, but seems to produce decent results.
+ */
+#define MAX_SYMBOL_LENGTH 0x10000
+
+/*
+ * Since 73bbb94466fd3 ("kallsyms: support "big" kernel symbols"), the
+ * "kallsyms_names" array may use the most significant bit to indicate that the
+ * initial element for each symbol (normally representing the number of tokens
+ * in the symbol) requires two bytes.
+ *
+ * Unfortunately, that means that values 128-255 are now ambiguous: on older
+ * kernels, they should be interpreted literally, but on newer kernels, they
+ * require treating as a two byte sequence. Since the commit included no changes
+ * to the symbol names or vmcoreinfo, there's no way to detect it except via
+ * heuristics.
+ *
+ * The commit in question is a new feature and not likely to be backported to
+ * stable, so our heuristic is that it was first included in kernel 6.1.
+ * However, we first check the environment variable DRGN_KALLSYMS_LONG: if it
+ * exists, then we use its first character to determine our behavior: 1, y, Y
+ * all indicate that we should use long names. 0, n, N all indicate that we
+ * should not. Any other first character falls through to the version heuristic.
+ */
+static bool guess_long_names(struct drgn_program *prog)
+{
+	const char *env = getenv("DRGN_KALLSYMS_LONG");
+	if (env) {
+		if (*env == '1' || *env == 'y' || *env == 'Y')
+			return true;
+		else if (*env == '0' || *env == 'n' || *env == 'N')
+			return false;
+	}
+
+	char *p = prog->vmcoreinfo.osrelease;	// parsed as "<major>.<minor>..."
+	long major = strtol(p, &p, 10);
+	long minor = 0;
+	if (*p == '.')
+		minor = strtol(p + 1, NULL, 10);
+
+	return (major == 6 && minor >= 1) || major > 6;	// i.e. version >= 6.1
+}
+
+/**
+ * Copy the kallsyms names tables from the program into host memory.
+ * @param prog Program to read from
+ * @param kr kallsyms_reader to populate
+ * @param loc Addresses of the kallsyms variables in the program
+ */
+static struct drgn_error *
+kallsyms_copy_tables(struct drgn_program *prog, struct kallsyms_reader *kr,
+		     struct kallsyms_locations *loc)
+{
+	struct drgn_error *err;
+	const size_t token_index_size = (UINT8_MAX + 1) * sizeof(uint16_t);
+	uint64_t last_token;
+	size_t names_idx;
+	char data;
+	uint8_t len_u8;
+	int len;
+	bool bswap;
+
+	err = drgn_program_bswap(prog, &bswap);
+	if (err)
+		return err;
+
+	// Read num_syms from vmcore (bswap is done for us already)
+	err = drgn_program_read_u32(prog,
+				    loc->kallsyms_num_syms,
+				    false, &kr->num_syms);
+	if (err)
+		return err;
+
+	// Read the constant-sized token_index table (256 entries)
+	kr->token_index = malloc(token_index_size);
+	if (!kr->token_index)
+		return &drgn_enomem;
+	err = drgn_program_read_memory(prog, kr->token_index,
+				       loc->kallsyms_token_index,
+				       token_index_size, false);
+	if (err)
+		return err;
+	if (bswap)
+		for (size_t i = 0; i <= UINT8_MAX; i++)	// table size, not num_syms
+			kr->token_index[i] = bswap_16(kr->token_index[i]);
+
+	// Find the end of the last token, so we get the overall length of
+	// token_table (NUL included). Then copy token_table into host memory.
+	last_token = loc->kallsyms_token_table + kr->token_index[UINT8_MAX];
+	do {
+		err = drgn_program_read_memory(prog, &data,
+					       last_token, 1, false);
+		if (err)
+			return err;
+
+		last_token++;
+	} while (data);
+	kr->token_table_len = last_token - loc->kallsyms_token_table;
+	kr->token_table = malloc(kr->token_table_len);
+	if (!kr->token_table)
+		return &drgn_enomem;
+	err = drgn_program_read_memory(prog, kr->token_table,
+				       loc->kallsyms_token_table,
+				       kr->token_table_len, false);
+	if (err)
+		return err;
+
+	// Ensure that all members of token_index are in-bounds for indexing
+	// into token_table (whose final byte is the NUL found above).
+	for (size_t i = 0; i <= UINT8_MAX; i++)
+		if (kr->token_index[i] >= kr->token_table_len)
+			return drgn_error_format(DRGN_ERROR_OTHER,
+						 "kallsyms: token_index out of bounds (token_index[%zu] = %u >= %zu)",
+						 i, kr->token_index[i], kr->token_table_len);
+
+	// Now find the end of the names array by skipping through it, then copy
+	// that into host memory.
+	names_idx = 0;
+	kr->long_names = guess_long_names(prog);
+	for (size_t i = 0; i < kr->num_syms; i++) {
+		err = drgn_program_read_u8(prog,
+					   loc->kallsyms_names + names_idx,
+					   false, &len_u8);
+		if (err)
+			return err;
+		len = len_u8;
+		if ((len & 0x80) && kr->long_names) {
+			if (__builtin_add_overflow(names_idx, 1, &names_idx))
+				return drgn_error_create(DRGN_ERROR_OTHER,
+							 "couldn't find end of kallsyms_names");
+			err = drgn_program_read_u8(prog,
+						   loc->kallsyms_names + names_idx,
+						   false, &len_u8);
+			if (err)
+				return err;
+			// 73bbb94466fd3 ("kallsyms: support "big" kernel
+			// symbols") mentions that ULEB128 is used, but only
+			// implements the ability to encode lengths with 2
+			// bytes, for a maximum value of 16k. It's possible in
+			// the future we may need to support larger sizes, but
+			// it's difficult to predict the future of the kallsyms
+			// format. For now, just check that there's no third
+			// byte to the length.
+			if (len_u8 & 0x80)
+				return drgn_error_format(
+					DRGN_ERROR_OTHER,
+					"Unexpected 3-byte length encoding in kallsyms names"
+				);
+			len = (len & 0x7F) | (len_u8 << 7);
+		}
+		if (__builtin_add_overflow(names_idx, len + 1, &names_idx))
+			return drgn_error_format(
+				DRGN_ERROR_OTHER, "couldn't find end of kallsyms_names");
+	}
+	kr->names_len = names_idx;
+	kr->names = malloc(names_idx);
+	if (!kr->names)
+		return &drgn_enomem;
+	err = drgn_program_read_memory(prog, kr->names,
+				       loc->kallsyms_names,
+				       names_idx, false);
+	if (err)
+		return err;
+
+	return NULL;
+}
+
+static struct drgn_error *kallsyms_binary_buffer_error(struct binary_buffer *bb,
+							const char *pos,
+							const char *message)
+{
+	return drgn_error_format(DRGN_ERROR_OTHER,
+				 "couldn't parse kallsyms: %s", message);
+}
+
+/**
+ * Extract the symbol name and type
+ * @param kr Registry containing kallsyms data
+ * @param names_bb A binary buffer tracking our position within the
+ * `kallsyms_names` array
+ * @param sb Buffer to write output symbol to
+ * @param[out] kind_ret Where to write the symbol kind data
+ * @returns NULL on success, or an error
+ */
+static struct drgn_error *
+kallsyms_expand_symbol(struct kallsyms_reader *kr,
+		       struct binary_buffer *names_bb,
+		       struct string_builder *sb, char *kind_ret)
+{
+	uint64_t len;
+	struct drgn_error *err = binary_buffer_next_uleb128(names_bb, &len);
+	if (err)
+		return err;
+
+	const uint8_t *data = (uint8_t *)names_bb->pos;
+	err = binary_buffer_skip(names_bb, len);
+	if (err)
+		return err;
+
+	bool skipped_first = false;
+
+	while (len) {
+		char *token_ptr = &kr->token_table[kr->token_index[*data]];
+		while (*token_ptr) {
+			if (skipped_first) {
+				if (!string_builder_appendc(sb, *token_ptr))
+					return &drgn_enomem;
+			} else {
+				*kind_ret = *token_ptr;
+				skipped_first = true;
+			}
+			token_ptr++;
+		}
+
+		data++;
+		len--;
+	}
+
+	if (!string_builder_null_terminate(sb))
+		return &drgn_enomem;
+	return NULL;
+}
+
+/**
+ * Used to find _stext in the kallsyms
before we've moved everything into
+ * the drgn_symbol_index. Stores the index of the symbol named @a name in @a ret.
+ */
+static struct drgn_error *
+search_for_string(struct kallsyms_reader *kr, const char *name, ssize_t *ret)
+{
+	STRING_BUILDER(sb);
+	size_t len = strlen(name);
+	struct binary_buffer names_bb;
+	binary_buffer_init(&names_bb, kr->names, kr->names_len, false,
+			   kallsyms_binary_buffer_error);
+	for (ssize_t i = 0; i < kr->num_syms; i++) {
+		char kind;
+		sb.len = 0;
+		struct drgn_error *err =
+			kallsyms_expand_symbol(kr, &names_bb, &sb, &kind);
+		if (err)
+			return err;
+		if (sb.len == len && strcmp(name, sb.str) == 0) {
+			*ret = i;
+			return NULL;
+		}
+	}
+	return drgn_error_format(DRGN_ERROR_OTHER,
+				 "Could not find '%s' symbol in kallsyms", name);
+}
+
+static void symbol_from_kallsyms(uint64_t address, char *name, char kind,
+				 uint64_t size, struct drgn_symbol *ret)
+{
+	char kind_lower = tolower((unsigned char)kind);	// <ctype.h> needs uchar range
+	ret->name = name;
+	ret->address = address;
+	ret->size = size;
+	ret->binding = DRGN_SYMBOL_BINDING_GLOBAL;
+
+	// See nm(1) for information on decoding this "kind" character
+	if (kind == 'u')
+		ret->binding = DRGN_SYMBOL_BINDING_UNIQUE;
+	else if (kind_lower == 'v' || kind_lower == 'w')
+		ret->binding = DRGN_SYMBOL_BINDING_WEAK;
+	else if (isupper((unsigned char)kind))
+		ret->binding = DRGN_SYMBOL_BINDING_GLOBAL;
+	else
+		// If lowercase, the symbol is usually local, but it's
+		// not guaranteed. Use unknown for safety here.
+		ret->binding = DRGN_SYMBOL_BINDING_UNKNOWN;
+
+	switch (kind_lower) {
+	case 'b': // bss
+	case 'c': // uninitialized data
+	case 'd': // initialized data
+	case 'g': // initialized data (small objects)
+	case 'r': // read-only data
+	case 'v': // weak object (guaranteed by elf_info() in kernel/module.c)
+		ret->kind = DRGN_SYMBOL_KIND_OBJECT;
+		break;
+	case 't': // text
+		ret->kind = DRGN_SYMBOL_KIND_FUNC;
+		break;
+	default:
+		ret->kind = DRGN_SYMBOL_KIND_UNKNOWN;
+	}
+	ret->name_lifetime = DRGN_LIFETIME_STATIC;
+	ret->lifetime = DRGN_LIFETIME_STATIC; // avoid copying
+}
+
+/** Compute an address via the CONFIG_KALLSYMS_ABSOLUTE_PERCPU method*/
+static uint64_t absolute_percpu(uint64_t base, int32_t val)
+{
+	if (val >= 0)
+		return (uint64_t) val;
+	else
+		return base - 1 - val;
+}
+
+/**
+ * Load the kallsyms address information from @a prog
+ *
+ * Just as symbol name loading is complex, so is address loading. Addresses may
+ * be stored directly as an array of pointers, but more commonly, they are
+ * stored as an array of 32-bit integers which are related to an offset. This
+ * function decodes the addresses into a plain array of 64-bit addresses.
+ *
+ * @param prog The program to read from
+ * @param kr The kallsyms_reader filled in by kallsyms_copy_tables()
+ * @param loc Addresses of the kallsyms variables in the program
+ * @returns NULL on success (a malloc'd array is stored in @a ret), or error
+ */
+static struct drgn_error *
+kallsyms_load_addresses(struct drgn_program *prog, struct kallsyms_reader *kr,
+			struct kallsyms_locations *loc, uint64_t **ret)
+{
+	struct drgn_error *err = NULL;
+	bool bswap, bits64;
+	_cleanup_free_ uint32_t *addr32 = NULL;
+
+	err = drgn_program_bswap(prog, &bswap);
+	if (err)
+		return err;
+	err = drgn_program_is_64_bit(prog, &bits64);
+	if (err)
+		return err;
+
+	_cleanup_free_ uint64_t *addresses =
+		malloc_array(kr->num_syms, sizeof(addresses[0]));
+	if (!addresses)
+		return &drgn_enomem;
+
+	if (loc->kallsyms_addresses) {
+		/*
+		 * The kallsyms addresses are stored as plain addresses in an
+		 * array of unsigned long! Read the appropriate size array and
+		 * do any necessary byte swaps.
+		 */
+		if (bits64) {
+			err = drgn_program_read_memory(prog, addresses,
+						       loc->kallsyms_addresses,
+						       kr->num_syms * sizeof(addresses[0]),
+						       false);
+			if (err)
+				return err;
+			if (bswap)
+				for (size_t i = 0; i < kr->num_syms; i++)
+					addresses[i] = bswap_64(addresses[i]);
+		} else {
+			addr32 = malloc_array(kr->num_syms, sizeof(addr32[0]));
+			if (!addr32)
+				return &drgn_enomem;
+
+			err = drgn_program_read_memory(prog, addr32,
+						       loc->kallsyms_addresses,
+						       kr->num_syms * sizeof(addr32[0]),
+						       false);
+			if (err)
+				return err;
+			for (size_t i = 0; i < kr->num_syms; i++) {
+				if (bswap)
+					addresses[i] = bswap_32(addr32[i]);
+				else
+					addresses[i] = addr32[i];
+			}
+		}
+	} else {
+		/*
+		 * The kallsyms addresses are stored in an array of 4-byte
+		 * values, which can be interpreted in two ways:
+		 * (1) if CONFIG_KALLSYMS_ABSOLUTE_PERCPU is enabled, then
+		 * positive values are addresses, and negative values are
+		 * offsets from a base address.
+		 * (2) otherwise, the 4-byte values are unsigned offsets that
+		 * are added to the base address
+		 * First, read the values, then figure out which way to
+		 * interpret them.
+		 */
+		uint64_t relative_base;
+		if (bits64) {
+			// performs the bswap for us, if necessary
+			err = drgn_program_read_u64(prog, loc->kallsyms_relative_base,
+						    false, &relative_base);
+			if (err)
+				return err;
+		} else {
+			uint32_t rel32;
+			// performs the bswap for us, if necessary
+			err = drgn_program_read_u32(prog, loc->kallsyms_relative_base,
+						    false, &rel32);
+			if (err)
+				return err;
+			relative_base = rel32;
+		}
+		addr32 = malloc_array(kr->num_syms, sizeof(addr32[0]));
+		if (!addr32)
+			return &drgn_enomem;
+
+		err = drgn_program_read_memory(prog, addr32,
+					       loc->kallsyms_offsets,
+					       kr->num_syms * sizeof(uint32_t),
+					       false);
+		if (err)
+			return err;
+		if (bswap)
+			for (size_t i = 0; i < kr->num_syms; i++)
+				addr32[i] = bswap_32(addr32[i]);
+
+		/*
+		 * Now that we've read the offsets data, we need to determine
+		 * how to interpret them. To do this, use the _stext symbol. We
+		 * have the correct value from vmcoreinfo. Compute it both ways
+		 * and pick the correct interpretation.
+		 */
+		ssize_t stext_idx;
+		err = search_for_string(kr, "_stext", &stext_idx);
+		if (err)
+			return err;
+		uint64_t stext_abs = relative_base + addr32[stext_idx];
+		uint64_t stext_pcpu = absolute_percpu(relative_base, (int32_t)addr32[stext_idx]);
+		if (stext_abs == loc->_stext) {
+			for (size_t i = 0; i < kr->num_syms; i++)
+				addresses[i] = relative_base + addr32[i];
+		} else if (stext_pcpu == loc->_stext) {
+			for (size_t i = 0; i < kr->num_syms; i++)
+				addresses[i] = absolute_percpu(relative_base, (int32_t)addr32[i]);
+		} else {
+			// Neither interpretation reproduces the _stext address
+			// given in loc, so the offsets cannot be decoded.
+			return drgn_error_create(
+				DRGN_ERROR_OTHER,
+				"Unable to interpret kallsyms address data");
+		}
+	}
+	*ret = no_cleanup_ptr(addresses);
+	return NULL;
+}
+
+static void kallsyms_reader_cleanup(struct kallsyms_reader *kr)
+{
+	free(kr->names);
+	free(kr->token_index);
+	free(kr->token_table);
+}
+
+struct drgn_error *
+drgn_load_builtin_kallsyms(struct drgn_program *prog,
+			   struct kallsyms_locations *loc,
+			   struct drgn_symbol_index *ret)
+{
+	if (!(loc->kallsyms_names && loc->kallsyms_token_table
+	      && loc->kallsyms_token_index && loc->kallsyms_num_syms))
+		return drgn_error_create(
+			DRGN_ERROR_MISSING_DEBUG_INFO,
+			"The symbols: kallsyms_names, kallsyms_token_table, "
+			"kallsyms_token_index, and kallsyms_num_syms were not "
+			"found in VMCOREINFO. There is not enough "
+			"information to load the kallsyms table."
+		);
+
+	_cleanup_(kallsyms_reader_cleanup) struct kallsyms_reader kr = {};
+
+	struct drgn_error *err = kallsyms_copy_tables(prog, &kr, loc);
+	if (err)
+		return err;
+
+	_cleanup_free_ uint64_t *addresses = NULL;
+	err = kallsyms_load_addresses(prog, &kr, loc, &addresses);
+	if (err)
+		return err;
+
+	_cleanup_(drgn_symbol_index_builder_deinit)
+		struct drgn_symbol_index_builder builder;
+	drgn_symbol_index_builder_init(&builder);
+	STRING_BUILDER(sb);
+
+	struct binary_buffer names_bb;
+	binary_buffer_init(&names_bb, kr.names, kr.names_len, false,
+			   kallsyms_binary_buffer_error);
+	for (size_t i = 0; i < kr.num_syms; i++) {
+		struct drgn_symbol symbol;
+		char kind;
+		uint64_t size = 0;
+		sb.len = 0;
+		err = kallsyms_expand_symbol(&kr, &names_bb, &sb, &kind);
+		if (err)
+			return err;
+		if (sb.len == 0)
+			return drgn_error_format(DRGN_ERROR_OTHER,
+						 "error: zero-length symbol in kallsyms");
+		if (i + 1 < kr.num_syms &&
+		    addresses[i + 1] - addresses[i] < MAX_SYMBOL_LENGTH)
+			size = addresses[i + 1] - addresses[i];
+		symbol_from_kallsyms(addresses[i], sb.str, kind, size, &symbol);
+		if (!drgn_symbol_index_builder_add(&builder, &symbol))
+			return &drgn_enomem;
+	}
+
+	return drgn_symbol_index_init_from_builder(ret, &builder);
+}
+
+/** Load kallsyms from @a filename, a file in the /proc/kallsyms format */
+struct drgn_error *drgn_load_proc_kallsyms(const char *filename, bool modules,
+					   struct drgn_symbol_index *ret)
+{
+	_cleanup_fclose_ FILE *fp = fopen(filename, "r");
+	if (!fp)
+		return drgn_error_create_os("fopen", errno, filename);
+
+	struct drgn_error *err = NULL;
+	struct drgn_symbol sym = {};
+	_cleanup_(drgn_symbol_index_builder_deinit)
+		struct drgn_symbol_index_builder builder;
+	drgn_symbol_index_builder_init(&builder);
+	_cleanup_free_ char *line = NULL;
+	_cleanup_free_ char *current_module = NULL;
+	size_t line_size = 0, line_number = 1;
+	ssize_t res;
+	while ((res = getline(&line, &line_size, fp)) != -1) {
+		char *save = NULL;
+		char *name, *type_str, *mod, *addr_rem, *addr_str;
+		char type;
+		uint64_t addr;
+		bool new_module = false;
+
+		addr_str = strtok_r(line, " \t\r\n", &save);
+		type_str = strtok_r(NULL, " \t\r\n", &save);
+		name = strtok_r(NULL, " \t\r\n", &save);
+		mod = strtok_r(NULL, " \t\r\n", &save);
+
+		if (!addr_str || !type_str || !name) {
+			err = drgn_error_format(DRGN_ERROR_OTHER,
+						"Error parsing kallsyms line %zu",
+						line_number);
+			break;
+		}
+		if (mod && !modules) {
+			break;
+		} else if (mod && (!current_module || strcmp(mod, current_module) != 0)) {
+			free(current_module);
+			current_module = strdup(mod);
+			new_module = true;
+			if (!current_module) {
+				err = &drgn_enomem;
+				break;
+			}
+		}
+
+		type = *type_str;
+		addr = strtoull(addr_str, &addr_rem, 16);
+		if (*addr_rem) {
+			// addr_rem should be set to the first un-parsed character, and
+			// since the entire string should be a valid base 16 integer,
+			// we expect it to be \0
+			err = drgn_error_format(DRGN_ERROR_OTHER,
+						"Invalid address \"%s\" in kallsyms line %zu",
+						addr_str, line_number);
+			break;
+		}
+
+		// We now may know the size of the previous symbol, so long as
+		// that symbol was in the same module as the current one.
+		// Otherwise we'll leave it as zero. Note that for module
+		// kallsyms, it has been observed that addresses are not always
+		// increasing, even within the same module, so we need to be
+		// careful to avoid overflow here.
+		if (!new_module && addr > sym.address) {
+			uint64_t size = addr - sym.address;
+			if (size < MAX_SYMBOL_LENGTH)
+				sym.size = size;
+		}
+		if (sym.name && !drgn_symbol_index_builder_add(&builder, &sym)) {
+			err = &drgn_enomem;
+			break;
+		}
+		free((char *)sym.name);
+
+		symbol_from_kallsyms(addr, name, type, 0, &sym);
+
+		// Copy the name so we don't clobber it in the next iteration
+		sym.name = strdup(name);
+		if (!sym.name) {
+			err = &drgn_enomem;
+			break;
+		}
+
+		line_number++;
+	}
+
+	if (!err && ferror(fp))
+		err = drgn_error_create_os("Error reading kallsyms", errno, filename);
+
+	// Append the final symbol
+	if (!err && sym.name && !drgn_symbol_index_builder_add(&builder, &sym))
+		err = &drgn_enomem;
+	free((char *)sym.name);
+
+	if (!err)
+		err = drgn_symbol_index_init_from_builder(ret, &builder);
+	return err;
+}
diff --git a/libdrgn/kallsyms.h b/libdrgn/kallsyms.h
new file mode 100644
index 000000000..cc95be488
--- /dev/null
+++ b/libdrgn/kallsyms.h
@@ -0,0 +1,46 @@
+// Copyright (c) 2024 Oracle and/or its affiliates
+// SPDX-License-Identifier: LGPL-2.1-or-later
+
+/**
+ * @file
+ *
+ * Kallsyms data handling
+ *
+ * See @ref Kallsyms
+ */
+
+#ifndef DRGN_KALLSYMS_H
+#define DRGN_KALLSYMS_H
+
+#include
+#include
+
+#include "hash_table.h"
+#include "symbol.h"
+
+struct kallsyms_locations {
+	uint64_t kallsyms_names;
+	uint64_t kallsyms_token_table;
+	uint64_t kallsyms_token_index;
+	uint64_t kallsyms_num_syms;
+	uint64_t kallsyms_offsets;
+	uint64_t kallsyms_relative_base;
+	uint64_t kallsyms_addresses;
+	uint64_t _stext;
+};
+
+/**
+ * Initialize a symbol index containing symbols from a /proc/kallsyms format file
+ */
+struct drgn_error *drgn_load_proc_kallsyms(const char *filename, bool modules,
+					   struct drgn_symbol_index *ret);
+
+/**
+ * Initialize a symbol index containing symbols from built-in kallsyms tables
+ */
+struct drgn_error *
+drgn_load_builtin_kallsyms(struct drgn_program *prog,
+			   struct kallsyms_locations *loc,
+			   struct drgn_symbol_index *ret);
+ +#endif // DRGN_KALLSYMS_H diff --git a/libdrgn/orc_info.c b/libdrgn/orc_info.c index 31793137c..8a82a72c9 100644 --- a/libdrgn/orc_info.c +++ b/libdrgn/orc_info.c @@ -54,10 +54,9 @@ drgn_raw_orc_entry_is_terminator(struct drgn_module *module, unsigned int i) } } -static _Thread_local struct drgn_module *compare_orc_entries_module; -static int compare_orc_entries(const void *a, const void *b) +static int compare_orc_entries(const void *a, const void *b, void *arg) { - struct drgn_module *module = compare_orc_entries_module; + struct drgn_module *module = arg; unsigned int index_a = *(unsigned int *)a; unsigned int index_b = *(unsigned int *)b; @@ -340,7 +339,6 @@ static struct drgn_error *drgn_debug_info_parse_orc(struct drgn_module *module) for (unsigned int i = 0; i < num_entries; i++) indices[i] = i; - compare_orc_entries_module = module; /* * Sort the ORC entries for binary search. Since Linux kernel commit * f14bf6a350df ("x86/unwind/orc: Remove boot-time ORC unwind tables @@ -348,9 +346,9 @@ static struct drgn_error *drgn_debug_info_parse_orc(struct drgn_module *module) * it if necessary. */ for (unsigned int i = 1; i < num_entries; i++) { - if (compare_orc_entries(&indices[i - 1], &indices[i]) > 0) { - qsort(indices, num_entries, sizeof(indices[0]), - compare_orc_entries); + if (compare_orc_entries(&indices[i - 1], &indices[i], module) > 0) { + qsort_arg(indices, num_entries, sizeof(indices[0]), + compare_orc_entries, module); break; } } diff --git a/libdrgn/python/drgnpy.h b/libdrgn/python/drgnpy.h index 6c892512a..7cfcfad9a 100644 --- a/libdrgn/python/drgnpy.h +++ b/libdrgn/python/drgnpy.h @@ -18,6 +18,7 @@ #include "../hash_table.h" #include "../pp.h" #include "../program.h" +#include "../symbol.h" /* These were added in Python 3.7. 
*/ #ifndef Py_UNREACHABLE @@ -108,6 +109,11 @@ typedef struct { PyObject *attr_cache; } DrgnType; +typedef struct { + PyObject_HEAD + struct drgn_symbol_index index; +} SymbolIndex; + typedef struct { PyObject_HEAD /* @@ -242,6 +248,7 @@ extern PyTypeObject Register_type; extern PyTypeObject StackFrame_type; extern PyTypeObject StackTrace_type; extern PyTypeObject Symbol_type; +extern PyTypeObject SymbolIndex_type; extern PyTypeObject Thread_type; extern PyTypeObject ThreadIterator_type; extern PyTypeObject TypeEnumerator_type; @@ -304,6 +311,8 @@ Program *program_from_kernel(PyObject *self); Program *program_from_pid(PyObject *self, PyObject *args, PyObject *kwds); PyObject *Symbol_wrap(struct drgn_symbol *sym, PyObject *name_obj); +PyObject *Symbol_list_wrap(struct drgn_symbol **symbols, size_t count, + PyObject *name_obj); PyObject *Thread_wrap(struct drgn_thread *drgn_thread); @@ -344,6 +353,8 @@ struct index_arg { }; int index_converter(PyObject *o, void *p); +int u64_converter(PyObject *o, void *p); + struct path_arg { bool allow_fd; bool allow_none; @@ -387,5 +398,9 @@ DrgnObject *drgnpy_linux_helper_pid_task(PyObject *self, PyObject *args, DrgnObject *drgnpy_linux_helper_find_task(PyObject *self, PyObject *args); PyObject *drgnpy_linux_helper_kaslr_offset(PyObject *self, PyObject *arg); PyObject *drgnpy_linux_helper_pgtable_l5_enabled(PyObject *self, PyObject *arg); +PyObject *drgnpy_linux_helper_load_proc_kallsyms(PyObject *self, PyObject *args, + PyObject *kwds); +PyObject *drgnpy_linux_helper_load_builtin_kallsyms(PyObject *self, PyObject *args, + PyObject *kwds); #endif /* DRGNPY_H */ diff --git a/libdrgn/python/helpers.c b/libdrgn/python/helpers.c index 82d03018e..bc843c286 100644 --- a/libdrgn/python/helpers.c +++ b/libdrgn/python/helpers.c @@ -3,6 +3,7 @@ #include "drgnpy.h" #include "../helpers.h" +#include "../kallsyms.h" #include "../program.h" PyObject *drgnpy_linux_helper_direct_mapping_offset(PyObject *self, PyObject *arg) @@ -291,3 +292,56 @@ 
PyObject *drgnpy_linux_helper_pgtable_l5_enabled(PyObject *self, PyObject *arg)
 		return PyErr_Format(PyExc_ValueError, "not Linux kernel");
 	Py_RETURN_BOOL(prog->prog.vmcoreinfo.pgtable_l5_enabled);
 }
+
+/*
+ * Python binding for drgn_load_proc_kallsyms(): build a SymbolIndex from a
+ * kallsyms-format file.
+ *
+ * Keyword arguments:
+ *   filename: path to read; None or omitted means "/proc/kallsyms"
+ *   modules:  truth value passed through as the "modules" flag
+ *
+ * Returns a new SymbolIndex, or NULL with a Python exception set.
+ */
+PyObject *drgnpy_linux_helper_load_proc_kallsyms(PyObject *self, PyObject *args,
+						 PyObject *kwds)
+{
+	static char *kwnames[] = {"filename", "modules", NULL};
+	const char *filename = NULL;
+	int modules = 0; // 'p' format specifier expects an int, not bool
+
+	// Use "z" rather than "s": _drgn.pyi declares
+	// filename: Optional[str] = None, so None has to be accepted as well
+	// as a string.
+	if (!PyArg_ParseTupleAndKeywords(args, kwds, "|zp:load_proc_kallsyms",
+					 kwnames, &filename, &modules))
+		return NULL;
+	// "z" stores NULL for None; the pointer also stays NULL when the
+	// argument is omitted. Both cases select the default path.
+	if (!filename)
+		filename = "/proc/kallsyms";
+
+	_cleanup_pydecref_ SymbolIndex *index = call_tp_alloc(SymbolIndex);
+	if (!index)
+		return set_drgn_error(&drgn_enomem);
+
+	struct drgn_error *err = drgn_load_proc_kallsyms(filename, modules, &index->index);
+	if (err)
+		return set_drgn_error(err);
+	return (PyObject *)no_cleanup_ptr(index);
+}
+
+/*
+ * Python binding for drgn_load_builtin_kallsyms(): build a SymbolIndex from
+ * the built-in kallsyms tables of a program.
+ *
+ * Takes a Program followed by eight integers, each converted with
+ * u64_converter, which fill in struct kallsyms_locations in the order given
+ * by kwnames.
+ *
+ * Returns a new SymbolIndex, or NULL with a Python exception set.
+ */
+PyObject *
+drgnpy_linux_helper_load_builtin_kallsyms(PyObject *self, PyObject *args,
+					  PyObject *kwds)
+{
+	static char *kwnames[] = {"prog", "names", "token_table", "token_index", "num_syms",
+				  "offsets", "relative_base", "addresses", "_stext", NULL};
+	struct kallsyms_locations kl;
+	PyObject *prog_obj;
+	if (!PyArg_ParseTupleAndKeywords(args, kwds, "O!O&O&O&O&O&O&O&O&:load_builtin_kallsyms",
+					 kwnames, &Program_type, &prog_obj,
+					 u64_converter, &kl.kallsyms_names,
+					 u64_converter, &kl.kallsyms_token_table,
+					 u64_converter, &kl.kallsyms_token_index,
+					 u64_converter, &kl.kallsyms_num_syms,
+					 u64_converter, &kl.kallsyms_offsets,
+					 u64_converter, &kl.kallsyms_relative_base,
+					 u64_converter, &kl.kallsyms_addresses,
+					 u64_converter, &kl._stext))
+		return NULL;
+
+	struct drgn_program *prog = &((Program *)prog_obj)->prog;
+	_cleanup_pydecref_ SymbolIndex *index = call_tp_alloc(SymbolIndex);
+	if (!index)
+		return set_drgn_error(&drgn_enomem);
+
+	struct drgn_error *err = drgn_load_builtin_kallsyms(prog, &kl, &index->index);
+	if (err)
+		return set_drgn_error(err);
+	return (PyObject *)no_cleanup_ptr(index);
+}
diff --git
a/libdrgn/python/main.c b/libdrgn/python/main.c index f5b164cd5..2da8b10a4 100644 --- a/libdrgn/python/main.c +++ b/libdrgn/python/main.c @@ -208,6 +208,12 @@ static PyMethodDef drgn_methods[] = { METH_O}, {"_linux_helper_pgtable_l5_enabled", drgnpy_linux_helper_pgtable_l5_enabled, METH_O}, + {"_linux_helper_load_proc_kallsyms", + (PyCFunction)drgnpy_linux_helper_load_proc_kallsyms, + METH_VARARGS | METH_KEYWORDS}, + {"_linux_helper_load_builtin_kallsyms", + (PyCFunction)drgnpy_linux_helper_load_builtin_kallsyms, + METH_VARARGS | METH_KEYWORDS}, {}, }; @@ -297,6 +303,7 @@ DRGNPY_PUBLIC PyMODINIT_FUNC PyInit__drgn(void) add_type(m, &StackFrame_type) || add_type(m, &StackTrace_type) || add_type(m, &Symbol_type) || + add_type(m, &SymbolIndex_type) || add_type(m, &DrgnType_type) || add_type(m, &Thread_type) || add_type(m, &ThreadIterator_type) || diff --git a/libdrgn/python/program.c b/libdrgn/python/program.c index 407d934ce..600991ac8 100644 --- a/libdrgn/python/program.c +++ b/libdrgn/python/program.c @@ -504,6 +504,16 @@ py_symbol_find_fn(const char *name, uint64_t addr, enum drgn_find_symbol_flags flags, void *arg, struct drgn_symbol_result_builder *builder) { + // Fast path for SymbolIndex: don't bother converting to and from Python + // types, as this is a C finder. Use Py_TYPE and pointer comparison + // directly here to avoid needing to take the GIL for + // PyObject_TypeCheck(). SymbolIndex cannot be subclassed, so the logic + // for subclass checking is unnecessary anyway. 
+ if (Py_TYPE(PyTuple_GET_ITEM(arg, 1)) == &SymbolIndex_type) { + SymbolIndex *ix = (SymbolIndex *)PyTuple_GET_ITEM(arg, 1); + return drgn_symbol_index_find(name, addr, flags, &ix->index, builder); + } + PyGILState_guard(); _cleanup_pydecref_ PyObject *name_obj = NULL; @@ -1231,23 +1241,7 @@ static PyObject *Program_symbols(Program *self, PyObject *args) if (err) return set_drgn_error(err); - _cleanup_pydecref_ PyObject *list = PyList_New(count); - if (!list) { - drgn_symbols_destroy(symbols, count); - return NULL; - } - for (size_t i = 0; i < count; i++) { - PyObject *pysym = Symbol_wrap(symbols[i], (PyObject *)self); - if (!pysym) { - /* Free symbols which aren't yet added to list. */ - drgn_symbols_destroy(symbols, count); - return NULL; - } - symbols[i] = NULL; - PyList_SET_ITEM(list, i, pysym); - } - free(symbols); - return_ptr(list); + return Symbol_list_wrap(symbols, count, (PyObject *)self); } static PyObject *Program_symbol(Program *self, PyObject *arg) diff --git a/libdrgn/python/symbol.c b/libdrgn/python/symbol.c index 83ea7525f..d0e84e1bf 100644 --- a/libdrgn/python/symbol.c +++ b/libdrgn/python/symbol.c @@ -16,6 +16,29 @@ PyObject *Symbol_wrap(struct drgn_symbol *sym, PyObject *name_obj) return (PyObject *)ret; } +PyObject *Symbol_list_wrap(struct drgn_symbol **symbols, size_t count, + PyObject *name_obj) +{ + _cleanup_pydecref_ PyObject *list = PyList_New(count); + if (!list) { + drgn_symbols_destroy(symbols, count); + return NULL; + } + for (size_t i = 0; i < count; i++) { + PyObject *pysym = Symbol_wrap(symbols[i], name_obj); + if (!pysym) { + /* Free symbols which aren't yet added to list. */ + drgn_symbols_destroy(symbols, count); + /* Free list and all symbols already added. 
*/ + return NULL; + } + symbols[i] = NULL; + PyList_SET_ITEM(list, i, pysym); + } + free(symbols); + return_ptr(list); +} + static PyObject *Symbol_new(PyTypeObject *subtype, PyObject *args, PyObject *kwds) { struct drgn_symbol *sym; diff --git a/libdrgn/python/symbol_index.c b/libdrgn/python/symbol_index.c new file mode 100644 index 000000000..d19467352 --- /dev/null +++ b/libdrgn/python/symbol_index.c @@ -0,0 +1,122 @@ +// Copyright (c) 2024 Oracle and/or its affiliates +// SPDX-License-Identifier: LGPL-2.1-or-later + +#include "drgnpy.h" +#include "../symbol.h" + +static void SymbolIndex_dealloc(SymbolIndex *self) +{ + drgn_symbol_index_deinit(&self->index); + Py_TYPE(self)->tp_free((PyObject *)self); +} + +static PyObject *SymbolIndex_call(SymbolIndex *self, PyObject *args, PyObject *kwargs) +{ + PyObject *prog_obj; + struct index_arg address = { .allow_none = true }; + const char *name; + static char *kwnames[] = {"prog", "name", "address", "one", NULL}; + int single; // 'p' format specifier expects an int, not bool + + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "OzO&p:__call__", kwnames, + &prog_obj, &name, index_converter, &address, + &single)) + return NULL; + + unsigned int flags = 0; + if (single) + flags |= DRGN_FIND_SYMBOL_ONE; + if (!address.is_none) + flags |= DRGN_FIND_SYMBOL_ADDR; + if (name) + flags |= DRGN_FIND_SYMBOL_NAME; + + struct drgn_symbol_result_builder builder; + drgn_symbol_result_builder_init(&builder, flags & DRGN_FIND_SYMBOL_ONE); + + struct drgn_error *err = + drgn_symbol_index_find(name, address.uvalue, flags, &self->index, &builder); + if (err) + goto error; + + /* We return a list regardless */ + if (single) { + struct drgn_symbol *symbol = drgn_symbol_result_builder_single(&builder); + _cleanup_pydecref_ PyObject *list = PyList_New(symbol ? 
1 : 0); + if (!list) + goto error; + if (symbol) { + PyObject *pysym = Symbol_wrap(symbol, (PyObject *)self); + if (!pysym) + goto error; + PyList_SET_ITEM(list, 0, pysym); + } + return_ptr(list); + } else { + struct drgn_symbol **syms; + size_t count; + drgn_symbol_result_builder_array(&builder, &syms, &count); + return Symbol_list_wrap(syms, count, (PyObject *)self); + } + + return NULL; +error: + drgn_symbol_result_builder_abort(&builder); + return err ? set_drgn_error(err) : NULL; +} + +static PyObject *SymbolIndex_new(PyTypeObject *subtype, PyObject *args, PyObject *kwds) +{ + static char *kwnames[] = {"symbols", NULL}; + PyObject *list_obj; + + if (!PyArg_ParseTupleAndKeywords(args, kwds, "O", kwnames, &list_obj)) + return NULL; + + _cleanup_pydecref_ PyObject *iter = + PyObject_GetIter(list_obj); + if (!iter) + return NULL; + + _cleanup_(drgn_symbol_index_builder_deinit) + struct drgn_symbol_index_builder builder; + drgn_symbol_index_builder_init(&builder); + + for (;;) { + _cleanup_pydecref_ PyObject *item = PyIter_Next(iter); + if (!item) + break; + if (!PyObject_TypeCheck(item, &Symbol_type)) + return PyErr_Format(PyExc_TypeError, "expected sequence of Symbols"); + Symbol *sym = (Symbol *)item; + if (!drgn_symbol_index_builder_add(&builder, sym->sym)) + return PyErr_NoMemory(); + } + + if (PyErr_Occurred()) + return NULL; + + _cleanup_pydecref_ SymbolIndex *index_obj = call_tp_alloc(SymbolIndex); + if (!index_obj) + return NULL; + + struct drgn_error *err = + drgn_symbol_index_init_from_builder(&index_obj->index, + &builder); + // On error, the builder and index are already deinitialized + if (err) + return set_drgn_error(err); + + return (PyObject *)no_cleanup_ptr(index_obj); +} + +PyTypeObject SymbolIndex_type = { + PyVarObject_HEAD_INIT(NULL, 0) + .tp_name = "_drgn.SymbolIndex", + .tp_basicsize = sizeof(SymbolIndex), + .tp_dealloc = (destructor)SymbolIndex_dealloc, + .tp_flags = Py_TPFLAGS_DEFAULT, + .tp_doc = drgn_SymbolIndex_DOC, + .tp_call = 
(ternaryfunc)SymbolIndex_call, + .tp_new = SymbolIndex_new, +}; diff --git a/libdrgn/python/util.c b/libdrgn/python/util.c index 16200f456..40b09f36f 100644 --- a/libdrgn/python/util.c +++ b/libdrgn/python/util.c @@ -78,6 +78,17 @@ int index_converter(PyObject *o, void *p) } } +int u64_converter(PyObject *o, void *p) +{ + uint64_t *arg = p; + + _cleanup_pydecref_ PyObject *index_obj = PyNumber_Index(o); + if (!index_obj) + return 0; + *arg = PyLong_AsUint64(index_obj); + return (*arg != UINT64_C(-1) || !PyErr_Occurred()); +} + int path_converter(PyObject *o, void *p) { if (o == NULL) { diff --git a/libdrgn/symbol.c b/libdrgn/symbol.c index 02ae0e7fd..51177deb0 100644 --- a/libdrgn/symbol.c +++ b/libdrgn/symbol.c @@ -2,15 +2,22 @@ // SPDX-License-Identifier: LGPL-2.1-or-later #include +#include #include #include +#include "binary_search.h" #include "drgn_internal.h" +#include "string_builder.h" #include "symbol.h" #include "util.h" +DEFINE_VECTOR_FUNCTIONS(symbol_vector); + LIBDRGN_PUBLIC void drgn_symbol_destroy(struct drgn_symbol *sym) { + if (sym && sym->lifetime == DRGN_LIFETIME_STATIC) + return; if (sym && sym->name_lifetime == DRGN_LIFETIME_OWNED) /* Cast here is necessary - we want symbol users to * never modify sym->name, but when we own the name, @@ -32,6 +39,7 @@ void drgn_symbol_from_elf(const char *name, uint64_t address, { ret->name = name; ret->name_lifetime = DRGN_LIFETIME_STATIC; + ret->lifetime = DRGN_LIFETIME_OWNED; ret->address = address; ret->size = elf_sym->st_size; int binding = GELF_ST_BIND(elf_sym->st_info); @@ -79,6 +87,7 @@ drgn_symbol_create(const char *name, uint64_t address, uint64_t size, sym->binding = binding; sym->kind = kind; sym->name_lifetime = name_lifetime; + sym->lifetime = DRGN_LIFETIME_OWNED; *ret = sym; return NULL; } @@ -174,3 +183,254 @@ void drgn_symbol_result_builder_array(struct drgn_symbol_result_builder *builder symbolp_vector_shrink_to_fit(&builder->vector); symbolp_vector_steal(&builder->vector, syms_ret, 
count_ret); } + +static int name_compar(const void *lhs, const void *rhs, void *arg) +{ + struct drgn_symbol_index *ix = arg; + uint32_t left_ix = *(const uint32_t *)lhs; + uint32_t right_ix = *(const uint32_t *)rhs; + return strcmp(ix->symbols[left_ix].name, ix->symbols[right_ix].name); +} + +static int addr_compar(const void *lhs, const void *rhs) +{ + const struct drgn_symbol *left = lhs; + const struct drgn_symbol *right = rhs; + // returning a simple subtraction would not work well since these are + // unsigned + if (left->address < right->address) + return -1; + else if (left->address > right->address) + return 1; + else + return 0; +} + +struct drgn_error * +drgn_symbol_index_init(struct drgn_symbol *symbols, uint32_t count, + char *buffer, struct drgn_symbol_index *ret) +{ + ret->symbols = symbols; + ret->num_syms = count; + ret->strings = buffer; + ret->name_sort = NULL; + ret->max_addrs = NULL; + drgn_symbol_name_table_init(&ret->htab); + ret->name_sort = malloc_array(count, sizeof(ret->name_sort[0])); + if (!ret->name_sort) + goto enomem; + ret->max_addrs = malloc_array(count, sizeof(ret->max_addrs[0])); + if (!ret->max_addrs) + goto enomem; + + // In many cases (e.g kallsyms), symbols are already sorted by address, + // but not always. Check whether sorted, and if not, sort. + for (uint32_t i = 1; i < ret->num_syms; i++) { + if (ret->symbols[i - 1].address > ret->symbols[i].address) { + qsort(ret->symbols, count, sizeof(ret->symbols[0]), addr_compar); + break; + } + } + + // Kallsyms doesn't include symbol lengths, so symbols are + // non-overlapping. But this is not true in general! Symbols may + // overlap, which makes address lookup complicated. Rather than using a + // complex range data structure, we can use two binary searches, one to + // find the first symbol which could overlap with an address, and one to + // find the last symbol, and then linearly search that array. 
This + // performs poorly if there are symbols which span many others, but + // that's a rare case. In order to do this strategy, we need an array + // that contains the maximum address spanned by any symbol at or before + // that index. + if (ret->num_syms > 0) // in case num_syms == 0 + ret->max_addrs[0] = ret->symbols[0].address + ret->symbols[0].size; + for (uint32_t i = 1; i < ret->num_syms; i++) { + uint64_t max_addr = ret->symbols[i].address + ret->symbols[i].size; + ret->max_addrs[i] = max(ret->max_addrs[i - 1], max_addr); + } + + // Sort the "name_sort" array by name so we get runs of symbols with the + // same name + for (uint32_t i = 0; i < ret->num_syms; i++) + ret->name_sort[i] = i; + qsort_arg(ret->name_sort, ret->num_syms, sizeof(ret->name_sort[0]), + name_compar, ret); + + // For each unique symbol name, insert the range of symbol indexes + // into the hash table for fast name lookup + struct drgn_symbol_name_table_entry entry; + uint32_t current = 0; + while (current < ret->num_syms) { + const char *current_str = ret->symbols[ret->name_sort[current]].name; + uint32_t next = current + 1; + while (next < ret->num_syms) { + const char *next_str = ret->symbols[ret->name_sort[next]].name; + if (strcmp(current_str, next_str) != 0) + break; + next++; + } + + entry.key = current_str; + entry.value.start = current; + entry.value.end = next; + if (drgn_symbol_name_table_insert(&ret->htab, &entry, NULL) < 0) + goto enomem; + + current = next; + } + return NULL; + +enomem: + drgn_symbol_index_deinit(ret); + return &drgn_enomem; +} + +void +drgn_symbol_index_deinit(struct drgn_symbol_index *index) +{ + // The symbol array is contiguous and all names come from strings + free(index->symbols); + free(index->max_addrs); + drgn_symbol_name_table_deinit(&index->htab); + free(index->strings); + free(index->name_sort); + // Simplify error handling by ensuring deinit is safe to call twice + memset(index, 0, sizeof(*index)); +} + +static void address_search_range(struct 
drgn_symbol_index *index, uint64_t address,
+				 uint32_t *start_ret, uint32_t *end_ret)
+{
+	// First, identify the maximum symbol index which could possibly contain
+	// this address. Think of this as:
+	//   end_ret = bisect_right([s.address for s in symbols], address)
+	#define less_than_start(a, b) (*(a) < (b)->address)
+	*end_ret = binary_search_gt(index->symbols, index->num_syms, &address,
+				    less_than_start);
+	#undef less_than_start
+
+	// Second, identify first symbol index which could possibly contain this
+	// address. We need to use "max_addrs" for this task:
+	//   bisect_right(max_addrs, address)
+	#define less_than_end(a, b) (*(a) < *(b))
+	*start_ret = binary_search_gt(index->max_addrs, index->num_syms, &address,
+				      less_than_end);
+	#undef less_than_end
+}
+
+/**
+ * Symbol finder callback backed by a struct drgn_symbol_index
+ *
+ * @a arg is the struct drgn_symbol_index to search. The lookup strategy is
+ * chosen from @a flags:
+ *  - DRGN_FIND_SYMBOL_ADDR: scan the candidate range for @a address, also
+ *    filtering by @a name when DRGN_FIND_SYMBOL_NAME is set.
+ *  - DRGN_FIND_SYMBOL_NAME only: look @a name up in the name hash table.
+ *  - neither: return every symbol in the index.
+ * With DRGN_FIND_SYMBOL_ONE, stop after the first match.
+ *
+ * @return @c NULL on success (including when nothing matched), or
+ * &drgn_enomem if a result could not be added to @a builder.
+ */
+struct drgn_error *
+drgn_symbol_index_find(const char *name, uint64_t address,
+		       enum drgn_find_symbol_flags flags, void *arg,
+		       struct drgn_symbol_result_builder *builder)
+{
+	struct drgn_symbol_index *index = arg;
+
+	// Unlike the ELF symbol finder, we don't have any particular rules
+	// about which symbols get priority when looking up a single symbol.
+	// If we decide this logic is critical, it would probably make sense to
+	// move it into the symbol finder's API via the result builder, rather
+	// than reimplementing it here.
+
+	if (flags & DRGN_FIND_SYMBOL_ADDR) {
+		uint32_t start, end;
+		address_search_range(index, address, &start, &end);
+		for (uint32_t i = start; i < end; i++) {
+			struct drgn_symbol *s = &index->symbols[i];
+			if (s->address > address || address >= s->address + s->size)
+				continue;
+			if ((flags & DRGN_FIND_SYMBOL_NAME) &&
+			    strcmp(s->name, name) != 0)
+				continue;
+			if (!drgn_symbol_result_builder_add(builder, s))
+				return &drgn_enomem;
+			if (flags & DRGN_FIND_SYMBOL_ONE)
+				break;
+		}
+	} else if (flags & DRGN_FIND_SYMBOL_NAME) {
+		struct drgn_symbol_name_table_iterator it =
+			drgn_symbol_name_table_search(&index->htab, &name);
+		if (!it.entry)
+			return NULL;
+		for (uint32_t i = it.entry->value.start; i < it.entry->value.end; i++) {
+			struct drgn_symbol *s = &index->symbols[index->name_sort[i]];
+			if (!drgn_symbol_result_builder_add(builder, s))
+				return &drgn_enomem;
+			if (flags & DRGN_FIND_SYMBOL_ONE)
+				break;
+		}
+	} else {
+		// num_syms is a uint32_t, so the counter must be too: a signed
+		// int cannot represent every valid index.
+		for (uint32_t i = 0; i < index->num_syms; i++) {
+			struct drgn_symbol *s = &index->symbols[i];
+			if (!drgn_symbol_result_builder_add(builder, s))
+				return &drgn_enomem;
+			if (flags & DRGN_FIND_SYMBOL_ONE)
+				break;
+		}
+	}
+	return NULL;
+}
+
+/* Initialize an empty builder: no name bytes and no symbols. */
+void
+drgn_symbol_index_builder_init(struct drgn_symbol_index_builder *builder)
+{
+	builder->names = (struct string_builder)STRING_BUILDER_INIT;
+	symbol_vector_init(&builder->symbols);
+}
+
+/* Release the name buffer and the symbol vector held by the builder. */
+void
+drgn_symbol_index_builder_deinit(struct drgn_symbol_index_builder *builder)
+{
+	string_builder_deinit(&builder->names);
+	symbol_vector_deinit(&builder->symbols);
+}
+
+/*
+ * Append a copy of *ptr to the builder. The name is appended, with its
+ * terminating NUL, to the shared name buffer. Returns false if any of the
+ * appends fails.
+ */
+bool
+drgn_symbol_index_builder_add(struct drgn_symbol_index_builder *builder,
+			      const struct drgn_symbol *ptr)
+{
+	struct drgn_symbol copy = *ptr;
+
+	// Temporarily store the index into the name buffer instead of a
+	// pointer: the buffer may move while it grows. The offset is turned
+	// back into a pointer by drgn_symbol_index_init_from_builder().
+	copy.name = (char *)builder->names.len;
+	return string_builder_append(&builder->names, ptr->name)
+	       && string_builder_appendc(&builder->names, '\0')
+	       && symbol_vector_append(&builder->symbols, &copy);
+}
+
+struct drgn_error *
+drgn_symbol_index_init_from_builder(struct drgn_symbol_index *index, + struct drgn_symbol_index_builder *builder) +{ + size_t names_len = builder->names.len; + char *names = string_builder_steal(&builder->names); + char *tmp_names = realloc(names, names_len); + if (tmp_names) + names = tmp_names; + + symbol_vector_shrink_to_fit(&builder->symbols); + struct drgn_symbol *symbols; + size_t num_syms; + symbol_vector_steal(&builder->symbols, &symbols, &num_syms); + + // Now that the name array is finalized, resolve the names to real + // pointers. Update the name lifetime to static, reflecting that the + // symbol name is owned by the finder whose lifetime is bound to the + // program's once it is attached. The same goes for the symbol. Using + // static lifetimes helps avoid unnecessary copying. + for (size_t i = 0; i < num_syms; i++) { + size_t string_index = (size_t)symbols[i].name; + symbols[i].name = &names[string_index]; + symbols[i].name_lifetime = DRGN_LIFETIME_STATIC; + symbols[i].lifetime = DRGN_LIFETIME_STATIC; + } + + if (num_syms > UINT32_MAX) { + free(names); + free(symbols); + return drgn_error_format(DRGN_ERROR_OUT_OF_BOUNDS, + "too many symbols provided: %zu > %" PRIu32, + num_syms, UINT32_MAX); + } + + return drgn_symbol_index_init(symbols, num_syms, names, index); +} diff --git a/libdrgn/symbol.h b/libdrgn/symbol.h index 4a2caf1c5..c3dd75ca7 100644 --- a/libdrgn/symbol.h +++ b/libdrgn/symbol.h @@ -1,4 +1,5 @@ // Copyright (c) Meta Platforms, Inc. and affiliates. +// Copyright (c) 2024, Oracle and/or its affiliates. 
// SPDX-License-Identifier: LGPL-2.1-or-later #ifndef DRGN_SYMBOL_H @@ -9,6 +10,8 @@ #include "cleanup.h" #include "drgn_internal.h" #include "handler.h" +#include "hash_table.h" +#include "string_builder.h" #include "vector.h" struct drgn_symbol { @@ -18,6 +21,7 @@ struct drgn_symbol { enum drgn_symbol_binding binding; enum drgn_symbol_kind kind; enum drgn_lifetime name_lifetime; + enum drgn_lifetime lifetime; }; struct drgn_symbol_finder { @@ -64,4 +68,103 @@ void drgn_symbol_result_builder_array(struct drgn_symbol_result_builder *builder struct drgn_error * drgn_symbol_copy(struct drgn_symbol *dst, struct drgn_symbol *src); +DEFINE_HASH_MAP(drgn_symbol_name_table, const char *, + struct { uint32_t start; uint32_t end; }, + c_string_key_hash_pair, c_string_key_eq); + +/** + * An index of symbols, supporting efficient lookup by name or address + * + * While the dynamic symbol finding callback is a very flexible API, many use + * cases can be served best by simply providing drgn with a known symbol table + * to index. Drgn can efficiently implement the name and address lookup + * functions once, and provide a symbol finder implementation, so that clients + * need not redo this boilerplate. + * + * In the interest of simplicity, the index is immutable once created. This + * allows us to use simple data structures. If the symbol table needs frequent + * updates, then registering a custom symbol finder should be preferred. + */ +struct drgn_symbol_index { + /** Array of symbols, in sorted order by address */ + struct drgn_symbol *symbols; + + /** Array of max_addr, to aid address lookup */ + uint64_t *max_addrs; + + /** Number of symbols */ + uint32_t num_syms; + + /** The buffer containing all symbol names */ + char *strings; + + /** Array of symbol indices, sorted by name. Used by the htab. 
*/ + uint32_t *name_sort; + + /** Map of symbol names to index */ + struct drgn_symbol_name_table htab; +}; + +/** + * Create a symbol index from an array of symbols + * + * This takes ownership of the symbol array and the individual symbols. The @a + * buffer argument allows us to provide a single backing buffer for all strings + * (in which case the lifetimes of each symbol name should be static). On error + * @a symbols and @a buffer are already freed, since the builder took ownership + * of them. + */ +struct drgn_error * +drgn_symbol_index_init(struct drgn_symbol *symbols, uint32_t count, + char *buffer, struct drgn_symbol_index *ret); + +/** Deinitialize the symbol index. Safe to call multiple times. */ +void drgn_symbol_index_deinit(struct drgn_symbol_index *index); + +DEFINE_VECTOR_TYPE(symbol_vector, struct drgn_symbol); + +struct drgn_symbol_index_builder { + struct string_builder names; + struct symbol_vector symbols; +}; + +/** + * Create a symbol builder which will efficiently pack string names next + * to each other in memory, rather than allocating many small strings. + */ +void +drgn_symbol_index_builder_init(struct drgn_symbol_index_builder *builder); + +/** + * For destroying a builder on error conditions. It is safe to call this + * multiple times, including after drgn_symbol_index_init_from_builder(). + */ +void +drgn_symbol_index_builder_deinit(struct drgn_symbol_index_builder *builder); + +/** + * Add symbol to the builder: the builder does not take ownership of @a ptr, + * instead making a copy. + */ +bool +drgn_symbol_index_builder_add(struct drgn_symbol_index_builder *builder, + const struct drgn_symbol *ptr); + +/** + * Convert the builder to a symbol index, destroying the builder. + * On error, the builder and symbol index are both deinitialized, requiring no + * further cleanup. 
+ */ +struct drgn_error * +drgn_symbol_index_init_from_builder(struct drgn_symbol_index *index, + struct drgn_symbol_index_builder *builder); + +/** + * The actual implementation of the Symbol Finder API. + */ +struct drgn_error * +drgn_symbol_index_find(const char *name, uint64_t address, + enum drgn_find_symbol_flags flags, void *arg, + struct drgn_symbol_result_builder *builder); + #endif /* DRGN_SYMBOL_H */ diff --git a/libdrgn/util.c b/libdrgn/util.c new file mode 100644 index 000000000..4650ee505 --- /dev/null +++ b/libdrgn/util.c @@ -0,0 +1,19 @@ +// Copyright (c) 2024 Oracle and/or its affiliates +// SPDX-License-Identifier: LGPL-2.1-or-later +#include "util.h" + +static _Thread_local int (*qsort_arg_compar)(const void *, const void *, void*); +static _Thread_local void *qsort_arg_arg; + +static int qsort_arg_compar_wrapper(const void *a, const void *b) +{ + return qsort_arg_compar(a, b, qsort_arg_arg); +} + +void qsort_arg(void *base, size_t nmemb, size_t size, + int (*compar)(const void *, const void *, void*), void *arg) +{ + qsort_arg_compar = compar; + qsort_arg_arg = arg; + qsort(base, nmemb, size, qsort_arg_compar_wrapper); +} diff --git a/libdrgn/util.h b/libdrgn/util.h index 78c0c4a32..ea5281210 100644 --- a/libdrgn/util.h +++ b/libdrgn/util.h @@ -211,4 +211,15 @@ static inline uint64_t uint_max(int n) #define add_to_possibly_null_pointer(ptr, i) \ ((typeof(ptr))((uintptr_t)(ptr) + (i) * sizeof(*(ptr)))) +/** + * Similar to qsort_r (passes @a arg to @a compar) but **not** reentrant + * + * The qsort_r() function's main feature is that it is reentrant, but also adds + * the convenience of including an argument to the callback function. + * Unfortunately it is a glibc extension. This provides a similar API but it is + * only thread-safe, not reentrant. See qsort_r(3) for details. 
+ */
+void qsort_arg(void *base, size_t nmemb, size_t size,
+	       int (*compar)(const void *, const void *, void*), void *arg);
+
 #endif /* DRGN_UTIL_H */
diff --git a/tests/linux_kernel/helpers/test_kallsyms.py b/tests/linux_kernel/helpers/test_kallsyms.py
new file mode 100644
index 000000000..104f6fae5
--- /dev/null
+++ b/tests/linux_kernel/helpers/test_kallsyms.py
@@ -0,0 +1,125 @@
+# Copyright (c) 2024 Oracle and/or its affiliates
+# SPDX-License-Identifier: LGPL-2.1-or-later
+import re
+import tempfile
+from unittest import TestCase
+
+from drgn import Symbol, SymbolBinding, SymbolKind
+from drgn.helpers.linux.kallsyms import (
+    _load_builtin_kallsyms,
+    _load_proc_kallsyms,
+    load_module_kallsyms,
+)
+from tests.linux_kernel import LinuxKernelTestCase, skip_unless_have_test_kmod
+
+
+def compare_local_symbols(self, finder, modules=False):
+    """
+    Compare the results of a symbol finder against /proc/kallsyms.
+
+    For every symbol name parsed from /proc/kallsyms, the addresses returned
+    by a name lookup must equal the addresses listed in the file, and an
+    unfiltered lookup must return as many symbols as lines were parsed.
+
+    :param self: Test case whose ``assert*`` methods are used.
+    :param finder: Callable invoked as ``finder(prog, name, address, one)``
+        which returns a list of symbols.
+    :param modules: If false, stop reading at the first line that has a
+        trailing ``[module]`` field.
+    """
+    # The group names must match the match.group() calls below.
+    expr = re.compile(
+        r"(?P<address>[0-9a-f]+) (?P<kind>.) (?P<name>[^\s]+)(\s+(?P<mod>\[\w+\]))?"
+    )
+    names = {}
+    count = 0
+    with open("/proc/kallsyms") as f:
+        for line in f:
+            match = expr.fullmatch(line.strip())
+            self.assertIsNotNone(match, line)
+            if match.group("mod") and not modules:
+                break
+            count += 1
+            name = match.group("name")
+            addr = int(match.group("address"), 16)
+            names.setdefault(name, []).append((addr, match.group("kind"), name))
+
+    for name, syms in names.items():
+        res = finder(None, name, None, False)
+        expected_addrs = sorted(t[0] for t in syms)
+        found_addrs = sorted(s.address for s in res)
+        self.assertEqual(expected_addrs, found_addrs)
+
+    all_res = finder(None, None, None, False)
+    self.assertEqual(count, len(all_res))
+
+
+KALLSYMS_DATA = b"""\
+0000000000000000 u null
+0000000000000008 d local_data
+0000000000000010 B global_bss
+0000000000000020 v weak_symbol
+0000000000000040 ? unknown
+0000000000001000 T text [mymod]
+0000000000002000 T modfunc1 [mymod2]
+0000000000002010 T modfunc2 [mymod2]
+0000000000002008 T modfunc3 [mymod2]
+"""
+
+
+class TestProcKallsyms(TestCase):
+    def test_local_proc_kallsyms(self):
+        finder = _load_proc_kallsyms()
+        compare_local_symbols(self, finder)
+
+    def test_local_proc_kallsyms_with_modules(self):
+        finder = _load_proc_kallsyms(modules=True)
+        compare_local_symbols(self, finder, modules=True)
+
+    def test_static_data(self):
+        with tempfile.NamedTemporaryFile() as f:
+            f.write(KALLSYMS_DATA)
+            f.flush()
+            finder = _load_proc_kallsyms(filename=f.name, modules=True)
+
+        syms = finder(None, None, None, False)
+        expected = [
+            Symbol("null", 0x0, 8, SymbolBinding.UNIQUE, SymbolKind.UNKNOWN),
+            Symbol("local_data", 0x8, 8, SymbolBinding.UNKNOWN, SymbolKind.OBJECT),
+            Symbol("global_bss", 0x10, 16, SymbolBinding.GLOBAL, SymbolKind.OBJECT),
+            Symbol("weak_symbol", 0x20, 32, SymbolBinding.WEAK, SymbolKind.OBJECT),
+            # this one has zero size since it is at the end of vmlinux
+            Symbol("unknown", 0x40, 0, SymbolBinding.UNKNOWN, SymbolKind.UNKNOWN),
+            #
this one has zero size since it is at the end of a module + Symbol("text", 0x1000, 0, SymbolBinding.GLOBAL, SymbolKind.FUNC), + # this one has a non-zero size since it is within a module + Symbol("modfunc1", 0x2000, 16, SymbolBinding.GLOBAL, SymbolKind.FUNC), + # this one has a zero size since it is at the end of the file. It is + # returned in sorted order by address despite kallsyms not + # containing it in sorted order. + Symbol("modfunc3", 0x2008, 0, SymbolBinding.GLOBAL, SymbolKind.FUNC), + # this one has a zero size since it is followed by an out-of-order + # symbol + Symbol("modfunc2", 0x2010, 0, SymbolBinding.GLOBAL, SymbolKind.FUNC), + ] + self.assertEqual(syms, expected) + + +class TestBuiltinKallsyms(LinuxKernelTestCase): + def test_builtin_kallsyms(self): + if b"kallsyms_num_syms" not in self.prog["VMCOREINFO"].string_(): + self.skipTest("VMCOREINFO is missing necessary symbols") + finder = _load_builtin_kallsyms(self.prog) + compare_local_symbols(self, finder) + + @skip_unless_have_test_kmod + def test_module_kallsyms(self): + finder = load_module_kallsyms(self.prog) + test_data = finder(None, "drgn_test_empty_list", None, True)[0] + self.assertEqual("drgn_test_empty_list", test_data.name) + self.assertEqual(SymbolKind.OBJECT, test_data.kind) + self.assertIn(test_data.binding, (SymbolBinding.GLOBAL, SymbolBinding.UNKNOWN)) + size = self.prog.type("struct list_head").size + self.assertEqual(size, test_data.size) + address = self.prog.object("drgn_test_empty_list").address_ + self.assertEqual(address, test_data.address) diff --git a/tests/test_symbol.py b/tests/test_symbol.py index ee84c7e29..d9cc3dd94 100644 --- a/tests/test_symbol.py +++ b/tests/test_symbol.py @@ -3,7 +3,7 @@ import tempfile from _drgn_util.elf import ET, PT, SHT, STB, STT -from drgn import Program, Symbol, SymbolBinding, SymbolKind +from drgn import Program, Symbol, SymbolBinding, SymbolIndex, SymbolKind from tests import TestCase from tests.dwarfwriter import dwarf_sections from 
class TestSymbolIndex(TestCase):
    """Exercise ``SymbolIndex`` lookups by name, by address, and by both."""

    # Symbols are listed here in order of address, but are shuffled below
    AA = Symbol("AA", 10, 5, SymbolBinding.GLOBAL, SymbolKind.OBJECT)
    BB = Symbol("BB", 12, 1, SymbolBinding.GLOBAL, SymbolKind.OBJECT)
    CC = Symbol("CC", 13, 8, SymbolBinding.GLOBAL, SymbolKind.OBJECT)
    DD = Symbol("DD", 28, 5, SymbolBinding.GLOBAL, SymbolKind.OBJECT)
    EE = Symbol("EE", 34, 1, SymbolBinding.GLOBAL, SymbolKind.OBJECT)
    FF = Symbol("FF", 34, 10, SymbolBinding.GLOBAL, SymbolKind.OBJECT)
    GG = Symbol("GG", 34, 2, SymbolBinding.GLOBAL, SymbolKind.OBJECT)
    BB2 = Symbol("BB", 36, 3, SymbolBinding.GLOBAL, SymbolKind.OBJECT)

    TEST_SYMS = [GG, BB, AA, BB2, CC, FF, DD, EE]

    def setUp(self):
        # This class tests both the SymbolIndex callable interface, and the
        # Symbol Finder API. While this seems like it duplicates code, it's
        # necessary to test both since they exercise different code paths: the
        # Symbol Finder API uses a more efficient fast path.
        self.finder = index = SymbolIndex(self.TEST_SYMS)
        self.prog = program = Program()
        program.register_symbol_finder("test", index, enable_index=0)

    def _assert_same_members(self, expected, actual):
        """Assert ``actual`` has the length of ``expected`` and contains each item."""
        self.assertEqual(len(expected), len(actual))
        self.assertTrue(all(e in actual for e in expected))

    def _assert_both_bb(self, result):
        """Assert ``result`` is exactly the two symbols named "BB"."""
        self.assertEqual(2, len(result))
        self.assertIn(self.BB, result)
        self.assertIn(self.BB2, result)

    def _assert_every_test_sym(self, result):
        """Assert ``result`` has one entry per test symbol and contains them all."""
        self.assertEqual(len(self.TEST_SYMS), len(result))
        for sym in self.TEST_SYMS:
            self.assertIn(sym, result)

    def test_name_single(self):
        """Names that occur once resolve to exactly that symbol."""
        for sym in self.TEST_SYMS:
            # "BB" is the one duplicated name; test_name_multiple covers it.
            if sym.name == "BB":
                continue
            self.assertEqual([sym], self.finder(self.prog, sym.name, None, True))
            self.assertEqual(sym, self.prog.symbol(sym.name))
            self.assertEqual([sym], self.finder(self.prog, sym.name, None, False))
            self.assertEqual([sym], self.prog.symbols(sym.name))

    def test_name_multiple(self):
        """A duplicated name yields both symbols, or either one for a single lookup."""
        self._assert_both_bb(self.finder(self.prog, "BB", None, False))
        self._assert_both_bb(self.prog.symbols("BB"))

        candidates = (self.BB, self.BB2)
        first = self.finder(self.prog, "BB", None, True)[0]
        self.assertIn(first, candidates)
        self.assertIn(self.prog.symbol("BB"), candidates)

    def test_addr(self):
        """Address lookups return every symbol whose range spans the address."""
        # The expectations treat a symbol as covering
        # [address, address + size): e.g. FF (34, size 10) matches 43 but not 44.
        cases = {
            9: [],
            10: [self.AA],
            12: [self.AA, self.BB],
            13: [self.AA, self.CC],
            15: [self.CC],
            25: [],
            28: [self.DD],
            30: [self.DD],
            34: [self.EE, self.FF, self.GG],
            35: [self.FF, self.GG],
            36: [self.FF, self.BB2],
            43: [self.FF],
            44: [],
        }
        for address, expected in cases.items():
            # first, lookup by address alone and ensure we get all correct
            # candidates:
            self._assert_same_members(
                expected, self.finder(self.prog, None, address, False)
            )
            self._assert_same_members(expected, self.prog.symbols(address))

            # next, ensure that the single lookup works as expected:
            if expected:
                only = self.finder(self.prog, None, address, True)
                self.assertEqual(1, len(only))
                self.assertIn(only[0], expected)
                self.assertIn(self.prog.symbol(address), expected)

            # Now, test that adding a name filter correctly filters:
            # This cannot be tested with the Program.symbol() API since only
            # one filter is allowed there.
            for sym in expected:
                for one in (True, False):
                    self.assertEqual(
                        [sym], self.finder(self.prog, sym.name, address, one)
                    )

            for one in (True, False):
                self.assertEqual([], self.finder(None, "MISSING", address, one))

    def test_all(self):
        """Unfiltered lookups return one symbol, or all of them."""
        result = self.finder(self.prog, None, None, True)
        self.assertEqual(1, len(result))
        self.assertIn(result[0], self.TEST_SYMS)

        self._assert_every_test_sym(self.finder(self.prog, None, None, False))
        self._assert_every_test_sym(self.prog.symbols())

    def test_empty_index(self):
        """Every query pattern returns an empty list on an empty index."""
        index = SymbolIndex([])
        # Check all the possible query patterns to ensure they can safely handle
        # an empty list.
        queries = (
            ("name search", None),
            (None, 0xFFFF),
            ("name search", 0xFFFF),
        )
        for name, address in queries:
            for one in (True, False):
                self.assertEqual([], index(self.prog, name, address, one))