Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add SymbolIndex and kallsyms helpers #388

Merged
merged 6 commits into from
Oct 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 68 additions & 0 deletions _drgn.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -1898,6 +1898,69 @@ class Symbol:
kind: Final[SymbolKind]
"""Kind of entity represented by this symbol."""

class SymbolIndex:
    """
    A ``SymbolIndex`` contains a static set of symbols and allows efficient
    lookup by name and address.

    With :meth:`Program.register_symbol_finder()`, you can add a callback to
    provide custom symbol finding logic. However, in many cases, all that is
    necessary is to provide drgn with a list of symbols that you know to be part
    of the program. This object allows you to do that. It efficiently implements
    the Symbol Finder API given a static set of symbols. For example::

        >>> prog = drgn.Program()
        >>> symbol = drgn.Symbol("foo", 0x123, 1, drgn.SymbolBinding.GLOBAL, drgn.SymbolKind.OBJECT)
        >>> finder = drgn.SymbolIndex([symbol])
        >>> prog.register_symbol_finder("SymbolIndex", finder, enable_index=0)
        >>> prog.symbols()
        [Symbol(name='foo', address=0x123, size=0x1, binding=<SymbolBinding.GLOBAL: 2>, kind=<SymbolKind.OBJECT: 1>)]
        >>> prog.symbol("bar")
        Traceback (most recent call last):
          File "<console>", line 1, in <module>
        LookupError: not found
        >>> prog.symbol("foo")
        Symbol(name='foo', address=0x123, size=0x1, binding=<SymbolBinding.GLOBAL: 2>, kind=<SymbolKind.OBJECT: 1>)
        >>> prog.symbol(0x100)
        Traceback (most recent call last):
          File "<console>", line 1, in <module>
        LookupError: not found
        >>> prog.symbol(0x123)
        Symbol(name='foo', address=0x123, size=0x1, binding=<SymbolBinding.GLOBAL: 2>, kind=<SymbolKind.OBJECT: 1>)
    """

    def __init__(self, symbols: Iterable[Symbol]) -> None:
        """
        Create a ``SymbolIndex`` from an iterable of symbols.

        The returned symbol index satisfies the Symbol Finder API. It supports
        overlapping symbol address ranges and duplicate symbol names. However,
        in the case of these sorts of conflicts, it doesn't provide any
        guarantee on the order of the results, or which result is returned when
        a single symbol is requested.

        :param symbols: An iterable of symbols
        :returns: A callable object suitable to provide to
            :meth:`Program.register_symbol_finder()`.
        """

    def __call__(
        self,
        prog: Program,
        name: Optional[str],
        address: Optional[int],
        one: bool,
    ) -> List[Symbol]:
        """
        Look up symbols by name, address, or both.

        :param prog: (unused) the program looking up this symbol
        :param name: if given, only return symbols with this name
        :param address: if given, only return symbols spanning this address
        :param one: if ``True``, limit the result to a single symbol
        :returns: a list of matching symbols (empty if none are found)
        """

class SymbolBinding(enum.Enum):
"""
A ``SymbolBinding`` describes the linkage behavior and visibility of a
Expand Down Expand Up @@ -2776,3 +2839,8 @@ def _linux_helper_pid_task(pid: Object, pid_type: IntegerLike) -> Object:
def _linux_helper_find_task(__ns: Object, __pid: IntegerLike) -> Object: ...
def _linux_helper_kaslr_offset(__prog: Program) -> int: ...
def _linux_helper_pgtable_l5_enabled(__prog: Program) -> bool: ...

# Used by drgn.helpers.linux.kallsyms.load_vmlinux_kallsyms(), which calls it
# with no arguments when the program is live and the effective UID is 0.
# NOTE(review): presumably filename=None means "/proc/kallsyms" and modules
# selects whether module symbols are included -- confirm against the C code.
def _linux_helper_load_proc_kallsyms(
    filename: Optional[str] = None,
    modules: bool = False,
) -> SymbolIndex: ...

# NOTE(review): drgn.helpers.linux.kallsyms calls this as
# _linux_helper_load_builtin_kallsyms(prog, *args), where args holds eight
# integer addresses taken from VMCOREINFO (0 for any that are missing). This
# stub declares only ``prog``; confirm the real signature and extend the stub
# so the extra positional arguments are type checked.
def _linux_helper_load_builtin_kallsyms(prog: Program) -> SymbolIndex: ...
1 change: 1 addition & 0 deletions docs/api_reference.rst
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,7 @@ Symbols
.. drgndoc:: Symbol
.. drgndoc:: SymbolBinding
.. drgndoc:: SymbolKind
.. drgndoc:: SymbolIndex

Stack Traces
------------
Expand Down
2 changes: 2 additions & 0 deletions drgn/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@
StackTrace,
Symbol,
SymbolBinding,
SymbolIndex,
SymbolKind,
Thread,
Type,
Expand Down Expand Up @@ -127,6 +128,7 @@
"StackTrace",
"Symbol",
"SymbolBinding",
"SymbolIndex",
"SymbolKind",
"Thread",
"Type",
Expand Down
237 changes: 237 additions & 0 deletions drgn/helpers/linux/kallsyms.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,237 @@
# Copyright (c) 2024 Oracle and/or its affiliates
# SPDX-License-Identifier: LGPL-2.1-or-later
"""
Kallsyms
--------

The ``drgn.helpers.linux.kallsyms`` module contains helpers which allow you to
use the built-in kallsyms symbol table for drgn symbol lookup. Combined with an
alternative type information source, this can enable debugging Linux kernel core
dumps without the corresponding DWARF debuginfo files. Even without type
information, kallsyms can be used to help locate objects, and drgn's low-level
memory reading functions can be used to do basic debugging tasks.
"""
import os
import re
from typing import Dict, List, Tuple

from _drgn import (
_linux_helper_load_builtin_kallsyms,
_linux_helper_load_proc_kallsyms as _load_proc_kallsyms,
)
from drgn import (
Object,
Program,
ProgramFlags,
Symbol,
SymbolBinding,
SymbolIndex,
SymbolKind,
)
from drgn.helpers.linux.module import for_each_module

__all__ = (
"load_vmlinux_kallsyms",
"load_module_kallsyms",
)


def _vmcoreinfo_symbols(prog: Program) -> Dict[str, int]:
    """
    Collect the ``SYMBOL(name)=address`` entries of the VMCOREINFO note.

    :param prog: program whose ``VMCOREINFO`` object holds the note text
    :returns: mapping from symbol name to address; lines which are not
        ``SYMBOL()`` entries are ignored
    """
    note_text = prog["VMCOREINFO"].string_().decode("ascii")
    entry_re = re.compile(r"SYMBOL\(([^)]+)\)=([A-Fa-f0-9]+)")
    candidates = (
        entry_re.fullmatch(entry) for entry in note_text.strip().split("\n")
    )
    # The address is parsed as hexadecimal (the pattern allows no "0x" prefix).
    return {m.group(1): int(m.group(2), 16) for m in candidates if m}


def _load_builtin_kallsyms(prog: Program) -> SymbolIndex:
    """
    Build a symbol index from the kallsyms tables located via VMCOREINFO.

    :param prog: program to read VMCOREINFO from
    :returns: the symbol index produced by the C helper
    """
    # The order of this tuple is the positional argument order expected by
    # _linux_helper_load_builtin_kallsyms() after ``prog``.
    required_names = (
        "kallsyms_names",
        "kallsyms_token_table",
        "kallsyms_token_index",
        "kallsyms_num_syms",
        "kallsyms_offsets",
        "kallsyms_relative_base",
        "kallsyms_addresses",
        "_stext",
    )
    known_addresses = _vmcoreinfo_symbols(prog)
    # A symbol that VMCOREINFO does not list is passed as address 0.
    return _linux_helper_load_builtin_kallsyms(
        prog, *(known_addresses.get(name, 0) for name in required_names)
    )


def load_vmlinux_kallsyms(prog: Program) -> SymbolIndex:
    """
    Create a kallsyms index for vmlinux

    Load the kallsyms of the core kernel and return them as a symbol index. No
    kernel debuginfo needs to be loaded for this to work. The symbols come
    from one of two sources: ``/proc/kallsyms`` (which requires running drgn
    as root), or the kernel's internal data structures, located using the
    VMCOREINFO note (which requires Linux 6.0 or later, or a backport of
    commit ``f09bddbd86619 ("vmcoreinfo: add kallsyms_num_syms symbol")`` and
    its dependencies).

    :returns: a symbol index containing kallsyms for the core kernel (vmlinux)
    """
    # Only a live program debugged with an effective UID of 0 uses
    # /proc/kallsyms; everything else goes through VMCOREINFO.
    if not (prog.flags & ProgramFlags.IS_LIVE) or os.geteuid() != 0:
        return _load_builtin_kallsyms(prog)
    return _load_proc_kallsyms()


def _nm_type_to_binding_kind(code: str) -> Tuple[SymbolBinding, SymbolKind]:
    """
    Infer a symbol's binding and kind from its nm(1) type character.

    The letter is matched case-insensitively to determine the kind, so that
    ``V``/``W`` are treated as weak symbols just like ``v``/``w``. For any
    other letter, an uppercase code is reported as a global binding.

    :param code: single-character nm(1) symbol type code
    :returns: ``(binding, kind)``; either may be ``UNKNOWN`` if it cannot be
        inferred from the code (including when ``code`` is not exactly one
        character long)
    """
    binding = SymbolBinding.UNKNOWN
    kind = SymbolKind.UNKNOWN
    letter = code.lower()
    if letter == "v":
        # Weak object.
        binding = SymbolBinding.WEAK
        kind = SymbolKind.OBJECT
    elif letter == "w":
        # Weak symbol not tagged as an object.
        binding = SymbolBinding.WEAK
    elif letter == "t":
        kind = SymbolKind.FUNC
    elif len(letter) == 1 and letter in "srbgncd":
        # The length check keeps the substring test from matching "" or
        # multi-character strings such as "sr".
        kind = SymbolKind.OBJECT
    if binding == SymbolBinding.UNKNOWN and code.isupper():
        binding = SymbolBinding.GLOBAL
    return binding, kind


def _st_info_to_binding_kind(info: int) -> Tuple[SymbolBinding, SymbolKind]:
    """
    Split an ELF ``st_info`` value into a drgn binding and kind.

    :param info: raw ``st_info`` value of an ELF symbol
    :returns: ``(binding, kind)``; values outside the recognized ranges map to
        ``UNKNOWN``
    """
    STB_WEAK = 2
    STB_GNU_UNIQUE = 10
    STT_TLS = 6
    STT_GNU_IFUNC = 10

    # The binding is everything above the low four bits; the type is the low
    # four bits.
    elf_binding = info >> 4
    elf_type = info & 0xF

    # SymbolBinding members are numbered one higher than the ELF binding.
    binding = (
        SymbolBinding(elf_binding + 1)
        if elf_binding <= STB_WEAK or elf_binding == STB_GNU_UNIQUE
        else SymbolBinding.UNKNOWN
    )
    # SymbolKind members use the ELF type number directly.
    kind = (
        SymbolKind(elf_type)
        if elf_type <= STT_TLS or elf_type == STT_GNU_IFUNC
        else SymbolKind.UNKNOWN
    )
    return binding, kind


def _elf_sym_to_symbol(name: str, obj: Object, has_typetab: bool) -> Symbol:
    """
    Convert an ELF symbol object read from kernel memory into a drgn symbol.

    :param name: symbol name, already extracted from the string table
    :param obj: ELF symbol object providing ``st_info``, ``st_value`` and
        ``st_size``
    :param has_typetab: whether ``struct mod_kallsyms`` has a ``typetab``
        member, which decides how ``st_info`` is interpreted
    :returns: the equivalent :class:`Symbol`
    """
    # Linux wants the nm(1) character code of each symbol, which it calls the
    # symbol's "type" (not to be confused with the ELF symbol type, or with
    # what drgn calls a "type"...).
    #
    # Before 5439c985c5a8 ("module: Overwrite st_size instead of st_info"),
    # merged in v5.0, the kernel stored that single-character nm(1) code by
    # overwriting the "st_info" field. That commit changed it to overwrite the
    # "st_size" field instead! v5.2 thankfully fixed this with 1c7651f43777
    # ("kallsyms: store type information in its own array").
    #
    # That leaves three possibilities:
    # 1. Pre-v5.0: "st_info" is an nm(1) character, from which the kind and
    #    binding have to be inferred.
    # 2. 5.0-5.2: "st_info" is a normal ELF value, but "st_size" is bogus and
    #    must be ignored.
    # 3. 5.2+: both fields are valid, and the nm(1) code lives in "typetab".
    #
    # Case 3 is easy to detect: "struct mod_kallsyms" has a "typetab" member.
    # Cases 1 & 2 cannot be told apart, so we treat them both as case 1.
    # Neither 5.0 nor 5.1 was an LTS kernel, and no major distro actively uses
    # them. We have no way to deal with 5.0 or 5.1, whereas for pre-5.0 we can
    # make informed guesses from the nm(1) code.
    info = obj.st_info.value_()
    if has_typetab:
        binding, kind = _st_info_to_binding_kind(info)
    else:
        binding, kind = _nm_type_to_binding_kind(chr(info))
    address = obj.st_value.value_()
    size = obj.st_size.value_()
    return Symbol(name, address, size, binding, kind)  # type: ignore


def _module_kallsyms(module: Object) -> List[Symbol]:
    """
    Return a list of symbols for a kernel module

    When compiled with ``CONFIG_KALLSYMS``, the kernel maintains ELF symbol
    information about each module within ``struct module``. This function
    accesses this symbol information, and returns a list of drgn :class:`Symbol`
    objects for the module. Keep in mind that unless ``CONFIG_KALLSYMS_ALL`` is
    enabled, these symbols are typically only function symbols.

    :param module: :class:`Object` of type ``struct module *``
    :returns: a list of symbols (empty if the module has no named symbols)
    """
    try:
        ks = module.kallsyms
    except AttributeError:
        # Prior to 8244062ef1e54 ("modules: fix longstanding /proc/kallsyms vs
        # module insertion race."), the kallsyms variables were stored directly
        # on the module object. This commit was introduced in 4.5, but was
        # backported to some stable kernels too. Fall back to the module object
        # in cases where kallsyms field isn't available.
        ks = module

    prog = module.prog_
    # Read the count exactly once: it sizes the array below, so the loop must
    # use the same value rather than re-reading target memory.
    num_symtab = ks.num_symtab.value_()
    if not num_symtab:
        # Nothing to read; indexing an empty symbol table would fail.
        return []

    try:
        ks.member_("typetab")
    except LookupError:
        has_typetab = False
    else:
        has_typetab = True

    # The symtab field is a pointer, but it points at an array of Elf_Sym
    # objects. Indexing it requires drgn to do pointer arithmetic and issue a
    # lot of very small /proc/kcore reads, which can be a real performance
    # issue. So convert it into an object representing a correctly-sized array,
    # and then read that object all at once. This does one /proc/kcore read,
    # which is a major improvement!
    symtab = Object(
        prog,
        type=prog.array_type(ks.symtab.type_.type, num_symtab),
        address=ks.symtab.value_(),
    ).read_()

    # Pair every named symbol with its string table offset. Entries whose
    # st_name is 0 have no name and are skipped.
    named = []
    for i in range(num_symtab):
        elfsym = symtab[i]
        str_index = elfsym.st_name.value_()
        if str_index:
            named.append((str_index, elfsym))
    if not named:
        return []

    # The strtab is similarly a pointer into a contiguous array of strings
    # packed next to each other. Reading individual strings from /proc/kcore
    # can be quite slow. So read the entire array of bytes into a Python bytes
    # value, and we'll extract the individual symbol strings from there.
    #
    # Size the read from the highest name offset in use instead of assuming
    # that the last symbol table entry owns the last string. That way every
    # name, including its NUL terminator, is inside the buffer.
    last_string_start = max(str_index for str_index, _ in named)
    last_string_len = len(ks.strtab[last_string_start].address_of_().string_()) + 1
    strtab = prog.read(ks.strtab.value_(), last_string_start + last_string_len)

    syms = []
    for str_index, elfsym in named:
        nul_byte = strtab.find(b"\x00", str_index)
        name = strtab[str_index:nul_byte].decode("ascii")
        syms.append(_elf_sym_to_symbol(name, elfsym, has_typetab))
    return syms


def load_module_kallsyms(prog: Program) -> SymbolIndex:
    """
    Return a symbol index containing all module symbols from kallsyms

    Kernels built with ``CONFIG_KALLSYMS`` keep an ELF symbol table for each
    loaded module in kernel memory. This function parses those data structures
    and builds a symbol index usable by drgn from them. Note that it requires
    that you already have debuginfo for the vmlinux image.

    :returns: a symbol index containing all symbols from module kallsyms
    """
    return SymbolIndex(
        [
            symbol
            for module in for_each_module(prog)
            for symbol in _module_kallsyms(module)
        ]
    )
4 changes: 4 additions & 0 deletions libdrgn/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,8 @@ libdrgnimpl_la_SOURCES = $(ARCH_DEFS_PYS:_defs.py=.c) \
helpers.h \
io.c \
io.h \
kallsyms.c \
kallsyms.h \
language.c \
language.h \
language_c.c \
Expand Down Expand Up @@ -116,6 +118,7 @@ libdrgnimpl_la_SOURCES = $(ARCH_DEFS_PYS:_defs.py=.c) \
symbol.h \
type.c \
type.h \
util.c \
util.h \
vector.h

Expand Down Expand Up @@ -170,6 +173,7 @@ _drgn_la_SOURCES = python/constants.c \
python/program.c \
python/stack_trace.c \
python/symbol.c \
python/symbol_index.c \
python/test.c \
python/thread.c \
python/type.c \
Expand Down
Loading