From 2073e390ce5136735f4863fc7f1b56782f9322b6 Mon Sep 17 00:00:00 2001 From: Stephen Brennan Date: Tue, 27 Aug 2024 15:49:17 -0700 Subject: [PATCH 1/2] slabinfo: catch errors due to freelist corruption Freelist corruption results in the "slabinfo" module crashing. Since this is a common symptom of use-after-free bugs, we'd rather give useful information about this case. So don't crash the module. Catch the error and report corruption issues at the end. This also helps in certain cases where we are running against a live kernel, and the freelist is not corrupt, but it changed by the time we decoded the pointer. As a result, we print different messages at different times: for live systems, we say this may be transient, but for core dumps, we say it indicates a potential use-after-free bug. To do this, we have to implement a rather sketchy workaround to use the _SlabCacheHelperSlub from drgn. This is manually verified to work on 0.0.25 through 0.0.27, which are the only supported drgn versions. But, we need to work on upstream tweaks to improve the slab helpers, so we don't need to rely on the hack. 
Signed-off-by: Stephen Brennan --- drgn_tools/slabinfo.py | 77 +++++++++++++++++++++++++++++++++++++----- 1 file changed, 69 insertions(+), 8 deletions(-) diff --git a/drgn_tools/slabinfo.py b/drgn_tools/slabinfo.py index fd1bdd13..2a5a0436 100644 --- a/drgn_tools/slabinfo.py +++ b/drgn_tools/slabinfo.py @@ -4,13 +4,16 @@ Helper to view slabinfo data """ import argparse +from typing import List from typing import NamedTuple from typing import Set from typing import Tuple from drgn import cast +from drgn import FaultError from drgn import Object from drgn import Program +from drgn import ProgramFlags from drgn import Type from drgn.helpers.linux.cpumask import for_each_present_cpu from drgn.helpers.linux.list import list_for_each_entry @@ -40,6 +43,8 @@ class SlabCacheInfo(NamedTuple): """Slab size""" name: str """Name of the slab cache""" + freelist_corrupt_cpus: List[int] + """A list of CPUs for which the freelist was found to be corrupt""" def _slab_type(prog: Program) -> Type: @@ -204,19 +209,41 @@ def slub_per_cpu_partial_free(cpu_partial: Object) -> int: return partial_free -def kmem_cache_slub_info(cache: Object) -> Tuple[int, int]: +class _CpuSlubWrapper: + def __init__(self, obj): + self._obj = obj + + def __getattr__(self, key): + if key == "cpu_slab": + raise AttributeError("CpuSlubWrapper!") + return self._obj.__getattribute__(key) + + +def kmem_cache_slub_info(cache: Object) -> Tuple[int, int, List[int]]: """ For given kmem_cache object, parse through each cpu and get number of total slabs and free objects + If the CPU freelist was corrupt, then we do our best effort to count free + objects, but we may undercount them. We set the corruption flag when this + happens. 
+ :param: ``struct kmem_cache`` drgn object - :returns: total slabs, free objects + :returns: total slabs, free objects, corruption instances """ prog = cache.prog_ use_slab = _has_struct_slab(prog) total_slabs = objects = free_objects = 0 - slub_helper = _get_slab_cache_helper(cache) + + # The "cpu_slab" variable is used by the slab helper to preload the percpu + # freelists. Not only does this duplicate work we're about to do, but also + # corrupt slab caches will crash this function before we can detect which + # CPU is corrupt. Pretend we have no "cpu_slab" variable when getting the + # helper. This depends on implementation details: we will improve the helper + # upstream to avoid this for the future. + slub_helper = _get_slab_cache_helper(_CpuSlubWrapper(cache)) + corrupt = [] for cpuid in for_each_present_cpu(prog): per_cpu_slab = per_cpu_ptr(cache.cpu_slab, cpuid) @@ -237,15 +264,25 @@ def kmem_cache_slub_info(cache: Object) -> Tuple[int, int]: objects = 0 free_objects += objects - page_inuse - cpu_free_objects = slub_get_cpu_freelist_cnt(cpu_freelist, slub_helper) - free_objects += cpu_free_objects + + # Easily the most common form of corruption in the slab allocator comes + # from use after free, which overwrites the freelist pointer and causes + # a fault error. Catch this and report it for later. 
+ try: + cpu_free_objects = slub_get_cpu_freelist_cnt( + cpu_freelist, slub_helper + ) + except FaultError: + corrupt.append(cpuid) + else: + free_objects += cpu_free_objects partial_frees = slub_per_cpu_partial_free(cpu_partial) free_objects += partial_frees total_slabs += 1 - return total_slabs, free_objects + return total_slabs, free_objects, corrupt def get_kmem_cache_slub_info(cache: Object) -> SlabCacheInfo: @@ -255,7 +292,7 @@ def get_kmem_cache_slub_info(cache: Object) -> SlabCacheInfo: :param cache: ``struct kmem_cache`` drgn object :returns: a :class:`SlabCacheInfo` with statistics about the cache """ - total_slabs, free_objects = kmem_cache_slub_info(cache) + total_slabs, free_objects, corrupt = kmem_cache_slub_info(cache) ( nr_slabs, nr_total_objs, @@ -280,6 +317,7 @@ def get_kmem_cache_slub_info(cache: Object) -> SlabCacheInfo: total_slabs, ssize, cache.name.string_().decode("utf-8"), + corrupt, ) @@ -296,12 +334,17 @@ def print_slab_info(prog: Program) -> None: "NAME", ] ) + corruption = [] for cache in for_each_slab_cache(prog): slabinfo = get_kmem_cache_slub_info(cache) + maybe_asterisk = "" + if slabinfo.freelist_corrupt_cpus: + maybe_asterisk = "*" + corruption.append(slabinfo) table.row( slabinfo.cache.value_(), slabinfo.objsize, - slabinfo.allocated, + f"{slabinfo.allocated}{maybe_asterisk}", slabinfo.total, slabinfo.nr_slabs, f"{int(slabinfo.ssize / 1024)}k", @@ -309,6 +352,24 @@ def print_slab_info(prog: Program) -> None: ) table.write() + if corruption: + if prog.flags & ProgramFlags.IS_LIVE: + print( + "NOTE: freelist corruption was detected. This is not " + "necessarily an error, as live systems may encounter race " + "conditions." + ) + else: + print( + "WARNING: freelist corruption was detected. It is likely that " + "a use-after-free bug occurred." 
+ ) + table = FixedTable(["CACHE:<24s", "CORRUPT CPUS"]) + for slabinfo in corruption: + cpus = ", ".join(map(str, slabinfo.freelist_corrupt_cpus)) + table.row(slabinfo.name, cpus) + table.write() + class SlabInfo(CorelensModule): """Print info about each slab cache""" From 3afc56146f54d09dfd1f6d3c1b7436eda7e638be Mon Sep 17 00:00:00 2001 From: Stephen Brennan Date: Mon, 26 Aug 2024 10:21:38 -0700 Subject: [PATCH 2/2] Release v1.1.0 Signed-off-by: Stephen Brennan --- CHANGELOG.md | 54 +++++++++++++++++++++++++++++++++ buildrpm/python-drgn-tools.spec | 5 ++- setup.py | 2 +- 3 files changed, 59 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index cd60aa68..f44a2915 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,11 +1,65 @@ Changelog ========= +Release Cycle +------------- + +Prior to v1, the version numbers loosely followed the rule that new features +would bump the minor version level, and bug fix releases would bump the patch +version level. + +Beginning with v1, a new scheme is adopted which allows for a "development" +version, and a stable version. Using the version numbers `x.y.z`, we have: + +1. The **development** version is of the form `x.0.z`, where `x` represents the + major version under development. Each release is performed by incrementing + `z`, the patch level, regardless of the type of changes. The development + version ends with the release of the "stable" version of `x.1.0`. The + development version is maintained on the `main` branch. +2. The **stable** version is of the form `x.y.z`, where `y >= 1`, and `x` is of + course the major version. The "stable" versions are the only ones which are + released to Oracle Linux as RPMs. Releases will generally increment `z`, the + patch version, for bug fix releases. It's possible that in rare cases, we + will increment `y` for backports, in cases where we want to backport a module + to the stable release. 
The stable version is maintained in a branch named + `stable/vX`, where `X` is replaced with the major version number (e.g. + `stable/v1`). + +The stable version is maintained in parallel as the development version is +developed. Fixes in the stable release must first be present in the development +release (and all newer stable releases, if applicable). + +For the most part, regular maintenance of the stable version will end with the +release of the next stable version, but maintenance may continue at our +discretion. + +Examples: + +- `1.1.0` - the initial public release of the `1.x` stable series. +- `1.1.1` - the first bugfix release of the `1.x` stable series. +- `2.0.0` - the initial development version of the `2.x` series. +- `2.0.1` - an incremental development release in `2.x` development. It may + contain bug fixes or new features. +- `2.1.0` - the initial public release of the `2.x` stable series. + + Unreleased ---------- Changes which are committed to git, but not yet released, may appear here. +1.1.0 - Tue, Aug 27, 2024 +------------------------- + +This is the first public release of drgn-tools! + +* Fixes for the irq, workqueue, bt, & nfs_tools modules. +* Added ls and fsnotify modules. +* Added new helpers for tasks & task states. +* Basic functionality for running on UEK-NEXT (unsupported). +* RPM name updated to "drgn-tools". +* Support for CTF debuginfo added for the locks module. + 0.9.1 - Mon, Apr 22, 2023 ------------------------- diff --git a/buildrpm/python-drgn-tools.spec b/buildrpm/python-drgn-tools.spec index a9da0669..ccc18234 100644 --- a/buildrpm/python-drgn-tools.spec +++ b/buildrpm/python-drgn-tools.spec @@ -1,7 +1,7 @@ # Copyright (c) 2024, Oracle and/or its affiliates. 
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ Name: python-drgn-tools -Version: 0.9.1 +Version: 1.1.0 Release: 1%{?dist} Summary: Helper scripts for drgn, containing the corelens utility @@ -61,6 +61,9 @@ rm %{buildroot}/usr/bin/DRGN %{_mandir}/man1/corelens.1.gz %changelog +* Tue Aug 27 2024 Stephen Brennan - 1.1.0-1 +- Update to 1.1.0 + * Mon Apr 22 2024 Stephen Brennan - 0.9.1-1 - Update to 0.9.1 diff --git a/setup.py b/setup.py index 33f53440..1902c732 100644 --- a/setup.py +++ b/setup.py @@ -9,7 +9,7 @@ long_description = "drgn helper script repository" -RELEASE_VERSION = "0.9.1" +RELEASE_VERSION = "1.1.0" PACKAGES = ["drgn_tools"] if not os.environ.get("DRGN_TOOLS_V2_OMIT"): PACKAGES.append("drgn_tools.v2")