Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add the list_lru iterator helper #74

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions doc/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,14 @@ drgn\_tools.itertools module
:undoc-members:
:show-inheritance:

drgn\_tools.list\_lru module
----------------------------

.. automodule:: drgn_tools.list_lru
:members:
:undoc-members:
:show-inheritance:

drgn\_tools.lock module
-----------------------

Expand Down
236 changes: 236 additions & 0 deletions drgn_tools/list_lru.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,236 @@
# Copyright (c) 2024, Oracle and/or its affiliates.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
"""
LRU Lists
------------

Helper to work with LRU lists. LRU can be created to be memcg aware and
ordered by NUMA node.

The routines iterate through the specified LRU and on NUMA machines, the
output keeps the entries ordered by NUMA node.

The list_lru_from_memcg_node_for_each_entry() function allows the user to
restrict the iteration by the memcg index when the list_lru is memcg_aware
and the NUMA node identifier.

list_lru_kmem_to_memcgidx() is a helper to find the mem_cgroup index
from a list_lru kvm address. This helper will find the memcg of a list_lru
address. This routine is only interested in slab allocated entries and does
not check nor handle the MEMCG_DATA_KMEM case.
"""
from typing import Iterator
from typing import Union

from drgn import cast
from drgn import IntegerLike
from drgn import NULL
from drgn import Object
from drgn import Program
from drgn import Type
from drgn.helpers.linux.list import list_empty
from drgn.helpers.linux.list import list_for_each_entry
from drgn.helpers.linux.mm import compound_head
from drgn.helpers.linux.mm import page_to_virt
from drgn.helpers.linux.mm import virt_to_page
from drgn.helpers.linux.nodemask import for_each_online_node
from drgn.helpers.linux.nodemask import node_state
from drgn.helpers.linux.xarray import xa_for_each
from drgn.helpers.linux.xarray import xa_load

from drgn_tools.util import has_member

# Flag bits carried in the low bits of a page's memcg data word.
# list_lru_kmem_to_memcgidx() tests MEMCG_DATA_OBJCGS and subtracts it to
# recover the address of the obj_cgroup pointer vector.
MEMCG_DATA_OBJCGS = 1
# Not referenced by the helpers in this module; the module docstring states
# that the MEMCG_DATA_KMEM case is not handled.
MEMCG_DATA_KMEM = 2

# Names exported as the public API of this module.
__all__ = (
"list_lru_for_each_entry",
"list_lru_from_memcg_node_for_each_entry",
"list_lru_kmem_to_memcgidx",
)


def list_lru_for_each_entry(
    prog: Program, type: Union[str, Type], lru: Object, member: str
) -> Iterator[Object]:
    """
    Walk every entry linked on a list_lru, one online NUMA node at a time.

    Three layouts are handled, selected by probing the members of ``lru``:

    * memcg-aware with an xarray (``lru.ext.xa`` or ``lru.xa``): every
      ``struct list_lru_memcg`` stored in the xarray is visited per node.
    * memcg-aware with the ``memcg_lrus`` array: indices below
      ``memcg_nr_cache_ids`` are visited per node.
    * not memcg-aware: the per-node ``lru`` list is walked directly.

    :param prog: Kernel being debugged
    :param type: Entry type.
    :param lru: ``struct list_lru *``
    :param member: Name of list node member in entry type.
    :return: Iterator of ``type *`` objects.
    """
    # Two independent hints mark the LRU as memcg aware; both are probed.
    is_aware = False
    if has_member(lru, "memcg_aware") and lru.memcg_aware:
        is_aware = True
    if has_member(lru, "node"):
        # uek7 has no lru.node, but there the memcg_aware flag above applies.
        if has_member(lru.node, "memcg_lrus") and lru.node[0].memcg_lrus:
            is_aware = True

    if not is_aware:
        # Plain LRU: one list per NUMA node.
        for nid in for_each_online_node(prog):
            if has_member(lru, "ext"):
                head = lru.ext.node[nid].lru.list
            else:
                head = lru.node[nid].lru.list
            yield from list_for_each_entry(type, head.address_of_(), member)
        return

    # NOTE(review): neither memcg-aware path below visits the per-node
    # lru.node[nid].lru list. Presumably entries not charged to any memcg
    # live there -- confirm whether skipping them is intended.
    if not (has_member(lru, "ext") or has_member(lru, "xa")):
        # Before v5.13 the per-memcg lists sit in an array.
        # The outer loop keeps the entries grouped by NUMA node.
        for nid in for_each_online_node(prog):
            idx = 0
            while idx < prog["memcg_nr_cache_ids"]:
                head = lru.node[nid].memcg_lrus.lru[idx].list
                idx += 1
                if not list_empty(head.address_of_()):
                    yield from list_for_each_entry(
                        type, head.address_of_(), member
                    )
        return

    # v5.13 (uek7) or newer: per-memcg lists are stored in an xarray.
    # uek7 has a UEK_KABI_REPLACE of node to ext; uek8 has lru.xa directly.
    xa = lru.ext.xa if has_member(lru, "ext") else lru.xa
    # The outer loop keeps the entries grouped by NUMA node.
    for nid in for_each_online_node(prog):
        for _, slot in xa_for_each(xa.address_of_()):
            # xa_for_each() hands back void pointers; give them their type.
            per_memcg = Object(prog, "struct list_lru_memcg *", slot)
            one = per_memcg.node[nid]
            if one.nr_items > 0:
                yield from list_for_each_entry(
                    type, one.list.address_of_(), member
                )


def list_lru_from_memcg_node_for_each_entry(
    prog: Program,
    mindx: IntegerLike,
    nid: IntegerLike,
    type: Union[str, Type],
    lru: Object,
    member: str,
) -> Iterator[Object]:
    """
    Walk the entries of a list_lru that belong to one memcg on one NUMA node.

    Nothing is yielded when ``nid`` is not an online node, when the memcg
    index has no list in a memcg-aware LRU, or when the selected list is
    empty. For an LRU that is not memcg aware, ``mindx`` is ignored and the
    per-node list is walked.

    :param prog: Kernel being debugged
    :param mindx: memcg index.
    :param nid: NUMA node ID.
    :param type: Entry type.
    :param lru: ``struct list_lru *``
    :param member: Name of list node member in entry type.
    :return: Iterator of ``type *`` objects.
    """
    if not node_state(nid, prog["N_ONLINE"]):
        return

    # Two independent hints mark the LRU as memcg aware; both are probed.
    is_aware = False
    if has_member(lru, "memcg_aware") and lru.memcg_aware:
        is_aware = True
    if has_member(lru, "node"):
        # uek7 has no lru.node, but there the memcg_aware flag above applies.
        if has_member(lru.node, "memcg_lrus") and lru.node[0].memcg_lrus:
            is_aware = True

    if not is_aware:
        # Plain LRU: the memcg index plays no part in the lookup.
        if has_member(lru, "ext"):
            head = lru.ext.node[nid].lru.list
        else:
            head = lru.node[nid].lru.list
        yield from list_for_each_entry(type, head.address_of_(), member)
        return

    if not (has_member(lru, "ext") or has_member(lru, "xa")):
        # Before v5.13: array layout, so bounds-check the memcg index first.
        if 0 <= mindx < prog["memcg_nr_cache_ids"]:
            head = lru.node[nid].memcg_lrus.lru[mindx].list
            if not list_empty(head.address_of_()):
                yield from list_for_each_entry(
                    type, head.address_of_(), member
                )
        return

    # v5.13 (uek7) or newer: per-memcg lists are stored in an xarray.
    # uek7 has a UEK_KABI_REPLACE of node to ext; uek8 has lru.xa directly.
    xa = lru.ext.xa if has_member(lru, "ext") else lru.xa
    slot = xa_load(xa.address_of_(), mindx)
    # A NULL slot means there is no list for this memcg index.
    if slot != NULL(prog, "void *"):
        per_memcg = Object(prog, "struct list_lru_memcg *", slot)
        one = per_memcg.node[nid]
        if one.nr_items > 0:
            yield from list_for_each_entry(
                type, one.list.address_of_(), member
            )


def list_lru_kmem_to_memcgidx(prog: Program, kvm: IntegerLike) -> IntegerLike:
    """
    Map the address of a slab-allocated, embedded list_lru node to the index
    of the memcg it is accounted to.

    Memory cgroups for slab allocations are tracked per object, so the
    object's slot in its slab page selects the obj_cgroup to consult. Only
    the MEMCG_DATA_OBJCGS encoding is handled; the MEMCG_DATA_KMEM case is
    NOT covered by this routine.

    :param prog: Kernel being debugged
    :param kvm: address of a list_lru
    :return: memcg index, -1 means not found (the list_lru is not memcg
        enabled or the value could not be determined)
    """
    # Review note from the author: in theory the NUMA node id could also be
    # derived from the page pointer, but it can be encoded in more than one
    # way:
    #   1. in the page flags, at an offset that depends on what else is
    #      embedded there (section, zone, last cpupid);
    #   2. in the section_to_node_table[] array, indexed by the section that
    #      is encoded in the page flags.
    # Checking for section_to_node_table[] is easy enough, but the flag
    # offsets/widths are CONFIG dependent and less obvious for drgn to find
    # automatically. On the author's hosts, disassembly shows the node is
    # (page.flags >> 0x36).
    head = compound_head(virt_to_page(prog, kvm))
    tracks_memcg_data = has_member(head, "memcg_data")

    if not (tracks_memcg_data or has_member(head, "obj_cgroups")):
        # Before v5.13: the memcg hangs off the slab cache itself.
        if head.slab_cache == NULL(prog, "struct kmem_cache *"):
            return -1
        return head.slab_cache.memcg_params.memcg.kmemcg_id

    # page_objcgs_check(): MEMCG_DATA_OBJCGS memcgs are managed per object.
    if tracks_memcg_data:
        data = head.memcg_data
    else:
        # Treat the pointer as an integer so the flag bit can be tested.
        data = cast("unsigned long", head.obj_cgroups)

    if not (data & MEMCG_DATA_OBJCGS):
        return -1

    # Strip the flag bit to get the obj_cgroup pointer vector.
    cg_vec = Object(prog, "struct obj_cgroup **", data - MEMCG_DATA_OBJCGS)
    # Inputs for computing the object's slot within the slab page.
    page_base = page_to_virt(head)
    obj_addr = Object(prog, "void *", kvm)

    if has_member(head, "slab_cache"):
        cache = head.slab_cache
    elif head.flags & (1 << prog.constant("PG_slab")):
        # v5.17 (uek8) moved the kmem_cache to a new slab structure.
        cache = Object(prog, "struct slab *", head).slab_cache
    else:
        return -1

    slot = (obj_addr - page_base) / cache.size
    memcg = cg_vec[slot].memcg
    if memcg == NULL(prog, "struct mem_cgroup *"):
        return -1
    return memcg.kmemcg_id
143 changes: 143 additions & 0 deletions tests/test_list_lru.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
# Copyright (c) 2024, Oracle and/or its affiliates.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
from typing import Optional
from typing import Union

from drgn import cast
from drgn import IntegerLike
from drgn import Object
from drgn import Path
from drgn import Program
from drgn.helpers.common.format import escape_ascii_string
from drgn.helpers.linux.fs import for_each_mount
from drgn.helpers.linux.fs import mount_dst
from drgn.helpers.linux.fs import mount_fstype
from drgn.helpers.linux.nodemask import for_each_online_node

from drgn_tools.list_lru import list_lru_for_each_entry
from drgn_tools.list_lru import list_lru_from_memcg_node_for_each_entry
from drgn_tools.list_lru import list_lru_kmem_to_memcgidx


def test_list_lru(
    prog: Program,
    *,
    dst: Optional[Path] = None,
    fstype: Optional[Union[str, bytes]] = None,
    verbose: Optional[IntegerLike] = None,
) -> None:
    """
    Exercise the list_lru helpers against the mounted filesystems.

    For every mount, walk the superblock's dentry LRU (memcg aware), look up
    the memcg index of each entry from its address, and check that the entry
    is found again when the LRU is walked by memcg index and NUMA node. For
    xfs mounts, repeat the walk on the xfs_buf LRU (memcg unaware), where the
    memcg lookup is expected to yield -1 and index 0 is used instead.

    Results are printed as error counts; nothing is asserted.

    :param prog: Kernel being debugged
    :param dst: Only test mounts with this destination path.
    :param fstype: Only test mounts with this filesystem type.
    :param verbose: Print per-entry progress when not ``None``.
    """
    for mnt in for_each_mount(
        prog,
        src=None,
        dst=dst,
        fstype=fstype,
    ):
        mnt_dst = escape_ascii_string(mount_dst(mnt), escape_backslash=True)
        mnt_fstype = escape_ascii_string(
            mount_fstype(mnt), escape_backslash=True
        )
        sb = mnt.mnt.mnt_sb
        lru = sb.s_dentry_lru
        d_cnt = 0
        errors = 0
        # Iterate through the dentry lru, find the memcg index and verify the
        # entry can be found by memcg.
        print(
            f"memcg aware test on {mnt_dst} dentry lru at {hex(lru.address_of_())}"
        )
        for dentry in list_lru_for_each_entry(
            prog, "struct dentry", lru.address_of_(), "d_lru"
        ):
            d_cnt += 1
            dlru = cast("unsigned long", dentry.d_lru.address_of_())
            memcg = list_lru_kmem_to_memcgidx(prog, dlru)
            if memcg == -1:
                if verbose is not None:
                    print(f"lru for dentry {hex(dentry)} not found")
                errors += 1
                continue

            found = 0
            for n in for_each_online_node(prog):
                # Look for the entry if not found in an earlier NUMA node.
                if found == 0:
                    if verbose is not None:
                        print(
                            f"looking for dentry {hex(dentry)} in memcg {memcg} node {n}"
                        )
                    for dentry2 in list_lru_from_memcg_node_for_each_entry(
                        prog,
                        memcg,
                        n,
                        "struct dentry",
                        lru.address_of_(),
                        "d_lru",
                    ):
                        if hex(dentry) == hex(dentry2):
                            found = 1
                            break
            if found == 0:
                if verbose is not None:
                    print(
                        f"lru for dentry {hex(dentry)} NOT found at mencg idx {memcg}"
                    )
                errors += 1
            elif verbose is not None:
                print(
                    f"lru for dentry {hex(dentry)} FOUND at mencg idx {memcg}"
                )
        print(f"errors {errors} in {d_cnt} dentrys")

        if mnt_fstype == "xfs":
            # memcg unaware lru
            mp = Object(prog, "struct xfs_mount *", sb.s_fs_info)
            lru = mp.m_ddev_targp.bt_lru
            # Print only after rebinding lru, so the address reported is the
            # xfs_buf LRU and not the dentry LRU tested above.
            print(
                f"memcg unaware test on {mnt_dst} xfs_buf lru at {hex(lru.address_of_())}"
            )
            d_cnt = 0
            errors = 0
            # Iterate through the xfs_buf lru; the memcg index will be -1.
            for bp in list_lru_for_each_entry(
                prog, "struct xfs_buf", lru.address_of_(), "b_lru"
            ):
                d_cnt += 1
                bplru = cast("unsigned long", bp.b_lru.address_of_())
                memcg = list_lru_kmem_to_memcgidx(prog, bplru)
                if memcg == -1:
                    if verbose is not None:
                        print(f"lru for bp {hex(bp)} not found")
                    # Fall back to index 0 for the per-node walk below.
                    memcg = 0
                found = 0
                for n in for_each_online_node(prog):
                    if verbose is not None:
                        print(
                            f"looking for bmap {hex(bp)} in memcg {memcg} node {n}"
                        )
                    for bp2 in list_lru_from_memcg_node_for_each_entry(
                        prog,
                        memcg,
                        n,
                        "struct xfs_buf",
                        lru.address_of_(),
                        "b_lru",
                    ):
                        # Compare against the entry from the second walk;
                        # comparing bp with itself would always match.
                        if hex(bp) == hex(bp2):
                            found = 1
                            break
                if found == 0:
                    if verbose is not None:
                        print(
                            f"lru for bp {hex(bp)} NOT found at mencg idx {memcg}"
                        )
                    errors += 1
                elif verbose is not None:
                    print(
                        f"lru for bp {hex(bp)} FOUND at mencg idx {memcg}"
                    )
            print(f"errors {errors} in {d_cnt} bps")