locks: osq: add support to check for optimistic spinners of sleeping lock(s).

We have run into situations where optimistic spinning on a sleeping lock
by multiple CPUs causes high system load, which further leads to other
issues.

This helper allows checking for such spinners.
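
For reference, the helper lives in drgn_tools/lock.py (the file touched by
this commit), so in an interactive drgn session against the vmcore it would
be used roughly as follows (a sketch; the exact session setup may differ):

>>> from drgn_tools.lock import scan_osq_node   # module path per this diff
>>> scan_osq_node(prog)               # verbosity 0: list spinning CPUs
>>> scan_osq_node(prog, verbosity=1)  # also show per-CPU spin durations
>>> scan_osq_node(prog, verbosity=2)  # also dump each spinner's call stack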

Some examples from a vmcore where the system ran into a fork bomb:

>>> scan_osq_node(prog)
There are spinners on one or more osq_lock
CPU(s): [21, 17, 59, 67, 6, 0, 28, 53, 4, 41, 55, 38, 63, 23, 27, 19, 36] are spinning on same osq_lock
CPU(s): [42, 56, 35, 18, 22, 3, 1] are spinning on same osq_lock
CPU(s): [70, 44, 9] are spinning on same osq_lock
CPU(s): [16, 26] are spinning on same osq_lock

>>> scan_osq_node(prog,1)
There are spinners on one or more osq_lock
CPU(s): [21, 17, 59, 67, 6, 0, 28, 53, 4, 41, 55, 38, 63, 23, 27, 19, 36] are spinning on same osq_lock
CPU: 21 has been spinning for 1.2 seconds.
CPU: 17 has been spinning for 1.3 seconds.
CPU: 59 has been spinning for 1.3 seconds.
CPU: 67 has been spinning for 1.5 seconds.
.......................................
.......................................
CPU: 19 has been spinning for 3.004 seconds.
CPU: 36 has been spinning for 3.1 seconds.
......................................

>>> scan_osq_node(prog,2)

There are spinners on one or more osq_lock
CPU(s): [21, 17, 59, 67, 6, 0, 28, 53, 4, 41, 55, 38, 63, 23, 27, 19, 36] are spinning on same osq_lock
CPU: 21 has been spinning for 0.0 seconds.

Call stack at cpu:  21
PID: 64691    TASK: ffff8fac01282f80 [R] CPU: 21!  COMMAND: "bash"
!# 0 [ffffa447d645fb58] node_cpu at 0xffffffff960f3748 kernel/locking/osq_lock.c:27:0
!# 1 [ffffa447d645fb58] osq_lock at 0xffffffff960f3748 kernel/locking/osq_lock.c:143:0
 # 2 [ffffa447d645fb68] rwsem_optimistic_spin at 0xffffffff96899cc0 kernel/locking/rwsem-xadd.c:451:0
 # 3 [ffffa447d645fb68] __rwsem_down_write_failed_common at 0xffffffff96899cc0 kernel/locking/rwsem-xadd.c:529:0
 # 4 [ffffa447d645fb68] rwsem_down_write_failed at 0xffffffff96899cc0 kernel/locking/rwsem-xadd.c:617:0
 # 5 [ffffa447d645fbf8] call_rwsem_down_write_failed at 0xffffffff96889537 arch/x86/lib/rwsem.S:117:0
 # 6 [ffffa447d645fc40] __down_write at 0xffffffff96898bcd arch/x86/include/asm/rwsem.h:142:0
 # 7 [ffffa447d645fc40] down_write at 0xffffffff96898bcd kernel/locking/rwsem.c:72:0
 # 8 [ffffa447d645fc58] lock_anon_vma_root at 0xffffffff9623ae3c mm/rmap.c:239:0
 # 9 [ffffa447d645fc58] unlink_anon_vmas at 0xffffffff9623ae3c mm/rmap.c:390:0
 #10 [ffffa447d645fca8] free_pgtables at 0xffffffff96225231 mm/memory.c:406:0
 #11 [ffffa447d645fce8] exit_mmap at 0xffffffff96230f48 mm/mmap.c:3212:0
 #12 [ffffa447d645fd98] __mmput at 0xffffffff96097ab2 kernel/fork.c:969:0
 #13 [ffffa447d645fd98] mmput at 0xffffffff96097ab2 kernel/fork.c:990:0
 #14 [ffffa447d645fdb8] copy_process at 0xffffffff96098801 kernel/fork.c:2058:0
 #15 [ffffa447d645fea0] copy_process at 0xffffffff96099db9 kernel/fork.c:1653:0
 #16 [ffffa447d645fea0] _do_fork at 0xffffffff96099db9 kernel/fork.c:2153:0
 #17 [ffffa447d645ff20] SYSC_clone at 0xffffffff9609a149 kernel/fork.c:2265:0
 #18 [ffffa447d645ff20] SyS_clone at 0xffffffff9609a149 kernel/fork.c:2259:0
 #19 [ffffa447d645ff30] do_syscall_64 at 0xffffffff96003ca9 arch/x86/entry/common.c:298:0
 #20 [ffffa447d645ff58] entry_SYSCALL_64 at 0xffffffff96a001b1 arch/x86/entry/entry_64.S:238:0
.......................................
.......................................
CPU: 36 has been spinning for 3.1 seconds.

Call stack at cpu:  36
PID: 16329    TASK: ffff8f85db930000 [R] CPU: 36!  COMMAND: bash
 # 0 [ffffa4481243fb68] rwsem_try_write_lock_unqueued at 0xffffffff96899d00 kernel/locking/rwsem-xadd.c:364:0
 # 1 [ffffa4481243fb68] rwsem_optimistic_spin at 0xffffffff96899d00 kernel/locking/rwsem-xadd.c:465:0
 # 2 [ffffa4481243fb68] __rwsem_down_write_failed_common at 0xffffffff96899d00 kernel/locking/rwsem-xadd.c:529:0
 # 3 [ffffa4481243fb68] rwsem_down_write_failed at 0xffffffff96899d00 kernel/locking/rwsem-xadd.c:617:0
 # 4 [ffffa4481243fbf8] call_rwsem_down_write_failed at 0xffffffff96889537 arch/x86/lib/rwsem.S:117:0
 # 5 [ffffa4481243fc40] __down_write at 0xffffffff96898bcd arch/x86/include/asm/rwsem.h:142:0
 # 6 [ffffa4481243fc40] down_write at 0xffffffff96898bcd kernel/locking/rwsem.c:72:0
 # 7 [ffffa4481243fc58] lock_anon_vma_root at 0xffffffff9623ae3c mm/rmap.c:239:0
 # 8 [ffffa4481243fc58] unlink_anon_vmas at 0xffffffff9623ae3c mm/rmap.c:390:0
 # 9 [ffffa4481243fca8] free_pgtables at 0xffffffff96225231 mm/memory.c:406:0
 #10 [ffffa4481243fce8] exit_mmap at 0xffffffff96230f48 mm/mmap.c:3212:0
 #11 [ffffa4481243fd98] __mmput at 0xffffffff96097ab2 kernel/fork.c:969:0
 #12 [ffffa4481243fd98] mmput at 0xffffffff96097ab2 kernel/fork.c:990:0
 #13 [ffffa4481243fdb8] copy_process at 0xffffffff96098801 kernel/fork.c:2058:0
 #14 [ffffa4481243fea0] copy_process at 0xffffffff96099db9 kernel/fork.c:1653:0
 #15 [ffffa4481243fea0] _do_fork at 0xffffffff96099db9 kernel/fork.c:2153:0
 #16 [ffffa4481243ff20] SYSC_clone at 0xffffffff9609a149 kernel/fork.c:2265:0
 #17 [ffffa4481243ff20] SyS_clone at 0xffffffff9609a149 kernel/fork.c:2259:0
 #18 [ffffa4481243ff30] do_syscall_64 at 0xffffffff96003ca9 arch/x86/entry/common.c:298:0
 #19 [ffffa4481243ff58] entry_SYSCALL_64 at 0xffffffff96a001b1 arch/x86/entry/entry_64.S:238:0

Signed-off-by: Imran Khan <[email protected]>
imran-kn committed Mar 14, 2024
1 parent f7a9db9 commit ef14db4
Showing 1 changed file with 60 additions and 0 deletions.
60 changes: 60 additions & 0 deletions drgn_tools/lock.py
@@ -25,10 +25,14 @@
trapping these two function is sufficient to check the semaphore waiters.
"""
import argparse
from collections import defaultdict
from typing import Set

import drgn
from drgn import Object
from drgn import Program
from drgn.helpers.linux.cpumask import for_each_present_cpu
from drgn.helpers.linux.percpu import per_cpu

from drgn_tools.bt import bt
from drgn_tools.bt import bt_has_any
@@ -37,6 +41,62 @@
from drgn_tools.locking import for_each_rwsem_waiter
from drgn_tools.locking import mutex_owner
from drgn_tools.locking import show_lock_waiter
from drgn_tools.locking import tail_osq_node_to_spinners
from drgn_tools.task import get_current_run_time
from drgn_tools.task import nanosecs_to_secs


def scan_osq_node(prog: Program, verbosity=0) -> None:
"""
Show CPUs spinning to grab sleeping lock(s).
:param prog: drgn.Program
:param verbosity: specify verbosity of report as follows:
0: Show which CPUs are spinning
1: Show which CPUs are spinning and for how long
2: Show spinning CPUs, their spin duration and call stack
till the point of spin
"""

osq_spinners: defaultdict = defaultdict(list)
for cpu in for_each_present_cpu(prog):
osq_node = per_cpu(prog["osq_node"], cpu)
if not osq_node.next.value_():
continue

while osq_node.next.value_():
osq_node = Object(
prog,
"struct optimistic_spin_node",
address=osq_node.next.value_(),
)

if osq_node.address_ in osq_spinners.keys():
continue

for spinner_cpu in tail_osq_node_to_spinners(osq_node):
osq_spinners[osq_node.address_].append(spinner_cpu)

if not len(osq_spinners):
print("There are no spinners on any osq_lock")
else:
print("There are spinners on one or more osq_lock")
for key in osq_spinners.keys():
print(f"CPU(s): {osq_spinners[key]} are spinning on same osq_lock")
if verbosity >= 1:
cpu_list = osq_spinners[key]
for cpu in cpu_list:
run_time_ns = get_current_run_time(prog, cpu)
run_time_s = nanosecs_to_secs(run_time_ns)
print(
f"CPU: {cpu} has been spinning for {run_time_s} seconds."
)

if (
verbosity == 2
): # for max verbosity dump call stack of spinners
print("\nCall stack at cpu: ", cpu)
bt(prog, cpu=cpu)


def scan_mutex_lock(prog: Program, stack: bool) -> None:
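Note: tail_osq_node_to_spinners is imported from drgn_tools.locking and its
implementation is not part of this diff. Based on the kernel's
struct optimistic_spin_node (next, prev, locked, cpu, where cpu stores the
CPU number + 1; see encode_cpu()/node_cpu() in kernel/locking/osq_lock.c),
a rough, hypothetical sketch of such a walk could look like:

def tail_osq_node_to_spinners_sketch(tail_node):
    # Hypothetical sketch, not the drgn_tools.locking implementation:
    # starting from the tail node of an osq_lock queue, walk back toward
    # the head via ->prev and yield each spinner's CPU number.
    node = tail_node
    while True:
        # optimistic_spin_node.cpu holds the CPU number + 1 (encode_cpu()),
        # so subtract 1, mirroring node_cpu() in kernel/locking/osq_lock.c.
        yield node.cpu.value_() - 1
        if not node.prev.value_():
            break
        node = node.prev  # drgn dereferences the prev pointer on member access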
