From cc96b6e0d3d9f32c0e0c79aa458a6b356f9a2627 Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Tue, 17 Dec 2024 23:01:57 +0000 Subject: [PATCH] internal/crashmonitor: adjust trap frames Typically all frames in the middle of a stack trace are calls, and report the "return PC". That is, the instruction following the CALL where the callee will eventually return to. runtime.CallersFrames is aware of this property and will decrement each PC by 1 to "back up" to the location of the CALL, which is the actual line number the user expects. This does not work for traps (such as a SIGSEGV), as a trap is not a call, so the reported PC is not the return PC, but the actual PC of the trap. runtime.Callers is aware of this and will intentionally increment trap PCs in order to correct for the decrement performed by runtime.CallersFrames. See runtime.tracebackPCs and runtume.(*unwinder).symPC. We must emulate the same behavior, otherwise we will report the location of the instruction immediately prior to the trap, which may be on a different line, or even a different inlined functions. Fixes golang/go#70637. Change-Id: I7cd6ffe783a81aa91dcdca313d910341c0bb0440 Reviewed-on: https://go-review.googlesource.com/c/telemetry/+/637755 LUCI-TryBot-Result: Go LUCI Reviewed-by: Alan Donovan Reviewed-by: Cherry Mui --- internal/crashmonitor/monitor.go | 113 ++++++++++++++++--- internal/crashmonitor/monitor_test.go | 154 +++++++++++++++++++------- 2 files changed, 217 insertions(+), 50 deletions(-) diff --git a/internal/crashmonitor/monitor.go b/internal/crashmonitor/monitor.go index 612f756..f43d2cd 100644 --- a/internal/crashmonitor/monitor.go +++ b/internal/crashmonitor/monitor.go @@ -170,6 +170,35 @@ func telemetryCounterName(crash []byte) (string, error) { // there is no possibility of strings from the crash report (which may // contain PII) leaking into the telemetry system. func parseStackPCs(crash string) ([]uintptr, error) { + // getSymbol parses the symbol name out of a line of the form: + // SYMBOL(ARGS) + // + // Note: SYMBOL may contain parens "pkg.(*T).method". However, type + // parameters are always replaced with ..., so they cannot introduce + // more parens. e.g., "pkg.(*T[...]).method". + // + // ARGS can contain parens. We want the first paren that is not + // immediately preceded by a ".". + // + // TODO(prattmic): This is mildly complicated and is only used to find + // runtime.sigpanic, so perhaps simplify this by checking explicitly + // for sigpanic. + getSymbol := func(line string) (string, error) { + var prev rune + for i, c := range line { + if line[i] != '(' { + prev = c + continue + } + if prev == '.' { + prev = c + continue + } + return line[:i], nil + } + return "", fmt.Errorf("no symbol for stack frame: %s", line) + } + // getPC parses the PC out of a line of the form: // \tFILE:LINE +0xRELPC sp=... fp=... pc=... getPC := func(line string) (uint64, error) { @@ -186,6 +215,9 @@ func parseStackPCs(crash string) ([]uintptr, error) { childSentinel = sentinel() on = false // are we in the first running goroutine? lines = strings.Split(crash, "\n") + symLine = true // within a goroutine, every other line is a symbol or file/line/pc location, starting with symbol. + currSymbol string + prevSymbol string // symbol of the most recent previous frame with a PC. ) for i := 0; i < len(lines); i++ { line := lines[i] @@ -228,21 +260,76 @@ func parseStackPCs(crash string) ([]uintptr, error) { // Note: SYMBOL may contain parens "pkg.(*T).method" // The RELPC is sometimes missing. - // Skip the symbol(args) line. - i++ - if i == len(lines) { - break - } - line = lines[i] + if symLine { + var err error + currSymbol, err = getSymbol(line) + if err != nil { + return nil, fmt.Errorf("error extracting symbol: %v", err) + } - // Parse the PC, and correct for the parent and child's - // different mappings of the text section. - pc, err := getPC(line) - if err != nil { - // Inlined frame, perhaps; skip it. - continue + symLine = false // Next line is FILE:LINE. + } else { + // Parse the PC, and correct for the parent and child's + // different mappings of the text section. + pc, err := getPC(line) + if err != nil { + // Inlined frame, perhaps; skip it. + + // Done with this frame. Next line is a new frame. + // + // Don't update prevSymbol; we only want to + // track frames with a PC. + currSymbol = "" + symLine = true + continue + } + + pc = pc-parentSentinel+childSentinel + + // If the previous frame was sigpanic, then this frame + // was a trap (e.g., SIGSEGV). + // + // Typically all middle frames are calls, and report + // the "return PC". That is, the instruction following + // the CALL where the callee will eventually return to. + // + // runtime.CallersFrames is aware of this property and + // will decrement each PC by 1 to "back up" to the + // location of the CALL, which is the actual line + // number the user expects. + // + // This does not work for traps, as a trap is not a + // call, so the reported PC is not the return PC, but + // the actual PC of the trap. + // + // runtime.Callers is aware of this and will + // intentionally increment trap PCs in order to correct + // for the decrement performed by + // runtime.CallersFrames. See runtime.tracebackPCs and + // runtume.(*unwinder).symPC. + // + // We must emulate the same behavior, otherwise we will + // report the location of the instruction immediately + // prior to the trap, which may be on a different line, + // or even a different inlined functions. + // + // TODO(prattmic): The runtime applies the same trap + // behavior for other "injected calls", see injectCall + // in runtime.(*unwinder).next. Do we want to handle + // those as well? I don't believe we'd ever see + // runtime.asyncPreempt or runtime.debugCallV2 in a + // typical crash. + if prevSymbol == "runtime.sigpanic" { + pc++ + } + + pcs = append(pcs, uintptr(pc)) + + // Done with this frame. Next line is a new frame. + prevSymbol = currSymbol + currSymbol = "" + symLine = true } - pcs = append(pcs, uintptr(pc-parentSentinel+childSentinel)) } return pcs, nil } diff --git a/internal/crashmonitor/monitor_test.go b/internal/crashmonitor/monitor_test.go index 74a7572..ac2a47f 100644 --- a/internal/crashmonitor/monitor_test.go +++ b/internal/crashmonitor/monitor_test.go @@ -24,16 +24,20 @@ import ( func TestMain(m *testing.M) { entry := os.Getenv("CRASHMONITOR_TEST_ENTRYPOINT") switch entry { - case "via-stderr": + case "via-stderr.panic", "via-stderr.trap": // This mode bypasses Start and debug.SetCrashOutput; // the crash is printed to stderr. debug.SetTraceback("system") crashmonitor.WriteSentinel(os.Stderr) - child() // this line is "TestMain:+9" + if entry == "via-stderr.panic" { + childPanic() // this line is "TestMain:+10" + } else { + childTrap() // this line is "TestMain:+12" + } panic("unreachable") - case "start.panic", "start.exit": + case "start.panic", "start.trap", "start.exit": // These modes uses Start and debug.SetCrashOutput. // We stub the actual telemetry by instead writing to a file. crashmonitor.SetIncrementCounter(func(name string) { @@ -46,12 +50,18 @@ func TestMain(m *testing.M) { ReportCrashes: true, TelemetryDir: os.Getenv("CRASHMONITOR_TELEMETRY_DIR"), }) - if entry == "start.panic" { + switch entry { + case "start.panic": go func() { - child() // this line is "TestMain.func2:1" + childPanic() // this line is "TestMain.func3:+1" }() select {} // deadlocks when reached - } else { + case "start.trap": + go func() { + childTrap() // this line is "TestMain.func4:+1" + }() + select {} // deadlocks when reached + case "start.exit": os.Exit(42) } @@ -60,48 +70,94 @@ func TestMain(m *testing.M) { } } -func child() { +func childPanic() { + fmt.Println("hello") + grandchildPanic() // this line is "childPanic:+2" +} + +func grandchildPanic() { + panic("oops") // this line is "grandchildPanic:=79" (the call from child is inlined) +} + +var sinkPtr *int + +func childTrap() { fmt.Println("hello") - grandchild() // this line is "child:+2" + grandchildTrap(sinkPtr) // this line is "childTrap:+2" } -func grandchild() { - panic("oops") // this line is "grandchild:=92" (the call from child is inlined) +func grandchildTrap(i *int) { + *i = 42 // this line is "grandchildTrap:=90" (the call from childTrap is inlined) } // TestViaStderr is an internal test that asserts that the telemetry // stack generated by the panic in grandchild is correct. It uses // stderr, and does not rely on [start.Start] or [debug.SetCrashOutput]. func TestViaStderr(t *testing.T) { - _, _, stderr := runSelf(t, "via-stderr") - got, err := crashmonitor.TelemetryCounterName(stderr) - if err != nil { - t.Fatal(err) - } - got = sanitize(counter.DecodeStack(got)) - want := "crash/crash\n" + + // Standard panic. + t.Run("panic", func(t *testing.T) { + _, _, stderr := runSelf(t, "via-stderr.panic") + got, err := crashmonitor.TelemetryCounterName(stderr) + if err != nil { + t.Fatal(err) + } + got = sanitize(counter.DecodeStack(got)) + want := "crash/crash\n" + "runtime.gopanic:--\n" + - "golang.org/x/telemetry/internal/crashmonitor_test.grandchild:=69\n" + - "golang.org/x/telemetry/internal/crashmonitor_test.child:+2\n" + - "golang.org/x/telemetry/internal/crashmonitor_test.TestMain:+9\n" + + "golang.org/x/telemetry/internal/crashmonitor_test.grandchildPanic:=79\n" + + "golang.org/x/telemetry/internal/crashmonitor_test.childPanic:+2\n" + + "golang.org/x/telemetry/internal/crashmonitor_test.TestMain:+10\n" + "main.main:--\n" + "runtime.main:--\n" + "runtime.goexit:--" - if !crashmonitor.Supported() { // !go1.23 - // Traceback excludes PCs for inlined frames. Before go1.23 - // (https://go.dev/cl/571798 specifically), passing the set of - // PCs in the traceback to runtime.CallersFrames, would report - // only the innermost inlined frame and none of the inline - // "callers". - // - // Thus, here we must drop the caller of the inlined frame. - want = strings.ReplaceAll(want, "golang.org/x/telemetry/internal/crashmonitor_test.child:+2\n", "") - } + if !crashmonitor.Supported() { // !go1.23 + // Traceback excludes PCs for inlined frames. Before + // go1.23 (https://go.dev/cl/571798 specifically), + // passing the set of PCs in the traceback to + // runtime.CallersFrames, would report only the + // innermost inlined frame and none of the inline + // "callers". + // + // Thus, here we must drop the caller of the inlined + // frame. + want = strings.ReplaceAll(want, "golang.org/x/telemetry/internal/crashmonitor_test.childPanic:+2\n", "") + } - if got != want { - t.Errorf("got counter name <<%s>>, want <<%s>>", got, want) - } + if got != want { + t.Errorf("got counter name <<%s>>, want <<%s>>", got, want) + } + }) + + // Panic via trap. + t.Run("trap", func(t *testing.T) { + _, _, stderr := runSelf(t, "via-stderr.trap") + got, err := crashmonitor.TelemetryCounterName(stderr) + if err != nil { + t.Fatal(err) + } + got = sanitize(counter.DecodeStack(got)) + want := "crash/crash\n" + + "runtime.gopanic:--\n" + + "runtime.panicmem:--\n" + + "runtime.sigpanic:--\n" + + "golang.org/x/telemetry/internal/crashmonitor_test.grandchildTrap:=90\n" + + "golang.org/x/telemetry/internal/crashmonitor_test.childTrap:+2\n" + + "golang.org/x/telemetry/internal/crashmonitor_test.TestMain:+12\n" + + "main.main:--\n" + + "runtime.main:--\n" + + "runtime.goexit:--" + + if !crashmonitor.Supported() { // !go1.23 + // See above. + want = strings.ReplaceAll(want, "runtime.sigpanic:--\n", "") + want = strings.ReplaceAll(want, "golang.org/x/telemetry/internal/crashmonitor_test.childTrap:+2\n", "") + } + + if got != want { + t.Errorf("got counter name <<%s>>, want <<%s>>", got, want) + } + }) } func waitForExitFile(t *testing.T, exitFile string) { @@ -144,8 +200,8 @@ func TestStart(t *testing.T) { } }) - // Assert that the crash monitor increments a telemetry - // counter of the correct name when the child process panics. + // Assert that the crash monitor increments a telemetry counter of the + // correct name when the child process panics. t.Run("panic", func(t *testing.T) { // Gather a stack trace from executing the panic statement above. telemetryFile, exitFile, _ := runSelf(t, "start.panic") @@ -157,14 +213,38 @@ func TestStart(t *testing.T) { got := sanitize(counter.DecodeStack(string(data))) want := "crash/crash\n" + "runtime.gopanic:--\n" + - "golang.org/x/telemetry/internal/crashmonitor_test.grandchild:=69\n" + - "golang.org/x/telemetry/internal/crashmonitor_test.child:+2\n" + + "golang.org/x/telemetry/internal/crashmonitor_test.grandchildPanic:=79\n" + + "golang.org/x/telemetry/internal/crashmonitor_test.childPanic:+2\n" + "golang.org/x/telemetry/internal/crashmonitor_test.TestMain.func3:+1\n" + "runtime.goexit:--" if got != want { t.Errorf("got counter name <<%s>>, want <<%s>>", got, want) } }) + + // Assert that the crash monitor increments a telemetry counter of the + // correct name when the child process panics via trap. + t.Run("trap", func(t *testing.T) { + // Gather a stack trace from executing the panic statement above. + telemetryFile, exitFile, _ := runSelf(t, "start.trap") + waitForExitFile(t, exitFile) + data, err := os.ReadFile(telemetryFile) + if err != nil { + t.Fatalf("failed to read file: %v", err) + } + got := sanitize(counter.DecodeStack(string(data))) + want := "crash/crash\n" + + "runtime.gopanic:--\n" + + "runtime.panicmem:--\n" + + "runtime.sigpanic:--\n" + + "golang.org/x/telemetry/internal/crashmonitor_test.grandchildTrap:=90\n" + + "golang.org/x/telemetry/internal/crashmonitor_test.childTrap:+2\n" + + "golang.org/x/telemetry/internal/crashmonitor_test.TestMain.func4:+1\n" + + "runtime.goexit:--" + if got != want { + t.Errorf("got counter name <<%s>>, want <<%s>>", got, want) + } + }) } // runSelf fork+exec's this test executable using an alternate entry point.