Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CORE-8687 Track vassert messages for crash log #25051

Open
wants to merge 10 commits into
base: dev
Choose a base branch
from
16 changes: 15 additions & 1 deletion src/v/base/tests/BUILD
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
load("//bazel:test.bzl", "redpanda_cc_btest", "redpanda_cc_btest_no_seastar")
load("//bazel:test.bzl", "redpanda_cc_btest", "redpanda_cc_btest_no_seastar", "redpanda_cc_gtest")

redpanda_cc_btest_no_seastar(
name = "outcome_test",
Expand All @@ -23,3 +23,17 @@ redpanda_cc_btest(
"@seastar//:testing",
],
)

redpanda_cc_gtest(
name = "assert_log_holder_test",
timeout = "short",
srcs = [
"assert_log_holder_test.cc",
],
deps = [
"//src/v/base",
"//src/v/test_utils:gtest",
"@googletest//:gtest",
"@seastar",
],
)
7 changes: 7 additions & 0 deletions src/v/base/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,10 @@ rp_test(
BINARY_NAME outcome_utils_test
SOURCES outcome_utils_test.cc
LIBRARIES v::seastar_testing_main v::base)

rp_test(
UNIT_TEST
GTEST
BINARY_NAME assert_log_holder_test
SOURCES assert_log_holder_test.cc
LIBRARIES v::base v::gtest_main)
47 changes: 47 additions & 0 deletions src/v/base/tests/assert_log_holder_test.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
/*
* Copyright 2025 Redpanda Data, Inc.
*
* Use of this software is governed by the Business Source License
* included in the file licenses/BSL.md
*
* As of the Change Date specified in that file, in accordance with
* the Business Source License, use of this software will be governed
* by the Apache License, Version 2.0
*/

#include "base/vassert.h"

#include <seastar/util/backtrace.hh>

#include <gtest/gtest.h>

struct AssertLogHolderTest : public testing::Test {};

TEST_F(AssertLogHolderTest, ValidateAssertLogHolder) {
static ss::sstring message;
static ss::sstring unused_message;
const auto cb = [](std::string_view msg) { message = ss::sstring{msg}; };
const auto unused_cb = [](std::string_view msg) {
unused_message = ss::sstring{msg};
};

auto bt = ss::current_backtrace();
detail::g_assert_log_holder.register_event(
bt, "This is a test event: {}", "test");
EXPECT_TRUE(message.empty());
detail::g_assert_log_holder.register_cb(cb);
detail::g_assert_log_holder.register_event(
bt, "This is a test event: {}", "test");

EXPECT_EQ(message, fmt::format("This is a test event: test"));
EXPECT_TRUE(unused_message.empty());

// Verify that a second call to `register_cb` does not replace the current
// callback
detail::g_assert_log_holder.register_cb(unused_cb);
detail::g_assert_log_holder.register_event(
bt, "This is a second test event: {}", "test");

EXPECT_EQ(message, fmt::format("This is a second test event: test"));
EXPECT_TRUE(unused_message.empty());
}
9 changes: 5 additions & 4 deletions src/v/base/unreachable.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,10 @@
#define unreachable() \
/* NOLINTNEXTLINE(cppcoreguidelines-avoid-do-while) */ \
do { \
::detail::g_assert_log.l.error( \
"This code should not be reached ({}:{})", __FILE__, __LINE__); \
::detail::g_assert_log.l.error( \
"Backtrace below:\n{}", seastar::current_backtrace()); \
::detail::g_assert_log_holder.register_event( \
ss::current_backtrace(), \
"This code should not be reached ({}:{})", \
__FILE__, \
__LINE__); \
__builtin_trap(); \
} while (0)
56 changes: 53 additions & 3 deletions src/v/base/vassert.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,63 @@

#include <seastar/util/backtrace.hh>
#include <seastar/util/log.hh>
#include <seastar/util/noncopyable_function.hh>
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit:

Suggested change
#include <seastar/util/noncopyable_function.hh>

(I can carry this over to a follow up PR.)


#include <atomic>

namespace detail {
struct dummyassert {
static inline ss::logger l{"assert"};
};
inline dummyassert g_assert_log;
/**
* @brief Class used to format assert messages
*
* This class will format the provided assert message and produce a backtrace
* caused by an assert. It also provides a means of registering a callback
* function that will be called when an assert is triggered.
*/
class assert_log_holder {
public:
// We want to enforce using a non-capturing function for callbacks to ensure
// a static lifetime for the callback as we will not permit unregistering
// a callback to prevent a race condition between unregistering the callback
// on one thread and having the callback called by a different thread.
using assert_cb_func = void (*)(std::string_view);
/**
* @brief Registers a vassert event
*
* @tparam Args The argument template
* @param bt The backtrace from the assert
* @param fmt The format string for the log
* @param args The arguments to @p fmt
*/
template<typename... Args>
void register_event(
ss::saved_backtrace bt,
ss::logger::format_info_t<Args...> fmt,
Args&&... args) noexcept {
auto buffer = fmt::format(
fmt::runtime(fmt.format), std::forward<Args>(args)...);

g_assert_log.l.error("{}", buffer);
g_assert_log.l.error("Backtrace:\n{}", bt);

auto cb_func = _cb_func.load();
if (cb_func != nullptr) {
cb_func(buffer);
}
}

void register_cb(assert_cb_func cb) {
assert_cb_func before = nullptr;
_cb_func.compare_exchange_strong(before, cb);
Comment on lines +68 to +69
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

do you want first writer wins, or last writer wins?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If by "writer" you mean whoever calls register_cb, I'd say probably first... but in practice there's only one spot where this is going to be called (from recorder.cc)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

in that case you could make a stronger guarantee and assert the pointer is null when its set.

}

private:
std::atomic<assert_cb_func> _cb_func{nullptr};
};
inline assert_log_holder g_assert_log_holder;
} // namespace detail

/** Meant to be used in the same way as assert(condition, msg);
Expand All @@ -39,14 +90,13 @@ inline dummyassert g_assert_log;
do { \
/*The !(x) is not an error. see description above*/ \
if (unlikely(!(x))) { \
::detail::g_assert_log.l.error( \
::detail::g_assert_log_holder.register_event( \
ss::current_backtrace(), \
"Assert failure: ({}:{}) '{}' " msg, \
__FILE__, \
__LINE__, \
#x, \
##args); \
::detail::g_assert_log.l.error( \
"Backtrace below:\n{}", ss::current_backtrace()); \
__builtin_trap(); \
} \
} while (0)
19 changes: 19 additions & 0 deletions src/v/crash_tracker/recorder.cc
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

#include "crash_tracker/recorder.h"

#include "base/vassert.h"
#include "config/node_config.h"
#include "crash_tracker/logger.h"
#include "crash_tracker/types.h"
Expand Down Expand Up @@ -99,6 +100,8 @@ ss::future<> recorder::start() {
co_await ensure_crashdir_exists();
co_await remove_old_crashfiles();
co_await _writer.initialize(co_await generate_crashfile_name());
::detail::g_assert_log_holder.register_cb(
[](std::string_view msg) { get_recorder().record_crash_vassert(msg); });
}

namespace {
Expand Down Expand Up @@ -212,6 +215,22 @@ void recorder::record_crash_exception(std::exception_ptr eptr) {
_writer.write();
}

void recorder::record_crash_vassert(std::string_view msg) {
auto* cd_opt = _writer.fill();
if (!cd_opt) {
// The writer has already been consumed by another crash
print_skipping();
return;
}
auto& cd = *cd_opt;

record_backtrace(cd);
cd.type = crash_type::assertion;
record_message(cd, msg);

_writer.write();
}

std::chrono::system_clock::time_point
recorder::recorded_crash::timestamp() const {
auto recorded_timestamp_opt = crash
Expand Down
2 changes: 2 additions & 0 deletions src/v/crash_tracker/recorder.h
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,8 @@ class recorder {

void record_crash_exception(std::exception_ptr eptr);

void record_crash_vassert(std::string_view msg);

/// Returns the list of recorded crashes in increasing crash_time order
ss::future<std::vector<recorded_crash>> get_recorded_crashes(
include_malformed_files incl_malformed
Expand Down
3 changes: 2 additions & 1 deletion src/v/crash_tracker/types.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,8 @@ enum class crash_type {
startup_exception,
segfault,
abort,
illegal_instruction
illegal_instruction,
assertion
};

/// reserved_string is a simple wrapper around a std::array that allows
Expand Down
57 changes: 55 additions & 2 deletions src/v/redpanda/admin/api-doc/debug.json
Original file line number Diff line number Diff line change
Expand Up @@ -678,6 +678,49 @@
]
}
]
},
{
"path": "/v1/debug/ctracker/va/{shard}",
"operations": [
{
"method": "PUT",
"summary": "Simulate va message on specific shard",
"nickname": "put_ctracker_va",
"parameters": [
{
"name": "shard",
"in": "path",
"required": true,
"type": "integer"
},
{
"name": "message",
"in": "body",
"required": true,
"schema": {
"$ref": "#/models/ctracker_va_value"
}
}
],
"responses": {
"200": {
"description": "Ok"
},
"400": {
"description": "Bad request"
},
"401": {
"description": "Unauthorized"
},
"403": {
"description": "Forbidden"
},
"404": {
"description": "Not found"
}
}
}
]
}
],
"models": {
Expand Down Expand Up @@ -1237,7 +1280,7 @@
"id": "partition_producer_state",
"description": "Debug state of a single producer producing to a partition ",
"properties": {
"id" : {
"id": {
"type": "long",
"description": "ID of the producer, assigned by Redpanda."
},
Expand Down Expand Up @@ -1425,6 +1468,16 @@
"description": "Node unique identifier. UUID is generated when Redpanda starts with empty data folder"
}
}
},
"ctracker_va_value": {
"id": "ctracker_va_value",
"description": "Message to store on the shard",
"properties": {
"message": {
"type": "string",
"description": "The message to store"
}
}
}
}
}
}
53 changes: 53 additions & 0 deletions src/v/redpanda/admin/debug.cc
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
* the Business Source License, use of this software will be governed
* by the Apache License, Version 2.0
*/
#include "base/vassert.h"
#include "cloud_storage/cache_service.h"
#include "cluster/cloud_storage_size_reducer.h"
#include "cluster/controller.h"
Expand All @@ -32,7 +33,9 @@
#include "storage/kvstore.h"

#include <seastar/core/shard_id.hh>
#include <seastar/core/smp.hh>
#include <seastar/core/sstring.hh>
#include <seastar/http/exception.hh>
#include <seastar/http/httpd.hh>
#include <seastar/json/json_elements.hh>

Expand Down Expand Up @@ -498,6 +501,16 @@ void admin_server::register_debug_routes() {
-> ss::future<ss::json::json_return_type> {
return get_node_uuid_handler();
});

if constexpr (admin_server::is_store_message_enabled()) {
register_route_raw_async<superuser>(
ss::httpd::debug_json::put_ctracker_va,
[this](
std::unique_ptr<ss::http::request> req,
std::unique_ptr<ss::http::reply> res) {
return put_ctracker_va(std::move(req), std::move(res));
});
}
}

using admin::apply_validator;
Expand Down Expand Up @@ -1029,3 +1042,43 @@ ss::future<ss::json::json_return_type> admin_server::override_node_uuid_handler(

co_return ss::json::json_return_type(ss::json::json_void());
}

ss::future<std::unique_ptr<ss::http::reply>> admin_server::put_ctracker_va(
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: should it go into the finjector url namespace?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Unless i'm completely blind I don't see a finjector namespace. There is failure-probes but that's all honey badger related, including the root /v1/failure-probes endpoint.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

isn't this a failure probe?

std::unique_ptr<ss::http::request> req,
std::unique_ptr<ss::http::reply> res) {
ss::shard_id shard = 0;
try {
shard = std::stoi(req->get_path_param("shard"));
} catch (...) {
throw ss::httpd::bad_param_exception(
fmt::format("Invalid shard id: {}", req->get_path_param("shard")));
}

if (shard >= ss::smp::count) {
throw ss::httpd::bad_param_exception(
fmt::format("Invalid shard id: {}", shard));
}

auto doc = co_await parse_json_body(req.get());
if (!doc.IsObject()) {
throw ss::httpd::bad_request_exception(
"Request body must be a JSON object");
}
if (!doc.HasMember("message")) {
throw ss::httpd::bad_request_exception(
fmt::format("'message' missing"));
}
if (!doc["message"].IsString()) {
throw ss::httpd::bad_request_exception(
fmt::format("'message' must be a string"));
}
auto msg = ss::sstring{doc["message"].GetString()};

co_await ss::smp::submit_to(shard, [msg = std::move(msg)] {
::detail::g_assert_log_holder.register_event(
ss::current_backtrace(), "{}", msg);
});

res->set_status(ss::http::reply::status_type::ok);
co_return std::move(res);
}
Loading