Skip to content

Commit

Permalink
[fix](brpc) coredump caused by brpc checking (#44047) (#44188)
Browse files Browse the repository at this point in the history
pick #44047
```
/root/doris/be/src/runtime/fragment_mgr.cpp:1064:20: runtime error: member call on null pointer of type 'doris::PBackendService_Stub'

SUMMARY: UndefinedBehaviorSanitizer: undefined-behavior /root/doris/be/src/runtime/fragment_mgr.cpp:1064:20 in
*** Query id: 0-0 ***
*** is nereids: 0 ***
*** tablet id: 0 ***
*** Aborted at 1731663847 (unix time) try "date -d @1731663847" if you are using GNU date ***
*** Current BE git commitID: b663df0e50 ***
*** SIGSEGV address not mapped to object (@0x0) received by PID 17169 (TID 17463 OR 0x7f746d21a700) from PID 0; stack trace: ***
0# doris::signal::(anonymous namespace)::FailureSignalHandler(int, siginfo_t*, void*) at /root/doris/be/src/common/signal_handler.h:421
1# PosixSignals::chained_handler(int, siginfo_t*, void*) [clone .part.0] in /usr/lib/jvm/java-17-openjdk-amd64/lib/server/libjvm.so
2# JVM_handle_linux_signal in /usr/lib/jvm/java-17-openjdk-amd64/lib/server/libjvm.so
3# 0x00007F7601263090 in /lib/x86_64-linux-gnu/libc.so.6
4# doris::FragmentMgr::_check_brpc_available(std::shared_ptr<doris::PBackendService_Stub> const&, doris::FragmentMgr::BrpcItem const&) in /mnt/ssd01/pipline/OpenSourceDoris/clusterEnv/P0/Cluster0/be/lib/doris_be
5# doris::FragmentMgr::cancel_worker() at /root/doris/be/src/runtime/fragment_mgr.cpp:1022
6# doris::Thread::supervise_thread(void*) at /root/doris/be/src/util/thread.cpp:499
7# start_thread at /build/glibc-SzIz7B/glibc-2.31/nptl/pthread_create.c:478
8# __clone at ../sysdeps/unix/sysv/linux/x86_64/clone.S:97
```

### What problem does this PR solve?

Issue Number: close #xxx

Related PR: #xxx

Problem Summary:

### Release note

None

### Check List (For Author)

- Test <!-- At least one of them must be included. -->
    - [ ] Regression test
    - [ ] Unit Test
    - [ ] Manual test (add detailed scripts or steps below)
    - [ ] No need to test or manual test. Explain why:
- [ ] This is a refactor/code format and no logic has been changed.
        - [ ] Previous test can cover this change.
        - [ ] No code files have been changed.
        - [ ] Other reason <!-- Add your reason?  -->

- Behavior changed:
    - [ ] No.
    - [ ] Yes. <!-- Explain the behavior change -->

- Does this need documentation?
    - [ ] No.
- [ ] Yes. <!-- Add document PR link here. eg:
apache/doris-website#1214 -->

### Check List (For Reviewer who merge this PR)

- [ ] Confirm the release note
- [ ] Confirm test cases
- [ ] Confirm document
- [ ] Add branch pick label <!-- Add branch pick label that this PR
should merge into -->
  • Loading branch information
mrhhsg authored Nov 19, 2024
1 parent b4e136b commit eeafe45
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 7 deletions.
9 changes: 7 additions & 2 deletions be/src/runtime/fragment_mgr.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1447,8 +1447,13 @@ void FragmentMgr::cancel_worker() {
std::string("Coordinator dead."));
}

for (auto it : brpc_stub_with_queries) {
_check_brpc_available(it.first, it.second);
if (config::enable_brpc_connection_check) {
for (auto it : brpc_stub_with_queries) {
if (!it.first) {
continue;
}
_check_brpc_available(it.first, it.second);
}
}
} while (!_stop_background_threads_latch.wait_for(
std::chrono::seconds(config::fragment_mgr_cancel_worker_interval_seconds)));
Expand Down
19 changes: 14 additions & 5 deletions be/src/vec/sink/vdata_stream_sender.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -90,9 +90,13 @@ Status Channel<Parent>::init(RuntimeState* state) {
_fragment_instance_id, _dest_node_id, &_local_recvr),
"");
} else {
auto network_address = _brpc_dest_addr;
if (_brpc_dest_addr.hostname == BackendOptions::get_localhost()) {
_brpc_stub = state->exec_env()->brpc_internal_client_cache()->get_client(
"127.0.0.1", _brpc_dest_addr.port);
if (config::enable_brpc_connection_check) {
network_address.hostname = "127.0.0.1";
}
} else {
_brpc_stub =
state->exec_env()->brpc_internal_client_cache()->get_client(_brpc_dest_addr);
Expand All @@ -104,6 +108,10 @@ Status Channel<Parent>::init(RuntimeState* state) {
LOG(WARNING) << msg;
return Status::InternalError(msg);
}

if (config::enable_brpc_connection_check) {
state->get_query_ctx()->add_using_brpc_stub(network_address, _brpc_stub);
}
}

_serializer.set_is_local(_is_local);
Expand All @@ -129,19 +137,16 @@ Status Channel<Parent>::init_stub(RuntimeState* state) {
if (_is_local) {
return Status::OK();
}

auto network_address = _brpc_dest_addr;
if (_brpc_dest_addr.hostname == BackendOptions::get_localhost()) {
_brpc_stub = state->exec_env()->brpc_internal_client_cache()->get_client(
"127.0.0.1", _brpc_dest_addr.port);
if (config::enable_brpc_connection_check) {
auto network_address = _brpc_dest_addr;
network_address.hostname = "127.0.0.1";
state->get_query_ctx()->add_using_brpc_stub(network_address, _brpc_stub);
}
} else {
_brpc_stub = state->exec_env()->brpc_internal_client_cache()->get_client(_brpc_dest_addr);
if (config::enable_brpc_connection_check) {
state->get_query_ctx()->add_using_brpc_stub(_brpc_dest_addr, _brpc_stub);
}
}

if (!_brpc_stub) {
Expand All @@ -150,6 +155,10 @@ Status Channel<Parent>::init_stub(RuntimeState* state) {
LOG(WARNING) << msg;
return Status::InternalError(msg);
}

if (config::enable_brpc_connection_check) {
state->get_query_ctx()->add_using_brpc_stub(network_address, _brpc_stub);
}
return Status::OK();
}

Expand Down

0 comments on commit eeafe45

Please sign in to comment.