Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Hitting [starpu][_starpu_fetch_data_on_node][assert failure] #56

Open
Muxas opened this issue Sep 5, 2024 · 2 comments
Open

Hitting [starpu][_starpu_fetch_data_on_node][assert failure] #56

Muxas opened this issue Sep 5, 2024 · 2 comments

Comments

@Muxas
Copy link

Muxas commented Sep 5, 2024

Hello, StarPU team!

We are developing an asynchronous inference engine based on StarPU. The main thread of our Python program creates new data handles, inserts tasks that use these handles, and then calls starpu_data_unregister_submit() on all the new handles. For some reason, we hit the following assertion failure:

[starpu][_starpu_fetch_data_on_node][assert failure] Could not find a valid copy of the data, and no handle initialization function

Backtrace is here:

(gdb) bt full
#0  0x00007ffff7cb6720 in abort () from /usr/lib/x86_64-linux-gnu/libc.so.6
No symbol table info available.
#1  0x00007ffc17fd67b6 in _starpu_fetch_data_on_node (handle=0x555564536130, node=0, dst_replicate=0x555564536240, mode=STARPU_R, detached=1, 
    task=task@entry=0x0, is_prefetch=STARPU_IDLEFETCH, async=1, callback_func=0x0, callback_arg=0x0, prio=0, 
    origin=0x7ffc180600c8 "_starpu_data_acquire_launch_fetch") at ../../src/datawizard/coherency.c:859
        src_node_mask = <optimized out>
        nnodes = <optimized out>
        n = <optimized out>
        __func__ = "_starpu_fetch_data_on_node"
        r = <optimized out>
        ret = <optimized out>
#2  0x00007ffc17ff0316 in _starpu_data_acquire_launch_fetch (callback_arg=0x0, callback=0x0, async=1, wrapper=0x555573b51710)
    at ../../src/datawizard/user_interactions.c:130
        node = <optimized out>
        handle = <optimized out>
        replicate = <optimized out>
        ret = <optimized out>
        node = <optimized out>
        handle = <optimized out>
        replicate = <optimized out>
        ret = <optimized out>
        __ptrs = <optimized out>
        __n = <optimized out>
#3  __starpu_prefetch_data_on_node_with_mode (handle=0x555564536130, node=<optimized out>, async=1, prefetch=<optimized out>, prio=0, mode=STARPU_R)
    at ../../src/datawizard/user_interactions.c:593
        replicate = 0x555564536240
        wrapper = 0x555573b51710
        __func__ = "__starpu_prefetch_data_on_node_with_mode"
#4  0x00007ffc17fef7c5 in _starpu_data_acquire_fetch_data_callback (arg=0x555564280380) at ../../src/datawizard/user_interactions.c:154
        wrapper = 0x555564280380
        handle = <optimized out>
#5  0x00007ffc17fd6075 in _starpu_create_request_to_fetch_data (handle=handle@entry=0x555564536130, dst_replicate=dst_replicate@entry=0x0, 
    mode=mode@entry=STARPU_R, task=task@entry=0x0, is_prefetch=<optimized out>, async=async@entry=1, 
    callback_func=0x7ffc17fef7a0 <_starpu_data_acquire_fetch_data_callback>, callback_arg=0x555564280380, prio=0, 
    origin=0x7ffc180600c8 "_starpu_data_acquire_launch_fetch") at ../../src/datawizard/coherency.c:627
        requesting_node = <optimized out>
        nwait = <optimized out>
        src_node = <optimized out>
        src_nodes = {2116479353, 1101487691, 2168012191, 1101487691}
        dst_nodes = {25152400, 32764, 0, 0}
        handling_nodes = {5, 4, 25152432, 32764}
        write_invalidation = <optimized out>
        nhops = <optimized out>
        requests = <optimized out>
        reused_requests = <optimized out>
        hop = <optimized out>
#6  0x00007ffc17fd6549 in _starpu_fetch_data_on_node (handle=0x555564536130, node=-2, dst_replicate=0x0, mode=STARPU_R, detached=0, task=task@entry=0x0, 
    is_prefetch=STARPU_FETCH, async=1, callback_func=0x7ffc17fef7a0 <_starpu_data_acquire_fetch_data_callback>, callback_arg=0x555564280380, prio=0, 
    origin=0x7ffc180600c8 "_starpu_data_acquire_launch_fetch") at ../../src/datawizard/coherency.c:880
        __func__ = "_starpu_fetch_data_on_node"
        r = <optimized out>
        ret = <optimized out>
#7  0x00007ffc17feffe4 in _starpu_data_acquire_launch_fetch (callback_arg=0x555564280380, 
    callback=0x7ffc17fef7a0 <_starpu_data_acquire_fetch_data_callback>, async=1, wrapper=0x555564280380) at ../../src/datawizard/user_interactions.c:130
        node = <optimized out>
        handle = <optimized out>
        replicate = <optimized out>
        ret = <optimized out>
--Type <RET> for more, q to quit, c to continue without paging--
        __ptrs = <optimized out>
        __n = <optimized out>
#8  _starpu_data_acquire_continuation_non_blocking (arg=0x555564280380) at ../../src/datawizard/user_interactions.c:169
        wrapper = 0x555564280380
#9  starpu_data_acquire_cb_pre_sync_callback (arg=0x555564280380) at ../../src/datawizard/user_interactions.c:187
        wrapper = 0x555564280380
#10 0x00007ffc17f63de2 in _starpu_handle_job_termination (j=j@entry=0x555564042910) at ../../src/core/jobs.c:573
        time = 0x0
        profiling = 0
        old_status = <optimized out>
        current_task = 0x0
        __func__ = "_starpu_handle_job_termination"
        task = 0x55556423edc0
        end_rdep = <optimized out>
        sched_ctx = 0
        flops = 0
        continuation = 0
        nowhere = <optimized out>
        callback = 0x7ffc17feff20 <starpu_data_acquire_cb_pre_sync_callback>
        destroy = <optimized out>
        detach = <optimized out>
        regenerate = <optimized out>
        synchronous = <optimized out>
        worker = <optimized out>
#11 0x00007ffc17fa7232 in _starpu_repush_task (j=0x555564042910) at ../../src/core/sched_policy.c:648
        task = 0x55556423edc0
        sched_ctx = <optimized out>
        ret = <optimized out>
        can_push = <optimized out>
        continuation = 0
#12 0x00007ffc17f8038a in _starpu_notify_cg_list (pred=pred@entry=0x5555644ca550, successors=successors@entry=0x5555644ca760)
    at ../../src/core/dependencies/cg.c:377
        cg = 0x5555644c66e0
        cg_type = <optimized out>
        succ = <optimized out>
#13 0x00007ffc17f8664c in _starpu_notify_task_dependencies (j=j@entry=0x5555644ca550) at ../../src/core/dependencies/task_deps.c:66
No locals.
#14 0x00007ffc17f80957 in _starpu_notify_dependencies (j=j@entry=0x5555644ca550) at ../../src/core/dependencies/dependencies.c:32
No locals.
#15 0x00007ffc17f63d67 in _starpu_handle_job_termination (j=j@entry=0x5555644ca550) at ../../src/core/jobs.c:542
        __func__ = "_starpu_handle_job_termination"
        task = 0x5555641c5f70
        end_rdep = 0x0
        sched_ctx = 0
        flops = 1544
        continuation = 0
        nowhere = <optimized out>
        callback = 0x0
        destroy = <optimized out>
        detach = <optimized out>
        regenerate = <optimized out>
        synchronous = <optimized out>
        worker = <optimized out>
#16 0x00007ffc18040b8c in _starpu_cpu_driver_execute_task (cpu_worker=cpu_worker@entry=0x7ffc180a1100 <_starpu_config+8384>, 
    task=task@entry=0x5555641c5f70, j=j@entry=0x5555644ca550) at ../../src/drivers/cpu/driver_cpu.c:558
        res = 0
        rank = 0
        is_parallel_task = 0
--Type <RET> for more, q to quit, c to continue without paging--
        perf_arch = 0x7ffc180a1148 <_starpu_config+8456>
        __func__ = "_starpu_cpu_driver_execute_task"
        __ptrs = <optimized out>
        __n = <optimized out>
#17 0x00007ffc18041774 in _starpu_cpu_driver_run_once (cpu_worker=cpu_worker@entry=0x7ffc180a1100 <_starpu_config+8384>)
    at ../../src/drivers/cpu/driver_cpu.c:596
        ret = <optimized out>
        memnode = 0
        workerid = 1
        pi = {conf = 0x0, event_type = starpu_prof_tool_event_end_transfer, starpu_version = {1, 4, 7}, thread_id = 25159232, worker_id = 1, 
          device_number = 1, driver_type = starpu_prof_tool_driver_cpu, memnode = 0, bytes_to_transfer = 2, bytes_transfered = 2, fun_ptr = 0x0}
        res = <optimized out>
        j = 0x5555644ca550
        task = 0x0
        pending_task = 0x5555641c5f70
        rank = 0
        __func__ = "_starpu_cpu_driver_run_once"
        continuation_wake_up = <optimized out>
#18 0x00007ffc18041c0e in _starpu_cpu_worker (arg=0x7ffc180a1100 <_starpu_config+8384>) at ../../src/drivers/cpu/driver_cpu.c:714
        worker = 0x7ffc180a1100 <_starpu_config+8384>
        pi = {conf = 0x0, event_type = starpu_prof_tool_event_start_transfer, starpu_version = {1, 4, 7}, thread_id = 25159232, worker_id = 1, 
          device_number = 1, driver_type = starpu_prof_tool_driver_cpu, memnode = 0, bytes_to_transfer = 0, bytes_transfered = 0, fun_ptr = 0x0}
#19 0x00007ffff7d22ac3 in ?? () from /usr/lib/x86_64-linux-gnu/libc.so.6
No symbol table info available.
#20 0x00007ffff7db4850 in ?? () from /usr/lib/x86_64-linux-gnu/libc.so.6
No symbol table info available.

Is it even all right that _starpu_repush_task() gets called here? It sounds like a task has failed and has to be pushed once again.

Configuration

../configure --disable-build-doc --disable-build-examples --disable-build-tests --disable-mpi --disable-fortran --disable-opencl --disable-socl --disable-starpufft --disable-starpupy --enable-blas-lib=none --prefix=/home/jovyan/mikhalev/install/starpu-1.4.7 --with-fxt=/home/jovyan/mikhalev/install/fxt-0.3.14/lib/pkgconfig --enable-maxcudadev=8 --disable-openmp --disable-parallel-worker

Configuration result

https://gist.github.com/Muxas/ac5344287d3dc3473aaabcd8aabbff3b

Version of StarPU

We rely on StarPU-1.4 from GitHub, commit 1f158eb.

Version of GPU drivers

CUDA drivers are 450.172.01

@Muxas
Copy link
Author

Muxas commented Sep 5, 2024

Update: here is the backtrace with StarPU built with the --enable-debug option:

Thread 100 "CPU 1" received signal SIGABRT, Aborted.
[Switching to Thread 0x7ffbf5fff640 (LWP 102796)]
0x00007ffff7d249fc in pthread_kill () from /usr/lib/x86_64-linux-gnu/libc.so.6
(gdb) bt full
#0  0x00007ffff7d249fc in pthread_kill () from /usr/lib/x86_64-linux-gnu/libc.so.6
No symbol table info available.
#1  0x00007ffff7cd0476 in raise () from /usr/lib/x86_64-linux-gnu/libc.so.6
No symbol table info available.
#2  0x00007ffff7cb67f3 in abort () from /usr/lib/x86_64-linux-gnu/libc.so.6
No symbol table info available.
#3  0x00007ffc17e73ade in _starpu_fetch_data_on_node (handle=0x5555627a5aa0, node=0, dst_replicate=0x5555627a5bc8, mode=STARPU_R, detached=1, task=0x0, 
    is_prefetch=STARPU_IDLEFETCH, async=1, callback_func=0x0, callback_arg=0x0, prio=0, origin=0x7ffc17f50b40 "_starpu_data_acquire_launch_fetch")
    at ../../src/datawizard/coherency.c:859
        src_node_mask = 0
        nnodes = 2
        n = 2
        __func__ = "_starpu_fetch_data_on_node"
        r = 0x5555627a5aa0
        ret = 32764
#4  0x00007ffc17e98ef6 in _starpu_data_acquire_launch_fetch (wrapper=0x555573c99a10, async=1, callback=0x0, callback_arg=0x0)
    at ../../src/datawizard/user_interactions.c:130
        node = 0
        handle = 0x5555627a5aa0
        replicate = 0x5555627a5bc8
        ret = 1
#5  0x00007ffc17e9b3f6 in __starpu_prefetch_data_on_node_with_mode (handle=0x5555627a5aa0, node=0, async=1, mode=STARPU_R, prefetch=STARPU_IDLEFETCH, 
    prio=0) at ../../src/datawizard/user_interactions.c:593
        replicate = 0x5555627a5bc8
        __func__ = "__starpu_prefetch_data_on_node_with_mode"
        wrapper = 0x555573c99a10
#6  0x00007ffc17e9b9cc in _starpu_data_wont_use (data=0x5555627a5aa0) at ../../src/datawizard/user_interactions.c:692
        node = 16
        handle = 0x5555627a5aa0
        __func__ = "_starpu_data_wont_use"
#7  0x00007ffc17e98fd5 in _starpu_data_acquire_fetch_data_callback (arg=0x555565e4a660) at ../../src/datawizard/user_interactions.c:154
        wrapper = 0x555565e4a660
        handle = 0x5555627a5aa0
#8  0x00007ffc17e72d90 in _starpu_create_request_to_fetch_data (handle=0x5555627a5aa0, dst_replicate=0x0, mode=STARPU_R, task=0x0, 
    is_prefetch=STARPU_FETCH, async=1, callback_func=0x7ffc17e98f62 <_starpu_data_acquire_fetch_data_callback>, callback_arg=0x555565e4a660, prio=0, 
    origin=0x7ffc17f50b40 "_starpu_data_acquire_launch_fetch") at ../../src/datawizard/coherency.c:627
        requesting_node = -1
        nwait = 0
        __func__ = "_starpu_create_request_to_fetch_data"
        src_node = 0
        src_nodes = {401862296, 32764, 1737391320, 21845}
        dst_nodes = {0, 0, 2, 21845}
        handling_nodes = {1706659864, 21845, 4157759490, 32767}
        write_invalidation = 0
        nhops = 0
        requests = <error reading variable requests (value requires 750602383779528 bytes, which is more than max-value-size)>
        reused_requests = <error reading variable reused_requests (value requires 562882842878788 bytes, which is more than max-value-size)>
        hop = 1978223616
#9  0x00007ffc17e73c0b in _starpu_fetch_data_on_node (handle=0x5555627a5aa0, node=-2, dst_replicate=0x0, mode=STARPU_R, detached=0, task=0x0, 
    is_prefetch=STARPU_FETCH, async=1, callback_func=0x7ffc17e98f62 <_starpu_data_acquire_fetch_data_callback>, callback_arg=0x555565e4a660, prio=0, 
    origin=0x7ffc17f50b40 "_starpu_data_acquire_launch_fetch") at ../../src/datawizard/coherency.c:880
        __func__ = "_starpu_fetch_data_on_node"
        r = 0x2a2b6d8375e94c00
        ret = 32763
#10 0x00007ffc17e98ef6 in _starpu_data_acquire_launch_fetch (wrapper=0x555565e4a660, async=1, 
    callback=0x7ffc17e98f62 <_starpu_data_acquire_fetch_data_callback>, callback_arg=0x555565e4a660) at ../../src/datawizard/user_interactions.c:130
        node = -2
        handle = 0x5555627a5aa0
--Type <RET> for more, q to quit, c to continue without paging--c
        replicate = 0x0
        ret = 0
#11 0x00007ffc17e99086 in _starpu_data_acquire_continuation_non_blocking (arg=0x555565e4a660) at ../../src/datawizard/user_interactions.c:169
        wrapper = 0x555565e4a660
#12 0x00007ffc17e990f5 in starpu_data_acquire_cb_pre_sync_callback (arg=0x555565e4a660) at ../../src/datawizard/user_interactions.c:187
        wrapper = 0x555565e4a660
#13 0x00007ffc17dbf690 in _starpu_handle_job_termination (j=0x555565c2eb90) at ../../src/core/jobs.c:573
        time = 0x0
        profiling = 0
        old_status = STATUS_UNKNOWN
        current_task = 0x0
        __func__ = "_starpu_handle_job_termination"
        task = 0x555565f110f0
        end_rdep = 0x0
        sched_ctx = 0
        flops = 0
        continuation = 0
        nowhere = 1
        callback = 0x7ffc17e9909e <starpu_data_acquire_cb_pre_sync_callback>
        destroy = 1737019232
        detach = 21845
        regenerate = 402233832
        synchronous = 32764
        worker = 0x7ffc17f998d0 <_starpu_config+10352>
#14 0x00007ffc17e2c355 in _starpu_repush_task (j=0x555565c2eb90) at ../../src/core/sched_policy.c:648
        task = 0x555565f110f0
        sched_ctx = 0x7ffc18009f48 <_starpu_config+470760>
        ret = 0
        can_push = 1
        continuation = 0
#15 0x00007ffc17e2b798 in _starpu_push_task (j=0x555565c2eb90) at ../../src/core/sched_policy.c:548
        __func__ = "_starpu_push_task"
#16 0x00007ffc17dc0931 in _starpu_enforce_deps_starting_from_task (j=0x555565c2eb90) at ../../src/core/jobs.c:991
        ret = 0
        __func__ = "_starpu_enforce_deps_starting_from_task"
#17 0x00007ffc17dec16b in _starpu_notify_cg (pred=0x5555676d6db0, cg=0x55556255aa10) at ../../src/core/dependencies/cg.c:277
        job_successors = 0x555565c2ed98
        j = 0x555565c2eb90
        ndeps_completed = 1
        remaining = 0
        __func__ = "_starpu_notify_cg"
#18 0x00007ffc17dec6f8 in _starpu_notify_cg_list (pred=0x5555676d6db0, successors=0x5555676d6fb8) at ../../src/core/dependencies/cg.c:377
        cg = 0x55556255aa10
        cg_type = 4
        succ = 1
        __func__ = "_starpu_notify_cg_list"
#19 0x00007ffc17df6b96 in _starpu_notify_task_dependencies (j=0x5555676d6db0) at ../../src/core/dependencies/task_deps.c:66
No locals.
#20 0x00007ffc17decb91 in _starpu_notify_dependencies (j=0x5555676d6db0) at ../../src/core/dependencies/dependencies.c:32
No locals.
#21 0x00007ffc17dbf542 in _starpu_handle_job_termination (j=0x5555676d6db0) at ../../src/core/jobs.c:542
        __func__ = "_starpu_handle_job_termination"
        task = 0x55556788cf60
        end_rdep = 0x0
        sched_ctx = 0
        flops = 3076
        continuation = 0
        nowhere = 0
        callback = 0x0
        destroy = 4127192640
        detach = 32763
        regenerate = 402233624
        synchronous = 32764
        worker = 0x7ffc18376a80 <nntile::starpu::prod_slice::cpu<nntile::fp32_t>(void**, void*)>
#22 0x00007ffc17f27b6a in _starpu_cpu_driver_execute_task (cpu_worker=0x7ffc17f998d0 <_starpu_config+10352>, task=0x55556788cf60, j=0x5555676d6db0)
    at ../../src/drivers/cpu/driver_cpu.c:558
        res = 0
        rank = 0
        is_parallel_task = 0
        perf_arch = 0x7ffc17f99918 <_starpu_config+10424>
        __func__ = "_starpu_cpu_driver_execute_task"
#23 0x00007ffc17f27de1 in _starpu_cpu_driver_run_once (cpu_worker=0x7ffc17f998d0 <_starpu_config+10352>) at ../../src/drivers/cpu/driver_cpu.c:596
        ret = 0
        memnode = 0
        workerid = 2
        pi = {conf = 0x0, event_type = starpu_prof_tool_event_end_transfer, starpu_version = {1, 4, 7}, thread_id = -167774656, worker_id = 2, 
          device_number = 2, driver_type = starpu_prof_tool_driver_cpu, memnode = 0, bytes_to_transfer = 2, bytes_transfered = 2, fun_ptr = 0x0}
        res = 0
        j = 0x5555676d6db0
        task = 0x0
        pending_task = 0x55556788cf60
        rank = 0
        __func__ = "_starpu_cpu_driver_run_once"
        continuation_wake_up = 0
#24 0x00007ffc17f28847 in _starpu_cpu_worker (arg=0x7ffc17f998d0 <_starpu_config+10352>) at ../../src/drivers/cpu/driver_cpu.c:714
        worker = 0x7ffc17f998d0 <_starpu_config+10352>
        pi = {conf = 0x0, event_type = starpu_prof_tool_event_start_transfer, starpu_version = {1, 4, 7}, thread_id = -167774656, worker_id = 2, 
          device_number = 2, driver_type = starpu_prof_tool_driver_cpu, memnode = 0, bytes_to_transfer = 0, bytes_transfered = 0, fun_ptr = 0x0}
#25 0x00007ffff7d22ac3 in ?? () from /usr/lib/x86_64-linux-gnu/libc.so.6
No symbol table info available.
#26 0x00007ffff7db4850 in ?? () from /usr/lib/x86_64-linux-gnu/libc.so.6
No symbol table info available.

@Muxas
Copy link
Author

Muxas commented Sep 5, 2024

Disabling starpu_data_wont_use() seems to resolve my issue.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant