-
Notifications
You must be signed in to change notification settings - Fork 15
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Hitting [starpu][_starpu_fetch_data_on_node][assert failure] #56
Comments
Update with
Thread 100 "CPU 1" received signal SIGABRT, Aborted.
[Switching to Thread 0x7ffbf5fff640 (LWP 102796)]
0x00007ffff7d249fc in pthread_kill () from /usr/lib/x86_64-linux-gnu/libc.so.6
(gdb) bt full
#0 0x00007ffff7d249fc in pthread_kill () from /usr/lib/x86_64-linux-gnu/libc.so.6
No symbol table info available.
#1 0x00007ffff7cd0476 in raise () from /usr/lib/x86_64-linux-gnu/libc.so.6
No symbol table info available.
#2 0x00007ffff7cb67f3 in abort () from /usr/lib/x86_64-linux-gnu/libc.so.6
No symbol table info available.
#3 0x00007ffc17e73ade in _starpu_fetch_data_on_node (handle=0x5555627a5aa0, node=0, dst_replicate=0x5555627a5bc8, mode=STARPU_R, detached=1, task=0x0,
is_prefetch=STARPU_IDLEFETCH, async=1, callback_func=0x0, callback_arg=0x0, prio=0, origin=0x7ffc17f50b40 "_starpu_data_acquire_launch_fetch")
at ../../src/datawizard/coherency.c:859
src_node_mask = 0
nnodes = 2
n = 2
__func__ = "_starpu_fetch_data_on_node"
r = 0x5555627a5aa0
ret = 32764
#4 0x00007ffc17e98ef6 in _starpu_data_acquire_launch_fetch (wrapper=0x555573c99a10, async=1, callback=0x0, callback_arg=0x0)
at ../../src/datawizard/user_interactions.c:130
node = 0
handle = 0x5555627a5aa0
replicate = 0x5555627a5bc8
ret = 1
#5 0x00007ffc17e9b3f6 in __starpu_prefetch_data_on_node_with_mode (handle=0x5555627a5aa0, node=0, async=1, mode=STARPU_R, prefetch=STARPU_IDLEFETCH,
prio=0) at ../../src/datawizard/user_interactions.c:593
replicate = 0x5555627a5bc8
__func__ = "__starpu_prefetch_data_on_node_with_mode"
wrapper = 0x555573c99a10
#6 0x00007ffc17e9b9cc in _starpu_data_wont_use (data=0x5555627a5aa0) at ../../src/datawizard/user_interactions.c:692
node = 16
handle = 0x5555627a5aa0
__func__ = "_starpu_data_wont_use"
#7 0x00007ffc17e98fd5 in _starpu_data_acquire_fetch_data_callback (arg=0x555565e4a660) at ../../src/datawizard/user_interactions.c:154
wrapper = 0x555565e4a660
handle = 0x5555627a5aa0
#8 0x00007ffc17e72d90 in _starpu_create_request_to_fetch_data (handle=0x5555627a5aa0, dst_replicate=0x0, mode=STARPU_R, task=0x0,
is_prefetch=STARPU_FETCH, async=1, callback_func=0x7ffc17e98f62 <_starpu_data_acquire_fetch_data_callback>, callback_arg=0x555565e4a660, prio=0,
origin=0x7ffc17f50b40 "_starpu_data_acquire_launch_fetch") at ../../src/datawizard/coherency.c:627
requesting_node = -1
nwait = 0
__func__ = "_starpu_create_request_to_fetch_data"
src_node = 0
src_nodes = {401862296, 32764, 1737391320, 21845}
dst_nodes = {0, 0, 2, 21845}
handling_nodes = {1706659864, 21845, 4157759490, 32767}
write_invalidation = 0
nhops = 0
requests = <error reading variable requests (value requires 750602383779528 bytes, which is more than max-value-size)>
reused_requests = <error reading variable reused_requests (value requires 562882842878788 bytes, which is more than max-value-size)>
hop = 1978223616
#9 0x00007ffc17e73c0b in _starpu_fetch_data_on_node (handle=0x5555627a5aa0, node=-2, dst_replicate=0x0, mode=STARPU_R, detached=0, task=0x0,
is_prefetch=STARPU_FETCH, async=1, callback_func=0x7ffc17e98f62 <_starpu_data_acquire_fetch_data_callback>, callback_arg=0x555565e4a660, prio=0,
origin=0x7ffc17f50b40 "_starpu_data_acquire_launch_fetch") at ../../src/datawizard/coherency.c:880
__func__ = "_starpu_fetch_data_on_node"
r = 0x2a2b6d8375e94c00
ret = 32763
#10 0x00007ffc17e98ef6 in _starpu_data_acquire_launch_fetch (wrapper=0x555565e4a660, async=1,
callback=0x7ffc17e98f62 <_starpu_data_acquire_fetch_data_callback>, callback_arg=0x555565e4a660) at ../../src/datawizard/user_interactions.c:130
node = -2
handle = 0x5555627a5aa0
--Type <RET> for more, q to quit, c to continue without paging--c
replicate = 0x0
ret = 0
#11 0x00007ffc17e99086 in _starpu_data_acquire_continuation_non_blocking (arg=0x555565e4a660) at ../../src/datawizard/user_interactions.c:169
wrapper = 0x555565e4a660
#12 0x00007ffc17e990f5 in starpu_data_acquire_cb_pre_sync_callback (arg=0x555565e4a660) at ../../src/datawizard/user_interactions.c:187
wrapper = 0x555565e4a660
#13 0x00007ffc17dbf690 in _starpu_handle_job_termination (j=0x555565c2eb90) at ../../src/core/jobs.c:573
time = 0x0
profiling = 0
old_status = STATUS_UNKNOWN
current_task = 0x0
__func__ = "_starpu_handle_job_termination"
task = 0x555565f110f0
end_rdep = 0x0
sched_ctx = 0
flops = 0
continuation = 0
nowhere = 1
callback = 0x7ffc17e9909e <starpu_data_acquire_cb_pre_sync_callback>
destroy = 1737019232
detach = 21845
regenerate = 402233832
synchronous = 32764
worker = 0x7ffc17f998d0 <_starpu_config+10352>
#14 0x00007ffc17e2c355 in _starpu_repush_task (j=0x555565c2eb90) at ../../src/core/sched_policy.c:648
task = 0x555565f110f0
sched_ctx = 0x7ffc18009f48 <_starpu_config+470760>
ret = 0
can_push = 1
continuation = 0
#15 0x00007ffc17e2b798 in _starpu_push_task (j=0x555565c2eb90) at ../../src/core/sched_policy.c:548
__func__ = "_starpu_push_task"
#16 0x00007ffc17dc0931 in _starpu_enforce_deps_starting_from_task (j=0x555565c2eb90) at ../../src/core/jobs.c:991
ret = 0
__func__ = "_starpu_enforce_deps_starting_from_task"
#17 0x00007ffc17dec16b in _starpu_notify_cg (pred=0x5555676d6db0, cg=0x55556255aa10) at ../../src/core/dependencies/cg.c:277
job_successors = 0x555565c2ed98
j = 0x555565c2eb90
ndeps_completed = 1
remaining = 0
__func__ = "_starpu_notify_cg"
#18 0x00007ffc17dec6f8 in _starpu_notify_cg_list (pred=0x5555676d6db0, successors=0x5555676d6fb8) at ../../src/core/dependencies/cg.c:377
cg = 0x55556255aa10
cg_type = 4
succ = 1
__func__ = "_starpu_notify_cg_list"
#19 0x00007ffc17df6b96 in _starpu_notify_task_dependencies (j=0x5555676d6db0) at ../../src/core/dependencies/task_deps.c:66
No locals.
#20 0x00007ffc17decb91 in _starpu_notify_dependencies (j=0x5555676d6db0) at ../../src/core/dependencies/dependencies.c:32
No locals.
#21 0x00007ffc17dbf542 in _starpu_handle_job_termination (j=0x5555676d6db0) at ../../src/core/jobs.c:542
__func__ = "_starpu_handle_job_termination"
task = 0x55556788cf60
end_rdep = 0x0
sched_ctx = 0
flops = 3076
continuation = 0
nowhere = 0
callback = 0x0
destroy = 4127192640
detach = 32763
regenerate = 402233624
synchronous = 32764
worker = 0x7ffc18376a80 <nntile::starpu::prod_slice::cpu<nntile::fp32_t>(void**, void*)>
#22 0x00007ffc17f27b6a in _starpu_cpu_driver_execute_task (cpu_worker=0x7ffc17f998d0 <_starpu_config+10352>, task=0x55556788cf60, j=0x5555676d6db0)
at ../../src/drivers/cpu/driver_cpu.c:558
res = 0
rank = 0
is_parallel_task = 0
perf_arch = 0x7ffc17f99918 <_starpu_config+10424>
__func__ = "_starpu_cpu_driver_execute_task"
#23 0x00007ffc17f27de1 in _starpu_cpu_driver_run_once (cpu_worker=0x7ffc17f998d0 <_starpu_config+10352>) at ../../src/drivers/cpu/driver_cpu.c:596
ret = 0
memnode = 0
workerid = 2
pi = {conf = 0x0, event_type = starpu_prof_tool_event_end_transfer, starpu_version = {1, 4, 7}, thread_id = -167774656, worker_id = 2,
device_number = 2, driver_type = starpu_prof_tool_driver_cpu, memnode = 0, bytes_to_transfer = 2, bytes_transfered = 2, fun_ptr = 0x0}
res = 0
j = 0x5555676d6db0
task = 0x0
pending_task = 0x55556788cf60
rank = 0
__func__ = "_starpu_cpu_driver_run_once"
continuation_wake_up = 0
#24 0x00007ffc17f28847 in _starpu_cpu_worker (arg=0x7ffc17f998d0 <_starpu_config+10352>) at ../../src/drivers/cpu/driver_cpu.c:714
worker = 0x7ffc17f998d0 <_starpu_config+10352>
pi = {conf = 0x0, event_type = starpu_prof_tool_event_start_transfer, starpu_version = {1, 4, 7}, thread_id = -167774656, worker_id = 2,
device_number = 2, driver_type = starpu_prof_tool_driver_cpu, memnode = 0, bytes_to_transfer = 0, bytes_transfered = 0, fun_ptr = 0x0}
#25 0x00007ffff7d22ac3 in ?? () from /usr/lib/x86_64-linux-gnu/libc.so.6
No symbol table info available.
#26 0x00007ffff7db4850 in ?? () from /usr/lib/x86_64-linux-gnu/libc.so.6
No symbol table info available.
Disabling |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hello, StarPU team!
We are developing an asynchronous inference engine based on StarPU. The main thread of our Python program creates new data handles, inserts tasks with these handles and calls
starpu_data_unregister_submit()
on all the new handles. For some reason, we hit the following message:
Backtrace is here:
Is it alright to even call
starpu_repush_task()
? It sounds like a task has failed and has to be pushed once again.
Configuration
../configure --disable-build-doc --disable-build-examples --disable-build-tests --disable-mpi --disable-fortran --disable-opencl --disable-socl --disable-starpufft --disable-starpupy --enable-blas-lib=none --prefix=/home/jovyan/mikhalev/install/starpu-1.4.7 --with-fxt=/home/jovyan/mikhalev/install/fxt-0.3.14/lib/pkgconfig --enable-maxcudadev=8 --disable-openmp --disable-parallel-worker
Configuration result
https://gist.github.com/Muxas/ac5344287d3dc3473aaabcd8aabbff3b
Version of StarPU
We rely on StarPU-1.4 from GitHub, commit 1f158eb
Version of GPU drivers
CUDA drivers are 450.172.01
The text was updated successfully, but these errors were encountered: