You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
Hello,
The test hydro5_4proc_3sd_4proc does not finish: all the MPI (Intel MPI) processes are stuck in MPI_Barrier, called from the MpiParallelSuperMng destructor.
Backtrace:
(gdb) bt
#0 0x00007f19195c75d8 in MPIDI_SHMGR_release_generic (opcode=447039912, mpir_comm=0x7fff2b79f788, root=1, localbuf=0x1b2, count=515801232, datatype=33046036, errflag=0x7fff2b7b2b70,
knomial_factor=4, algo_type=MPIDI_SHMGR_ALGO_KNOMIAL) at ../../src/mpid/ch4/src/intel/ch4_shm_coll_templates.h:231 #1 0x00007f19195bca27 in MPIDI_SHMGR_Release (comm=0x7f191aa549a8 <PVAR_TIMER_idle+8>, errflag=0x7fff2b79f788, algo_type=MPIDI_SHMGR_ALGO_KNOMIAL, radix=434)
at ../../src/mpid/ch4/src/intel/ch4_shm_coll.c:2609 #2 0x00007f1919533444 in MPIDI_Barrier_intra_composition_zeta (comm_ptr=, errflag=, ch4_algo_parameters_container=)
at ../../src/mpid/ch4/src/intel/ch4_coll_impl.h:319 #3 MPID_Barrier_invoke (comm=0x7f191aa549a8 <PVAR_TIMER_idle+8>, errflag=0x7fff2b79f788, ch4_algo_parameters_container=0x1) at ../../src/mpid/ch4/src/intel/autoreg_ch4_coll.h:56 #4 0x00007f1919509621 in MPIDI_coll_invoke (coll_sig=0x7f191aa549a8 <PVAR_TIMER_idle+8>, container=0x7fff2b79f788, req=0x1) at ../../src/mpid/ch4/src/intel/ch4_coll_select_utils.c:3138 #5 0x00007f19194ed6aa in MPIDI_coll_select (coll_sig=0x7f191aa549a8 <PVAR_TIMER_idle+8>, req=0x7fff2b79f788) at ../../src/mpid/ch4/src/intel/ch4_coll_globals_default.c:130 #6 0x00007f19195d6d45 in MPID_Barrier (comm=, errflag=) at ../../src/mpid/ch4/src/intel/ch4_coll.h:31 #7 MPIR_Barrier (comm_ptr=0x7f191aa549a8 <PVAR_TIMER_idle+8>, errflag=0x7fff2b79f788) at ../../src/mpi/coll/intel/coll_impl.c:349 #8 0x00007f19194cd7ed in PMPI_Barrier (comm=447039912) at ../../src/mpi/coll/barrier/barrier.c:266 #9 0x00007f191df29b88 in Arcane::MpiParallelSuperMng::MpiParallelSuperMng (this=0xbb7ac0, __in_chrg=)
at /work/guignont/work/ArcaneV3/GIT/framework/arcane/src/arcane/impl/Communicator.h:139 #10 0x00007f191df29bd9 in Arcane::MpiParallelSuperMng::MpiParallelSuperMng (this=0xbb7ac0, __in_chrg=)
at /work/guignont/work/ArcaneV3/GIT/framework/arcane/src/arcane/impl/Communicator.h:123 #11 0x00007f191affd84a in std::_Sp_counted_base<(__gnu_cxx::_Lock_policy)2>::_M_release (this=0xc0dd50) at /work/guignont/work/ArcaneV3/GIT/framework/arcane/src/arcane/impl/Ref.h:161 #12 std::_Sp_counted_base<(__gnu_cxx::_Lock_policy)2>::_M_release (this=0xc0dd50) at /work/guignont/work/ArcaneV3/GIT/framework/arcane/src/arcane/impl/Ref.h:161 #13 0x00007f191afefa2e in std::__shared_count<(__gnu_cxx::_Lock_policy)2>::__shared_count (this=, __in_chrg=)
at /work/guignont/work/ArcaneV3/GIT/framework/arcane/src/arcane/impl/Ref.h:1154 #14 std::__shared_ptr<Arcane::IParallelSuperMng, (__gnu_cxx::_Lock_policy)2>::__shared_ptr (this=, __in_chrg=)
at /work/guignont/work/ArcaneV3/GIT/framework/arcane/src/arcane/impl/Ref.h:1154 #15 std::__shared_ptr<Arcane::IParallelSuperMng, (__gnu_cxx::_Lock_policy)2>::reset (this=)
at /work/guignont/work/ArcaneV3/GIT/framework/arcane/src/arcane/impl/Ref.h:1272 #16 Arccore::Ref<Arcane::IParallelSuperMng, 0>::reset (this=)
at /work/guignont/work/ArcaneV3/GIT/framework/arcane/src/arcane/impl/CheckedPointer.h:250 #17 Arcane::Application::~Application (this=0xb9efd0, __in_chrg=) at /work/guignont/work/ArcaneV3/GIT/framework/arccore/src/concurrency/arccore/concurrency/ArcaneMain.h:189 #18 0x00007f191afeff09 in Arcane::Application::~Application (this=0xb9efd0, __in_chrg=)
at /work/guignont/work/ArcaneV3/GIT/framework/arccore/src/concurrency/arccore/concurrency/ArcaneMain.h:159 #19 0x00007f191b027597 in Arcane::ArcaneMain::~ArcaneMain (this=0xb7e6f0, __in_chrg=)
at /work/guignont/work/ArcaneV3/GIT/framework/arccore/src/collections/arccore/collections/ArcaneMain.h:1204 #20 0x00007f191b0613dd in Arcane::ArcaneMainBatch::~ArcaneMainBatch (this=0xb7e6f0, __in_chrg=)
at /soft/irsrvsoft1/expl/eb/r11/el_8-x86_64/easybuild/software/GCCcore/11.2.0/include/c++/11.2.0/bits/IFunctor.h:558 #21 Arcane::ArcaneMainBatch::~ArcaneMainBatch (this=0xb7e6f0, __in_chrg=)
at /work/guignont/work/ArcaneV3/GIT/framework/arccore/src/collections/arccore/collections/ParallelReplication.h:271 #22 0x00007f191b02836a in Arcane::ArcaneMainExecInfo::finalize (this=this@entry=0x7fff2b7b2cb0)
at /work/guignont/work/ArcaneV3/GIT/framework/arccore/src/collections/arccore/collections/ArcaneMain.h:483 #23 0x00007f191b02a446 in Arcane::ArcaneMain::_arcaneMain (app_info=..., factory=)
at /work/guignont/work/ArcaneV3/GIT/framework/arccore/src/collections/arccore/collections/ArcaneMain.h:518 #24 0x00007f191b02a52a in Arcane::ArcaneMain::arcaneMain (app_info=..., factory=0xb7fe90)
at /work/guignont/work/ArcaneV3/GIT/framework/arccore/src/collections/arccore/collections/ArcaneMain.h:899 #25 0x00007f191b02a5d9 in Arcane::ArcaneMain::run () at /work/guignont/work/ArcaneV3/GIT/framework/arccore/src/collections/arccore/collections/ArcaneMain.h:976 #26 0x00007f191eabf1d5 in Arcane::ArcaneLauncher::run () at /soft/irsrvsoft1/expl/eb/r11/el_8-x86_64/easybuild/software/GCCcore/11.2.0/include/c++/11.2.0/bits/Array.h:114 #27 0x0000000000401e3a in _mainHelper (argc=,
argv=)
at /work/guignont/work/ArcaneV3/GIT/framework/cmake-build-release/_common/build_all/arcane/src/arcane/tests/std_function.h:86 #28 0x0000000000401ec9 in operator() (__closure=)
at /work/guignont/work/ArcaneV3/GIT/framework/cmake-build-release/_common/build_all/arcane/src/arcane/tests/std_function.h:97 #29 std::__invoke_impl<void, main(int, char**)::<lambda()>&> (__f=...)
at /work/guignont/work/ArcaneV3/GIT/framework/cmake-build-release/_common/build_all/arcane/src/arcane/tests/char_traits.h:61 #30 std::__invoke_r<void, main(int, char**)::<lambda()>&> (__fn=...)
at /work/guignont/work/ArcaneV3/GIT/framework/cmake-build-release/_common/build_all/arcane/src/arcane/tests/char_traits.h:111 #31 std::_Function_handler<void(), main(int, char**)::<lambda()> >::_M_invoke(const std::_Any_data &) (__functor=...)
at /work/guignont/work/ArcaneV3/GIT/framework/arcane/src/arcane/tests/ArcaneTestMain.cc:291
--Type <RET> for more, q to quit, c to continue without paging-- #32 0x00007f19188178bf in std::function<void ()>::operator()() const (this=this@entry=0x7fff2b7b2e50)
at /soft/irsrvsoft1/expl/eb/r11/el_8-x86_64/easybuild/software/GCCcore/11.2.0/include/c++/11.2.0/bits/Exception.h:556 #33 Arcane::arcaneCallFunctionAndCatchException(std::function<void ()>) (function=...) at /work/guignont/work/ArcaneV3/GIT/framework/arcane/src/arcane/utils/Exception.cc:93 #34 0x0000000000401a63 in main (argc=, argv=)
at /work/guignont/work/ArcaneV3/GIT/framework/arcane/src/arcane/tests/ArcaneTestMain.cc:236
The text was updated successfully, but these errors were encountered:
Yes, this problem has already been identified, and this test is not run in the CI for IFPEN. However, I think this problem is specific to IntelMPI, because this test runs fine with OpenMPI and MPICH.
I will try to reproduce your problem but I need to know the version of IntelMPI you are using.
Hello,
The test hydro5_4proc_3sd_4proc does not finish: all the MPI (Intel MPI) processes are stuck in MPI_Barrier, called from the MpiParallelSuperMng destructor.
Backtrace:
(gdb) bt
#0 0x00007f19195c75d8 in MPIDI_SHMGR_release_generic (opcode=447039912, mpir_comm=0x7fff2b79f788, root=1, localbuf=0x1b2, count=515801232, datatype=33046036, errflag=0x7fff2b7b2b70,
knomial_factor=4, algo_type=MPIDI_SHMGR_ALGO_KNOMIAL) at ../../src/mpid/ch4/src/intel/ch4_shm_coll_templates.h:231
#1 0x00007f19195bca27 in MPIDI_SHMGR_Release (comm=0x7f191aa549a8 <PVAR_TIMER_idle+8>, errflag=0x7fff2b79f788, algo_type=MPIDI_SHMGR_ALGO_KNOMIAL, radix=434)
at ../../src/mpid/ch4/src/intel/ch4_shm_coll.c:2609
#2 0x00007f1919533444 in MPIDI_Barrier_intra_composition_zeta (comm_ptr=, errflag=, ch4_algo_parameters_container=)
at ../../src/mpid/ch4/src/intel/ch4_coll_impl.h:319
#3 MPID_Barrier_invoke (comm=0x7f191aa549a8 <PVAR_TIMER_idle+8>, errflag=0x7fff2b79f788, ch4_algo_parameters_container=0x1) at ../../src/mpid/ch4/src/intel/autoreg_ch4_coll.h:56
#4 0x00007f1919509621 in MPIDI_coll_invoke (coll_sig=0x7f191aa549a8 <PVAR_TIMER_idle+8>, container=0x7fff2b79f788, req=0x1) at ../../src/mpid/ch4/src/intel/ch4_coll_select_utils.c:3138
#5 0x00007f19194ed6aa in MPIDI_coll_select (coll_sig=0x7f191aa549a8 <PVAR_TIMER_idle+8>, req=0x7fff2b79f788) at ../../src/mpid/ch4/src/intel/ch4_coll_globals_default.c:130
#6 0x00007f19195d6d45 in MPID_Barrier (comm=, errflag=) at ../../src/mpid/ch4/src/intel/ch4_coll.h:31
#7 MPIR_Barrier (comm_ptr=0x7f191aa549a8 <PVAR_TIMER_idle+8>, errflag=0x7fff2b79f788) at ../../src/mpi/coll/intel/coll_impl.c:349
#8 0x00007f19194cd7ed in PMPI_Barrier (comm=447039912) at ../../src/mpi/coll/barrier/barrier.c:266
#9 0x00007f191df29b88 in Arcane::MpiParallelSuperMng::~MpiParallelSuperMng (this=0xbb7ac0, __in_chrg=)
at /work/guignont/work/ArcaneV3/GIT/framework/arcane/src/arcane/impl/Communicator.h:139
#10 0x00007f191df29bd9 in Arcane::MpiParallelSuperMng::~MpiParallelSuperMng (this=0xbb7ac0, __in_chrg=)
at /work/guignont/work/ArcaneV3/GIT/framework/arcane/src/arcane/impl/Communicator.h:123
#11 0x00007f191affd84a in std::_Sp_counted_base<(__gnu_cxx::_Lock_policy)2>::_M_release (this=0xc0dd50) at /work/guignont/work/ArcaneV3/GIT/framework/arcane/src/arcane/impl/Ref.h:161
#12 std::_Sp_counted_base<(__gnu_cxx::_Lock_policy)2>::_M_release (this=0xc0dd50) at /work/guignont/work/ArcaneV3/GIT/framework/arcane/src/arcane/impl/Ref.h:161
#13 0x00007f191afefa2e in std::__shared_count<(__gnu_cxx::_Lock_policy)2>::~__shared_count (this=, __in_chrg=)
at /work/guignont/work/ArcaneV3/GIT/framework/arcane/src/arcane/impl/Ref.h:1154
#14 std::__shared_ptr<Arcane::IParallelSuperMng, (__gnu_cxx::_Lock_policy)2>::~__shared_ptr (this=, __in_chrg=)
at /work/guignont/work/ArcaneV3/GIT/framework/arcane/src/arcane/impl/Ref.h:1154
#15 std::__shared_ptr<Arcane::IParallelSuperMng, (__gnu_cxx::_Lock_policy)2>::reset (this=)
at /work/guignont/work/ArcaneV3/GIT/framework/arcane/src/arcane/impl/Ref.h:1272
#16 Arccore::Ref<Arcane::IParallelSuperMng, 0>::reset (this=)
at /work/guignont/work/ArcaneV3/GIT/framework/arcane/src/arcane/impl/CheckedPointer.h:250
#17 Arcane::Application::~Application (this=0xb9efd0, __in_chrg=) at /work/guignont/work/ArcaneV3/GIT/framework/arccore/src/concurrency/arccore/concurrency/ArcaneMain.h:189
#18 0x00007f191afeff09 in Arcane::Application::~Application (this=0xb9efd0, __in_chrg=)
at /work/guignont/work/ArcaneV3/GIT/framework/arccore/src/concurrency/arccore/concurrency/ArcaneMain.h:159
#19 0x00007f191b027597 in Arcane::ArcaneMain::~ArcaneMain (this=0xb7e6f0, __in_chrg=)
at /work/guignont/work/ArcaneV3/GIT/framework/arccore/src/collections/arccore/collections/ArcaneMain.h:1204
#20 0x00007f191b0613dd in Arcane::ArcaneMainBatch::~ArcaneMainBatch (this=0xb7e6f0, __in_chrg=)
at /soft/irsrvsoft1/expl/eb/r11/el_8-x86_64/easybuild/software/GCCcore/11.2.0/include/c++/11.2.0/bits/IFunctor.h:558
#21 Arcane::ArcaneMainBatch::~ArcaneMainBatch (this=0xb7e6f0, __in_chrg=)
at /work/guignont/work/ArcaneV3/GIT/framework/arccore/src/collections/arccore/collections/ParallelReplication.h:271
#22 0x00007f191b02836a in Arcane::ArcaneMainExecInfo::finalize (this=this@entry=0x7fff2b7b2cb0)
at /work/guignont/work/ArcaneV3/GIT/framework/arccore/src/collections/arccore/collections/ArcaneMain.h:483
#23 0x00007f191b02a446 in Arcane::ArcaneMain::_arcaneMain (app_info=..., factory=)
at /work/guignont/work/ArcaneV3/GIT/framework/arccore/src/collections/arccore/collections/ArcaneMain.h:518
#24 0x00007f191b02a52a in Arcane::ArcaneMain::arcaneMain (app_info=..., factory=0xb7fe90)
at /work/guignont/work/ArcaneV3/GIT/framework/arccore/src/collections/arccore/collections/ArcaneMain.h:899
#25 0x00007f191b02a5d9 in Arcane::ArcaneMain::run () at /work/guignont/work/ArcaneV3/GIT/framework/arccore/src/collections/arccore/collections/ArcaneMain.h:976
#26 0x00007f191eabf1d5 in Arcane::ArcaneLauncher::run () at /soft/irsrvsoft1/expl/eb/r11/el_8-x86_64/easybuild/software/GCCcore/11.2.0/include/c++/11.2.0/bits/Array.h:114
#27 0x0000000000401e3a in _mainHelper (argc=,
argv=)
at /work/guignont/work/ArcaneV3/GIT/framework/cmake-build-release/_common/build_all/arcane/src/arcane/tests/std_function.h:86
#28 0x0000000000401ec9 in operator() (__closure=)
at /work/guignont/work/ArcaneV3/GIT/framework/cmake-build-release/_common/build_all/arcane/src/arcane/tests/std_function.h:97
#29 std::__invoke_impl<void, main(int, char**)::<lambda()>&> (__f=...)
at /work/guignont/work/ArcaneV3/GIT/framework/cmake-build-release/_common/build_all/arcane/src/arcane/tests/char_traits.h:61
#30 std::__invoke_r<void, main(int, char**)::<lambda()>&> (__fn=...)
at /work/guignont/work/ArcaneV3/GIT/framework/cmake-build-release/_common/build_all/arcane/src/arcane/tests/char_traits.h:111
#31 std::_Function_handler<void(), main(int, char**)::<lambda()> >::_M_invoke(const std::_Any_data &) (__functor=...)
at /work/guignont/work/ArcaneV3/GIT/framework/arcane/src/arcane/tests/ArcaneTestMain.cc:291
--Type <RET> for more, q to quit, c to continue without paging--
#32 0x00007f19188178bf in std::function<void ()>::operator()() const (this=this@entry=0x7fff2b7b2e50)
at /soft/irsrvsoft1/expl/eb/r11/el_8-x86_64/easybuild/software/GCCcore/11.2.0/include/c++/11.2.0/bits/Exception.h:556
#33 Arcane::arcaneCallFunctionAndCatchException(std::function<void ()>) (function=...) at /work/guignont/work/ArcaneV3/GIT/framework/arcane/src/arcane/utils/Exception.cc:93
#34 0x0000000000401a63 in main (argc=, argv=)
at /work/guignont/work/ArcaneV3/GIT/framework/arcane/src/arcane/tests/ArcaneTestMain.cc:236
The text was updated successfully, but these errors were encountered: