Thanks for the very helpful information!
It turned out the problem wasn’t fixed; it just happened to work one time.
I tried your method and got some information out.
Are you able to help with this info?
where:
#0 0x00002ab1200519db in MPIDI_SHMGR_release_generic (opcode=554624168, mpir_comm=0x7ffcf8e5fbd8, root=1277, localbuf=0x25b, count=1, datatype=934203872, errflag=0x7ffcf8e72540, knomial_factor=4,
algo_type=MPIDI_SHMGR_ALGO_KNOMIAL) at …/…/src/mpid/ch4/src/intel/ch4_shm_coll_templates.h:231
#1 0x00002ab120044dc7 in MPIDI_SHMGR_Release (comm=0x2ab1210ee4a8 <PVAR_TIMER_idle+8>, errflag=0x7ffcf8e5fbd8, algo_type=1277, radix=603) at …/…/src/mpid/ch4/src/intel/ch4_shm_coll.c:2666
#2 0x00002ab11ffce305 in MPIDI_Barrier_intra_composition_zeta (comm_ptr=<optimized out>, errflag=<optimized out>, ch4_algo_parameters_container=<optimized out>)
at …/…/src/mpid/ch4/src/intel/ch4_coll_impl.h:323
#3 MPID_Barrier_invoke (comm=0x2ab1210ee4a8 <PVAR_TIMER_idle+8>, errflag=0x7ffcf8e5fbd8, ch4_algo_parameters_container=0x4fd) at …/…/src/mpid/ch4/src/intel/autoreg_ch4_coll.h:55
#4 0x00002ab11ffa6690 in MPIDI_coll_invoke (coll_sig=0x2ab1210ee4a8 <PVAR_TIMER_idle+8>, container=0x7ffcf8e5fbd8, req=0x4fd) at …/…/src/mpid/ch4/src/intel/ch4_coll_select_utils.c:3137
#5 0x00002ab11ff8bd8c in MPIDI_coll_select (coll_sig=0x2ab1210ee4a8 <PVAR_TIMER_idle+8>, req=0x7ffcf8e5fbd8) at …/…/src/mpid/ch4/src/intel/ch4_coll_globals_default.c:129
#6 0x00002ab120061645 in MPID_Barrier (comm=<optimized out>, errflag=<optimized out>) at …/…/src/mpid/ch4/src/intel/ch4_coll.h:31
#7 MPIR_Barrier (comm_ptr=0x2ab1210ee4a8 <PVAR_TIMER_idle+8>, errflag=0x7ffcf8e5fbd8) at …/…/src/mpi/coll/intel/coll_impl.c:355
#8 0x00002ab11ff6b63f in PMPI_Barrier (comm=554624168) at …/…/src/mpi/coll/barrier/barrier.c:267
#9 0x000000000047a61d in LAMMPS_NS::Error::all(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, int, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) () at …/error.cpp:140
#10 0x000000000047a846 in LAMMPS_NS::Error::_all(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, int, fmt::v7_lmp::basic_string_view<char>, fmt::v7_lmp::format_args) () at …/fmt/core.h:2063
#11 0x00000000009ffcff in all<char [43]> (format=…, line=144, file=…, this=0x2eda910) at /cluster/software/GCCcore/10.3.0/include/c++/10.3.0/bits/char_traits.h:371
#12 LAMMPS_NS::ComputeTempRotate::compute_scalar() () at …/compute_temp_rotate.cpp:144
#13 0x0000000000b8cb85 in LAMMPS_NS::FixAveTime::invoke_scalar(long) () at /cluster/software/GCCcore/10.3.0/include/c++/10.3.0/ext/new_allocator.h:89
#14 0x0000000000743718 in LAMMPS_NS::Modify::setup (this=0x3329ef0, vflag=2) at …/modify.cpp:303
#15 0x00000000004d4126 in LAMMPS_NS::Verlet::setup(int) () at …/verlet.cpp:155
#16 0x00000000006d1ff8 in LAMMPS_NS::Run::command(int, char**) () at …/run.cpp:171
#17 0x0000000000470694 in LAMMPS_NS::Input::execute_command() () at …/input.cpp:789
#18 0x0000000000470f43 in LAMMPS_NS::Input::file() () at …/input.cpp:268
#19 0x00000000004435c8 in main () at …/main.cpp:98
#20 0x00002ab121850555 in __libc_start_main () from /lib64/libc.so.6
#21 0x0000000000443c7f in _start ()
where (another PID):
#0 0x00002b64d48e2a32 in MPIDI_OFI_progress (vci=40698188, blocking=1) at …/…/src/mpid/ch4/netmod/ofi/ofi_progress.c:42
#1 0x00002b64d44e4a6f in MPIDI_Progress_test (flags=<optimized out>) at …/…/src/mpid/ch4/src/ch4_progress.c:181
#2 MPID_Progress_test () at …/…/src/mpid/ch4/src/ch4_progress.c:236
#3 0x00002b64d495b0d9 in MPIR_Reduce_intra_tree_pcb_generic (sendbuf=0x26d014c, recvbuf=0x1, count=8, datatype=241, op=1, root=-335638048, comm_ptr=0x2b64d55d0160 <MPIR_Comm_builtin>,
errflag=0x7ffd57660c00, seg_size_requested=4096, childs=0x7ffd57654320, n_childs=3, parent=2) at …/…/src/mpi/coll/intel/reduce/reduce_intra_tree.c:49
#4 0x00002b64d4959286 in MPIR_Reduce_intra_tree (sendbuf=0x26d014c, recvbuf=0x1, count=8, datatype=241, op=1, root=-335638048, comm_ptr=0x4c00080b, errflag=0x7f2000c3aa14, cnt=0x7ffd57660c00)
at …/…/src/mpi/coll/intel/reduce/reduce_intra_tree.c:444
#5 0x00002b64d449b25d in MPIDI_NM_mpi_reduce (sendbuf=<optimized out>, recvbuf=<optimized out>, count=<optimized out>, datatype=<optimized out>, op=<optimized out>, root=<optimized out>,
comm_ptr=<optimized out>, errflag=<optimized out>, ch4_algo_parameters_container_in=0x1) at …/…/src/mpid/ch4/netmod/include/…/ofi/intel/ofi_coll.h:796
#6 MPIDI_Reduce_intra_composition_beta (sendbuf=<optimized out>, recvbuf=<optimized out>, count=<optimized out>, datatype=<optimized out>, op=<optimized out>, root=<optimized out>,
comm_ptr=<optimized out>, errflag=<optimized out>, ch4_algo_parameters_container=<optimized out>) at …/…/src/mpid/ch4/src/intel/ch4_coll_impl.h:1446
#7 MPID_Reduce_invoke (sendbuf=<optimized out>, recvbuf=0x7ffd57660c60, count=<optimized out>, datatype=<optimized out>, op=<optimized out>, root=-335638048, comm=0x2b64d55d0160 <MPIR_Comm_builtin>,
errflag=0x7ffd57660c00, ch4_algo_parameters_container=<optimized out>) at …/…/src/mpid/ch4/src/intel/ch4_coll_select_utils.c:2148
#8 MPIDI_coll_invoke (coll_sig=0x26d014c, container=0x1, req=0x8) at …/…/src/mpid/ch4/src/intel/ch4_coll_select_utils.c:3190
#9 0x00002b64d4487d8c in MPIDI_coll_select (coll_sig=0x26d014c, req=0x1) at …/…/src/mpid/ch4/src/intel/ch4_coll_globals_default.c:129
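#10 0x00002b64d455da10 in MPID_Reduce (sendbuf=<optimized out>, recvbuf=<optimized out>, count=<optimized out>, datatype=<optimized out>, op=<optimized out>, root=<optimized out>, comm=<optimized out>,
errflag=<optimized out>) at …/…/src/mpid/ch4/src/intel/ch4_coll.h:285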
#11 MPIR_Reduce (sendbuf=0x26d014c, recvbuf=0x1, count=8, datatype=241, op=1, root=-335638048, comm_ptr=0x2b64d55d0160 <MPIR_Comm_builtin>, errflag=0x7ffd57660c00)
at …/…/src/mpi/coll/intel/coll_impl.c:563
#12 0x00002b64d495d525 in PMPI_Reduce (sendbuf=0x26d014c, sendbuf@entry=0x7ffd57660c50, recvbuf=0x1, recvbuf@entry=0x7ffd57660c60, count=8, count@entry=1, datatype=241, datatype@entry=1275070475, op=1,
op@entry=1476395011, root=-335638048, root@entry=0, comm=0) at …/…/src/mpi/coll/reduce/reduce.c:489
#13 0x000000000069bb41 in LAMMPS_NS::Output::memory_usage (this=this@entry=0x2ccbb00) at …/output.cpp:815
#14 0x000000000069cde1 in LAMMPS_NS::Output::setup(int) () at …/output.cpp:246
#15 0x00000000004d4135 in LAMMPS_NS::Verlet::setup(int) () at …/verlet.cpp:156
#16 0x00000000006d1ff8 in LAMMPS_NS::Run::command(int, char**) () at …/run.cpp:171
#17 0x0000000000470694 in LAMMPS_NS::Input::execute_command() () at …/input.cpp:789
#18 0x0000000000470f43 in LAMMPS_NS::Input::file() () at …/input.cpp:268
#19 0x00000000004435c8 in main () at …/main.cpp:98
#20 0x00002b64d5d4c555 in __libc_start_main () from /lib64/libc.so.6
#21 0x0000000000443c7f in _start ()