Using lammps_create_atoms and "run 0" in a Kokkos/CUDA interface

Hello,

I am trying to use lammps c++ api. Lammps has been compiled with Kokkos/CUDA in mind.
My test case is a simple simulation box creation and creating some atoms using the lammps_create_atoms C API.

Unfortunately, as soon as I do a lmp->input->one("run 0") to ask for the neighbour list to be rebuilt after the creation of some atoms, I get an CUDA Illegal memory access.

I tracked down the issue using gdb:

LAMMPS_NS::AtomKokkos::map_set_device() at lammps-29Aug2024/src/KOKKOS/atom_map_kokkos.cpp:258
lammps_create_atoms() at lammps-29Aug2024/src/library.cpp:5556

It seems there is an issue with the tag deep_copy.

My code is the following:

//LAMMPS includes
#include "lammps.h"
#include "input.h"
#include "atom_masks.h"
#include "atom.h"
#include "library.h"
#include "universe.h"
#include "fix.h"
#include "domain.h"
#include "irregular.h"
#include "error.h"
#include "info.h"
#include "platform.h"
#include "pair.h"
#include "force.h"
#include "version.h"


#include "kokkos.h"
/*#include "accelerator_kokkos.h"
#include "neighbor_kokkos.h"
#include "neigh_list_kokkos.h"
#include "kokkos_type.h"
#include "Kokkos_Core.hpp"
#include "Kokkos_StdAlgorithms.hpp"
*/

#include <mpi.h>
#include <iostream>
#include <algorithm>
#include <ranges>
#include <random>


//--------------------------------------------------------------------------------------------------
/**
 * @brief Flatten a vector, using ranges and concept!
 * @link https://stackoverflow.com/a/71680819
 */
template<typename R>
concept nested_range = std::ranges::input_range<R> && std::ranges::range<std::ranges::range_reference_t<R>>;

struct flatten_t
{
  template<nested_range R>
  auto operator()(R && r) const
  {
    return std::forward<R>(r) | std::ranges::views::transform(*this) | std::ranges::views::join;
  }
  
  template<typename T>
  auto operator()(T && t) const
  {
    return std::forward<T>(t);
  }
};

template<typename T>
auto operator |(T && t, flatten_t f)
{
    return f(std::forward<T>(t));
}
constexpr flatten_t flatten;

//--------------------------------------------------------------------------------------------------
/**
 * @brief Initialize the lammps object
 * @param lammps handle
 */
void Initialize(LAMMPS_NS::LAMMPS *lmp){

  lmp->input->one("clear");
  lmp->input->one("units metal");
  lmp->input->one("dimension 3");
  lmp->input->one("boundary p p f");
  lmp->input->one("atom_style atomic");
  lmp->input->one("atom_modify map array");

  lmp->input->one("region box block 0 100 0 100 0 300");
  lmp->input->one("create_box 1 box");

  lmp->input->one("pair_style tersoff");
  lmp->input->one("pair_coeff * * ~/lammps-29Aug2024/potentials/Si.tersoff Si");
  lmp->input->one("mass 1 28.0855");    //Silicium

  lmp->input->one("compute 1 all pe/atom");
  lmp->input->one("thermo 10");
  lmp->input->one("thermo_style custom step temp ke pe etotal");

  return void();
}

//--------------------------------------------------------------------------------------------------
/**
 * @brief test lammps_create_atoms function
 * @param lammps handle
 * @param k step
 */
void test_create_atoms(LAMMPS_NS::LAMMPS *lmp, const unsigned long long int& k){

  //Create a grid
  std::vector<std::array<double,3>> PES;
  for(int i=0;i<100; ++i){
    for(int j=0;j<100; ++j)
      PES.push_back({0.5+i, 0.5+j,1.0*k});
  }

  std::vector<std::array<double,3>> part_PES;
  std::ranges::sample(PES, std::back_inserter(part_PES), 50, std::mt19937_64 { std::random_device{}()});

  //view::common trick from htpps://stackoverfow.com/a/63116423
  auto view_flattened = part_PES | flatten | std::ranges::views::common;
  std::vector<double> out(view_flattened.begin(), view_flattened.end());

  std::vector<int> types(part_PES.size(),1);
  lammps_create_atoms(lmp, part_PES.size(), nullptr, types.data(), out.data(), nullptr, nullptr, 1);

  return void();
}

//--------------------------------------------------------------------------------------------------
//--------------------------------------------------------------------------------------------------
//--------------------------------------------------------------------------------------------------
int main(int argc, char **argv){

  setenv("OMP_PROC_BIND", "spread", true);
  setenv("OMP_PLACES", "threads", true);

  const char* lmpargv[] {"liblammps", "-nocite", "-echo", "none", "-log", "none", "-k", "on", "g", "1", "t", "16", "-sf", "kk", "-pk", "kokkos", "newton", "on"
                        , "neigh", "half", "comm", "device"};
  constexpr int lmpargc = sizeof(lmpargv)/sizeof(const char *);

  MPI_Init(&argc, &argv);

  LAMMPS_NS::LAMMPS *lmp;
  lmp = new LAMMPS_NS::LAMMPS(lmpargc, (char** )lmpargv, MPI_COMM_WORLD);

  std::cout << "LAMMPS version id = " << lmp->num_ver << std::endl;

  Initialize(lmp);
  lmp->input->one("run 0");

  lmp->input->one("write_dump all custom/gz ~/test_lmp/test_0.atom.gz type id x y z");
  unsigned long long int k(0);
  for(;k<235;++k){
    test_create_atoms(lmp,k);
    lmp->input->one("run 0");
    lmp->input->one("write_dump all custom/gz ~/test_lmp/test_" + std::to_string(k) + ".atom.gz type id x y z");
  }
  lmp->input->one("write_dump all custom/gz ~/test_lmp/test_" + std::to_string(k) + ".atom.gz type id x y z");

  delete lmp;
  Kokkos::finalize();
  MPI_Barrier(MPI_COMM_WORLD);
  MPI_Finalize();
  return 0;
}

From the code above, if I remove the run 0 from the for loop, the program runs correctly.
My question is, is this a bug or I am just not using correctly the API?

Thanks in advance,
Regards

@Dy_LMP Can you try with the latest version of LAMMPS (19Nov2024)? I fixed a bug in the KOKKOS atom map last month that may have helped.

1 Like

Hello @stamoor,
Thanks for checking my issue.
I just tested with the 19Nov2024 version. Same issue after a few loop iteration:

GNU gdb (Ubuntu 15.0.50.20240403-0ubuntu1) 15.0.50.20240403-git
Copyright (C) 2024 Free Software Foundation, Inc.
License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>
This is free software: you are free to change and redistribute it.
There is NO WARRANTY, to the extent permitted by law.
Type "show copying" and "show warranty" for details.
This GDB was configured as "x86_64-linux-gnu".
Type "show configuration" for configuration details.
For bug reporting instructions, please see:
<https://www.gnu.org/software/gdb/bugs/>.
Find the GDB manual and other documentation resources online at:
    <http://www.gnu.org/software/gdb/documentation/>.

For help, type "help".
Type "apropos word" to search for commands related to "word"...
Reading symbols from ~/test_lmp/main.e...
(gdb) r
Starting program: ~/test_lmp/main.e
LAMMPS (19 Nov 2024)
KOKKOS mode with Kokkos version 4.4.1 is enabled (src/KOKKOS/kokkos.cpp:72)
  will use up to 1 GPU(s) per node
[New Thread 0x7ffff39ff000 (LWP 156347)]
[New Thread 0x7ffff0bff000 (LWP 156348)]
[New Thread 0x7fffe5fff000 (LWP 156349)]
[New Thread 0x7fffe57fe000 (LWP 156350)]
[New Thread 0x7fffe4ffd000 (LWP 156351)]
[New Thread 0x7fffdffff000 (LWP 156352)]
[New Thread 0x7fffdf7fe000 (LWP 156353)]
[New Thread 0x7fffdeffd000 (LWP 156354)]
[New Thread 0x7fffde7fc000 (LWP 156355)]
[New Thread 0x7fffddffb000 (LWP 156356)]
[New Thread 0x7fffdd7fa000 (LWP 156357)]
[New Thread 0x7fffdcff9000 (LWP 156358)]
[New Thread 0x7fffdc7f8000 (LWP 156359)]
[New Thread 0x7fffdbff7000 (LWP 156360)]
[New Thread 0x7fffdb7f6000 (LWP 156361)]
[New Thread 0x7fffdaff5000 (LWP 156362)]
  using 16 OpenMP thread(s) per MPI task
LAMMPS version id = 20241119
  using 16 OpenMP thread(s) per MPI task
Created orthogonal box = (0 0 0) to (100 100 300)
  1 by 1 by 1 MPI processor grid
Reading tersoff potential file ~/soft/lammps-29Aug2024/potentials/Si.tersoff with DATE: 2007-10-25
WARNING: No fixes with time integration, atoms won't move (src/verlet.cpp:60)
Neighbor list info ...
  update: every = 1 steps, delay = 0 steps, check = yes
  max neighbors/atom: 2000, page size: 100000
  master list distance cutoff = 5.2
  ghost atom cutoff = 5.2
  binsize = 5.2, bins = 20 20 58
  1 neighbor lists, perpetual/occasional/extra = 1 0 0
  (1) pair tersoff/kk, perpetual
      attributes: full, newton on, kokkos_device
      pair build: full/bin/kk/device
      stencil: full/bin/3d
      bin: kk/device
Setting up Verlet run ...
  Unit style    : metal
  Current step  : 0
  Time step     : 0.001
Per MPI rank memory allocation (min/avg/max) = 6.506 | 6.506 | 6.506 Mbytes
   Step          Temp          KinEng         PotEng         TotEng
         0   0             -0              0              0
Loop time of 8.6685e-05 on 16 procs for 0 steps with 0 atoms

79.6% CPU use with 1 MPI tasks x 16 OpenMP threads

MPI task timing breakdown:
Section |  min time  |  avg time  |  max time  |%varavg| %total
---------------------------------------------------------------
Pair    | 0          | 0          | 0          |   0.0 |  0.00
Neigh   | 0          | 0          | 0          |   0.0 |  0.00
Comm    | 0          | 0          | 0          |   0.0 |  0.00
Output  | 0          | 0          | 0          |   0.0 |  0.00
Modify  | 0          | 0          | 0          |   0.0 |  0.00
Other   |            | 8.668e-05  |            |       |100.00

Nlocal:              0 ave           0 max           0 min
Histogram: 1 0 0 0 0 0 0 0 0 0
Nghost:              0 ave           0 max           0 min
Histogram: 1 0 0 0 0 0 0 0 0 0
Neighs:              0 ave           0 max           0 min
Histogram: 1 0 0 0 0 0 0 0 0 0
FullNghs:            0 ave           0 max           0 min
Histogram: 1 0 0 0 0 0 0 0 0 0

Total # of neighbors = 0
Neighbor list builds = 0
Dangerous builds = 0
WARNING: No fixes with time integration, atoms won't move (src/verlet.cpp:60)
Setting up Verlet run ...
  Unit style    : metal
  Current step  : 0
  Time step     : 0.001
Per MPI rank memory allocation (min/avg/max) = 6.561 | 6.561 | 6.561 Mbytes
   Step          Temp          KinEng         PotEng         TotEng
         0   0              0              12.443565      12.443565
Loop time of 8.1451e-05 on 16 procs for 0 steps with 50 atoms

78.6% CPU use with 1 MPI tasks x 16 OpenMP threads

MPI task timing breakdown:
Section |  min time  |  avg time  |  max time  |%varavg| %total
---------------------------------------------------------------
Pair    | 0          | 0          | 0          |   0.0 |  0.00
Neigh   | 0          | 0          | 0          |   0.0 |  0.00
Comm    | 0          | 0          | 0          |   0.0 |  0.00
Output  | 0          | 0          | 0          |   0.0 |  0.00
Modify  | 0          | 0          | 0          |   0.0 |  0.00
Other   |            | 8.145e-05  |            |       |100.00

Nlocal:             50 ave          50 max          50 min
Histogram: 1 0 0 0 0 0 0 0 0 0
Nghost:             16 ave          16 max          16 min
Histogram: 1 0 0 0 0 0 0 0 0 0
Neighs:              0 ave           0 max           0 min
Histogram: 1 0 0 0 0 0 0 0 0 0
FullNghs:           18 ave          18 max          18 min
Histogram: 1 0 0 0 0 0 0 0 0 0

Total # of neighbors = 18
Ave neighs/atom = 0.36
Neighbor list builds = 0
Dangerous builds = 0
WARNING: No fixes with time integration, atoms won't move (src/verlet.cpp:60)
Setting up Verlet run ...
  Unit style    : metal
  Current step  : 0
  Time step     : 0.001
Per MPI rank memory allocation (min/avg/max) = 6.613 | 6.613 | 6.613 Mbytes
   Step          Temp          KinEng         PotEng         TotEng
         0   0              0              411.71436      411.71436
Loop time of 7.7669e-05 on 16 procs for 0 steps with 100 atoms

79.8% CPU use with 1 MPI tasks x 16 OpenMP threads

MPI task timing breakdown:
Section |  min time  |  avg time  |  max time  |%varavg| %total
---------------------------------------------------------------
Pair    | 0          | 0          | 0          |   0.0 |  0.00
Neigh   | 0          | 0          | 0          |   0.0 |  0.00
Comm    | 0          | 0          | 0          |   0.0 |  0.00
Output  | 0          | 0          | 0          |   0.0 |  0.00
Modify  | 0          | 0          | 0          |   0.0 |  0.00
Other   |            | 7.767e-05  |            |       |100.00

Nlocal:            100 ave         100 max         100 min
Histogram: 1 0 0 0 0 0 0 0 0 0
Nghost:             32 ave          32 max          32 min
Histogram: 1 0 0 0 0 0 0 0 0 0
Neighs:              0 ave           0 max           0 min
Histogram: 1 0 0 0 0 0 0 0 0 0
FullNghs:           90 ave          90 max          90 min
Histogram: 1 0 0 0 0 0 0 0 0 0

Total # of neighbors = 90
Ave neighs/atom = 0.9
Neighbor list builds = 0
Dangerous builds = 0
cudaDeviceSynchronize() error( cudaErrorIllegalAddress): an illegal memory access was encountered ~/soft/lammps-19Nov2024/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp:153
Backtrace:
[0x555556d6c099]
[0x555556d42e90]
[0x555556d74f5d]
[0x555556d7515f]
[0x555556d4476d]
[0x5555559bbe14]
[0x5555557cf41a]
[0x555555769f84]
[0x55555576a3a3]
[0x7ffff74051ca]
[0x7ffff740528b] __libc_start_main
[0x555555769325]

Thread 1 "main.e" received signal SIGABRT, Aborted.
Download failed: Invalid argument.  Continuing without source file ./nptl/./nptl/pthread_kill.c.
__pthread_kill_implementation (no_tid=0, signo=6, threadid=<optimized out>) at ./nptl/pthread_kill.c:44
warning: 44     ./nptl/pthread_kill.c: No such file or directory
(gdb) where
#0  __pthread_kill_implementation (no_tid=0, signo=6, threadid=<optimized out>) at ./nptl/pthread_kill.c:44
#1  __pthread_kill_internal (signo=6, threadid=<optimized out>) at ./nptl/pthread_kill.c:78
#2  __GI___pthread_kill (threadid=<optimized out>, signo=signo@entry=6) at ./nptl/pthread_kill.c:89
#3  0x00007ffff742026e in __GI_raise (sig=sig@entry=6) at ../sysdeps/posix/raise.c:26
#4  0x00007ffff74038ff in __GI_abort () at ./stdlib/abort.c:79
#5  0x0000555556d42e9d in Kokkos::Impl::host_abort(char const*) ()
#6  0x0000555556d74f5d in Kokkos::Impl::cuda_internal_error_abort(cudaError, char const*, char const*, int) ()
#7  0x0000555556d7515f in Kokkos::Impl::cuda_device_synchronize(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) ()
#8  0x0000555556d4476d in Kokkos::Impl::ExecSpaceManager::static_fence(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) ()
#9  0x00005555559bbe14 in LAMMPS_NS::AtomKokkos::map_set_device() ()
#10 0x00005555557cf41a in lammps_create_atoms ()
#11 0x0000555555769f84 in test_create_atoms (lmp=0x55555b73ef20, k=@0x7fffffffdd08: 2) at ~/test_lmp/main.cpp:116
#12 0x000055555576a3a3 in main (argc=1, argv=0x7fffffffdf98) at ~/test_lmp/main.cpp:146

Thank you for confirming, I will take a look when I get a minute