/* ---------------------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator Original Version: http://lammps.sandia.gov, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov See the README file in the top-level LAMMPS directory. ----------------------------------------------------------------------- USER-CUDA Package and associated modifications: https://sourceforge.net/projects/lammpscuda/ Christian Trott, christian.trott@...1735... Lars Winterfeld, lars.winterfeld@...1735... Theoretical Physics II, University of Technology Ilmenau, Germany See the README file in the USER-CUDA directory. This software is distributed under the GNU General Public License. ------------------------------------------------------------------------- */ #include #include #include #include "cuda.h" #include "atom.h" #include "domain.h" #include "force.h" #include "pair.h" #include "update.h" #include "neighbor.h" #include "neigh_list.h" #include "universe.h" #include "input.h" #include "error.h" #include "cuda_neigh_list.h" //#include "pre_binning_cu.h" #include "binning_cu.h" //#include "reverse_binning_cu.h" #include #include #include "cuda_pair_cu.h" #include "cuda_cu.h" using namespace LAMMPS_NS; Cuda::Cuda(LAMMPS *lmp) : Pointers(lmp) { cuda_exists=true; lmp->cuda=this; if(universe->me==0) printf("# Using LAMMPS_CUDA \n"); shared_data.me=universe->me; device_set=false; Cuda_Cuda_GetCompileSettings(&shared_data); if(shared_data.compile_settings.prec_glob!=sizeof(CUDA_FLOAT)/4) printf("\n\n # CUDA WARNING: Compile Settings of cuda and cpp code differ! \n # CUDA WARNING: Global Precision: cuda %i cpp %i\n\n",shared_data.compile_settings.prec_glob, sizeof(CUDA_FLOAT)/4); if(shared_data.compile_settings.prec_x!=sizeof(X_FLOAT)/4) printf("\n\n # CUDA WARNING: Compile Settings of cuda and cpp code differ! \n # CUDA WARNING: X Precision: cuda %i cpp %i\n\n",shared_data.compile_settings.prec_x, sizeof(X_FLOAT)/4); if(shared_data.compile_settings.prec_v!=sizeof(V_FLOAT)/4) printf("\n\n # CUDA WARNING: Compile Settings of cuda and cpp code differ! \n # CUDA WARNING: V Precision: cuda %i cpp %i\n\n",shared_data.compile_settings.prec_v, sizeof(V_FLOAT)/4); if(shared_data.compile_settings.prec_f!=sizeof(F_FLOAT)/4) printf("\n\n # CUDA WARNING: Compile Settings of cuda and cpp code differ! \n # CUDA WARNING: F Precision: cuda %i cpp %i\n\n",shared_data.compile_settings.prec_f, sizeof(F_FLOAT)/4); if(shared_data.compile_settings.prec_pppm!=sizeof(PPPM_FLOAT)/4) printf("\n\n # CUDA WARNING: Compile Settings of cuda and cpp code differ! \n # CUDA WARNING: PPPM Precision: cuda %i cpp %i\n\n",shared_data.compile_settings.prec_pppm, sizeof(PPPM_FLOAT)/4); if(shared_data.compile_settings.prec_fft!=sizeof(FFT_FLOAT)/4) printf("\n\n # CUDA WARNING: Compile Settings of cuda and cpp code differ! \n # CUDA WARNING: FFT Precision: cuda %i cpp %i\n\n",shared_data.compile_settings.prec_fft, sizeof(FFT_FLOAT)/4); #ifdef FFT_CUFFT if(shared_data.compile_settings.cufft!=1) printf("\n\n # CUDA WARNING: Compile Settings of cuda and cpp code differ! \n # CUDA WARNING: cufft: cuda %i cpp %i\n\n",shared_data.compile_settings.cufft, 1); #else if(shared_data.compile_settings.cufft!=0) printf("\n\n # CUDA WARNING: Compile Settings of cuda and cpp code differ! \n # CUDA WARNING: cufft: cuda %i cpp %i\n\n",shared_data.compile_settings.cufft, 0); #endif if(shared_data.compile_settings.arch!=CUDA_ARCH) printf("\n\n # CUDA WARNING: Compile Settings of cuda and cpp code differ! \n # CUDA WARNING: arch: cuda %i cpp %i\n\n",shared_data.compile_settings.cufft, CUDA_ARCH); cu_x = 0; cu_v = 0; cu_f = 0; cu_tag = 0; cu_type = 0; cu_mask = 0; cu_image = 0; cu_xhold = 0; cu_q = 0; cu_rmass = 0; cu_mass = 0; cu_virial = 0; cu_eatom = 0; cu_vatom = 0; cu_radius = 0; cu_density = 0; cu_omega = 0; cu_torque = 0; cu_special = 0; cu_nspecial = 0; cu_molecule = 0; cu_x_type = 0; x_type = 0; cu_v_radius = 0; v_radius = 0; cu_omega_rmass = 0; omega_rmass = 0; binned_id = 0; cu_binned_id = 0; binned_idnew = 0; cu_binned_idnew = 0; cu_map_array = 0; copy_buffer=0; copy_buffersize=0; neighbor_decide_by_integrator=0; pinned=true; debugdata=0; new int[2*CUDA_MAX_DEBUG_SIZE]; finished_setup = false; begin_setup = false; finished_run = false; setSharedDataZero(); uploadtime=0; downloadtime=0; dotiming=false; dotestatom = false; testatom = 0; oncpu = true; self_comm = 0; MYDBG( printf("# CUDA: Cuda::Cuda Done...\n");) //cCudaData } Cuda::~Cuda() { print_timings(); if(universe->me==0) printf("# CUDA: Free memory...\n"); delete cu_q; delete cu_x; delete cu_v; delete cu_f; delete cu_tag; delete cu_type; delete cu_mask; delete cu_image; delete cu_xhold; delete cu_mass; delete cu_rmass; delete cu_virial; delete cu_eng_vdwl; delete cu_eng_coul; delete cu_eatom; delete cu_vatom; delete cu_radius; delete cu_density; delete cu_omega; delete cu_torque; delete cu_molecule; delete cu_x_type; delete [] x_type; delete cu_v_radius; delete [] v_radius; delete cu_omega_rmass; delete [] omega_rmass; delete cu_map_array; std::map::iterator p = neigh_lists.begin(); while(p != neigh_lists.end()) { delete p->second; ++p; } } void Cuda::accelerator(int narg, char** arg) { if(device_set) return; if(universe->me==0) printf("# CUDA: Activate GPU \n"); int* devicelist=NULL; int pppn=2; for(int i=0;iall(FLERR,"Invalid Options for 'accelerator' command. Expecting a number after 'gpu/node' option."); pppn=atoi(arg[i]); } if(strcmp(arg[i],"gpu/node/special")==0) { if(++i==narg) error->all(FLERR,"Invalid Options for 'accelerator' command. Expecting number of GPUs to be used per node after keyword 'gpu/node/special'."); pppn=atoi(arg[i]); if(pppn<1) error->all(FLERR,"Invalid Options for 'accelerator' command. Expecting number of GPUs to be used per node after keyword 'gpu/node special'."); if(i+pppn==narg) error->all(FLERR,"Invalid Options for 'accelerator' command. Expecting list of device ids after keyword 'gpu/node special'."); devicelist=new int[pppn]; for(int k=0;kall(FLERR,"Invalid Options for 'accelerator' command. Expecting a number after 'pinned' option."); pinned=atoi(arg[i])==0?false:true; if((pinned==false)&&(universe->me==0)) printf(" #CUDA: Pinned memory is not used for communication\n"); } if(strcmp(arg[i],"timing")==0) { dotiming=true; } if(strcmp(arg[i],"suffix")==0) { if(++i==narg) error->all(FLERR,"Invalid Options for 'accelerator' command. Expecting a string after 'suffix' option."); strcpy(lmp->suffix,arg[i]); } if(strcmp(arg[i],"overlap_comm")==0) { shared_data.overlap_comm=1; } if(strcmp(arg[i],"test")==0) { if(++i==narg) error->all(FLERR,"Invalid Options for 'accelerator' command. Expecting a number after 'test' option."); testatom=atof(arg[i]); dotestatom=true; } if(strcmp(arg[i],"override/bpa")==0) { if(++i==narg) error->all(FLERR,"Invalid Options for 'accelerator' command. Expecting a number after 'override/bpa' option."); shared_data.pair.override_block_per_atom = atoi(arg[i]); } } CudaWrapper_Init(0, (char**)0,universe->me,pppn,devicelist); //if(shared_data.overlap_comm) CudaWrapper_AddStreams(3); cu_x = 0; cu_v = 0; cu_f = 0; cu_tag = 0; cu_type = 0; cu_mask = 0; cu_image = 0; cu_xhold = 0; cu_q = 0; cu_rmass = 0; cu_mass = 0; cu_virial = 0; cu_eatom = 0; cu_vatom = 0; cu_radius = 0; cu_density = 0; cu_omega = 0; cu_torque = 0; cu_special = 0; cu_nspecial = 0; cu_molecule = 0; cu_x_type = 0; cu_v_radius = 0; cu_omega_rmass = 0; cu_binned_id = 0; cu_binned_idnew = 0; device_set=true; allocate(); delete devicelist; } void Cuda::setSharedDataZero() { MYDBG(printf("# CUDA: Cuda::setSharedDataZero ...\n");) shared_data.atom.nlocal = 0; shared_data.atom.nghost = 0; shared_data.atom.nall = 0; shared_data.atom.nmax = 0; shared_data.atom.ntypes = 0; shared_data.atom.q_flag = 0; shared_data.atom.need_eatom = 0; shared_data.atom.need_vatom = 0; shared_data.atom.update_nmax = 1; shared_data.atom.update_nlocal = 1; shared_data.atom.update_neigh = 1; shared_data.pair.cudable_force = 0; shared_data.pair.collect_forces_later = 0; shared_data.pair.use_block_per_atom = 0; shared_data.pair.override_block_per_atom = -1; shared_data.pair.cut = 0; shared_data.pair.cutsq = 0; shared_data.pair.cut_inner = 0; shared_data.pair.cut_coul = 0; shared_data.pair.special_lj = 0; shared_data.pair.special_coul = 0; shared_data.pair.neighall = false; shared_data.pppm.cudable_force = 0; shared_data.buffersize = 0; shared_data.buffer_new = 1; shared_data.buffer = NULL; shared_data.comm.comm_phase=0; shared_data.overlap_comm=0; shared_data.comm.buffer = NULL; shared_data.comm.buffer_size=0; shared_data.comm.overlap_split_ratio=0; // setTimingsZero(); } void Cuda::allocate() { accelerator(0,NULL); MYDBG(printf("# CUDA: Cuda::allocate ...\n");) if(not cu_virial) { cu_virial = new cCudaData (NULL, & shared_data.pair.virial , 6); cu_eng_vdwl = new cCudaData (NULL, & shared_data.pair.eng_vdwl ,1); cu_eng_coul = new cCudaData (NULL, & shared_data.pair.eng_coul ,1); cu_extent = new cCudaData (extent, 6); shared_data.flag = CudaWrapper_AllocCudaData(sizeof(int)); int size=2*CUDA_MAX_DEBUG_SIZE; debugdata = new int[size]; cu_debugdata = new cCudaData (debugdata , size); shared_data.debugdata=cu_debugdata->dev_data(); } checkResize(); setSystemParams(); MYDBG(printf("# CUDA: Cuda::allocate done...\n");) } void Cuda::setSystemParams() { MYDBG(printf("# CUDA: Cuda::setSystemParams ...\n");) shared_data.atom.nlocal = atom->nlocal; shared_data.atom.nghost = atom->nghost; shared_data.atom.nall = atom->nlocal + atom->nghost; shared_data.atom.ntypes = atom->ntypes; shared_data.atom.q_flag = atom->q_flag; shared_data.atom.rmass_flag = atom->rmass_flag; MYDBG(printf("# CUDA: Cuda::setSystemParams done ...\n");) } void Cuda::setDomainParams() { MYDBG(printf("# CUDA: Cuda::setDomainParams ...\n");) cuda_shared_domain* cu_domain = &shared_data.domain; cu_domain->triclinic = domain->triclinic; for(short i=0; i<3; ++i) { cu_domain->periodicity[i] = domain->periodicity[i]; cu_domain->sublo[i] = domain->sublo[i]; cu_domain->subhi[i] = domain->subhi[i]; cu_domain->boxlo[i] = domain->boxlo[i]; cu_domain->boxhi[i] = domain->boxhi[i]; cu_domain->prd[i] = domain->prd[i]; } if(domain->triclinic) { for(short i=0; i<3; ++i) { cu_domain->boxlo_lamda[i] = domain->boxlo_lamda[i]; cu_domain->boxhi_lamda[i] = domain->boxhi_lamda[i]; cu_domain->prd_lamda[i] = domain->prd_lamda[i]; } cu_domain->xy = domain->xy; cu_domain->xz = domain->xz; cu_domain->yz = domain->yz; } for(int i=0;i<6;i++) { cu_domain->h[i]=domain->h[i]; cu_domain->h_inv[i]=domain->h_inv[i]; cu_domain->h_rate[i]=domain->h_rate[i]; } cu_domain->update=2; MYDBG(printf("# CUDA: Cuda::setDomainParams done ...\n");) } void Cuda::checkResize() { MYDBG(printf("# CUDA: Cuda::checkResize ...\n");) accelerator(0,NULL); cuda_shared_atom* cu_atom = & shared_data.atom; cuda_shared_pair* cu_pair = & shared_data.pair; cu_atom->q_flag = atom->q_flag; cu_atom->rmass_flag = atom->rmass ? 1 : 0; cu_atom->nall = atom->nlocal + atom->nghost; cu_atom->nlocal = atom->nlocal; cu_atom->nghost = atom->nghost; // do we have more atoms to upload than currently allocated memory on device? (also true if nothing yet allocated) if(atom->nmax > cu_atom->nmax || cu_tag == NULL) { delete cu_x; cu_x = new cCudaData ((double*)atom->x , & cu_atom->x , atom->nmax, 3,0,true); //cu_x->set_buffer(&(shared_data.buffer),&(shared_data.buffersize),true); delete cu_v; cu_v = new cCudaData ((double*)atom->v, & cu_atom->v , atom->nmax, 3); delete cu_f; cu_f = new cCudaData ((double*)atom->f, & cu_atom->f , atom->nmax, 3,0,true); delete cu_tag; cu_tag = new cCudaData (atom->tag , & cu_atom->tag , atom->nmax ); delete cu_type; cu_type = new cCudaData (atom->type , & cu_atom->type , atom->nmax ); delete cu_mask; cu_mask = new cCudaData (atom->mask , & cu_atom->mask , atom->nmax ); delete cu_image; cu_image = new cCudaData (atom->image , & cu_atom->image , atom->nmax ); if(atom->rmass) {delete cu_rmass; cu_rmass = new cCudaData (atom->rmass , & cu_atom->rmass , atom->nmax );} if(cu_atom->q_flag) {delete cu_q; cu_q = new cCudaData ((double*)atom->q, & cu_atom->q , atom->nmax );}// cu_q->set_buffer(&(copy_buffer),&(copy_buffersize),true);} if(atom->radius) { delete cu_radius; cu_radius = new cCudaData (atom->radius , & cu_atom->radius , atom->nmax ); delete cu_v_radius; cu_v_radius = new cCudaData (v_radius , & cu_atom->v_radius , atom->nmax*4); delete cu_omega_rmass; cu_omega_rmass = new cCudaData (omega_rmass , & cu_atom->omega_rmass , atom->nmax*4); } if(atom->omega) {delete cu_omega; cu_omega = new cCudaData (((double*) atom->omega) , & cu_atom->omega , atom->nmax,3 );} if(atom->torque) {delete cu_torque; cu_torque = new cCudaData (((double*) atom->torque) , & cu_atom->torque , atom->nmax,3 );} if(atom->special) {delete cu_special; cu_special = new cCudaData (((int*) &(atom->special[0][0])) , & cu_atom->special , atom->nmax,atom->maxspecial ); shared_data.atom.maxspecial=atom->maxspecial;} if(atom->nspecial) {delete cu_nspecial; cu_nspecial = new cCudaData (((int*) atom->nspecial) , & cu_atom->nspecial , atom->nmax,3 );} if(atom->molecule) {delete cu_molecule; cu_molecule = new cCudaData (((int*) atom->molecule) , & cu_atom->molecule , atom->nmax );} shared_data.atom.special_flag = neighbor->special_flag; shared_data.atom.molecular = atom->molecular; cu_atom->update_nmax = 2; cu_atom->nmax = atom->nmax; delete cu_x_type; cu_x_type = new cCudaData (x_type , & cu_atom->x_type , atom->nmax*4); } if(((cu_xhold==NULL)||(cu_xhold->get_dim()[0]maxhold))&&neighbor->xhold) { delete cu_xhold; cu_xhold = new cCudaData ((double*)neighbor->xhold, & cu_atom->xhold , neighbor->maxhold, 3); shared_data.atom.maxhold=neighbor->maxhold; } if(atom->mass && !cu_mass) {cu_mass = new cCudaData (atom->mass , & cu_atom->mass , atom->ntypes+1);} cu_atom->mass_host = atom->mass; if(atom->map_style==1) { if((cu_map_array==NULL)) { cu_map_array = new cCudaData (atom->get_map_array() , & cu_atom->map_array , atom->get_map_size() ); } else if(cu_map_array->dev_size()/sizeof(int)get_map_size()) { delete cu_map_array; cu_map_array = new cCudaData (atom->get_map_array() , & cu_atom->map_array , atom->get_map_size() ); } } // if any of the host pointers have changed (e.g. re-allocated somewhere else), set to correct pointer if(cu_x ->get_host_data() != atom->x) cu_x ->set_host_data((double*) (atom->x)); if(cu_v ->get_host_data() != atom->v) cu_v ->set_host_data((double*) (atom->v)); if(cu_f ->get_host_data() != atom->f) cu_f ->set_host_data((double*) (atom->f)); if(cu_tag ->get_host_data() != atom->tag) cu_tag ->set_host_data(atom->tag); if(cu_type->get_host_data() != atom->type) cu_type->set_host_data(atom->type); if(cu_mask->get_host_data() != atom->mask) cu_mask->set_host_data(atom->mask); if(cu_image->get_host_data() != atom->image) cu_mask->set_host_data(atom->image); if(cu_xhold) if(cu_xhold->get_host_data()!= neighbor->xhold) cu_xhold->set_host_data((double*)(neighbor->xhold)); if(atom->rmass) if(cu_rmass->get_host_data() != atom->rmass) cu_rmass->set_host_data((double*) (atom->rmass)); if(cu_atom->q_flag) if(cu_q->get_host_data() != atom->q) cu_q->set_host_data((double*) (atom->q)); if(atom->radius) if(cu_radius->get_host_data() != atom->radius) cu_radius->set_host_data((double*) (atom->radius)); if(atom->omega) if(cu_omega->get_host_data() != atom->omega) cu_omega->set_host_data((double*) (atom->omega)); if(atom->torque) if(cu_torque->get_host_data() != atom->torque) cu_torque->set_host_data((double*) (atom->torque)); if(atom->special) if(cu_special->get_host_data() != atom->special) {delete cu_special; cu_special = new cCudaData (((int*) atom->special) , & cu_atom->special , atom->nmax,atom->maxspecial ); shared_data.atom.maxspecial=atom->maxspecial;} if(atom->nspecial) if(cu_nspecial->get_host_data() != atom->nspecial) cu_nspecial->set_host_data((int*) (atom->nspecial)); if(atom->molecule) if(cu_molecule->get_host_data() != atom->molecule) cu_molecule->set_host_data((int*) (atom->molecule)); if(force) if(cu_virial ->get_host_data() != force->pair->virial) cu_virial ->set_host_data(force->pair->virial); if(force) if(cu_eng_vdwl ->get_host_data() != &force->pair->eng_vdwl) cu_eng_vdwl ->set_host_data(&force->pair->eng_vdwl); if(force) if(cu_eng_coul ->get_host_data() != &force->pair->eng_coul) cu_eng_coul ->set_host_data(&force->pair->eng_coul); cu_atom->update_nlocal = 2; MYDBG(printf("# CUDA: Cuda::checkResize done...\n");) } void Cuda::evsetup_eatom_vatom(int eflag_atom,int vflag_atom) { if(eflag_atom) { if(not cu_eatom) cu_eatom = new cCudaData (force->pair->eatom, & (shared_data.atom.eatom) , atom->nmax );// cu_eatom->set_buffer(&(copy_buffer),&(copy_buffersize),true);} if(cu_eatom->get_dim()[0]!=atom->nmax) { //delete cu_eatom; //cu_eatom = new cCudaData (force->pair->eatom, & (shared_data.atom.eatom) , atom->nmax );// cu_eatom->set_buffer(&(copy_buffer),&(copy_buffersize),true);} shared_data.atom.update_nmax=2; } cu_eatom->set_host_data(force->pair->eatom); cu_eatom->memset_device(0); } if(vflag_atom) { if(not cu_vatom) cu_vatom = new cCudaData ((double*)force->pair->vatom, & (shared_data.atom.vatom) , atom->nmax ,6 );// cu_vatom->set_buffer(&(copy_buffer),&(copy_buffersize),true);} if(cu_vatom->get_dim()[0]!=atom->nmax) { //delete cu_vatom; //cu_vatom = new cCudaData ((double*)force->pair->vatom, & (shared_data.atom.vatom) , atom->nmax ,6 );// cu_vatom->set_buffer(&(copy_buffer),&(copy_buffersize),true);} shared_data.atom.update_nmax=2; } cu_vatom->set_host_data((double*)force->pair->vatom); cu_vatom->memset_device(0); } } void Cuda::uploadAll() { MYDBG(printf("# CUDA: Cuda::uploadAll() ... start\n");) timespec starttime; timespec endtime; if(atom->nmax!=shared_data.atom.nmax) checkResize(); clock_gettime(CLOCK_REALTIME,&starttime); cu_x ->upload(); cu_v ->upload(); cu_f ->upload(); cu_tag ->upload(); cu_type->upload(); cu_mask->upload(); cu_image->upload(); if(shared_data.atom.q_flag) cu_q ->upload(); if(atom->rmass) cu_rmass->upload(); if(atom->radius) cu_radius->upload(); if(atom->omega) cu_omega->upload(); if(atom->torque) cu_torque->upload(); if(atom->special) cu_special->upload(); if(atom->nspecial) cu_nspecial->upload(); if(atom->molecule) cu_molecule->upload(); if(cu_eatom) cu_eatom->upload(); if(cu_vatom) cu_vatom->upload(); clock_gettime(CLOCK_REALTIME,&endtime); uploadtime+=(endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000); CUDA_IF_BINNING(Cuda_PreBinning(& shared_data);) CUDA_IF_BINNING(Cuda_Binning (& shared_data);) shared_data.atom.triggerneighsq=neighbor->triggersq; MYDBG(printf("# CUDA: Cuda::uploadAll() ... end\n");) } void Cuda::downloadAll() { MYDBG(printf("# CUDA: Cuda::downloadAll() ... start\n");) timespec starttime; timespec endtime; if(atom->nmax!=shared_data.atom.nmax) checkResize(); CUDA_IF_BINNING( Cuda_ReverseBinning(& shared_data); ) clock_gettime(CLOCK_REALTIME,&starttime); cu_x ->download(); cu_v ->download(); cu_f ->download(); cu_type->download(); cu_tag ->download(); cu_mask->download(); cu_image->download(); //if(shared_data.atom.need_eatom) cu_eatom->download(); //if(shared_data.atom.need_vatom) cu_vatom->download(); if(shared_data.atom.q_flag) cu_q ->download(); if(atom->rmass) cu_rmass->download(); if(atom->radius) cu_radius->download(); if(atom->omega) cu_omega->download(); if(atom->torque) cu_torque->download(); if(atom->special) cu_special->download(); if(atom->nspecial) cu_nspecial->download(); if(atom->molecule) cu_molecule->download(); if(cu_eatom) cu_eatom->download(); if(cu_vatom) cu_vatom->download(); clock_gettime(CLOCK_REALTIME,&endtime); downloadtime+=(endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000); MYDBG(printf("# CUDA: Cuda::downloadAll() ... end\n");) } void Cuda::downloadX() { Cuda_Pair_RevertXType(& this->shared_data); cu_x->download(); } CudaNeighList* Cuda::registerNeighborList(class NeighList* neigh_list) { MYDBG(printf("# CUDA: Cuda::registerNeighborList() ... start a\n");) std::map::iterator p = neigh_lists.find(neigh_list); if(p != neigh_lists.end()) return p->second; else { CudaNeighList* neigh_list_cuda = new CudaNeighList(lmp, neigh_list); neigh_lists.insert(std::pair(neigh_list, neigh_list_cuda)); return neigh_list_cuda; } MYDBG(printf("# CUDA: Cuda::registerNeighborList() ... end b\n");) } void Cuda::uploadAllNeighborLists() { MYDBG(printf("# CUDA: Cuda::uploadAllNeighborList() ... start\n");) std::map::iterator p = neigh_lists.begin(); while(p != neigh_lists.end()) { p->second->nl_upload(); if(not (p->second->neigh_list->cuda_list->build_cuda)) for(int i=0;inlocal;i++) p->second->sneighlist.maxneighbors=MAX(p->second->neigh_list->numneigh[i],p->second->sneighlist.maxneighbors) ; ++p; } MYDBG(printf("# CUDA: Cuda::uploadAllNeighborList() ... done\n");) } void Cuda::downloadAllNeighborLists() { MYDBG(printf("# CUDA: Cuda::downloadAllNeighborList() ... start\n");) std::map::iterator p = neigh_lists.begin(); while(p != neigh_lists.end()) { p->second->nl_download(); ++p; } } void Cuda::update_xhold(int &maxhold,double* xhold) { if(this->shared_data.atom.maxholdnmax) { maxhold = atom->nmax; delete this->cu_xhold; this->cu_xhold = new cCudaData ((double*)xhold, & this->shared_data.atom.xhold , maxhold, 3); } this->shared_data.atom.maxhold=maxhold; CudaWrapper_CopyData(this->cu_xhold->dev_data(),this->cu_x->dev_data(),3*atom->nmax*sizeof(X_FLOAT)); } void Cuda::setTimingsZero() { shared_data.cuda_timings.test1=0; shared_data.cuda_timings.test2=0; //communication shared_data.cuda_timings.comm_forward_total = 0; shared_data.cuda_timings.comm_forward_mpi_upper = 0; shared_data.cuda_timings.comm_forward_mpi_lower = 0; shared_data.cuda_timings.comm_forward_kernel_pack = 0; shared_data.cuda_timings.comm_forward_kernel_unpack = 0; shared_data.cuda_timings.comm_forward_upload = 0; shared_data.cuda_timings.comm_forward_download = 0; shared_data.cuda_timings.comm_exchange_total = 0; shared_data.cuda_timings.comm_exchange_mpi = 0; shared_data.cuda_timings.comm_exchange_kernel_pack = 0; shared_data.cuda_timings.comm_exchange_kernel_unpack = 0; shared_data.cuda_timings.comm_exchange_kernel_fill = 0; shared_data.cuda_timings.comm_exchange_cpu_pack= 0; shared_data.cuda_timings.comm_exchange_upload = 0; shared_data.cuda_timings.comm_exchange_download = 0; shared_data.cuda_timings.comm_border_total = 0; shared_data.cuda_timings.comm_border_mpi = 0; shared_data.cuda_timings.comm_border_kernel_pack = 0; shared_data.cuda_timings.comm_border_kernel_unpack = 0; shared_data.cuda_timings.comm_border_kernel_buildlist = 0; shared_data.cuda_timings.comm_border_kernel_self = 0; shared_data.cuda_timings.comm_border_upload = 0; shared_data.cuda_timings.comm_border_download = 0; //pair forces shared_data.cuda_timings.pair_xtype_conversion = 0; shared_data.cuda_timings.pair_kernel = 0; shared_data.cuda_timings.pair_virial = 0; shared_data.cuda_timings.pair_force_collection = 0; //neighbor shared_data.cuda_timings.neigh_bin = 0; shared_data.cuda_timings.neigh_build = 0; shared_data.cuda_timings.neigh_special = 0; //PPPM shared_data.cuda_timings.pppm_particle_map = 0; shared_data.cuda_timings.pppm_make_rho = 0; shared_data.cuda_timings.pppm_brick2fft = 0; shared_data.cuda_timings.pppm_poisson = 0; shared_data.cuda_timings.pppm_fillbrick = 0; shared_data.cuda_timings.pppm_fieldforce = 0; shared_data.cuda_timings.pppm_compute = 0; CudaWrapper_CheckUploadTime(true); CudaWrapper_CheckDownloadTime(true); CudaWrapper_CheckCPUBufUploadTime(true); CudaWrapper_CheckCPUBufDownloadTime(true); } void Cuda::print_timings() { if(universe->me!=0) return; if(not dotiming) return; printf("\n # CUDA: Special timings\n\n"); printf("\n Transfer Times\n"); printf(" PCIe Upload: \t %lf s\n",CudaWrapper_CheckUploadTime()); printf(" PCIe Download:\t %lf s\n",CudaWrapper_CheckDownloadTime()); printf(" CPU Tempbbuf Upload: \t %lf \n",CudaWrapper_CheckCPUBufUploadTime()); printf(" CPU Tempbbuf Download: \t %lf \n",CudaWrapper_CheckCPUBufDownloadTime()); printf("\n Communication \n"); printf(" Forward Total \t %lf \n",shared_data.cuda_timings.comm_forward_total); printf(" Forward MPI Upper Bound \t %lf \n",shared_data.cuda_timings.comm_forward_mpi_upper); printf(" Forward MPI Lower Bound \t %lf \n",shared_data.cuda_timings.comm_forward_mpi_lower); printf(" Forward Kernel Pack \t %lf \n",shared_data.cuda_timings.comm_forward_kernel_pack); printf(" Forward Kernel Unpack \t %lf \n",shared_data.cuda_timings.comm_forward_kernel_unpack); printf(" Forward Kernel Self \t %lf \n",shared_data.cuda_timings.comm_forward_kernel_self); printf(" Forward Upload \t %lf \n",shared_data.cuda_timings.comm_forward_upload); printf(" Forward Download \t %lf \n",shared_data.cuda_timings.comm_forward_download); printf(" Forward Overlap Split Ratio\t %lf \n",shared_data.comm.overlap_split_ratio); printf("\n"); printf(" Exchange Total \t %lf \n",shared_data.cuda_timings.comm_exchange_total); printf(" Exchange MPI \t %lf \n",shared_data.cuda_timings.comm_exchange_mpi); printf(" Exchange Kernel Pack \t %lf \n",shared_data.cuda_timings.comm_exchange_kernel_pack); printf(" Exchange Kernel Unpack \t %lf \n",shared_data.cuda_timings.comm_exchange_kernel_unpack); printf(" Exchange Kernel Fill \t %lf \n",shared_data.cuda_timings.comm_exchange_kernel_fill); printf(" Exchange CPU Pack \t %lf \n",shared_data.cuda_timings.comm_exchange_cpu_pack); printf(" Exchange Upload \t %lf \n",shared_data.cuda_timings.comm_exchange_upload); printf(" Exchange Download \t %lf \n",shared_data.cuda_timings.comm_exchange_download); printf("\n"); printf(" Border Total \t %lf \n",shared_data.cuda_timings.comm_border_total); printf(" Border MPI \t %lf \n",shared_data.cuda_timings.comm_border_mpi); printf(" Border Kernel Pack \t %lf \n",shared_data.cuda_timings.comm_border_kernel_pack); printf(" Border Kernel Unpack \t %lf \n",shared_data.cuda_timings.comm_border_kernel_unpack); printf(" Border Kernel Self \t %lf \n",shared_data.cuda_timings.comm_border_kernel_self); printf(" Border Kernel BuildList \t %lf \n",shared_data.cuda_timings.comm_border_kernel_buildlist); printf(" Border Upload \t %lf \n",shared_data.cuda_timings.comm_border_upload); printf(" Border Download \t %lf \n",shared_data.cuda_timings.comm_border_download); printf("\n"); //pair forces printf(" Pair XType Conversion \t %lf \n",shared_data.cuda_timings.pair_xtype_conversion ); printf(" Pair Kernel \t %lf \n",shared_data.cuda_timings.pair_kernel ); printf(" Pair Virial \t %lf \n",shared_data.cuda_timings.pair_virial ); printf(" Pair Force Collection \t %lf \n",shared_data.cuda_timings.pair_force_collection ); printf("\n"); //neighbor printf(" Neighbor Binning \t %lf \n",shared_data.cuda_timings.neigh_bin ); printf(" Neighbor Build \t %lf \n",shared_data.cuda_timings.neigh_build ); printf(" Neighbor Special \t %lf \n",shared_data.cuda_timings.neigh_special ); printf("\n"); //pppm if(force->kspace) { printf(" PPPM Total \t %lf \n",shared_data.cuda_timings.pppm_compute ); printf(" PPPM Particle Map \t %lf \n",shared_data.cuda_timings.pppm_particle_map ); printf(" PPPM Make Rho \t %lf \n",shared_data.cuda_timings.pppm_make_rho ); printf(" PPPM Brick2fft \t %lf \n",shared_data.cuda_timings.pppm_brick2fft ); printf(" PPPM Poisson \t %lf \n",shared_data.cuda_timings.pppm_poisson ); printf(" PPPM Fillbrick \t %lf \n",shared_data.cuda_timings.pppm_fillbrick ); printf(" PPPM Fieldforce \t %lf \n",shared_data.cuda_timings.pppm_fieldforce ); printf("\n"); } printf(" Debug Test 1 \t %lf \n",shared_data.cuda_timings.test1); printf(" Debug Test 2 \t %lf \n",shared_data.cuda_timings.test2); printf("\n"); }