| 1 | /* |
| 2 | * vector_dist_comm.hpp |
| 3 | * |
| 4 | * Created on: Aug 18, 2016 |
| 5 | * Author: i-bird |
| 6 | */ |
| 7 | |
| 8 | #ifndef SRC_VECTOR_VECTOR_DIST_COMM_HPP_ |
| 9 | #define SRC_VECTOR_VECTOR_DIST_COMM_HPP_ |
| 10 | |
| 11 | #define TEST1 |
| 12 | |
| 13 | #if defined(CUDA_GPU) && defined(__NVCC__) |
| 14 | #include "Vector/cuda/vector_dist_cuda_funcs.cuh" |
| 15 | #include "util/cuda/kernels.cuh" |
| 16 | #endif |
| 17 | |
| 18 | #include "Vector/util/vector_dist_funcs.hpp" |
| 19 | #include "cuda/vector_dist_comm_util_funcs.cuh" |
| 20 | #include "util/cuda/scan_ofp.cuh" |
| 21 | |
| 22 | template<typename T> |
| 23 | struct DEBUG |
| 24 | { |
| 25 | static float ret(T & tmp) |
| 26 | { |
| 27 | return 0.0; |
| 28 | } |
| 29 | }; |
| 30 | |
| 31 | template<> |
| 32 | struct DEBUG<float &> |
| 33 | { |
| 34 | static float ret(float & tmp) |
| 35 | { |
| 36 | return tmp; |
| 37 | } |
| 38 | }; |
| 39 | |
| 40 | /*! \brief compute the communication options from the ghost_get/put options |
| 41 | * |
| 42 | * |
| 43 | */ |
| 44 | inline static size_t compute_options(size_t opt) |
| 45 | { |
| 46 | size_t opt_ = NONE; |
| 47 | if (opt & NO_CHANGE_ELEMENTS && opt & SKIP_LABELLING) |
| 48 | {opt_ = RECEIVE_KNOWN | KNOWN_ELEMENT_OR_BYTE;} |
| 49 | |
| 50 | if (opt & RUN_ON_DEVICE) |
| 51 | { |
| 52 | #if defined(CUDA_GPU) && defined(__NVCC__) |
| 53 | // Before doing the communication on RUN_ON_DEVICE we have to be sure that the previous kernels complete |
| 54 | opt_ |= MPI_GPU_DIRECT; |
| 55 | #else |
| 56 | std::cout << __FILE__ << ":" << __LINE__ << " error: to use the option RUN_ON_DEVICE you must compile with NVCC" << std::endl; |
| 57 | #endif |
| 58 | } |
| 59 | |
| 60 | return opt_; |
| 61 | } |
| 62 | |
| 63 | /*! \brief template selector for asynchronous or not asynchronous |
| 64 | * |
| 65 | * \tparam impl implementation |
| 66 | * \tparam prp properties |
| 67 | * |
| 68 | */ |
| 69 | template<unsigned int impl, template<typename> class layout_base, unsigned int ... prp> |
| 70 | struct ghost_exchange_comm_impl |
| 71 | { |
| 72 | template<typename Vcluster_type, typename vector_prop_type, |
| 73 | typename vector_pos_type, typename send_vector, |
| 74 | typename prc_recv_get_type, typename prc_g_opart_type, |
| 75 | typename recv_sz_get_type, typename recv_sz_get_byte_type, |
| 76 | typename g_opart_sz_type> |
| 77 | static inline void sendrecv_prp(Vcluster_type & v_cl, |
| 78 | openfpm::vector<send_vector> & g_send_prp, |
| 79 | vector_prop_type & v_prp, |
| 80 | vector_pos_type & v_pos, |
| 81 | prc_g_opart_type & prc_g_opart, |
| 82 | prc_recv_get_type & prc_recv_get, |
| 83 | recv_sz_get_type & recv_sz_get, |
| 84 | recv_sz_get_byte_type & recv_sz_get_byte, |
| 85 | g_opart_sz_type & g_opart_sz, |
| 86 | size_t g_m, |
| 87 | size_t opt) |
| 88 | { |
| 89 | // if there are no properties skip |
| 90 | // SSendRecvP send everything when we do not give properties |
| 91 | |
| 92 | if (sizeof...(prp) != 0) |
| 93 | { |
| 94 | size_t opt_ = compute_options(opt); |
| 95 | if (opt & SKIP_LABELLING) |
| 96 | { |
| 97 | if (opt & RUN_ON_DEVICE) |
| 98 | { |
| 99 | op_ssend_gg_recv_merge_run_device opm(g_m); |
| 100 | v_cl.template SSendRecvP_op<op_ssend_gg_recv_merge_run_device,send_vector,decltype(v_prp),layout_base,prp...>(g_send_prp,v_prp,prc_g_opart,opm,prc_recv_get,recv_sz_get,opt_); |
| 101 | } |
| 102 | else |
| 103 | { |
| 104 | op_ssend_gg_recv_merge opm(g_m); |
| 105 | v_cl.template SSendRecvP_op<op_ssend_gg_recv_merge,send_vector,decltype(v_prp),layout_base,prp...>(g_send_prp,v_prp,prc_g_opart,opm,prc_recv_get,recv_sz_get,opt_); |
| 106 | } |
| 107 | } |
| 108 | else |
| 109 | {v_cl.template SSendRecvP<send_vector,decltype(v_prp),layout_base,prp...>(g_send_prp,v_prp,prc_g_opart,prc_recv_get,recv_sz_get,recv_sz_get_byte,opt_);} |
| 110 | |
| 111 | // fill g_opart_sz |
| 112 | g_opart_sz.resize(prc_g_opart.size()); |
| 113 | |
| 114 | for (size_t i = 0 ; i < prc_g_opart.size() ; i++) |
| 115 | g_opart_sz.get(i) = g_send_prp.get(i).size(); |
| 116 | } |
| 117 | } |
| 118 | |
| 119 | template<typename Vcluster_type, typename vector_prop_type, |
| 120 | typename vector_pos_type, typename send_pos_vector, |
| 121 | typename prc_recv_get_type, typename prc_g_opart_type, |
| 122 | typename recv_sz_get_type> |
| 123 | static inline void sendrecv_pos(Vcluster_type & v_cl, |
| 124 | openfpm::vector<send_pos_vector> & g_pos_send, |
| 125 | vector_prop_type & v_prp, |
| 126 | vector_pos_type & v_pos, |
| 127 | prc_recv_get_type & prc_recv_get, |
| 128 | recv_sz_get_type & recv_sz_get, |
| 129 | prc_g_opart_type & prc_g_opart, |
| 130 | size_t opt) |
| 131 | { |
| 132 | size_t opt_ = compute_options(opt); |
| 133 | if (opt & SKIP_LABELLING) |
| 134 | { |
| 135 | v_cl.template SSendRecv<send_pos_vector,decltype(v_pos),layout_base>(g_pos_send,v_pos,prc_g_opart,prc_recv_get,recv_sz_get,opt_); |
| 136 | } |
| 137 | else |
| 138 | { |
| 139 | prc_recv_get.clear(); |
| 140 | recv_sz_get.clear(); |
| 141 | v_cl.template SSendRecv<send_pos_vector,decltype(v_pos),layout_base>(g_pos_send,v_pos,prc_g_opart,prc_recv_get,recv_sz_get,opt_); |
| 142 | } |
| 143 | } |
| 144 | |
| 145 | template<typename Vcluster_type, typename vector_prop_type, |
| 146 | typename vector_pos_type, typename send_pos_vector, |
| 147 | typename prc_recv_get_type, typename prc_g_opart_type, |
| 148 | typename recv_sz_get_type> |
| 149 | static inline void sendrecv_pos_wait(Vcluster_type & v_cl, |
| 150 | openfpm::vector<send_pos_vector> & g_pos_send, |
| 151 | vector_prop_type & v_prp, |
| 152 | vector_pos_type & v_pos, |
| 153 | prc_recv_get_type & prc_recv_get, |
| 154 | recv_sz_get_type & recv_sz_get, |
| 155 | prc_g_opart_type & prc_g_opart, |
| 156 | size_t opt) |
| 157 | {} |
| 158 | |
| 159 | template<typename Vcluster_type, typename vector_prop_type, |
| 160 | typename vector_pos_type, typename send_vector, |
| 161 | typename prc_recv_get_type, typename prc_g_opart_type, |
| 162 | typename recv_sz_get_type, typename recv_sz_get_byte_type, |
| 163 | typename g_opart_sz_type> |
| 164 | static inline void sendrecv_prp_wait(Vcluster_type & v_cl, |
| 165 | openfpm::vector<send_vector> & g_send_prp, |
| 166 | vector_prop_type & v_prp, |
| 167 | vector_pos_type & v_pos, |
| 168 | prc_g_opart_type & prc_g_opart, |
| 169 | prc_recv_get_type & prc_recv_get, |
| 170 | recv_sz_get_type & recv_sz_get, |
| 171 | recv_sz_get_byte_type & recv_sz_get_byte, |
| 172 | g_opart_sz_type & g_opart_sz, |
| 173 | size_t g_m, |
| 174 | size_t opt) |
| 175 | {} |
| 176 | }; |
| 177 | |
| 178 | |
| 179 | template<template<typename> class layout_base, unsigned int ... prp> |
| 180 | struct ghost_exchange_comm_impl<GHOST_ASYNC,layout_base, prp ... > |
| 181 | { |
| 182 | template<typename Vcluster_type, typename vector_prop_type, |
| 183 | typename vector_pos_type, typename send_vector, |
| 184 | typename prc_recv_get_type, typename prc_g_opart_type, |
| 185 | typename recv_sz_get_type, typename recv_sz_get_byte_type, |
| 186 | typename g_opart_sz_type> |
| 187 | static inline void sendrecv_prp(Vcluster_type & v_cl, |
| 188 | openfpm::vector<send_vector> & g_send_prp, |
| 189 | vector_prop_type & v_prp, |
| 190 | vector_pos_type & v_pos, |
| 191 | prc_g_opart_type & prc_g_opart, |
| 192 | prc_recv_get_type & prc_recv_get, |
| 193 | recv_sz_get_type & recv_sz_get, |
| 194 | recv_sz_get_byte_type & recv_sz_get_byte, |
| 195 | g_opart_sz_type & g_opart_sz, |
| 196 | size_t g_m, |
| 197 | size_t opt) |
| 198 | { |
| 199 | prc_recv_get.clear(); |
| 200 | recv_sz_get.clear(); |
| 201 | |
| 202 | // if there are no properties skip |
| 203 | // SSendRecvP send everything when we do not give properties |
| 204 | |
| 205 | if (sizeof...(prp) != 0) |
| 206 | { |
| 207 | size_t opt_ = compute_options(opt); |
| 208 | if (opt & SKIP_LABELLING) |
| 209 | { |
| 210 | if (opt & RUN_ON_DEVICE) |
| 211 | { |
| 212 | op_ssend_gg_recv_merge_run_device opm(g_m); |
| 213 | v_cl.template SSendRecvP_opAsync<op_ssend_gg_recv_merge_run_device,send_vector,decltype(v_prp),layout_base,prp...>(g_send_prp,v_prp,prc_g_opart,opm,prc_recv_get,recv_sz_get,opt_); |
| 214 | } |
| 215 | else |
| 216 | { |
| 217 | op_ssend_gg_recv_merge opm(g_m); |
| 218 | v_cl.template SSendRecvP_opAsync<op_ssend_gg_recv_merge,send_vector,decltype(v_prp),layout_base,prp...>(g_send_prp,v_prp,prc_g_opart,opm,prc_recv_get,recv_sz_get,opt_); |
| 219 | } |
| 220 | } |
| 221 | else |
| 222 | {v_cl.template SSendRecvPAsync<send_vector,decltype(v_prp),layout_base,prp...>(g_send_prp,v_prp,prc_g_opart,prc_recv_get,recv_sz_get,recv_sz_get_byte,opt_);} |
| 223 | } |
| 224 | |
| 225 | // fill g_opart_sz |
| 226 | g_opart_sz.resize(prc_g_opart.size()); |
| 227 | |
| 228 | for (size_t i = 0 ; i < prc_g_opart.size() ; i++) |
| 229 | {g_opart_sz.get(i) = g_send_prp.get(i).size();} |
| 230 | } |
| 231 | |
| 232 | template<typename Vcluster_type, typename vector_prop_type, |
| 233 | typename vector_pos_type, typename send_pos_vector, |
| 234 | typename prc_recv_get_type, typename prc_g_opart_type, |
| 235 | typename recv_sz_get_type> |
| 236 | static inline void sendrecv_pos(Vcluster_type & v_cl, |
| 237 | openfpm::vector<send_pos_vector> & g_pos_send, |
| 238 | vector_prop_type & v_prp, |
| 239 | vector_pos_type & v_pos, |
| 240 | prc_recv_get_type & prc_recv_get, |
| 241 | recv_sz_get_type & recv_sz_get, |
| 242 | prc_g_opart_type & prc_g_opart, |
| 243 | size_t opt) |
| 244 | { |
| 245 | prc_recv_get.clear(); |
| 246 | recv_sz_get.clear(); |
| 247 | |
| 248 | size_t opt_ = compute_options(opt); |
| 249 | if (opt & SKIP_LABELLING) |
| 250 | { |
| 251 | v_cl.template SSendRecvAsync<send_pos_vector,decltype(v_pos),layout_base>(g_pos_send,v_pos,prc_g_opart,prc_recv_get,recv_sz_get,opt_); |
| 252 | } |
| 253 | else |
| 254 | { |
| 255 | prc_recv_get.clear(); |
| 256 | recv_sz_get.clear(); |
| 257 | v_cl.template SSendRecvAsync<send_pos_vector,decltype(v_pos),layout_base>(g_pos_send,v_pos,prc_g_opart,prc_recv_get,recv_sz_get,opt_); |
| 258 | } |
| 259 | } |
| 260 | |
| 261 | template<typename Vcluster_type, typename vector_prop_type, |
| 262 | typename vector_pos_type, typename send_pos_vector, |
| 263 | typename prc_recv_get_type, typename prc_g_opart_type, |
| 264 | typename recv_sz_get_type> |
| 265 | static inline void sendrecv_pos_wait(Vcluster_type & v_cl, |
| 266 | openfpm::vector<send_pos_vector> & g_pos_send, |
| 267 | vector_prop_type & v_prp, |
| 268 | vector_pos_type & v_pos, |
| 269 | prc_recv_get_type & prc_recv_get, |
| 270 | recv_sz_get_type & recv_sz_get, |
| 271 | prc_g_opart_type & prc_g_opart, |
| 272 | size_t opt) |
| 273 | { |
| 274 | size_t opt_ = compute_options(opt); |
| 275 | if (opt & SKIP_LABELLING) |
| 276 | { |
| 277 | v_cl.template SSendRecvWait<send_pos_vector,decltype(v_pos),layout_base>(g_pos_send,v_pos,prc_g_opart,prc_recv_get,recv_sz_get,opt_); |
| 278 | } |
| 279 | else |
| 280 | { |
| 281 | v_cl.template SSendRecvWait<send_pos_vector,decltype(v_pos),layout_base>(g_pos_send,v_pos,prc_g_opart,prc_recv_get,recv_sz_get,opt_); |
| 282 | } |
| 283 | } |
| 284 | |
| 285 | template<typename Vcluster_type, typename vector_prop_type, |
| 286 | typename vector_pos_type, typename send_vector, |
| 287 | typename prc_recv_get_type, typename prc_g_opart_type, |
| 288 | typename recv_sz_get_type, typename recv_sz_get_byte_type, |
| 289 | typename g_opart_sz_type> |
| 290 | static inline void sendrecv_prp_wait(Vcluster_type & v_cl, |
| 291 | openfpm::vector<send_vector> & g_send_prp, |
| 292 | vector_prop_type & v_prp, |
| 293 | vector_pos_type & v_pos, |
| 294 | prc_g_opart_type & prc_g_opart, |
| 295 | prc_recv_get_type & prc_recv_get, |
| 296 | recv_sz_get_type & recv_sz_get, |
| 297 | recv_sz_get_byte_type & recv_sz_get_byte, |
| 298 | g_opart_sz_type & g_opart_sz, |
| 299 | size_t g_m, |
| 300 | size_t opt) |
| 301 | { |
| 302 | // if there are no properties skip |
| 303 | // SSendRecvP send everything when we do not give properties |
| 304 | |
| 305 | if (sizeof...(prp) != 0) |
| 306 | { |
| 307 | size_t opt_ = compute_options(opt); |
| 308 | if (opt & SKIP_LABELLING) |
| 309 | { |
| 310 | if (opt & RUN_ON_DEVICE) |
| 311 | { |
| 312 | op_ssend_gg_recv_merge_run_device opm(g_m); |
| 313 | v_cl.template SSendRecvP_opWait<op_ssend_gg_recv_merge_run_device,send_vector,decltype(v_prp),layout_base,prp...>(g_send_prp,v_prp,prc_g_opart,opm,prc_recv_get,recv_sz_get,opt_); |
| 314 | } |
| 315 | else |
| 316 | { |
| 317 | op_ssend_gg_recv_merge opm(g_m); |
| 318 | v_cl.template SSendRecvP_opWait<op_ssend_gg_recv_merge,send_vector,decltype(v_prp),layout_base,prp...>(g_send_prp,v_prp,prc_g_opart,opm,prc_recv_get,recv_sz_get,opt_); |
| 319 | } |
| 320 | } |
| 321 | else |
| 322 | {v_cl.template SSendRecvPWait<send_vector,decltype(v_prp),layout_base,prp...>(g_send_prp,v_prp,prc_g_opart,prc_recv_get,recv_sz_get,recv_sz_get_byte,opt_);} |
| 323 | } |
| 324 | } |
| 325 | }; |
| 326 | |
| 327 | |
| 328 | /*! \brief This class is an helper for the communication of vector_dist |
| 329 | * |
| 330 | * \tparam dim Dimensionality of the space where the elements lives |
| 331 | * \tparam St type of space float, double ... |
| 332 | * \tparam prop properties the vector element store in OpenFPM data structure format |
| 333 | * \tparam Decomposition Decomposition strategy to use CartDecomposition ... |
| 334 | * \tparam Memory Memory pool where store the information HeapMemory ... |
| 335 | * |
| 336 | * \see vector_dist |
| 337 | * |
| 338 | */ |
| 339 | |
| 340 | template<unsigned int dim, |
| 341 | typename St, |
| 342 | typename prop, |
| 343 | typename Decomposition = CartDecomposition<dim,St>, |
| 344 | typename Memory = HeapMemory, |
| 345 | template<typename> class layout_base = memory_traits_lin> |
| 346 | class vector_dist_comm |
| 347 | { |
| 348 | //! Number of units for each sub-domain |
| 349 | size_t v_sub_unit_factor = 64; |
| 350 | |
| 351 | //! definition of the send vector for position |
| 352 | typedef openfpm::vector<Point<dim, St>,Memory,layout_base,openfpm::grow_policy_identity> send_pos_vector; |
| 353 | |
| 354 | //! VCluster |
| 355 | Vcluster<Memory> & v_cl; |
| 356 | |
| 357 | //! Domain decomposition |
| 358 | Decomposition dec; |
| 359 | |
| 360 | //! It map the processor id with the communication request into map procedure |
| 361 | openfpm::vector<size_t> p_map_req; |
| 362 | |
| 363 | //! For each near processor, outgoing particle id |
| 364 | //! \warning opart is assumed to be an ordered list |
| 365 | //! first id particle id |
| 366 | //! second id shift id |
| 367 | //! third id is the processor id |
| 368 | openfpm::vector<aggregate<int,int,int>, |
| 369 | Memory, |
| 370 | layout_base > m_opart; |
| 371 | |
| 372 | //! Per processor ordered particles id for ghost_get (see prc_g_opart) |
| 373 | //! For each processor the internal vector store the id of the |
| 374 | //! particles that must be communicated to the other processors |
| 375 | openfpm::vector<openfpm::vector<aggregate<size_t,size_t>>> g_opart; |
| 376 | |
| 377 | //! Same as g_opart but on device, the vector of vector is flatten into a single vector |
| 378 | openfpm::vector<aggregate<unsigned int,unsigned long int>, |
| 379 | CudaMemory, |
| 380 | memory_traits_inte> g_opart_device; |
| 381 | |
| 382 | //! Helper buffer for computation (on GPU) of local particles (position) |
| 383 | openfpm::vector<Point<dim, St>,Memory,layout_base> v_pos_tmp; |
| 384 | |
| 385 | //! Helper buffer for computation (on GPU) of local particles (properties) |
| 386 | openfpm::vector<prop,Memory,layout_base> v_prp_tmp; |
| 387 | |
| 388 | //! Per processor number of particle g_opart_sz.get(i) = g_opart.get(i).size() |
| 389 | openfpm::vector<size_t> g_opart_sz; |
| 390 | |
| 391 | //! processor rank list of g_opart |
| 392 | openfpm::vector<size_t> prc_g_opart; |
| 393 | |
| 394 | //! It store the list of processor that communicate with us (local processor) |
| 395 | //! from the last ghost get |
| 396 | openfpm::vector<size_t> prc_recv_get_pos; |
| 397 | openfpm::vector<size_t> prc_recv_get_prp; |
| 398 | |
| 399 | //! the same as prc_recv_get but for put |
| 400 | openfpm::vector<size_t> prc_recv_put; |
| 401 | |
| 402 | //! the same as prc_recv_get but for map |
| 403 | openfpm::vector<size_t> prc_recv_map; |
| 404 | |
| 405 | //! It store the size of the elements added for each processor that communicate with us (local processor) |
| 406 | //! from the last ghost get |
| 407 | openfpm::vector<size_t> recv_sz_get_pos; |
| 408 | openfpm::vector<size_t> recv_sz_get_prp; |
| 409 | //! Conversion to byte of recv_sz_get |
| 410 | openfpm::vector<size_t> recv_sz_get_byte; |
| 411 | |
| 412 | |
| 413 | //! The same as recv_sz_get but for put |
| 414 | openfpm::vector<size_t> recv_sz_put; |
| 415 | |
| 416 | //! The same as recv_sz_get but for map |
| 417 | openfpm::vector<size_t> recv_sz_map; |
| 418 | |
| 419 | //! elements sent for each processors (ghost_get) |
| 420 | openfpm::vector<size_t> prc_sz_gg; |
| 421 | |
| 422 | //! temporary buffer to processors ids |
| 423 | openfpm::vector<aggregate<unsigned int>, |
| 424 | Memory, |
| 425 | layout_base> proc_id_out; |
| 426 | |
| 427 | //! temporary buffer for the scan result |
| 428 | openfpm::vector<aggregate<unsigned int>, |
| 429 | Memory, |
| 430 | layout_base> starts; |
| 431 | |
| 432 | //! Processor communication size |
| 433 | openfpm::vector<aggregate<unsigned int, unsigned int>,Memory,layout_base> prc_offset; |
| 434 | |
| 435 | |
| 436 | //! Temporary CudaMemory to do stuff |
| 437 | CudaMemory mem; |
| 438 | |
| 439 | //! Local ghost marker (across the ghost particles it mark from where we have the) |
| 440 | //! replicated ghost particles that are local |
| 441 | size_t lg_m; |
| 442 | |
| 443 | //! Sending buffer |
| 444 | openfpm::vector_fr<Memory> hsmem; |
| 445 | |
| 446 | //! process the particle with properties |
| 447 | template<typename prp_object, int ... prp> |
| 448 | struct proc_with_prp |
| 449 | { |
| 450 | //! process the particle |
| 451 | template<typename T1, typename T2> inline static void proc(size_t lbl, size_t cnt, size_t id, T1 & v_prp, T2 & m_prp) |
| 452 | { |
| 453 | // source object type |
| 454 | typedef encapc<1, prop, typename openfpm::vector<prop>::layout_type> encap_src; |
| 455 | // destination object type |
| 456 | typedef encapc<1, prp_object, typename openfpm::vector<prp_object>::layout_type> encap_dst; |
| 457 | |
| 458 | // Copy only the selected properties |
| 459 | object_si_d<encap_src, encap_dst, OBJ_ENCAP, prp...>(v_prp.get(id), m_prp.get(lbl).get(cnt)); |
| 460 | } |
| 461 | }; |
| 462 | |
| 463 | /*! \brief Get the number of particles received from each processor during the last ghost_get |
| 464 | * |
| 465 | * |
| 466 | * \param i processor (list index) |
| 467 | * \return the number of particles |
| 468 | */ |
| 469 | size_t get_last_ghost_get_received_parts(size_t i) |
| 470 | { |
| 471 | // If the last ghost_get did not have properties the information about the number of particles |
| 472 | // received is in recv_sz_get_ois |
| 473 | if (recv_sz_get_prp.size() != 0) |
| 474 | {return recv_sz_get_prp.get(i);} |
| 475 | else |
| 476 | {return recv_sz_get_pos.get(i);} |
| 477 | } |
| 478 | |
| 479 | /*! \brief Get the number of processor involved during the last ghost_get |
| 480 | * |
| 481 | * \return the number of processor |
| 482 | */ |
| 483 | size_t get_last_ghost_get_num_proc() |
| 484 | { |
| 485 | if (prc_recv_get_prp.size() != 0) |
| 486 | {return prc_recv_get_prp.size();} |
| 487 | else |
| 488 | {return prc_recv_get_pos.size();} |
| 489 | } |
| 490 | |
| 491 | /*! \brief Get the number of processor involved during the last ghost_get |
| 492 | * |
| 493 | * \return the number of processor |
| 494 | */ |
| 495 | openfpm::vector<size_t> & get_last_ghost_get_num_proc_vector() |
| 496 | { |
| 497 | if (prc_recv_get_prp.size() != 0) |
| 498 | {return prc_recv_get_prp;} |
| 499 | else |
| 500 | {return prc_recv_get_pos;} |
| 501 | } |
| 502 | |
| 503 | /*! \brief Calculate sending buffer size for each processor |
| 504 | * |
| 505 | * \param prc_sz_r processor size |
| 506 | * \param prc_r processor ids |
| 507 | * |
| 508 | */ |
| 509 | inline void calc_send_buffers(openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,layout_base> & prc_sz, |
| 510 | openfpm::vector<size_t> & prc_sz_r, |
| 511 | openfpm::vector<size_t> & prc_r, |
| 512 | size_t opt) |
| 513 | { |
| 514 | if (opt & RUN_ON_DEVICE) |
| 515 | { |
| 516 | #ifndef TEST1 |
| 517 | size_t prev_off = 0; |
| 518 | for (size_t i = 0; i < prc_sz.size() ; i++) |
| 519 | { |
| 520 | if (prc_sz.template get<1>(i) != (unsigned int)-1) |
| 521 | { |
| 522 | prc_r.add(prc_sz.template get<1>(i)); |
| 523 | prc_sz_r.add(prc_sz.template get<0>(i) - prev_off); |
| 524 | } |
| 525 | prev_off = prc_sz.template get<0>(i); |
| 526 | } |
| 527 | #else |
| 528 | |
| 529 | // Calculate the sending buffer size for each processor, put this information in |
| 530 | // a contiguous buffer |
| 531 | |
| 532 | for (size_t i = 0; i < v_cl.getProcessingUnits(); i++) |
| 533 | { |
| 534 | if (prc_sz.template get<0>(i) != 0 && v_cl.rank() != i) |
| 535 | { |
| 536 | prc_r.add(i); |
| 537 | prc_sz_r.add(prc_sz.template get<0>(i)); |
| 538 | } |
| 539 | } |
| 540 | |
| 541 | #endif |
| 542 | } |
| 543 | else |
| 544 | { |
| 545 | // Calculate the sending buffer size for each processor, put this information in |
| 546 | // a contiguous buffer |
| 547 | |
| 548 | p_map_req.resize(v_cl.getProcessingUnits()); |
| 549 | for (size_t i = 0; i < v_cl.getProcessingUnits(); i++) |
| 550 | { |
| 551 | if (prc_sz.template get<0>(i) != 0) |
| 552 | { |
| 553 | p_map_req.get(i) = prc_r.size(); |
| 554 | prc_r.add(i); |
| 555 | prc_sz_r.add(prc_sz.template get<0>(i)); |
| 556 | } |
| 557 | } |
| 558 | } |
| 559 | } |
| 560 | |
| 561 | //! From which decomposition the shift boxes are calculated |
| 562 | long int shift_box_ndec = -1; |
| 563 | |
| 564 | //! this map is used to check if a combination is already present |
| 565 | std::unordered_map<size_t, size_t> map_cmb; |
| 566 | |
| 567 | //! The boxes touching the border of the domain are divided in groups (first vector) |
| 568 | //! each group contain internal ghost coming from sub-domains of the same section |
| 569 | openfpm::vector_std<openfpm::vector_std<Box<dim, St>>> box_f; |
| 570 | |
| 571 | //! The boxes touching the border of the domain + shift vector linearized from where they come from |
| 572 | openfpm::vector<Box<dim, St>,Memory,layout_base> box_f_dev; |
| 573 | openfpm::vector<aggregate<unsigned int>,Memory,layout_base> box_f_sv; |
| 574 | |
| 575 | //! Store the sector for each group (previous vector) |
| 576 | openfpm::vector_std<comb<dim>> box_cmb; |
| 577 | |
| 578 | //! Id of the local particle to replicate for ghost_get |
| 579 | openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,layout_base> o_part_loc; |
| 580 | |
| 581 | //! Processor communication size |
| 582 | openfpm::vector<aggregate<unsigned int, unsigned int>,Memory,layout_base> prc_sz; |
| 583 | |
| 584 | /*! \brief For every internal ghost box we create a structure that order such internal local ghost box in |
| 585 | * shift vectors |
| 586 | * |
| 587 | */ |
| 588 | void createShiftBox() |
| 589 | { |
| 590 | if (shift_box_ndec == (long int)dec.get_ndec()) |
| 591 | {return;} |
| 592 | |
| 593 | struct sh_box |
| 594 | { |
| 595 | size_t shift_id; |
| 596 | |
| 597 | unsigned int box_f_sv; |
| 598 | Box<dim,St> box_f_dev; |
| 599 | |
| 600 | bool operator<(const sh_box & tmp) const |
| 601 | { |
| 602 | return shift_id < tmp.shift_id; |
| 603 | } |
| 604 | |
| 605 | }; |
| 606 | openfpm::vector<sh_box> reord_shift; |
| 607 | box_f.clear(); |
| 608 | map_cmb.clear(); |
| 609 | box_cmb.clear(); |
| 610 | |
| 611 | // Add local particles coming from periodic boundary, the only boxes that count are the one |
| 612 | // touching the border |
| 613 | for (size_t i = 0; i < dec.getNLocalSub(); i++) |
| 614 | { |
| 615 | size_t Nl = dec.getLocalNIGhost(i); |
| 616 | |
| 617 | for (size_t j = 0; j < Nl; j++) |
| 618 | { |
| 619 | // If the ghost does not come from the intersection with an out of |
| 620 | // border sub-domain the combination is all zero and n_zero return dim |
| 621 | if (dec.getLocalIGhostPos(i, j).n_zero() == dim) |
| 622 | continue; |
| 623 | |
| 624 | // Check if we already have boxes with such combination |
| 625 | auto it = map_cmb.find(dec.getLocalIGhostPos(i, j).lin()); |
| 626 | if (it == map_cmb.end()) |
| 627 | { |
| 628 | // we do not have it |
| 629 | box_f.add(); |
| 630 | box_f.last().add(dec.getLocalIGhostBox(i, j)); |
| 631 | box_cmb.add(dec.getLocalIGhostPos(i, j)); |
| 632 | map_cmb[dec.getLocalIGhostPos(i, j).lin()] = box_f.size() - 1; |
| 633 | } |
| 634 | else |
| 635 | { |
| 636 | // we have it |
| 637 | box_f.get(it->second).add(dec.getLocalIGhostBox(i, j)); |
| 638 | } |
| 639 | |
| 640 | reord_shift.add(); |
| 641 | reord_shift.last().shift_id = dec.getLocalIGhostPos(i, j).lin(); |
| 642 | reord_shift.last().box_f_dev = dec.getLocalIGhostBox(i, j); |
| 643 | reord_shift.last().box_f_sv = dec.convertShift(dec.getLocalIGhostPos(i, j)); |
| 644 | } |
| 645 | } |
| 646 | |
| 647 | // now we sort box_f by shift_id, the reason is that we have to avoid duplicated particles |
| 648 | reord_shift.sort(); |
| 649 | |
| 650 | box_f_dev.resize(reord_shift.size()); |
| 651 | box_f_sv.resize(reord_shift.size()); |
| 652 | |
| 653 | for (size_t i = 0 ; i < reord_shift.size() ; i++) |
| 654 | { |
| 655 | box_f_dev.get(i) = reord_shift.get(i).box_f_dev; |
| 656 | box_f_sv.template get<0>(i) = reord_shift.get(i).box_f_sv; |
| 657 | } |
| 658 | |
| 659 | #ifdef CUDA_GPU |
| 660 | |
| 661 | // move box_f_dev and box_f_sv to device |
| 662 | box_f_dev.template hostToDevice<0,1>(); |
| 663 | box_f_sv.template hostToDevice<0>(); |
| 664 | |
| 665 | #endif |
| 666 | |
| 667 | shift_box_ndec = dec.get_ndec(); |
| 668 | } |
| 669 | |
| 670 | /*! \brief Local ghost from labeled particles |
| 671 | * |
| 672 | * \param v_pos vector of particle positions |
| 673 | * \param v_prp vector of particles properties |
| 674 | * \param opt options |
| 675 | * |
| 676 | */ |
| 677 | void local_ghost_from_opart(openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos, |
| 678 | openfpm::vector<prop,Memory,layout_base> & v_prp, |
| 679 | size_t opt) |
| 680 | { |
| 681 | // get the shift vectors |
| 682 | const openfpm::vector<Point<dim, St>,Memory,layout_base> & shifts = dec.getShiftVectors(); |
| 683 | |
| 684 | if (!(opt & NO_POSITION)) |
| 685 | { |
| 686 | if (opt & RUN_ON_DEVICE) |
| 687 | { |
| 688 | local_ghost_from_opart_impl<true,dim,St,prop,Memory,layout_base,std::is_same<Memory,CudaMemory>::value> |
| 689 | ::run(o_part_loc,shifts,v_pos,v_prp,opt); |
| 690 | } |
| 691 | else |
| 692 | { |
| 693 | for (size_t i = 0 ; i < o_part_loc.size() ; i++) |
| 694 | { |
| 695 | size_t lin_id = o_part_loc.template get<1>(i); |
| 696 | size_t key = o_part_loc.template get<0>(i); |
| 697 | |
| 698 | Point<dim, St> p = v_pos.get(key); |
| 699 | // shift |
| 700 | p -= shifts.get(lin_id); |
| 701 | |
| 702 | // add this particle shifting its position |
| 703 | v_pos.add(p); |
| 704 | v_prp.get(lg_m+i) = v_prp.get(key); |
| 705 | } |
| 706 | } |
| 707 | } |
| 708 | else |
| 709 | { |
| 710 | if (opt & RUN_ON_DEVICE) |
| 711 | { |
| 712 | local_ghost_from_opart_impl<false,dim,St,prop,Memory,layout_base,std::is_same<Memory,CudaMemory>::value> |
| 713 | ::run(o_part_loc,shifts,v_pos,v_prp,opt); |
| 714 | } |
| 715 | else |
| 716 | { |
| 717 | for (size_t i = 0 ; i < o_part_loc.size() ; i++) |
| 718 | { |
| 719 | size_t key = o_part_loc.template get<0>(i); |
| 720 | |
| 721 | v_prp.get(lg_m+i) = v_prp.get(key); |
| 722 | } |
| 723 | } |
| 724 | } |
| 725 | } |
| 726 | |
| 727 | /*! \brief Local ghost from decomposition |
| 728 | * |
| 729 | * \param v_pos vector of particle positions |
| 730 | * \param v_prp vector of particle properties |
| 731 | * \param g_m ghost marker |
| 732 | * |
| 733 | */ |
| 734 | void local_ghost_from_dec(openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos, |
| 735 | openfpm::vector<prop,Memory,layout_base> & v_prp, |
| 736 | size_t g_m,size_t opt) |
| 737 | { |
| 738 | o_part_loc.clear(); |
| 739 | |
| 740 | // get the shift vectors |
| 741 | const openfpm::vector<Point<dim,St>,Memory,layout_base> & shifts = dec.getShiftVectors(); |
| 742 | |
| 743 | if (opt & RUN_ON_DEVICE) |
| 744 | { |
| 745 | local_ghost_from_dec_impl<dim,St,prop,Memory,layout_base,std::is_same<Memory,CudaMemory>::value> |
| 746 | ::run(o_part_loc,shifts,box_f_dev,box_f_sv,v_cl,starts,v_pos,v_prp,g_m,opt); |
| 747 | } |
| 748 | else |
| 749 | { |
| 750 | // Label the internal (assigned) particles |
| 751 | auto it = v_pos.getIteratorTo(g_m); |
| 752 | |
| 753 | while (it.isNext()) |
| 754 | { |
| 755 | auto key = it.get(); |
| 756 | |
| 757 | // If particles are inside these boxes |
| 758 | for (size_t i = 0; i < box_f.size(); i++) |
| 759 | { |
| 760 | for (size_t j = 0; j < box_f.get(i).size(); j++) |
| 761 | { |
| 762 | if (box_f.get(i).get(j).isInsideNP(v_pos.get(key)) == true) |
| 763 | { |
| 764 | size_t lin_id = dec.convertShift(box_cmb.get(i)); |
| 765 | |
| 766 | o_part_loc.add(); |
| 767 | o_part_loc.template get<0>(o_part_loc.size()-1) = key; |
| 768 | o_part_loc.template get<1>(o_part_loc.size()-1) = lin_id; |
| 769 | |
| 770 | Point<dim, St> p = v_pos.get(key); |
| 771 | // shift |
| 772 | p -= shifts.get(lin_id); |
| 773 | |
| 774 | // add this particle shifting its position |
| 775 | v_pos.add(p); |
| 776 | v_prp.add(); |
| 777 | v_prp.last() = v_prp.get(key); |
| 778 | |
| 779 | // boxes in one group can be overlapping |
| 780 | // we do not have to search for the other |
| 781 | // boxes otherwise we will have duplicate particles |
| 782 | // |
| 783 | // A small note overlap of boxes across groups is fine |
| 784 | // (and needed) because each group has different shift |
| 785 | // producing non overlapping particles |
| 786 | // |
| 787 | break; |
| 788 | } |
| 789 | } |
| 790 | } |
| 791 | |
| 792 | ++it; |
| 793 | } |
| 794 | } |
| 795 | } |
| 796 | |
| 797 | /*! \brief Add local particles based on the boundary conditions |
| 798 | * |
| 799 | * In order to understand what this function use the following |
| 800 | * |
| 801 | \verbatim |
| 802 | |
| 803 | [1,1] |
| 804 | +---------+------------------------+---------+ |
| 805 | | (1,-1) | | (1,1) | |
| 806 | | | | (1,0) --> 7 | | | |
| 807 | | v | | v | |
| 808 | | 6 | | 8 | |
| 809 | +--------------------------------------------+ |
| 810 | | | | | |
| 811 | | | | | |
| 812 | | | | | |
| 813 | | (-1,0) | | (1,0) | |
| 814 | | | | | | | |
| 815 | | v | (0,0) --> 4 | v | |
| 816 | | 3 | | 5 | |
| 817 | | | | | |
| 818 | B | | | A | |
| 819 | * | | | * | |
| 820 | | | | | |
| 821 | | | | | |
| 822 | | | | | |
| 823 | +--------------------------------------------+ |
| 824 | | (-1,-1) | | (-1,1) | |
| 825 | | | | (-1,0) --> 1 | | | |
| 826 | | v | | v | |
| 827 | | 0 | | 2 | |
| 828 | +---------+------------------------+---------+ |
| 829 | |
| 830 | |
| 831 | \endverbatim |
| 832 | |
| 833 | * |
| 834 | * The box is the domain, while all boxes at the border (so not (0,0) ) are the |
| 835 | * ghost part at the border of the domain. If a particle A is in the position in figure |
| 836 | * a particle B must be created. This function duplicate the particle A, if A and B are |
| 837 | * local |
| 838 | * |
| 839 | * \param v_pos vector of particle of positions |
| 840 | * \param v_prp vector of particle properties |
| 841 | * \param g_m ghost marker |
| 842 | * \param opt options |
| 843 | * |
| 844 | */ |
| 845 | void add_loc_particles_bc(openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos, |
| 846 | openfpm::vector<prop,Memory,layout_base> & v_prp , |
| 847 | size_t & g_m, |
| 848 | size_t opt) |
| 849 | { |
| 850 | // Create the shift boxes |
| 851 | createShiftBox(); |
| 852 | |
| 853 | if (!(opt & SKIP_LABELLING)) |
| 854 | lg_m = v_prp.size(); |
| 855 | |
| 856 | if (box_f.size() == 0) |
| 857 | return; |
| 858 | else |
| 859 | { |
| 860 | if (opt & SKIP_LABELLING) |
| 861 | {local_ghost_from_opart(v_pos,v_prp,opt);} |
| 862 | else |
| 863 | {local_ghost_from_dec(v_pos,v_prp,g_m,opt);} |
| 864 | } |
| 865 | } |
| 866 | |
| 867 | /*! \brief This function fill the send buffer for the particle position after the particles has been label with labelParticles |
| 868 | * |
| 869 | * \param v_pos vector of particle positions |
| 870 | * \param g_pos_send Send buffer to fill |
| 871 | * |
| 872 | */ |
| 873 | void fill_send_ghost_pos_buf(openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos, |
| 874 | openfpm::vector<size_t> & prc_sz, |
| 875 | openfpm::vector<send_pos_vector> & g_pos_send, |
| 876 | size_t opt, |
| 877 | bool async) |
| 878 | { |
| 879 | // get the shift vectors |
| 880 | const openfpm::vector<Point<dim,St>,Memory,layout_base> & shifts = dec.getShiftVectors(); |
| 881 | |
| 882 | // create a number of send buffers equal to the near processors |
| 883 | g_pos_send.resize(prc_sz.size()); |
| 884 | |
| 885 | size_t old_hsmem_size = 0; |
| 886 | |
| 887 | // if we do async |
| 888 | if (async == true) |
| 889 | { |
| 890 | old_hsmem_size = hsmem.size(); |
| 891 | resize_retained_buffer(hsmem,g_pos_send.size() + hsmem.size()); |
| 892 | } |
| 893 | else |
| 894 | {resize_retained_buffer(hsmem,g_pos_send.size());} |
| 895 | |
| 896 | for (size_t i = 0; i < g_pos_send.size(); i++) |
| 897 | { |
| 898 | // Buffer must retained and survive the destruction of the |
| 899 | // vector |
| 900 | if (hsmem.get(i+old_hsmem_size).ref() == 0) |
| 901 | {hsmem.get(i+old_hsmem_size).incRef();} |
| 902 | |
| 903 | // Set the memory for retain the send buffer |
| 904 | g_pos_send.get(i).setMemory(hsmem.get(i+old_hsmem_size)); |
| 905 | |
| 906 | // resize the sending vector (No allocation is produced) |
| 907 | g_pos_send.get(i).resize(prc_sz.get(i)); |
| 908 | } |
| 909 | |
| 910 | if (opt & RUN_ON_DEVICE) |
| 911 | { |
| 912 | #if defined(CUDA_GPU) && defined(__NVCC__) |
| 913 | |
| 914 | size_t offset = 0; |
| 915 | |
| 916 | // Fill the sending buffers |
| 917 | for (size_t i = 0 ; i < g_pos_send.size() ; i++) |
| 918 | { |
| 919 | auto ite = g_pos_send.get(i).getGPUIterator(); |
| 920 | |
| 921 | CUDA_LAUNCH((process_ghost_particles_pos<dim,decltype(g_opart_device.toKernel()),decltype(g_pos_send.get(i).toKernel()),decltype(v_pos.toKernel()),decltype(shifts.toKernel())>), |
| 922 | ite, |
| 923 | g_opart_device.toKernel(), g_pos_send.get(i).toKernel(), |
| 924 | v_pos.toKernel(),shifts.toKernel(),offset); |
| 925 | |
| 926 | offset += prc_sz.get(i); |
| 927 | } |
| 928 | |
| 929 | #else |
| 930 | |
| 931 | std::cout << __FILE__ << ":" << __LINE__ << " error RUN_ON_DEVICE require that you compile with NVCC, but it seem compiled with a normal compiler" << std::endl; |
| 932 | |
| 933 | #endif |
| 934 | } |
| 935 | else |
| 936 | { |
| 937 | // Fill the send buffer |
| 938 | for (size_t i = 0; i < g_opart.size(); i++) |
| 939 | { |
| 940 | for (size_t j = 0; j < g_opart.get(i).size(); j++) |
| 941 | { |
| 942 | Point<dim, St> s = v_pos.get(g_opart.get(i).template get<0>(j)); |
| 943 | s -= shifts.get(g_opart.get(i).template get<1>(j)); |
| 944 | g_pos_send.get(i).set(j, s); |
| 945 | } |
| 946 | } |
| 947 | } |
| 948 | } |
| 949 | |
| 950 | /*! \brief This function fill the send buffer for ghost_put |
| 951 | * |
| 952 | * \tparam send_vector type used to send data |
| 953 | * \tparam prp_object object containing only the properties to send |
| 954 | * \tparam prp set of properties to send |
| 955 | * |
| 956 | * \param v_prp vector of particle properties |
| 957 | * \param g_send_prp Send buffer to fill |
| 958 | * \param g_m ghost marker |
| 959 | * |
| 960 | */ |
| 961 | template<typename send_vector, typename prp_object, int ... prp> |
| 962 | void fill_send_ghost_put_prp_buf(openfpm::vector<prop,Memory,layout_base> & v_prp, |
| 963 | openfpm::vector<send_vector> & g_send_prp, |
| 964 | size_t & g_m, |
| 965 | size_t opt) |
| 966 | { |
| 967 | // create a number of send buffers equal to the near processors |
| 968 | // from which we received |
| 969 | |
| 970 | // NOTE in some case the information can be in prc_recv_get_pos |
| 971 | |
| 972 | size_t nproc = get_last_ghost_get_num_proc(); |
| 973 | |
| 974 | g_send_prp.resize(nproc); |
| 975 | |
| 976 | resize_retained_buffer(hsmem,g_send_prp.size()); |
| 977 | |
| 978 | for (size_t i = 0; i < g_send_prp.size(); i++) |
| 979 | { |
| 980 | // Buffer must retained and survive the destruction of the |
| 981 | // vector |
| 982 | if (hsmem.get(i).ref() == 0) |
| 983 | hsmem.get(i).incRef(); |
| 984 | |
| 985 | // Set the memory for retain the send buffer |
| 986 | g_send_prp.get(i).setMemory(hsmem.get(i)); |
| 987 | |
| 988 | size_t n_part_recv = get_last_ghost_get_received_parts(i); |
| 989 | |
| 990 | // resize the sending vector (No allocation is produced) |
| 991 | g_send_prp.get(i).resize(n_part_recv); |
| 992 | } |
| 993 | |
| 994 | size_t accum = g_m; |
| 995 | |
| 996 | if (opt & RUN_ON_DEVICE) |
| 997 | { |
| 998 | #if defined(CUDA_GPU) && defined(__NVCC__) |
| 999 | |
| 1000 | if (sizeof...(prp) != 0) |
| 1001 | { |
| 1002 | // Fill the sending buffers |
| 1003 | for (size_t i = 0 ; i < g_send_prp.size() ; i++) |
| 1004 | { |
| 1005 | size_t n_part_recv = get_last_ghost_get_received_parts(i); |
| 1006 | |
| 1007 | auto ite = g_send_prp.get(i).getGPUIterator(); |
| 1008 | |
| 1009 | if (ite.nblocks() == 0) {continue;} |
| 1010 | |
| 1011 | CUDA_LAUNCH((process_ghost_particles_prp_put<decltype(g_send_prp.get(i).toKernel()),decltype(v_prp.toKernel()),prp...>), |
| 1012 | ite, |
| 1013 | g_send_prp.get(i).toKernel(), |
| 1014 | v_prp.toKernel(),accum); |
| 1015 | |
| 1016 | accum = accum + n_part_recv; |
| 1017 | } |
| 1018 | } |
| 1019 | |
| 1020 | #else |
| 1021 | |
| 1022 | std::cout << __FILE__ << ":" << __LINE__ << " error RUN_ON_DEVICE require that you compile with NVCC, but it seem compiled with a normal compiler" << std::endl; |
| 1023 | |
| 1024 | #endif |
| 1025 | } |
| 1026 | else |
| 1027 | { |
| 1028 | // Fill the send buffer |
| 1029 | for (size_t i = 0; i < g_send_prp.size(); i++) |
| 1030 | { |
| 1031 | size_t j2 = 0; |
| 1032 | size_t n_part_recv = get_last_ghost_get_received_parts(i); |
| 1033 | |
| 1034 | for (size_t j = accum; j < accum + n_part_recv; j++) |
| 1035 | { |
| 1036 | // source object type |
| 1037 | typedef encapc<1, prop, typename openfpm::vector<prop,Memory,layout_base>::layout_type> encap_src; |
| 1038 | // destination object type |
| 1039 | typedef encapc<1, prp_object, typename openfpm::vector<prp_object,Memory,layout_base>::layout_type> encap_dst; |
| 1040 | |
| 1041 | // Copy only the selected properties |
| 1042 | object_si_d<encap_src, encap_dst, OBJ_ENCAP, prp...>(v_prp.get(j), g_send_prp.get(i).get(j2)); |
| 1043 | |
| 1044 | j2++; |
| 1045 | } |
| 1046 | |
| 1047 | accum = accum + n_part_recv; |
| 1048 | } |
| 1049 | } |
| 1050 | } |
| 1051 | |
| 1052 | /*! \brief resize the retained buffer by nbf |
| 1053 | * |
| 1054 | * |
| 1055 | */ |
| 1056 | void resize_retained_buffer(openfpm::vector_fr<Memory> & rt_buf, size_t nbf) |
| 1057 | { |
| 1058 | // Release all the buffer that are going to be deleted |
| 1059 | for (size_t i = nbf ; i < rt_buf.size() ; i++) |
| 1060 | { |
| 1061 | rt_buf.get(i).decRef(); |
| 1062 | } |
| 1063 | |
| 1064 | hsmem.resize(nbf); |
| 1065 | } |
| 1066 | |
| 1067 | /*! \brief Set the buffer for each property |
| 1068 | * |
| 1069 | * |
| 1070 | */ |
| 1071 | template<typename send_vector, typename v_mpl> |
| 1072 | struct set_mem_retained_buffers_inte |
| 1073 | { |
| 1074 | openfpm::vector<send_vector> & g_send_prp; |
| 1075 | |
| 1076 | size_t i; |
| 1077 | |
| 1078 | openfpm::vector_fr<Memory> & hsmem; |
| 1079 | |
| 1080 | size_t j; |
| 1081 | |
| 1082 | set_mem_retained_buffers_inte(openfpm::vector<send_vector> & g_send_prp, size_t i , |
| 1083 | openfpm::vector_fr<Memory> & hsmem, size_t j) |
| 1084 | :g_send_prp(g_send_prp),i(i),hsmem(hsmem),j(j) |
| 1085 | {} |
| 1086 | |
| 1087 | //! It call the setMemory function for each property |
| 1088 | template<typename T> |
| 1089 | inline void operator()(T& t) |
| 1090 | { |
| 1091 | g_send_prp.get(i).template setMemory<T::value>(hsmem.get(j)); |
| 1092 | |
| 1093 | j++; |
| 1094 | } |
| 1095 | }; |
| 1096 | |
| 1097 | template<bool inte_or_lin,typename send_vector, typename v_mpl> |
| 1098 | struct set_mem_retained_buffers |
| 1099 | { |
| 1100 | static inline size_t set_mem_retained_buffers_(openfpm::vector<send_vector> & g_send_prp, |
| 1101 | openfpm::vector<size_t> & prc_sz, |
| 1102 | size_t i, |
| 1103 | openfpm::vector_fr<Memory> & hsmem, |
| 1104 | size_t j) |
| 1105 | { |
| 1106 | // Set the memory for retain the send buffer |
| 1107 | g_send_prp.get(i).setMemory(hsmem.get(j)); |
| 1108 | |
| 1109 | // resize the sending vector (No allocation is produced) |
| 1110 | g_send_prp.get(i).resize(prc_sz.get(i)); |
| 1111 | |
| 1112 | return j+1; |
| 1113 | } |
| 1114 | }; |
| 1115 | |
| 1116 | template<typename send_vector, typename v_mpl> |
| 1117 | struct set_mem_retained_buffers<true,send_vector,v_mpl> |
| 1118 | { |
| 1119 | static inline size_t set_mem_retained_buffers_(openfpm::vector<send_vector> & g_send_prp, |
| 1120 | openfpm::vector<size_t> & prc_sz, |
| 1121 | size_t i, |
| 1122 | openfpm::vector_fr<Memory> & hsmem, |
| 1123 | size_t j) |
| 1124 | { |
| 1125 | set_mem_retained_buffers_inte<send_vector,v_mpl> smrbi(g_send_prp,i,hsmem,j); |
| 1126 | |
| 1127 | boost::mpl::for_each_ref<boost::mpl::range_c<int,0,boost::mpl::size<v_mpl>::type::value>>(smrbi); |
| 1128 | |
| 1129 | // if we do not send properties do not reallocate |
| 1130 | if (boost::mpl::size<v_mpl>::type::value != 0) |
| 1131 | { |
| 1132 | // resize the sending vector (No allocation is produced) |
| 1133 | g_send_prp.get(i).resize(prc_sz.get(i)); |
| 1134 | } |
| 1135 | |
| 1136 | return smrbi.j; |
| 1137 | } |
| 1138 | }; |
| 1139 | |
| 1140 | /*! \brief This function fill the send buffer for properties after the particles has been label with labelParticles |
| 1141 | * |
| 1142 | * \tparam send_vector type used to send data |
| 1143 | * \tparam prp_object object containing only the properties to send |
| 1144 | * \tparam prp set of properties to send |
| 1145 | * |
| 1146 | * \param v_prp vector of particle properties |
| 1147 | * \param g_send_prp Send buffer to fill |
| 1148 | * |
| 1149 | */ |
| 1150 | template<typename send_vector, typename prp_object, int ... prp> |
| 1151 | void fill_send_ghost_prp_buf(openfpm::vector<prop,Memory,layout_base> & v_prp, |
| 1152 | openfpm::vector<size_t> & prc_sz, |
| 1153 | openfpm::vector<send_vector> & g_send_prp, |
| 1154 | size_t opt) |
| 1155 | { |
| 1156 | size_t factor = 1; |
| 1157 | |
| 1158 | typedef typename to_boost_vmpl<prp...>::type v_mpl; |
| 1159 | |
| 1160 | if (is_layout_inte<layout_base<prop>>::value == true) {factor *= sizeof...(prp);} |
| 1161 | |
| 1162 | // create a number of send buffers equal to the near processors |
| 1163 | g_send_prp.resize(prc_sz.size()); |
| 1164 | |
| 1165 | resize_retained_buffer(hsmem,g_send_prp.size()*factor); |
| 1166 | |
| 1167 | for (size_t i = 0; i < hsmem.size(); i++) |
| 1168 | { |
| 1169 | // Buffer must retained and survive the destruction of the |
| 1170 | // vector |
| 1171 | if (hsmem.get(i).ref() == 0) |
| 1172 | {hsmem.get(i).incRef();} |
| 1173 | } |
| 1174 | |
| 1175 | size_t j = 0; |
| 1176 | for (size_t i = 0; i < g_send_prp.size(); i++) |
| 1177 | { |
| 1178 | j = set_mem_retained_buffers<is_layout_inte<layout_base<prop>>::value,send_vector,v_mpl>::set_mem_retained_buffers_(g_send_prp,prc_sz,i,hsmem,j); |
| 1179 | } |
| 1180 | |
| 1181 | if (opt & RUN_ON_DEVICE) |
| 1182 | { |
| 1183 | #if defined(CUDA_GPU) && defined(__NVCC__) |
| 1184 | |
| 1185 | size_t offset = 0; |
| 1186 | |
| 1187 | if (sizeof...(prp) != 0) |
| 1188 | { |
| 1189 | // Fill the sending buffers |
| 1190 | for (size_t i = 0 ; i < g_send_prp.size() ; i++) |
| 1191 | { |
| 1192 | auto ite = g_send_prp.get(i).getGPUIterator(); |
| 1193 | |
| 1194 | CUDA_LAUNCH((process_ghost_particles_prp<decltype(g_opart_device.toKernel()),decltype(g_send_prp.get(i).toKernel()),decltype(v_prp.toKernel()),prp...>), |
| 1195 | ite, |
| 1196 | g_opart_device.toKernel(), g_send_prp.get(i).toKernel(), |
| 1197 | v_prp.toKernel(),offset); |
| 1198 | |
| 1199 | offset += prc_sz.get(i); |
| 1200 | } |
| 1201 | } |
| 1202 | |
| 1203 | #else |
| 1204 | |
| 1205 | std::cout << __FILE__ << ":" << __LINE__ << " error RUN_ON_DEVICE require that you compile with NVCC, but it seem compiled with a normal compiler" << std::endl; |
| 1206 | |
| 1207 | #endif |
| 1208 | } |
| 1209 | else |
| 1210 | { |
| 1211 | // if no properties must be sent skip this step |
| 1212 | if (sizeof...(prp) == 0) {return;} |
| 1213 | |
| 1214 | // Fill the send buffer |
| 1215 | for (size_t i = 0; i < g_opart.size(); i++) |
| 1216 | { |
| 1217 | for (size_t j = 0; j < g_opart.get(i).size(); j++) |
| 1218 | { |
| 1219 | // source object type |
| 1220 | typedef decltype(v_prp.get(g_opart.get(i).template get<0>(j))) encap_src; |
| 1221 | // destination object type |
| 1222 | typedef decltype(g_send_prp.get(i).get(j)) encap_dst; |
| 1223 | |
| 1224 | // Copy only the selected properties |
| 1225 | object_si_d<encap_src, encap_dst, OBJ_ENCAP, prp...>(v_prp.get(g_opart.get(i).template get<0>(j)), g_send_prp.get(i).get(j)); |
| 1226 | } |
| 1227 | } |
| 1228 | } |
| 1229 | } |
| 1230 | |
| 1231 | /*! \brief allocate and fill the send buffer for the map function |
| 1232 | * |
| 1233 | * \param v_pos vector of particle positions |
| 1234 | * \param v_prp vector of particles properties |
| 1235 | * \param prc_sz_r For each processor in the list the size of the message to send |
| 1236 | * \param m_pos sending buffer for position |
| 1237 | * \param m_prp sending buffer for properties |
| 1238 | * \param offset from where start the list of the particles that migrate in o_part |
| 1239 | * This parameter is used only in case of RUN_ON_DEVICE option |
| 1240 | * |
| 1241 | */ |
| 1242 | void fill_send_map_buf(openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos, |
| 1243 | openfpm::vector<prop,Memory,layout_base> & v_prp, |
| 1244 | openfpm::vector<size_t> & prc_sz_r, |
| 1245 | openfpm::vector<size_t> & prc_r, |
| 1246 | openfpm::vector<openfpm::vector<Point<dim,St>,Memory,layout_base,openfpm::grow_policy_identity>> & m_pos, |
| 1247 | openfpm::vector<openfpm::vector<prop,Memory,layout_base,openfpm::grow_policy_identity>> & m_prp, |
| 1248 | openfpm::vector<aggregate<unsigned int, unsigned int>,Memory,layout_base> & prc_sz, |
| 1249 | size_t opt) |
| 1250 | { |
| 1251 | m_prp.resize(prc_sz_r.size()); |
| 1252 | m_pos.resize(prc_sz_r.size()); |
| 1253 | openfpm::vector<size_t> cnt(prc_sz_r.size()); |
| 1254 | |
| 1255 | for (size_t i = 0; i < prc_sz_r.size() ; i++) |
| 1256 | { |
| 1257 | // set the size and allocate, using mem warant that pos and prp is contiguous |
| 1258 | m_pos.get(i).resize(prc_sz_r.get(i)); |
| 1259 | m_prp.get(i).resize(prc_sz_r.get(i)); |
| 1260 | cnt.get(i) = 0; |
| 1261 | } |
| 1262 | |
| 1263 | if (opt & RUN_ON_DEVICE) |
| 1264 | { |
| 1265 | if (v_cl.size() == 1) |
| 1266 | {return;} |
| 1267 | |
| 1268 | #if defined(CUDA_GPU) && defined(__NVCC__) |
| 1269 | |
| 1270 | // The first part of m_opart and prc_sz contain the local particles |
| 1271 | |
| 1272 | #ifndef TEST1 |
| 1273 | |
| 1274 | v_pos_tmp.resize(prc_sz.template get<0>(0)); |
| 1275 | v_prp_tmp.resize(prc_sz.template get<0>(0)); |
| 1276 | |
| 1277 | auto ite = v_pos_tmp.getGPUIterator(); |
| 1278 | |
| 1279 | // fill v_pos_tmp and v_prp_tmp with local particles |
| 1280 | process_map_particles<decltype(m_opart.toKernel()),decltype(v_pos_tmp.toKernel()),decltype(v_prp_tmp.toKernel()), |
| 1281 | decltype(v_pos.toKernel()),decltype(v_prp.toKernel())> |
| 1282 | <<<ite.wthr,ite.thr>>> |
| 1283 | (m_opart.toKernel(),v_pos_tmp.toKernel(), v_prp_tmp.toKernel(), |
| 1284 | v_pos.toKernel(),v_prp.toKernel(),0); |
| 1285 | |
| 1286 | size_t offset = prc_sz.template get<0>(0); |
| 1287 | |
| 1288 | // Fill the sending buffers |
| 1289 | for (size_t i = 0 ; i < m_pos.size() ; i++) |
| 1290 | { |
| 1291 | auto ite = m_pos.get(i).getGPUIterator(); |
| 1292 | |
| 1293 | process_map_particles<decltype(m_opart.toKernel()),decltype(m_pos.get(i).toKernel()),decltype(m_prp.get(i).toKernel()), |
| 1294 | decltype(v_pos.toKernel()),decltype(v_prp.toKernel())> |
| 1295 | <<<ite.wthr,ite.thr>>> |
| 1296 | (m_opart.toKernel(),m_pos.get(i).toKernel(), m_prp.get(i).toKernel(), |
| 1297 | v_pos.toKernel(),v_prp.toKernel(),offset); |
| 1298 | |
| 1299 | offset += prc_sz_r.size(); |
| 1300 | } |
| 1301 | |
| 1302 | // old local particles with the actual local particles |
| 1303 | v_pos_tmp.swap(v_pos); |
| 1304 | v_prp_tmp.swap(v_prp); |
| 1305 | |
| 1306 | #else |
| 1307 | |
| 1308 | int rank = v_cl.rank(); |
| 1309 | |
| 1310 | v_pos_tmp.resize(prc_sz.template get<0>(rank)); |
| 1311 | v_prp_tmp.resize(prc_sz.template get<0>(rank)); |
| 1312 | |
| 1313 | auto ite = v_pos_tmp.getGPUIterator(); |
| 1314 | |
| 1315 | starts.template deviceToHost<0>(); |
| 1316 | size_t offset = starts.template get<0>(rank); |
| 1317 | |
| 1318 | // no work to do |
| 1319 | if (ite.wthr.x != 0) |
| 1320 | { |
| 1321 | // fill v_pos_tmp and v_prp_tmp with local particles |
| 1322 | CUDA_LAUNCH((process_map_particles<decltype(m_opart.toKernel()),decltype(v_pos_tmp.toKernel()),decltype(v_prp_tmp.toKernel()), |
| 1323 | decltype(v_pos.toKernel()),decltype(v_prp.toKernel())>), |
| 1324 | ite, |
| 1325 | m_opart.toKernel(),v_pos_tmp.toKernel(), v_prp_tmp.toKernel(), |
| 1326 | v_pos.toKernel(),v_prp.toKernel(),offset); |
| 1327 | } |
| 1328 | |
| 1329 | // Fill the sending buffers |
| 1330 | for (size_t i = 0 ; i < m_pos.size() ; i++) |
| 1331 | { |
| 1332 | size_t offset = starts.template get<0>(prc_r.template get<0>(i)); |
| 1333 | |
| 1334 | auto ite = m_pos.get(i).getGPUIterator(); |
| 1335 | |
| 1336 | // no work to do |
| 1337 | if (ite.wthr.x != 0) |
| 1338 | { |
| 1339 | |
| 1340 | CUDA_LAUNCH((process_map_particles<decltype(m_opart.toKernel()),decltype(m_pos.get(i).toKernel()),decltype(m_prp.get(i).toKernel()), |
| 1341 | decltype(v_pos.toKernel()),decltype(v_prp.toKernel())>), |
| 1342 | ite, |
| 1343 | m_opart.toKernel(),m_pos.get(i).toKernel(), m_prp.get(i).toKernel(), |
| 1344 | v_pos.toKernel(),v_prp.toKernel(),offset); |
| 1345 | |
| 1346 | } |
| 1347 | } |
| 1348 | |
| 1349 | // old local particles with the actual local particles |
| 1350 | v_pos_tmp.swap(v_pos); |
| 1351 | v_prp_tmp.swap(v_prp); |
| 1352 | |
| 1353 | #endif |
| 1354 | #else |
| 1355 | |
| 1356 | std::cout << __FILE__ << ":" << __LINE__ << " error RUN_ON_DEVICE require that you compile with NVCC, but it seem compiled with a normal compiler" << std::endl; |
| 1357 | |
| 1358 | #endif |
| 1359 | } |
| 1360 | else |
| 1361 | { |
| 1362 | // end vector point |
| 1363 | long int id_end = v_pos.size(); |
| 1364 | |
| 1365 | // end opart point |
| 1366 | long int end = m_opart.size()-1; |
| 1367 | |
| 1368 | // Run through all the particles and fill the sending buffer |
| 1369 | for (size_t i = 0; i < m_opart.size(); i++) |
| 1370 | { |
| 1371 | process_map_particle<proc_without_prp>(i,end,id_end,m_opart,p_map_req,m_pos,m_prp,v_pos,v_prp,cnt); |
| 1372 | } |
| 1373 | |
| 1374 | v_pos.resize(v_pos.size() - m_opart.size()); |
| 1375 | v_prp.resize(v_prp.size() - m_opart.size()); |
| 1376 | } |
| 1377 | } |
| 1378 | |
| 1379 | |
| 1380 | /*! \brief allocate and fill the send buffer for the map function |
| 1381 | * |
| 1382 | * \tparam prp_object object type to send |
| 1383 | * \tparam prp properties to send |
| 1384 | * |
| 1385 | * \param v_pos vector of particle positions |
| 1386 | * \param v_prp vector of particle properties |
| 1387 | * \param prc_sz_r number of particles to send for each processor |
| 1388 | * \param m_pos sending buffer for position |
| 1389 | * \param m_prp sending buffer for properties |
| 1390 | * |
| 1391 | */ |
| 1392 | template<typename prp_object,int ... prp> |
| 1393 | void fill_send_map_buf_list(openfpm::vector<Point<dim, St>> & v_pos, |
| 1394 | openfpm::vector<prop,Memory,layout_base> & v_prp, |
| 1395 | openfpm::vector<size_t> & prc_sz_r, |
| 1396 | openfpm::vector<openfpm::vector<Point<dim,St>>> & m_pos, |
| 1397 | openfpm::vector<openfpm::vector<prp_object>> & m_prp) |
| 1398 | { |
| 1399 | m_prp.resize(prc_sz_r.size()); |
| 1400 | m_pos.resize(prc_sz_r.size()); |
| 1401 | openfpm::vector<size_t> cnt(prc_sz_r.size()); |
| 1402 | |
| 1403 | for (size_t i = 0; i < prc_sz_r.size(); i++) |
| 1404 | { |
| 1405 | // set the size and allocate, using mem warant that pos and prp is contiguous |
| 1406 | m_pos.get(i).resize(prc_sz_r.get(i)); |
| 1407 | m_prp.get(i).resize(prc_sz_r.get(i)); |
| 1408 | cnt.get(i) = 0; |
| 1409 | } |
| 1410 | |
| 1411 | // end vector point |
| 1412 | long int id_end = v_pos.size(); |
| 1413 | |
| 1414 | // end opart point |
| 1415 | long int end = m_opart.size()-1; |
| 1416 | |
| 1417 | // Run through all the particles and fill the sending buffer |
| 1418 | for (size_t i = 0; i < m_opart.size(); i++) |
| 1419 | { |
| 1420 | process_map_particle<proc_with_prp<prp_object,prp...>>(i,end,id_end,m_opart,p_map_req,m_pos,m_prp,v_pos,v_prp,cnt); |
| 1421 | } |
| 1422 | |
| 1423 | v_pos.resize(v_pos.size() - m_opart.size()); |
| 1424 | v_prp.resize(v_prp.size() - m_opart.size()); |
| 1425 | } |
| 1426 | |
| 1427 | /*! \brief Label particles for mappings |
| 1428 | * |
| 1429 | * \param v_pos vector of particle positions |
| 1430 | * \param lbl_p Particle labeled |
| 1431 | * \param prc_sz For each processor the number of particles to send |
| 1432 | * \param opt options |
| 1433 | * |
| 1434 | */ |
| 1435 | template<typename obp> void labelParticleProcessor(openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos, |
| 1436 | openfpm::vector<aggregate<int,int,int>, |
| 1437 | Memory, |
| 1438 | layout_base> & lbl_p, |
| 1439 | openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,layout_base> & prc_sz, |
| 1440 | size_t opt) |
| 1441 | { |
| 1442 | if (opt == RUN_ON_DEVICE) |
| 1443 | { |
| 1444 | #ifdef __NVCC__ |
| 1445 | |
| 1446 | // Map directly on gpu |
| 1447 | |
| 1448 | lbl_p.resize(v_pos.size()); |
| 1449 | |
| 1450 | // labelling kernel |
| 1451 | |
| 1452 | prc_sz.template fill<0>(0); |
| 1453 | |
| 1454 | auto ite = v_pos.getGPUIterator(); |
| 1455 | if (ite.wthr.x == 0) |
| 1456 | { |
| 1457 | starts.resize(v_cl.size()); |
| 1458 | starts.template fill<0>(0); |
| 1459 | return; |
| 1460 | } |
| 1461 | |
| 1462 | // we have one process we can skip ... |
| 1463 | if (v_cl.size() == 1) |
| 1464 | { |
| 1465 | // ... but we have to apply the boundary conditions |
| 1466 | |
| 1467 | periodicity_int<dim> bc; |
| 1468 | |
| 1469 | for (size_t i = 0 ; i < dim ; i++) {bc.bc[i] = dec.periodicity(i);} |
| 1470 | |
| 1471 | CUDA_LAUNCH((apply_bc_each_part<dim,St,decltype(v_pos.toKernel())>),ite,dec.getDomain(),bc,v_pos.toKernel()); |
| 1472 | |
| 1473 | return; |
| 1474 | } |
| 1475 | |
| 1476 | // label particle processor |
| 1477 | CUDA_LAUNCH((process_id_proc_each_part<dim,St,decltype(dec.toKernel()),decltype(v_pos.toKernel()),decltype(lbl_p.toKernel()),decltype(prc_sz.toKernel())>), |
| 1478 | ite, |
| 1479 | dec.toKernel(),v_pos.toKernel(),lbl_p.toKernel(),prc_sz.toKernel(),v_cl.rank()); |
| 1480 | |
| 1481 | |
| 1482 | #ifndef TEST1 |
| 1483 | |
| 1484 | // sort particles |
| 1485 | mergesort((int *)lbl_p.template getDeviceBuffer<1>(),(int *)lbl_p.template getDeviceBuffer<0>(), lbl_p.size(), mgpu::template less_t<int>(), v_cl.getmgpuContext()); |
| 1486 | |
| 1487 | mem.allocate(sizeof(int)); |
| 1488 | mem.fill(0); |
| 1489 | |
| 1490 | // Find the buffer bases |
| 1491 | find_buffer_offsets<1,decltype(lbl_p.toKernel()),decltype(prc_sz.toKernel())><<<ite.wthr,ite.thr>>> |
| 1492 | (lbl_p.toKernel(),(int *)mem.getDevicePointer(),prc_sz.toKernel()); |
| 1493 | |
| 1494 | #error "should not be here" |
| 1495 | |
| 1496 | // Trasfer the number of offsets on CPU |
| 1497 | mem.deviceToHost(); |
| 1498 | prc_sz.template deviceToHost<0,1>(); |
| 1499 | // get also the last element from lbl_p; |
| 1500 | lbl_p.template deviceToHost<1>(lbl_p.size()-1,lbl_p.size()-1); |
| 1501 | |
| 1502 | mem.deviceToHost(); |
| 1503 | int noff = *(int *)mem.getPointer(); |
| 1504 | prc_sz.resize(noff+1); |
| 1505 | prc_sz.template get<0>(prc_sz.size()-1) = lbl_p.size(); |
| 1506 | prc_sz.template get<1>(prc_sz.size()-1) = lbl_p.template get<1>(lbl_p.size()-1); |
| 1507 | |
| 1508 | #else |
| 1509 | |
| 1510 | starts.resize(v_cl.size()); |
| 1511 | openfpm::scan((unsigned int *)prc_sz.template getDeviceBuffer<0>(), prc_sz.size(), (unsigned int *)starts.template getDeviceBuffer<0>() , v_cl.getmgpuContext()); |
| 1512 | |
| 1513 | // move prc_sz to host |
| 1514 | prc_sz.template deviceToHost<0>(); |
| 1515 | |
| 1516 | ite = lbl_p.getGPUIterator(); |
| 1517 | |
| 1518 | // we order lbl_p |
| 1519 | CUDA_LAUNCH((reorder_lbl<decltype(lbl_p.toKernel()),decltype(starts.toKernel())>),ite,lbl_p.toKernel(),starts.toKernel()); |
| 1520 | |
| 1521 | #endif |
| 1522 | |
| 1523 | #else |
| 1524 | |
| 1525 | std::cout << __FILE__ << ":" << __LINE__ << " error, it seems you tried to call map with RUN_ON_DEVICE option, this requires to compile the program with NVCC" << std::endl; |
| 1526 | |
| 1527 | #endif |
| 1528 | } |
| 1529 | else |
| 1530 | { |
| 1531 | // reset lbl_p |
| 1532 | lbl_p.clear(); |
| 1533 | prc_sz_gg.clear(); |
| 1534 | o_part_loc.clear(); |
| 1535 | g_opart.clear(); |
| 1536 | prc_g_opart.clear(); |
| 1537 | |
| 1538 | // resize the label buffer |
| 1539 | prc_sz.template fill<0>(0); |
| 1540 | |
| 1541 | auto it = v_pos.getIterator(); |
| 1542 | |
| 1543 | // Label all the particles with the processor id where they should go |
| 1544 | while (it.isNext()) |
| 1545 | { |
| 1546 | auto key = it.get(); |
| 1547 | |
| 1548 | // Apply the boundary conditions |
| 1549 | dec.applyPointBC(v_pos.get(key)); |
| 1550 | |
| 1551 | size_t p_id = 0; |
| 1552 | |
| 1553 | // Check if the particle is inside the domain |
| 1554 | if (dec.getDomain().isInside(v_pos.get(key)) == true) |
| 1555 | {p_id = dec.processorID(v_pos.get(key));} |
| 1556 | else |
| 1557 | {p_id = obp::out(key, v_cl.getProcessUnitID());} |
| 1558 | |
| 1559 | // Particle to move |
| 1560 | if (p_id != v_cl.getProcessUnitID()) |
| 1561 | { |
| 1562 | if ((long int) p_id != -1) |
| 1563 | { |
| 1564 | prc_sz.template get<0>(p_id)++; |
| 1565 | lbl_p.add(); |
| 1566 | lbl_p.last().template get<0>() = key; |
| 1567 | lbl_p.last().template get<2>() = p_id; |
| 1568 | } |
| 1569 | else |
| 1570 | { |
| 1571 | lbl_p.add(); |
| 1572 | lbl_p.last().template get<0>() = key; |
| 1573 | lbl_p.last().template get<2>() = p_id; |
| 1574 | } |
| 1575 | } |
| 1576 | |
| 1577 | // Add processors and add size |
| 1578 | |
| 1579 | ++it; |
| 1580 | } |
| 1581 | } |
| 1582 | } |
| 1583 | |
| 1584 | /*! \brief Label the particles |
| 1585 | * |
| 1586 | * It count the number of particle to send to each processors and save its ids |
| 1587 | * |
| 1588 | * \see nn_prcs::getShiftvectors() |
| 1589 | * |
| 1590 | * \param v_pos vector of particle positions |
| 1591 | * \param v_prp vector of particle properties |
| 1592 | * \param prc for each particle it label the processor id (the owner of the particle, or where it should go the particle) |
| 1593 | * \param g_m ghost marker |
| 1594 | * \param opt ghost_get options |
| 1595 | * |
| 1596 | */ |
| 1597 | void labelParticlesGhost(openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos, |
| 1598 | openfpm::vector<prop,Memory,layout_base> & v_prp, |
| 1599 | openfpm::vector<size_t> & prc, |
| 1600 | openfpm::vector<size_t> & prc_sz, |
| 1601 | openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,layout_base> & prc_offset, |
| 1602 | size_t & g_m, |
| 1603 | size_t opt) |
| 1604 | { |
| 1605 | // Buffer that contain for each processor the id of the particle to send |
| 1606 | prc_sz.clear(); |
| 1607 | g_opart.clear(); |
| 1608 | g_opart.resize(dec.getNNProcessors()); |
| 1609 | prc_g_opart.clear(); |
| 1610 | |
| 1611 | if (opt & RUN_ON_DEVICE) |
| 1612 | { |
| 1613 | labelParticlesGhost_impl<dim,St,prop,Memory,layout_base, |
| 1614 | Decomposition,std::is_same<Memory,CudaMemory>::value> |
| 1615 | ::run(mem,dec,g_opart_device,proc_id_out,starts,v_cl,v_pos,v_prp,prc,prc_sz,prc_offset,g_m,opt); |
| 1616 | } |
| 1617 | else |
| 1618 | { |
| 1619 | // Iterate over all particles |
| 1620 | auto it = v_pos.getIteratorTo(g_m); |
| 1621 | while (it.isNext()) |
| 1622 | { |
| 1623 | auto key = it.get(); |
| 1624 | |
| 1625 | // Given a particle, it return which processor require it (first id) and shift id, second id |
| 1626 | // For an explanation about shifts vectors please consult getShiftVector in ie_ghost |
| 1627 | const openfpm::vector<std::pair<size_t, size_t>> & vp_id = dec.template ghost_processorID_pair<typename Decomposition::lc_processor_id, typename Decomposition::shift_id>(v_pos.get(key), UNIQUE); |
| 1628 | |
| 1629 | for (size_t i = 0; i < vp_id.size(); i++) |
| 1630 | { |
| 1631 | // processor id |
| 1632 | size_t p_id = vp_id.get(i).first; |
| 1633 | |
| 1634 | // add particle to communicate |
| 1635 | g_opart.get(p_id).add(); |
| 1636 | g_opart.get(p_id).last().template get<0>() = key; |
| 1637 | g_opart.get(p_id).last().template get<1>() = vp_id.get(i).second; |
| 1638 | } |
| 1639 | |
| 1640 | ++it; |
| 1641 | } |
| 1642 | |
| 1643 | // remove all zero entry and construct prc (the list of the sending processors) |
| 1644 | openfpm::vector<openfpm::vector<aggregate<size_t,size_t>>> g_opart_f; |
| 1645 | |
| 1646 | // count the non zero element |
| 1647 | for (size_t i = 0 ; i < g_opart.size() ; i++) |
| 1648 | { |
| 1649 | if (g_opart.get(i).size() != 0) |
| 1650 | { |
| 1651 | prc_sz.add(g_opart.get(i).size()); |
| 1652 | g_opart_f.add(); |
| 1653 | g_opart.get(i).swap(g_opart_f.last()); |
| 1654 | prc.add(dec.IDtoProc(i)); |
| 1655 | } |
| 1656 | } |
| 1657 | |
| 1658 | g_opart.swap(g_opart_f); |
| 1659 | } |
| 1660 | #ifdef EXTREA_TRACE_PRE_COMM |
| 1661 | Extrae_user_function (0); |
| 1662 | #endif |
| 1663 | } |
| 1664 | |
| 1665 | /*! \brief Call-back to allocate buffer to receive incoming elements (particles) |
| 1666 | * |
| 1667 | * \param msg_i size required to receive the message from i |
| 1668 | * \param total_msg total size to receive from all the processors |
| 1669 | * \param total_p the total number of processor that want to communicate with you |
| 1670 | * \param i processor id |
| 1671 | * \param ri request id (it is an id that goes from 0 to total_p, and is unique |
| 1672 | * every time message_alloc is called) |
| 1673 | * \param ptr a pointer to the vector_dist structure |
| 1674 | * |
| 1675 | * \return the pointer where to store the message for the processor i |
| 1676 | * |
| 1677 | */ |
| 1678 | static void * message_alloc_map(size_t msg_i, size_t total_msg, size_t total_p, size_t i, size_t ri, void * ptr) |
| 1679 | { |
| 1680 | // cast the pointer |
| 1681 | vector_dist_comm<dim, St, prop, Decomposition, Memory, layout_base> * vd = static_cast<vector_dist_comm<dim, St, prop, Decomposition, Memory, layout_base> *>(ptr); |
| 1682 | |
| 1683 | vd->recv_mem_gm.resize(vd->v_cl.getProcessingUnits()); |
| 1684 | vd->recv_mem_gm.get(i).resize(msg_i); |
| 1685 | |
| 1686 | return vd->recv_mem_gm.get(i).getPointer(); |
| 1687 | } |
| 1688 | |
| 1689 | public: |
| 1690 | |
| 1691 | /*! \brief Copy Constructor |
| 1692 | * |
| 1693 | * \param v vector to copy |
| 1694 | * |
| 1695 | */ |
| 1696 | vector_dist_comm(const vector_dist_comm<dim,St,prop,Decomposition,Memory,layout_base> & v) |
| 1697 | :v_cl(create_vcluster<Memory>()),dec(create_vcluster()),lg_m(0) |
| 1698 | { |
| 1699 | this->operator=(v); |
| 1700 | } |
| 1701 | |
| 1702 | |
| 1703 | /*! \brief Constructor |
| 1704 | * |
| 1705 | * \param dec Domain decompositon |
| 1706 | * |
| 1707 | */ |
| 1708 | vector_dist_comm(const Decomposition & dec) |
| 1709 | :v_cl(create_vcluster<Memory>()),dec(dec),lg_m(0) |
| 1710 | { |
| 1711 | |
| 1712 | } |
| 1713 | |
| 1714 | /*! \brief Constructor |
| 1715 | * |
| 1716 | * \param dec Domain decompositon |
| 1717 | * |
| 1718 | */ |
| 1719 | vector_dist_comm(Decomposition && dec) |
| 1720 | :v_cl(create_vcluster<Memory>()),dec(dec),lg_m(0) |
| 1721 | { |
| 1722 | |
| 1723 | } |
| 1724 | |
| 1725 | /*! \brief Constructor |
| 1726 | * |
| 1727 | */ |
| 1728 | vector_dist_comm() |
| 1729 | :v_cl(create_vcluster<Memory>()),dec(create_vcluster()),lg_m(0) |
| 1730 | { |
| 1731 | } |
| 1732 | |
| 1733 | /*! \brief Destructor |
| 1734 | * |
| 1735 | * Release the retained buffer |
| 1736 | * |
| 1737 | */ |
| 1738 | ~vector_dist_comm() |
| 1739 | { |
| 1740 | for (size_t i = 0 ; i < hsmem.size() ; i++) |
| 1741 | { |
| 1742 | if (hsmem.get(i).ref() == 1) |
| 1743 | hsmem.get(i).decRef(); |
| 1744 | else |
| 1745 | std::cout << __FILE__ << ":" << __LINE__ << " internal error memory is in an invalid state " << std::endl; |
| 1746 | } |
| 1747 | |
| 1748 | } |
| 1749 | |
| 1750 | /*! \brief Get the number of minimum sub-domain per processor |
| 1751 | * |
| 1752 | * \return minimum number |
| 1753 | * |
| 1754 | */ |
| 1755 | size_t getDecompositionGranularity() |
| 1756 | { |
| 1757 | return v_sub_unit_factor; |
| 1758 | } |
| 1759 | |
| 1760 | /*! \brief Set the minimum number of sub-domain per processor |
| 1761 | * |
| 1762 | * \param n_sub |
| 1763 | * |
| 1764 | */ |
| 1765 | void setDecompositionGranularity(size_t n_sub) |
| 1766 | { |
| 1767 | this->v_sub_unit_factor = n_sub; |
| 1768 | } |
| 1769 | |
| 1770 | /*! \brief Initialize the decomposition |
| 1771 | * |
| 1772 | * \param box domain |
| 1773 | * \param bc boundary conditions |
| 1774 | * \param g ghost extension |
| 1775 | * \param opt additional options |
| 1776 | * |
| 1777 | */ |
| 1778 | void init_decomposition(Box<dim,St> & box, |
| 1779 | const size_t (& bc)[dim], |
| 1780 | const Ghost<dim,St> & g, |
| 1781 | size_t opt, |
| 1782 | const grid_sm<dim,void> & gdist) |
| 1783 | { |
| 1784 | size_t div[dim]; |
| 1785 | |
| 1786 | if (opt & BIND_DEC_TO_GHOST) |
| 1787 | { |
| 1788 | // padding |
| 1789 | size_t pad = 0; |
| 1790 | |
| 1791 | // CellDecomposer |
| 1792 | CellDecomposer_sm<dim,St,shift<dim,St>> cd_sm; |
| 1793 | |
| 1794 | // Calculate the divisions for the symmetric Cell-lists |
| 1795 | cl_param_calculateSym<dim,St>(box,cd_sm,g,pad); |
| 1796 | |
| 1797 | for (size_t i = 0 ; i < dim ; i++) |
| 1798 | {div[i] = cd_sm.getDiv()[i] - 2*pad;} |
| 1799 | |
| 1800 | // Create the sub-domains |
| 1801 | dec.setParameters(div, box, bc, g, gdist); |
| 1802 | } |
| 1803 | else |
| 1804 | { |
| 1805 | dec.setGoodParameters(box, bc, g, getDecompositionGranularity(), gdist); |
| 1806 | } |
| 1807 | dec.decompose(); |
| 1808 | } |
| 1809 | |
| 1810 | /*! \brief Initialize the decomposition |
| 1811 | * |
| 1812 | * \param box domain |
| 1813 | * \param bc boundary conditions |
| 1814 | * \param g ghost extension |
| 1815 | * \param opt additional options |
| 1816 | * |
| 1817 | */ |
| 1818 | void init_decomposition_gr_cell(Box<dim,St> & box, |
| 1819 | const size_t (& bc)[dim], |
| 1820 | const Ghost<dim,St> & g, |
| 1821 | size_t opt, |
| 1822 | const grid_sm<dim,void> & gdist) |
| 1823 | { |
| 1824 | size_t div[dim]; |
| 1825 | |
| 1826 | for (size_t i = 0 ; i < dim ; i++) |
| 1827 | {div[i] = gdist.size(i);} |
| 1828 | |
| 1829 | // Create the sub-domains |
| 1830 | dec.setParameters(div, box, bc, g); |
| 1831 | |
| 1832 | dec.decompose(); |
| 1833 | } |
| 1834 | |
| 1835 | /*! \brief It synchronize the properties and position of the ghost particles |
| 1836 | * |
| 1837 | * \tparam prp list of properties to get synchronize |
| 1838 | * |
| 1839 | * \param opt options WITH_POSITION, it send also the positional information of the particles |
| 1840 | * \param v_pos vector of position to update |
| 1841 | * \param v_prp vector of properties to update |
| 1842 | * \param g_m marker between real and ghost particles |
| 1843 | * |
| 1844 | */ |
| 1845 | template<unsigned int impl, int ... prp> inline void ghost_get_(openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos, |
| 1846 | openfpm::vector<prop,Memory,layout_base> & v_prp, |
| 1847 | size_t & g_m, |
| 1848 | size_t opt = WITH_POSITION) |
| 1849 | { |
| 1850 | #ifdef PROFILE_SCOREP |
| 1851 | SCOREP_USER_REGION("ghost_get" ,SCOREP_USER_REGION_TYPE_FUNCTION) |
| 1852 | #endif |
| 1853 | |
| 1854 | // Sending property object |
| 1855 | typedef object<typename object_creator<typename prop::type, prp...>::type> prp_object; |
| 1856 | |
| 1857 | // send vector for each processor |
| 1858 | typedef openfpm::vector<prp_object,Memory,layout_base,openfpm::grow_policy_identity> send_vector; |
| 1859 | |
| 1860 | if (!(opt & NO_POSITION)) |
| 1861 | {v_pos.resize(g_m);} |
| 1862 | |
| 1863 | // reset the ghost part |
| 1864 | |
| 1865 | if (!(opt & SKIP_LABELLING)) |
| 1866 | {v_prp.resize(g_m);} |
| 1867 | |
| 1868 | // Label all the particles |
| 1869 | if ((opt & SKIP_LABELLING) == false) |
| 1870 | {labelParticlesGhost(v_pos,v_prp,prc_g_opart,prc_sz_gg,prc_offset,g_m,opt);} |
| 1871 | |
| 1872 | { |
| 1873 | // Send and receive ghost particle information |
| 1874 | openfpm::vector<send_vector> g_send_prp; |
| 1875 | |
| 1876 | fill_send_ghost_prp_buf<send_vector, prp_object, prp...>(v_prp,prc_sz_gg,g_send_prp,opt); |
| 1877 | |
| 1878 | #if defined(CUDA_GPU) && defined(__NVCC__) |
| 1879 | cudaDeviceSynchronize(); |
| 1880 | #endif |
| 1881 | |
| 1882 | // if there are no properties skip |
| 1883 | // SSendRecvP send everything when we do not give properties |
| 1884 | |
| 1885 | ghost_exchange_comm_impl<impl,layout_base,prp ...>::template |
| 1886 | sendrecv_prp(v_cl,g_send_prp,v_prp,v_pos,prc_g_opart, |
| 1887 | prc_recv_get_prp,recv_sz_get_prp,recv_sz_get_byte,g_opart_sz,g_m,opt); |
| 1888 | } |
| 1889 | |
| 1890 | if (!(opt & NO_POSITION)) |
| 1891 | { |
| 1892 | // Sending buffer for the ghost particles position |
| 1893 | openfpm::vector<send_pos_vector> g_pos_send; |
| 1894 | |
| 1895 | fill_send_ghost_pos_buf(v_pos,prc_sz_gg,g_pos_send,opt,impl == GHOST_ASYNC); |
| 1896 | |
| 1897 | #if defined(CUDA_GPU) && defined(__NVCC__) |
| 1898 | cudaDeviceSynchronize(); |
| 1899 | #endif |
| 1900 | |
| 1901 | ghost_exchange_comm_impl<impl,layout_base,prp ...>::template |
| 1902 | sendrecv_pos(v_cl,g_pos_send,v_prp,v_pos,prc_recv_get_pos,recv_sz_get_pos,prc_g_opart,opt); |
| 1903 | |
| 1904 | // fill g_opart_sz |
| 1905 | g_opart_sz.resize(prc_g_opart.size()); |
| 1906 | |
| 1907 | for (size_t i = 0 ; i < prc_g_opart.size() ; i++) |
| 1908 | g_opart_sz.get(i) = g_pos_send.get(i).size(); |
| 1909 | } |
| 1910 | |
| 1911 | // Important to ensure that the number of particles in v_prp must be equal to v_pos |
| 1912 | // Note that if we do not give properties sizeof...(prp) == 0 in general at this point |
| 1913 | // v_prp.size() != v_pos.size() |
| 1914 | if (!(opt & SKIP_LABELLING)) |
| 1915 | { |
| 1916 | v_prp.resize(v_pos.size()); |
| 1917 | } |
| 1918 | |
| 1919 | add_loc_particles_bc(v_pos,v_prp,g_m,opt); |
| 1920 | } |
| 1921 | |
| 1922 | /*! \brief It synchronize the properties and position of the ghost particles |
| 1923 | * |
| 1924 | * \tparam prp list of properties to get synchronize |
| 1925 | * |
| 1926 | * \param opt options WITH_POSITION, it send also the positional information of the particles |
| 1927 | * \param v_pos vector of position to update |
| 1928 | * \param v_prp vector of properties to update |
| 1929 | * \param g_m marker between real and ghost particles |
| 1930 | * |
| 1931 | */ |
| 1932 | template<int ... prp> inline void ghost_wait_(openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos, |
| 1933 | openfpm::vector<prop,Memory,layout_base> & v_prp, |
| 1934 | size_t & g_m, |
| 1935 | size_t opt = WITH_POSITION) |
| 1936 | { |
| 1937 | // Sending property object |
| 1938 | typedef object<typename object_creator<typename prop::type, prp...>::type> prp_object; |
| 1939 | |
| 1940 | // send vector for each processor |
| 1941 | typedef openfpm::vector<prp_object,Memory,layout_base,openfpm::grow_policy_identity> send_vector; |
| 1942 | |
| 1943 | // Send and receive ghost particle information |
| 1944 | openfpm::vector<send_vector> g_send_prp; |
| 1945 | openfpm::vector<send_pos_vector> g_pos_send; |
| 1946 | |
| 1947 | ghost_exchange_comm_impl<GHOST_ASYNC,layout_base,prp ...>::template |
| 1948 | sendrecv_prp_wait(v_cl,g_send_prp,v_prp,v_pos,prc_g_opart, |
| 1949 | prc_recv_get_prp,recv_sz_get_prp,recv_sz_get_byte,g_opart_sz,g_m,opt); |
| 1950 | |
| 1951 | |
| 1952 | ghost_exchange_comm_impl<GHOST_ASYNC,layout_base,prp ...>::template |
| 1953 | sendrecv_pos_wait(v_cl,g_pos_send,v_prp,v_pos,prc_recv_get_pos,recv_sz_get_pos,prc_g_opart,opt); |
| 1954 | } |
| 1955 | |
| 1956 | /*! \brief It move all the particles that does not belong to the local processor to the respective processor |
| 1957 | * |
| 1958 | * \tparam out of bound policy it specify what to do when the particles are detected out of bound |
| 1959 | * |
| 1960 | * In general this function is called after moving the particles to move the |
| 1961 | * elements out the local processor. Or just after initialization if each processor |
| 1962 | * contain non local particles |
| 1963 | * |
| 1964 | * \tparam prp properties to communicate |
| 1965 | * |
| 1966 | * \param v_pos vector of particle positions |
| 1967 | * \param v_prp vector of particle properties |
| 1968 | * \param g_m ghost marker |
| 1969 | * \param opt options |
| 1970 | * |
| 1971 | */ |
| 1972 | template<unsigned int ... prp> void map_list_(openfpm::vector<Point<dim, St>> & v_pos, openfpm::vector<prop> & v_prp, size_t & g_m, size_t opt) |
| 1973 | { |
| 1974 | if (opt & RUN_ON_DEVICE) |
| 1975 | { |
| 1976 | std::cout << "Error: " << __FILE__ << ":" << __LINE__ << " map_list is unsupported on device (coming soon)" << std::endl; |
| 1977 | return; |
| 1978 | } |
| 1979 | |
| 1980 | typedef KillParticle obp; |
| 1981 | |
| 1982 | // Processor communication size |
| 1983 | openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,layout_base> prc_sz(v_cl.getProcessingUnits()); |
| 1984 | |
| 1985 | // map completely reset the ghost part |
| 1986 | v_pos.resize(g_m); |
| 1987 | v_prp.resize(g_m); |
| 1988 | |
| 1989 | // m_opart, Contain the processor id of each particle (basically where they have to go) |
| 1990 | labelParticleProcessor<obp>(v_pos,m_opart, prc_sz,opt); |
| 1991 | |
| 1992 | // Calculate the sending buffer size for each processor, put this information in |
| 1993 | // a contiguous buffer |
| 1994 | p_map_req.resize(v_cl.getProcessingUnits()); |
| 1995 | openfpm::vector<size_t> prc_sz_r; |
| 1996 | openfpm::vector<size_t> prc_r; |
| 1997 | |
| 1998 | for (size_t i = 0; i < v_cl.getProcessingUnits(); i++) |
| 1999 | { |
| 2000 | if (prc_sz.template get<0>(i) != 0) |
| 2001 | { |
| 2002 | p_map_req.get(i) = prc_r.size(); |
| 2003 | prc_r.add(i); |
| 2004 | prc_sz_r.add(prc_sz.template get<0>(i)); |
| 2005 | } |
| 2006 | } |
| 2007 | |
| 2008 | if (opt & MAP_LOCAL) |
| 2009 | { |
| 2010 | // if the map is local we indicate that we receive only from the neighborhood processors |
| 2011 | |
| 2012 | prc_recv_map.clear(); |
| 2013 | for (size_t i = 0 ; i < dec.getNNProcessors() ; i++) |
| 2014 | {prc_recv_map.add(dec.IDtoProc(i));} |
| 2015 | } |
| 2016 | |
| 2017 | // Sending property object |
| 2018 | typedef object<typename object_creator<typename prop::type, prp...>::type> prp_object; |
| 2019 | |
| 2020 | //! position vector |
| 2021 | openfpm::vector<openfpm::vector<Point<dim, St>>> m_pos; |
| 2022 | //! properties vector |
| 2023 | openfpm::vector<openfpm::vector<prp_object>> m_prp; |
| 2024 | |
| 2025 | fill_send_map_buf_list<prp_object,prp...>(v_pos,v_prp,prc_sz_r, m_pos, m_prp); |
| 2026 | |
| 2027 | v_cl.SSendRecv(m_pos,v_pos,prc_r,prc_recv_map,recv_sz_map,opt); |
| 2028 | v_cl.template SSendRecvP<openfpm::vector<prp_object>,decltype(v_prp),layout_base,prp...>(m_prp,v_prp,prc_r,prc_recv_map,recv_sz_map,opt); |
| 2029 | |
| 2030 | // mark the ghost part |
| 2031 | |
| 2032 | g_m = v_pos.size(); |
| 2033 | } |
| 2034 | |
| 2035 | /*! \brief It move all the particles that does not belong to the local processor to the respective processor |
| 2036 | * |
| 2037 | * \tparam out of bound policy it specify what to do when the particles are detected out of bound |
| 2038 | * |
| 2039 | * In general this function is called after moving the particles to move the |
| 2040 | * elements out the local processor. Or just after initialization if each processor |
| 2041 | * contain non local particles |
| 2042 | * |
| 2043 | * \param v_pos vector of particle positions |
| 2044 | * \param v_prp vector of particle properties |
| 2045 | * \param g_m ghost marker |
| 2046 | * |
| 2047 | */ |
| 2048 | template<typename obp = KillParticle> |
| 2049 | void map_(openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos, |
| 2050 | openfpm::vector<prop,Memory,layout_base> & v_prp, size_t & g_m, |
| 2051 | size_t opt) |
| 2052 | { |
| 2053 | #ifdef PROFILE_SCOREP |
| 2054 | SCOREP_USER_REGION("map" ,SCOREP_USER_REGION_TYPE_FUNCTION) |
| 2055 | #endif |
| 2056 | |
| 2057 | prc_sz.resize(v_cl.getProcessingUnits()); |
| 2058 | |
| 2059 | // map completely reset the ghost part |
| 2060 | v_pos.resize(g_m); |
| 2061 | v_prp.resize(g_m); |
| 2062 | |
| 2063 | // Contain the processor id of each particle (basically where they have to go) |
| 2064 | labelParticleProcessor<obp>(v_pos,m_opart, prc_sz,opt); |
| 2065 | |
| 2066 | openfpm::vector<size_t> prc_sz_r; |
| 2067 | openfpm::vector<size_t> prc_r; |
| 2068 | |
| 2069 | // Calculate the sending buffer size for each processor, put this information in |
| 2070 | // a contiguous buffer |
| 2071 | calc_send_buffers(prc_sz,prc_sz_r,prc_r,opt); |
| 2072 | |
| 2073 | //! position vector |
| 2074 | openfpm::vector<openfpm::vector<Point<dim, St>,Memory,layout_base,openfpm::grow_policy_identity>> m_pos; |
| 2075 | //! properties vector |
| 2076 | openfpm::vector<openfpm::vector<prop,Memory,layout_base,openfpm::grow_policy_identity>> m_prp; |
| 2077 | |
| 2078 | fill_send_map_buf(v_pos,v_prp, prc_sz_r,prc_r, m_pos, m_prp,prc_sz,opt); |
| 2079 | |
| 2080 | size_t opt_ = 0; |
| 2081 | if (opt & RUN_ON_DEVICE) |
| 2082 | { |
| 2083 | #if defined(CUDA_GPU) && defined(__NVCC__) |
| 2084 | // Before doing the communication on RUN_ON_DEVICE we have to be sure that the previous kernels complete |
| 2085 | cudaDeviceSynchronize(); |
| 2086 | opt_ |= MPI_GPU_DIRECT; |
| 2087 | #else |
| 2088 | std::cout << __FILE__ << ":" << __LINE__ << " error: to use the option RUN_ON_DEVICE you must compile with NVCC" << std::endl; |
| 2089 | #endif |
| 2090 | } |
| 2091 | |
| 2092 | v_cl.template SSendRecv<openfpm::vector<Point<dim, St>,Memory,layout_base,openfpm::grow_policy_identity>, |
| 2093 | openfpm::vector<Point<dim, St>,Memory,layout_base>, |
| 2094 | layout_base> |
| 2095 | (m_pos,v_pos,prc_r,prc_recv_map,recv_sz_map,opt_); |
| 2096 | |
| 2097 | v_cl.template SSendRecv<openfpm::vector<prop,Memory,layout_base,openfpm::grow_policy_identity>, |
| 2098 | openfpm::vector<prop,Memory,layout_base>, |
| 2099 | layout_base> |
| 2100 | (m_prp,v_prp,prc_r,prc_recv_map,recv_sz_map,opt_); |
| 2101 | |
| 2102 | // mark the ghost part |
| 2103 | |
| 2104 | g_m = v_pos.size(); |
| 2105 | } |
| 2106 | |
| 2107 | /*! \brief Get the decomposition |
| 2108 | * |
| 2109 | * \return |
| 2110 | * |
| 2111 | */ |
| 2112 | inline Decomposition & getDecomposition() |
| 2113 | { |
| 2114 | return dec; |
| 2115 | } |
| 2116 | |
| 2117 | /*! \brief Get the decomposition |
| 2118 | * |
| 2119 | * \return |
| 2120 | * |
| 2121 | */ |
| 2122 | inline const Decomposition & getDecomposition() const |
| 2123 | { |
| 2124 | return dec; |
| 2125 | } |
| 2126 | |
| 2127 | /*! \brief Copy a vector |
| 2128 | * |
| 2129 | * \param vc vector to copy |
| 2130 | * |
| 2131 | * \return iteself |
| 2132 | * |
| 2133 | */ |
| 2134 | vector_dist_comm<dim,St,prop,Decomposition,Memory,layout_base> & operator=(const vector_dist_comm<dim,St,prop,Decomposition,Memory,layout_base> & vc) |
| 2135 | { |
| 2136 | dec = vc.dec; |
| 2137 | |
| 2138 | return *this; |
| 2139 | } |
| 2140 | |
| 2141 | /*! \brief Copy a vector |
| 2142 | * |
| 2143 | * \param vc vector to copy |
| 2144 | * |
| 2145 | * \return itself |
| 2146 | * |
| 2147 | */ |
| 2148 | vector_dist_comm<dim,St,prop,Decomposition,Memory,layout_base> & operator=(vector_dist_comm<dim,St,prop,Decomposition,Memory,layout_base> && vc) |
| 2149 | { |
| 2150 | dec = vc.dec; |
| 2151 | |
| 2152 | return *this; |
| 2153 | } |
| 2154 | |
| 2155 | /*! \brief Ghost put |
| 2156 | * |
| 2157 | * \tparam op operation to apply |
| 2158 | * \tparam prp set of properties |
| 2159 | * |
| 2160 | * \param v_pos vector of particle positions |
| 2161 | * \param v_prp vector od particle properties |
| 2162 | * \param g_m ghost marker |
| 2163 | * \param opt options |
| 2164 | * |
| 2165 | */ |
| 2166 | template<template<typename,typename> class op, int ... prp> |
| 2167 | void ghost_put_(openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos, |
| 2168 | openfpm::vector<prop,Memory,layout_base> & v_prp, |
| 2169 | size_t & g_m, |
| 2170 | size_t opt) |
| 2171 | { |
| 2172 | // Sending property object |
| 2173 | typedef object<typename object_creator<typename prop::type, prp...>::type> prp_object; |
| 2174 | |
| 2175 | // send vector for each processor |
| 2176 | typedef openfpm::vector<prp_object,Memory,layout_base> send_vector; |
| 2177 | |
| 2178 | openfpm::vector<send_vector> g_send_prp; |
| 2179 | fill_send_ghost_put_prp_buf<send_vector, prp_object, prp...>(v_prp,g_send_prp,g_m,opt); |
| 2180 | |
| 2181 | if (opt & RUN_ON_DEVICE) |
| 2182 | { |
| 2183 | #if defined(CUDA_GPU) && defined(__NVCC__) |
| 2184 | // Before doing the communication on RUN_ON_DEVICE we have to be sure that the previous kernels complete |
| 2185 | cudaDeviceSynchronize(); |
| 2186 | #else |
| 2187 | std::cout << __FILE__ << ":" << __LINE__ << " error: to use the option RUN_ON_DEVICE you must compile with NVCC" << std::endl; |
| 2188 | #endif |
| 2189 | } |
| 2190 | |
| 2191 | // Send and receive ghost particle information |
| 2192 | if (opt & NO_CHANGE_ELEMENTS) |
| 2193 | { |
| 2194 | size_t opt_ = compute_options(opt); |
| 2195 | |
| 2196 | if (opt & RUN_ON_DEVICE) |
| 2197 | { |
| 2198 | op_ssend_recv_merge_gpu<op,decltype(g_opart_device),decltype(prc_offset)> opm(g_opart_device,prc_offset); |
| 2199 | v_cl.template SSendRecvP_op<op_ssend_recv_merge_gpu<op,decltype(g_opart_device),decltype(prc_offset)>, |
| 2200 | send_vector, |
| 2201 | decltype(v_prp), |
| 2202 | layout_base, |
| 2203 | prp...>(g_send_prp,v_prp,prc_recv_get_prp,opm,prc_g_opart,g_opart_sz,opt_); |
| 2204 | } |
| 2205 | else |
| 2206 | { |
| 2207 | op_ssend_recv_merge<op,decltype(g_opart)> opm(g_opart); |
| 2208 | v_cl.template SSendRecvP_op<op_ssend_recv_merge<op,decltype(g_opart)>, |
| 2209 | send_vector, |
| 2210 | decltype(v_prp), |
| 2211 | layout_base, |
| 2212 | prp...>(g_send_prp,v_prp,prc_recv_get_prp,opm,prc_g_opart,g_opart_sz,opt_); |
| 2213 | } |
| 2214 | } |
| 2215 | else |
| 2216 | { |
| 2217 | size_t opt_ = compute_options(opt); |
| 2218 | |
| 2219 | if (opt & RUN_ON_DEVICE) |
| 2220 | { |
| 2221 | op_ssend_recv_merge_gpu<op,decltype(g_opart_device),decltype(prc_offset)> opm(g_opart_device,prc_offset); |
| 2222 | v_cl.template SSendRecvP_op<op_ssend_recv_merge_gpu<op,decltype(g_opart_device),decltype(prc_offset)>, |
| 2223 | send_vector, |
| 2224 | decltype(v_prp), |
| 2225 | layout_base, |
| 2226 | prp...>(g_send_prp,v_prp,get_last_ghost_get_num_proc_vector(),opm,prc_recv_put,recv_sz_put,opt_); |
| 2227 | } |
| 2228 | else |
| 2229 | { |
| 2230 | op_ssend_recv_merge<op,decltype(g_opart)> opm(g_opart); |
| 2231 | v_cl.template SSendRecvP_op<op_ssend_recv_merge<op,decltype(g_opart)>, |
| 2232 | send_vector, |
| 2233 | decltype(v_prp), |
| 2234 | layout_base, |
| 2235 | prp...>(g_send_prp,v_prp,get_last_ghost_get_num_proc_vector(),opm,prc_recv_put,recv_sz_put,opt_); |
| 2236 | } |
| 2237 | } |
| 2238 | |
| 2239 | // process also the local replicated particles |
| 2240 | |
| 2241 | if (lg_m < v_prp.size() && v_prp.size() - lg_m != o_part_loc.size()) |
| 2242 | { |
| 2243 | std::cerr << "Error: " << __FILE__ << ":" << __LINE__ << " Local ghost particles = " << v_prp.size() - lg_m << " != " << o_part_loc.size() << std::endl; |
| 2244 | std::cerr << "Error: " << __FILE__ << ":" << __LINE__ << " Check that you did a ghost_get before a ghost_put" << std::endl; |
| 2245 | } |
| 2246 | |
| 2247 | |
| 2248 | if (opt & RUN_ON_DEVICE) |
| 2249 | { |
| 2250 | v_prp.template merge_prp_v_device<op,prop,Memory, |
| 2251 | openfpm::grow_policy_double, |
| 2252 | layout_base, |
| 2253 | decltype(o_part_loc),prp ...>(v_prp,lg_m,o_part_loc); |
| 2254 | } |
| 2255 | else |
| 2256 | { |
| 2257 | v_prp.template merge_prp_v<op,prop,Memory, |
| 2258 | openfpm::grow_policy_double, |
| 2259 | layout_base, |
| 2260 | decltype(o_part_loc),prp ...>(v_prp,lg_m,o_part_loc); |
| 2261 | } |
| 2262 | } |
| 2263 | }; |
| 2264 | |
| 2265 | |
| 2266 | #endif /* SRC_VECTOR_VECTOR_DIST_COMM_HPP_ */ |
| 2267 | |