1 | /* |
2 | * vector_dist_comm.hpp |
3 | * |
4 | * Created on: Aug 18, 2016 |
5 | * Author: i-bird |
6 | */ |
7 | |
8 | #ifndef SRC_VECTOR_VECTOR_DIST_COMM_HPP_ |
9 | #define SRC_VECTOR_VECTOR_DIST_COMM_HPP_ |
10 | |
11 | #define TEST1 |
12 | |
13 | #if defined(CUDA_GPU) && defined(__NVCC__) |
14 | #include "Vector/cuda/vector_dist_cuda_funcs.cuh" |
15 | #include "util/cuda/kernels.cuh" |
16 | #endif |
17 | |
18 | #include "Vector/util/vector_dist_funcs.hpp" |
19 | #include "cuda/vector_dist_comm_util_funcs.cuh" |
20 | #include "util/cuda/scan_ofp.cuh" |
21 | |
/*! \brief Debug helper extracting a float value from a generic reference
 *
 * Generic case: there is nothing meaningful to extract, so it always yields 0
 */
template<typename T>
struct DEBUG
{
	static float ret(T & tmp)
	{
		// no float payload available for an arbitrary T
		return 0.0f;
	}
};
30 | |
31 | template<> |
32 | struct DEBUG<float &> |
33 | { |
34 | static float ret(float & tmp) |
35 | { |
36 | return tmp; |
37 | } |
38 | }; |
39 | |
40 | /*! \brief compute the communication options from the ghost_get/put options |
41 | * |
42 | * |
43 | */ |
44 | inline static size_t compute_options(size_t opt) |
45 | { |
46 | size_t opt_ = NONE; |
47 | if (opt & NO_CHANGE_ELEMENTS && opt & SKIP_LABELLING) |
48 | {opt_ = RECEIVE_KNOWN | KNOWN_ELEMENT_OR_BYTE;} |
49 | |
50 | if (opt & RUN_ON_DEVICE) |
51 | { |
52 | #if defined(CUDA_GPU) && defined(__NVCC__) |
53 | // Before doing the communication on RUN_ON_DEVICE we have to be sure that the previous kernels complete |
54 | opt_ |= MPI_GPU_DIRECT; |
55 | #else |
56 | std::cout << __FILE__ << ":" << __LINE__ << " error: to use the option RUN_ON_DEVICE you must compile with NVCC" << std::endl; |
57 | #endif |
58 | } |
59 | |
60 | return opt_; |
61 | } |
62 | |
/*! \brief Template selector for the synchronous or asynchronous ghost exchange
 *
 * \tparam impl implementation (this primary template is the synchronous one;
 *         GHOST_ASYNC selects the asynchronous specialization below)
 * \tparam layout_base memory layout
 * \tparam prp properties to communicate
 *
 */
template<unsigned int impl, template<typename> class layout_base, unsigned int ... prp>
struct ghost_exchange_comm_impl
{
	/*! \brief Send the ghost particle properties and merge the received ones into v_prp (blocking)
	 *
	 * \param v_cl cluster of processors
	 * \param g_send_prp one send buffer per destination processor
	 * \param v_prp particle properties vector, received data is merged here
	 * \param v_pos particle positions vector (not used by this implementation)
	 * \param prc_g_opart destination processor ranks
	 * \param prc_recv_get processors we receive from (reused when SKIP_LABELLING)
	 * \param recv_sz_get number of elements received from each processor
	 * \param recv_sz_get_byte received size in bytes per processor
	 * \param g_opart_sz filled on output with the number of particles sent to each processor
	 * \param g_m ghost marker, the merge operation starts writing from here
	 * \param opt ghost_get options (SKIP_LABELLING, RUN_ON_DEVICE, ...)
	 */
	template<typename Vcluster_type, typename vector_prop_type,
			 typename vector_pos_type, typename send_vector,
			 typename prc_recv_get_type, typename prc_g_opart_type,
			 typename recv_sz_get_type, typename recv_sz_get_byte_type,
			 typename g_opart_sz_type>
	static inline void sendrecv_prp(Vcluster_type & v_cl,
			openfpm::vector<send_vector> & g_send_prp,
			vector_prop_type & v_prp,
			vector_pos_type & v_pos,
			prc_g_opart_type & prc_g_opart,
			prc_recv_get_type & prc_recv_get,
			recv_sz_get_type & recv_sz_get,
			recv_sz_get_byte_type & recv_sz_get_byte,
			g_opart_sz_type & g_opart_sz,
			size_t g_m,
			size_t opt)
	{
		// if there are no properties skip
		// SSendRecvP send everything when we do not give properties
		if (sizeof...(prp) != 0)
		{
			size_t opt_ = compute_options(opt);
			if (opt & SKIP_LABELLING)
			{
				// labelling reused: the receive pattern (prc_recv_get/recv_sz_get)
				// is already known; received data is merged into v_prp from g_m
				if (opt & RUN_ON_DEVICE)
				{
					op_ssend_gg_recv_merge_run_device opm(g_m);
					v_cl.template SSendRecvP_op<op_ssend_gg_recv_merge_run_device,send_vector,decltype(v_prp),layout_base,prp...>(g_send_prp,v_prp,prc_g_opart,opm,prc_recv_get,recv_sz_get,opt_);
				}
				else
				{
					op_ssend_gg_recv_merge opm(g_m);
					v_cl.template SSendRecvP_op<op_ssend_gg_recv_merge,send_vector,decltype(v_prp),layout_base,prp...>(g_send_prp,v_prp,prc_g_opart,opm,prc_recv_get,recv_sz_get,opt_);
				}
			}
			else
			{v_cl.template SSendRecvP<send_vector,decltype(v_prp),layout_base,prp...>(g_send_prp,v_prp,prc_g_opart,prc_recv_get,recv_sz_get,recv_sz_get_byte,opt_);}

			// fill g_opart_sz: record how many particles were sent to each processor
			g_opart_sz.resize(prc_g_opart.size());

			for (size_t i = 0 ; i < prc_g_opart.size() ; i++)
				g_opart_sz.get(i) = g_send_prp.get(i).size();
		}
	}

	/*! \brief Send the ghost particle positions and receive the remote ones into v_pos (blocking)
	 *
	 * With SKIP_LABELLING the receive pattern of the previous ghost_get is
	 * reused, otherwise it is cleared and rebuilt by the call.
	 */
	template<typename Vcluster_type, typename vector_prop_type,
			 typename vector_pos_type, typename send_pos_vector,
			 typename prc_recv_get_type, typename prc_g_opart_type,
			 typename recv_sz_get_type>
	static inline void sendrecv_pos(Vcluster_type & v_cl,
			openfpm::vector<send_pos_vector> & g_pos_send,
			vector_prop_type & v_prp,
			vector_pos_type & v_pos,
			prc_recv_get_type & prc_recv_get,
			recv_sz_get_type & recv_sz_get,
			prc_g_opart_type & prc_g_opart,
			size_t opt)
	{
		size_t opt_ = compute_options(opt);
		if (opt & SKIP_LABELLING)
		{
			// reuse the receive pattern from the previous ghost_get
			v_cl.template SSendRecv<send_pos_vector,decltype(v_pos),layout_base>(g_pos_send,v_pos,prc_g_opart,prc_recv_get,recv_sz_get,opt_);
		}
		else
		{
			// communication pattern may have changed: rebuild it from scratch
			prc_recv_get.clear();
			recv_sz_get.clear();
			v_cl.template SSendRecv<send_pos_vector,decltype(v_pos),layout_base>(g_pos_send,v_pos,prc_g_opart,prc_recv_get,recv_sz_get,opt_);
		}
	}

	//! \brief Wait for a pending position exchange (no-op: the blocking version already completed)
	template<typename Vcluster_type, typename vector_prop_type,
			 typename vector_pos_type, typename send_pos_vector,
			 typename prc_recv_get_type, typename prc_g_opart_type,
			 typename recv_sz_get_type>
	static inline void sendrecv_pos_wait(Vcluster_type & v_cl,
			openfpm::vector<send_pos_vector> & g_pos_send,
			vector_prop_type & v_prp,
			vector_pos_type & v_pos,
			prc_recv_get_type & prc_recv_get,
			recv_sz_get_type & recv_sz_get,
			prc_g_opart_type & prc_g_opart,
			size_t opt)
	{}

	//! \brief Wait for a pending property exchange (no-op: the blocking version already completed)
	template<typename Vcluster_type, typename vector_prop_type,
			 typename vector_pos_type, typename send_vector,
			 typename prc_recv_get_type, typename prc_g_opart_type,
			 typename recv_sz_get_type, typename recv_sz_get_byte_type,
			 typename g_opart_sz_type>
	static inline void sendrecv_prp_wait(Vcluster_type & v_cl,
			openfpm::vector<send_vector> & g_send_prp,
			vector_prop_type & v_prp,
			vector_pos_type & v_pos,
			prc_g_opart_type & prc_g_opart,
			prc_recv_get_type & prc_recv_get,
			recv_sz_get_type & recv_sz_get,
			recv_sz_get_byte_type & recv_sz_get_byte,
			g_opart_sz_type & g_opart_sz,
			size_t g_m,
			size_t opt)
	{}
};
177 | |
178 | |
179 | template<template<typename> class layout_base, unsigned int ... prp> |
180 | struct ghost_exchange_comm_impl<GHOST_ASYNC,layout_base, prp ... > |
181 | { |
182 | template<typename Vcluster_type, typename vector_prop_type, |
183 | typename vector_pos_type, typename send_vector, |
184 | typename prc_recv_get_type, typename prc_g_opart_type, |
185 | typename recv_sz_get_type, typename recv_sz_get_byte_type, |
186 | typename g_opart_sz_type> |
187 | static inline void sendrecv_prp(Vcluster_type & v_cl, |
188 | openfpm::vector<send_vector> & g_send_prp, |
189 | vector_prop_type & v_prp, |
190 | vector_pos_type & v_pos, |
191 | prc_g_opart_type & prc_g_opart, |
192 | prc_recv_get_type & prc_recv_get, |
193 | recv_sz_get_type & recv_sz_get, |
194 | recv_sz_get_byte_type & recv_sz_get_byte, |
195 | g_opart_sz_type & g_opart_sz, |
196 | size_t g_m, |
197 | size_t opt) |
198 | { |
199 | prc_recv_get.clear(); |
200 | recv_sz_get.clear(); |
201 | |
202 | // if there are no properties skip |
203 | // SSendRecvP send everything when we do not give properties |
204 | |
205 | if (sizeof...(prp) != 0) |
206 | { |
207 | size_t opt_ = compute_options(opt); |
208 | if (opt & SKIP_LABELLING) |
209 | { |
210 | if (opt & RUN_ON_DEVICE) |
211 | { |
212 | op_ssend_gg_recv_merge_run_device opm(g_m); |
213 | v_cl.template SSendRecvP_opAsync<op_ssend_gg_recv_merge_run_device,send_vector,decltype(v_prp),layout_base,prp...>(g_send_prp,v_prp,prc_g_opart,opm,prc_recv_get,recv_sz_get,opt_); |
214 | } |
215 | else |
216 | { |
217 | op_ssend_gg_recv_merge opm(g_m); |
218 | v_cl.template SSendRecvP_opAsync<op_ssend_gg_recv_merge,send_vector,decltype(v_prp),layout_base,prp...>(g_send_prp,v_prp,prc_g_opart,opm,prc_recv_get,recv_sz_get,opt_); |
219 | } |
220 | } |
221 | else |
222 | {v_cl.template SSendRecvPAsync<send_vector,decltype(v_prp),layout_base,prp...>(g_send_prp,v_prp,prc_g_opart,prc_recv_get,recv_sz_get,recv_sz_get_byte,opt_);} |
223 | } |
224 | |
225 | // fill g_opart_sz |
226 | g_opart_sz.resize(prc_g_opart.size()); |
227 | |
228 | for (size_t i = 0 ; i < prc_g_opart.size() ; i++) |
229 | {g_opart_sz.get(i) = g_send_prp.get(i).size();} |
230 | } |
231 | |
232 | template<typename Vcluster_type, typename vector_prop_type, |
233 | typename vector_pos_type, typename send_pos_vector, |
234 | typename prc_recv_get_type, typename prc_g_opart_type, |
235 | typename recv_sz_get_type> |
236 | static inline void sendrecv_pos(Vcluster_type & v_cl, |
237 | openfpm::vector<send_pos_vector> & g_pos_send, |
238 | vector_prop_type & v_prp, |
239 | vector_pos_type & v_pos, |
240 | prc_recv_get_type & prc_recv_get, |
241 | recv_sz_get_type & recv_sz_get, |
242 | prc_g_opart_type & prc_g_opart, |
243 | size_t opt) |
244 | { |
245 | prc_recv_get.clear(); |
246 | recv_sz_get.clear(); |
247 | |
248 | size_t opt_ = compute_options(opt); |
249 | if (opt & SKIP_LABELLING) |
250 | { |
251 | v_cl.template SSendRecvAsync<send_pos_vector,decltype(v_pos),layout_base>(g_pos_send,v_pos,prc_g_opart,prc_recv_get,recv_sz_get,opt_); |
252 | } |
253 | else |
254 | { |
255 | prc_recv_get.clear(); |
256 | recv_sz_get.clear(); |
257 | v_cl.template SSendRecvAsync<send_pos_vector,decltype(v_pos),layout_base>(g_pos_send,v_pos,prc_g_opart,prc_recv_get,recv_sz_get,opt_); |
258 | } |
259 | } |
260 | |
261 | template<typename Vcluster_type, typename vector_prop_type, |
262 | typename vector_pos_type, typename send_pos_vector, |
263 | typename prc_recv_get_type, typename prc_g_opart_type, |
264 | typename recv_sz_get_type> |
265 | static inline void sendrecv_pos_wait(Vcluster_type & v_cl, |
266 | openfpm::vector<send_pos_vector> & g_pos_send, |
267 | vector_prop_type & v_prp, |
268 | vector_pos_type & v_pos, |
269 | prc_recv_get_type & prc_recv_get, |
270 | recv_sz_get_type & recv_sz_get, |
271 | prc_g_opart_type & prc_g_opart, |
272 | size_t opt) |
273 | { |
274 | size_t opt_ = compute_options(opt); |
275 | if (opt & SKIP_LABELLING) |
276 | { |
277 | v_cl.template SSendRecvWait<send_pos_vector,decltype(v_pos),layout_base>(g_pos_send,v_pos,prc_g_opart,prc_recv_get,recv_sz_get,opt_); |
278 | } |
279 | else |
280 | { |
281 | v_cl.template SSendRecvWait<send_pos_vector,decltype(v_pos),layout_base>(g_pos_send,v_pos,prc_g_opart,prc_recv_get,recv_sz_get,opt_); |
282 | } |
283 | } |
284 | |
285 | template<typename Vcluster_type, typename vector_prop_type, |
286 | typename vector_pos_type, typename send_vector, |
287 | typename prc_recv_get_type, typename prc_g_opart_type, |
288 | typename recv_sz_get_type, typename recv_sz_get_byte_type, |
289 | typename g_opart_sz_type> |
290 | static inline void sendrecv_prp_wait(Vcluster_type & v_cl, |
291 | openfpm::vector<send_vector> & g_send_prp, |
292 | vector_prop_type & v_prp, |
293 | vector_pos_type & v_pos, |
294 | prc_g_opart_type & prc_g_opart, |
295 | prc_recv_get_type & prc_recv_get, |
296 | recv_sz_get_type & recv_sz_get, |
297 | recv_sz_get_byte_type & recv_sz_get_byte, |
298 | g_opart_sz_type & g_opart_sz, |
299 | size_t g_m, |
300 | size_t opt) |
301 | { |
302 | // if there are no properties skip |
303 | // SSendRecvP send everything when we do not give properties |
304 | |
305 | if (sizeof...(prp) != 0) |
306 | { |
307 | size_t opt_ = compute_options(opt); |
308 | if (opt & SKIP_LABELLING) |
309 | { |
310 | if (opt & RUN_ON_DEVICE) |
311 | { |
312 | op_ssend_gg_recv_merge_run_device opm(g_m); |
313 | v_cl.template SSendRecvP_opWait<op_ssend_gg_recv_merge_run_device,send_vector,decltype(v_prp),layout_base,prp...>(g_send_prp,v_prp,prc_g_opart,opm,prc_recv_get,recv_sz_get,opt_); |
314 | } |
315 | else |
316 | { |
317 | op_ssend_gg_recv_merge opm(g_m); |
318 | v_cl.template SSendRecvP_opWait<op_ssend_gg_recv_merge,send_vector,decltype(v_prp),layout_base,prp...>(g_send_prp,v_prp,prc_g_opart,opm,prc_recv_get,recv_sz_get,opt_); |
319 | } |
320 | } |
321 | else |
322 | {v_cl.template SSendRecvPWait<send_vector,decltype(v_prp),layout_base,prp...>(g_send_prp,v_prp,prc_g_opart,prc_recv_get,recv_sz_get,recv_sz_get_byte,opt_);} |
323 | } |
324 | } |
325 | }; |
326 | |
327 | |
328 | /*! \brief This class is an helper for the communication of vector_dist |
329 | * |
330 | * \tparam dim Dimensionality of the space where the elements lives |
331 | * \tparam St type of space float, double ... |
332 | * \tparam prop properties the vector element store in OpenFPM data structure format |
333 | * \tparam Decomposition Decomposition strategy to use CartDecomposition ... |
334 | * \tparam Memory Memory pool where store the information HeapMemory ... |
335 | * |
336 | * \see vector_dist |
337 | * |
338 | */ |
339 | |
340 | template<unsigned int dim, |
341 | typename St, |
342 | typename prop, |
343 | typename Decomposition = CartDecomposition<dim,St>, |
344 | typename Memory = HeapMemory, |
345 | template<typename> class layout_base = memory_traits_lin> |
346 | class vector_dist_comm |
347 | { |
	//! Number of units for each sub-domain
	size_t v_sub_unit_factor = 64;

	//! definition of the send vector for position
	typedef openfpm::vector<Point<dim, St>,Memory,layout_base,openfpm::grow_policy_identity> send_pos_vector;

	//! VCluster (communicator across the processors)
	Vcluster<Memory> & v_cl;

	//! Domain decomposition
	Decomposition dec;

	//! It maps the processor id with the communication request into the map procedure
	openfpm::vector<size_t> p_map_req;

	//! For each near processor, outgoing particle id
	//! \warning opart is assumed to be an ordered list
	//! first id is the particle id
	//! second id is the shift id
	//! third id is the processor id
	openfpm::vector<aggregate<int,int,int>,
			Memory,
			layout_base > m_opart;

	//! Per processor ordered particles id for ghost_get (see prc_g_opart)
	//! For each processor the internal vector store the id of the
	//! particles that must be communicated to the other processors
	openfpm::vector<openfpm::vector<aggregate<size_t,size_t>>> g_opart;

	//! Same as g_opart but on device, the vector of vector is flatten into a single vector
	openfpm::vector<aggregate<unsigned int,unsigned long int>,
			CudaMemory,
			memory_traits_inte> g_opart_device;

	//! Helper buffer for computation (on GPU) of local particles (position)
	openfpm::vector<Point<dim, St>,Memory,layout_base> v_pos_tmp;

	//! Helper buffer for computation (on GPU) of local particles (properties)
	openfpm::vector<prop,Memory,layout_base> v_prp_tmp;

	//! Per processor number of particles: g_opart_sz.get(i) = g_opart.get(i).size()
	openfpm::vector<size_t> g_opart_sz;

	//! processor rank list of g_opart
	openfpm::vector<size_t> prc_g_opart;

	//! Processors that communicated with us (local processor) in the last
	//! ghost_get, separately for the position and the property exchange
	openfpm::vector<size_t> prc_recv_get_pos;
	openfpm::vector<size_t> prc_recv_get_prp;

	//! the same as prc_recv_get but for put
	openfpm::vector<size_t> prc_recv_put;

	//! the same as prc_recv_get but for map
	openfpm::vector<size_t> prc_recv_map;

	//! Number of elements added by each processor that communicated with us
	//! (local processor) in the last ghost_get
	openfpm::vector<size_t> recv_sz_get_pos;
	openfpm::vector<size_t> recv_sz_get_prp;
	//! Conversion to byte of recv_sz_get
	openfpm::vector<size_t> recv_sz_get_byte;


	//! The same as recv_sz_get but for put
	openfpm::vector<size_t> recv_sz_put;

	//! The same as recv_sz_get but for map
	openfpm::vector<size_t> recv_sz_map;

	//! elements sent for each processors (ghost_get)
	openfpm::vector<size_t> prc_sz_gg;

	//! temporary buffer of processor ids
	openfpm::vector<aggregate<unsigned int>,
			Memory,
			layout_base> proc_id_out;

	//! temporary buffer for the scan result
	openfpm::vector<aggregate<unsigned int>,
			Memory,
			layout_base> starts;

	//! Processor communication size
	openfpm::vector<aggregate<unsigned int, unsigned int>,Memory,layout_base> prc_offset;


	//! Temporary CudaMemory used for intermediate device computations
	CudaMemory mem;

	//! Local ghost marker: index in v_prp from which the locally replicated
	//! (periodic) ghost particles start
	size_t lg_m;

	//! Sending buffers, retained across calls so allocations can be reused
	openfpm::vector_fr<Memory> hsmem;
445 | |
	//! Functor that copies only the selected properties of one particle into a send buffer
	template<typename prp_object, int ... prp>
	struct proc_with_prp
	{
		//! process the particle
		//! \param lbl destination processor label (index of the send buffer)
		//! \param cnt position inside the send buffer of processor lbl
		//! \param id source particle id
		//! \param v_prp full particle property vector (source)
		//! \param m_prp per-processor send buffers (destination)
		template<typename T1, typename T2> inline static void proc(size_t lbl, size_t cnt, size_t id, T1 & v_prp, T2 & m_prp)
		{
			// source object type
			typedef encapc<1, prop, typename openfpm::vector<prop>::layout_type> encap_src;
			// destination object type
			typedef encapc<1, prp_object, typename openfpm::vector<prp_object>::layout_type> encap_dst;

			// Copy only the selected properties
			object_si_d<encap_src, encap_dst, OBJ_ENCAP, prp...>(v_prp.get(id), m_prp.get(lbl).get(cnt));
		}
	};
462 | |
463 | /*! \brief Get the number of particles received from each processor during the last ghost_get |
464 | * |
465 | * |
466 | * \param i processor (list index) |
467 | * \return the number of particles |
468 | */ |
469 | size_t get_last_ghost_get_received_parts(size_t i) |
470 | { |
471 | // If the last ghost_get did not have properties the information about the number of particles |
472 | // received is in recv_sz_get_ois |
473 | if (recv_sz_get_prp.size() != 0) |
474 | {return recv_sz_get_prp.get(i);} |
475 | else |
476 | {return recv_sz_get_pos.get(i);} |
477 | } |
478 | |
479 | /*! \brief Get the number of processor involved during the last ghost_get |
480 | * |
481 | * \return the number of processor |
482 | */ |
483 | size_t get_last_ghost_get_num_proc() |
484 | { |
485 | if (prc_recv_get_prp.size() != 0) |
486 | {return prc_recv_get_prp.size();} |
487 | else |
488 | {return prc_recv_get_pos.size();} |
489 | } |
490 | |
491 | /*! \brief Get the number of processor involved during the last ghost_get |
492 | * |
493 | * \return the number of processor |
494 | */ |
495 | openfpm::vector<size_t> & get_last_ghost_get_num_proc_vector() |
496 | { |
497 | if (prc_recv_get_prp.size() != 0) |
498 | {return prc_recv_get_prp;} |
499 | else |
500 | {return prc_recv_get_pos;} |
501 | } |
502 | |
	/*! \brief Calculate sending buffer size for each processor
	 *
	 * \param prc_sz per-processor particle counts produced by the labelling phase
	 * \param prc_sz_r output: number of particles to send to each involved processor
	 * \param prc_r output: ids of the processors to communicate with
	 * \param opt options (RUN_ON_DEVICE selects the device labelling layout)
	 *
	 */
	inline void calc_send_buffers(openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,layout_base> & prc_sz,
			openfpm::vector<size_t> & prc_sz_r,
			openfpm::vector<size_t> & prc_r,
			size_t opt)
	{
		if (opt & RUN_ON_DEVICE)
		{
#ifndef TEST1
			// NOTE: TEST1 is defined at the top of this file, so this branch is
			// currently compiled out. It interprets prc_sz as a scan: entry <0>
			// is the cumulative offset and <1> the processor id, with
			// (unsigned int)-1 marking an unused slot.
			size_t prev_off = 0;
			for (size_t i = 0; i < prc_sz.size() ; i++)
			{
				if (prc_sz.template get<1>(i) != (unsigned int)-1)
				{
					prc_r.add(prc_sz.template get<1>(i));
					prc_sz_r.add(prc_sz.template get<0>(i) - prev_off);
				}
				prev_off = prc_sz.template get<0>(i);
			}
#else

			// Calculate the sending buffer size for each processor, put this information in
			// a contiguous buffer

			for (size_t i = 0; i < v_cl.getProcessingUnits(); i++)
			{
				// skip processors with nothing to send and the local processor itself
				if (prc_sz.template get<0>(i) != 0 && v_cl.rank() != i)
				{
					prc_r.add(i);
					prc_sz_r.add(prc_sz.template get<0>(i));
				}
			}

#endif
		}
		else
		{
			// Calculate the sending buffer size for each processor, put this information in
			// a contiguous buffer

			// p_map_req records, for each processor id, the position of its entry in prc_r
			p_map_req.resize(v_cl.getProcessingUnits());
			for (size_t i = 0; i < v_cl.getProcessingUnits(); i++)
			{
				if (prc_sz.template get<0>(i) != 0)
				{
					p_map_req.get(i) = prc_r.size();
					prc_r.add(i);
					prc_sz_r.add(prc_sz.template get<0>(i));
				}
			}
		}
	}
560 | |
	//! From which decomposition the shift boxes are calculated (cache key for createShiftBox)
	long int shift_box_ndec = -1;

	//! this map is used to check if a shift combination is already present
	//! (key: linearized combination, value: group index in box_f)
	std::unordered_map<size_t, size_t> map_cmb;

	//! The boxes touching the border of the domain are divided in groups (first vector)
	//! each group contains internal ghosts coming from sub-domains of the same section
	openfpm::vector_std<openfpm::vector_std<Box<dim, St>>> box_f;

	//! The boxes touching the border of the domain + shift vector linearized from where they come from
	openfpm::vector<Box<dim, St>,Memory,layout_base> box_f_dev;
	openfpm::vector<aggregate<unsigned int>,Memory,layout_base> box_f_sv;

	//! Store the sector (shift combination) for each group (previous vector)
	openfpm::vector_std<comb<dim>> box_cmb;

	//! Id of the local particle to replicate for ghost_get
	//! first: particle id, second: linearized shift id
	openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,layout_base> o_part_loc;

	//! Processor communication size
	openfpm::vector<aggregate<unsigned int, unsigned int>,Memory,layout_base> prc_sz;
583 | |
	/*! \brief For every internal local ghost box, group and order the boxes by shift vector
	 *
	 * Builds box_f / box_cmb / map_cmb (boxes grouped by shift combination) and
	 * the flattened, shift-id-sorted copies box_f_dev / box_f_sv.
	 * Cached: recomputed only when the decomposition changed (shift_box_ndec).
	 *
	 */
	void createShiftBox()
	{
		// already up to date for the current decomposition
		if (shift_box_ndec == (long int)dec.get_ndec())
		{return;}

		// helper used to sort the boxes by their shift id
		struct sh_box
		{
			size_t shift_id;

			unsigned int box_f_sv;
			Box<dim,St> box_f_dev;

			bool operator<(const sh_box & tmp) const
			{
				return shift_id < tmp.shift_id;
			}

		};
		openfpm::vector<sh_box> reord_shift;
		box_f.clear();
		map_cmb.clear();
		box_cmb.clear();

		// Add local particles coming from periodic boundary, the only boxes that count are the one
		// touching the border
		for (size_t i = 0; i < dec.getNLocalSub(); i++)
		{
			size_t Nl = dec.getLocalNIGhost(i);

			for (size_t j = 0; j < Nl; j++)
			{
				// If the ghost does not come from the intersection with an out of
				// border sub-domain the combination is all zero and n_zero return dim
				if (dec.getLocalIGhostPos(i, j).n_zero() == dim)
					continue;

				// Check if we already have boxes with such combination
				auto it = map_cmb.find(dec.getLocalIGhostPos(i, j).lin());
				if (it == map_cmb.end())
				{
					// we do not have it: open a new group for this shift combination
					box_f.add();
					box_f.last().add(dec.getLocalIGhostBox(i, j));
					box_cmb.add(dec.getLocalIGhostPos(i, j));
					map_cmb[dec.getLocalIGhostPos(i, j).lin()] = box_f.size() - 1;
				}
				else
				{
					// we have it: append to the existing group
					box_f.get(it->second).add(dec.getLocalIGhostBox(i, j));
				}

				// also record the box in the flat list used for the sorted copy below
				reord_shift.add();
				reord_shift.last().shift_id = dec.getLocalIGhostPos(i, j).lin();
				reord_shift.last().box_f_dev = dec.getLocalIGhostBox(i, j);
				reord_shift.last().box_f_sv = dec.convertShift(dec.getLocalIGhostPos(i, j));
			}
		}

		// now we sort box_f by shift_id, the reason is that we have to avoid duplicated particles
		reord_shift.sort();

		box_f_dev.resize(reord_shift.size());
		box_f_sv.resize(reord_shift.size());

		for (size_t i = 0 ; i < reord_shift.size() ; i++)
		{
			box_f_dev.get(i) = reord_shift.get(i).box_f_dev;
			box_f_sv.template get<0>(i) = reord_shift.get(i).box_f_sv;
		}

#ifdef CUDA_GPU

		// move box_f_dev and box_f_sv to device
		box_f_dev.template hostToDevice<0,1>();
		box_f_sv.template hostToDevice<0>();

#endif

		// remember for which decomposition these boxes were built
		shift_box_ndec = dec.get_ndec();
	}
669 | |
	/*! \brief Local ghost from labeled particles
	 *
	 * Replicates the local (periodic) ghost particles using the labelling
	 * stored in o_part_loc (produced by a previous local_ghost_from_dec).
	 *
	 * \param v_pos vector of particle positions
	 * \param v_prp vector of particles properties
	 * \param opt options (RUN_ON_DEVICE, NO_POSITION, ...)
	 *
	 */
	void local_ghost_from_opart(openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos,
			openfpm::vector<prop,Memory,layout_base> & v_prp,
			size_t opt)
	{
		// get the shift vectors
		const openfpm::vector<Point<dim, St>,Memory,layout_base> & shifts = dec.getShiftVectors();

		if (!(opt & NO_POSITION))
		{
			if (opt & RUN_ON_DEVICE)
			{
				local_ghost_from_opart_impl<true,dim,St,prop,Memory,layout_base,std::is_same<Memory,CudaMemory>::value>
				::run(o_part_loc,shifts,v_pos,v_prp,opt);
			}
			else
			{
				for (size_t i = 0 ; i < o_part_loc.size() ; i++)
				{
					size_t lin_id = o_part_loc.template get<1>(i);
					size_t key = o_part_loc.template get<0>(i);

					Point<dim, St> p = v_pos.get(key);
					// shift
					p -= shifts.get(lin_id);

					// add this particle shifting its position
					// NOTE(review): the position is appended while the property is written
					// at lg_m+i — assumes v_prp already holds those slots; verify against callers
					v_pos.add(p);
					v_prp.get(lg_m+i) = v_prp.get(key);
				}
			}
		}
		else
		{
			// positions are untouched, only the properties of the replicated
			// particles are refreshed
			if (opt & RUN_ON_DEVICE)
			{
				local_ghost_from_opart_impl<false,dim,St,prop,Memory,layout_base,std::is_same<Memory,CudaMemory>::value>
				::run(o_part_loc,shifts,v_pos,v_prp,opt);
			}
			else
			{
				for (size_t i = 0 ; i < o_part_loc.size() ; i++)
				{
					size_t key = o_part_loc.template get<0>(i);

					v_prp.get(lg_m+i) = v_prp.get(key);
				}
			}
		}
	}
726 | |
	/*! \brief Local ghost from decomposition
	 *
	 * Labels the local particles falling inside the border ghost boxes and
	 * replicates them with the periodic shift, filling o_part_loc so that a
	 * later SKIP_LABELLING call can reuse the labelling.
	 *
	 * \param v_pos vector of particle positions
	 * \param v_prp vector of particle properties
	 * \param g_m ghost marker (only particles below it are examined)
	 * \param opt options (RUN_ON_DEVICE)
	 *
	 */
	void local_ghost_from_dec(openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos,
			openfpm::vector<prop,Memory,layout_base> & v_prp,
			size_t g_m,size_t opt)
	{
		o_part_loc.clear();

		// get the shift vectors
		const openfpm::vector<Point<dim,St>,Memory,layout_base> & shifts = dec.getShiftVectors();

		if (opt & RUN_ON_DEVICE)
		{
			local_ghost_from_dec_impl<dim,St,prop,Memory,layout_base,std::is_same<Memory,CudaMemory>::value>
			::run(o_part_loc,shifts,box_f_dev,box_f_sv,v_cl,starts,v_pos,v_prp,g_m,opt);
		}
		else
		{
			// Label the internal (assigned) particles
			auto it = v_pos.getIteratorTo(g_m);

			while (it.isNext())
			{
				auto key = it.get();

				// If particles are inside these boxes
				for (size_t i = 0; i < box_f.size(); i++)
				{
					for (size_t j = 0; j < box_f.get(i).size(); j++)
					{
						if (box_f.get(i).get(j).isInsideNP(v_pos.get(key)) == true)
						{
							size_t lin_id = dec.convertShift(box_cmb.get(i));

							// record particle id and shift id for later reuse
							// (SKIP_LABELLING path, see local_ghost_from_opart)
							o_part_loc.add();
							o_part_loc.template get<0>(o_part_loc.size()-1) = key;
							o_part_loc.template get<1>(o_part_loc.size()-1) = lin_id;

							Point<dim, St> p = v_pos.get(key);
							// shift
							p -= shifts.get(lin_id);

							// add this particle shifting its position
							v_pos.add(p);
							v_prp.add();
							v_prp.last() = v_prp.get(key);

							// boxes in one group can be overlapping
							// we do not have to search for the other
							// boxes otherwise we will have duplicate particles
							//
							// A small note overlap of boxes across groups is fine
							// (and needed) because each group has different shift
							// producing non overlapping particles
							//
							break;
						}
					}
				}

				++it;
			}
		}
	}
796 | |
797 | /*! \brief Add local particles based on the boundary conditions |
798 | * |
799 | * In order to understand what this function use the following |
800 | * |
801 | \verbatim |
802 | |
803 | [1,1] |
804 | +---------+------------------------+---------+ |
805 | | (1,-1) | | (1,1) | |
806 | | | | (1,0) --> 7 | | | |
807 | | v | | v | |
808 | | 6 | | 8 | |
809 | +--------------------------------------------+ |
810 | | | | | |
811 | | | | | |
812 | | | | | |
813 | | (-1,0) | | (1,0) | |
814 | | | | | | | |
815 | | v | (0,0) --> 4 | v | |
816 | | 3 | | 5 | |
817 | | | | | |
818 | B | | | A | |
819 | * | | | * | |
820 | | | | | |
821 | | | | | |
822 | | | | | |
823 | +--------------------------------------------+ |
824 | | (-1,-1) | | (-1,1) | |
825 | | | | (-1,0) --> 1 | | | |
826 | | v | | v | |
827 | | 0 | | 2 | |
828 | +---------+------------------------+---------+ |
829 | |
830 | |
831 | \endverbatim |
832 | |
833 | * |
834 | * The box is the domain, while all boxes at the border (so not (0,0) ) are the |
835 | * ghost part at the border of the domain. If a particle A is in the position in figure |
 * a particle B must be created. This function duplicates particle A, if A and B are
 * local
838 | * |
839 | * \param v_pos vector of particle of positions |
840 | * \param v_prp vector of particle properties |
841 | * \param g_m ghost marker |
842 | * \param opt options |
843 | * |
844 | */ |
845 | void add_loc_particles_bc(openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos, |
846 | openfpm::vector<prop,Memory,layout_base> & v_prp , |
847 | size_t & g_m, |
848 | size_t opt) |
849 | { |
850 | // Create the shift boxes |
851 | createShiftBox(); |
852 | |
853 | if (!(opt & SKIP_LABELLING)) |
854 | lg_m = v_prp.size(); |
855 | |
856 | if (box_f.size() == 0) |
857 | return; |
858 | else |
859 | { |
860 | if (opt & SKIP_LABELLING) |
861 | {local_ghost_from_opart(v_pos,v_prp,opt);} |
862 | else |
863 | {local_ghost_from_dec(v_pos,v_prp,g_m,opt);} |
864 | } |
865 | } |
866 | |
	/*! \brief Fill the send buffers for the particle positions after labelParticles has run
	 *
	 * \param v_pos vector of particle positions
	 * \param prc_sz number of particles to send to each destination processor
	 * \param g_pos_send send buffers to fill (one per destination processor)
	 * \param opt options (RUN_ON_DEVICE)
	 * \param async if true new retained buffers are appended after the existing
	 *        ones instead of reusing them
	 *
	 */
	void fill_send_ghost_pos_buf(openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos,
			openfpm::vector<size_t> & prc_sz,
			openfpm::vector<send_pos_vector> & g_pos_send,
			size_t opt,
			bool async)
	{
		// get the shift vectors
		const openfpm::vector<Point<dim,St>,Memory,layout_base> & shifts = dec.getShiftVectors();

		// create a number of send buffers equal to the near processors
		g_pos_send.resize(prc_sz.size());

		size_t old_hsmem_size = 0;

		// if we do async: the existing retained buffers may still be in use,
		// so append fresh ones after them
		if (async == true)
		{
			old_hsmem_size = hsmem.size();
			resize_retained_buffer(hsmem,g_pos_send.size() + hsmem.size());
		}
		else
		{resize_retained_buffer(hsmem,g_pos_send.size());}

		for (size_t i = 0; i < g_pos_send.size(); i++)
		{
			// Buffer must retained and survive the destruction of the
			// vector
			if (hsmem.get(i+old_hsmem_size).ref() == 0)
			{hsmem.get(i+old_hsmem_size).incRef();}

			// Set the memory for retain the send buffer
			g_pos_send.get(i).setMemory(hsmem.get(i+old_hsmem_size));

			// resize the sending vector (No allocation is produced)
			g_pos_send.get(i).resize(prc_sz.get(i));
		}

		if (opt & RUN_ON_DEVICE)
		{
#if defined(CUDA_GPU) && defined(__NVCC__)

			size_t offset = 0;

			// Fill the sending buffers on the device; g_opart_device is the
			// flattened per-processor labelling, offset selects processor i's slice
			for (size_t i = 0 ; i < g_pos_send.size() ; i++)
			{
				auto ite = g_pos_send.get(i).getGPUIterator();

				CUDA_LAUNCH((process_ghost_particles_pos<dim,decltype(g_opart_device.toKernel()),decltype(g_pos_send.get(i).toKernel()),decltype(v_pos.toKernel()),decltype(shifts.toKernel())>),
						ite,
						g_opart_device.toKernel(), g_pos_send.get(i).toKernel(),
						v_pos.toKernel(),shifts.toKernel(),offset);

				offset += prc_sz.get(i);
			}

#else

			std::cout << __FILE__ << ":" << __LINE__ << " error RUN_ON_DEVICE require that you compile with NVCC, but it seem compiled with a normal compiler" << std::endl;

#endif
		}
		else
		{
			// Fill the send buffer: apply the periodic shift to every labeled
			// particle and write it into the buffer of its destination processor
			for (size_t i = 0; i < g_opart.size(); i++)
			{
				for (size_t j = 0; j < g_opart.get(i).size(); j++)
				{
					Point<dim, St> s = v_pos.get(g_opart.get(i).template get<0>(j));
					s -= shifts.get(g_opart.get(i).template get<1>(j));
					g_pos_send.get(i).set(j, s);
				}
			}
		}
	}
949 | |
	/*! \brief This function fill the send buffer for ghost_put
	 *
	 * The buffers are sized from the reception pattern of the last ghost_get:
	 * ghost_put sends back, to each processor we received from, the properties
	 * of the ghost particles it gave us.
	 *
	 * \tparam send_vector type used to send data
	 * \tparam prp_object object containing only the properties to send
	 * \tparam prp set of properties to send
	 *
	 * \param v_prp vector of particle properties
	 * \param g_send_prp Send buffer to fill
	 * \param g_m ghost marker (ghost particles are stored from index g_m onward)
	 * \param opt options; with RUN_ON_DEVICE the buffers are filled by a CUDA kernel
	 *
	 */
	template<typename send_vector, typename prp_object, int ... prp>
	void fill_send_ghost_put_prp_buf(openfpm::vector<prop,Memory,layout_base> & v_prp,
									 openfpm::vector<send_vector> & g_send_prp,
									 size_t & g_m,
									 size_t opt)
	{
		// create a number of send buffers equal to the near processors
		// from which we received

		// NOTE in some case the information can be in prc_recv_get_pos

		size_t nproc = get_last_ghost_get_num_proc();

		g_send_prp.resize(nproc);

		resize_retained_buffer(hsmem,g_send_prp.size());

		for (size_t i = 0; i < g_send_prp.size(); i++)
		{
			// Buffer must retained and survive the destruction of the
			// vector
			if (hsmem.get(i).ref() == 0)
				hsmem.get(i).incRef();

			// Set the memory for retain the send buffer
			g_send_prp.get(i).setMemory(hsmem.get(i));

			// size it to exactly what we received from processor i in the
			// last ghost_get
			size_t n_part_recv = get_last_ghost_get_received_parts(i);

			// resize the sending vector (No allocation is produced)
			g_send_prp.get(i).resize(n_part_recv);
		}

		// running offset into the ghost section of v_prp: the ghosts received
		// from each processor are stored contiguously starting at g_m
		size_t accum = g_m;

		if (opt & RUN_ON_DEVICE)
		{
#if defined(CUDA_GPU) && defined(__NVCC__)

			if (sizeof...(prp) != 0)
			{
				// Fill the sending buffers
				for (size_t i = 0 ; i < g_send_prp.size() ; i++)
				{
					size_t n_part_recv = get_last_ghost_get_received_parts(i);

					auto ite = g_send_prp.get(i).getGPUIterator();

					if (ite.nblocks() == 0) {continue;}

					CUDA_LAUNCH((process_ghost_particles_prp_put<decltype(g_send_prp.get(i).toKernel()),decltype(v_prp.toKernel()),prp...>),
							ite,
							g_send_prp.get(i).toKernel(),
							v_prp.toKernel(),accum);

					accum = accum + n_part_recv;
				}
			}

#else

			std::cout << __FILE__ << ":" << __LINE__ << " error RUN_ON_DEVICE require that you compile with NVCC, but it seem compiled with a normal compiler" << std::endl;

#endif
		}
		else
		{
			// Fill the send buffer
			for (size_t i = 0; i < g_send_prp.size(); i++)
			{
				size_t j2 = 0;
				size_t n_part_recv = get_last_ghost_get_received_parts(i);

				for (size_t j = accum; j < accum + n_part_recv; j++)
				{
					// source object type
					typedef encapc<1, prop, typename openfpm::vector<prop,Memory,layout_base>::layout_type> encap_src;
					// destination object type
					typedef encapc<1, prp_object, typename openfpm::vector<prp_object,Memory,layout_base>::layout_type> encap_dst;

					// Copy only the selected properties
					object_si_d<encap_src, encap_dst, OBJ_ENCAP, prp...>(v_prp.get(j), g_send_prp.get(i).get(j2));

					j2++;
				}

				accum = accum + n_part_recv;
			}
		}
	}
1051 | |
1052 | /*! \brief resize the retained buffer by nbf |
1053 | * |
1054 | * |
1055 | */ |
1056 | void resize_retained_buffer(openfpm::vector_fr<Memory> & rt_buf, size_t nbf) |
1057 | { |
1058 | // Release all the buffer that are going to be deleted |
1059 | for (size_t i = nbf ; i < rt_buf.size() ; i++) |
1060 | { |
1061 | rt_buf.get(i).decRef(); |
1062 | } |
1063 | |
1064 | hsmem.resize(nbf); |
1065 | } |
1066 | |
	/*! \brief Set the buffer for each property
	 *
	 * Functor for boost::mpl::for_each_ref: for the interleaved (SoA)
	 * layout each property of the send buffer gets its own retained
	 * memory chunk, so one hsmem slot is consumed per property.
	 *
	 */
	template<typename send_vector, typename v_mpl>
	struct set_mem_retained_buffers_inte
	{
		//! send buffers, one per destination processor
		openfpm::vector<send_vector> & g_send_prp;

		//! index of the send buffer (destination processor) being bound
		size_t i;

		//! pool of retained memory buffers
		openfpm::vector_fr<Memory> & hsmem;

		//! index of the next free retained buffer; incremented per property,
		//! read back by the caller after the for_each_ref traversal
		size_t j;

		//! constructor
		set_mem_retained_buffers_inte(openfpm::vector<send_vector> & g_send_prp, size_t i ,
									  openfpm::vector_fr<Memory> & hsmem, size_t j)
		:g_send_prp(g_send_prp),i(i),hsmem(hsmem),j(j)
		{}

		//! It call the setMemory function for each property
		template<typename T>
		inline void operator()(T& t)
		{
			g_send_prp.get(i).template setMemory<T::value>(hsmem.get(j));

			j++;
		}
	};
1096 | |
1097 | template<bool inte_or_lin,typename send_vector, typename v_mpl> |
1098 | struct set_mem_retained_buffers |
1099 | { |
1100 | static inline size_t set_mem_retained_buffers_(openfpm::vector<send_vector> & g_send_prp, |
1101 | openfpm::vector<size_t> & prc_sz, |
1102 | size_t i, |
1103 | openfpm::vector_fr<Memory> & hsmem, |
1104 | size_t j) |
1105 | { |
1106 | // Set the memory for retain the send buffer |
1107 | g_send_prp.get(i).setMemory(hsmem.get(j)); |
1108 | |
1109 | // resize the sending vector (No allocation is produced) |
1110 | g_send_prp.get(i).resize(prc_sz.get(i)); |
1111 | |
1112 | return j+1; |
1113 | } |
1114 | }; |
1115 | |
	/*! \brief Specialization for the interleaved (SoA) layout: one retained
	 *         buffer is bound per property of the send buffer
	 */
	template<typename send_vector, typename v_mpl>
	struct set_mem_retained_buffers<true,send_vector,v_mpl>
	{
		//! bind one hsmem slot per property, then size the buffer;
		//! returns the index of the next unused retained buffer
		static inline size_t set_mem_retained_buffers_(openfpm::vector<send_vector> & g_send_prp,
													 openfpm::vector<size_t> & prc_sz,
													 size_t i,
													 openfpm::vector_fr<Memory> & hsmem,
													 size_t j)
		{
			set_mem_retained_buffers_inte<send_vector,v_mpl> smrbi(g_send_prp,i,hsmem,j);

			// invoke the functor once per property in v_mpl, advancing smrbi.j
			boost::mpl::for_each_ref<boost::mpl::range_c<int,0,boost::mpl::size<v_mpl>::type::value>>(smrbi);

			// if we do not send properties do not reallocate
			if (boost::mpl::size<v_mpl>::type::value != 0)
			{
				// resize the sending vector (No allocation is produced)
				g_send_prp.get(i).resize(prc_sz.get(i));
			}

			return smrbi.j;
		}
	};
1139 | |
	/*! \brief This function fill the send buffer for properties after the particles has been label with labelParticles
	 *
	 * \tparam send_vector type used to send data
	 * \tparam prp_object object containing only the properties to send
	 * \tparam prp set of properties to send
	 *
	 * \param v_prp vector of particle properties
	 * \param prc_sz for each destination processor the number of particles to send
	 * \param g_send_prp Send buffer to fill
	 * \param opt options; with RUN_ON_DEVICE the buffers are filled by a CUDA kernel
	 *
	 */
	template<typename send_vector, typename prp_object, int ... prp>
	void fill_send_ghost_prp_buf(openfpm::vector<prop,Memory,layout_base> & v_prp,
								 openfpm::vector<size_t> & prc_sz,
			                     openfpm::vector<send_vector> & g_send_prp,
			                     size_t opt)
	{
		size_t factor = 1;

		typedef typename to_boost_vmpl<prp...>::type v_mpl;

		// with an interleaved (SoA) layout every property needs its own
		// retained memory chunk, so we need sizeof...(prp) buffers per processor
		if (is_layout_inte<layout_base<prop>>::value == true) {factor *= sizeof...(prp);}

		// create a number of send buffers equal to the near processors
		g_send_prp.resize(prc_sz.size());

		resize_retained_buffer(hsmem,g_send_prp.size()*factor);

		for (size_t i = 0; i < hsmem.size(); i++)
		{
			// Buffer must retained and survive the destruction of the
			// vector
			if (hsmem.get(i).ref() == 0)
			{hsmem.get(i).incRef();}
		}

		// j tracks the next unused retained buffer; the layout-dependent
		// helper consumes one (linear) or sizeof...(prp) (interleaved) per buffer
		size_t j = 0;
		for (size_t i = 0; i < g_send_prp.size(); i++)
		{
			j = set_mem_retained_buffers<is_layout_inte<layout_base<prop>>::value,send_vector,v_mpl>::set_mem_retained_buffers_(g_send_prp,prc_sz,i,hsmem,j);
		}

		if (opt & RUN_ON_DEVICE)
		{
#if defined(CUDA_GPU) && defined(__NVCC__)

			size_t offset = 0;

			if (sizeof...(prp) != 0)
			{
				// Fill the sending buffers: g_opart_device holds the labelled
				// particles contiguously, offset selects the slice for processor i
				for (size_t i = 0 ; i < g_send_prp.size() ; i++)
				{
					auto ite = g_send_prp.get(i).getGPUIterator();

					CUDA_LAUNCH((process_ghost_particles_prp<decltype(g_opart_device.toKernel()),decltype(g_send_prp.get(i).toKernel()),decltype(v_prp.toKernel()),prp...>),
							ite,
							g_opart_device.toKernel(), g_send_prp.get(i).toKernel(),
							v_prp.toKernel(),offset);

					offset += prc_sz.get(i);
				}
			}

#else

			std::cout << __FILE__ << ":" << __LINE__ << " error RUN_ON_DEVICE require that you compile with NVCC, but it seem compiled with a normal compiler" << std::endl;

#endif
		}
		else
		{
			// if no properties must be sent skip this step
			if (sizeof...(prp) == 0) {return;}

			// Fill the send buffer
			for (size_t i = 0; i < g_opart.size(); i++)
			{
				for (size_t j = 0; j < g_opart.get(i).size(); j++)
				{
					// source object type
					typedef decltype(v_prp.get(g_opart.get(i).template get<0>(j))) encap_src;
					// destination object type
					typedef decltype(g_send_prp.get(i).get(j)) encap_dst;

					// Copy only the selected properties
					object_si_d<encap_src, encap_dst, OBJ_ENCAP, prp...>(v_prp.get(g_opart.get(i).template get<0>(j)), g_send_prp.get(i).get(j));
				}
			}
		}
	}
1230 | |
	/*! \brief allocate and fill the send buffer for the map function
	 *
	 * \param v_pos vector of particle positions
	 * \param v_prp vector of particles properties
	 * \param prc_sz_r For each processor in the list the size of the message to send
	 * \param prc_r list of destination processor ranks (used only on the device path)
	 * \param m_pos sending buffer for position
	 * \param m_prp sending buffer for properties
	 * \param prc_sz per-processor particle counts produced by labelParticleProcessor;
	 *        the entry for this rank counts the particles that stay local.
	 *        This parameter is used only in case of RUN_ON_DEVICE option
	 * \param opt options
	 *
	 */
	void fill_send_map_buf(openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos,
						   openfpm::vector<prop,Memory,layout_base> & v_prp,
						   openfpm::vector<size_t> & prc_sz_r,
						   openfpm::vector<size_t> & prc_r,
			               openfpm::vector<openfpm::vector<Point<dim,St>,Memory,layout_base,openfpm::grow_policy_identity>> & m_pos,
			               openfpm::vector<openfpm::vector<prop,Memory,layout_base,openfpm::grow_policy_identity>> & m_prp,
			               openfpm::vector<aggregate<unsigned int, unsigned int>,Memory,layout_base> & prc_sz,
			               size_t opt)
	{
		m_prp.resize(prc_sz_r.size());
		m_pos.resize(prc_sz_r.size());
		openfpm::vector<size_t> cnt(prc_sz_r.size());

		for (size_t i = 0; i < prc_sz_r.size() ; i++)
		{
			// set the size and allocate, using mem warant that pos and prp is contiguous
			m_pos.get(i).resize(prc_sz_r.get(i));
			m_prp.get(i).resize(prc_sz_r.get(i));
			cnt.get(i) = 0;
		}

		if (opt & RUN_ON_DEVICE)
		{
			// single process: nothing migrates, buffers stay empty
			if (v_cl.size() == 1)
			{return;}

#if defined(CUDA_GPU) && defined(__NVCC__)

			// The first part of m_opart and prc_sz contain the local particles

			// NOTE(review): TEST1 is defined at the top of this file, so the
			// #ifndef TEST1 branch below is compiled out (dead code kept for
			// reference). In that dead branch `offset += prc_sz_r.size();`
			// looks wrong (presumably it should advance by prc_sz_r.get(i)) —
			// confirm before ever re-enabling it.
#ifndef TEST1

			v_pos_tmp.resize(prc_sz.template get<0>(0));
			v_prp_tmp.resize(prc_sz.template get<0>(0));

			auto ite = v_pos_tmp.getGPUIterator();

			// fill v_pos_tmp and v_prp_tmp with local particles
			process_map_particles<decltype(m_opart.toKernel()),decltype(v_pos_tmp.toKernel()),decltype(v_prp_tmp.toKernel()),
														 decltype(v_pos.toKernel()),decltype(v_prp.toKernel())>
			<<<ite.wthr,ite.thr>>>
			(m_opart.toKernel(),v_pos_tmp.toKernel(), v_prp_tmp.toKernel(),
					v_pos.toKernel(),v_prp.toKernel(),0);

			size_t offset = prc_sz.template get<0>(0);

			// Fill the sending buffers
			for (size_t i = 0 ; i < m_pos.size() ; i++)
			{
				auto ite = m_pos.get(i).getGPUIterator();

				process_map_particles<decltype(m_opart.toKernel()),decltype(m_pos.get(i).toKernel()),decltype(m_prp.get(i).toKernel()),
															 decltype(v_pos.toKernel()),decltype(v_prp.toKernel())>
				<<<ite.wthr,ite.thr>>>
				(m_opart.toKernel(),m_pos.get(i).toKernel(), m_prp.get(i).toKernel(),
						v_pos.toKernel(),v_prp.toKernel(),offset);

				offset += prc_sz_r.size();
			}

			// old local particles with the actual local particles
			v_pos_tmp.swap(v_pos);
			v_prp_tmp.swap(v_prp);

#else

			int rank = v_cl.rank();

			// the entry of prc_sz for our own rank counts the particles
			// that do not migrate
			v_pos_tmp.resize(prc_sz.template get<0>(rank));
			v_prp_tmp.resize(prc_sz.template get<0>(rank));

			auto ite = v_pos_tmp.getGPUIterator();

			// starts holds the exclusive scan of prc_sz: the offset of each
			// processor's slice inside the sorted m_opart
			starts.template deviceToHost<0>();
			size_t offset = starts.template get<0>(rank);

			// no work to do
			if (ite.wthr.x != 0)
			{
				// fill v_pos_tmp and v_prp_tmp with local particles
				CUDA_LAUNCH((process_map_particles<decltype(m_opart.toKernel()),decltype(v_pos_tmp.toKernel()),decltype(v_prp_tmp.toKernel()),
															 decltype(v_pos.toKernel()),decltype(v_prp.toKernel())>),
				ite,
				m_opart.toKernel(),v_pos_tmp.toKernel(), v_prp_tmp.toKernel(),
						v_pos.toKernel(),v_prp.toKernel(),offset);
			}

			// Fill the sending buffers
			for (size_t i = 0 ; i < m_pos.size() ; i++)
			{
				size_t offset = starts.template get<0>(prc_r.template get<0>(i));

				auto ite = m_pos.get(i).getGPUIterator();

				// no work to do
				if (ite.wthr.x != 0)
				{

					CUDA_LAUNCH((process_map_particles<decltype(m_opart.toKernel()),decltype(m_pos.get(i).toKernel()),decltype(m_prp.get(i).toKernel()),
																 decltype(v_pos.toKernel()),decltype(v_prp.toKernel())>),
					ite,
					m_opart.toKernel(),m_pos.get(i).toKernel(), m_prp.get(i).toKernel(),
							v_pos.toKernel(),v_prp.toKernel(),offset);

				}
			}

			// old local particles with the actual local particles
			v_pos_tmp.swap(v_pos);
			v_prp_tmp.swap(v_prp);

#endif
#else

			std::cout << __FILE__ << ":" << __LINE__ << " error RUN_ON_DEVICE require that you compile with NVCC, but it seem compiled with a normal compiler" << std::endl;

#endif
		}
		else
		{
			// end vector point
			long int id_end = v_pos.size();

			// end opart point
			long int end = m_opart.size()-1;

			// Run through all the particles and fill the sending buffer
			for (size_t i = 0; i < m_opart.size(); i++)
			{
				process_map_particle<proc_without_prp>(i,end,id_end,m_opart,p_map_req,m_pos,m_prp,v_pos,v_prp,cnt);
			}

			// shrink: the migrated particles have been moved out of v_pos/v_prp
			v_pos.resize(v_pos.size() - m_opart.size());
			v_prp.resize(v_prp.size() - m_opart.size());
		}
	}
1378 | |
1379 | |
1380 | /*! \brief allocate and fill the send buffer for the map function |
1381 | * |
1382 | * \tparam prp_object object type to send |
1383 | * \tparam prp properties to send |
1384 | * |
1385 | * \param v_pos vector of particle positions |
1386 | * \param v_prp vector of particle properties |
1387 | * \param prc_sz_r number of particles to send for each processor |
1388 | * \param m_pos sending buffer for position |
1389 | * \param m_prp sending buffer for properties |
1390 | * |
1391 | */ |
1392 | template<typename prp_object,int ... prp> |
1393 | void fill_send_map_buf_list(openfpm::vector<Point<dim, St>> & v_pos, |
1394 | openfpm::vector<prop,Memory,layout_base> & v_prp, |
1395 | openfpm::vector<size_t> & prc_sz_r, |
1396 | openfpm::vector<openfpm::vector<Point<dim,St>>> & m_pos, |
1397 | openfpm::vector<openfpm::vector<prp_object>> & m_prp) |
1398 | { |
1399 | m_prp.resize(prc_sz_r.size()); |
1400 | m_pos.resize(prc_sz_r.size()); |
1401 | openfpm::vector<size_t> cnt(prc_sz_r.size()); |
1402 | |
1403 | for (size_t i = 0; i < prc_sz_r.size(); i++) |
1404 | { |
1405 | // set the size and allocate, using mem warant that pos and prp is contiguous |
1406 | m_pos.get(i).resize(prc_sz_r.get(i)); |
1407 | m_prp.get(i).resize(prc_sz_r.get(i)); |
1408 | cnt.get(i) = 0; |
1409 | } |
1410 | |
1411 | // end vector point |
1412 | long int id_end = v_pos.size(); |
1413 | |
1414 | // end opart point |
1415 | long int end = m_opart.size()-1; |
1416 | |
1417 | // Run through all the particles and fill the sending buffer |
1418 | for (size_t i = 0; i < m_opart.size(); i++) |
1419 | { |
1420 | process_map_particle<proc_with_prp<prp_object,prp...>>(i,end,id_end,m_opart,p_map_req,m_pos,m_prp,v_pos,v_prp,cnt); |
1421 | } |
1422 | |
1423 | v_pos.resize(v_pos.size() - m_opart.size()); |
1424 | v_prp.resize(v_prp.size() - m_opart.size()); |
1425 | } |
1426 | |
	/*! \brief Label particles for mappings
	 *
	 * For every particle decide the processor that should own it (after
	 * applying the boundary conditions) and count, per processor, how many
	 * particles must be sent.
	 *
	 * \param v_pos vector of particle positions
	 * \param lbl_p Particle labeled: per outgoing particle <particle id, ?, destination processor>
	 * \param prc_sz For each processor the number of particles to send
	 * \param opt options
	 *
	 */
	template<typename obp> void labelParticleProcessor(openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos,
			                                           openfpm::vector<aggregate<int,int,int>,
			                                                           Memory,
			                                                           layout_base> & lbl_p,
			                                           openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,layout_base> & prc_sz,
			                                           size_t opt)
	{
		// NOTE(review): this uses `opt == RUN_ON_DEVICE`, unlike the
		// `opt & RUN_ON_DEVICE` tests elsewhere in this file; with combined
		// flags the call would fall through to the CPU path — confirm intended
		if (opt == RUN_ON_DEVICE)
		{
#ifdef __NVCC__

			// Map directly on gpu

			lbl_p.resize(v_pos.size());

			// labelling kernel

			prc_sz.template fill<0>(0);

			auto ite = v_pos.getGPUIterator();
			// no particles: make sure the scan buffer is consistent and leave
			if (ite.wthr.x == 0)
			{
				starts.resize(v_cl.size());
				starts.template fill<0>(0);
				return;
			}

			// we have one process we can skip ...
			if (v_cl.size() == 1)
			{
				// ... but we have to apply the boundary conditions

				periodicity_int<dim> bc;

				for (size_t i = 0 ; i < dim ; i++)	{bc.bc[i] = dec.periodicity(i);}

				CUDA_LAUNCH((apply_bc_each_part<dim,St,decltype(v_pos.toKernel())>),ite,dec.getDomain(),bc,v_pos.toKernel());

				return;
			}

			// label particle processor
			CUDA_LAUNCH((process_id_proc_each_part<dim,St,decltype(dec.toKernel()),decltype(v_pos.toKernel()),decltype(lbl_p.toKernel()),decltype(prc_sz.toKernel())>),
			ite,
			dec.toKernel(),v_pos.toKernel(),lbl_p.toKernel(),prc_sz.toKernel(),v_cl.rank());


			// NOTE(review): TEST1 is defined at the top of this file, so the
			// #ifndef TEST1 branch (which even contains an #error) is
			// preprocessed out; only the #else branch below is compiled
#ifndef TEST1

			// sort particles
			mergesort((int *)lbl_p.template getDeviceBuffer<1>(),(int *)lbl_p.template getDeviceBuffer<0>(), lbl_p.size(), mgpu::template less_t<int>(), v_cl.getmgpuContext());

			mem.allocate(sizeof(int));
			mem.fill(0);

			// Find the buffer bases
			find_buffer_offsets<1,decltype(lbl_p.toKernel()),decltype(prc_sz.toKernel())><<<ite.wthr,ite.thr>>>
					           (lbl_p.toKernel(),(int *)mem.getDevicePointer(),prc_sz.toKernel());

			#error "should not be here"

			// Trasfer the number of offsets on CPU
			mem.deviceToHost();
			prc_sz.template deviceToHost<0,1>();
			// get also the last element from lbl_p;
			lbl_p.template deviceToHost<1>(lbl_p.size()-1,lbl_p.size()-1);

			mem.deviceToHost();
			int noff = *(int *)mem.getPointer();
			prc_sz.resize(noff+1);
			prc_sz.template get<0>(prc_sz.size()-1) = lbl_p.size();
			prc_sz.template get<1>(prc_sz.size()-1) = lbl_p.template get<1>(lbl_p.size()-1);

#else

			// exclusive scan of the per-processor counters: starts.get<0>(p)
			// is the offset of processor p's slice in the reordered lbl_p
			starts.resize(v_cl.size());
			openfpm::scan((unsigned int *)prc_sz.template getDeviceBuffer<0>(), prc_sz.size(), (unsigned int *)starts.template getDeviceBuffer<0>() , v_cl.getmgpuContext());

			// move prc_sz to host
			prc_sz.template deviceToHost<0>();

			ite = lbl_p.getGPUIterator();

			// we order lbl_p
			CUDA_LAUNCH((reorder_lbl<decltype(lbl_p.toKernel()),decltype(starts.toKernel())>),ite,lbl_p.toKernel(),starts.toKernel());

#endif

#else

			std::cout << __FILE__ << ":" << __LINE__ << " error, it seems you tried to call map with RUN_ON_DEVICE option, this requires to compile the program with NVCC" << std::endl;

#endif
		}
		else
		{
			// reset lbl_p
			lbl_p.clear();
			prc_sz_gg.clear();
			o_part_loc.clear();
			g_opart.clear();
			prc_g_opart.clear();

			// resize the label buffer
			prc_sz.template fill<0>(0);

			auto it = v_pos.getIterator();

			// Label all the particles with the processor id where they should go
			while (it.isNext())
			{
				auto key = it.get();

				// Apply the boundary conditions
				dec.applyPointBC(v_pos.get(key));

				size_t p_id = 0;

				// Check if the particle is inside the domain
				if (dec.getDomain().isInside(v_pos.get(key)) == true)
				{p_id = dec.processorID(v_pos.get(key));}
				else
				{p_id = obp::out(key, v_cl.getProcessUnitID());}

				// Particle to move
				if (p_id != v_cl.getProcessUnitID())
				{
					// both branches record the particle; only a valid
					// destination (p_id != -1) updates the per-processor counter
					if ((long int) p_id != -1)
					{
						prc_sz.template get<0>(p_id)++;
						lbl_p.add();
						lbl_p.last().template get<0>() = key;
						lbl_p.last().template get<2>() = p_id;
					}
					else
					{
						lbl_p.add();
						lbl_p.last().template get<0>() = key;
						lbl_p.last().template get<2>() = p_id;
					}
				}

				// Add processors and add size

				++it;
			}
		}
	}
1583 | |
	/*! \brief Label the particles
	 *
	 * It count the number of particle to send to each processors and save its ids
	 *
	 * \see nn_prcs::getShiftvectors()
	 *
	 * \param v_pos vector of particle positions
	 * \param v_prp vector of particle properties
	 * \param prc filled with the list of processors that receive particles from us
	 * \param prc_sz filled with the number of particles to send to each of them
	 * \param prc_offset per-processor offsets (device labelling path only)
	 * \param g_m ghost marker: only particles below g_m (the real ones) are labelled
	 * \param opt ghost_get options
	 *
	 */
	void labelParticlesGhost(openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos,
			                 openfpm::vector<prop,Memory,layout_base> & v_prp,
			                 openfpm::vector<size_t> & prc,
			                 openfpm::vector<size_t> & prc_sz,
			                 openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,layout_base> & prc_offset,
			                 size_t & g_m,
			                 size_t opt)
	{
		// Buffer that contain for each processor the id of the particle to send
		prc_sz.clear();
		g_opart.clear();
		g_opart.resize(dec.getNNProcessors());
		prc_g_opart.clear();

		if (opt & RUN_ON_DEVICE)
		{
			// device labelling is delegated to the GPU implementation
			labelParticlesGhost_impl<dim,St,prop,Memory,layout_base,
			                         Decomposition,std::is_same<Memory,CudaMemory>::value>
			                        ::run(mem,dec,g_opart_device,proc_id_out,starts,v_cl,v_pos,v_prp,prc,prc_sz,prc_offset,g_m,opt);
		}
		else
		{
			// Iterate over all particles
			auto it = v_pos.getIteratorTo(g_m);
			while (it.isNext())
			{
				auto key = it.get();

				// Given a particle, it return which processor require it (first id) and shift id, second id
				// For an explanation about shifts vectors please consult getShiftVector in ie_ghost
				const openfpm::vector<std::pair<size_t, size_t>> & vp_id = dec.template ghost_processorID_pair<typename Decomposition::lc_processor_id, typename Decomposition::shift_id>(v_pos.get(key), UNIQUE);

				for (size_t i = 0; i < vp_id.size(); i++)
				{
					// processor id
					size_t p_id = vp_id.get(i).first;

					// add particle to communicate
					g_opart.get(p_id).add();
					g_opart.get(p_id).last().template get<0>() = key;
					g_opart.get(p_id).last().template get<1>() = vp_id.get(i).second;
				}

				++it;
			}

			// remove all zero entry and construct prc (the list of the sending processors)
			openfpm::vector<openfpm::vector<aggregate<size_t,size_t>>> g_opart_f;

			// count the non zero element
			for (size_t i = 0 ; i < g_opart.size() ; i++)
			{
				if (g_opart.get(i).size() != 0)
				{
					prc_sz.add(g_opart.get(i).size());
					g_opart_f.add();
					g_opart.get(i).swap(g_opart_f.last());
					prc.add(dec.IDtoProc(i));
				}
			}

			g_opart.swap(g_opart_f);
		}
		// NOTE(review): macro spelled EXTREA_, possibly a typo for EXTRAE_ —
		// if so this tracing hook can never be enabled; confirm against the
		// build system before changing
#ifdef EXTREA_TRACE_PRE_COMM
		Extrae_user_function (0);
#endif
	}
1664 | |
1665 | /*! \brief Call-back to allocate buffer to receive incoming elements (particles) |
1666 | * |
1667 | * \param msg_i size required to receive the message from i |
1668 | * \param total_msg total size to receive from all the processors |
1669 | * \param total_p the total number of processor that want to communicate with you |
1670 | * \param i processor id |
1671 | * \param ri request id (it is an id that goes from 0 to total_p, and is unique |
1672 | * every time message_alloc is called) |
1673 | * \param ptr a pointer to the vector_dist structure |
1674 | * |
1675 | * \return the pointer where to store the message for the processor i |
1676 | * |
1677 | */ |
1678 | static void * message_alloc_map(size_t msg_i, size_t total_msg, size_t total_p, size_t i, size_t ri, void * ptr) |
1679 | { |
1680 | // cast the pointer |
1681 | vector_dist_comm<dim, St, prop, Decomposition, Memory, layout_base> * vd = static_cast<vector_dist_comm<dim, St, prop, Decomposition, Memory, layout_base> *>(ptr); |
1682 | |
1683 | vd->recv_mem_gm.resize(vd->v_cl.getProcessingUnits()); |
1684 | vd->recv_mem_gm.get(i).resize(msg_i); |
1685 | |
1686 | return vd->recv_mem_gm.get(i).getPointer(); |
1687 | } |
1688 | |
1689 | public: |
1690 | |
	/*! \brief Copy Constructor
	 *
	 * Initializes the cluster handle and a default decomposition, then
	 * delegates the actual copy to operator=.
	 *
	 * \param v vector to copy
	 *
	 */
	vector_dist_comm(const vector_dist_comm<dim,St,prop,Decomposition,Memory,layout_base> & v)
	:v_cl(create_vcluster<Memory>()),dec(create_vcluster()),lg_m(0)
	{
		this->operator=(v);
	}
1701 | |
1702 | |
	/*! \brief Constructor
	 *
	 * Copies the given decomposition into this communicator.
	 *
	 * \param dec Domain decompositon
	 *
	 */
	vector_dist_comm(const Decomposition & dec)
	:v_cl(create_vcluster<Memory>()),dec(dec),lg_m(0)
	{

	}
1713 | |
1714 | /*! \brief Constructor |
1715 | * |
1716 | * \param dec Domain decompositon |
1717 | * |
1718 | */ |
1719 | vector_dist_comm(Decomposition && dec) |
1720 | :v_cl(create_vcluster<Memory>()),dec(dec),lg_m(0) |
1721 | { |
1722 | |
1723 | } |
1724 | |
	/*! \brief Constructor
	 *
	 * Initializes the cluster handle and a default decomposition.
	 *
	 */
	vector_dist_comm()
	:v_cl(create_vcluster<Memory>()),dec(create_vcluster()),lg_m(0)
	{
	}
1732 | |
	/*! \brief Destructor
	 *
	 * Release the retained buffer. Each buffer is expected to hold exactly
	 * one reference at this point; if not, an internal-error message is
	 * printed and the reference is left untouched.
	 *
	 */
	~vector_dist_comm()
	{
		for (size_t i = 0 ; i < hsmem.size() ; i++)
		{
			if (hsmem.get(i).ref() == 1)
				hsmem.get(i).decRef();
			else
				std::cout << __FILE__ << ":" << __LINE__ << " internal error memory is in an invalid state " << std::endl;
		}

	}
1749 | |
1750 | /*! \brief Get the number of minimum sub-domain per processor |
1751 | * |
1752 | * \return minimum number |
1753 | * |
1754 | */ |
1755 | size_t getDecompositionGranularity() |
1756 | { |
1757 | return v_sub_unit_factor; |
1758 | } |
1759 | |
1760 | /*! \brief Set the minimum number of sub-domain per processor |
1761 | * |
1762 | * \param n_sub |
1763 | * |
1764 | */ |
1765 | void setDecompositionGranularity(size_t n_sub) |
1766 | { |
1767 | this->v_sub_unit_factor = n_sub; |
1768 | } |
1769 | |
1770 | /*! \brief Initialize the decomposition |
1771 | * |
1772 | * \param box domain |
1773 | * \param bc boundary conditions |
1774 | * \param g ghost extension |
1775 | * \param opt additional options |
1776 | * |
1777 | */ |
1778 | void init_decomposition(Box<dim,St> & box, |
1779 | const size_t (& bc)[dim], |
1780 | const Ghost<dim,St> & g, |
1781 | size_t opt, |
1782 | const grid_sm<dim,void> & gdist) |
1783 | { |
1784 | size_t div[dim]; |
1785 | |
1786 | if (opt & BIND_DEC_TO_GHOST) |
1787 | { |
1788 | // padding |
1789 | size_t pad = 0; |
1790 | |
1791 | // CellDecomposer |
1792 | CellDecomposer_sm<dim,St,shift<dim,St>> cd_sm; |
1793 | |
1794 | // Calculate the divisions for the symmetric Cell-lists |
1795 | cl_param_calculateSym<dim,St>(box,cd_sm,g,pad); |
1796 | |
1797 | for (size_t i = 0 ; i < dim ; i++) |
1798 | {div[i] = cd_sm.getDiv()[i] - 2*pad;} |
1799 | |
1800 | // Create the sub-domains |
1801 | dec.setParameters(div, box, bc, g, gdist); |
1802 | } |
1803 | else |
1804 | { |
1805 | dec.setGoodParameters(box, bc, g, getDecompositionGranularity(), gdist); |
1806 | } |
1807 | dec.decompose(); |
1808 | } |
1809 | |
1810 | /*! \brief Initialize the decomposition |
1811 | * |
1812 | * \param box domain |
1813 | * \param bc boundary conditions |
1814 | * \param g ghost extension |
1815 | * \param opt additional options |
1816 | * |
1817 | */ |
1818 | void init_decomposition_gr_cell(Box<dim,St> & box, |
1819 | const size_t (& bc)[dim], |
1820 | const Ghost<dim,St> & g, |
1821 | size_t opt, |
1822 | const grid_sm<dim,void> & gdist) |
1823 | { |
1824 | size_t div[dim]; |
1825 | |
1826 | for (size_t i = 0 ; i < dim ; i++) |
1827 | {div[i] = gdist.size(i);} |
1828 | |
1829 | // Create the sub-domains |
1830 | dec.setParameters(div, box, bc, g); |
1831 | |
1832 | dec.decompose(); |
1833 | } |
1834 | |
	/*! \brief It synchronize the properties and position of the ghost particles
	 *
	 * \tparam impl exchange implementation (synchronous or GHOST_ASYNC)
	 * \tparam prp list of properties to get synchronize
	 *
	 * \param v_pos vector of position to update
	 * \param v_prp vector of properties to update
	 * \param g_m marker between real and ghost particles
	 * \param opt options WITH_POSITION, it send also the positional information of the particles
	 *
	 */
	template<unsigned int impl, int ... prp> inline void ghost_get_(openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos,
	                                                                openfpm::vector<prop,Memory,layout_base> & v_prp,
	                                                                size_t & g_m,
	                                                                size_t opt = WITH_POSITION)
	{
#ifdef PROFILE_SCOREP
		SCOREP_USER_REGION("ghost_get" ,SCOREP_USER_REGION_TYPE_FUNCTION)
#endif

		// Sending property object: packs only the requested subset prp... of properties
		typedef object<typename object_creator<typename prop::type, prp...>::type> prp_object;

		// send vector for each processor
		typedef openfpm::vector<prp_object,Memory,layout_base,openfpm::grow_policy_identity> send_vector;

		// drop the ghost particles of the previous ghost_get (positions)
		if (!(opt & NO_POSITION))
		{v_pos.resize(g_m);}

		// reset the ghost part

		if (!(opt & SKIP_LABELLING))
		{v_prp.resize(g_m);}

		// Label all the particles (skipped when the previous labelling is reused via SKIP_LABELLING)
		if ((opt & SKIP_LABELLING) == false)
		{labelParticlesGhost(v_pos,v_prp,prc_g_opart,prc_sz_gg,prc_offset,g_m,opt);}

		{
			// Send and receive ghost particle information
			openfpm::vector<send_vector> g_send_prp;

			fill_send_ghost_prp_buf<send_vector, prp_object, prp...>(v_prp,prc_sz_gg,g_send_prp,opt);

#if defined(CUDA_GPU) && defined(__NVCC__)
			// ensure the buffer-filling kernels completed before the communication starts
			cudaDeviceSynchronize();
#endif

			// if there are no properties skip
			// SSendRecvP send everything when we do not give properties

			ghost_exchange_comm_impl<impl,layout_base,prp ...>::template
			sendrecv_prp(v_cl,g_send_prp,v_prp,v_pos,prc_g_opart,
					prc_recv_get_prp,recv_sz_get_prp,recv_sz_get_byte,g_opart_sz,g_m,opt);
		}

		if (!(opt & NO_POSITION))
		{
			// Sending buffer for the ghost particles position
			openfpm::vector<send_pos_vector> g_pos_send;

			fill_send_ghost_pos_buf(v_pos,prc_sz_gg,g_pos_send,opt,impl == GHOST_ASYNC);

#if defined(CUDA_GPU) && defined(__NVCC__)
			// ensure the buffer-filling kernels completed before the communication starts
			cudaDeviceSynchronize();
#endif

			ghost_exchange_comm_impl<impl,layout_base,prp ...>::template
			sendrecv_pos(v_cl,g_pos_send,v_prp,v_pos,prc_recv_get_pos,recv_sz_get_pos,prc_g_opart,opt);

			// fill g_opart_sz: number of ghost particles sent to each destination processor
			g_opart_sz.resize(prc_g_opart.size());

			for (size_t i = 0 ; i < prc_g_opart.size() ; i++)
				g_opart_sz.get(i) = g_pos_send.get(i).size();
		}

		// Important to ensure that the number of particles in v_prp must be equal to v_pos
		// Note that if we do not give properties sizeof...(prp) == 0 in general at this point
		// v_prp.size() != v_pos.size()
		if (!(opt & SKIP_LABELLING))
		{
			v_prp.resize(v_pos.size());
		}

		// add the local ghost particles (presumably the periodic/boundary replicas — see add_loc_particles_bc)
		add_loc_particles_bc(v_pos,v_prp,g_m,opt);
	}
1921 | |
	/*! \brief Wait for the completion of an asynchronous ghost_get (GHOST_ASYNC)
	 *
	 * It blocks until both the property exchange and the position exchange
	 * started by a previous ghost_get_<GHOST_ASYNC,...> have completed.
	 *
	 * \tparam prp list of properties involved in the pending ghost_get
	 *
	 * \param v_pos vector of position to update
	 * \param v_prp vector of properties to update
	 * \param g_m marker between real and ghost particles
	 * \param opt options
	 *
	 */
	template<int ... prp> inline void ghost_wait_(openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos,
	                                              openfpm::vector<prop,Memory,layout_base> & v_prp,
	                                              size_t & g_m,
	                                              size_t opt = WITH_POSITION)
	{
		// Sending property object (same packed type used by ghost_get_)
		typedef object<typename object_creator<typename prop::type, prp...>::type> prp_object;

		// send vector for each processor
		typedef openfpm::vector<prp_object,Memory,layout_base,openfpm::grow_policy_identity> send_vector;

		// Send and receive ghost particle information
		// NOTE(review): these buffers are empty here; the *_wait calls presumably
		// complete the requests started by the asynchronous ghost_get — confirm
		// against ghost_exchange_comm_impl<GHOST_ASYNC,...>
		openfpm::vector<send_vector> g_send_prp;
		openfpm::vector<send_pos_vector> g_pos_send;

		ghost_exchange_comm_impl<GHOST_ASYNC,layout_base,prp ...>::template
		sendrecv_prp_wait(v_cl,g_send_prp,v_prp,v_pos,prc_g_opart,
				prc_recv_get_prp,recv_sz_get_prp,recv_sz_get_byte,g_opart_sz,g_m,opt);


		ghost_exchange_comm_impl<GHOST_ASYNC,layout_base,prp ...>::template
		sendrecv_pos_wait(v_cl,g_pos_send,v_prp,v_pos,prc_recv_get_pos,recv_sz_get_pos,prc_g_opart,opt);
	}
1955 | |
	/*! \brief It move all the particles that does not belong to the local processor to the respective processor
	 *
	 * \tparam out of bound policy it specify what to do when the particles are detected out of bound
	 *
	 * In general this function is called after moving the particles to move the
	 * elements out the local processor. Or just after initialization if each processor
	 * contain non local particles
	 *
	 * \tparam prp properties to communicate
	 *
	 * \param v_pos vector of particle positions
	 * \param v_prp vector of particle properties
	 * \param g_m ghost marker, set to the new number of local particles on return
	 * \param opt options (RUN_ON_DEVICE is rejected, MAP_LOCAL restricts receives to neighbors)
	 *
	 */
	template<unsigned int ... prp> void map_list_(openfpm::vector<Point<dim, St>> & v_pos, openfpm::vector<prop> & v_prp, size_t & g_m, size_t opt)
	{
		// the property-list map has no device implementation yet
		if (opt & RUN_ON_DEVICE)
		{
			std::cout << "Error: " << __FILE__ << ":" << __LINE__ << " map_list is unsupported on device (coming soon)" << std::endl;
			return;
		}

		// out-of-bound particles are removed
		typedef KillParticle obp;

		// Processor communication size
		openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,layout_base> prc_sz(v_cl.getProcessingUnits());

		// map completely reset the ghost part
		v_pos.resize(g_m);
		v_prp.resize(g_m);

		// m_opart, Contain the processor id of each particle (basically where they have to go)
		labelParticleProcessor<obp>(v_pos,m_opart, prc_sz,opt);

		// Calculate the sending buffer size for each processor, put this information in
		// a contiguous buffer
		p_map_req.resize(v_cl.getProcessingUnits());
		openfpm::vector<size_t> prc_sz_r;
		openfpm::vector<size_t> prc_r;

		// keep only the processors that actually receive particles:
		// prc_r = destination ranks, prc_sz_r = particles per destination,
		// p_map_req maps rank -> index in the compacted lists
		for (size_t i = 0; i < v_cl.getProcessingUnits(); i++)
		{
			if (prc_sz.template get<0>(i) != 0)
			{
				p_map_req.get(i) = prc_r.size();
				prc_r.add(i);
				prc_sz_r.add(prc_sz.template get<0>(i));
			}
		}

		if (opt & MAP_LOCAL)
		{
			// if the map is local we indicate that we receive only from the neighborhood processors

			prc_recv_map.clear();
			for (size_t i = 0 ; i < dec.getNNProcessors() ; i++)
			{prc_recv_map.add(dec.IDtoProc(i));}
		}

		// Sending property object: packs only the requested subset prp... of properties
		typedef object<typename object_creator<typename prop::type, prp...>::type> prp_object;

		//! position vector
		openfpm::vector<openfpm::vector<Point<dim, St>>> m_pos;
		//! properties vector
		openfpm::vector<openfpm::vector<prp_object>> m_prp;

		fill_send_map_buf_list<prp_object,prp...>(v_pos,v_prp,prc_sz_r, m_pos, m_prp);

		// exchange positions and the selected properties
		v_cl.SSendRecv(m_pos,v_pos,prc_r,prc_recv_map,recv_sz_map,opt);
		v_cl.template SSendRecvP<openfpm::vector<prp_object>,decltype(v_prp),layout_base,prp...>(m_prp,v_prp,prc_r,prc_recv_map,recv_sz_map,opt);

		// mark the ghost part

		g_m = v_pos.size();
	}
2034 | |
	/*! \brief It move all the particles that does not belong to the local processor to the respective processor
	 *
	 * \tparam obp out of bound policy it specify what to do when the particles are detected out of bound
	 *             (default: KillParticle, i.e. remove them)
	 *
	 * In general this function is called after moving the particles to move the
	 * elements out the local processor. Or just after initialization if each processor
	 * contain non local particles
	 *
	 * \param v_pos vector of particle positions
	 * \param v_prp vector of particle properties
	 * \param g_m ghost marker, set to the new number of local particles on return
	 * \param opt options (e.g. RUN_ON_DEVICE)
	 *
	 */
	template<typename obp = KillParticle>
	void map_(openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos,
	          openfpm::vector<prop,Memory,layout_base> & v_prp, size_t & g_m,
	          size_t opt)
	{
#ifdef PROFILE_SCOREP
		SCOREP_USER_REGION("map" ,SCOREP_USER_REGION_TYPE_FUNCTION)
#endif

		// per-processor communication sizes (class member, reused across maps)
		prc_sz.resize(v_cl.getProcessingUnits());

		// map completely reset the ghost part
		v_pos.resize(g_m);
		v_prp.resize(g_m);

		// Contain the processor id of each particle (basically where they have to go)
		labelParticleProcessor<obp>(v_pos,m_opart, prc_sz,opt);

		openfpm::vector<size_t> prc_sz_r;
		openfpm::vector<size_t> prc_r;

		// Calculate the sending buffer size for each processor, put this information in
		// a contiguous buffer
		calc_send_buffers(prc_sz,prc_sz_r,prc_r,opt);

		//! position vector
		openfpm::vector<openfpm::vector<Point<dim, St>,Memory,layout_base,openfpm::grow_policy_identity>> m_pos;
		//! properties vector
		openfpm::vector<openfpm::vector<prop,Memory,layout_base,openfpm::grow_policy_identity>> m_prp;

		fill_send_map_buf(v_pos,v_prp, prc_sz_r,prc_r, m_pos, m_prp,prc_sz,opt);

		// communication options for the exchange below
		size_t opt_ = 0;
		if (opt & RUN_ON_DEVICE)
		{
#if defined(CUDA_GPU) && defined(__NVCC__)
			// Before doing the communication on RUN_ON_DEVICE we have to be sure that the previous kernels complete
			cudaDeviceSynchronize();
			opt_ |= MPI_GPU_DIRECT;
#else
			std::cout << __FILE__ << ":" << __LINE__ << " error: to use the option RUN_ON_DEVICE you must compile with NVCC" << std::endl;
#endif
		}

		// exchange positions
		v_cl.template SSendRecv<openfpm::vector<Point<dim, St>,Memory,layout_base,openfpm::grow_policy_identity>,
								openfpm::vector<Point<dim, St>,Memory,layout_base>,
								layout_base>
								(m_pos,v_pos,prc_r,prc_recv_map,recv_sz_map,opt_);

		// exchange all the properties
		v_cl.template SSendRecv<openfpm::vector<prop,Memory,layout_base,openfpm::grow_policy_identity>,
								openfpm::vector<prop,Memory,layout_base>,
								layout_base>
								(m_prp,v_prp,prc_r,prc_recv_map,recv_sz_map,opt_);

		// mark the ghost part

		g_m = v_pos.size();
	}
2106 | |
2107 | /*! \brief Get the decomposition |
2108 | * |
2109 | * \return |
2110 | * |
2111 | */ |
2112 | inline Decomposition & getDecomposition() |
2113 | { |
2114 | return dec; |
2115 | } |
2116 | |
2117 | /*! \brief Get the decomposition |
2118 | * |
2119 | * \return |
2120 | * |
2121 | */ |
2122 | inline const Decomposition & getDecomposition() const |
2123 | { |
2124 | return dec; |
2125 | } |
2126 | |
	/*! \brief Copy a vector
	 *
	 * \note only the decomposition is copied; the internal communication
	 *       buffers are scratch state and are intentionally not transferred
	 *
	 * \param vc vector to copy
	 *
	 * \return itself
	 *
	 */
	vector_dist_comm<dim,St,prop,Decomposition,Memory,layout_base> & operator=(const vector_dist_comm<dim,St,prop,Decomposition,Memory,layout_base> & vc)
	{
		dec = vc.dec;

		return *this;
	}
2140 | |
2141 | /*! \brief Copy a vector |
2142 | * |
2143 | * \param vc vector to copy |
2144 | * |
2145 | * \return itself |
2146 | * |
2147 | */ |
2148 | vector_dist_comm<dim,St,prop,Decomposition,Memory,layout_base> & operator=(vector_dist_comm<dim,St,prop,Decomposition,Memory,layout_base> && vc) |
2149 | { |
2150 | dec = vc.dec; |
2151 | |
2152 | return *this; |
2153 | } |
2154 | |
	/*! \brief Ghost put
	 *
	 * Send the ghost-particle property values back to the processors owning
	 * the corresponding real particles and merge them with the operation op.
	 *
	 * \tparam op operation to apply when merging received values into v_prp
	 * \tparam prp set of properties
	 *
	 * \param v_pos vector of particle positions
	 * \param v_prp vector of particle properties
	 * \param g_m ghost marker
	 * \param opt options (RUN_ON_DEVICE, NO_CHANGE_ELEMENTS, ...)
	 *
	 */
	template<template<typename,typename> class op, int ... prp>
	void ghost_put_(openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos,
					openfpm::vector<prop,Memory,layout_base> & v_prp,
					size_t & g_m,
					size_t opt)
	{
		// Sending property object: packs only the subset prp... of properties
		typedef object<typename object_creator<typename prop::type, prp...>::type> prp_object;

		// send vector for each processor
		typedef openfpm::vector<prp_object,Memory,layout_base> send_vector;

		openfpm::vector<send_vector> g_send_prp;
		fill_send_ghost_put_prp_buf<send_vector, prp_object, prp...>(v_prp,g_send_prp,g_m,opt);

		if (opt & RUN_ON_DEVICE)
		{
#if defined(CUDA_GPU) && defined(__NVCC__)
			// Before doing the communication on RUN_ON_DEVICE we have to be sure that the previous kernels complete
			cudaDeviceSynchronize();
#else
			std::cout << __FILE__ << ":" << __LINE__ << " error: to use the option RUN_ON_DEVICE you must compile with NVCC" << std::endl;
#endif
		}

		// Send and receive ghost particle information
		// With NO_CHANGE_ELEMENTS the receive pattern of the last ghost_get is reused
		// (prc_recv_get_prp / g_opart_sz); otherwise receive sizes are rediscovered
		// (prc_recv_put / recv_sz_put). Each case has a device and a host variant.
		if (opt & NO_CHANGE_ELEMENTS)
		{
			size_t opt_ = compute_options(opt);

			if (opt & RUN_ON_DEVICE)
			{
				op_ssend_recv_merge_gpu<op,decltype(g_opart_device),decltype(prc_offset)> opm(g_opart_device,prc_offset);
				v_cl.template SSendRecvP_op<op_ssend_recv_merge_gpu<op,decltype(g_opart_device),decltype(prc_offset)>,
											send_vector,
											decltype(v_prp),
											layout_base,
											prp...>(g_send_prp,v_prp,prc_recv_get_prp,opm,prc_g_opart,g_opart_sz,opt_);
			}
			else
			{
				op_ssend_recv_merge<op,decltype(g_opart)> opm(g_opart);
				v_cl.template SSendRecvP_op<op_ssend_recv_merge<op,decltype(g_opart)>,
											send_vector,
											decltype(v_prp),
											layout_base,
											prp...>(g_send_prp,v_prp,prc_recv_get_prp,opm,prc_g_opart,g_opart_sz,opt_);
			}
		}
		else
		{
			size_t opt_ = compute_options(opt);

			if (opt & RUN_ON_DEVICE)
			{
				op_ssend_recv_merge_gpu<op,decltype(g_opart_device),decltype(prc_offset)> opm(g_opart_device,prc_offset);
				v_cl.template SSendRecvP_op<op_ssend_recv_merge_gpu<op,decltype(g_opart_device),decltype(prc_offset)>,
											send_vector,
											decltype(v_prp),
											layout_base,
											prp...>(g_send_prp,v_prp,get_last_ghost_get_num_proc_vector(),opm,prc_recv_put,recv_sz_put,opt_);
			}
			else
			{
				op_ssend_recv_merge<op,decltype(g_opart)> opm(g_opart);
				v_cl.template SSendRecvP_op<op_ssend_recv_merge<op,decltype(g_opart)>,
											send_vector,
											decltype(v_prp),
											layout_base,
											prp...>(g_send_prp,v_prp,get_last_ghost_get_num_proc_vector(),opm,prc_recv_put,recv_sz_put,opt_);
			}
		}

		// process also the local replicated particles

		// sanity check: the local ghost section must match what the last ghost_get produced
		if (lg_m < v_prp.size() && v_prp.size() - lg_m != o_part_loc.size())
		{
			std::cerr << "Error: " << __FILE__ << ":" << __LINE__ << " Local ghost particles = " << v_prp.size() - lg_m << " != " << o_part_loc.size() << std::endl;
			std::cerr << "Error: " << __FILE__ << ":" << __LINE__ << " Check that you did a ghost_get before a ghost_put" << std::endl;
		}


		// merge the local replicated (ghost) particles back into their sources
		if (opt & RUN_ON_DEVICE)
		{
			v_prp.template merge_prp_v_device<op,prop,Memory,
											  openfpm::grow_policy_double,
											  layout_base,
											  decltype(o_part_loc),prp ...>(v_prp,lg_m,o_part_loc);
		}
		else
		{
			v_prp.template merge_prp_v<op,prop,Memory,
									   openfpm::grow_policy_double,
									   layout_base,
									   decltype(o_part_loc),prp ...>(v_prp,lg_m,o_part_loc);
		}
	}
2263 | }; |
2264 | |
2265 | |
2266 | #endif /* SRC_VECTOR_VECTOR_DIST_COMM_HPP_ */ |
2267 | |