1/*
2 * vector_dist_comm.hpp
3 *
4 * Created on: Aug 18, 2016
5 * Author: i-bird
6 */
7
8#ifndef SRC_VECTOR_VECTOR_DIST_COMM_HPP_
9#define SRC_VECTOR_VECTOR_DIST_COMM_HPP_
10
11#define TEST1
12
13#if defined(CUDA_GPU) && defined(__NVCC__)
14#include "Vector/cuda/vector_dist_cuda_funcs.cuh"
15#include "util/cuda/kernels.cuh"
16#endif
17
18#include "Vector/util/vector_dist_funcs.hpp"
19#include "cuda/vector_dist_comm_util_funcs.cuh"
20#include "util/cuda/scan_ofp.cuh"
21
22template<typename T>
23struct DEBUG
24{
25 static float ret(T & tmp)
26 {
27 return 0.0;
28 }
29};
30
31template<>
32struct DEBUG<float &>
33{
34 static float ret(float & tmp)
35 {
36 return tmp;
37 }
38};
39
40/*! \brief compute the communication options from the ghost_get/put options
41 *
42 *
43 */
44inline static size_t compute_options(size_t opt)
45{
46 size_t opt_ = NONE;
47 if (opt & NO_CHANGE_ELEMENTS && opt & SKIP_LABELLING)
48 {opt_ = RECEIVE_KNOWN | KNOWN_ELEMENT_OR_BYTE;}
49
50 if (opt & RUN_ON_DEVICE)
51 {
52#if defined(CUDA_GPU) && defined(__NVCC__)
53 // Before doing the communication on RUN_ON_DEVICE we have to be sure that the previous kernels complete
54 opt_ |= MPI_GPU_DIRECT;
55#else
56 std::cout << __FILE__ << ":" << __LINE__ << " error: to use the option RUN_ON_DEVICE you must compile with NVCC" << std::endl;
57#endif
58 }
59
60 return opt_;
61}
62
63/*! \brief template selector for asynchronous or not asynchronous
64 *
65 * \tparam impl implementation
66 * \tparam prp properties
67 *
68 */
69template<unsigned int impl, template<typename> class layout_base, unsigned int ... prp>
70struct ghost_exchange_comm_impl
71{
72 template<typename Vcluster_type, typename vector_prop_type,
73 typename vector_pos_type, typename send_vector,
74 typename prc_recv_get_type, typename prc_g_opart_type,
75 typename recv_sz_get_type, typename recv_sz_get_byte_type,
76 typename g_opart_sz_type>
77 static inline void sendrecv_prp(Vcluster_type & v_cl,
78 openfpm::vector<send_vector> & g_send_prp,
79 vector_prop_type & v_prp,
80 vector_pos_type & v_pos,
81 prc_g_opart_type & prc_g_opart,
82 prc_recv_get_type & prc_recv_get,
83 recv_sz_get_type & recv_sz_get,
84 recv_sz_get_byte_type & recv_sz_get_byte,
85 g_opart_sz_type & g_opart_sz,
86 size_t g_m,
87 size_t opt)
88 {
89 // if there are no properties skip
90 // SSendRecvP send everything when we do not give properties
91
92 if (sizeof...(prp) != 0)
93 {
94 size_t opt_ = compute_options(opt);
95 if (opt & SKIP_LABELLING)
96 {
97 if (opt & RUN_ON_DEVICE)
98 {
99 op_ssend_gg_recv_merge_run_device opm(g_m);
100 v_cl.template SSendRecvP_op<op_ssend_gg_recv_merge_run_device,send_vector,decltype(v_prp),layout_base,prp...>(g_send_prp,v_prp,prc_g_opart,opm,prc_recv_get,recv_sz_get,opt_);
101 }
102 else
103 {
104 op_ssend_gg_recv_merge opm(g_m);
105 v_cl.template SSendRecvP_op<op_ssend_gg_recv_merge,send_vector,decltype(v_prp),layout_base,prp...>(g_send_prp,v_prp,prc_g_opart,opm,prc_recv_get,recv_sz_get,opt_);
106 }
107 }
108 else
109 {v_cl.template SSendRecvP<send_vector,decltype(v_prp),layout_base,prp...>(g_send_prp,v_prp,prc_g_opart,prc_recv_get,recv_sz_get,recv_sz_get_byte,opt_);}
110
111 // fill g_opart_sz
112 g_opart_sz.resize(prc_g_opart.size());
113
114 for (size_t i = 0 ; i < prc_g_opart.size() ; i++)
115 g_opart_sz.get(i) = g_send_prp.get(i).size();
116 }
117 }
118
119 template<typename Vcluster_type, typename vector_prop_type,
120 typename vector_pos_type, typename send_pos_vector,
121 typename prc_recv_get_type, typename prc_g_opart_type,
122 typename recv_sz_get_type>
123 static inline void sendrecv_pos(Vcluster_type & v_cl,
124 openfpm::vector<send_pos_vector> & g_pos_send,
125 vector_prop_type & v_prp,
126 vector_pos_type & v_pos,
127 prc_recv_get_type & prc_recv_get,
128 recv_sz_get_type & recv_sz_get,
129 prc_g_opart_type & prc_g_opart,
130 size_t opt)
131 {
132 size_t opt_ = compute_options(opt);
133 if (opt & SKIP_LABELLING)
134 {
135 v_cl.template SSendRecv<send_pos_vector,decltype(v_pos),layout_base>(g_pos_send,v_pos,prc_g_opart,prc_recv_get,recv_sz_get,opt_);
136 }
137 else
138 {
139 prc_recv_get.clear();
140 recv_sz_get.clear();
141 v_cl.template SSendRecv<send_pos_vector,decltype(v_pos),layout_base>(g_pos_send,v_pos,prc_g_opart,prc_recv_get,recv_sz_get,opt_);
142 }
143 }
144
145 template<typename Vcluster_type, typename vector_prop_type,
146 typename vector_pos_type, typename send_pos_vector,
147 typename prc_recv_get_type, typename prc_g_opart_type,
148 typename recv_sz_get_type>
149 static inline void sendrecv_pos_wait(Vcluster_type & v_cl,
150 openfpm::vector<send_pos_vector> & g_pos_send,
151 vector_prop_type & v_prp,
152 vector_pos_type & v_pos,
153 prc_recv_get_type & prc_recv_get,
154 recv_sz_get_type & recv_sz_get,
155 prc_g_opart_type & prc_g_opart,
156 size_t opt)
157 {}
158
159 template<typename Vcluster_type, typename vector_prop_type,
160 typename vector_pos_type, typename send_vector,
161 typename prc_recv_get_type, typename prc_g_opart_type,
162 typename recv_sz_get_type, typename recv_sz_get_byte_type,
163 typename g_opart_sz_type>
164 static inline void sendrecv_prp_wait(Vcluster_type & v_cl,
165 openfpm::vector<send_vector> & g_send_prp,
166 vector_prop_type & v_prp,
167 vector_pos_type & v_pos,
168 prc_g_opart_type & prc_g_opart,
169 prc_recv_get_type & prc_recv_get,
170 recv_sz_get_type & recv_sz_get,
171 recv_sz_get_byte_type & recv_sz_get_byte,
172 g_opart_sz_type & g_opart_sz,
173 size_t g_m,
174 size_t opt)
175 {}
176};
177
178
179template<template<typename> class layout_base, unsigned int ... prp>
180struct ghost_exchange_comm_impl<GHOST_ASYNC,layout_base, prp ... >
181{
182 template<typename Vcluster_type, typename vector_prop_type,
183 typename vector_pos_type, typename send_vector,
184 typename prc_recv_get_type, typename prc_g_opart_type,
185 typename recv_sz_get_type, typename recv_sz_get_byte_type,
186 typename g_opart_sz_type>
187 static inline void sendrecv_prp(Vcluster_type & v_cl,
188 openfpm::vector<send_vector> & g_send_prp,
189 vector_prop_type & v_prp,
190 vector_pos_type & v_pos,
191 prc_g_opart_type & prc_g_opart,
192 prc_recv_get_type & prc_recv_get,
193 recv_sz_get_type & recv_sz_get,
194 recv_sz_get_byte_type & recv_sz_get_byte,
195 g_opart_sz_type & g_opart_sz,
196 size_t g_m,
197 size_t opt)
198 {
199 prc_recv_get.clear();
200 recv_sz_get.clear();
201
202 // if there are no properties skip
203 // SSendRecvP send everything when we do not give properties
204
205 if (sizeof...(prp) != 0)
206 {
207 size_t opt_ = compute_options(opt);
208 if (opt & SKIP_LABELLING)
209 {
210 if (opt & RUN_ON_DEVICE)
211 {
212 op_ssend_gg_recv_merge_run_device opm(g_m);
213 v_cl.template SSendRecvP_opAsync<op_ssend_gg_recv_merge_run_device,send_vector,decltype(v_prp),layout_base,prp...>(g_send_prp,v_prp,prc_g_opart,opm,prc_recv_get,recv_sz_get,opt_);
214 }
215 else
216 {
217 op_ssend_gg_recv_merge opm(g_m);
218 v_cl.template SSendRecvP_opAsync<op_ssend_gg_recv_merge,send_vector,decltype(v_prp),layout_base,prp...>(g_send_prp,v_prp,prc_g_opart,opm,prc_recv_get,recv_sz_get,opt_);
219 }
220 }
221 else
222 {v_cl.template SSendRecvPAsync<send_vector,decltype(v_prp),layout_base,prp...>(g_send_prp,v_prp,prc_g_opart,prc_recv_get,recv_sz_get,recv_sz_get_byte,opt_);}
223 }
224
225 // fill g_opart_sz
226 g_opart_sz.resize(prc_g_opart.size());
227
228 for (size_t i = 0 ; i < prc_g_opart.size() ; i++)
229 {g_opart_sz.get(i) = g_send_prp.get(i).size();}
230 }
231
232 template<typename Vcluster_type, typename vector_prop_type,
233 typename vector_pos_type, typename send_pos_vector,
234 typename prc_recv_get_type, typename prc_g_opart_type,
235 typename recv_sz_get_type>
236 static inline void sendrecv_pos(Vcluster_type & v_cl,
237 openfpm::vector<send_pos_vector> & g_pos_send,
238 vector_prop_type & v_prp,
239 vector_pos_type & v_pos,
240 prc_recv_get_type & prc_recv_get,
241 recv_sz_get_type & recv_sz_get,
242 prc_g_opart_type & prc_g_opart,
243 size_t opt)
244 {
245 prc_recv_get.clear();
246 recv_sz_get.clear();
247
248 size_t opt_ = compute_options(opt);
249 if (opt & SKIP_LABELLING)
250 {
251 v_cl.template SSendRecvAsync<send_pos_vector,decltype(v_pos),layout_base>(g_pos_send,v_pos,prc_g_opart,prc_recv_get,recv_sz_get,opt_);
252 }
253 else
254 {
255 prc_recv_get.clear();
256 recv_sz_get.clear();
257 v_cl.template SSendRecvAsync<send_pos_vector,decltype(v_pos),layout_base>(g_pos_send,v_pos,prc_g_opart,prc_recv_get,recv_sz_get,opt_);
258 }
259 }
260
261 template<typename Vcluster_type, typename vector_prop_type,
262 typename vector_pos_type, typename send_pos_vector,
263 typename prc_recv_get_type, typename prc_g_opart_type,
264 typename recv_sz_get_type>
265 static inline void sendrecv_pos_wait(Vcluster_type & v_cl,
266 openfpm::vector<send_pos_vector> & g_pos_send,
267 vector_prop_type & v_prp,
268 vector_pos_type & v_pos,
269 prc_recv_get_type & prc_recv_get,
270 recv_sz_get_type & recv_sz_get,
271 prc_g_opart_type & prc_g_opart,
272 size_t opt)
273 {
274 size_t opt_ = compute_options(opt);
275 if (opt & SKIP_LABELLING)
276 {
277 v_cl.template SSendRecvWait<send_pos_vector,decltype(v_pos),layout_base>(g_pos_send,v_pos,prc_g_opart,prc_recv_get,recv_sz_get,opt_);
278 }
279 else
280 {
281 v_cl.template SSendRecvWait<send_pos_vector,decltype(v_pos),layout_base>(g_pos_send,v_pos,prc_g_opart,prc_recv_get,recv_sz_get,opt_);
282 }
283 }
284
285 template<typename Vcluster_type, typename vector_prop_type,
286 typename vector_pos_type, typename send_vector,
287 typename prc_recv_get_type, typename prc_g_opart_type,
288 typename recv_sz_get_type, typename recv_sz_get_byte_type,
289 typename g_opart_sz_type>
290 static inline void sendrecv_prp_wait(Vcluster_type & v_cl,
291 openfpm::vector<send_vector> & g_send_prp,
292 vector_prop_type & v_prp,
293 vector_pos_type & v_pos,
294 prc_g_opart_type & prc_g_opart,
295 prc_recv_get_type & prc_recv_get,
296 recv_sz_get_type & recv_sz_get,
297 recv_sz_get_byte_type & recv_sz_get_byte,
298 g_opart_sz_type & g_opart_sz,
299 size_t g_m,
300 size_t opt)
301 {
302 // if there are no properties skip
303 // SSendRecvP send everything when we do not give properties
304
305 if (sizeof...(prp) != 0)
306 {
307 size_t opt_ = compute_options(opt);
308 if (opt & SKIP_LABELLING)
309 {
310 if (opt & RUN_ON_DEVICE)
311 {
312 op_ssend_gg_recv_merge_run_device opm(g_m);
313 v_cl.template SSendRecvP_opWait<op_ssend_gg_recv_merge_run_device,send_vector,decltype(v_prp),layout_base,prp...>(g_send_prp,v_prp,prc_g_opart,opm,prc_recv_get,recv_sz_get,opt_);
314 }
315 else
316 {
317 op_ssend_gg_recv_merge opm(g_m);
318 v_cl.template SSendRecvP_opWait<op_ssend_gg_recv_merge,send_vector,decltype(v_prp),layout_base,prp...>(g_send_prp,v_prp,prc_g_opart,opm,prc_recv_get,recv_sz_get,opt_);
319 }
320 }
321 else
322 {v_cl.template SSendRecvPWait<send_vector,decltype(v_prp),layout_base,prp...>(g_send_prp,v_prp,prc_g_opart,prc_recv_get,recv_sz_get,recv_sz_get_byte,opt_);}
323 }
324 }
325};
326
327
328/*! \brief This class is an helper for the communication of vector_dist
329 *
330 * \tparam dim Dimensionality of the space where the elements lives
331 * \tparam St type of space float, double ...
332 * \tparam prop properties the vector element store in OpenFPM data structure format
333 * \tparam Decomposition Decomposition strategy to use CartDecomposition ...
334 * \tparam Memory Memory pool where store the information HeapMemory ...
335 *
336 * \see vector_dist
337 *
338 */
339
340template<unsigned int dim,
341 typename St,
342 typename prop,
343 typename Decomposition = CartDecomposition<dim,St>,
344 typename Memory = HeapMemory,
345 template<typename> class layout_base = memory_traits_lin>
346class vector_dist_comm
347{
348 //! Number of units for each sub-domain
349 size_t v_sub_unit_factor = 64;
350
351 //! definition of the send vector for position
352 typedef openfpm::vector<Point<dim, St>,Memory,layout_base,openfpm::grow_policy_identity> send_pos_vector;
353
354 //! VCluster
355 Vcluster<Memory> & v_cl;
356
357 //! Domain decomposition
358 Decomposition dec;
359
360 //! It map the processor id with the communication request into map procedure
361 openfpm::vector<size_t> p_map_req;
362
363 //! For each near processor, outgoing particle id
364 //! \warning opart is assumed to be an ordered list
365 //! first id particle id
366 //! second id shift id
367 //! third id is the processor id
368 openfpm::vector<aggregate<int,int,int>,
369 Memory,
370 layout_base > m_opart;
371
372 //! Per processor ordered particles id for ghost_get (see prc_g_opart)
373 //! For each processor the internal vector store the id of the
374 //! particles that must be communicated to the other processors
375 openfpm::vector<openfpm::vector<aggregate<size_t,size_t>>> g_opart;
376
377 //! Same as g_opart but on device, the vector of vector is flatten into a single vector
378 openfpm::vector<aggregate<unsigned int,unsigned long int>,
379 CudaMemory,
380 memory_traits_inte> g_opart_device;
381
382 //! Helper buffer for computation (on GPU) of local particles (position)
383 openfpm::vector<Point<dim, St>,Memory,layout_base> v_pos_tmp;
384
385 //! Helper buffer for computation (on GPU) of local particles (properties)
386 openfpm::vector<prop,Memory,layout_base> v_prp_tmp;
387
388 //! Per processor number of particle g_opart_sz.get(i) = g_opart.get(i).size()
389 openfpm::vector<size_t> g_opart_sz;
390
391 //! processor rank list of g_opart
392 openfpm::vector<size_t> prc_g_opart;
393
394 //! It store the list of processor that communicate with us (local processor)
395 //! from the last ghost get
396 openfpm::vector<size_t> prc_recv_get_pos;
397 openfpm::vector<size_t> prc_recv_get_prp;
398
399 //! the same as prc_recv_get but for put
400 openfpm::vector<size_t> prc_recv_put;
401
402 //! the same as prc_recv_get but for map
403 openfpm::vector<size_t> prc_recv_map;
404
405 //! It store the size of the elements added for each processor that communicate with us (local processor)
406 //! from the last ghost get
407 openfpm::vector<size_t> recv_sz_get_pos;
408 openfpm::vector<size_t> recv_sz_get_prp;
409 //! Conversion to byte of recv_sz_get
410 openfpm::vector<size_t> recv_sz_get_byte;
411
412
413 //! The same as recv_sz_get but for put
414 openfpm::vector<size_t> recv_sz_put;
415
416 //! The same as recv_sz_get but for map
417 openfpm::vector<size_t> recv_sz_map;
418
419 //! elements sent for each processors (ghost_get)
420 openfpm::vector<size_t> prc_sz_gg;
421
422 //! temporary buffer to processors ids
423 openfpm::vector<aggregate<unsigned int>,
424 Memory,
425 layout_base> proc_id_out;
426
427 //! temporary buffer for the scan result
428 openfpm::vector<aggregate<unsigned int>,
429 Memory,
430 layout_base> starts;
431
432 //! Processor communication size
433 openfpm::vector<aggregate<unsigned int, unsigned int>,Memory,layout_base> prc_offset;
434
435
436 //! Temporary CudaMemory to do stuff
437 CudaMemory mem;
438
439 //! Local ghost marker (across the ghost particles it mark from where we have the)
440 //! replicated ghost particles that are local
441 size_t lg_m;
442
443 //! Sending buffer
444 openfpm::vector_fr<Memory> hsmem;
445
446 //! process the particle with properties
447 template<typename prp_object, int ... prp>
448 struct proc_with_prp
449 {
450 //! process the particle
451 template<typename T1, typename T2> inline static void proc(size_t lbl, size_t cnt, size_t id, T1 & v_prp, T2 & m_prp)
452 {
453 // source object type
454 typedef encapc<1, prop, typename openfpm::vector<prop>::layout_type> encap_src;
455 // destination object type
456 typedef encapc<1, prp_object, typename openfpm::vector<prp_object>::layout_type> encap_dst;
457
458 // Copy only the selected properties
459 object_si_d<encap_src, encap_dst, OBJ_ENCAP, prp...>(v_prp.get(id), m_prp.get(lbl).get(cnt));
460 }
461 };
462
463 /*! \brief Get the number of particles received from each processor during the last ghost_get
464 *
465 *
466 * \param i processor (list index)
467 * \return the number of particles
468 */
469 size_t get_last_ghost_get_received_parts(size_t i)
470 {
471 // If the last ghost_get did not have properties the information about the number of particles
472 // received is in recv_sz_get_ois
473 if (recv_sz_get_prp.size() != 0)
474 {return recv_sz_get_prp.get(i);}
475 else
476 {return recv_sz_get_pos.get(i);}
477 }
478
479 /*! \brief Get the number of processor involved during the last ghost_get
480 *
481 * \return the number of processor
482 */
483 size_t get_last_ghost_get_num_proc()
484 {
485 if (prc_recv_get_prp.size() != 0)
486 {return prc_recv_get_prp.size();}
487 else
488 {return prc_recv_get_pos.size();}
489 }
490
491 /*! \brief Get the number of processor involved during the last ghost_get
492 *
493 * \return the number of processor
494 */
495 openfpm::vector<size_t> & get_last_ghost_get_num_proc_vector()
496 {
497 if (prc_recv_get_prp.size() != 0)
498 {return prc_recv_get_prp;}
499 else
500 {return prc_recv_get_pos;}
501 }
502
503 /*! \brief Calculate sending buffer size for each processor
504 *
505 * \param prc_sz_r processor size
506 * \param prc_r processor ids
507 *
508 */
509 inline void calc_send_buffers(openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,layout_base> & prc_sz,
510 openfpm::vector<size_t> & prc_sz_r,
511 openfpm::vector<size_t> & prc_r,
512 size_t opt)
513 {
514 if (opt & RUN_ON_DEVICE)
515 {
516#ifndef TEST1
517 size_t prev_off = 0;
518 for (size_t i = 0; i < prc_sz.size() ; i++)
519 {
520 if (prc_sz.template get<1>(i) != (unsigned int)-1)
521 {
522 prc_r.add(prc_sz.template get<1>(i));
523 prc_sz_r.add(prc_sz.template get<0>(i) - prev_off);
524 }
525 prev_off = prc_sz.template get<0>(i);
526 }
527#else
528
529 // Calculate the sending buffer size for each processor, put this information in
530 // a contiguous buffer
531
532 for (size_t i = 0; i < v_cl.getProcessingUnits(); i++)
533 {
534 if (prc_sz.template get<0>(i) != 0 && v_cl.rank() != i)
535 {
536 prc_r.add(i);
537 prc_sz_r.add(prc_sz.template get<0>(i));
538 }
539 }
540
541#endif
542 }
543 else
544 {
545 // Calculate the sending buffer size for each processor, put this information in
546 // a contiguous buffer
547
548 p_map_req.resize(v_cl.getProcessingUnits());
549 for (size_t i = 0; i < v_cl.getProcessingUnits(); i++)
550 {
551 if (prc_sz.template get<0>(i) != 0)
552 {
553 p_map_req.get(i) = prc_r.size();
554 prc_r.add(i);
555 prc_sz_r.add(prc_sz.template get<0>(i));
556 }
557 }
558 }
559 }
560
561 //! From which decomposition the shift boxes are calculated
562 long int shift_box_ndec = -1;
563
564 //! this map is used to check if a combination is already present
565 std::unordered_map<size_t, size_t> map_cmb;
566
567 //! The boxes touching the border of the domain are divided in groups (first vector)
568 //! each group contain internal ghost coming from sub-domains of the same section
569 openfpm::vector_std<openfpm::vector_std<Box<dim, St>>> box_f;
570
571 //! The boxes touching the border of the domain + shift vector linearized from where they come from
572 openfpm::vector<Box<dim, St>,Memory,layout_base> box_f_dev;
573 openfpm::vector<aggregate<unsigned int>,Memory,layout_base> box_f_sv;
574
575 //! Store the sector for each group (previous vector)
576 openfpm::vector_std<comb<dim>> box_cmb;
577
578 //! Id of the local particle to replicate for ghost_get
579 openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,layout_base> o_part_loc;
580
581 //! Processor communication size
582 openfpm::vector<aggregate<unsigned int, unsigned int>,Memory,layout_base> prc_sz;
583
584 /*! \brief For every internal ghost box we create a structure that order such internal local ghost box in
585 * shift vectors
586 *
587 */
588 void createShiftBox()
589 {
590 if (shift_box_ndec == (long int)dec.get_ndec())
591 {return;}
592
593 struct sh_box
594 {
595 size_t shift_id;
596
597 unsigned int box_f_sv;
598 Box<dim,St> box_f_dev;
599
600 bool operator<(const sh_box & tmp) const
601 {
602 return shift_id < tmp.shift_id;
603 }
604
605 };
606 openfpm::vector<sh_box> reord_shift;
607 box_f.clear();
608 map_cmb.clear();
609 box_cmb.clear();
610
611 // Add local particles coming from periodic boundary, the only boxes that count are the one
612 // touching the border
613 for (size_t i = 0; i < dec.getNLocalSub(); i++)
614 {
615 size_t Nl = dec.getLocalNIGhost(i);
616
617 for (size_t j = 0; j < Nl; j++)
618 {
619 // If the ghost does not come from the intersection with an out of
620 // border sub-domain the combination is all zero and n_zero return dim
621 if (dec.getLocalIGhostPos(i, j).n_zero() == dim)
622 continue;
623
624 // Check if we already have boxes with such combination
625 auto it = map_cmb.find(dec.getLocalIGhostPos(i, j).lin());
626 if (it == map_cmb.end())
627 {
628 // we do not have it
629 box_f.add();
630 box_f.last().add(dec.getLocalIGhostBox(i, j));
631 box_cmb.add(dec.getLocalIGhostPos(i, j));
632 map_cmb[dec.getLocalIGhostPos(i, j).lin()] = box_f.size() - 1;
633 }
634 else
635 {
636 // we have it
637 box_f.get(it->second).add(dec.getLocalIGhostBox(i, j));
638 }
639
640 reord_shift.add();
641 reord_shift.last().shift_id = dec.getLocalIGhostPos(i, j).lin();
642 reord_shift.last().box_f_dev = dec.getLocalIGhostBox(i, j);
643 reord_shift.last().box_f_sv = dec.convertShift(dec.getLocalIGhostPos(i, j));
644 }
645 }
646
647 // now we sort box_f by shift_id, the reason is that we have to avoid duplicated particles
648 reord_shift.sort();
649
650 box_f_dev.resize(reord_shift.size());
651 box_f_sv.resize(reord_shift.size());
652
653 for (size_t i = 0 ; i < reord_shift.size() ; i++)
654 {
655 box_f_dev.get(i) = reord_shift.get(i).box_f_dev;
656 box_f_sv.template get<0>(i) = reord_shift.get(i).box_f_sv;
657 }
658
659#ifdef CUDA_GPU
660
661 // move box_f_dev and box_f_sv to device
662 box_f_dev.template hostToDevice<0,1>();
663 box_f_sv.template hostToDevice<0>();
664
665#endif
666
667 shift_box_ndec = dec.get_ndec();
668 }
669
670 /*! \brief Local ghost from labeled particles
671 *
672 * \param v_pos vector of particle positions
673 * \param v_prp vector of particles properties
674 * \param opt options
675 *
676 */
677 void local_ghost_from_opart(openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos,
678 openfpm::vector<prop,Memory,layout_base> & v_prp,
679 size_t opt)
680 {
681 // get the shift vectors
682 const openfpm::vector<Point<dim, St>,Memory,layout_base> & shifts = dec.getShiftVectors();
683
684 if (!(opt & NO_POSITION))
685 {
686 if (opt & RUN_ON_DEVICE)
687 {
688 local_ghost_from_opart_impl<true,dim,St,prop,Memory,layout_base,std::is_same<Memory,CudaMemory>::value>
689 ::run(o_part_loc,shifts,v_pos,v_prp,opt);
690 }
691 else
692 {
693 for (size_t i = 0 ; i < o_part_loc.size() ; i++)
694 {
695 size_t lin_id = o_part_loc.template get<1>(i);
696 size_t key = o_part_loc.template get<0>(i);
697
698 Point<dim, St> p = v_pos.get(key);
699 // shift
700 p -= shifts.get(lin_id);
701
702 // add this particle shifting its position
703 v_pos.add(p);
704 v_prp.get(lg_m+i) = v_prp.get(key);
705 }
706 }
707 }
708 else
709 {
710 if (opt & RUN_ON_DEVICE)
711 {
712 local_ghost_from_opart_impl<false,dim,St,prop,Memory,layout_base,std::is_same<Memory,CudaMemory>::value>
713 ::run(o_part_loc,shifts,v_pos,v_prp,opt);
714 }
715 else
716 {
717 for (size_t i = 0 ; i < o_part_loc.size() ; i++)
718 {
719 size_t key = o_part_loc.template get<0>(i);
720
721 v_prp.get(lg_m+i) = v_prp.get(key);
722 }
723 }
724 }
725 }
726
727 /*! \brief Local ghost from decomposition
728 *
729 * \param v_pos vector of particle positions
730 * \param v_prp vector of particle properties
731 * \param g_m ghost marker
732 *
733 */
734 void local_ghost_from_dec(openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos,
735 openfpm::vector<prop,Memory,layout_base> & v_prp,
736 size_t g_m,size_t opt)
737 {
738 o_part_loc.clear();
739
740 // get the shift vectors
741 const openfpm::vector<Point<dim,St>,Memory,layout_base> & shifts = dec.getShiftVectors();
742
743 if (opt & RUN_ON_DEVICE)
744 {
745 local_ghost_from_dec_impl<dim,St,prop,Memory,layout_base,std::is_same<Memory,CudaMemory>::value>
746 ::run(o_part_loc,shifts,box_f_dev,box_f_sv,v_cl,starts,v_pos,v_prp,g_m,opt);
747 }
748 else
749 {
750 // Label the internal (assigned) particles
751 auto it = v_pos.getIteratorTo(g_m);
752
753 while (it.isNext())
754 {
755 auto key = it.get();
756
757 // If particles are inside these boxes
758 for (size_t i = 0; i < box_f.size(); i++)
759 {
760 for (size_t j = 0; j < box_f.get(i).size(); j++)
761 {
762 if (box_f.get(i).get(j).isInsideNP(v_pos.get(key)) == true)
763 {
764 size_t lin_id = dec.convertShift(box_cmb.get(i));
765
766 o_part_loc.add();
767 o_part_loc.template get<0>(o_part_loc.size()-1) = key;
768 o_part_loc.template get<1>(o_part_loc.size()-1) = lin_id;
769
770 Point<dim, St> p = v_pos.get(key);
771 // shift
772 p -= shifts.get(lin_id);
773
774 // add this particle shifting its position
775 v_pos.add(p);
776 v_prp.add();
777 v_prp.last() = v_prp.get(key);
778
779 // boxes in one group can be overlapping
780 // we do not have to search for the other
781 // boxes otherwise we will have duplicate particles
782 //
783 // A small note overlap of boxes across groups is fine
784 // (and needed) because each group has different shift
785 // producing non overlapping particles
786 //
787 break;
788 }
789 }
790 }
791
792 ++it;
793 }
794 }
795 }
796
797 /*! \brief Add local particles based on the boundary conditions
798 *
799 * In order to understand what this function use the following
800 *
801 \verbatim
802
803 [1,1]
804 +---------+------------------------+---------+
805 | (1,-1) | | (1,1) |
806 | | | (1,0) --> 7 | | |
807 | v | | v |
808 | 6 | | 8 |
809 +--------------------------------------------+
810 | | | |
811 | | | |
812 | | | |
813 | (-1,0) | | (1,0) |
814 | | | | | |
815 | v | (0,0) --> 4 | v |
816 | 3 | | 5 |
817 | | | |
818 B | | | A |
819 * | | | * |
820 | | | |
821 | | | |
822 | | | |
823 +--------------------------------------------+
824 | (-1,-1) | | (-1,1) |
825 | | | (-1,0) --> 1 | | |
826 | v | | v |
827 | 0 | | 2 |
828 +---------+------------------------+---------+
829
830
831 \endverbatim
832
833 *
834 * The box is the domain, while all boxes at the border (so not (0,0) ) are the
835 * ghost part at the border of the domain. If a particle A is in the position in figure
836 * a particle B must be created. This function duplicate the particle A, if A and B are
837 * local
838 *
839 * \param v_pos vector of particle of positions
840 * \param v_prp vector of particle properties
841 * \param g_m ghost marker
842 * \param opt options
843 *
844 */
845 void add_loc_particles_bc(openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos,
846 openfpm::vector<prop,Memory,layout_base> & v_prp ,
847 size_t & g_m,
848 size_t opt)
849 {
850 // Create the shift boxes
851 createShiftBox();
852
853 if (!(opt & SKIP_LABELLING))
854 lg_m = v_prp.size();
855
856 if (box_f.size() == 0)
857 return;
858 else
859 {
860 if (opt & SKIP_LABELLING)
861 {local_ghost_from_opart(v_pos,v_prp,opt);}
862 else
863 {local_ghost_from_dec(v_pos,v_prp,g_m,opt);}
864 }
865 }
866
867 /*! \brief This function fill the send buffer for the particle position after the particles has been label with labelParticles
868 *
869 * \param v_pos vector of particle positions
870 * \param g_pos_send Send buffer to fill
871 *
872 */
873 void fill_send_ghost_pos_buf(openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos,
874 openfpm::vector<size_t> & prc_sz,
875 openfpm::vector<send_pos_vector> & g_pos_send,
876 size_t opt,
877 bool async)
878 {
879 // get the shift vectors
880 const openfpm::vector<Point<dim,St>,Memory,layout_base> & shifts = dec.getShiftVectors();
881
882 // create a number of send buffers equal to the near processors
883 g_pos_send.resize(prc_sz.size());
884
885 size_t old_hsmem_size = 0;
886
887 // if we do async
888 if (async == true)
889 {
890 old_hsmem_size = hsmem.size();
891 resize_retained_buffer(hsmem,g_pos_send.size() + hsmem.size());
892 }
893 else
894 {resize_retained_buffer(hsmem,g_pos_send.size());}
895
896 for (size_t i = 0; i < g_pos_send.size(); i++)
897 {
898 // Buffer must retained and survive the destruction of the
899 // vector
900 if (hsmem.get(i+old_hsmem_size).ref() == 0)
901 {hsmem.get(i+old_hsmem_size).incRef();}
902
903 // Set the memory for retain the send buffer
904 g_pos_send.get(i).setMemory(hsmem.get(i+old_hsmem_size));
905
906 // resize the sending vector (No allocation is produced)
907 g_pos_send.get(i).resize(prc_sz.get(i));
908 }
909
910 if (opt & RUN_ON_DEVICE)
911 {
912#if defined(CUDA_GPU) && defined(__NVCC__)
913
914 size_t offset = 0;
915
916 // Fill the sending buffers
917 for (size_t i = 0 ; i < g_pos_send.size() ; i++)
918 {
919 auto ite = g_pos_send.get(i).getGPUIterator();
920
921 CUDA_LAUNCH((process_ghost_particles_pos<dim,decltype(g_opart_device.toKernel()),decltype(g_pos_send.get(i).toKernel()),decltype(v_pos.toKernel()),decltype(shifts.toKernel())>),
922 ite,
923 g_opart_device.toKernel(), g_pos_send.get(i).toKernel(),
924 v_pos.toKernel(),shifts.toKernel(),offset);
925
926 offset += prc_sz.get(i);
927 }
928
929#else
930
931 std::cout << __FILE__ << ":" << __LINE__ << " error RUN_ON_DEVICE require that you compile with NVCC, but it seem compiled with a normal compiler" << std::endl;
932
933#endif
934 }
935 else
936 {
937 // Fill the send buffer
938 for (size_t i = 0; i < g_opart.size(); i++)
939 {
940 for (size_t j = 0; j < g_opart.get(i).size(); j++)
941 {
942 Point<dim, St> s = v_pos.get(g_opart.get(i).template get<0>(j));
943 s -= shifts.get(g_opart.get(i).template get<1>(j));
944 g_pos_send.get(i).set(j, s);
945 }
946 }
947 }
948 }
949
950 /*! \brief This function fill the send buffer for ghost_put
951 *
952 * \tparam send_vector type used to send data
953 * \tparam prp_object object containing only the properties to send
954 * \tparam prp set of properties to send
955 *
956 * \param v_prp vector of particle properties
957 * \param g_send_prp Send buffer to fill
958 * \param g_m ghost marker
959 *
960 */
961 template<typename send_vector, typename prp_object, int ... prp>
962 void fill_send_ghost_put_prp_buf(openfpm::vector<prop,Memory,layout_base> & v_prp,
963 openfpm::vector<send_vector> & g_send_prp,
964 size_t & g_m,
965 size_t opt)
966 {
967 // create a number of send buffers equal to the near processors
968 // from which we received
969
970 // NOTE in some case the information can be in prc_recv_get_pos
971
972 size_t nproc = get_last_ghost_get_num_proc();
973
974 g_send_prp.resize(nproc);
975
976 resize_retained_buffer(hsmem,g_send_prp.size());
977
978 for (size_t i = 0; i < g_send_prp.size(); i++)
979 {
980 // Buffer must retained and survive the destruction of the
981 // vector
982 if (hsmem.get(i).ref() == 0)
983 hsmem.get(i).incRef();
984
985 // Set the memory for retain the send buffer
986 g_send_prp.get(i).setMemory(hsmem.get(i));
987
988 size_t n_part_recv = get_last_ghost_get_received_parts(i);
989
990 // resize the sending vector (No allocation is produced)
991 g_send_prp.get(i).resize(n_part_recv);
992 }
993
994 size_t accum = g_m;
995
996 if (opt & RUN_ON_DEVICE)
997 {
998#if defined(CUDA_GPU) && defined(__NVCC__)
999
1000 if (sizeof...(prp) != 0)
1001 {
1002 // Fill the sending buffers
1003 for (size_t i = 0 ; i < g_send_prp.size() ; i++)
1004 {
1005 size_t n_part_recv = get_last_ghost_get_received_parts(i);
1006
1007 auto ite = g_send_prp.get(i).getGPUIterator();
1008
1009 if (ite.nblocks() == 0) {continue;}
1010
1011 CUDA_LAUNCH((process_ghost_particles_prp_put<decltype(g_send_prp.get(i).toKernel()),decltype(v_prp.toKernel()),prp...>),
1012 ite,
1013 g_send_prp.get(i).toKernel(),
1014 v_prp.toKernel(),accum);
1015
1016 accum = accum + n_part_recv;
1017 }
1018 }
1019
1020#else
1021
1022 std::cout << __FILE__ << ":" << __LINE__ << " error RUN_ON_DEVICE require that you compile with NVCC, but it seem compiled with a normal compiler" << std::endl;
1023
1024#endif
1025 }
1026 else
1027 {
1028 // Fill the send buffer
1029 for (size_t i = 0; i < g_send_prp.size(); i++)
1030 {
1031 size_t j2 = 0;
1032 size_t n_part_recv = get_last_ghost_get_received_parts(i);
1033
1034 for (size_t j = accum; j < accum + n_part_recv; j++)
1035 {
1036 // source object type
1037 typedef encapc<1, prop, typename openfpm::vector<prop,Memory,layout_base>::layout_type> encap_src;
1038 // destination object type
1039 typedef encapc<1, prp_object, typename openfpm::vector<prp_object,Memory,layout_base>::layout_type> encap_dst;
1040
1041 // Copy only the selected properties
1042 object_si_d<encap_src, encap_dst, OBJ_ENCAP, prp...>(v_prp.get(j), g_send_prp.get(i).get(j2));
1043
1044 j2++;
1045 }
1046
1047 accum = accum + n_part_recv;
1048 }
1049 }
1050 }
1051
1052 /*! \brief resize the retained buffer by nbf
1053 *
1054 *
1055 */
1056 void resize_retained_buffer(openfpm::vector_fr<Memory> & rt_buf, size_t nbf)
1057 {
1058 // Release all the buffer that are going to be deleted
1059 for (size_t i = nbf ; i < rt_buf.size() ; i++)
1060 {
1061 rt_buf.get(i).decRef();
1062 }
1063
1064 hsmem.resize(nbf);
1065 }
1066
1067 /*! \brief Set the buffer for each property
1068 *
1069 *
1070 */
1071 template<typename send_vector, typename v_mpl>
1072 struct set_mem_retained_buffers_inte
1073 {
1074 openfpm::vector<send_vector> & g_send_prp;
1075
1076 size_t i;
1077
1078 openfpm::vector_fr<Memory> & hsmem;
1079
1080 size_t j;
1081
1082 set_mem_retained_buffers_inte(openfpm::vector<send_vector> & g_send_prp, size_t i ,
1083 openfpm::vector_fr<Memory> & hsmem, size_t j)
1084 :g_send_prp(g_send_prp),i(i),hsmem(hsmem),j(j)
1085 {}
1086
1087 //! It call the setMemory function for each property
1088 template<typename T>
1089 inline void operator()(T& t)
1090 {
1091 g_send_prp.get(i).template setMemory<T::value>(hsmem.get(j));
1092
1093 j++;
1094 }
1095 };
1096
1097 template<bool inte_or_lin,typename send_vector, typename v_mpl>
1098 struct set_mem_retained_buffers
1099 {
1100 static inline size_t set_mem_retained_buffers_(openfpm::vector<send_vector> & g_send_prp,
1101 openfpm::vector<size_t> & prc_sz,
1102 size_t i,
1103 openfpm::vector_fr<Memory> & hsmem,
1104 size_t j)
1105 {
1106 // Set the memory for retain the send buffer
1107 g_send_prp.get(i).setMemory(hsmem.get(j));
1108
1109 // resize the sending vector (No allocation is produced)
1110 g_send_prp.get(i).resize(prc_sz.get(i));
1111
1112 return j+1;
1113 }
1114 };
1115
1116 template<typename send_vector, typename v_mpl>
1117 struct set_mem_retained_buffers<true,send_vector,v_mpl>
1118 {
1119 static inline size_t set_mem_retained_buffers_(openfpm::vector<send_vector> & g_send_prp,
1120 openfpm::vector<size_t> & prc_sz,
1121 size_t i,
1122 openfpm::vector_fr<Memory> & hsmem,
1123 size_t j)
1124 {
1125 set_mem_retained_buffers_inte<send_vector,v_mpl> smrbi(g_send_prp,i,hsmem,j);
1126
1127 boost::mpl::for_each_ref<boost::mpl::range_c<int,0,boost::mpl::size<v_mpl>::type::value>>(smrbi);
1128
1129 // if we do not send properties do not reallocate
1130 if (boost::mpl::size<v_mpl>::type::value != 0)
1131 {
1132 // resize the sending vector (No allocation is produced)
1133 g_send_prp.get(i).resize(prc_sz.get(i));
1134 }
1135
1136 return smrbi.j;
1137 }
1138 };
1139
1140 /*! \brief This function fill the send buffer for properties after the particles has been label with labelParticles
1141 *
1142 * \tparam send_vector type used to send data
1143 * \tparam prp_object object containing only the properties to send
1144 * \tparam prp set of properties to send
1145 *
1146 * \param v_prp vector of particle properties
1147 * \param g_send_prp Send buffer to fill
1148 *
1149 */
1150 template<typename send_vector, typename prp_object, int ... prp>
1151 void fill_send_ghost_prp_buf(openfpm::vector<prop,Memory,layout_base> & v_prp,
1152 openfpm::vector<size_t> & prc_sz,
1153 openfpm::vector<send_vector> & g_send_prp,
1154 size_t opt)
1155 {
1156 size_t factor = 1;
1157
1158 typedef typename to_boost_vmpl<prp...>::type v_mpl;
1159
1160 if (is_layout_inte<layout_base<prop>>::value == true) {factor *= sizeof...(prp);}
1161
1162 // create a number of send buffers equal to the near processors
1163 g_send_prp.resize(prc_sz.size());
1164
1165 resize_retained_buffer(hsmem,g_send_prp.size()*factor);
1166
1167 for (size_t i = 0; i < hsmem.size(); i++)
1168 {
1169 // Buffer must retained and survive the destruction of the
1170 // vector
1171 if (hsmem.get(i).ref() == 0)
1172 {hsmem.get(i).incRef();}
1173 }
1174
1175 size_t j = 0;
1176 for (size_t i = 0; i < g_send_prp.size(); i++)
1177 {
1178 j = set_mem_retained_buffers<is_layout_inte<layout_base<prop>>::value,send_vector,v_mpl>::set_mem_retained_buffers_(g_send_prp,prc_sz,i,hsmem,j);
1179 }
1180
1181 if (opt & RUN_ON_DEVICE)
1182 {
1183#if defined(CUDA_GPU) && defined(__NVCC__)
1184
1185 size_t offset = 0;
1186
1187 if (sizeof...(prp) != 0)
1188 {
1189 // Fill the sending buffers
1190 for (size_t i = 0 ; i < g_send_prp.size() ; i++)
1191 {
1192 auto ite = g_send_prp.get(i).getGPUIterator();
1193
1194 CUDA_LAUNCH((process_ghost_particles_prp<decltype(g_opart_device.toKernel()),decltype(g_send_prp.get(i).toKernel()),decltype(v_prp.toKernel()),prp...>),
1195 ite,
1196 g_opart_device.toKernel(), g_send_prp.get(i).toKernel(),
1197 v_prp.toKernel(),offset);
1198
1199 offset += prc_sz.get(i);
1200 }
1201 }
1202
1203#else
1204
1205 std::cout << __FILE__ << ":" << __LINE__ << " error RUN_ON_DEVICE require that you compile with NVCC, but it seem compiled with a normal compiler" << std::endl;
1206
1207#endif
1208 }
1209 else
1210 {
1211 // if no properties must be sent skip this step
1212 if (sizeof...(prp) == 0) {return;}
1213
1214 // Fill the send buffer
1215 for (size_t i = 0; i < g_opart.size(); i++)
1216 {
1217 for (size_t j = 0; j < g_opart.get(i).size(); j++)
1218 {
1219 // source object type
1220 typedef decltype(v_prp.get(g_opart.get(i).template get<0>(j))) encap_src;
1221 // destination object type
1222 typedef decltype(g_send_prp.get(i).get(j)) encap_dst;
1223
1224 // Copy only the selected properties
1225 object_si_d<encap_src, encap_dst, OBJ_ENCAP, prp...>(v_prp.get(g_opart.get(i).template get<0>(j)), g_send_prp.get(i).get(j));
1226 }
1227 }
1228 }
1229 }
1230
1231 /*! \brief allocate and fill the send buffer for the map function
1232 *
1233 * \param v_pos vector of particle positions
1234 * \param v_prp vector of particles properties
1235 * \param prc_sz_r For each processor in the list the size of the message to send
1236 * \param m_pos sending buffer for position
1237 * \param m_prp sending buffer for properties
1238 * \param offset from where start the list of the particles that migrate in o_part
1239 * This parameter is used only in case of RUN_ON_DEVICE option
1240 *
1241 */
1242 void fill_send_map_buf(openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos,
1243 openfpm::vector<prop,Memory,layout_base> & v_prp,
1244 openfpm::vector<size_t> & prc_sz_r,
1245 openfpm::vector<size_t> & prc_r,
1246 openfpm::vector<openfpm::vector<Point<dim,St>,Memory,layout_base,openfpm::grow_policy_identity>> & m_pos,
1247 openfpm::vector<openfpm::vector<prop,Memory,layout_base,openfpm::grow_policy_identity>> & m_prp,
1248 openfpm::vector<aggregate<unsigned int, unsigned int>,Memory,layout_base> & prc_sz,
1249 size_t opt)
1250 {
1251 m_prp.resize(prc_sz_r.size());
1252 m_pos.resize(prc_sz_r.size());
1253 openfpm::vector<size_t> cnt(prc_sz_r.size());
1254
1255 for (size_t i = 0; i < prc_sz_r.size() ; i++)
1256 {
1257 // set the size and allocate, using mem warant that pos and prp is contiguous
1258 m_pos.get(i).resize(prc_sz_r.get(i));
1259 m_prp.get(i).resize(prc_sz_r.get(i));
1260 cnt.get(i) = 0;
1261 }
1262
1263 if (opt & RUN_ON_DEVICE)
1264 {
1265 if (v_cl.size() == 1)
1266 {return;}
1267
1268#if defined(CUDA_GPU) && defined(__NVCC__)
1269
1270 // The first part of m_opart and prc_sz contain the local particles
1271
1272 #ifndef TEST1
1273
1274 v_pos_tmp.resize(prc_sz.template get<0>(0));
1275 v_prp_tmp.resize(prc_sz.template get<0>(0));
1276
1277 auto ite = v_pos_tmp.getGPUIterator();
1278
1279 // fill v_pos_tmp and v_prp_tmp with local particles
1280 process_map_particles<decltype(m_opart.toKernel()),decltype(v_pos_tmp.toKernel()),decltype(v_prp_tmp.toKernel()),
1281 decltype(v_pos.toKernel()),decltype(v_prp.toKernel())>
1282 <<<ite.wthr,ite.thr>>>
1283 (m_opart.toKernel(),v_pos_tmp.toKernel(), v_prp_tmp.toKernel(),
1284 v_pos.toKernel(),v_prp.toKernel(),0);
1285
1286 size_t offset = prc_sz.template get<0>(0);
1287
1288 // Fill the sending buffers
1289 for (size_t i = 0 ; i < m_pos.size() ; i++)
1290 {
1291 auto ite = m_pos.get(i).getGPUIterator();
1292
1293 process_map_particles<decltype(m_opart.toKernel()),decltype(m_pos.get(i).toKernel()),decltype(m_prp.get(i).toKernel()),
1294 decltype(v_pos.toKernel()),decltype(v_prp.toKernel())>
1295 <<<ite.wthr,ite.thr>>>
1296 (m_opart.toKernel(),m_pos.get(i).toKernel(), m_prp.get(i).toKernel(),
1297 v_pos.toKernel(),v_prp.toKernel(),offset);
1298
1299 offset += prc_sz_r.size();
1300 }
1301
1302 // old local particles with the actual local particles
1303 v_pos_tmp.swap(v_pos);
1304 v_prp_tmp.swap(v_prp);
1305
1306 #else
1307
1308 int rank = v_cl.rank();
1309
1310 v_pos_tmp.resize(prc_sz.template get<0>(rank));
1311 v_prp_tmp.resize(prc_sz.template get<0>(rank));
1312
1313 auto ite = v_pos_tmp.getGPUIterator();
1314
1315 starts.template deviceToHost<0>();
1316 size_t offset = starts.template get<0>(rank);
1317
1318 // no work to do
1319 if (ite.wthr.x != 0)
1320 {
1321 // fill v_pos_tmp and v_prp_tmp with local particles
1322 CUDA_LAUNCH((process_map_particles<decltype(m_opart.toKernel()),decltype(v_pos_tmp.toKernel()),decltype(v_prp_tmp.toKernel()),
1323 decltype(v_pos.toKernel()),decltype(v_prp.toKernel())>),
1324 ite,
1325 m_opart.toKernel(),v_pos_tmp.toKernel(), v_prp_tmp.toKernel(),
1326 v_pos.toKernel(),v_prp.toKernel(),offset);
1327 }
1328
1329 // Fill the sending buffers
1330 for (size_t i = 0 ; i < m_pos.size() ; i++)
1331 {
1332 size_t offset = starts.template get<0>(prc_r.template get<0>(i));
1333
1334 auto ite = m_pos.get(i).getGPUIterator();
1335
1336 // no work to do
1337 if (ite.wthr.x != 0)
1338 {
1339
1340 CUDA_LAUNCH((process_map_particles<decltype(m_opart.toKernel()),decltype(m_pos.get(i).toKernel()),decltype(m_prp.get(i).toKernel()),
1341 decltype(v_pos.toKernel()),decltype(v_prp.toKernel())>),
1342 ite,
1343 m_opart.toKernel(),m_pos.get(i).toKernel(), m_prp.get(i).toKernel(),
1344 v_pos.toKernel(),v_prp.toKernel(),offset);
1345
1346 }
1347 }
1348
1349 // old local particles with the actual local particles
1350 v_pos_tmp.swap(v_pos);
1351 v_prp_tmp.swap(v_prp);
1352
1353 #endif
1354#else
1355
1356 std::cout << __FILE__ << ":" << __LINE__ << " error RUN_ON_DEVICE require that you compile with NVCC, but it seem compiled with a normal compiler" << std::endl;
1357
1358#endif
1359 }
1360 else
1361 {
1362 // end vector point
1363 long int id_end = v_pos.size();
1364
1365 // end opart point
1366 long int end = m_opart.size()-1;
1367
1368 // Run through all the particles and fill the sending buffer
1369 for (size_t i = 0; i < m_opart.size(); i++)
1370 {
1371 process_map_particle<proc_without_prp>(i,end,id_end,m_opart,p_map_req,m_pos,m_prp,v_pos,v_prp,cnt);
1372 }
1373
1374 v_pos.resize(v_pos.size() - m_opart.size());
1375 v_prp.resize(v_prp.size() - m_opart.size());
1376 }
1377 }
1378
1379
1380 /*! \brief allocate and fill the send buffer for the map function
1381 *
1382 * \tparam prp_object object type to send
1383 * \tparam prp properties to send
1384 *
1385 * \param v_pos vector of particle positions
1386 * \param v_prp vector of particle properties
1387 * \param prc_sz_r number of particles to send for each processor
1388 * \param m_pos sending buffer for position
1389 * \param m_prp sending buffer for properties
1390 *
1391 */
1392 template<typename prp_object,int ... prp>
1393 void fill_send_map_buf_list(openfpm::vector<Point<dim, St>> & v_pos,
1394 openfpm::vector<prop,Memory,layout_base> & v_prp,
1395 openfpm::vector<size_t> & prc_sz_r,
1396 openfpm::vector<openfpm::vector<Point<dim,St>>> & m_pos,
1397 openfpm::vector<openfpm::vector<prp_object>> & m_prp)
1398 {
1399 m_prp.resize(prc_sz_r.size());
1400 m_pos.resize(prc_sz_r.size());
1401 openfpm::vector<size_t> cnt(prc_sz_r.size());
1402
1403 for (size_t i = 0; i < prc_sz_r.size(); i++)
1404 {
1405 // set the size and allocate, using mem warant that pos and prp is contiguous
1406 m_pos.get(i).resize(prc_sz_r.get(i));
1407 m_prp.get(i).resize(prc_sz_r.get(i));
1408 cnt.get(i) = 0;
1409 }
1410
1411 // end vector point
1412 long int id_end = v_pos.size();
1413
1414 // end opart point
1415 long int end = m_opart.size()-1;
1416
1417 // Run through all the particles and fill the sending buffer
1418 for (size_t i = 0; i < m_opart.size(); i++)
1419 {
1420 process_map_particle<proc_with_prp<prp_object,prp...>>(i,end,id_end,m_opart,p_map_req,m_pos,m_prp,v_pos,v_prp,cnt);
1421 }
1422
1423 v_pos.resize(v_pos.size() - m_opart.size());
1424 v_prp.resize(v_prp.size() - m_opart.size());
1425 }
1426
1427 /*! \brief Label particles for mappings
1428 *
1429 * \param v_pos vector of particle positions
1430 * \param lbl_p Particle labeled
1431 * \param prc_sz For each processor the number of particles to send
1432 * \param opt options
1433 *
1434 */
1435 template<typename obp> void labelParticleProcessor(openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos,
1436 openfpm::vector<aggregate<int,int,int>,
1437 Memory,
1438 layout_base> & lbl_p,
1439 openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,layout_base> & prc_sz,
1440 size_t opt)
1441 {
1442 if (opt == RUN_ON_DEVICE)
1443 {
1444#ifdef __NVCC__
1445
1446 // Map directly on gpu
1447
1448 lbl_p.resize(v_pos.size());
1449
1450 // labelling kernel
1451
1452 prc_sz.template fill<0>(0);
1453
1454 auto ite = v_pos.getGPUIterator();
1455 if (ite.wthr.x == 0)
1456 {
1457 starts.resize(v_cl.size());
1458 starts.template fill<0>(0);
1459 return;
1460 }
1461
1462 // we have one process we can skip ...
1463 if (v_cl.size() == 1)
1464 {
1465 // ... but we have to apply the boundary conditions
1466
1467 periodicity_int<dim> bc;
1468
1469 for (size_t i = 0 ; i < dim ; i++) {bc.bc[i] = dec.periodicity(i);}
1470
1471 CUDA_LAUNCH((apply_bc_each_part<dim,St,decltype(v_pos.toKernel())>),ite,dec.getDomain(),bc,v_pos.toKernel());
1472
1473 return;
1474 }
1475
1476 // label particle processor
1477 CUDA_LAUNCH((process_id_proc_each_part<dim,St,decltype(dec.toKernel()),decltype(v_pos.toKernel()),decltype(lbl_p.toKernel()),decltype(prc_sz.toKernel())>),
1478 ite,
1479 dec.toKernel(),v_pos.toKernel(),lbl_p.toKernel(),prc_sz.toKernel(),v_cl.rank());
1480
1481
1482 #ifndef TEST1
1483
1484 // sort particles
1485 mergesort((int *)lbl_p.template getDeviceBuffer<1>(),(int *)lbl_p.template getDeviceBuffer<0>(), lbl_p.size(), mgpu::template less_t<int>(), v_cl.getmgpuContext());
1486
1487 mem.allocate(sizeof(int));
1488 mem.fill(0);
1489
1490 // Find the buffer bases
1491 find_buffer_offsets<1,decltype(lbl_p.toKernel()),decltype(prc_sz.toKernel())><<<ite.wthr,ite.thr>>>
1492 (lbl_p.toKernel(),(int *)mem.getDevicePointer(),prc_sz.toKernel());
1493
1494#error "should not be here"
1495
1496 // Trasfer the number of offsets on CPU
1497 mem.deviceToHost();
1498 prc_sz.template deviceToHost<0,1>();
1499 // get also the last element from lbl_p;
1500 lbl_p.template deviceToHost<1>(lbl_p.size()-1,lbl_p.size()-1);
1501
1502 mem.deviceToHost();
1503 int noff = *(int *)mem.getPointer();
1504 prc_sz.resize(noff+1);
1505 prc_sz.template get<0>(prc_sz.size()-1) = lbl_p.size();
1506 prc_sz.template get<1>(prc_sz.size()-1) = lbl_p.template get<1>(lbl_p.size()-1);
1507
1508 #else
1509
1510 starts.resize(v_cl.size());
1511 openfpm::scan((unsigned int *)prc_sz.template getDeviceBuffer<0>(), prc_sz.size(), (unsigned int *)starts.template getDeviceBuffer<0>() , v_cl.getmgpuContext());
1512
1513 // move prc_sz to host
1514 prc_sz.template deviceToHost<0>();
1515
1516 ite = lbl_p.getGPUIterator();
1517
1518 // we order lbl_p
1519 CUDA_LAUNCH((reorder_lbl<decltype(lbl_p.toKernel()),decltype(starts.toKernel())>),ite,lbl_p.toKernel(),starts.toKernel());
1520
1521 #endif
1522
1523#else
1524
1525 std::cout << __FILE__ << ":" << __LINE__ << " error, it seems you tried to call map with RUN_ON_DEVICE option, this requires to compile the program with NVCC" << std::endl;
1526
1527#endif
1528 }
1529 else
1530 {
1531 // reset lbl_p
1532 lbl_p.clear();
1533 prc_sz_gg.clear();
1534 o_part_loc.clear();
1535 g_opart.clear();
1536 prc_g_opart.clear();
1537
1538 // resize the label buffer
1539 prc_sz.template fill<0>(0);
1540
1541 auto it = v_pos.getIterator();
1542
1543 // Label all the particles with the processor id where they should go
1544 while (it.isNext())
1545 {
1546 auto key = it.get();
1547
1548 // Apply the boundary conditions
1549 dec.applyPointBC(v_pos.get(key));
1550
1551 size_t p_id = 0;
1552
1553 // Check if the particle is inside the domain
1554 if (dec.getDomain().isInside(v_pos.get(key)) == true)
1555 {p_id = dec.processorID(v_pos.get(key));}
1556 else
1557 {p_id = obp::out(key, v_cl.getProcessUnitID());}
1558
1559 // Particle to move
1560 if (p_id != v_cl.getProcessUnitID())
1561 {
1562 if ((long int) p_id != -1)
1563 {
1564 prc_sz.template get<0>(p_id)++;
1565 lbl_p.add();
1566 lbl_p.last().template get<0>() = key;
1567 lbl_p.last().template get<2>() = p_id;
1568 }
1569 else
1570 {
1571 lbl_p.add();
1572 lbl_p.last().template get<0>() = key;
1573 lbl_p.last().template get<2>() = p_id;
1574 }
1575 }
1576
1577 // Add processors and add size
1578
1579 ++it;
1580 }
1581 }
1582 }
1583
1584 /*! \brief Label the particles
1585 *
1586 * It count the number of particle to send to each processors and save its ids
1587 *
1588 * \see nn_prcs::getShiftvectors()
1589 *
1590 * \param v_pos vector of particle positions
1591 * \param v_prp vector of particle properties
1592 * \param prc for each particle it label the processor id (the owner of the particle, or where it should go the particle)
1593 * \param g_m ghost marker
1594 * \param opt ghost_get options
1595 *
1596 */
1597 void labelParticlesGhost(openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos,
1598 openfpm::vector<prop,Memory,layout_base> & v_prp,
1599 openfpm::vector<size_t> & prc,
1600 openfpm::vector<size_t> & prc_sz,
1601 openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,layout_base> & prc_offset,
1602 size_t & g_m,
1603 size_t opt)
1604 {
1605 // Buffer that contain for each processor the id of the particle to send
1606 prc_sz.clear();
1607 g_opart.clear();
1608 g_opart.resize(dec.getNNProcessors());
1609 prc_g_opart.clear();
1610
1611 if (opt & RUN_ON_DEVICE)
1612 {
1613 labelParticlesGhost_impl<dim,St,prop,Memory,layout_base,
1614 Decomposition,std::is_same<Memory,CudaMemory>::value>
1615 ::run(mem,dec,g_opart_device,proc_id_out,starts,v_cl,v_pos,v_prp,prc,prc_sz,prc_offset,g_m,opt);
1616 }
1617 else
1618 {
1619 // Iterate over all particles
1620 auto it = v_pos.getIteratorTo(g_m);
1621 while (it.isNext())
1622 {
1623 auto key = it.get();
1624
1625 // Given a particle, it return which processor require it (first id) and shift id, second id
1626 // For an explanation about shifts vectors please consult getShiftVector in ie_ghost
1627 const openfpm::vector<std::pair<size_t, size_t>> & vp_id = dec.template ghost_processorID_pair<typename Decomposition::lc_processor_id, typename Decomposition::shift_id>(v_pos.get(key), UNIQUE);
1628
1629 for (size_t i = 0; i < vp_id.size(); i++)
1630 {
1631 // processor id
1632 size_t p_id = vp_id.get(i).first;
1633
1634 // add particle to communicate
1635 g_opart.get(p_id).add();
1636 g_opart.get(p_id).last().template get<0>() = key;
1637 g_opart.get(p_id).last().template get<1>() = vp_id.get(i).second;
1638 }
1639
1640 ++it;
1641 }
1642
1643 // remove all zero entry and construct prc (the list of the sending processors)
1644 openfpm::vector<openfpm::vector<aggregate<size_t,size_t>>> g_opart_f;
1645
1646 // count the non zero element
1647 for (size_t i = 0 ; i < g_opart.size() ; i++)
1648 {
1649 if (g_opart.get(i).size() != 0)
1650 {
1651 prc_sz.add(g_opart.get(i).size());
1652 g_opart_f.add();
1653 g_opart.get(i).swap(g_opart_f.last());
1654 prc.add(dec.IDtoProc(i));
1655 }
1656 }
1657
1658 g_opart.swap(g_opart_f);
1659 }
1660#ifdef EXTREA_TRACE_PRE_COMM
1661 Extrae_user_function (0);
1662#endif
1663 }
1664
1665 /*! \brief Call-back to allocate buffer to receive incoming elements (particles)
1666 *
1667 * \param msg_i size required to receive the message from i
1668 * \param total_msg total size to receive from all the processors
1669 * \param total_p the total number of processor that want to communicate with you
1670 * \param i processor id
1671 * \param ri request id (it is an id that goes from 0 to total_p, and is unique
1672 * every time message_alloc is called)
1673 * \param ptr a pointer to the vector_dist structure
1674 *
1675 * \return the pointer where to store the message for the processor i
1676 *
1677 */
1678 static void * message_alloc_map(size_t msg_i, size_t total_msg, size_t total_p, size_t i, size_t ri, void * ptr)
1679 {
1680 // cast the pointer
1681 vector_dist_comm<dim, St, prop, Decomposition, Memory, layout_base> * vd = static_cast<vector_dist_comm<dim, St, prop, Decomposition, Memory, layout_base> *>(ptr);
1682
1683 vd->recv_mem_gm.resize(vd->v_cl.getProcessingUnits());
1684 vd->recv_mem_gm.get(i).resize(msg_i);
1685
1686 return vd->recv_mem_gm.get(i).getPointer();
1687 }
1688
1689public:
1690
1691 /*! \brief Copy Constructor
1692 *
1693 * \param v vector to copy
1694 *
1695 */
1696 vector_dist_comm(const vector_dist_comm<dim,St,prop,Decomposition,Memory,layout_base> & v)
1697 :v_cl(create_vcluster<Memory>()),dec(create_vcluster()),lg_m(0)
1698 {
1699 this->operator=(v);
1700 }
1701
1702
1703 /*! \brief Constructor
1704 *
1705 * \param dec Domain decompositon
1706 *
1707 */
1708 vector_dist_comm(const Decomposition & dec)
1709 :v_cl(create_vcluster<Memory>()),dec(dec),lg_m(0)
1710 {
1711
1712 }
1713
1714 /*! \brief Constructor
1715 *
1716 * \param dec Domain decompositon
1717 *
1718 */
1719 vector_dist_comm(Decomposition && dec)
1720 :v_cl(create_vcluster<Memory>()),dec(dec),lg_m(0)
1721 {
1722
1723 }
1724
1725 /*! \brief Constructor
1726 *
1727 */
1728 vector_dist_comm()
1729 :v_cl(create_vcluster<Memory>()),dec(create_vcluster()),lg_m(0)
1730 {
1731 }
1732
1733 /*! \brief Destructor
1734 *
1735 * Release the retained buffer
1736 *
1737 */
1738 ~vector_dist_comm()
1739 {
1740 for (size_t i = 0 ; i < hsmem.size() ; i++)
1741 {
1742 if (hsmem.get(i).ref() == 1)
1743 hsmem.get(i).decRef();
1744 else
1745 std::cout << __FILE__ << ":" << __LINE__ << " internal error memory is in an invalid state " << std::endl;
1746 }
1747
1748 }
1749
1750 /*! \brief Get the number of minimum sub-domain per processor
1751 *
1752 * \return minimum number
1753 *
1754 */
1755 size_t getDecompositionGranularity()
1756 {
1757 return v_sub_unit_factor;
1758 }
1759
1760 /*! \brief Set the minimum number of sub-domain per processor
1761 *
1762 * \param n_sub
1763 *
1764 */
1765 void setDecompositionGranularity(size_t n_sub)
1766 {
1767 this->v_sub_unit_factor = n_sub;
1768 }
1769
1770 /*! \brief Initialize the decomposition
1771 *
1772 * \param box domain
1773 * \param bc boundary conditions
1774 * \param g ghost extension
1775 * \param opt additional options
1776 *
1777 */
1778 void init_decomposition(Box<dim,St> & box,
1779 const size_t (& bc)[dim],
1780 const Ghost<dim,St> & g,
1781 size_t opt,
1782 const grid_sm<dim,void> & gdist)
1783 {
1784 size_t div[dim];
1785
1786 if (opt & BIND_DEC_TO_GHOST)
1787 {
1788 // padding
1789 size_t pad = 0;
1790
1791 // CellDecomposer
1792 CellDecomposer_sm<dim,St,shift<dim,St>> cd_sm;
1793
1794 // Calculate the divisions for the symmetric Cell-lists
1795 cl_param_calculateSym<dim,St>(box,cd_sm,g,pad);
1796
1797 for (size_t i = 0 ; i < dim ; i++)
1798 {div[i] = cd_sm.getDiv()[i] - 2*pad;}
1799
1800 // Create the sub-domains
1801 dec.setParameters(div, box, bc, g, gdist);
1802 }
1803 else
1804 {
1805 dec.setGoodParameters(box, bc, g, getDecompositionGranularity(), gdist);
1806 }
1807 dec.decompose();
1808 }
1809
1810 /*! \brief Initialize the decomposition
1811 *
1812 * \param box domain
1813 * \param bc boundary conditions
1814 * \param g ghost extension
1815 * \param opt additional options
1816 *
1817 */
1818 void init_decomposition_gr_cell(Box<dim,St> & box,
1819 const size_t (& bc)[dim],
1820 const Ghost<dim,St> & g,
1821 size_t opt,
1822 const grid_sm<dim,void> & gdist)
1823 {
1824 size_t div[dim];
1825
1826 for (size_t i = 0 ; i < dim ; i++)
1827 {div[i] = gdist.size(i);}
1828
1829 // Create the sub-domains
1830 dec.setParameters(div, box, bc, g);
1831
1832 dec.decompose();
1833 }
1834
1835 /*! \brief It synchronize the properties and position of the ghost particles
1836 *
1837 * \tparam prp list of properties to get synchronize
1838 *
1839 * \param opt options WITH_POSITION, it send also the positional information of the particles
1840 * \param v_pos vector of position to update
1841 * \param v_prp vector of properties to update
1842 * \param g_m marker between real and ghost particles
1843 *
1844 */
1845 template<unsigned int impl, int ... prp> inline void ghost_get_(openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos,
1846 openfpm::vector<prop,Memory,layout_base> & v_prp,
1847 size_t & g_m,
1848 size_t opt = WITH_POSITION)
1849 {
1850#ifdef PROFILE_SCOREP
1851 SCOREP_USER_REGION("ghost_get",SCOREP_USER_REGION_TYPE_FUNCTION)
1852#endif
1853
1854 // Sending property object
1855 typedef object<typename object_creator<typename prop::type, prp...>::type> prp_object;
1856
1857 // send vector for each processor
1858 typedef openfpm::vector<prp_object,Memory,layout_base,openfpm::grow_policy_identity> send_vector;
1859
1860 if (!(opt & NO_POSITION))
1861 {v_pos.resize(g_m);}
1862
1863 // reset the ghost part
1864
1865 if (!(opt & SKIP_LABELLING))
1866 {v_prp.resize(g_m);}
1867
1868 // Label all the particles
1869 if ((opt & SKIP_LABELLING) == false)
1870 {labelParticlesGhost(v_pos,v_prp,prc_g_opart,prc_sz_gg,prc_offset,g_m,opt);}
1871
1872 {
1873 // Send and receive ghost particle information
1874 openfpm::vector<send_vector> g_send_prp;
1875
1876 fill_send_ghost_prp_buf<send_vector, prp_object, prp...>(v_prp,prc_sz_gg,g_send_prp,opt);
1877
1878 #if defined(CUDA_GPU) && defined(__NVCC__)
1879 cudaDeviceSynchronize();
1880 #endif
1881
1882 // if there are no properties skip
1883 // SSendRecvP send everything when we do not give properties
1884
1885 ghost_exchange_comm_impl<impl,layout_base,prp ...>::template
1886 sendrecv_prp(v_cl,g_send_prp,v_prp,v_pos,prc_g_opart,
1887 prc_recv_get_prp,recv_sz_get_prp,recv_sz_get_byte,g_opart_sz,g_m,opt);
1888 }
1889
1890 if (!(opt & NO_POSITION))
1891 {
1892 // Sending buffer for the ghost particles position
1893 openfpm::vector<send_pos_vector> g_pos_send;
1894
1895 fill_send_ghost_pos_buf(v_pos,prc_sz_gg,g_pos_send,opt,impl == GHOST_ASYNC);
1896
1897#if defined(CUDA_GPU) && defined(__NVCC__)
1898 cudaDeviceSynchronize();
1899#endif
1900
1901 ghost_exchange_comm_impl<impl,layout_base,prp ...>::template
1902 sendrecv_pos(v_cl,g_pos_send,v_prp,v_pos,prc_recv_get_pos,recv_sz_get_pos,prc_g_opart,opt);
1903
1904 // fill g_opart_sz
1905 g_opart_sz.resize(prc_g_opart.size());
1906
1907 for (size_t i = 0 ; i < prc_g_opart.size() ; i++)
1908 g_opart_sz.get(i) = g_pos_send.get(i).size();
1909 }
1910
1911 // Important to ensure that the number of particles in v_prp must be equal to v_pos
1912 // Note that if we do not give properties sizeof...(prp) == 0 in general at this point
1913 // v_prp.size() != v_pos.size()
1914 if (!(opt & SKIP_LABELLING))
1915 {
1916 v_prp.resize(v_pos.size());
1917 }
1918
1919 add_loc_particles_bc(v_pos,v_prp,g_m,opt);
1920 }
1921
1922 /*! \brief It synchronize the properties and position of the ghost particles
1923 *
1924 * \tparam prp list of properties to get synchronize
1925 *
1926 * \param opt options WITH_POSITION, it send also the positional information of the particles
1927 * \param v_pos vector of position to update
1928 * \param v_prp vector of properties to update
1929 * \param g_m marker between real and ghost particles
1930 *
1931 */
1932 template<int ... prp> inline void ghost_wait_(openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos,
1933 openfpm::vector<prop,Memory,layout_base> & v_prp,
1934 size_t & g_m,
1935 size_t opt = WITH_POSITION)
1936 {
1937 // Sending property object
1938 typedef object<typename object_creator<typename prop::type, prp...>::type> prp_object;
1939
1940 // send vector for each processor
1941 typedef openfpm::vector<prp_object,Memory,layout_base,openfpm::grow_policy_identity> send_vector;
1942
1943 // Send and receive ghost particle information
1944 openfpm::vector<send_vector> g_send_prp;
1945 openfpm::vector<send_pos_vector> g_pos_send;
1946
1947 ghost_exchange_comm_impl<GHOST_ASYNC,layout_base,prp ...>::template
1948 sendrecv_prp_wait(v_cl,g_send_prp,v_prp,v_pos,prc_g_opart,
1949 prc_recv_get_prp,recv_sz_get_prp,recv_sz_get_byte,g_opart_sz,g_m,opt);
1950
1951
1952 ghost_exchange_comm_impl<GHOST_ASYNC,layout_base,prp ...>::template
1953 sendrecv_pos_wait(v_cl,g_pos_send,v_prp,v_pos,prc_recv_get_pos,recv_sz_get_pos,prc_g_opart,opt);
1954 }
1955
1956 /*! \brief It move all the particles that does not belong to the local processor to the respective processor
1957 *
1958 * \tparam out of bound policy it specify what to do when the particles are detected out of bound
1959 *
1960 * In general this function is called after moving the particles to move the
1961 * elements out the local processor. Or just after initialization if each processor
1962 * contain non local particles
1963 *
1964 * \tparam prp properties to communicate
1965 *
1966 * \param v_pos vector of particle positions
1967 * \param v_prp vector of particle properties
1968 * \param g_m ghost marker
1969 * \param opt options
1970 *
1971 */
1972 template<unsigned int ... prp> void map_list_(openfpm::vector<Point<dim, St>> & v_pos, openfpm::vector<prop> & v_prp, size_t & g_m, size_t opt)
1973 {
1974 if (opt & RUN_ON_DEVICE)
1975 {
1976 std::cout << "Error: " << __FILE__ << ":" << __LINE__ << " map_list is unsupported on device (coming soon)" << std::endl;
1977 return;
1978 }
1979
1980 typedef KillParticle obp;
1981
1982 // Processor communication size
1983 openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,layout_base> prc_sz(v_cl.getProcessingUnits());
1984
1985 // map completely reset the ghost part
1986 v_pos.resize(g_m);
1987 v_prp.resize(g_m);
1988
1989 // m_opart, Contain the processor id of each particle (basically where they have to go)
1990 labelParticleProcessor<obp>(v_pos,m_opart, prc_sz,opt);
1991
1992 // Calculate the sending buffer size for each processor, put this information in
1993 // a contiguous buffer
1994 p_map_req.resize(v_cl.getProcessingUnits());
1995 openfpm::vector<size_t> prc_sz_r;
1996 openfpm::vector<size_t> prc_r;
1997
1998 for (size_t i = 0; i < v_cl.getProcessingUnits(); i++)
1999 {
2000 if (prc_sz.template get<0>(i) != 0)
2001 {
2002 p_map_req.get(i) = prc_r.size();
2003 prc_r.add(i);
2004 prc_sz_r.add(prc_sz.template get<0>(i));
2005 }
2006 }
2007
2008 if (opt & MAP_LOCAL)
2009 {
2010 // if the map is local we indicate that we receive only from the neighborhood processors
2011
2012 prc_recv_map.clear();
2013 for (size_t i = 0 ; i < dec.getNNProcessors() ; i++)
2014 {prc_recv_map.add(dec.IDtoProc(i));}
2015 }
2016
2017 // Sending property object
2018 typedef object<typename object_creator<typename prop::type, prp...>::type> prp_object;
2019
2020 //! position vector
2021 openfpm::vector<openfpm::vector<Point<dim, St>>> m_pos;
2022 //! properties vector
2023 openfpm::vector<openfpm::vector<prp_object>> m_prp;
2024
2025 fill_send_map_buf_list<prp_object,prp...>(v_pos,v_prp,prc_sz_r, m_pos, m_prp);
2026
2027 v_cl.SSendRecv(m_pos,v_pos,prc_r,prc_recv_map,recv_sz_map,opt);
2028 v_cl.template SSendRecvP<openfpm::vector<prp_object>,decltype(v_prp),layout_base,prp...>(m_prp,v_prp,prc_r,prc_recv_map,recv_sz_map,opt);
2029
2030 // mark the ghost part
2031
2032 g_m = v_pos.size();
2033 }
2034
2035 /*! \brief It move all the particles that does not belong to the local processor to the respective processor
2036 *
2037 * \tparam out of bound policy it specify what to do when the particles are detected out of bound
2038 *
2039 * In general this function is called after moving the particles to move the
2040 * elements out the local processor. Or just after initialization if each processor
2041 * contain non local particles
2042 *
2043 * \param v_pos vector of particle positions
2044 * \param v_prp vector of particle properties
2045 * \param g_m ghost marker
2046 *
2047 */
2048 template<typename obp = KillParticle>
2049 void map_(openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos,
2050 openfpm::vector<prop,Memory,layout_base> & v_prp, size_t & g_m,
2051 size_t opt)
2052 {
2053#ifdef PROFILE_SCOREP
2054 SCOREP_USER_REGION("map",SCOREP_USER_REGION_TYPE_FUNCTION)
2055#endif
2056
2057 prc_sz.resize(v_cl.getProcessingUnits());
2058
2059 // map completely reset the ghost part
2060 v_pos.resize(g_m);
2061 v_prp.resize(g_m);
2062
2063 // Contain the processor id of each particle (basically where they have to go)
2064 labelParticleProcessor<obp>(v_pos,m_opart, prc_sz,opt);
2065
2066 openfpm::vector<size_t> prc_sz_r;
2067 openfpm::vector<size_t> prc_r;
2068
2069 // Calculate the sending buffer size for each processor, put this information in
2070 // a contiguous buffer
2071 calc_send_buffers(prc_sz,prc_sz_r,prc_r,opt);
2072
2073 //! position vector
2074 openfpm::vector<openfpm::vector<Point<dim, St>,Memory,layout_base,openfpm::grow_policy_identity>> m_pos;
2075 //! properties vector
2076 openfpm::vector<openfpm::vector<prop,Memory,layout_base,openfpm::grow_policy_identity>> m_prp;
2077
2078 fill_send_map_buf(v_pos,v_prp, prc_sz_r,prc_r, m_pos, m_prp,prc_sz,opt);
2079
2080 size_t opt_ = 0;
2081 if (opt & RUN_ON_DEVICE)
2082 {
2083#if defined(CUDA_GPU) && defined(__NVCC__)
2084 // Before doing the communication on RUN_ON_DEVICE we have to be sure that the previous kernels complete
2085 cudaDeviceSynchronize();
2086 opt_ |= MPI_GPU_DIRECT;
2087#else
2088 std::cout << __FILE__ << ":" << __LINE__ << " error: to use the option RUN_ON_DEVICE you must compile with NVCC" << std::endl;
2089#endif
2090 }
2091
2092 v_cl.template SSendRecv<openfpm::vector<Point<dim, St>,Memory,layout_base,openfpm::grow_policy_identity>,
2093 openfpm::vector<Point<dim, St>,Memory,layout_base>,
2094 layout_base>
2095 (m_pos,v_pos,prc_r,prc_recv_map,recv_sz_map,opt_);
2096
2097 v_cl.template SSendRecv<openfpm::vector<prop,Memory,layout_base,openfpm::grow_policy_identity>,
2098 openfpm::vector<prop,Memory,layout_base>,
2099 layout_base>
2100 (m_prp,v_prp,prc_r,prc_recv_map,recv_sz_map,opt_);
2101
2102 // mark the ghost part
2103
2104 g_m = v_pos.size();
2105 }
2106
2107 /*! \brief Get the decomposition
2108 *
2109 * \return
2110 *
2111 */
2112 inline Decomposition & getDecomposition()
2113 {
2114 return dec;
2115 }
2116
2117 /*! \brief Get the decomposition
2118 *
2119 * \return
2120 *
2121 */
2122 inline const Decomposition & getDecomposition() const
2123 {
2124 return dec;
2125 }
2126
2127 /*! \brief Copy a vector
2128 *
2129 * \param vc vector to copy
2130 *
2131 * \return iteself
2132 *
2133 */
2134 vector_dist_comm<dim,St,prop,Decomposition,Memory,layout_base> & operator=(const vector_dist_comm<dim,St,prop,Decomposition,Memory,layout_base> & vc)
2135 {
2136 dec = vc.dec;
2137
2138 return *this;
2139 }
2140
2141 /*! \brief Copy a vector
2142 *
2143 * \param vc vector to copy
2144 *
2145 * \return itself
2146 *
2147 */
2148 vector_dist_comm<dim,St,prop,Decomposition,Memory,layout_base> & operator=(vector_dist_comm<dim,St,prop,Decomposition,Memory,layout_base> && vc)
2149 {
2150 dec = vc.dec;
2151
2152 return *this;
2153 }
2154
2155 /*! \brief Ghost put
2156 *
2157 * \tparam op operation to apply
2158 * \tparam prp set of properties
2159 *
2160 * \param v_pos vector of particle positions
2161 * \param v_prp vector od particle properties
2162 * \param g_m ghost marker
2163 * \param opt options
2164 *
2165 */
2166 template<template<typename,typename> class op, int ... prp>
2167 void ghost_put_(openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos,
2168 openfpm::vector<prop,Memory,layout_base> & v_prp,
2169 size_t & g_m,
2170 size_t opt)
2171 {
2172 // Sending property object
2173 typedef object<typename object_creator<typename prop::type, prp...>::type> prp_object;
2174
2175 // send vector for each processor
2176 typedef openfpm::vector<prp_object,Memory,layout_base> send_vector;
2177
2178 openfpm::vector<send_vector> g_send_prp;
2179 fill_send_ghost_put_prp_buf<send_vector, prp_object, prp...>(v_prp,g_send_prp,g_m,opt);
2180
2181 if (opt & RUN_ON_DEVICE)
2182 {
2183#if defined(CUDA_GPU) && defined(__NVCC__)
2184 // Before doing the communication on RUN_ON_DEVICE we have to be sure that the previous kernels complete
2185 cudaDeviceSynchronize();
2186#else
2187 std::cout << __FILE__ << ":" << __LINE__ << " error: to use the option RUN_ON_DEVICE you must compile with NVCC" << std::endl;
2188#endif
2189 }
2190
2191 // Send and receive ghost particle information
2192 if (opt & NO_CHANGE_ELEMENTS)
2193 {
2194 size_t opt_ = compute_options(opt);
2195
2196 if (opt & RUN_ON_DEVICE)
2197 {
2198 op_ssend_recv_merge_gpu<op,decltype(g_opart_device),decltype(prc_offset)> opm(g_opart_device,prc_offset);
2199 v_cl.template SSendRecvP_op<op_ssend_recv_merge_gpu<op,decltype(g_opart_device),decltype(prc_offset)>,
2200 send_vector,
2201 decltype(v_prp),
2202 layout_base,
2203 prp...>(g_send_prp,v_prp,prc_recv_get_prp,opm,prc_g_opart,g_opart_sz,opt_);
2204 }
2205 else
2206 {
2207 op_ssend_recv_merge<op,decltype(g_opart)> opm(g_opart);
2208 v_cl.template SSendRecvP_op<op_ssend_recv_merge<op,decltype(g_opart)>,
2209 send_vector,
2210 decltype(v_prp),
2211 layout_base,
2212 prp...>(g_send_prp,v_prp,prc_recv_get_prp,opm,prc_g_opart,g_opart_sz,opt_);
2213 }
2214 }
2215 else
2216 {
2217 size_t opt_ = compute_options(opt);
2218
2219 if (opt & RUN_ON_DEVICE)
2220 {
2221 op_ssend_recv_merge_gpu<op,decltype(g_opart_device),decltype(prc_offset)> opm(g_opart_device,prc_offset);
2222 v_cl.template SSendRecvP_op<op_ssend_recv_merge_gpu<op,decltype(g_opart_device),decltype(prc_offset)>,
2223 send_vector,
2224 decltype(v_prp),
2225 layout_base,
2226 prp...>(g_send_prp,v_prp,get_last_ghost_get_num_proc_vector(),opm,prc_recv_put,recv_sz_put,opt_);
2227 }
2228 else
2229 {
2230 op_ssend_recv_merge<op,decltype(g_opart)> opm(g_opart);
2231 v_cl.template SSendRecvP_op<op_ssend_recv_merge<op,decltype(g_opart)>,
2232 send_vector,
2233 decltype(v_prp),
2234 layout_base,
2235 prp...>(g_send_prp,v_prp,get_last_ghost_get_num_proc_vector(),opm,prc_recv_put,recv_sz_put,opt_);
2236 }
2237 }
2238
2239 // process also the local replicated particles
2240
2241 if (lg_m < v_prp.size() && v_prp.size() - lg_m != o_part_loc.size())
2242 {
2243 std::cerr << "Error: " << __FILE__ << ":" << __LINE__ << " Local ghost particles = " << v_prp.size() - lg_m << " != " << o_part_loc.size() << std::endl;
2244 std::cerr << "Error: " << __FILE__ << ":" << __LINE__ << " Check that you did a ghost_get before a ghost_put" << std::endl;
2245 }
2246
2247
2248 if (opt & RUN_ON_DEVICE)
2249 {
2250 v_prp.template merge_prp_v_device<op,prop,Memory,
2251 openfpm::grow_policy_double,
2252 layout_base,
2253 decltype(o_part_loc),prp ...>(v_prp,lg_m,o_part_loc);
2254 }
2255 else
2256 {
2257 v_prp.template merge_prp_v<op,prop,Memory,
2258 openfpm::grow_policy_double,
2259 layout_base,
2260 decltype(o_part_loc),prp ...>(v_prp,lg_m,o_part_loc);
2261 }
2262 }
2263};
2264
2265
2266#endif /* SRC_VECTOR_VECTOR_DIST_COMM_HPP_ */
2267