/*
 * vector_dist_comm.hpp
 *
 *  Created on: Aug 18, 2016
 *      Author: i-bird
 */

#ifndef SRC_VECTOR_VECTOR_DIST_COMM_HPP_
#define SRC_VECTOR_VECTOR_DIST_COMM_HPP_

// Select the TEST1 code paths below (the #ifndef TEST1 branches are kept for reference)
#define TEST1
#if defined(CUDA_GPU) && defined(__NVCC__)
#include "Vector/cuda/vector_dist_cuda_funcs.cuh"
#include "util/cuda/kernels.cuh"
#endif

#include "Vector/util/vector_dist_funcs.hpp"
#include "cuda/vector_dist_comm_util_funcs.cuh"
#include "util/cuda/scan_ofp.cuh"

//! Debug helper: returns the value for float references, 0 otherwise
template<typename T>
struct DEBUG
{
    static float ret(T & tmp)
    {
        return 0.0;
    }
};

template<>
struct DEBUG<float &>
{
    static float ret(float & tmp)
    {
        return tmp;
    }
};

/*! \brief Compute the communication options from the ghost_get/ghost_put options
 *
 * \param opt ghost_get/ghost_put options
 *
 * \return the corresponding communication options
 */
inline static size_t compute_options(size_t opt)
{
    size_t opt_ = NONE;
    if ((opt & NO_CHANGE_ELEMENTS) && (opt & SKIP_LABELLING))
    {opt_ = RECEIVE_KNOWN | KNOWN_ELEMENT_OR_BYTE;}

    if (opt & RUN_ON_DEVICE)
    {
#if defined(CUDA_GPU) && defined(__NVCC__)
        // Before doing the communication with RUN_ON_DEVICE we have to be sure that the previous kernels have completed
        opt_ |= MPI_GPU_DIRECT;
#else
        std::cout << __FILE__ << ":" << __LINE__ << " error: to use the option RUN_ON_DEVICE you must compile with NVCC" << std::endl;
#endif
    }

    return opt_;
}
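
/* Illustrative example: ghost_get called with opt = SKIP_LABELLING | NO_CHANGE_ELEMENTS
 * produces opt_ = RECEIVE_KNOWN | KNOWN_ELEMENT_OR_BYTE, telling the receiver to reuse
 * the element counts recorded by the previous ghost_get; adding RUN_ON_DEVICE (under
 * NVCC) further ORs in MPI_GPU_DIRECT.
 */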

/*! \brief Template selector for synchronous or asynchronous ghost exchange
 *
 * \tparam impl implementation (synchronous or GHOST_ASYNC)
 * \tparam layout_base memory layout
 * \tparam prp properties
 *
 */
template<unsigned int impl, template<typename> class layout_base, unsigned int ... prp>
struct ghost_exchange_comm_impl
{
    template<typename Vcluster_type, typename vector_prop_type,
             typename vector_pos_type, typename send_vector,
             typename prc_recv_get_type, typename prc_g_opart_type,
             typename recv_sz_get_type, typename recv_sz_get_byte_type,
             typename g_opart_sz_type>
    static inline void sendrecv_prp(Vcluster_type & v_cl,
            openfpm::vector<send_vector> & g_send_prp,
            vector_prop_type & v_prp,
            vector_pos_type & v_pos,
            prc_g_opart_type & prc_g_opart,
            prc_recv_get_type & prc_recv_get,
            recv_sz_get_type & recv_sz_get,
            recv_sz_get_byte_type & recv_sz_get_byte,
            g_opart_sz_type & g_opart_sz,
            size_t g_m,
            size_t opt)
    {
        // if there are no properties, skip:
        // SSendRecvP sends everything when we do not give properties

        if (sizeof...(prp) != 0)
        {
            size_t opt_ = compute_options(opt);
            if (opt & SKIP_LABELLING)
            {
                if (opt & RUN_ON_DEVICE)
                {
                    op_ssend_gg_recv_merge_run_device opm(g_m);
                    v_cl.template SSendRecvP_op<op_ssend_gg_recv_merge_run_device,send_vector,decltype(v_prp),layout_base,prp...>(g_send_prp,v_prp,prc_g_opart,opm,prc_recv_get,recv_sz_get,opt_);
                }
                else
                {
                    op_ssend_gg_recv_merge opm(g_m);
                    v_cl.template SSendRecvP_op<op_ssend_gg_recv_merge,send_vector,decltype(v_prp),layout_base,prp...>(g_send_prp,v_prp,prc_g_opart,opm,prc_recv_get,recv_sz_get,opt_);
                }
            }
            else
            {v_cl.template SSendRecvP<send_vector,decltype(v_prp),layout_base,prp...>(g_send_prp,v_prp,prc_g_opart,prc_recv_get,recv_sz_get,recv_sz_get_byte,opt_);}

            // fill g_opart_sz
            g_opart_sz.resize(prc_g_opart.size());

            for (size_t i = 0 ; i < prc_g_opart.size() ; i++)
                g_opart_sz.get(i) = g_send_prp.get(i).size();
        }
    }

    template<typename Vcluster_type, typename vector_prop_type,
             typename vector_pos_type, typename send_pos_vector,
             typename prc_recv_get_type, typename prc_g_opart_type,
             typename recv_sz_get_type>
    static inline void sendrecv_pos(Vcluster_type & v_cl,
            openfpm::vector<send_pos_vector> & g_pos_send,
            vector_prop_type & v_prp,
            vector_pos_type & v_pos,
            prc_recv_get_type & prc_recv_get,
            recv_sz_get_type & recv_sz_get,
            prc_g_opart_type & prc_g_opart,
            size_t opt)
    {
        size_t opt_ = compute_options(opt);
        if (opt & SKIP_LABELLING)
        {
            v_cl.template SSendRecv<send_pos_vector,decltype(v_pos),layout_base>(g_pos_send,v_pos,prc_g_opart,prc_recv_get,recv_sz_get,opt_);
        }
        else
        {
            prc_recv_get.clear();
            recv_sz_get.clear();
            v_cl.template SSendRecv<send_pos_vector,decltype(v_pos),layout_base>(g_pos_send,v_pos,prc_g_opart,prc_recv_get,recv_sz_get,opt_);
        }
    }

    template<typename Vcluster_type, typename vector_prop_type,
             typename vector_pos_type, typename send_pos_vector,
             typename prc_recv_get_type, typename prc_g_opart_type,
             typename recv_sz_get_type>
    static inline void sendrecv_pos_wait(Vcluster_type & v_cl,
            openfpm::vector<send_pos_vector> & g_pos_send,
            vector_prop_type & v_prp,
            vector_pos_type & v_pos,
            prc_recv_get_type & prc_recv_get,
            recv_sz_get_type & recv_sz_get,
            prc_g_opart_type & prc_g_opart,
            size_t opt)
    {}

    template<typename Vcluster_type, typename vector_prop_type,
             typename vector_pos_type, typename send_vector,
             typename prc_recv_get_type, typename prc_g_opart_type,
             typename recv_sz_get_type, typename recv_sz_get_byte_type,
             typename g_opart_sz_type>
    static inline void sendrecv_prp_wait(Vcluster_type & v_cl,
            openfpm::vector<send_vector> & g_send_prp,
            vector_prop_type & v_prp,
            vector_pos_type & v_pos,
            prc_g_opart_type & prc_g_opart,
            prc_recv_get_type & prc_recv_get,
            recv_sz_get_type & recv_sz_get,
            recv_sz_get_byte_type & recv_sz_get_byte,
            g_opart_sz_type & g_opart_sz,
            size_t g_m,
            size_t opt)
    {}
};


template<template<typename> class layout_base, unsigned int ... prp>
struct ghost_exchange_comm_impl<GHOST_ASYNC,layout_base, prp ... >
{
    template<typename Vcluster_type, typename vector_prop_type,
             typename vector_pos_type, typename send_vector,
             typename prc_recv_get_type, typename prc_g_opart_type,
             typename recv_sz_get_type, typename recv_sz_get_byte_type,
             typename g_opart_sz_type>
    static inline void sendrecv_prp(Vcluster_type & v_cl,
            openfpm::vector<send_vector> & g_send_prp,
            vector_prop_type & v_prp,
            vector_pos_type & v_pos,
            prc_g_opart_type & prc_g_opart,
            prc_recv_get_type & prc_recv_get,
            recv_sz_get_type & recv_sz_get,
            recv_sz_get_byte_type & recv_sz_get_byte,
            g_opart_sz_type & g_opart_sz,
            size_t g_m,
            size_t opt)
    {
        prc_recv_get.clear();
        recv_sz_get.clear();

        // if there are no properties, skip:
        // SSendRecvP sends everything when we do not give properties

        if (sizeof...(prp) != 0)
        {
            size_t opt_ = compute_options(opt);
            if (opt & SKIP_LABELLING)
            {
                if (opt & RUN_ON_DEVICE)
                {
                    op_ssend_gg_recv_merge_run_device opm(g_m);
                    v_cl.template SSendRecvP_opAsync<op_ssend_gg_recv_merge_run_device,send_vector,decltype(v_prp),layout_base,prp...>(g_send_prp,v_prp,prc_g_opart,opm,prc_recv_get,recv_sz_get,opt_);
                }
                else
                {
                    op_ssend_gg_recv_merge opm(g_m);
                    v_cl.template SSendRecvP_opAsync<op_ssend_gg_recv_merge,send_vector,decltype(v_prp),layout_base,prp...>(g_send_prp,v_prp,prc_g_opart,opm,prc_recv_get,recv_sz_get,opt_);
                }
            }
            else
            {v_cl.template SSendRecvPAsync<send_vector,decltype(v_prp),layout_base,prp...>(g_send_prp,v_prp,prc_g_opart,prc_recv_get,recv_sz_get,recv_sz_get_byte,opt_);}
        }

        // fill g_opart_sz
        g_opart_sz.resize(prc_g_opart.size());

        for (size_t i = 0 ; i < prc_g_opart.size() ; i++)
        {g_opart_sz.get(i) = g_send_prp.get(i).size();}
    }

    template<typename Vcluster_type, typename vector_prop_type,
             typename vector_pos_type, typename send_pos_vector,
             typename prc_recv_get_type, typename prc_g_opart_type,
             typename recv_sz_get_type>
    static inline void sendrecv_pos(Vcluster_type & v_cl,
            openfpm::vector<send_pos_vector> & g_pos_send,
            vector_prop_type & v_prp,
            vector_pos_type & v_pos,
            prc_recv_get_type & prc_recv_get,
            recv_sz_get_type & recv_sz_get,
            prc_g_opart_type & prc_g_opart,
            size_t opt)
    {
        prc_recv_get.clear();
        recv_sz_get.clear();

        size_t opt_ = compute_options(opt);

        // prc_recv_get and recv_sz_get have already been cleared above,
        // so the call is the same with or without SKIP_LABELLING
        v_cl.template SSendRecvAsync<send_pos_vector,decltype(v_pos),layout_base>(g_pos_send,v_pos,prc_g_opart,prc_recv_get,recv_sz_get,opt_);
    }

    template<typename Vcluster_type, typename vector_prop_type,
             typename vector_pos_type, typename send_pos_vector,
             typename prc_recv_get_type, typename prc_g_opart_type,
             typename recv_sz_get_type>
    static inline void sendrecv_pos_wait(Vcluster_type & v_cl,
            openfpm::vector<send_pos_vector> & g_pos_send,
            vector_prop_type & v_prp,
            vector_pos_type & v_pos,
            prc_recv_get_type & prc_recv_get,
            recv_sz_get_type & recv_sz_get,
            prc_g_opart_type & prc_g_opart,
            size_t opt)
    {
        size_t opt_ = compute_options(opt);

        // the wait call is the same with or without SKIP_LABELLING
        v_cl.template SSendRecvWait<send_pos_vector,decltype(v_pos),layout_base>(g_pos_send,v_pos,prc_g_opart,prc_recv_get,recv_sz_get,opt_);
    }

    template<typename Vcluster_type, typename vector_prop_type,
             typename vector_pos_type, typename send_vector,
             typename prc_recv_get_type, typename prc_g_opart_type,
             typename recv_sz_get_type, typename recv_sz_get_byte_type,
             typename g_opart_sz_type>
    static inline void sendrecv_prp_wait(Vcluster_type & v_cl,
            openfpm::vector<send_vector> & g_send_prp,
            vector_prop_type & v_prp,
            vector_pos_type & v_pos,
            prc_g_opart_type & prc_g_opart,
            prc_recv_get_type & prc_recv_get,
            recv_sz_get_type & recv_sz_get,
            recv_sz_get_byte_type & recv_sz_get_byte,
            g_opart_sz_type & g_opart_sz,
            size_t g_m,
            size_t opt)
    {
        // if there are no properties, skip:
        // SSendRecvP sends everything when we do not give properties

        if (sizeof...(prp) != 0)
        {
            size_t opt_ = compute_options(opt);
            if (opt & SKIP_LABELLING)
            {
                if (opt & RUN_ON_DEVICE)
                {
                    op_ssend_gg_recv_merge_run_device opm(g_m);
                    v_cl.template SSendRecvP_opWait<op_ssend_gg_recv_merge_run_device,send_vector,decltype(v_prp),layout_base,prp...>(g_send_prp,v_prp,prc_g_opart,opm,prc_recv_get,recv_sz_get,opt_);
                }
                else
                {
                    op_ssend_gg_recv_merge opm(g_m);
                    v_cl.template SSendRecvP_opWait<op_ssend_gg_recv_merge,send_vector,decltype(v_prp),layout_base,prp...>(g_send_prp,v_prp,prc_g_opart,opm,prc_recv_get,recv_sz_get,opt_);
                }
            }
            else
            {v_cl.template SSendRecvPWait<send_vector,decltype(v_prp),layout_base,prp...>(g_send_prp,v_prp,prc_g_opart,prc_recv_get,recv_sz_get,recv_sz_get_byte,opt_);}
        }
    }
};


/*! \brief This class is a helper for the communication of vector_dist
 *
 * \tparam dim Dimensionality of the space where the elements live
 * \tparam St type of space, float, double, ...
 * \tparam prop properties the vector elements store, in OpenFPM data structure format
 * \tparam Decomposition Decomposition strategy to use, CartDecomposition ...
 * \tparam Memory Memory pool where the information is stored, HeapMemory ...
 *
 * \see vector_dist
 *
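 * A minimal usage sketch (illustrative; in user code this class is reached
 * through vector_dist, whose exact constructor signature may differ across
 * OpenFPM versions):
 *
 * \code
 * Box<3,float> domain({0.0,0.0,0.0},{1.0,1.0,1.0});
 * size_t bc[3] = {PERIODIC,PERIODIC,PERIODIC};
 * Ghost<3,float> ghost(0.05);
 *
 * vector_dist<3,float,aggregate<float>> vd(4096,domain,bc,ghost);
 *
 * vd.map();          // redistribute the particles (map_ below)
 * vd.ghost_get<0>(); // synchronize the ghost layer (ghost_get_ below)
 * \endcode
 *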
 */

template<unsigned int dim,
         typename St,
         typename prop,
         typename Decomposition = CartDecomposition<dim,St>,
         typename Memory = HeapMemory,
         template<typename> class layout_base = memory_traits_lin>
class vector_dist_comm
{
    //! Number of units for each sub-domain
    size_t v_sub_unit_factor = 64;

    //! definition of the send vector for position
    typedef openfpm::vector<Point<dim, St>,Memory,layout_base,openfpm::grow_policy_identity> send_pos_vector;

    //! VCluster
    Vcluster<Memory> & v_cl;

    //! Domain decomposition
    Decomposition dec;

    //! It maps the processor id with the communication request into the map procedure
    openfpm::vector<size_t> p_map_req;

    //! For each near processor, outgoing particle id
    //! \warning opart is assumed to be an ordered list
    //! first id is the particle id
    //! second id is the shift id
    //! third id is the processor id
    openfpm::vector<aggregate<int,int,int>,
            Memory,
            layout_base > m_opart;

    //! Per processor ordered particle ids for ghost_get (see prc_g_opart)
    //! For each processor the internal vector stores the ids of the
    //! particles that must be communicated to the other processors
    openfpm::vector<openfpm::vector<aggregate<size_t,size_t>>> g_opart;

    //! Same as g_opart but on device; the vector of vectors is flattened into a single vector
    openfpm::vector<aggregate<unsigned int,unsigned long int>,
            CudaMemory,
            memory_traits_inte> g_opart_device;

    //! Helper buffer for computation (on GPU) of local particles (position)
    openfpm::vector<Point<dim, St>,Memory,layout_base> v_pos_tmp;

    //! Helper buffer for computation (on GPU) of local particles (properties)
    openfpm::vector<prop,Memory,layout_base> v_prp_tmp;

    //! Per processor number of particles: g_opart_sz.get(i) = g_opart.get(i).size()
    openfpm::vector<size_t> g_opart_sz;

    //! processor rank list of g_opart
    openfpm::vector<size_t> prc_g_opart;

    //! It stores the list of processors that communicate with us (local processor)
    //! from the last ghost get
    openfpm::vector<size_t> prc_recv_get_pos;
    openfpm::vector<size_t> prc_recv_get_prp;

    //! the same as prc_recv_get but for put
    openfpm::vector<size_t> prc_recv_put;

    //! the same as prc_recv_get but for map
    openfpm::vector<size_t> prc_recv_map;

    //! It stores the size of the elements added for each processor that communicates with us (local processor)
    //! from the last ghost get
    openfpm::vector<size_t> recv_sz_get_pos;
    openfpm::vector<size_t> recv_sz_get_prp;
    //! Conversion to bytes of recv_sz_get
    openfpm::vector<size_t> recv_sz_get_byte;


    //! The same as recv_sz_get but for put
    openfpm::vector<size_t> recv_sz_put;

    //! The same as recv_sz_get but for map
    openfpm::vector<size_t> recv_sz_map;

    //! elements sent to each processor (ghost_get)
    openfpm::vector<size_t> prc_sz_gg;

    //! temporary buffer for processor ids
    openfpm::vector<aggregate<unsigned int>,
            Memory,
            layout_base> proc_id_out;

    //! temporary buffer for the scan result
    openfpm::vector<aggregate<unsigned int>,
            Memory,
            layout_base> starts;

    //! Processor communication size
    openfpm::vector<aggregate<unsigned int, unsigned int>,Memory,layout_base> prc_offset;


    //! Temporary CudaMemory used for the computations
    CudaMemory mem;

    //! Local ghost marker: within the ghost particles it marks where the
    //! replicated ghost particles that are local start
    size_t lg_m;

    //! Sending buffer
    openfpm::vector_fr<Memory> hsmem;

    //! process the particle with properties
    template<typename prp_object, int ... prp>
    struct proc_with_prp
    {
        //! process the particle
        template<typename T1, typename T2> inline static void proc(size_t lbl, size_t cnt, size_t id, T1 & v_prp, T2 & m_prp)
        {
            // source object type
            typedef encapc<1, prop, typename openfpm::vector<prop>::layout_type> encap_src;
            // destination object type
            typedef encapc<1, prp_object, typename openfpm::vector<prp_object>::layout_type> encap_dst;

            // Copy only the selected properties
            object_si_d<encap_src, encap_dst, OBJ_ENCAP, prp...>(v_prp.get(id), m_prp.get(lbl).get(cnt));
        }
    };

    /*! \brief Get the number of particles received from each processor during the last ghost_get
     *
     *
     * \param i processor (list index)
     * \return the number of particles
     */
    size_t get_last_ghost_get_received_parts(size_t i)
    {
        // If the last ghost_get did not involve properties, the information about the
        // number of received particles is in recv_sz_get_pos
        if (recv_sz_get_prp.size() != 0)
        {return recv_sz_get_prp.get(i);}
        else
        {return recv_sz_get_pos.get(i);}
    }

    /*! \brief Get the number of processors involved during the last ghost_get
     *
     * \return the number of processors
     */
    size_t get_last_ghost_get_num_proc()
    {
        if (prc_recv_get_prp.size() != 0)
        {return prc_recv_get_prp.size();}
        else
        {return prc_recv_get_pos.size();}
    }

    /*! \brief Get the processor list involved during the last ghost_get
     *
     * \return the processor list
     */
    openfpm::vector<size_t> & get_last_ghost_get_num_proc_vector()
    {
        if (prc_recv_get_prp.size() != 0)
        {return prc_recv_get_prp;}
        else
        {return prc_recv_get_pos;}
    }

    /*! \brief Calculate the sending buffer size for each processor
     *
     * \param prc_sz processor communication size
     * \param prc_sz_r processor size
     * \param prc_r processor ids
     * \param opt options
     *
     */
    inline void calc_send_buffers(openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,layout_base> & prc_sz,
            openfpm::vector<size_t> & prc_sz_r,
            openfpm::vector<size_t> & prc_r,
            size_t opt)
    {
        if (opt & RUN_ON_DEVICE)
        {
#ifndef TEST1
            size_t prev_off = 0;
            for (size_t i = 0; i < prc_sz.size() ; i++)
            {
                if (prc_sz.template get<1>(i) != (unsigned int)-1)
                {
                    prc_r.add(prc_sz.template get<1>(i));
                    prc_sz_r.add(prc_sz.template get<0>(i) - prev_off);
                }
                prev_off = prc_sz.template get<0>(i);
            }
#else

            // Calculate the sending buffer size for each processor, put this information in
            // a contiguous buffer

            for (size_t i = 0; i < v_cl.getProcessingUnits(); i++)
            {
                if (prc_sz.template get<0>(i) != 0 && v_cl.rank() != i)
                {
                    prc_r.add(i);
                    prc_sz_r.add(prc_sz.template get<0>(i));
                }
            }

#endif
        }
        else
        {
            // Calculate the sending buffer size for each processor, put this information in
            // a contiguous buffer

            p_map_req.resize(v_cl.getProcessingUnits());
            for (size_t i = 0; i < v_cl.getProcessingUnits(); i++)
            {
                if (prc_sz.template get<0>(i) != 0)
                {
                    p_map_req.get(i) = prc_r.size();
                    prc_r.add(i);
                    prc_sz_r.add(prc_sz.template get<0>(i));
                }
            }
        }
    }

    //! From which decomposition the shift boxes are calculated
    long int shift_box_ndec = -1;

    //! this map is used to check if a combination is already present
    std::unordered_map<size_t, size_t> map_cmb;

    //! The boxes touching the border of the domain are divided in groups (first vector);
    //! each group contains the internal ghosts coming from sub-domains of the same section
    openfpm::vector_std<openfpm::vector_std<Box<dim, St>>> box_f;

    //! The boxes touching the border of the domain + the linearized shift vector from where they come
    openfpm::vector<Box<dim, St>,Memory,layout_base> box_f_dev;
    openfpm::vector<aggregate<unsigned int>,Memory,layout_base> box_f_sv;

    //! Store the sector for each group (previous vector)
    openfpm::vector_std<comb<dim>> box_cmb;

    //! Id of the local particle to replicate for ghost_get
    openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,layout_base> o_part_loc;

    //! Processor communication size
    openfpm::vector<aggregate<unsigned int, unsigned int>,Memory,layout_base> prc_sz;

    /*! \brief For every internal ghost box create a structure that orders such internal
     * local ghost boxes by shift vector
     *
     */
    void createShiftBox()
    {
        if (shift_box_ndec == (long int)dec.get_ndec())
        {return;}

        struct sh_box
        {
            size_t shift_id;

            unsigned int box_f_sv;
            Box<dim,St> box_f_dev;

            bool operator<(const sh_box & tmp) const
            {
                return shift_id < tmp.shift_id;
            }

        };
        openfpm::vector<sh_box> reord_shift;
        box_f.clear();
        map_cmb.clear();
        box_cmb.clear();

        // Add local particles coming from the periodic boundary; the only boxes that count
        // are the ones touching the border
        for (size_t i = 0; i < dec.getNLocalSub(); i++)
        {
            size_t Nl = dec.getLocalNIGhost(i);

            for (size_t j = 0; j < Nl; j++)
            {
                // If the ghost does not come from the intersection with an out of
                // border sub-domain, the combination is all zero and n_zero returns dim
                if (dec.getLocalIGhostPos(i, j).n_zero() == dim)
                    continue;

                // Check if we already have boxes with such a combination
                auto it = map_cmb.find(dec.getLocalIGhostPos(i, j).lin());
                if (it == map_cmb.end())
                {
                    // we do not have it
                    box_f.add();
                    box_f.last().add(dec.getLocalIGhostBox(i, j));
                    box_cmb.add(dec.getLocalIGhostPos(i, j));
                    map_cmb[dec.getLocalIGhostPos(i, j).lin()] = box_f.size() - 1;
                }
                else
                {
                    // we have it
                    box_f.get(it->second).add(dec.getLocalIGhostBox(i, j));
                }

                reord_shift.add();
                reord_shift.last().shift_id = dec.getLocalIGhostPos(i, j).lin();
                reord_shift.last().box_f_dev = dec.getLocalIGhostBox(i, j);
                reord_shift.last().box_f_sv = dec.convertShift(dec.getLocalIGhostPos(i, j));
            }
        }

        // now we sort box_f by shift_id; the reason is that we have to avoid duplicated particles
        reord_shift.sort();

        box_f_dev.resize(reord_shift.size());
        box_f_sv.resize(reord_shift.size());

        for (size_t i = 0 ; i < reord_shift.size() ; i++)
        {
            box_f_dev.get(i) = reord_shift.get(i).box_f_dev;
            box_f_sv.template get<0>(i) = reord_shift.get(i).box_f_sv;
        }

#ifdef CUDA_GPU

        // move box_f_dev and box_f_sv to the device
        box_f_dev.template hostToDevice<0,1>();
        box_f_sv.template hostToDevice<0>();

#endif

        shift_box_ndec = dec.get_ndec();
    }

    /*! \brief Local ghost from labeled particles
     *
     * \param v_pos vector of particle positions
     * \param v_prp vector of particle properties
     * \param opt options
     *
     */
    void local_ghost_from_opart(openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos,
            openfpm::vector<prop,Memory,layout_base> & v_prp,
            size_t opt)
    {
        // get the shift vectors
        const openfpm::vector<Point<dim, St>,Memory,layout_base> & shifts = dec.getShiftVectors();

        if (!(opt & NO_POSITION))
        {
            if (opt & RUN_ON_DEVICE)
            {
                local_ghost_from_opart_impl<true,dim,St,prop,Memory,layout_base,std::is_same<Memory,CudaMemory>::value>
                ::run(o_part_loc,shifts,v_pos,v_prp,opt);
            }
            else
            {
                for (size_t i = 0 ; i < o_part_loc.size() ; i++)
                {
                    size_t lin_id = o_part_loc.template get<1>(i);
                    size_t key = o_part_loc.template get<0>(i);

                    Point<dim, St> p = v_pos.get(key);
                    // shift
                    p -= shifts.get(lin_id);

                    // add this particle, shifting its position
                    v_pos.add(p);
                    v_prp.get(lg_m+i) = v_prp.get(key);
                }
            }
        }
        else
        {
            if (opt & RUN_ON_DEVICE)
            {
                local_ghost_from_opart_impl<false,dim,St,prop,Memory,layout_base,std::is_same<Memory,CudaMemory>::value>
                ::run(o_part_loc,shifts,v_pos,v_prp,opt);
            }
            else
            {
                for (size_t i = 0 ; i < o_part_loc.size() ; i++)
                {
                    size_t key = o_part_loc.template get<0>(i);

                    v_prp.get(lg_m+i) = v_prp.get(key);
                }
            }
        }
    }

    /*! \brief Local ghost from decomposition
     *
     * \param v_pos vector of particle positions
     * \param v_prp vector of particle properties
     * \param g_m ghost marker
     *
     */
    void local_ghost_from_dec(openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos,
            openfpm::vector<prop,Memory,layout_base> & v_prp,
            size_t g_m,size_t opt)
    {
        o_part_loc.clear();

        // get the shift vectors
        const openfpm::vector<Point<dim,St>,Memory,layout_base> & shifts = dec.getShiftVectors();

        if (opt & RUN_ON_DEVICE)
        {
            local_ghost_from_dec_impl<dim,St,prop,Memory,layout_base,std::is_same<Memory,CudaMemory>::value>
            ::run(o_part_loc,shifts,box_f_dev,box_f_sv,v_cl,starts,v_pos,v_prp,g_m,opt);
        }
        else
        {
            // Label the internal (assigned) particles
            auto it = v_pos.getIteratorTo(g_m);

            while (it.isNext())
            {
                auto key = it.get();

                // If particles are inside these boxes
                for (size_t i = 0; i < box_f.size(); i++)
                {
                    for (size_t j = 0; j < box_f.get(i).size(); j++)
                    {
                        if (box_f.get(i).get(j).isInsideNP(v_pos.get(key)) == true)
                        {
                            size_t lin_id = dec.convertShift(box_cmb.get(i));

                            o_part_loc.add();
                            o_part_loc.template get<0>(o_part_loc.size()-1) = key;
                            o_part_loc.template get<1>(o_part_loc.size()-1) = lin_id;

                            Point<dim, St> p = v_pos.get(key);
                            // shift
                            p -= shifts.get(lin_id);

                            // add this particle, shifting its position
                            v_pos.add(p);
                            v_prp.add();
                            v_prp.last() = v_prp.get(key);

                            // boxes in one group can be overlapping,
                            // so we do not have to search the other
                            // boxes, otherwise we would get duplicate particles
                            //
                            // Note: overlap of boxes across groups is fine
                            // (and needed) because each group has a different shift,
                            // producing non-overlapping particles
                            //
                            break;
                        }
                    }
                }

                ++it;
            }
        }
    }

    /*! \brief Add local particles based on the boundary conditions
     *
     * To understand what this function does, consider the following
     *
     \verbatim

                                               [1,1]
     +---------+------------------------+---------+
     | (1,-1)  |                        | (1,1)   |
     |    |    |      (1,0) --> 7       |    |    |
     |    v    |                        |    v    |
     |    6    |                        |    8    |
     +--------------------------------------------+
     |         |                        |         |
     |         |                        |         |
     |         |                        |         |
     | (-1,0)  |                        | (1,0)   |
     |    |    |                        |    |    |
     |    v    |      (0,0) --> 4       |    v    |
     |    3    |                        |    5    |
     |         |                        |         |
     | B       |                        |       A |
     | *       |                        |       * |
     |         |                        |         |
     |         |                        |         |
     |         |                        |         |
     +--------------------------------------------+
     | (-1,-1) |                        | (-1,1)  |
     |    |    |     (-1,0) --> 1       |    |    |
     |    v    |                        |    v    |
     |    0    |                        |    2    |
     +---------+------------------------+---------+

     \endverbatim

     *
     * The box is the domain, while all boxes at the border (so not (0,0)) are the
     * ghost part at the border of the domain. If a particle A is in the position shown
     * in the figure, a particle B must be created. This function duplicates the particle
     * A, if A and B are local
     *
     * \param v_pos vector of particle positions
     * \param v_prp vector of particle properties
     * \param g_m ghost marker
     * \param opt options
     *
     */
    void add_loc_particles_bc(openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos,
            openfpm::vector<prop,Memory,layout_base> & v_prp ,
            size_t & g_m,
            size_t opt)
    {
        // Create the shift boxes
        createShiftBox();

        if (!(opt & SKIP_LABELLING))
            lg_m = v_prp.size();

        if (box_f.size() == 0)
            return;
        else
        {
            if (opt & SKIP_LABELLING)
            {local_ghost_from_opart(v_pos,v_prp,opt);}
            else
            {local_ghost_from_dec(v_pos,v_prp,g_m,opt);}
        }
    }

    /*! \brief This function fills the send buffer for the particle positions after the particles have been labeled with labelParticles
     *
     * \param v_pos vector of particle positions
     * \param prc_sz number of particles to send to each processor
     * \param g_pos_send Send buffer to fill
     * \param opt options
     * \param async true if the buffers must be retained for an asynchronous exchange
     *
     */
    void fill_send_ghost_pos_buf(openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos,
            openfpm::vector<size_t> & prc_sz,
            openfpm::vector<send_pos_vector> & g_pos_send,
            size_t opt,
            bool async)
    {
        // get the shift vectors
        const openfpm::vector<Point<dim,St>,Memory,layout_base> & shifts = dec.getShiftVectors();

        // create a number of send buffers equal to the near processors
        g_pos_send.resize(prc_sz.size());

        size_t old_hsmem_size = 0;

        // if we do async
        if (async == true)
        {
            old_hsmem_size = hsmem.size();
            resize_retained_buffer(hsmem,g_pos_send.size() + hsmem.size());
        }
        else
        {resize_retained_buffer(hsmem,g_pos_send.size());}

        for (size_t i = 0; i < g_pos_send.size(); i++)
        {
            // The buffer must be retained and survive the destruction of the
            // vector
            if (hsmem.get(i+old_hsmem_size).ref() == 0)
            {hsmem.get(i+old_hsmem_size).incRef();}

            // Set the memory to retain the send buffer
            g_pos_send.get(i).setMemory(hsmem.get(i+old_hsmem_size));

            // resize the sending vector (No allocation is produced)
            g_pos_send.get(i).resize(prc_sz.get(i));
        }

        if (opt & RUN_ON_DEVICE)
        {
#if defined(CUDA_GPU) && defined(__NVCC__)

            size_t offset = 0;

            // Fill the sending buffers
            for (size_t i = 0 ; i < g_pos_send.size() ; i++)
            {
                auto ite = g_pos_send.get(i).getGPUIterator();

                CUDA_LAUNCH((process_ghost_particles_pos<dim,decltype(g_opart_device.toKernel()),decltype(g_pos_send.get(i).toKernel()),decltype(v_pos.toKernel()),decltype(shifts.toKernel())>),
                        ite,
                        g_opart_device.toKernel(), g_pos_send.get(i).toKernel(),
                        v_pos.toKernel(),shifts.toKernel(),offset);

                offset += prc_sz.get(i);
            }

#else

            std::cout << __FILE__ << ":" << __LINE__ << " error: RUN_ON_DEVICE requires compiling with NVCC, but this seems to be compiled with a normal compiler" << std::endl;

#endif
        }
        else
        {
            // Fill the send buffer
            for (size_t i = 0; i < g_opart.size(); i++)
            {
                for (size_t j = 0; j < g_opart.get(i).size(); j++)
                {
                    Point<dim, St> s = v_pos.get(g_opart.get(i).template get<0>(j));
                    s -= shifts.get(g_opart.get(i).template get<1>(j));
                    g_pos_send.get(i).set(j, s);
                }
            }
        }
    }

    /*! \brief This function fills the send buffer for ghost_put
     *
     * \tparam send_vector type used to send data
     * \tparam prp_object object containing only the properties to send
     * \tparam prp set of properties to send
     *
     * \param v_prp vector of particle properties
     * \param g_send_prp Send buffer to fill
     * \param g_m ghost marker
     * \param opt options
     *
     */
    template<typename send_vector, typename prp_object, int ... prp>
    void fill_send_ghost_put_prp_buf(openfpm::vector<prop,Memory,layout_base> & v_prp,
            openfpm::vector<send_vector> & g_send_prp,
            size_t & g_m,
            size_t opt)
    {
        // create a number of send buffers equal to the near processors
        // from which we received

        // NOTE in some cases the information can be in prc_recv_get_pos

        size_t nproc = get_last_ghost_get_num_proc();

        g_send_prp.resize(nproc);

        resize_retained_buffer(hsmem,g_send_prp.size());

        for (size_t i = 0; i < g_send_prp.size(); i++)
        {
            // The buffer must be retained and survive the destruction of the
            // vector
            if (hsmem.get(i).ref() == 0)
                hsmem.get(i).incRef();

            // Set the memory to retain the send buffer
            g_send_prp.get(i).setMemory(hsmem.get(i));

            size_t n_part_recv = get_last_ghost_get_received_parts(i);

            // resize the sending vector (No allocation is produced)
            g_send_prp.get(i).resize(n_part_recv);
        }

        size_t accum = g_m;

        if (opt & RUN_ON_DEVICE)
        {
#if defined(CUDA_GPU) && defined(__NVCC__)

            if (sizeof...(prp) != 0)
            {
                // Fill the sending buffers
                for (size_t i = 0 ; i < g_send_prp.size() ; i++)
                {
                    size_t n_part_recv = get_last_ghost_get_received_parts(i);

                    auto ite = g_send_prp.get(i).getGPUIterator();

                    if (ite.nblocks() == 0) {continue;}

                    CUDA_LAUNCH((process_ghost_particles_prp_put<decltype(g_send_prp.get(i).toKernel()),decltype(v_prp.toKernel()),prp...>),
                            ite,
                            g_send_prp.get(i).toKernel(),
                            v_prp.toKernel(),accum);

                    accum = accum + n_part_recv;
                }
            }

#else

            std::cout << __FILE__ << ":" << __LINE__ << " error: RUN_ON_DEVICE requires compiling with NVCC, but this seems to be compiled with a normal compiler" << std::endl;

#endif
        }
        else
        {
            // Fill the send buffer
            for (size_t i = 0; i < g_send_prp.size(); i++)
            {
                size_t j2 = 0;
                size_t n_part_recv = get_last_ghost_get_received_parts(i);

                for (size_t j = accum; j < accum + n_part_recv; j++)
                {
                    // source object type
                    typedef encapc<1, prop, typename openfpm::vector<prop,Memory,layout_base>::layout_type> encap_src;
                    // destination object type
                    typedef encapc<1, prp_object, typename openfpm::vector<prp_object,Memory,layout_base>::layout_type> encap_dst;

                    // Copy only the selected properties
                    object_si_d<encap_src, encap_dst, OBJ_ENCAP, prp...>(v_prp.get(j), g_send_prp.get(i).get(j2));

                    j2++;
                }

                accum = accum + n_part_recv;
            }
        }
    }

    /*! \brief Resize the retained buffer to contain nbf buffers
     *
     * \param rt_buf retained buffer
     * \param nbf number of buffers
     *
     */
    void resize_retained_buffer(openfpm::vector_fr<Memory> & rt_buf, size_t nbf)
    {
        // Release all the buffers that are going to be deleted
        for (size_t i = nbf ; i < rt_buf.size() ; i++)
        {
            rt_buf.get(i).decRef();
        }

        rt_buf.resize(nbf);
    }
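
    /* Illustrative note on the retained-buffer lifecycle: the fill_send_* functions
     * incRef() every hsmem buffer they hand to a send vector, so the memory survives
     * the destruction of that vector; resize_retained_buffer() decRef()s the buffers
     * beyond the new size before shrinking, and the destructor releases the final
     * reference of each remaining buffer.
     */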

    /*! \brief Set the buffer for each property
     *
     *
     */
    template<typename send_vector, typename v_mpl>
    struct set_mem_retained_buffers_inte
    {
        openfpm::vector<send_vector> & g_send_prp;

        size_t i;

        openfpm::vector_fr<Memory> & hsmem;

        size_t j;

        set_mem_retained_buffers_inte(openfpm::vector<send_vector> & g_send_prp, size_t i ,
                openfpm::vector_fr<Memory> & hsmem, size_t j)
        :g_send_prp(g_send_prp),i(i),hsmem(hsmem),j(j)
        {}

        //! It calls the setMemory function for each property
        template<typename T>
        inline void operator()(T& t)
        {
            g_send_prp.get(i).template setMemory<T::value>(hsmem.get(j));

            j++;
        }
    };

    template<bool inte_or_lin,typename send_vector, typename v_mpl>
    struct set_mem_retained_buffers
    {
        static inline size_t set_mem_retained_buffers_(openfpm::vector<send_vector> & g_send_prp,
                openfpm::vector<size_t> & prc_sz,
                size_t i,
                openfpm::vector_fr<Memory> & hsmem,
                size_t j)
        {
            // Set the memory to retain the send buffer
            g_send_prp.get(i).setMemory(hsmem.get(j));

            // resize the sending vector (No allocation is produced)
            g_send_prp.get(i).resize(prc_sz.get(i));

            return j+1;
        }
    };

    template<typename send_vector, typename v_mpl>
    struct set_mem_retained_buffers<true,send_vector,v_mpl>
    {
        static inline size_t set_mem_retained_buffers_(openfpm::vector<send_vector> & g_send_prp,
                openfpm::vector<size_t> & prc_sz,
                size_t i,
                openfpm::vector_fr<Memory> & hsmem,
                size_t j)
        {
            set_mem_retained_buffers_inte<send_vector,v_mpl> smrbi(g_send_prp,i,hsmem,j);

            boost::mpl::for_each_ref<boost::mpl::range_c<int,0,boost::mpl::size<v_mpl>::type::value>>(smrbi);

            // if we do not send properties, do not reallocate
            if (boost::mpl::size<v_mpl>::type::value != 0)
            {
                // resize the sending vector (No allocation is produced)
                g_send_prp.get(i).resize(prc_sz.get(i));
            }

            return smrbi.j;
        }
    };

    /*! \brief This function fills the send buffer for the properties after the particles have been labeled with labelParticles
     *
     * \tparam send_vector type used to send data
     * \tparam prp_object object containing only the properties to send
     * \tparam prp set of properties to send
     *
     * \param v_prp vector of particle properties
     * \param prc_sz number of particles to send to each processor
     * \param g_send_prp Send buffer to fill
     * \param opt options
     *
     */
    template<typename send_vector, typename prp_object, int ... prp>
    void fill_send_ghost_prp_buf(openfpm::vector<prop,Memory,layout_base> & v_prp,
            openfpm::vector<size_t> & prc_sz,
            openfpm::vector<send_vector> & g_send_prp,
            size_t opt)
    {
        size_t factor = 1;

        typedef typename to_boost_vmpl<prp...>::type v_mpl;

        if (is_layout_inte<layout_base<prop>>::value == true) {factor *= sizeof...(prp);}

        // create a number of send buffers equal to the near processors
        g_send_prp.resize(prc_sz.size());

        resize_retained_buffer(hsmem,g_send_prp.size()*factor);

        for (size_t i = 0; i < hsmem.size(); i++)
        {
            // The buffer must be retained and survive the destruction of the
            // vector
            if (hsmem.get(i).ref() == 0)
            {hsmem.get(i).incRef();}
        }

        size_t j = 0;
        for (size_t i = 0; i < g_send_prp.size(); i++)
        {
            j = set_mem_retained_buffers<is_layout_inte<layout_base<prop>>::value,send_vector,v_mpl>::set_mem_retained_buffers_(g_send_prp,prc_sz,i,hsmem,j);
        }

        if (opt & RUN_ON_DEVICE)
        {
#if defined(CUDA_GPU) && defined(__NVCC__)

            size_t offset = 0;

            if (sizeof...(prp) != 0)
            {
                // Fill the sending buffers
                for (size_t i = 0 ; i < g_send_prp.size() ; i++)
                {
                    auto ite = g_send_prp.get(i).getGPUIterator();

                    CUDA_LAUNCH((process_ghost_particles_prp<decltype(g_opart_device.toKernel()),decltype(g_send_prp.get(i).toKernel()),decltype(v_prp.toKernel()),prp...>),
                            ite,
                            g_opart_device.toKernel(), g_send_prp.get(i).toKernel(),
                            v_prp.toKernel(),offset);

                    offset += prc_sz.get(i);
                }
            }

#else

            std::cout << __FILE__ << ":" << __LINE__ << " error: RUN_ON_DEVICE requires compiling with NVCC, but this seems to be compiled with a normal compiler" << std::endl;

#endif
        }
        else
        {
            // if no properties must be sent, skip this step
            if (sizeof...(prp) == 0) {return;}

            // Fill the send buffer
            for (size_t i = 0; i < g_opart.size(); i++)
            {
                for (size_t j = 0; j < g_opart.get(i).size(); j++)
                {
                    // source object type
                    typedef decltype(v_prp.get(g_opart.get(i).template get<0>(j))) encap_src;
                    // destination object type
                    typedef decltype(g_send_prp.get(i).get(j)) encap_dst;

                    // Copy only the selected properties
                    object_si_d<encap_src, encap_dst, OBJ_ENCAP, prp...>(v_prp.get(g_opart.get(i).template get<0>(j)), g_send_prp.get(i).get(j));
                }
            }
        }
    }

    /*! \brief Allocate and fill the send buffer for the map function
     *
     * \param v_pos vector of particle positions
     * \param v_prp vector of particle properties
     * \param prc_sz_r for each processor in the list the size of the message to send
     * \param prc_r list of processors to send to
     * \param m_pos sending buffer for positions
     * \param m_prp sending buffer for properties
     * \param prc_sz processor communication size; its first part (together with m_opart)
     *        refers to the local particles. Used only with the RUN_ON_DEVICE option
     * \param opt options
     *
     */
    void fill_send_map_buf(openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos,
            openfpm::vector<prop,Memory,layout_base> & v_prp,
            openfpm::vector<size_t> & prc_sz_r,
            openfpm::vector<size_t> & prc_r,
            openfpm::vector<openfpm::vector<Point<dim,St>,Memory,layout_base,openfpm::grow_policy_identity>> & m_pos,
            openfpm::vector<openfpm::vector<prop,Memory,layout_base,openfpm::grow_policy_identity>> & m_prp,
            openfpm::vector<aggregate<unsigned int, unsigned int>,Memory,layout_base> & prc_sz,
            size_t opt)
    {
        m_prp.resize(prc_sz_r.size());
        m_pos.resize(prc_sz_r.size());
        openfpm::vector<size_t> cnt(prc_sz_r.size());

        for (size_t i = 0; i < prc_sz_r.size() ; i++)
        {
            // set the size and allocate; using mem guarantees that pos and prp are contiguous
            m_pos.get(i).resize(prc_sz_r.get(i));
            m_prp.get(i).resize(prc_sz_r.get(i));
            cnt.get(i) = 0;
        }

        if (opt & RUN_ON_DEVICE)
        {
            if (v_cl.size() == 1)
            {return;}

#if defined(CUDA_GPU) && defined(__NVCC__)

            // The first part of m_opart and prc_sz contains the local particles

    #ifndef TEST1

            v_pos_tmp.resize(prc_sz.template get<0>(0));
            v_prp_tmp.resize(prc_sz.template get<0>(0));

            auto ite = v_pos_tmp.getGPUIterator();

            // fill v_pos_tmp and v_prp_tmp with the local particles
            process_map_particles<decltype(m_opart.toKernel()),decltype(v_pos_tmp.toKernel()),decltype(v_prp_tmp.toKernel()),
                                  decltype(v_pos.toKernel()),decltype(v_prp.toKernel())>
            <<<ite.wthr,ite.thr>>>
            (m_opart.toKernel(),v_pos_tmp.toKernel(), v_prp_tmp.toKernel(),
             v_pos.toKernel(),v_prp.toKernel(),0);

            size_t offset = prc_sz.template get<0>(0);

            // Fill the sending buffers
            for (size_t i = 0 ; i < m_pos.size() ; i++)
            {
                auto ite = m_pos.get(i).getGPUIterator();

                process_map_particles<decltype(m_opart.toKernel()),decltype(m_pos.get(i).toKernel()),decltype(m_prp.get(i).toKernel()),
                                      decltype(v_pos.toKernel()),decltype(v_prp.toKernel())>
                <<<ite.wthr,ite.thr>>>
                (m_opart.toKernel(),m_pos.get(i).toKernel(), m_prp.get(i).toKernel(),
                 v_pos.toKernel(),v_prp.toKernel(),offset);

                offset += prc_sz_r.size();
            }

            // swap the old local particles with the actual local particles
            v_pos_tmp.swap(v_pos);
            v_prp_tmp.swap(v_prp);

    #else

            int rank = v_cl.rank();

            v_pos_tmp.resize(prc_sz.template get<0>(rank));
            v_prp_tmp.resize(prc_sz.template get<0>(rank));

            auto ite = v_pos_tmp.getGPUIterator();

            starts.template deviceToHost<0>();
            size_t offset = starts.template get<0>(rank);

            // skip if there is no work to do
            if (ite.wthr.x != 0)
            {
                // fill v_pos_tmp and v_prp_tmp with the local particles
                CUDA_LAUNCH((process_map_particles<decltype(m_opart.toKernel()),decltype(v_pos_tmp.toKernel()),decltype(v_prp_tmp.toKernel()),
                                                   decltype(v_pos.toKernel()),decltype(v_prp.toKernel())>),
                        ite,
                        m_opart.toKernel(),v_pos_tmp.toKernel(), v_prp_tmp.toKernel(),
                        v_pos.toKernel(),v_prp.toKernel(),offset);
            }

            // Fill the sending buffers
            for (size_t i = 0 ; i < m_pos.size() ; i++)
            {
                size_t offset = starts.template get<0>(prc_r.template get<0>(i));

                auto ite = m_pos.get(i).getGPUIterator();

                // skip if there is no work to do
                if (ite.wthr.x != 0)
                {

                    CUDA_LAUNCH((process_map_particles<decltype(m_opart.toKernel()),decltype(m_pos.get(i).toKernel()),decltype(m_prp.get(i).toKernel()),
                                                       decltype(v_pos.toKernel()),decltype(v_prp.toKernel())>),
                            ite,
                            m_opart.toKernel(),m_pos.get(i).toKernel(), m_prp.get(i).toKernel(),
                            v_pos.toKernel(),v_prp.toKernel(),offset);

                }
            }

            // swap the old local particles with the actual local particles
            v_pos_tmp.swap(v_pos);
            v_prp_tmp.swap(v_prp);

    #endif
#else

            std::cout << __FILE__ << ":" << __LINE__ << " error: RUN_ON_DEVICE requires compiling with NVCC, but this seems to be compiled with a normal compiler" << std::endl;

#endif
        }
        else
        {
            // end vector point
            long int id_end = v_pos.size();

            // end opart point
            long int end = m_opart.size()-1;

            // Run through all the particles and fill the sending buffer
            for (size_t i = 0; i < m_opart.size(); i++)
            {
                process_map_particle<proc_without_prp>(i,end,id_end,m_opart,p_map_req,m_pos,m_prp,v_pos,v_prp,cnt);
            }

            v_pos.resize(v_pos.size() - m_opart.size());
            v_prp.resize(v_prp.size() - m_opart.size());
        }
    }


    /*! \brief Allocate and fill the send buffer for the map function
     *
     * \tparam prp_object object type to send
     * \tparam prp properties to send
     *
     * \param v_pos vector of particle positions
     * \param v_prp vector of particle properties
     * \param prc_sz_r number of particles to send to each processor
     * \param m_pos sending buffer for positions
     * \param m_prp sending buffer for properties
     *
     */
    template<typename prp_object,int ... prp>
    void fill_send_map_buf_list(openfpm::vector<Point<dim, St>> & v_pos,
            openfpm::vector<prop,Memory,layout_base> & v_prp,
            openfpm::vector<size_t> & prc_sz_r,
            openfpm::vector<openfpm::vector<Point<dim,St>>> & m_pos,
            openfpm::vector<openfpm::vector<prp_object>> & m_prp)
    {
        m_prp.resize(prc_sz_r.size());
        m_pos.resize(prc_sz_r.size());
        openfpm::vector<size_t> cnt(prc_sz_r.size());

        for (size_t i = 0; i < prc_sz_r.size(); i++)
        {
            // set the size and allocate; using mem guarantees that pos and prp are contiguous
            m_pos.get(i).resize(prc_sz_r.get(i));
            m_prp.get(i).resize(prc_sz_r.get(i));
            cnt.get(i) = 0;
        }

        // end vector point
        long int id_end = v_pos.size();

        // end opart point
        long int end = m_opart.size()-1;

        // Run through all the particles and fill the sending buffer
        for (size_t i = 0; i < m_opart.size(); i++)
        {
            process_map_particle<proc_with_prp<prp_object,prp...>>(i,end,id_end,m_opart,p_map_req,m_pos,m_prp,v_pos,v_prp,cnt);
        }

        v_pos.resize(v_pos.size() - m_opart.size());
        v_prp.resize(v_prp.size() - m_opart.size());
    }

    /*! \brief Label particles for mapping
     *
     * \param v_pos vector of particle positions
     * \param lbl_p particle labels
     * \param prc_sz for each processor the number of particles to send
     * \param opt options
     *
     */
    template<typename obp> void labelParticleProcessor(openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos,
            openfpm::vector<aggregate<int,int,int>,
                    Memory,
                    layout_base> & lbl_p,
            openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,layout_base> & prc_sz,
            size_t opt)
    {
        if (opt == RUN_ON_DEVICE)
        {
#ifdef __NVCC__

            // Map directly on gpu

            lbl_p.resize(v_pos.size());

            // labelling kernel

            prc_sz.template fill<0>(0);

            auto ite = v_pos.getGPUIterator();
            if (ite.wthr.x == 0)
            {
                starts.resize(v_cl.size());
                starts.template fill<0>(0);
                return;
            }

            // with one process we can skip the labelling ...
            if (v_cl.size() == 1)
            {
                // ... but we still have to apply the boundary conditions

                periodicity_int<dim> bc;

                for (size_t i = 0 ; i < dim ; i++) {bc.bc[i] = dec.periodicity(i);}

                CUDA_LAUNCH((apply_bc_each_part<dim,St,decltype(v_pos.toKernel())>),ite,dec.getDomain(),bc,v_pos.toKernel());

                return;
            }

            // label each particle with the processor it has to go to
            CUDA_LAUNCH((process_id_proc_each_part<dim,St,decltype(dec.toKernel()),decltype(v_pos.toKernel()),decltype(lbl_p.toKernel()),decltype(prc_sz.toKernel())>),
                    ite,
                    dec.toKernel(),v_pos.toKernel(),lbl_p.toKernel(),prc_sz.toKernel(),v_cl.rank());


    #ifndef TEST1

            // sort particles
            mergesort((int *)lbl_p.template getDeviceBuffer<1>(),(int *)lbl_p.template getDeviceBuffer<0>(), lbl_p.size(), mgpu::template less_t<int>(), v_cl.getmgpuContext());

            mem.allocate(sizeof(int));
            mem.fill(0);

            // Find the buffer bases
            find_buffer_offsets<1,decltype(lbl_p.toKernel()),decltype(prc_sz.toKernel())><<<ite.wthr,ite.thr>>>
                    (lbl_p.toKernel(),(int *)mem.getDevicePointer(),prc_sz.toKernel());

#error "should not be here"

            // Transfer the number of offsets to the CPU
            mem.deviceToHost();
            prc_sz.template deviceToHost<0,1>();
            // get also the last element from lbl_p;
            lbl_p.template deviceToHost<1>(lbl_p.size()-1,lbl_p.size()-1);

            mem.deviceToHost();
            int noff = *(int *)mem.getPointer();
            prc_sz.resize(noff+1);
            prc_sz.template get<0>(prc_sz.size()-1) = lbl_p.size();
            prc_sz.template get<1>(prc_sz.size()-1) = lbl_p.template get<1>(lbl_p.size()-1);

    #else

            starts.resize(v_cl.size());
            openfpm::scan((unsigned int *)prc_sz.template getDeviceBuffer<0>(), prc_sz.size(), (unsigned int *)starts.template getDeviceBuffer<0>() , v_cl.getmgpuContext());

            // move prc_sz to host
            prc_sz.template deviceToHost<0>();

            ite = lbl_p.getGPUIterator();

            // we order lbl_p
            CUDA_LAUNCH((reorder_lbl<decltype(lbl_p.toKernel()),decltype(starts.toKernel())>),ite,lbl_p.toKernel(),starts.toKernel());

    #endif

#else

            std::cout << __FILE__ << ":" << __LINE__ << " error: it seems you tried to call map with the RUN_ON_DEVICE option, this requires compiling the program with NVCC" << std::endl;

#endif
        }
        else
        {
            // reset lbl_p
            lbl_p.clear();
            prc_sz_gg.clear();
            o_part_loc.clear();
            g_opart.clear();
            prc_g_opart.clear();

            // resize the label buffer
            prc_sz.template fill<0>(0);

            auto it = v_pos.getIterator();

            // Label all the particles with the processor id where they should go
            while (it.isNext())
            {
                auto key = it.get();

                // Apply the boundary conditions
                dec.applyPointBC(v_pos.get(key));

                size_t p_id = 0;

                // Check if the particle is inside the domain
                if (dec.getDomain().isInside(v_pos.get(key)) == true)
                {p_id = dec.processorID(v_pos.get(key));}
                else
                {p_id = obp::out(key, v_cl.getProcessUnitID());}

                // Particle to move
                if (p_id != v_cl.getProcessUnitID())
                {
                    if ((long int) p_id != -1)
                    {
                        prc_sz.template get<0>(p_id)++;
                        lbl_p.add();
                        lbl_p.last().template get<0>() = key;
                        lbl_p.last().template get<2>() = p_id;
                    }
                    else
                    {
                        lbl_p.add();
                        lbl_p.last().template get<0>() = key;
                        lbl_p.last().template get<2>() = p_id;
                    }
                }

                // Add processors and add size

                ++it;
            }
        }
    }

    /*! \brief Label the particles
     *
     * It counts the number of particles to send to each processor and saves their ids
     *
     * \see nn_prcs::getShiftvectors()
     *
     * \param v_pos vector of particle positions
     * \param v_prp vector of particle properties
     * \param prc list of processors to which particles must be sent
     * \param prc_sz number of particles to send to each processor
     * \param prc_offset processor communication offsets
     * \param g_m ghost marker
     * \param opt ghost_get options
     *
     */
    void labelParticlesGhost(openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos,
            openfpm::vector<prop,Memory,layout_base> & v_prp,
            openfpm::vector<size_t> & prc,
            openfpm::vector<size_t> & prc_sz,
            openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,layout_base> & prc_offset,
            size_t & g_m,
            size_t opt)
    {
        // Buffer that contains for each processor the ids of the particles to send
        prc_sz.clear();
        g_opart.clear();
        g_opart.resize(dec.getNNProcessors());
        prc_g_opart.clear();

        if (opt & RUN_ON_DEVICE)
        {
            labelParticlesGhost_impl<dim,St,prop,Memory,layout_base,
                                     Decomposition,std::is_same<Memory,CudaMemory>::value>
            ::run(mem,dec,g_opart_device,proc_id_out,starts,v_cl,v_pos,v_prp,prc,prc_sz,prc_offset,g_m,opt);
        }
        else
        {
            // Iterate over all particles
            auto it = v_pos.getIteratorTo(g_m);
            while (it.isNext())
            {
                auto key = it.get();

                // Given a particle, it returns which processors require it (first id) and the shift id (second id)
                // For an explanation about shift vectors please consult getShiftVector in ie_ghost
                const openfpm::vector<std::pair<size_t, size_t>> & vp_id = dec.template ghost_processorID_pair<typename Decomposition::lc_processor_id, typename Decomposition::shift_id>(v_pos.get(key), UNIQUE);

                for (size_t i = 0; i < vp_id.size(); i++)
                {
                    // processor id
                    size_t p_id = vp_id.get(i).first;

                    // add particle to communicate
                    g_opart.get(p_id).add();
                    g_opart.get(p_id).last().template get<0>() = key;
                    g_opart.get(p_id).last().template get<1>() = vp_id.get(i).second;
                }

                ++it;
            }

            // remove all zero entries and construct prc (the list of the sending processors)
            openfpm::vector<openfpm::vector<aggregate<size_t,size_t>>> g_opart_f;

            // count the non zero elements
            for (size_t i = 0 ; i < g_opart.size() ; i++)
            {
                if (g_opart.get(i).size() != 0)
                {
                    prc_sz.add(g_opart.get(i).size());
                    g_opart_f.add();
                    g_opart.get(i).swap(g_opart_f.last());
                    prc.add(dec.IDtoProc(i));
                }
            }

            g_opart.swap(g_opart_f);
        }
#ifdef EXTREA_TRACE_PRE_COMM
        Extrae_user_function (0);
#endif
    }

    /*! \brief Call-back to allocate the buffer to receive incoming elements (particles)
     *
     * \param msg_i size required to receive the message from i
     * \param total_msg total size to receive from all the processors
     * \param total_p the total number of processors that want to communicate with you
     * \param i processor id
     * \param ri request id (it is an id that goes from 0 to total_p, and is unique
     *           every time message_alloc is called)
     * \param ptr a pointer to the vector_dist structure
     *
     * \return the pointer where to store the message for processor i
     *
     */
    static void * message_alloc_map(size_t msg_i, size_t total_msg, size_t total_p, size_t i, size_t ri, void * ptr)
    {
        // cast the pointer
        vector_dist_comm<dim, St, prop, Decomposition, Memory, layout_base> * vd = static_cast<vector_dist_comm<dim, St, prop, Decomposition, Memory, layout_base> *>(ptr);

        vd->recv_mem_gm.resize(vd->v_cl.getProcessingUnits());
        vd->recv_mem_gm.get(i).resize(msg_i);

        return vd->recv_mem_gm.get(i).getPointer();
    }

public:

    /*! \brief Copy constructor
     *
     * \param v vector to copy
     *
     */
    vector_dist_comm(const vector_dist_comm<dim,St,prop,Decomposition,Memory,layout_base> & v)
    :v_cl(create_vcluster<Memory>()),dec(create_vcluster()),lg_m(0)
    {
        this->operator=(v);
    }


    /*! \brief Constructor
     *
     * \param dec Domain decomposition
     *
     */
    vector_dist_comm(const Decomposition & dec)
    :v_cl(create_vcluster<Memory>()),dec(dec),lg_m(0)
    {

    }

    /*! \brief Constructor
     *
     * \param dec Domain decomposition
     *
     */
    vector_dist_comm(Decomposition && dec)
    :v_cl(create_vcluster<Memory>()),dec(dec),lg_m(0)
    {

    }

    /*! \brief Constructor
     *
     */
    vector_dist_comm()
    :v_cl(create_vcluster<Memory>()),dec(create_vcluster()),lg_m(0)
    {
    }

    /*! \brief Destructor
     *
     * Release the retained buffers
     *
     */
    ~vector_dist_comm()
    {
        for (size_t i = 0 ; i < hsmem.size() ; i++)
        {
            if (hsmem.get(i).ref() == 1)
                hsmem.get(i).decRef();
            else
                std::cout << __FILE__ << ":" << __LINE__ << " internal error: memory is in an invalid state " << std::endl;
        }

    }

    /*! \brief Get the minimum number of sub-domains per processor
     *
     * \return minimum number
     *
     */
    size_t getDecompositionGranularity()
    {
        return v_sub_unit_factor;
    }

    /*! \brief Set the minimum number of sub-domains per processor
     *
     * \param n_sub
     *
     */
    void setDecompositionGranularity(size_t n_sub)
    {
        this->v_sub_unit_factor = n_sub;
    }

    /*! \brief Initialize the decomposition
     *
     * \param box domain
     * \param bc boundary conditions
     * \param g ghost extension
     * \param opt additional options
     * \param gdist grid information (used when the decomposition is bound to a grid)
     *
     */
    void init_decomposition(Box<dim,St> & box,
            const size_t (& bc)[dim],
            const Ghost<dim,St> & g,
            size_t opt,
            const grid_sm<dim,void> & gdist)
    {
        size_t div[dim];

        if (opt & BIND_DEC_TO_GHOST)
        {
            // padding
            size_t pad = 0;

            // CellDecomposer
            CellDecomposer_sm<dim,St,shift<dim,St>> cd_sm;

            // Calculate the divisions for the symmetric Cell-lists
            cl_param_calculateSym<dim,St>(box,cd_sm,g,pad);

            for (size_t i = 0 ; i < dim ; i++)
            {div[i] = cd_sm.getDiv()[i] - 2*pad;}

            // Create the sub-domains
            dec.setParameters(div, box, bc, g, gdist);
        }
        else
        {
            dec.setGoodParameters(box, bc, g, getDecompositionGranularity(), gdist);
        }
        dec.decompose();
    }

    /*! \brief Initialize the decomposition
     *
     * \param box domain
     * \param bc boundary conditions
     * \param g ghost extension
     * \param opt additional options
     * \param gdist grid information (the decomposition divisions are taken from the grid)
     *
     */
    void init_decomposition_gr_cell(Box<dim,St> & box,
            const size_t (& bc)[dim],
            const Ghost<dim,St> & g,
            size_t opt,
            const grid_sm<dim,void> & gdist)
    {
        size_t div[dim];

        for (size_t i = 0 ; i < dim ; i++)
        {div[i] = gdist.size(i);}

        // Create the sub-domains
        dec.setParameters(div, box, bc, g);

        dec.decompose();
    }
1834
	/*! \brief Synchronize the properties and positions of the ghost particles
	 *
	 * \tparam impl implementation selector (e.g. GHOST_ASYNC for the asynchronous exchange)
	 * \tparam prp list of properties to synchronize
	 *
	 * \param v_pos vector of positions to update
	 * \param v_prp vector of properties to update
	 * \param g_m marker between real and ghost particles
	 * \param opt options; with WITH_POSITION the positional information of the particles is sent as well
1843 *
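	 * A minimal usage sketch through the public vector_dist wrapper, which is
	 * assumed here to forward to this method (wrapper setup not shown):
	 *
	 * \code
	 * vd.map();                        // migrate particles to their owners first
	 * vd.ghost_get<0,2>();             // sync properties 0 and 2 plus positions
	 * vd.ghost_get<0>(SKIP_LABELLING); // re-sync property 0 reusing the previous labelling
	 * \endcode
	 *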
1844 */
1845 template<unsigned int impl, int ... prp> inline void ghost_get_(openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos,
1846 openfpm::vector<prop,Memory,layout_base> & v_prp,
1847 size_t & g_m,
1848 size_t opt = WITH_POSITION)
1849 {
1850#ifdef PROFILE_SCOREP
1851 SCOREP_USER_REGION("ghost_get",SCOREP_USER_REGION_TYPE_FUNCTION)
1852#endif
1853
1854 // Sending property object
1855 typedef object<typename object_creator<typename prop::type, prp...>::type> prp_object;
1856
1857 // send vector for each processor
1858 typedef openfpm::vector<prp_object,Memory,layout_base,openfpm::grow_policy_identity> send_vector;
1859
1860 if (!(opt & NO_POSITION))
1861 {v_pos.resize(g_m);}
1862
		if (!(opt & SKIP_LABELLING))
		{
			// reset the ghost part
			v_prp.resize(g_m);

			// Label all the particles
			labelParticlesGhost(v_pos,v_prp,prc_g_opart,prc_sz_gg,prc_offset,g_m,opt);
		}
1871
1872 {
1873 // Send and receive ghost particle information
1874 openfpm::vector<send_vector> g_send_prp;
1875
1876 fill_send_ghost_prp_buf<send_vector, prp_object, prp...>(v_prp,prc_sz_gg,g_send_prp,opt);
1877
1878 #if defined(CUDA_GPU) && defined(__NVCC__)
1879 cudaDeviceSynchronize();
1880 #endif
1881
		// if there are no properties skip:
		// SSendRecvP sends everything when no properties are given
1884
1885 ghost_exchange_comm_impl<impl,layout_base,prp ...>::template
1886 sendrecv_prp(v_cl,g_send_prp,v_prp,v_pos,prc_g_opart,
1887 prc_recv_get_prp,recv_sz_get_prp,recv_sz_get_byte,g_opart_sz,g_m,opt);
1888 }
1889
1890 if (!(opt & NO_POSITION))
1891 {
1892 // Sending buffer for the ghost particles position
1893 openfpm::vector<send_pos_vector> g_pos_send;
1894
1895 fill_send_ghost_pos_buf(v_pos,prc_sz_gg,g_pos_send,opt,impl == GHOST_ASYNC);
1896
1897#if defined(CUDA_GPU) && defined(__NVCC__)
1898 cudaDeviceSynchronize();
1899#endif
1900
1901 ghost_exchange_comm_impl<impl,layout_base,prp ...>::template
1902 sendrecv_pos(v_cl,g_pos_send,v_prp,v_pos,prc_recv_get_pos,recv_sz_get_pos,prc_g_opart,opt);
1903
1904 // fill g_opart_sz
1905 g_opart_sz.resize(prc_g_opart.size());
1906
1907 for (size_t i = 0 ; i < prc_g_opart.size() ; i++)
1908 g_opart_sz.get(i) = g_pos_send.get(i).size();
1909 }
1910
		// Important: the number of particles in v_prp must equal v_pos.size().
		// Note that when no properties are given (sizeof...(prp) == 0), in general
		// at this point v_prp.size() != v_pos.size()
1914 if (!(opt & SKIP_LABELLING))
1915 {
1916 v_prp.resize(v_pos.size());
1917 }
1918
1919 add_loc_particles_bc(v_pos,v_prp,g_m,opt);
1920 }
1921
	/*! \brief Wait for the completion of an asynchronous ghost_get
	 *
	 * It completes the property and position exchanges started by a previous
	 * ghost_get_ issued with the GHOST_ASYNC implementation
	 *
	 * \tparam prp list of properties to synchronize (the same given to ghost_get_)
	 *
	 * \param v_pos vector of positions to update
	 * \param v_prp vector of properties to update
	 * \param g_m marker between real and ghost particles
	 * \param opt options
1930 *
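	 * A sketch of the expected pairing, written against the internal interface
	 * of this class (v_pos, v_prp, g_m and opt as in the signatures):
	 *
	 * \code
	 * this->template ghost_get_<GHOST_ASYNC,0,1>(v_pos,v_prp,g_m,opt);
	 * // ... overlap independent computation here ...
	 * this->template ghost_wait_<0,1>(v_pos,v_prp,g_m,opt);
	 * \endcode
	 *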
1931 */
1932 template<int ... prp> inline void ghost_wait_(openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos,
1933 openfpm::vector<prop,Memory,layout_base> & v_prp,
1934 size_t & g_m,
1935 size_t opt = WITH_POSITION)
1936 {
1937 // Sending property object
1938 typedef object<typename object_creator<typename prop::type, prp...>::type> prp_object;
1939
1940 // send vector for each processor
1941 typedef openfpm::vector<prp_object,Memory,layout_base,openfpm::grow_policy_identity> send_vector;
1942
1943 // Send and receive ghost particle information
1944 openfpm::vector<send_vector> g_send_prp;
1945 openfpm::vector<send_pos_vector> g_pos_send;
1946
1947 ghost_exchange_comm_impl<GHOST_ASYNC,layout_base,prp ...>::template
1948 sendrecv_prp_wait(v_cl,g_send_prp,v_prp,v_pos,prc_g_opart,
1949 prc_recv_get_prp,recv_sz_get_prp,recv_sz_get_byte,g_opart_sz,g_m,opt);
1950
1951
1952 ghost_exchange_comm_impl<GHOST_ASYNC,layout_base,prp ...>::template
1953 sendrecv_pos_wait(v_cl,g_pos_send,v_prp,v_pos,prc_recv_get_pos,recv_sz_get_pos,prc_g_opart,opt);
1954 }
1955
	/*! \brief Move all the particles that do not belong to the local processor to their respective processors
	 *
	 * In general this function is called after moving the particles, to migrate the
	 * elements that left the local processor, or just after initialization if each
	 * processor contains non-local particles
	 *
	 * Out-of-bound particles are removed (KillParticle policy)
	 *
	 * \tparam prp properties to communicate
	 *
	 * \param v_pos vector of particle positions
	 * \param v_prp vector of particle properties
	 * \param g_m ghost marker
	 * \param opt options
1970 *
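	 * A usage sketch through the public vector_dist wrapper (assumed to forward
	 * here); communicating only a subset of the properties reduces the message size:
	 *
	 * \code
	 * vd.template map_list<0,1>(); // migrate particles, sending only properties 0 and 1
	 * \endcode
	 *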
1971 */
1972 template<unsigned int ... prp> void map_list_(openfpm::vector<Point<dim, St>> & v_pos, openfpm::vector<prop> & v_prp, size_t & g_m, size_t opt)
1973 {
1974 if (opt & RUN_ON_DEVICE)
1975 {
1976 std::cout << "Error: " << __FILE__ << ":" << __LINE__ << " map_list is unsupported on device (coming soon)" << std::endl;
1977 return;
1978 }
1979
1980 typedef KillParticle obp;
1981
1982 // Processor communication size
1983 openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,layout_base> prc_sz(v_cl.getProcessingUnits());
1984
1985 // map completely reset the ghost part
1986 v_pos.resize(g_m);
1987 v_prp.resize(g_m);
1988
		// m_opart contains the processor id of each particle (i.e. where it has to go)
1990 labelParticleProcessor<obp>(v_pos,m_opart, prc_sz,opt);
1991
1992 // Calculate the sending buffer size for each processor, put this information in
1993 // a contiguous buffer
1994 p_map_req.resize(v_cl.getProcessingUnits());
1995 openfpm::vector<size_t> prc_sz_r;
1996 openfpm::vector<size_t> prc_r;
1997
1998 for (size_t i = 0; i < v_cl.getProcessingUnits(); i++)
1999 {
2000 if (prc_sz.template get<0>(i) != 0)
2001 {
2002 p_map_req.get(i) = prc_r.size();
2003 prc_r.add(i);
2004 prc_sz_r.add(prc_sz.template get<0>(i));
2005 }
2006 }
2007
2008 if (opt & MAP_LOCAL)
2009 {
			// if the map is local we indicate that we receive only from the neighboring processors
2011
2012 prc_recv_map.clear();
2013 for (size_t i = 0 ; i < dec.getNNProcessors() ; i++)
2014 {prc_recv_map.add(dec.IDtoProc(i));}
2015 }
2016
2017 // Sending property object
2018 typedef object<typename object_creator<typename prop::type, prp...>::type> prp_object;
2019
2020 //! position vector
2021 openfpm::vector<openfpm::vector<Point<dim, St>>> m_pos;
2022 //! properties vector
2023 openfpm::vector<openfpm::vector<prp_object>> m_prp;
2024
2025 fill_send_map_buf_list<prp_object,prp...>(v_pos,v_prp,prc_sz_r, m_pos, m_prp);
2026
2027 v_cl.SSendRecv(m_pos,v_pos,prc_r,prc_recv_map,recv_sz_map,opt);
2028 v_cl.template SSendRecvP<openfpm::vector<prp_object>,decltype(v_prp),layout_base,prp...>(m_prp,v_prp,prc_r,prc_recv_map,recv_sz_map,opt);
2029
2030 // mark the ghost part
2031
2032 g_m = v_pos.size();
2033 }
2034
	/*! \brief Move all the particles that do not belong to the local processor to their respective processors
	 *
	 * In general this function is called after moving the particles, to migrate the
	 * elements that left the local processor, or just after initialization if each
	 * processor contains non-local particles
	 *
	 * \tparam obp out-of-bound policy; it specifies what to do when particles are
	 *         detected out of bound (default: KillParticle)
	 *
	 * \param v_pos vector of particle positions
	 * \param v_prp vector of particle properties
	 * \param g_m ghost marker
	 * \param opt options
2046 *
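	 * A usage sketch through the public vector_dist wrapper (assumed to forward
	 * here); with the default policy, particles that leave the domain are killed:
	 *
	 * \code
	 * vd.map(); // obp = KillParticle
	 * \endcode
	 *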
2047 */
2048 template<typename obp = KillParticle>
2049 void map_(openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos,
2050 openfpm::vector<prop,Memory,layout_base> & v_prp, size_t & g_m,
2051 size_t opt)
2052 {
2053#ifdef PROFILE_SCOREP
2054 SCOREP_USER_REGION("map",SCOREP_USER_REGION_TYPE_FUNCTION)
2055#endif
2056
2057 prc_sz.resize(v_cl.getProcessingUnits());
2058
2059 // map completely reset the ghost part
2060 v_pos.resize(g_m);
2061 v_prp.resize(g_m);
2062
		// m_opart contains the processor id of each particle (i.e. where it has to go)
2064 labelParticleProcessor<obp>(v_pos,m_opart, prc_sz,opt);
2065
2066 openfpm::vector<size_t> prc_sz_r;
2067 openfpm::vector<size_t> prc_r;
2068
2069 // Calculate the sending buffer size for each processor, put this information in
2070 // a contiguous buffer
2071 calc_send_buffers(prc_sz,prc_sz_r,prc_r,opt);
2072
2073 //! position vector
2074 openfpm::vector<openfpm::vector<Point<dim, St>,Memory,layout_base,openfpm::grow_policy_identity>> m_pos;
2075 //! properties vector
2076 openfpm::vector<openfpm::vector<prop,Memory,layout_base,openfpm::grow_policy_identity>> m_prp;
2077
2078 fill_send_map_buf(v_pos,v_prp, prc_sz_r,prc_r, m_pos, m_prp,prc_sz,opt);
2079
2080 size_t opt_ = 0;
2081 if (opt & RUN_ON_DEVICE)
2082 {
2083#if defined(CUDA_GPU) && defined(__NVCC__)
2084 // Before doing the communication on RUN_ON_DEVICE we have to be sure that the previous kernels complete
2085 cudaDeviceSynchronize();
2086 opt_ |= MPI_GPU_DIRECT;
2087#else
2088 std::cout << __FILE__ << ":" << __LINE__ << " error: to use the option RUN_ON_DEVICE you must compile with NVCC" << std::endl;
2089#endif
2090 }
2091
2092 v_cl.template SSendRecv<openfpm::vector<Point<dim, St>,Memory,layout_base,openfpm::grow_policy_identity>,
2093 openfpm::vector<Point<dim, St>,Memory,layout_base>,
2094 layout_base>
2095 (m_pos,v_pos,prc_r,prc_recv_map,recv_sz_map,opt_);
2096
2097 v_cl.template SSendRecv<openfpm::vector<prop,Memory,layout_base,openfpm::grow_policy_identity>,
2098 openfpm::vector<prop,Memory,layout_base>,
2099 layout_base>
2100 (m_prp,v_prp,prc_r,prc_recv_map,recv_sz_map,opt_);
2101
2102 // mark the ghost part
2103
2104 g_m = v_pos.size();
2105 }
2106
2107 /*! \brief Get the decomposition
2108 *
	 * \return the decomposition
2110 *
2111 */
2112 inline Decomposition & getDecomposition()
2113 {
2114 return dec;
2115 }
2116
2117 /*! \brief Get the decomposition
2118 *
	 * \return the decomposition
2120 *
2121 */
2122 inline const Decomposition & getDecomposition() const
2123 {
2124 return dec;
2125 }
2126
2127 /*! \brief Copy a vector
2128 *
2129 * \param vc vector to copy
2130 *
	 * \return itself
2132 *
2133 */
2134 vector_dist_comm<dim,St,prop,Decomposition,Memory,layout_base> & operator=(const vector_dist_comm<dim,St,prop,Decomposition,Memory,layout_base> & vc)
2135 {
2136 dec = vc.dec;
2137
2138 return *this;
2139 }
2140
	/*! \brief Move a vector
	 *
	 * \param vc vector to move from
	 *
	 * \return itself
2146 *
2147 */
2148 vector_dist_comm<dim,St,prop,Decomposition,Memory,layout_base> & operator=(vector_dist_comm<dim,St,prop,Decomposition,Memory,layout_base> && vc)
2149 {
		dec = std::move(vc.dec);
2151
2152 return *this;
2153 }
2154
	/*! \brief Ghost put
	 *
	 * It sends the properties of the ghost particles back to the processors that
	 * own the corresponding real particles and merges them with the operation op
	 *
	 * \tparam op operation to apply when merging
	 * \tparam prp set of properties to send back
	 *
	 * \param v_pos vector of particle positions
	 * \param v_prp vector of particle properties
	 * \param g_m ghost marker
	 * \param opt options
2164 *
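	 * A usage sketch through the public vector_dist wrapper (assumed to forward
	 * here); add_ merges by summation:
	 *
	 * \code
	 * vd.ghost_get<0>();               // a ghost_get is required before a ghost_put
	 * // ... accumulate contributions into the ghost particles ...
	 * vd.template ghost_put<add_,0>(); // send them back and sum onto the owners
	 * \endcode
	 *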
2165 */
2166 template<template<typename,typename> class op, int ... prp>
2167 void ghost_put_(openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos,
2168 openfpm::vector<prop,Memory,layout_base> & v_prp,
2169 size_t & g_m,
2170 size_t opt)
2171 {
2172 // Sending property object
2173 typedef object<typename object_creator<typename prop::type, prp...>::type> prp_object;
2174
2175 // send vector for each processor
2176 typedef openfpm::vector<prp_object,Memory,layout_base> send_vector;
2177
2178 openfpm::vector<send_vector> g_send_prp;
2179 fill_send_ghost_put_prp_buf<send_vector, prp_object, prp...>(v_prp,g_send_prp,g_m,opt);
2180
2181 if (opt & RUN_ON_DEVICE)
2182 {
2183#if defined(CUDA_GPU) && defined(__NVCC__)
2184 // Before doing the communication on RUN_ON_DEVICE we have to be sure that the previous kernels complete
2185 cudaDeviceSynchronize();
2186#else
2187 std::cout << __FILE__ << ":" << __LINE__ << " error: to use the option RUN_ON_DEVICE you must compile with NVCC" << std::endl;
2188#endif
2189 }
2190
2191 // Send and receive ghost particle information
2192 if (opt & NO_CHANGE_ELEMENTS)
2193 {
2194 size_t opt_ = compute_options(opt);
2195
2196 if (opt & RUN_ON_DEVICE)
2197 {
2198 op_ssend_recv_merge_gpu<op,decltype(g_opart_device),decltype(prc_offset)> opm(g_opart_device,prc_offset);
2199 v_cl.template SSendRecvP_op<op_ssend_recv_merge_gpu<op,decltype(g_opart_device),decltype(prc_offset)>,
2200 send_vector,
2201 decltype(v_prp),
2202 layout_base,
2203 prp...>(g_send_prp,v_prp,prc_recv_get_prp,opm,prc_g_opart,g_opart_sz,opt_);
2204 }
2205 else
2206 {
2207 op_ssend_recv_merge<op,decltype(g_opart)> opm(g_opart);
2208 v_cl.template SSendRecvP_op<op_ssend_recv_merge<op,decltype(g_opart)>,
2209 send_vector,
2210 decltype(v_prp),
2211 layout_base,
2212 prp...>(g_send_prp,v_prp,prc_recv_get_prp,opm,prc_g_opart,g_opart_sz,opt_);
2213 }
2214 }
2215 else
2216 {
2217 size_t opt_ = compute_options(opt);
2218
2219 if (opt & RUN_ON_DEVICE)
2220 {
2221 op_ssend_recv_merge_gpu<op,decltype(g_opart_device),decltype(prc_offset)> opm(g_opart_device,prc_offset);
2222 v_cl.template SSendRecvP_op<op_ssend_recv_merge_gpu<op,decltype(g_opart_device),decltype(prc_offset)>,
2223 send_vector,
2224 decltype(v_prp),
2225 layout_base,
2226 prp...>(g_send_prp,v_prp,get_last_ghost_get_num_proc_vector(),opm,prc_recv_put,recv_sz_put,opt_);
2227 }
2228 else
2229 {
2230 op_ssend_recv_merge<op,decltype(g_opart)> opm(g_opart);
2231 v_cl.template SSendRecvP_op<op_ssend_recv_merge<op,decltype(g_opart)>,
2232 send_vector,
2233 decltype(v_prp),
2234 layout_base,
2235 prp...>(g_send_prp,v_prp,get_last_ghost_get_num_proc_vector(),opm,prc_recv_put,recv_sz_put,opt_);
2236 }
2237 }
2238
2239 // process also the local replicated particles
2240
2241 if (lg_m < v_prp.size() && v_prp.size() - lg_m != o_part_loc.size())
2242 {
2243 std::cerr << "Error: " << __FILE__ << ":" << __LINE__ << " Local ghost particles = " << v_prp.size() - lg_m << " != " << o_part_loc.size() << std::endl;
2244 std::cerr << "Error: " << __FILE__ << ":" << __LINE__ << " Check that you did a ghost_get before a ghost_put" << std::endl;
2245 }
2246
2247
2248 if (opt & RUN_ON_DEVICE)
2249 {
2250 v_prp.template merge_prp_v_device<op,prop,Memory,
2251 openfpm::grow_policy_double,
2252 layout_base,
2253 decltype(o_part_loc),prp ...>(v_prp,lg_m,o_part_loc);
2254 }
2255 else
2256 {
2257 v_prp.template merge_prp_v<op,prop,Memory,
2258 openfpm::grow_policy_double,
2259 layout_base,
2260 decltype(o_part_loc),prp ...>(v_prp,lg_m,o_part_loc);
2261 }
2262 }
2263};
2264
2265
2266#endif /* SRC_VECTOR_VECTOR_DIST_COMM_HPP_ */
2267