/*
 * vector_dist_comm_util_funcs.hpp
 *
 *  Created on: Sep 13, 2018
 *      Author: i-bird
 */

#ifndef VECTOR_DIST_COMM_UTIL_FUNCS_HPP_
#define VECTOR_DIST_COMM_UTIL_FUNCS_HPP_

#include "util/common_pdata.hpp"

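//! Option flags used by the vector_dist communication routines (ghost_get / map).
//! Their exact semantics are defined by the callers of these utility functions.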
constexpr int NO_POSITION = 1;
constexpr int WITH_POSITION = 2;
constexpr int NO_CHANGE_ELEMENTS = 4;

constexpr int BIND_DEC_TO_GHOST = 1;

constexpr int MAP_LOCAL = 2;

constexpr int GHOST_SYNC = 0;
constexpr int GHOST_ASYNC = 1;

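/*! \brief Label the particles that must be sent to the other processors as ghost
 *
 * This primary template is the non-CUDA fallback: it is selected when the data
 * structures are not CUDA-enabled and only reports an error at run time. The
 * actual work is done by the specialization below.
 */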
template<unsigned int dim, typename St, typename prop, typename Memory, template<typename> class layout_base, typename Decomposition, bool is_ok_cuda>
struct labelParticlesGhost_impl
{
	static void run(CudaMemory & mem,
					Decomposition & dec,
					openfpm::vector<aggregate<unsigned int,unsigned long int>,
							CudaMemory,
							memory_traits_inte> & g_opart_device,
					openfpm::vector<aggregate<unsigned int>,
							Memory,
							layout_base> & proc_id_out,
					openfpm::vector<aggregate<unsigned int>,
							Memory,
							layout_base> & starts,
					Vcluster<Memory> & v_cl,
					openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos,
					openfpm::vector<prop,Memory,layout_base> & v_prp,
					openfpm::vector<size_t> & prc,
					openfpm::vector<size_t> & prc_sz,
					openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,layout_base> & prc_offset,
					size_t & g_m,
					size_t opt)
	{
		std::cout << __FILE__ << ":" << __LINE__ << " error: you are trying to use CUDA functions on a non-CUDA-enabled data structure" << std::endl;
	}
};

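/*! \brief Label the particles that must be sent to the other processors as ghost (GPU path)
 *
 * Outline of the steps performed on the device:
 *  1. count, for each particle, how many ghost entries it generates
 *  2. scan the counters to obtain the write offsets
 *  3. write one (processor id, particle id) pair per ghost entry
 *  4. sort the pairs by processor id and detect the per-processor buffer offsets
 *  5. copy the offsets to the host and fill prc / prc_sz with the destination
 *     processors and the number of particles to send to each of them
 */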
template<unsigned int dim, typename St, typename prop, typename Memory, template<typename> class layout_base, typename Decomposition>
struct labelParticlesGhost_impl<dim,St,prop,Memory,layout_base,Decomposition,true>
{
	static void run(CudaMemory & mem,
					Decomposition & dec,
					openfpm::vector<aggregate<unsigned int,unsigned long int>,
							CudaMemory,
							memory_traits_inte> & g_opart_device,
					openfpm::vector<aggregate<unsigned int>,
							Memory,
							layout_base> & proc_id_out,
					openfpm::vector<aggregate<unsigned int>,
							Memory,
							layout_base> & starts,
					Vcluster<Memory> & v_cl,
					openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos,
					openfpm::vector<prop,Memory,layout_base> & v_prp,
					openfpm::vector<size_t> & prc,
					openfpm::vector<size_t> & prc_sz,
					openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,layout_base> & prc_offset,
					size_t & g_m,
					size_t opt)
	{
#if defined(CUDA_GPU) && defined(__NVCC__)

			if (v_cl.size() == 1)
			{return;}

			proc_id_out.resize(v_pos.size()+1);
			proc_id_out.template get<0>(proc_id_out.size()-1) = 0;
			proc_id_out.template hostToDevice(proc_id_out.size()-1,proc_id_out.size()-1);

			auto ite = v_pos.getGPUIterator();

			// no work to do, return
			if (ite.wthr.x == 0)
			{return;}

			// First we count how many ghost entries each particle produces
			CUDA_LAUNCH((num_proc_ghost_each_part<dim,St,decltype(dec.toKernel()),decltype(v_pos.toKernel()),decltype(proc_id_out.toKernel())>),
			ite,
			dec.toKernel(),v_pos.toKernel(),proc_id_out.toKernel());

			// Scan the per-particle counters to obtain the write offsets (starts)
			starts.resize(proc_id_out.size());
			openfpm::scan((unsigned int *)proc_id_out.template getDeviceBuffer<0>(), proc_id_out.size(), (unsigned int *)starts.template getDeviceBuffer<0>() , v_cl.getmgpuContext());
			starts.template deviceToHost<0>(starts.size()-1,starts.size()-1);
			size_t sz = starts.template get<0>(starts.size()-1);

			// allocate space for the (processor id, particle id) pairs
			g_opart_device.resize(sz);

			ite = v_pos.getGPUIterator();

			// we compute the processor id for each particle
			CUDA_LAUNCH((proc_label_id_ghost<dim,St,decltype(dec.toKernel()),decltype(v_pos.toKernel()),decltype(starts.toKernel()),decltype(g_opart_device.toKernel())>),
			ite,
			dec.toKernel(),v_pos.toKernel(),starts.toKernel(),g_opart_device.toKernel());

			// sort the particles by destination processor
			openfpm::sort((int *)g_opart_device.template getDeviceBuffer<0>(),(long unsigned int *)g_opart_device.template getDeviceBuffer<1>(), g_opart_device.size(), mgpu::template less_t<int>(), v_cl.getmgpuContext());

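			// Allocate a device counter used by find_buffer_offsets to count the processor
			// boundaries in the sorted list, and reserve one entry per processor in prc_offset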
			mem.allocate(sizeof(int));
			mem.fill(0);
			prc_offset.resize(v_cl.size());

			ite = g_opart_device.getGPUIterator();

			if (ite.wthr.x != 0)
			{
				// Find the buffer bases
				CUDA_LAUNCH((find_buffer_offsets<0,decltype(g_opart_device.toKernel()),decltype(prc_offset.toKernel())>),
					    ite,
					    g_opart_device.toKernel(),(int *)mem.getDevicePointer(),prc_offset.toKernel());
			}

			// Transfer the number of offsets to the CPU
			mem.deviceToHost();
			int noff = *(int *)mem.getPointer();

			// add the terminal (sentinel) entry to prc_offset
			prc_offset.resize(noff+1,DATA_ON_DEVICE);

			// Copy the last processor id to the host and fill the terminal entry
			if (g_opart_device.size() != 0)
			{g_opart_device.template deviceToHost<0>(g_opart_device.size()-1,g_opart_device.size()-1);}
			prc_offset.template get<0>(prc_offset.size()-1) = g_opart_device.size();
			if (g_opart_device.size() != 0)
			{prc_offset.template get<1>(prc_offset.size()-1) = g_opart_device.template get<0>(g_opart_device.size()-1);}
			else
			{prc_offset.template get<1>(prc_offset.size()-1) = 0;}

			prc_offset.template hostToDevice<0,1>(prc_offset.size()-1,prc_offset.size()-1);

			// Reorder the offsets in ascending order
			openfpm::sort((int *)prc_offset.template getDeviceBuffer<0>(),(int *)prc_offset.template getDeviceBuffer<1>(), prc_offset.size(), mgpu::template less_t<int>(), v_cl.getmgpuContext());

			prc_offset.template deviceToHost<0,1>();

			// In this case there are no communications at all
			if (g_opart_device.size() == 0)
			{noff = -1;}

			prc.resize(noff+1);
			prc_sz.resize(noff+1);

			size_t base_offset = 0;

			// Transfer to prc the list of destination processors and to prc_sz the number of particles to send to each of them
			for (size_t i = 0 ; i < noff+1 ; i++)
			{
				prc.get(i) = prc_offset.template get<1>(i);
				prc_sz.get(i) = prc_offset.template get<0>(i) - base_offset;
				base_offset = prc_offset.template get<0>(i);
			}
#else

			std::cout << __FILE__ << ":" << __LINE__ << " error: to use GPU computation you must compile vector_dist.hpp with NVCC" << std::endl;

#endif
	}
};

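/*! \brief Construct the local ghost particles from a precomputed list of ghost entries (o_part_loc)
 *
 * Non-CUDA fallback: selected when the data structures are not CUDA-enabled, it
 * only reports an error at run time. The actual work is done by the specialization below.
 */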
template<bool with_pos, unsigned int dim, typename St, typename prop, typename Memory, template <typename> class layout_base, bool is_ok_cuda>
struct local_ghost_from_opart_impl
{
	static void run(openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,layout_base> & o_part_loc,
					const openfpm::vector<Point<dim, St>,Memory,layout_base> & shifts,
					openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos,
					openfpm::vector<prop,Memory,layout_base> & v_prp,
					size_t opt)
	{
		std::cout << __FILE__ << ":" << __LINE__ << " error: you are trying to use CUDA functions on a non-CUDA-enabled data structure" << std::endl;
	}
};

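/*! \brief Construct the local ghost particles from a precomputed list of ghost entries (GPU path)
 *
 * For every entry of o_part_loc a ghost replica is appended at the end of
 * v_pos / v_prp, shifted by the corresponding vector in shifts.
 */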
template<bool with_pos, unsigned int dim, typename St, typename prop, typename Memory, template <typename> class layout_base>
struct local_ghost_from_opart_impl<with_pos,dim,St,prop,Memory,layout_base,true>
{
	static void run(openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,layout_base> & o_part_loc,
					const openfpm::vector<Point<dim, St>,Memory,layout_base> & shifts,
					openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos,
					openfpm::vector<prop,Memory,layout_base> & v_prp,
					size_t opt)
	{
#if defined(CUDA_GPU) && defined(__NVCC__)

				auto ite = o_part_loc.getGPUIterator();

				size_t old = v_pos.size();

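				// Make room at the end of v_pos / v_prp for the local ghost replicas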
				if (!(opt & NO_POSITION))
				{v_pos.resize(v_pos.size() + o_part_loc.size(),DATA_ON_DEVICE);}

				if (!(opt & SKIP_LABELLING))
				{
					v_prp.resize(v_prp.size() + o_part_loc.size(),DATA_ON_DEVICE);
				}

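				// Append the ghost replicas at the end of v_pos / v_prp (positions are shifted when with_pos is true)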
				if (ite.wthr.x != 0)
				{
					CUDA_LAUNCH((process_ghost_particles_local<with_pos,dim,decltype(o_part_loc.toKernel()),decltype(v_pos.toKernel()),decltype(v_prp.toKernel()),decltype(shifts.toKernel())>),
					ite,
					o_part_loc.toKernel(),v_pos.toKernel(),v_prp.toKernel(),shifts.toKernel(),old);
				}
#else
				std::cout << __FILE__ << ":" << __LINE__ << " error: to use the option RUN_ON_DEVICE you must compile with NVCC" << std::endl;
#endif
	}
};

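/*! \brief Construct the local ghost particles directly from the decomposition (ghost boxes)
 *
 * Non-CUDA fallback: selected when the data structures are not CUDA-enabled, it
 * only reports an error at run time. The actual work is done by the specialization below.
 */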
template<unsigned int dim, typename St, typename prop, typename Memory, template <typename> class layout_base, bool is_ok_cuda>
struct local_ghost_from_dec_impl
{
	static void run(openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,layout_base> & o_part_loc,
					const openfpm::vector<Point<dim, St>,Memory,layout_base> & shifts,
					openfpm::vector<Box<dim, St>,Memory,layout_base> & box_f_dev,
					openfpm::vector<aggregate<unsigned int>,Memory,layout_base> & box_f_sv,
					Vcluster<Memory> & v_cl,
					openfpm::vector<aggregate<unsigned int>,Memory,layout_base> & starts,
					openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos,
					openfpm::vector<prop,Memory,layout_base> & v_prp,
					size_t & g_m,
					size_t opt)
	{
		std::cout << __FILE__ << ":" << __LINE__ << " error: you are trying to use CUDA functions on a non-CUDA-enabled data structure" << std::endl;
	}
};

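/*! \brief Construct the local ghost particles directly from the decomposition (GPU path)
 *
 * Each assigned particle (index < g_m) is tested against the ghost boxes in
 * box_f_dev; a scan of the per-particle counters gives the write offsets and a
 * second kernel appends the shifted replicas at the end of v_pos / v_prp.
 */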
template<unsigned int dim, typename St, typename prop, typename Memory, template <typename> class layout_base>
struct local_ghost_from_dec_impl<dim,St,prop,Memory,layout_base,true>
{
	static void run(openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,layout_base> & o_part_loc,
					const openfpm::vector<Point<dim, St>,Memory,layout_base> & shifts,
					openfpm::vector<Box<dim, St>,Memory,layout_base> & box_f_dev,
					openfpm::vector<aggregate<unsigned int>,Memory,layout_base> & box_f_sv,
					Vcluster<Memory> & v_cl,
					openfpm::vector<aggregate<unsigned int>,Memory,layout_base> & starts,
					openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos,
					openfpm::vector<prop,Memory,layout_base> & v_prp,
					size_t & g_m,
					size_t opt)
	{
#if defined(CUDA_GPU) && defined(__NVCC__)

		o_part_loc.resize(g_m+1);
		o_part_loc.template get<0>(o_part_loc.size()-1) = 0;
		o_part_loc.template hostToDevice(o_part_loc.size()-1,o_part_loc.size()-1);

		// Iterate over the internal (assigned) particles only
		auto ite = v_pos.getGPUIteratorTo(g_m);

		// count how many ghost replicas (shifts) each particle generates
		CUDA_LAUNCH((num_shift_ghost_each_part<dim,St,decltype(box_f_dev.toKernel()),decltype(box_f_sv.toKernel()),decltype(v_pos.toKernel()),decltype(o_part_loc.toKernel())>),
		ite,
		box_f_dev.toKernel(),box_f_sv.toKernel(),v_pos.toKernel(),o_part_loc.toKernel(),g_m);

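		// Scan the per-particle counters to obtain the write offsets (starts)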
		starts.resize(o_part_loc.size());
		openfpm::scan((unsigned int *)o_part_loc.template getDeviceBuffer<0>(), o_part_loc.size(), (unsigned int *)starts.template getDeviceBuffer<0>() , v_cl.getmgpuContext());

		starts.template deviceToHost<0>(starts.size()-1,starts.size()-1);
		size_t total = starts.template get<0>(starts.size()-1);
		size_t old = v_pos.size();

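		// Make room at the end of v_pos / v_prp for the local ghost particles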
		v_pos.resize(v_pos.size() + total);
		v_prp.resize(v_prp.size() + total);

		// Iterate again over the internal (assigned) particles to generate the shifted replicas
		ite = v_pos.getGPUIteratorTo(g_m);

		// resize o_part_loc to hold one entry per generated ghost replica
		o_part_loc.resize(total);

		CUDA_LAUNCH((shift_ghost_each_part<dim,St,decltype(box_f_dev.toKernel()),decltype(box_f_sv.toKernel()),
									 decltype(v_pos.toKernel()),decltype(v_prp.toKernel()),
									 decltype(starts.toKernel()),decltype(shifts.toKernel()),
									 decltype(o_part_loc.toKernel())>),
		ite,
		box_f_dev.toKernel(),box_f_sv.toKernel(),
		 v_pos.toKernel(),v_prp.toKernel(),
		 starts.toKernel(),shifts.toKernel(),o_part_loc.toKernel(),old,g_m);

#else
		std::cout << __FILE__ << ":" << __LINE__ << " error: to use the option RUN_ON_DEVICE you must compile with NVCC" << std::endl;
#endif
	}
};

#endif /* VECTOR_DIST_COMM_UTIL_FUNCS_HPP_ */