1 | /* |
2 | * vector_dist_comm_util_funcs.hpp |
3 | * |
4 | * Created on: Sep 13, 2018 |
5 | * Author: i-bird |
6 | */ |
7 | |
8 | #ifndef VECTOR_DIST_COMM_UTIL_FUNCS_HPP_ |
9 | #define VECTOR_DIST_COMM_UTIL_FUNCS_HPP_ |
10 | |
11 | #include "util/common_pdata.hpp" |
12 | |
// Bit-mask option flags combined into the `opt` argument of the ghost/map
// communication routines below.

//! when set, particle positions are not grown/written by the ghost fill
//! (only properties) — see local_ghost_from_opart_impl
constexpr int NO_POSITION = 1;
//! when set, particle positions take part in the exchange — TODO(review):
//! confirm exact semantics against the callers in vector_dist_comm
constexpr int WITH_POSITION = 2;
//! presumably indicates the number of elements is unchanged since the last
//! call (allowing buffer reuse) — confirm against callers
constexpr int NO_CHANGE_ELEMENTS = 4;

//! presumably binds the decomposition to the ghost extension — confirm
//! against vector_dist construction options
constexpr int BIND_DEC_TO_GHOST = 1;

//! map() option: presumably restricts the exchange to local/neighbor
//! processors only — confirm against callers
constexpr int MAP_LOCAL = 2;

//! ghost_get executes synchronously
constexpr int GHOST_SYNC = 0;
//! ghost_get executes asynchronously
constexpr int GHOST_ASYNC = 1;
23 | |
24 | template<unsigned int dim, typename St, typename prop, typename Memory, template<typename> class layout_base, typename Decomposition, bool is_ok_cuda> |
25 | struct labelParticlesGhost_impl |
26 | { |
27 | static void run(CudaMemory & mem, |
28 | Decomposition & dec, |
29 | openfpm::vector<aggregate<unsigned int,unsigned long int>, |
30 | CudaMemory, |
31 | memory_traits_inte> & g_opart_device, |
32 | openfpm::vector<aggregate<unsigned int>, |
33 | Memory, |
34 | layout_base> & proc_id_out, |
35 | openfpm::vector<aggregate<unsigned int>, |
36 | Memory, |
37 | layout_base> & starts, |
38 | Vcluster<Memory> & v_cl, |
39 | openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos, |
40 | openfpm::vector<prop,Memory,layout_base> & v_prp, |
41 | openfpm::vector<size_t> & prc, |
42 | openfpm::vector<size_t> & prc_sz, |
43 | openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,layout_base> & prc_offset, |
44 | size_t & g_m, |
45 | size_t opt) |
46 | { |
47 | std::cout << __FILE__ << ":" << __LINE__ << " error, you are trying to use using Cuda functions for a non cuda enabled data-structures" << std::endl; |
48 | } |
49 | }; |
50 | |
51 | |
52 | |
/*! \brief Ghost-labelling stage: GPU implementation (CUDA-enabled structures).
 *
 * Computes, entirely on the device, which particles must be replicated as
 * ghosts on which neighboring processors:
 *   1. count the number of ghost copies each particle generates,
 *   2. exclusive-scan the counts to get per-particle output offsets,
 *   3. emit one (processor id, particle id) pair per ghost copy,
 *   4. sort the pairs by processor and detect the per-processor offsets,
 *   5. transfer the processor list and send-counts to the host (prc, prc_sz).
 *
 * \param mem scratch CudaMemory, used as a device counter for detected offsets
 * \param dec domain decomposition, provides the ghost-query kernels
 * \param g_opart_device out: (processor id, particle id) pairs sorted by processor
 * \param proc_id_out scratch: ghost-copy count per particle (+1 terminator slot)
 * \param starts scratch: exclusive scan of proc_id_out
 * \param v_cl Vcluster, provides communicator size and the mgpu context
 * \param v_pos particle positions (input of the labelling kernels)
 * \param v_prp particle properties (not touched here; kept for interface symmetry)
 * \param prc out (host): list of neighbor processors to send to
 * \param prc_sz out (host): number of particles to send to each processor in prc
 * \param prc_offset scratch/out: per-processor base offsets into g_opart_device
 * \param g_m number of real (non-ghost) particles — unused in this stage
 * \param opt option flags — unused in this stage
 */
template<unsigned int dim, typename St, typename prop, typename Memory, template<typename> class layout_base, typename Decomposition>
struct labelParticlesGhost_impl<dim,St,prop,Memory,layout_base,Decomposition,true>
{
	static void run(CudaMemory & mem,
	                Decomposition & dec,
	                openfpm::vector<aggregate<unsigned int,unsigned long int>,
	                                CudaMemory,
	                                memory_traits_inte> & g_opart_device,
	                openfpm::vector<aggregate<unsigned int>,
	                                Memory,
	                                layout_base> & proc_id_out,
	                openfpm::vector<aggregate<unsigned int>,
	                                Memory,
	                                layout_base> & starts,
	                Vcluster<Memory> & v_cl,
	                openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos,
	                openfpm::vector<prop,Memory,layout_base> & v_prp,
	                openfpm::vector<size_t> & prc,
	                openfpm::vector<size_t> & prc_sz,
	                openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,layout_base> & prc_offset,
	                size_t & g_m,
	                size_t opt)
	{
#if defined(CUDA_GPU) && defined(__NVCC__)

		// single process: no neighbor to exchange ghosts with
		if (v_cl.size() == 1)
		{return;}

		// one counter per particle plus a zeroed terminator slot, so that the
		// exclusive scan's last entry yields the total number of ghost copies
		proc_id_out.resize(v_pos.size()+1);
		proc_id_out.template get<0>(proc_id_out.size()-1) = 0;
		proc_id_out.template hostToDevice(proc_id_out.size()-1,proc_id_out.size()-1);

		auto ite = v_pos.getGPUIterator();

		// no particles, no work to do: return
		if (ite.wthr.x == 0)
		{return;}

		// First pass: count how many ghost entries each particle produces
		CUDA_LAUNCH((num_proc_ghost_each_part<dim,St,decltype(dec.toKernel()),decltype(v_pos.toKernel()),decltype(proc_id_out.toKernel())>),
		            ite,
		            dec.toKernel(),v_pos.toKernel(),proc_id_out.toKernel());

		// exclusive scan of the counts -> per-particle output offsets
		//sc.scan_(proc_id_out,starts);
		starts.resize(proc_id_out.size());
		openfpm::scan((unsigned int *)proc_id_out.template getDeviceBuffer<0>(), proc_id_out.size(), (unsigned int *)starts.template getDeviceBuffer<0>() , v_cl.getmgpuContext());
		// last scan entry = total number of (processor,particle) pairs to emit
		starts.template deviceToHost<0>(starts.size()-1,starts.size()-1);
		size_t sz = starts.template get<0>(starts.size()-1);

		// Second pass: emit one (processor id, particle id) pair per ghost copy

		g_opart_device.resize(sz);

		ite = v_pos.getGPUIterator();

		// we compute the processor id for each particle
		CUDA_LAUNCH((proc_label_id_ghost<dim,St,decltype(dec.toKernel()),decltype(v_pos.toKernel()),decltype(starts.toKernel()),decltype(g_opart_device.toKernel())>),
		            ite,
		            dec.toKernel(),v_pos.toKernel(),starts.toKernel(),g_opart_device.toKernel());

		// sort the pairs by processor id so each destination is contiguous
		openfpm::sort((int *)g_opart_device.template getDeviceBuffer<0>(),(long unsigned int *)g_opart_device.template getDeviceBuffer<1>(), g_opart_device.size(), mgpu::template less_t<int>(), v_cl.getmgpuContext());

		// device counter for the number of processor boundaries found
		mem.allocate(sizeof(int));
		mem.fill(0);
		prc_offset.resize(v_cl.size());

		ite = g_opart_device.getGPUIterator();

		if (ite.wthr.x != 0)
		{
			// Find the buffer bases (offset where each processor's pairs start)
			CUDA_LAUNCH((find_buffer_offsets<0,decltype(g_opart_device.toKernel()),decltype(prc_offset.toKernel())>),
			            ite,
			            g_opart_device.toKernel(),(int *)mem.getDevicePointer(),prc_offset.toKernel());
		}

		// Transfer the number of detected offsets to the CPU
		mem.deviceToHost();
		int noff = *(int *)mem.getPointer();

		// create the terminal entry of prc_offset (end of the last segment)
		prc_offset.resize(noff+1,DATA_ON_DEVICE);

		// Move the last processor index on device (id) so the terminal entry
		// carries (total size, last processor id)
		if (g_opart_device.size() != 0)
		{g_opart_device.template deviceToHost<0>(g_opart_device.size()-1,g_opart_device.size()-1);}
		prc_offset.template get<0>(prc_offset.size()-1) = g_opart_device.size();
		if (g_opart_device.size() != 0)
		{prc_offset.template get<1>(prc_offset.size()-1) = g_opart_device.template get<0>(g_opart_device.size()-1);}
		else
		{prc_offset.template get<1>(prc_offset.size()-1) = 0;}

		prc_offset.template hostToDevice<0,1>(prc_offset.size()-1,prc_offset.size()-1);

		// Here we reorder the offsets in ascending order
		openfpm::sort((int *)prc_offset.template getDeviceBuffer<0>(),(int *)prc_offset.template getDeviceBuffer<1>(), prc_offset.size(), mgpu::template less_t<int>(), v_cl.getmgpuContext());

		prc_offset.template deviceToHost<0,1>();

		// In this case we do not have communications at all: make the output
		// lists empty (noff+1 == 0 below)
		if (g_opart_device.size() == 0)
		{noff = -1;}

		prc.resize(noff+1);
		prc_sz.resize(noff+1);

		size_t base_offset = 0;

		// Transfer to prc the list of processors and derive per-processor
		// send counts as differences of consecutive offsets
		prc.resize(noff+1); // NOTE(review): redundant, prc was resized just above
		for (size_t i = 0 ; i < noff+1 ; i++)
		{
			prc.get(i) = prc_offset.template get<1>(i);
			prc_sz.get(i) = prc_offset.template get<0>(i) - base_offset;
			base_offset = prc_offset.template get<0>(i);
		}
#else

		std::cout << __FILE__ << ":" << __LINE__ << " error: to use gpu computation you must compile vector_dist.hpp with NVCC" << std::endl;

#endif
	}
};
178 | |
179 | template<bool with_pos,unsigned int dim, typename St, typename prop, typename Memory, template <typename> class layout_base, bool is_ok_cuda> |
180 | struct local_ghost_from_opart_impl |
181 | { |
182 | static void run(openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,layout_base> & o_part_loc, |
183 | const openfpm::vector<Point<dim, St>,Memory,layout_base> & shifts, |
184 | openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos, |
185 | openfpm::vector<prop,Memory,layout_base> & v_prp, |
186 | size_t opt) |
187 | { |
188 | std::cout << __FILE__ << ":" << __LINE__ << " error, you are trying to use using Cuda functions for a non cuda enabled data-structures" << std::endl; |
189 | } |
190 | }; |
191 | |
/*! \brief Local-ghost fill from labelled particle list: GPU implementation.
 *
 * Appends the local (same-processor, periodic-image) ghost particles listed
 * in o_part_loc to the end of v_pos / v_prp, applying the periodic shift
 * vectors on the device.
 *
 * \tparam with_pos when true the kernel also writes the shifted positions
 * \param o_part_loc (particle id, shift id) pairs describing each local ghost
 * \param shifts periodic shift vectors, indexed by the shift id
 * \param v_pos particle positions, ghost entries appended at the end
 * \param v_prp particle properties, ghost entries appended at the end
 * \param opt option flags (NO_POSITION, SKIP_LABELLING)
 */
template<bool with_pos, unsigned int dim, typename St, typename prop, typename Memory, template <typename> class layout_base>
struct local_ghost_from_opart_impl<with_pos,dim,St,prop,Memory,layout_base,true>
{
	static void run(openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,layout_base> & o_part_loc,
	                const openfpm::vector<Point<dim, St>,Memory,layout_base> & shifts,
	                openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos,
	                openfpm::vector<prop,Memory,layout_base> & v_prp,
	                size_t opt)
	{
#if defined(CUDA_GPU) && defined(__NVCC__)

		auto ite = o_part_loc.getGPUIterator();

		// index of the first appended ghost entry
		size_t old = v_pos.size();

		// grow positions unless the caller excluded them
		if (!(opt & NO_POSITION))
		{v_pos.resize(v_pos.size() + o_part_loc.size(),DATA_ON_DEVICE);}

		// NOTE(review): v_prp is grown only when SKIP_LABELLING is off; with
		// SKIP_LABELLING set the kernel below still writes property slots —
		// presumably they were already sized by a previous call; confirm.
		if (!(opt & SKIP_LABELLING))
		{
			v_prp.resize(v_prp.size() + o_part_loc.size(),DATA_ON_DEVICE);
		}


		// skip the launch when there is no local ghost to produce
		if (ite.wthr.x != 0)
		{
			// copy each listed particle into its ghost slot, applying the shift
			CUDA_LAUNCH((process_ghost_particles_local<with_pos,dim,decltype(o_part_loc.toKernel()),decltype(v_pos.toKernel()),decltype(v_prp.toKernel()),decltype(shifts.toKernel())>),
			            ite,
			            o_part_loc.toKernel(),v_pos.toKernel(),v_prp.toKernel(),shifts.toKernel(),old);
		}
#else
		std::cout << __FILE__ << ":" << __LINE__ << " error: to use the option RUN_ON_DEVICE you must compile with NVCC" << std::endl;
#endif
	}
};
227 | |
228 | template<unsigned int dim, typename St, typename prop, typename Memory, template <typename> class layout_base, bool is_ok_cuda> |
229 | struct local_ghost_from_dec_impl |
230 | { |
231 | static void run(openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,layout_base> & o_part_loc, |
232 | const openfpm::vector<Point<dim, St>,Memory,layout_base> & shifts, |
233 | openfpm::vector<Box<dim, St>,Memory,layout_base> & box_f_dev, |
234 | openfpm::vector<aggregate<unsigned int>,Memory,layout_base> & box_f_sv, |
235 | Vcluster<Memory> & v_cl, |
236 | openfpm::vector<aggregate<unsigned int>,Memory,layout_base> & starts, |
237 | openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos, |
238 | openfpm::vector<prop,Memory,layout_base> & v_prp, |
239 | size_t & g_m, |
240 | size_t opt) |
241 | { |
242 | std::cout << __FILE__ << ":" << __LINE__ << " error, you are trying to use using Cuda functions for a non cuda enabled data-structures" << std::endl; |
243 | } |
244 | }; |
245 | |
246 | |
/*! \brief Local-ghost fill from decomposition: GPU implementation.
 *
 * Two-pass device pipeline that creates the local (periodic-image) ghost
 * particles directly from the decomposition's ghost boxes:
 *   1. count, per real particle, how many shifted ghost copies it generates,
 *   2. exclusive-scan the counts to get output offsets and the total,
 *   3. append the shifted copies to v_pos / v_prp and record each copy in
 *      o_part_loc.
 *
 * \param o_part_loc out: per-copy bookkeeping (filled by the second kernel)
 * \param shifts periodic shift vectors
 * \param box_f_dev ghost boxes of the decomposition (device copy)
 * \param box_f_sv shift-vector id associated to each ghost box
 * \param v_cl Vcluster, provides the mgpu context for the scan
 * \param starts scratch: exclusive scan of the per-particle counts
 * \param v_pos particle positions, ghost copies appended at the end
 * \param v_prp particle properties, ghost copies appended at the end
 * \param g_m number of real (non-ghost) particles
 * \param opt option flags — unused in this path
 */
template<unsigned int dim, typename St, typename prop, typename Memory, template <typename> class layout_base>
struct local_ghost_from_dec_impl<dim,St,prop,Memory,layout_base,true>
{
	static void run(openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,layout_base> & o_part_loc,
	                const openfpm::vector<Point<dim, St>,Memory,layout_base> & shifts,
	                openfpm::vector<Box<dim, St>,Memory,layout_base> & box_f_dev,
	                openfpm::vector<aggregate<unsigned int>,Memory,layout_base> & box_f_sv,
	                Vcluster<Memory> & v_cl,
	                openfpm::vector<aggregate<unsigned int>,Memory,layout_base> & starts,
	                openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos,
	                openfpm::vector<prop,Memory,layout_base> & v_prp,
	                size_t & g_m,
	                size_t opt)
	{
#if defined(CUDA_GPU) && defined(__NVCC__)

		// one counter per real particle plus a zeroed terminator slot, so the
		// exclusive scan's last entry yields the total number of copies
		o_part_loc.resize(g_m+1);
		o_part_loc.template get<0>(o_part_loc.size()-1) = 0;
		o_part_loc.template hostToDevice(o_part_loc.size()-1,o_part_loc.size()-1);

		// iterate over the real (assigned) particles only
		auto ite = v_pos.getGPUIteratorTo(g_m);

		// first pass: count the shifted ghost copies each particle generates
		CUDA_LAUNCH((num_shift_ghost_each_part<dim,St,decltype(box_f_dev.toKernel()),decltype(box_f_sv.toKernel()),decltype(v_pos.toKernel()),decltype(o_part_loc.toKernel())>),
		            ite,
		            box_f_dev.toKernel(),box_f_sv.toKernel(),v_pos.toKernel(),o_part_loc.toKernel(),g_m);

		// exclusive scan of the counts -> per-particle output offsets
		starts.resize(o_part_loc.size());
		openfpm::scan((unsigned int *)o_part_loc.template getDeviceBuffer<0>(), o_part_loc.size(), (unsigned int *)starts.template getDeviceBuffer<0>() , v_cl.getmgpuContext());

		// last scan entry = total number of local ghost copies to append
		starts.template deviceToHost<0>(starts.size()-1,starts.size()-1);
		size_t total = starts.template get<0>(starts.size()-1);
		size_t old = v_pos.size();

		// make room for the appended ghost copies
		v_pos.resize(v_pos.size() + total);
		v_prp.resize(v_prp.size() + total);

		// second pass iterates the real particles again
		ite = v_pos.getGPUIteratorTo(g_m);

		// o_part_loc now holds one bookkeeping entry per generated copy
		o_part_loc.resize(total);

		// append each shifted copy to v_pos/v_prp and record it in o_part_loc
		CUDA_LAUNCH((shift_ghost_each_part<dim,St,decltype(box_f_dev.toKernel()),decltype(box_f_sv.toKernel()),
		                                   decltype(v_pos.toKernel()),decltype(v_prp.toKernel()),
		                                   decltype(starts.toKernel()),decltype(shifts.toKernel()),
		                                   decltype(o_part_loc.toKernel())>),
		            ite,
		            box_f_dev.toKernel(),box_f_sv.toKernel(),
		            v_pos.toKernel(),v_prp.toKernel(),
		            starts.toKernel(),shifts.toKernel(),o_part_loc.toKernel(),old,g_m);

#else
		std::cout << __FILE__ << ":" << __LINE__ << " error: to use the option RUN_ON_DEVICE you must compile with NVCC" << std::endl;
#endif
	}
};
305 | |
306 | #endif /* VECTOR_DIST_COMM_UTIL_FUNCS_HPP_ */ |
307 | |