/*
 * vector_dist_comm_util_funcs.hpp
 *
 *  Created on: Sep 13, 2018
 *      Author: i-bird
 */

#ifndef VECTOR_DIST_COMM_UTIL_FUNCS_HPP_
#define VECTOR_DIST_COMM_UTIL_FUNCS_HPP_

#include "util/common_pdata.hpp"

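// Option flags used by the vector_dist communication routines (map / ghost_get)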
constexpr int NO_POSITION = 1;
constexpr int WITH_POSITION = 2;
constexpr int NO_CHANGE_ELEMENTS = 4;

constexpr int BIND_DEC_TO_GHOST = 1;

constexpr int MAP_LOCAL = 2;

constexpr int GHOST_SYNC = 0;
constexpr int GHOST_ASYNC = 1;

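/*! \brief Label the particles that must be sent to the neighborhood processors as ghost
 *
 * Generic fallback, selected when the data structures are not CUDA enabled:
 * it only reports an error at run-time
 *
 */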
template<unsigned int dim, typename St, typename prop, typename Memory, template<typename> class layout_base, typename Decomposition, bool is_ok_cuda>
struct labelParticlesGhost_impl
{
	static void run(CudaMemory & mem,
			Decomposition & dec,
			openfpm::vector<aggregate<unsigned int,unsigned long int>,
					CudaMemory,
					memory_traits_inte> & g_opart_device,
			openfpm::vector<aggregate<unsigned int>,
					Memory,
					layout_base> & proc_id_out,
			openfpm::vector<aggregate<unsigned int>,
					Memory,
					layout_base> & starts,
			Vcluster<Memory> & v_cl,
			openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos,
			openfpm::vector<prop,Memory,layout_base> & v_prp,
			openfpm::vector<size_t> & prc,
			openfpm::vector<size_t> & prc_sz,
			openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,layout_base> & prc_offset,
			size_t & g_m,
			size_t opt)
	{
		std::cout << __FILE__ << ":" << __LINE__ << " error: you are trying to use CUDA functions on non-CUDA-enabled data structures" << std::endl;
	}
};


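/*! \brief Label the particles that must be sent to the neighborhood processors as ghost
 *
 * CUDA implementation: for each particle it computes on the device the processors that
 * must receive it as ghost, sorts the labels by processor and fills prc / prc_sz with
 * the destination processors and the number of particles for each of them
 *
 */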
template<unsigned int dim, typename St, typename prop, typename Memory, template<typename> class layout_base, typename Decomposition>
struct labelParticlesGhost_impl<dim,St,prop,Memory,layout_base,Decomposition,true>
{
	static void run(CudaMemory & mem,
			Decomposition & dec,
			openfpm::vector<aggregate<unsigned int,unsigned long int>,
					CudaMemory,
					memory_traits_inte> & g_opart_device,
			openfpm::vector<aggregate<unsigned int>,
					Memory,
					layout_base> & proc_id_out,
			openfpm::vector<aggregate<unsigned int>,
					Memory,
					layout_base> & starts,
			Vcluster<Memory> & v_cl,
			openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos,
			openfpm::vector<prop,Memory,layout_base> & v_prp,
			openfpm::vector<size_t> & prc,
			openfpm::vector<size_t> & prc_sz,
			openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,layout_base> & prc_offset,
			size_t & g_m,
			size_t opt)
	{
#if defined(CUDA_GPU) && defined(__NVCC__)

		if (v_cl.size() == 1)
		{return;}

		proc_id_out.resize(v_pos.size()+1);
		proc_id_out.template get<0>(proc_id_out.size()-1) = 0;
		proc_id_out.template hostToDevice<0>(proc_id_out.size()-1,proc_id_out.size()-1);

		auto ite = v_pos.getGPUIterator();

		// no work to do, return
		if (ite.wthr.x == 0)
		{return;}

		// First we have to count how many entries each particle produces
		CUDA_LAUNCH((num_proc_ghost_each_part<dim,St,decltype(dec.toKernel()),decltype(v_pos.toKernel()),decltype(proc_id_out.toKernel())>),
				ite,
				dec.toKernel(),v_pos.toKernel(),proc_id_out.toKernel());

		// prefix sum (scan) of the per-particle counters; the last element gives the total number of labels
		starts.resize(proc_id_out.size());
		openfpm::scan((unsigned int *)proc_id_out.template getDeviceBuffer<0>(), proc_id_out.size(), (unsigned int *)starts.template getDeviceBuffer<0>() , v_cl.getmgpuContext());
		starts.template deviceToHost<0>(starts.size()-1,starts.size()-1);
		size_t sz = starts.template get<0>(starts.size()-1);

		// allocate one label entry for each particle-processor pair
		g_opart_device.resize(sz);

		ite = v_pos.getGPUIterator();

		// we compute the processor id for each particle
		CUDA_LAUNCH((proc_label_id_ghost<dim,St,decltype(dec.toKernel()),decltype(v_pos.toKernel()),decltype(starts.toKernel()),decltype(g_opart_device.toKernel())>),
				ite,
				dec.toKernel(),v_pos.toKernel(),starts.toKernel(),g_opart_device.toKernel());

		// sort the labels by processor id, so that particles directed to the same processor are contiguous
		openfpm::sort((int *)g_opart_device.template getDeviceBuffer<0>(),(long unsigned int *)g_opart_device.template getDeviceBuffer<1>(), g_opart_device.size(), mgpu::template less_t<int>(), v_cl.getmgpuContext());

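		// single device counter used by find_buffer_offsets to count the processor boundaries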
		mem.allocate(sizeof(int));
		mem.fill(0);
		prc_offset.resize(v_cl.size());

		ite = g_opart_device.getGPUIterator();

		if (ite.wthr.x != 0)
		{
			// Find the buffer bases (the boundaries between blocks of labels directed to different processors)
			CUDA_LAUNCH((find_buffer_offsets<0,decltype(g_opart_device.toKernel()),decltype(prc_offset.toKernel())>),
					ite,
					g_opart_device.toKernel(),(int *)mem.getDevicePointer(),prc_offset.toKernel());
		}

		// Transfer the number of offsets to the CPU
		mem.deviceToHost();
		int noff = *(int *)mem.getPointer();

		// add a terminating (sentinel) entry to prc_offset
		prc_offset.resize(noff+1,DATA_ON_DEVICE);

		// Fill the sentinel with the total number of labels and the id of the last processor
		if (g_opart_device.size() != 0)
		{g_opart_device.template deviceToHost<0>(g_opart_device.size()-1,g_opart_device.size()-1);}
		prc_offset.template get<0>(prc_offset.size()-1) = g_opart_device.size();
		if (g_opart_device.size() != 0)
		{prc_offset.template get<1>(prc_offset.size()-1) = g_opart_device.template get<0>(g_opart_device.size()-1);}
		else
		{prc_offset.template get<1>(prc_offset.size()-1) = 0;}

		prc_offset.template hostToDevice<0,1>(prc_offset.size()-1,prc_offset.size()-1);

		// Reorder the offsets in ascending order
		openfpm::sort((int *)prc_offset.template getDeviceBuffer<0>(),(int *)prc_offset.template getDeviceBuffer<1>(), prc_offset.size(), mgpu::template less_t<int>(), v_cl.getmgpuContext());

		prc_offset.template deviceToHost<0,1>();

		// In this case we have no communication at all
		if (g_opart_device.size() == 0)
		{noff = -1;}

		prc.resize(noff+1);
		prc_sz.resize(noff+1);

		size_t base_offset = 0;

		// Transfer to prc the list of destination processors and to prc_sz the number of particles for each of them
		for (size_t i = 0 ; i < noff+1 ; i++)
		{
			prc.get(i) = prc_offset.template get<1>(i);
			prc_sz.get(i) = prc_offset.template get<0>(i) - base_offset;
			base_offset = prc_offset.template get<0>(i);
		}
#else

		std::cout << __FILE__ << ":" << __LINE__ << " error: to use GPU computation you must compile vector_dist.hpp with NVCC" << std::endl;

#endif
	}
};

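/*! \brief Create the local ghost particles from the o_part_loc list
 *
 * Generic fallback, selected when the data structures are not CUDA enabled:
 * it only reports an error at run-time
 *
 */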
template<bool with_pos,unsigned int dim, typename St, typename prop, typename Memory, template <typename> class layout_base, bool is_ok_cuda>
struct local_ghost_from_opart_impl
{
	static void run(openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,layout_base> & o_part_loc,
			const openfpm::vector<Point<dim, St>,Memory,layout_base> & shifts,
			openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos,
			openfpm::vector<prop,Memory,layout_base> & v_prp,
			size_t opt)
	{
		std::cout << __FILE__ << ":" << __LINE__ << " error: you are trying to use CUDA functions on non-CUDA-enabled data structures" << std::endl;
	}
};

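/*! \brief Create the local ghost particles from the o_part_loc list
 *
 * CUDA implementation: for each entry of o_part_loc it appends on the device a shifted
 * copy of the corresponding particle (position and/or properties, depending on the options)
 *
 */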
template<bool with_pos, unsigned int dim, typename St, typename prop, typename Memory, template <typename> class layout_base>
struct local_ghost_from_opart_impl<with_pos,dim,St,prop,Memory,layout_base,true>
{
	static void run(openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,layout_base> & o_part_loc,
			const openfpm::vector<Point<dim, St>,Memory,layout_base> & shifts,
			openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos,
			openfpm::vector<prop,Memory,layout_base> & v_prp,
			size_t opt)
	{
#if defined(CUDA_GPU) && defined(__NVCC__)

		auto ite = o_part_loc.getGPUIterator();

		size_t old = v_pos.size();

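		// make room at the end of v_pos / v_prp for the local ghost particles (one per o_part_loc entry)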
		if (!(opt & NO_POSITION))
		{v_pos.resize(v_pos.size() + o_part_loc.size(),DATA_ON_DEVICE);}

		if (!(opt & SKIP_LABELLING))
		{
			v_prp.resize(v_prp.size() + o_part_loc.size(),DATA_ON_DEVICE);
		}

		if (ite.wthr.x != 0)
		{
			CUDA_LAUNCH((process_ghost_particles_local<with_pos,dim,decltype(o_part_loc.toKernel()),decltype(v_pos.toKernel()),decltype(v_prp.toKernel()),decltype(shifts.toKernel())>),
					ite,
					o_part_loc.toKernel(),v_pos.toKernel(),v_prp.toKernel(),shifts.toKernel(),old);
		}
#else
		std::cout << __FILE__ << ":" << __LINE__ << " error: to use the option RUN_ON_DEVICE you must compile with NVCC" << std::endl;
#endif
	}
};

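/*! \brief Create the local ghost particles directly from the decomposition ghost boxes
 *
 * Generic fallback, selected when the data structures are not CUDA enabled:
 * it only reports an error at run-time
 *
 */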
template<unsigned int dim, typename St, typename prop, typename Memory, template <typename> class layout_base, bool is_ok_cuda>
struct local_ghost_from_dec_impl
{
	static void run(openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,layout_base> & o_part_loc,
			const openfpm::vector<Point<dim, St>,Memory,layout_base> & shifts,
			openfpm::vector<Box<dim, St>,Memory,layout_base> & box_f_dev,
			openfpm::vector<aggregate<unsigned int>,Memory,layout_base> & box_f_sv,
			Vcluster<Memory> & v_cl,
			openfpm::vector<aggregate<unsigned int>,Memory,layout_base> & starts,
			openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos,
			openfpm::vector<prop,Memory,layout_base> & v_prp,
			size_t & g_m,
			size_t opt)
	{
		std::cout << __FILE__ << ":" << __LINE__ << " error: you are trying to use CUDA functions on non-CUDA-enabled data structures" << std::endl;
	}
};

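/*! \brief Create the local ghost particles directly from the decomposition ghost boxes
 *
 * CUDA implementation: it counts how many shifted copies each real particle generates,
 * computes the write offsets with a scan and creates the local ghost particles on the device
 *
 */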
template<unsigned int dim, typename St, typename prop, typename Memory, template <typename> class layout_base>
struct local_ghost_from_dec_impl<dim,St,prop,Memory,layout_base,true>
{
	static void run(openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,layout_base> & o_part_loc,
			const openfpm::vector<Point<dim, St>,Memory,layout_base> & shifts,
			openfpm::vector<Box<dim, St>,Memory,layout_base> & box_f_dev,
			openfpm::vector<aggregate<unsigned int>,Memory,layout_base> & box_f_sv,
			Vcluster<Memory> & v_cl,
			openfpm::vector<aggregate<unsigned int>,Memory,layout_base> & starts,
			openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos,
			openfpm::vector<prop,Memory,layout_base> & v_prp,
			size_t & g_m,
			size_t opt)
	{
#if defined(CUDA_GPU) && defined(__NVCC__)

		o_part_loc.resize(g_m+1);
		o_part_loc.template get<0>(o_part_loc.size()-1) = 0;
		o_part_loc.template hostToDevice<0>(o_part_loc.size()-1,o_part_loc.size()-1);

		// Iterate over the real (non-ghost) particles
		auto ite = v_pos.getGPUIteratorTo(g_m);

		// count how many local ghost copies (shifts) each particle generates
		CUDA_LAUNCH((num_shift_ghost_each_part<dim,St,decltype(box_f_dev.toKernel()),decltype(box_f_sv.toKernel()),decltype(v_pos.toKernel()),decltype(o_part_loc.toKernel())>),
				ite,
				box_f_dev.toKernel(),box_f_sv.toKernel(),v_pos.toKernel(),o_part_loc.toKernel(),g_m);

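		// prefix sum (scan) of the per-particle counters: the last element of starts gives
		// the total number of local ghost particles to create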
		starts.resize(o_part_loc.size());
		openfpm::scan((unsigned int *)o_part_loc.template getDeviceBuffer<0>(), o_part_loc.size(), (unsigned int *)starts.template getDeviceBuffer<0>() , v_cl.getmgpuContext());

		starts.template deviceToHost<0>(starts.size()-1,starts.size()-1);
		size_t total = starts.template get<0>(starts.size()-1);
		size_t old = v_pos.size();

		v_pos.resize(v_pos.size() + total);
		v_prp.resize(v_prp.size() + total);

		// Iterate again over the real particles to create the shifted local ghost copies
		ite = v_pos.getGPUIteratorTo(g_m);

		// one entry for each generated local ghost particle
		o_part_loc.resize(total);

		CUDA_LAUNCH((shift_ghost_each_part<dim,St,decltype(box_f_dev.toKernel()),decltype(box_f_sv.toKernel()),
				decltype(v_pos.toKernel()),decltype(v_prp.toKernel()),
				decltype(starts.toKernel()),decltype(shifts.toKernel()),
				decltype(o_part_loc.toKernel())>),
				ite,
				box_f_dev.toKernel(),box_f_sv.toKernel(),
				v_pos.toKernel(),v_prp.toKernel(),
				starts.toKernel(),shifts.toKernel(),o_part_loc.toKernel(),old,g_m);

#else
		std::cout << __FILE__ << ":" << __LINE__ << " error: to use the option RUN_ON_DEVICE you must compile with NVCC" << std::endl;
#endif
	}
};

#endif /* VECTOR_DIST_COMM_UTIL_FUNCS_HPP_ */