/*
 * vector_dist_comm_util_funcs.hpp
 *
 *  Created on: Sep 13, 2018
 *      Author: i-bird
 */

#ifndef VECTOR_DIST_COMM_UTIL_FUNCS_HPP_
#define VECTOR_DIST_COMM_UTIL_FUNCS_HPP_

#include "util/common_pdata.hpp"

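//! Option flags used by the vector_dist communication routines (ghost_get / map).
//! Their exact semantics are defined by the callers of these utility functions.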
constexpr int NO_POSITION = 1;
constexpr int WITH_POSITION = 2;
constexpr int NO_CHANGE_ELEMENTS = 4;

constexpr int BIND_DEC_TO_GHOST = 1;

constexpr int MAP_LOCAL = 2;

constexpr int GHOST_SYNC = 0;
constexpr int GHOST_ASYNC = 1;

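/*! \brief Label the particles that must be sent to the other processors as ghost
 *
 * This primary template is the non-CUDA fallback: it is selected when the data
 * structures are not CUDA-enabled and only reports an error at run time. The
 * actual work is done by the specialization below.
 */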
template<unsigned int dim, typename St, typename prop, typename Memory, template<typename> class layout_base, typename Decomposition, bool is_ok_cuda>
struct labelParticlesGhost_impl
{
	static void run(CudaMemory & mem,
					Decomposition & dec,
					openfpm::vector<aggregate<unsigned int,unsigned long int>,
							CudaMemory,
							memory_traits_inte> & g_opart_device,
					openfpm::vector<aggregate<unsigned int>,
							Memory,
							layout_base> & proc_id_out,
					openfpm::vector<aggregate<unsigned int>,
							Memory,
							layout_base> & starts,
					Vcluster<Memory> & v_cl,
					openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos,
					openfpm::vector<prop,Memory,layout_base> & v_prp,
					openfpm::vector<size_t> & prc,
					openfpm::vector<size_t> & prc_sz,
					openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,layout_base> & prc_offset,
					size_t & g_m,
					size_t opt)
	{
		std::cout << __FILE__ << ":" << __LINE__ << " error: you are trying to use CUDA functions on a non-CUDA-enabled data structure" << std::endl;
	}
};

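/*! \brief Label the particles that must be sent to the other processors as ghost (GPU path)
 *
 * Outline of the steps performed on the device:
 *  1. count, for each particle, how many ghost entries it generates
 *  2. scan the counters to obtain the write offsets
 *  3. write one (processor id, particle id) pair per ghost entry
 *  4. sort the pairs by processor id and detect the per-processor buffer offsets
 *  5. copy the offsets to the host and fill prc / prc_sz with the destination
 *     processors and the number of particles to send to each of them
 */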
template<unsigned int dim, typename St, typename prop, typename Memory, template<typename> class layout_base, typename Decomposition>
struct labelParticlesGhost_impl<dim,St,prop,Memory,layout_base,Decomposition,true>
{
	static void run(CudaMemory & mem,
					Decomposition & dec,
					openfpm::vector<aggregate<unsigned int,unsigned long int>,
							CudaMemory,
							memory_traits_inte> & g_opart_device,
					openfpm::vector<aggregate<unsigned int>,
							Memory,
							layout_base> & proc_id_out,
					openfpm::vector<aggregate<unsigned int>,
							Memory,
							layout_base> & starts,
					Vcluster<Memory> & v_cl,
					openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos,
					openfpm::vector<prop,Memory,layout_base> & v_prp,
					openfpm::vector<size_t> & prc,
					openfpm::vector<size_t> & prc_sz,
					openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,layout_base> & prc_offset,
					size_t & g_m,
					size_t opt)
	{
#if defined(CUDA_GPU) && defined(__NVCC__)

			if (v_cl.size() == 1)
			{return;}

			proc_id_out.resize(v_pos.size()+1);
			proc_id_out.template get<0>(proc_id_out.size()-1) = 0;
			proc_id_out.template hostToDevice(proc_id_out.size()-1,proc_id_out.size()-1);

			auto ite = v_pos.getGPUIterator();

			// no work to do, return
			if (ite.wthr.x == 0)
			{return;}

			// First we count how many ghost entries each particle produces
			CUDA_LAUNCH((num_proc_ghost_each_part<dim,St,decltype(dec.toKernel()),decltype(v_pos.toKernel()),decltype(proc_id_out.toKernel())>),
			ite,
			dec.toKernel(),v_pos.toKernel(),proc_id_out.toKernel());

			// Scan the per-particle counters to obtain the write offsets (starts)
			starts.resize(proc_id_out.size());
			openfpm::scan((unsigned int *)proc_id_out.template getDeviceBuffer<0>(), proc_id_out.size(), (unsigned int *)starts.template getDeviceBuffer<0>() , v_cl.getmgpuContext());
			starts.template deviceToHost<0>(starts.size()-1,starts.size()-1);
			size_t sz = starts.template get<0>(starts.size()-1);

			// allocate space for the (processor id, particle id) pairs
			g_opart_device.resize(sz);

			ite = v_pos.getGPUIterator();

			// we compute the processor id for each particle
			CUDA_LAUNCH((proc_label_id_ghost<dim,St,decltype(dec.toKernel()),decltype(v_pos.toKernel()),decltype(starts.toKernel()),decltype(g_opart_device.toKernel())>),
			ite,
			dec.toKernel(),v_pos.toKernel(),starts.toKernel(),g_opart_device.toKernel());

			// sort the particles by destination processor
			openfpm::sort((int *)g_opart_device.template getDeviceBuffer<0>(),(long unsigned int *)g_opart_device.template getDeviceBuffer<1>(), g_opart_device.size(), mgpu::template less_t<int>(), v_cl.getmgpuContext());

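			// Allocate a device counter used by find_buffer_offsets to count the processor
			// boundaries in the sorted list, and reserve one entry per processor in prc_offset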
			mem.allocate(sizeof(int));
			mem.fill(0);
			prc_offset.resize(v_cl.size());

			ite = g_opart_device.getGPUIterator();

			if (ite.wthr.x != 0)
			{
				// Find the buffer bases
				CUDA_LAUNCH((find_buffer_offsets<0,decltype(g_opart_device.toKernel()),decltype(prc_offset.toKernel())>),
					    ite,
					    g_opart_device.toKernel(),(int *)mem.getDevicePointer(),prc_offset.toKernel());
			}

			// Transfer the number of offsets to the CPU
			mem.deviceToHost();
			int noff = *(int *)mem.getPointer();

			// add the terminal (sentinel) entry to prc_offset
			prc_offset.resize(noff+1,DATA_ON_DEVICE);

			// Copy the last processor id to the host and fill the terminal entry
			if (g_opart_device.size() != 0)
			{g_opart_device.template deviceToHost<0>(g_opart_device.size()-1,g_opart_device.size()-1);}
			prc_offset.template get<0>(prc_offset.size()-1) = g_opart_device.size();
			if (g_opart_device.size() != 0)
			{prc_offset.template get<1>(prc_offset.size()-1) = g_opart_device.template get<0>(g_opart_device.size()-1);}
			else
			{prc_offset.template get<1>(prc_offset.size()-1) = 0;}

			prc_offset.template hostToDevice<0,1>(prc_offset.size()-1,prc_offset.size()-1);

			// Reorder the offsets in ascending order
			openfpm::sort((int *)prc_offset.template getDeviceBuffer<0>(),(int *)prc_offset.template getDeviceBuffer<1>(), prc_offset.size(), mgpu::template less_t<int>(), v_cl.getmgpuContext());

			prc_offset.template deviceToHost<0,1>();

			// In this case there are no communications at all
			if (g_opart_device.size() == 0)
			{noff = -1;}

			prc.resize(noff+1);
			prc_sz.resize(noff+1);

			size_t base_offset = 0;

			// Transfer to prc the list of destination processors and to prc_sz the number of particles to send to each of them
			for (size_t i = 0 ; i < noff+1 ; i++)
			{
				prc.get(i) = prc_offset.template get<1>(i);
				prc_sz.get(i) = prc_offset.template get<0>(i) - base_offset;
				base_offset = prc_offset.template get<0>(i);
			}
#else

			std::cout << __FILE__ << ":" << __LINE__ << " error: to use GPU computation you must compile vector_dist.hpp with NVCC" << std::endl;

#endif
	}
};

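/*! \brief Construct the local ghost particles from a precomputed list of ghost entries (o_part_loc)
 *
 * Non-CUDA fallback: selected when the data structures are not CUDA-enabled, it
 * only reports an error at run time. The actual work is done by the specialization below.
 */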
template<bool with_pos, unsigned int dim, typename St, typename prop, typename Memory, template <typename> class layout_base, bool is_ok_cuda>
struct local_ghost_from_opart_impl
{
	static void run(openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,layout_base> & o_part_loc,
					const openfpm::vector<Point<dim, St>,Memory,layout_base> & shifts,
					openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos,
					openfpm::vector<prop,Memory,layout_base> & v_prp,
					size_t opt)
	{
		std::cout << __FILE__ << ":" << __LINE__ << " error: you are trying to use CUDA functions on a non-CUDA-enabled data structure" << std::endl;
	}
};

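/*! \brief Construct the local ghost particles from a precomputed list of ghost entries (GPU path)
 *
 * For every entry of o_part_loc a ghost replica is appended at the end of
 * v_pos / v_prp, shifted by the corresponding vector in shifts.
 */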
template<bool with_pos, unsigned int dim, typename St, typename prop, typename Memory, template <typename> class layout_base>
struct local_ghost_from_opart_impl<with_pos,dim,St,prop,Memory,layout_base,true>
{
	static void run(openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,layout_base> & o_part_loc,
					const openfpm::vector<Point<dim, St>,Memory,layout_base> & shifts,
					openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos,
					openfpm::vector<prop,Memory,layout_base> & v_prp,
					size_t opt)
	{
#if defined(CUDA_GPU) && defined(__NVCC__)

				auto ite = o_part_loc.getGPUIterator();

				size_t old = v_pos.size();

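				// Make room at the end of v_pos / v_prp for the local ghost replicas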
				if (!(opt & NO_POSITION))
				{v_pos.resize(v_pos.size() + o_part_loc.size(),DATA_ON_DEVICE);}

				if (!(opt & SKIP_LABELLING))
				{
					v_prp.resize(v_prp.size() + o_part_loc.size(),DATA_ON_DEVICE);
				}

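				// Append the ghost replicas at the end of v_pos / v_prp (positions are shifted when with_pos is true)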
				if (ite.wthr.x != 0)
				{
					CUDA_LAUNCH((process_ghost_particles_local<with_pos,dim,decltype(o_part_loc.toKernel()),decltype(v_pos.toKernel()),decltype(v_prp.toKernel()),decltype(shifts.toKernel())>),
					ite,
					o_part_loc.toKernel(),v_pos.toKernel(),v_prp.toKernel(),shifts.toKernel(),old);
				}
#else
				std::cout << __FILE__ << ":" << __LINE__ << " error: to use the option RUN_ON_DEVICE you must compile with NVCC" << std::endl;
#endif
	}
};

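/*! \brief Construct the local ghost particles directly from the decomposition (ghost boxes)
 *
 * Non-CUDA fallback: selected when the data structures are not CUDA-enabled, it
 * only reports an error at run time. The actual work is done by the specialization below.
 */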
template<unsigned int dim, typename St, typename prop, typename Memory, template <typename> class layout_base, bool is_ok_cuda>
struct local_ghost_from_dec_impl
{
	static void run(openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,layout_base> & o_part_loc,
					const openfpm::vector<Point<dim, St>,Memory,layout_base> & shifts,
					openfpm::vector<Box<dim, St>,Memory,layout_base> & box_f_dev,
					openfpm::vector<aggregate<unsigned int>,Memory,layout_base> & box_f_sv,
					Vcluster<Memory> & v_cl,
					openfpm::vector<aggregate<unsigned int>,Memory,layout_base> & starts,
					openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos,
					openfpm::vector<prop,Memory,layout_base> & v_prp,
					size_t & g_m,
					size_t opt)
	{
		std::cout << __FILE__ << ":" << __LINE__ << " error: you are trying to use CUDA functions on a non-CUDA-enabled data structure" << std::endl;
	}
};

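/*! \brief Construct the local ghost particles directly from the decomposition (GPU path)
 *
 * Each assigned particle (index < g_m) is tested against the ghost boxes in
 * box_f_dev; a scan of the per-particle counters gives the write offsets and a
 * second kernel appends the shifted replicas at the end of v_pos / v_prp.
 */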
template<unsigned int dim, typename St, typename prop, typename Memory, template <typename> class layout_base>
struct local_ghost_from_dec_impl<dim,St,prop,Memory,layout_base,true>
{
	static void run(openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,layout_base> & o_part_loc,
					const openfpm::vector<Point<dim, St>,Memory,layout_base> & shifts,
					openfpm::vector<Box<dim, St>,Memory,layout_base> & box_f_dev,
					openfpm::vector<aggregate<unsigned int>,Memory,layout_base> & box_f_sv,
					Vcluster<Memory> & v_cl,
					openfpm::vector<aggregate<unsigned int>,Memory,layout_base> & starts,
					openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos,
					openfpm::vector<prop,Memory,layout_base> & v_prp,
					size_t & g_m,
					size_t opt)
	{
#if defined(CUDA_GPU) && defined(__NVCC__)

		o_part_loc.resize(g_m+1);
		o_part_loc.template get<0>(o_part_loc.size()-1) = 0;
		o_part_loc.template hostToDevice(o_part_loc.size()-1,o_part_loc.size()-1);

		// Iterate over the internal (assigned) particles only
		auto ite = v_pos.getGPUIteratorTo(g_m);

		// count how many ghost replicas (shifts) each particle generates
		CUDA_LAUNCH((num_shift_ghost_each_part<dim,St,decltype(box_f_dev.toKernel()),decltype(box_f_sv.toKernel()),decltype(v_pos.toKernel()),decltype(o_part_loc.toKernel())>),
		ite,
		box_f_dev.toKernel(),box_f_sv.toKernel(),v_pos.toKernel(),o_part_loc.toKernel(),g_m);

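		// Scan the per-particle counters to obtain the write offsets (starts)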
		starts.resize(o_part_loc.size());
		openfpm::scan((unsigned int *)o_part_loc.template getDeviceBuffer<0>(), o_part_loc.size(), (unsigned int *)starts.template getDeviceBuffer<0>() , v_cl.getmgpuContext());

		starts.template deviceToHost<0>(starts.size()-1,starts.size()-1);
		size_t total = starts.template get<0>(starts.size()-1);
		size_t old = v_pos.size();

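		// Make room at the end of v_pos / v_prp for the local ghost particles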
		v_pos.resize(v_pos.size() + total);
		v_prp.resize(v_prp.size() + total);

		// Iterate again over the internal (assigned) particles to generate the shifted replicas
		ite = v_pos.getGPUIteratorTo(g_m);

		// resize o_part_loc to hold one entry per generated ghost replica
		o_part_loc.resize(total);

		CUDA_LAUNCH((shift_ghost_each_part<dim,St,decltype(box_f_dev.toKernel()),decltype(box_f_sv.toKernel()),
									 decltype(v_pos.toKernel()),decltype(v_prp.toKernel()),
									 decltype(starts.toKernel()),decltype(shifts.toKernel()),
									 decltype(o_part_loc.toKernel())>),
		ite,
		box_f_dev.toKernel(),box_f_sv.toKernel(),
		 v_pos.toKernel(),v_prp.toKernel(),
		 starts.toKernel(),shifts.toKernel(),o_part_loc.toKernel(),old,g_m);

#else
		std::cout << __FILE__ << ":" << __LINE__ << " error: to use the option RUN_ON_DEVICE you must compile with NVCC" << std::endl;
#endif
	}
};

#endif /* VECTOR_DIST_COMM_UTIL_FUNCS_HPP_ */