| 1 | /* | 
| 2 |  * CellList_gpu.hpp | 
| 3 |  * | 
| 4 |  *  Created on: Jun 11, 2018 | 
| 5 |  *      Author: i-bird | 
| 6 |  */ | 
| 7 |  | 
| 8 | #ifndef OPENFPM_DATA_SRC_NN_CELLLIST_CELLLIST_GPU_HPP_ | 
| 9 | #define OPENFPM_DATA_SRC_NN_CELLLIST_CELLLIST_GPU_HPP_ | 
| 10 |  | 
| 11 | #include "config.h" | 
| 12 |  | 
| 13 | #ifdef CUDA_GPU | 
| 14 |  | 
| 15 | #include "Vector/map_vector_sparse.hpp" | 
| 16 | #include "NN/CellList/CellDecomposer.hpp" | 
| 17 | #include "Vector/map_vector.hpp" | 
| 18 | #include "Cuda_cell_list_util_func.hpp" | 
| 19 | #include "NN/CellList/cuda/CellList_gpu_ker.cuh" | 
| 20 | #include "util/cuda_util.hpp" | 
| 21 | #include "NN/CellList/CellList_util.hpp" | 
| 22 | #include "NN/CellList/CellList.hpp" | 
| 23 | #include "util/cuda/scan_ofp.cuh" | 
| 24 |  | 
| 25 | constexpr int count = 0; | 
| 26 | constexpr int start = 1; | 
| 27 |  | 
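/*! \brief Helper that constructs the kernel-side cell-list object
 *
 * The primary template handles the dense case; the specialization below
 * (is_sparse = true) handles the sparse case.
 *
 */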
| 28 | template<unsigned int dim, typename T, | 
| 29 |          typename cnt_type, typename ids_type, | 
| 30 |          typename Memory,typename transform, | 
| 31 |          typename vector_cnt_type, typename vector_cnt_type2, | 
| 32 |          typename cl_sparse_type, | 
| 33 |          bool is_sparse> | 
| 34 | struct CellList_gpu_ker_selector | 
| 35 | { | 
| 36 | 	static inline CellList_gpu_ker<dim,T,cnt_type,ids_type,transform,is_sparse> get(vector_cnt_type & starts, | 
| 37 | 																			 vector_cnt_type & cell_nn, | 
| 38 | 																			 vector_cnt_type2 & cell_nn_list, | 
| 39 | 																			 cl_sparse_type & cl_sparse, | 
| 40 | 																			 vector_cnt_type & sorted_to_not_sorted, | 
| 41 | 																			 vector_cnt_type & sorted_domain_particles_ids, | 
| 42 | 																			 openfpm::vector<aggregate<int>,Memory,memory_traits_inte> & nnc_rad, | 
| 43 | 																			 openfpm::array<T,dim,cnt_type> & spacing_c, | 
| 44 | 																			 openfpm::array<ids_type,dim,cnt_type> & div_c, | 
| 45 | 																			 openfpm::array<ids_type,dim,cnt_type> & off, | 
| 46 | 																			 const transform & t, | 
| 47 | 																			 unsigned int g_m) | 
| 48 | 	{ | 
| 49 | 		return CellList_gpu_ker<dim,T,cnt_type,ids_type,transform,is_sparse>(starts.toKernel(), | 
| 50 | 																			sorted_to_not_sorted.toKernel(), | 
| 51 | 																			sorted_domain_particles_ids.toKernel(), | 
| 52 | 																			nnc_rad.toKernel(), | 
| 53 | 																			spacing_c, | 
| 54 | 																			div_c, | 
| 55 | 																			off, | 
| 56 | 																			t, | 
| 57 | 																			g_m); | 
| 58 | 	} | 
| 59 | }; | 
| 60 |  | 
| 61 | template<unsigned int dim, typename T, | 
| 62 |          typename cnt_type, typename ids_type, | 
| 63 |          typename Memory,typename transform, | 
| 64 |          typename vector_cnt_type, typename vector_cnt_type2, | 
| 65 |          typename cl_sparse_type> | 
| 66 | struct CellList_gpu_ker_selector<dim,T,cnt_type,ids_type,Memory,transform,vector_cnt_type,vector_cnt_type2,cl_sparse_type,true> | 
| 67 | { | 
| 68 | 	static CellList_gpu_ker<dim,T,cnt_type,ids_type,transform,true> get(vector_cnt_type & starts, | 
| 69 | 			 vector_cnt_type & cell_nn, | 
| 70 | 			 vector_cnt_type2 & cell_nn_list, | 
| 71 | 			 cl_sparse_type & cl_sparse, | 
| 72 | 			 vector_cnt_type & srt, | 
| 73 | 			 vector_cnt_type & dprt, | 
| 74 | 			 openfpm::vector<aggregate<int>,Memory,memory_traits_inte> & nnc_rad, | 
| 75 | 			 openfpm::array<T,dim,cnt_type> & spacing_c, | 
| 76 | 	         openfpm::array<ids_type,dim,cnt_type> & div_c, | 
| 77 | 	         openfpm::array<ids_type,dim,cnt_type> & off, | 
| 78 | 	         const transform & t, | 
| 79 | 	         unsigned int g_m) | 
| 80 | 	{ | 
| 81 | 		return CellList_gpu_ker<dim,T,cnt_type,ids_type,transform,true>(cell_nn.toKernel(), | 
| 82 | 																		cell_nn_list.toKernel(), | 
| 83 | 																		cl_sparse.toKernel(), | 
| 84 | 																		srt.toKernel(), | 
| 85 | 																		dprt.toKernel(), | 
| 86 | 																		spacing_c, | 
| 87 | 																		div_c, | 
| 88 | 																		off, | 
| 89 | 																		t,g_m); | 
| 90 | 	} | 
| 91 | }; | 
| 92 |  | 
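/*! \brief Cell-list implementation for the GPU
 *
 * Particles are binned into cells on the device and reordered to improve
 * coalesced access; the resulting structure is passed to CUDA kernels
 * through toKernel().
 *
 * Minimal usage sketch (illustrative only; it assumes CudaMemory as the
 * device memory type, a 3D float domain, device-resident position vectors
 * pl/pl_out and property vectors pl_prp/pl_prp_out, an mgpu::ofp_context_t
 * context and a ghost marker g_m):
 *
 * \code
 * Box<3,float> box({0.0f,0.0f,0.0f},{1.0f,1.0f,1.0f});
 * size_t div[3] = {16,16,16};
 *
 * CellList_gpu<3,float,CudaMemory> cl(box,div,1);
 *
 * // bin and reorder the particles (positions and properties stay on the device)
 * cl.construct(pl,pl_out,pl_prp,pl_prp_out,context,g_m);
 *
 * auto cl_ker = cl.toKernel(); // object usable inside a CUDA kernel
 * \endcode
 *
 * \tparam dim dimensionality of the space
 * \tparam T type of the space (float, double, ...)
 * \tparam Memory memory type (device allocator)
 * \tparam transform transformation applied to the points
 * \tparam cnt_type type used for counters
 * \tparam ids_type type used for cell ids
 * \tparam is_sparse true for the sparse cell-list variant
 *
 */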
| 93 | template<unsigned int dim, | 
| 94 | 		 typename T, | 
| 95 | 		 typename Memory, | 
| 96 | 		 typename transform = no_transform_only<dim,T>, | 
| 97 | 		 typename cnt_type = unsigned int, | 
| 98 | 		 typename ids_type = int, | 
| 99 | 		 bool is_sparse = false> | 
| 100 | class CellList_gpu : public CellDecomposer_sm<dim,T,transform> | 
| 101 | { | 
| 102 | 	typedef openfpm::vector<aggregate<cnt_type>,Memory,memory_traits_inte> vector_cnt_type; | 
| 103 |  | 
| 104 | 	//! \brief Number of particles in each cell | 
| 105 | 	vector_cnt_type cl_n; | 
| 106 |  | 
	//! \brief for each cell, the ids of the particles inside it
| 108 | 	vector_cnt_type cells; | 
| 109 |  | 
	//! \brief Prefix sum (scan with +) of cl_n (in the sparse case it contains the cell index of each particle)
| 111 | 	vector_cnt_type starts; | 
| 112 |  | 
| 113 | 	//! \brief sparse vector in case of sparse Cell-list | 
| 114 | 	openfpm::vector_sparse_gpu<aggregate<cnt_type>> cl_sparse; | 
| 115 |  | 
	//! \brief number of neighborhood cells each cell has + offset
| 117 | 	openfpm::vector_gpu<aggregate<cnt_type>> cells_nn; | 
| 118 |  | 
| 119 | 	//! \brief For each cell the list of the neighborhood cells | 
| 120 | 	openfpm::vector_gpu<aggregate<cnt_type,cnt_type>> cells_nn_list; | 
| 121 |  | 
    //! \brief particle ids information: the first "dim" components are the cell-id in grid coordinates, the last is the local-id inside the cell
| 123 |     openfpm::vector<aggregate<cnt_type[2]>,Memory,memory_traits_inte> part_ids; | 
| 124 |  | 
	//! \brief Size of the neighborhood of cells to test
| 126 | 	int cells_nn_test_size; | 
| 127 |  | 
| 128 | 	//! \brief Neighborhood of a cell to test | 
| 129 | 	openfpm::vector_gpu<aggregate<int>> cells_nn_test; | 
| 130 |  | 
	//! \brief for each sorted index, the corresponding index in the unsorted vector
| 132 | 	vector_cnt_type sorted_to_not_sorted; | 
| 133 |  | 
	//! For each sorted particle, indicate whether it is a domain or a ghost particle
| 135 | 	vector_cnt_type sorted_domain_particles_dg; | 
| 136 |  | 
	//! \brief the indices of all the domain particles in the sorted vector
| 138 | 	vector_cnt_type sorted_domain_particles_ids; | 
| 139 |  | 
	//! \brief for each unsorted index, the corresponding index in the sorted vector
| 141 | 	vector_cnt_type non_sorted_to_sorted; | 
| 142 |  | 
| 143 | 	//! Spacing | 
| 144 | 	openfpm::array<T,dim,cnt_type> spacing_c; | 
| 145 |  | 
| 146 | 	//! \brief number of sub-divisions in each direction | 
| 147 | 	openfpm::array<ids_type,dim,cnt_type> div_c; | 
| 148 |  | 
| 149 | 	//! \brief cell padding | 
| 150 | 	openfpm::array<ids_type,dim,cnt_type> off; | 
| 151 |  | 
| 152 | 	//! Radius neighborhood | 
| 153 | 	openfpm::vector<aggregate<int>,Memory,memory_traits_inte> nnc_rad; | 
| 154 |  | 
	//! Additional information used to understand whether the cell-list
	//! has been constructed from an old decomposition
| 157 | 	size_t n_dec; | 
| 158 |  | 
	//! Initialize the internal structures
| 160 | 	void InitializeStructures(const size_t (& div)[dim], size_t tot_n_cell, size_t pad) | 
| 161 | 	{ | 
| 162 | 		for (size_t i = 0 ; i < dim ; i++) | 
| 163 | 		{ | 
| 164 | 			div_c[i] = div[i]; | 
| 165 | 			spacing_c[i] = this->getCellBox().getP2().get(i); | 
| 166 | 			off[i] = pad; | 
| 167 | 		} | 
| 168 |  | 
| 169 | 		cl_n.resize(tot_n_cell); | 
| 170 |  | 
| 171 | 		cells_nn_test_size = 1; | 
| 172 | 		construct_cell_nn_test(cells_nn_test_size); | 
| 173 | 	} | 
| 174 |  | 
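	/*! \brief Build the stencil of neighborhood-cell offsets
	 *
	 * Fill cells_nn_test with the relative linear offsets of the
	 * (2*box_nn+1)^dim cells surrounding a cell (the cell itself included)
	 *
	 */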
| 175 | 	void construct_cell_nn_test(unsigned int box_nn = 1) | 
| 176 | 	{ | 
| 177 | 		auto & gs = this->getGrid(); | 
| 178 |  | 
| 179 | 		grid_key_dx<dim> start; | 
| 180 | 		grid_key_dx<dim> stop; | 
| 181 | 		grid_key_dx<dim> middle; | 
| 182 |  | 
| 183 | 		for (size_t i = 0 ; i < dim ; i++) | 
| 184 | 		{ | 
| 185 | 			start.set_d(i,0); | 
| 186 | 			stop.set_d(i,2*box_nn); | 
| 187 | 			middle.set_d(i,box_nn); | 
| 188 | 		} | 
| 189 |  | 
| 190 | 		cells_nn_test.resize(openfpm::math::pow(2*box_nn+1,dim)); | 
| 191 |  | 
| 192 | 		int mid = gs.LinId(middle); | 
| 193 |  | 
| 194 | 		grid_key_dx_iterator_sub<dim> it(gs,start,stop); | 
| 195 |  | 
| 196 | 		size_t i = 0; | 
| 197 | 		while (it.isNext()) | 
| 198 | 		{ | 
| 199 | 			auto p = it.get(); | 
| 200 |  | 
| 201 | 			cells_nn_test.template get<0>(i) = (int)gs.LinId(p) - mid; | 
| 202 |  | 
| 203 | 			++i; | 
| 204 | 			++it; | 
| 205 | 		} | 
| 206 |  | 
| 207 | 		cells_nn_test.template hostToDevice<0>(); | 
| 208 |  | 
| 209 | #if defined(__NVCC__) && defined(USE_LOW_REGISTER_ITERATOR) | 
| 210 |  | 
| 211 | 		// copy to the constant memory | 
| 212 | 		cudaMemcpyToSymbol(cells_striding,cells_nn_test.template getPointer<0>(),cells_nn_test.size()*sizeof(int)); | 
| 213 |  | 
| 214 | #endif | 
| 215 | 	} | 
| 216 |  | 
	/*! \brief Construct a sparse cell-list
	 *
	 *
	 */
| 221 | 	template<typename vector, typename vector_prp, unsigned int ... prp> | 
| 222 | 	void construct_sparse(vector & pl, | 
| 223 |   	   	   	 	 	 	  vector & pl_out, | 
| 224 |   	   	   	 	 	 	  vector_prp & pl_prp, | 
| 225 |   	   	   	 	 	 	  vector_prp & pl_prp_out, | 
| 226 |   	   	   	 	 	 	  mgpu::ofp_context_t & mgpuContext, | 
| 227 |   	   	   	 	 	 	  size_t g_m, | 
| 228 |  			   	   	   	  size_t start, | 
| 229 |  			   	   	   	  size_t stop, | 
| 230 |   	   	   	 	 	 	  cl_construct_opt opt = cl_construct_opt::Full) | 
| 231 | 	{ | 
| 232 | #ifdef __NVCC__ | 
| 233 |  | 
| 234 | 		part_ids.resize(stop - start); | 
| 235 | 		starts.resize(stop - start); | 
| 236 |  | 
		// Then we construct the cell ids
| 238 |  | 
| 239 | 		auto ite_gpu = pl.getGPUIteratorTo(stop-start); | 
| 240 |  | 
| 241 | 		if (ite_gpu.wthr.x == 0) | 
| 242 | 		{ | 
| 243 | 			return; | 
| 244 | 		} | 
| 245 |  | 
| 246 | 		CUDA_LAUNCH((subindex<true,dim,T,cnt_type,ids_type>),ite_gpu,div_c, | 
| 247 | 																		spacing_c, | 
| 248 | 																		off, | 
| 249 | 																		this->getTransform(), | 
| 250 | 																		pl.capacity(), | 
| 251 | 																		pl.size(), | 
| 252 | 																		part_ids.capacity(), | 
| 253 | 																		start, | 
| 254 | 																		static_cast<T *>(pl.template getDeviceBuffer<0>()), | 
| 255 | 																		static_cast<cnt_type *>(starts.template getDeviceBuffer<0>()), | 
| 256 | 																		static_cast<cnt_type *>(part_ids.template getDeviceBuffer<0>())); | 
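
		// Note: with the sparse option the subindex kernel writes the linear
		// cell index of each particle into starts (used below to fill cl_sparse)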
| 257 |  | 
| 258 | 		// now we construct the cells | 
| 259 |  | 
| 260 | 		cells.resize(stop-start); | 
| 261 |  | 
| 262 | 		// Here we fill the sparse vector | 
| 263 | 		cl_sparse.clear(); | 
| 264 | 		cl_sparse.template setBackground<0>((cnt_type)-1); | 
| 265 | 		cl_sparse.setGPUInsertBuffer(ite_gpu.wthr.x,ite_gpu.thr.x); | 
| 266 | 		CUDA_LAUNCH((fill_cells_sparse),ite_gpu,cl_sparse.toKernel(),starts.toKernel()); | 
| 267 | 		cl_sparse.template flush_vd<sstart_<0>>(cells,mgpuContext,FLUSH_ON_DEVICE); | 
| 268 |  | 
| 269 | 		cells_nn.resize(cl_sparse.size()+1); | 
| 270 | 		cells_nn.template fill<0>(0); | 
| 271 |  | 
| 272 | 		// Here we construct the neighborhood cells for each cell | 
| 273 | 		auto itgg = cl_sparse.getGPUIterator(); | 
| 274 | 		CUDA_LAUNCH((count_nn_cells),itgg,cl_sparse.toKernel(),cells_nn.toKernel(),cells_nn_test.toKernel()); | 
| 275 |  | 
		// scan the neighbor-cell counts to get, for each cell, the offset of its neighbor list
| 277 | 		openfpm::scan((cnt_type *)cells_nn.template getDeviceBuffer<0>(), cells_nn.size(), (cnt_type *)cells_nn.template getDeviceBuffer<0>() , mgpuContext); | 
| 278 |  | 
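		// after the scan the last element of cells_nn holds the total number of
		// (cell, neighbor-cell) pairs, i.e. the size of cells_nn_list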
| 279 | 		cells_nn.template deviceToHost<0>(cells_nn.size() - 1, cells_nn.size() - 1); | 
| 280 | 		size_t n_nn_cells = cells_nn.template get<0>(cells_nn.size() - 1); | 
| 281 |  | 
| 282 | 		cells_nn_list.resize(n_nn_cells); | 
| 283 |  | 
| 284 | 		CUDA_LAUNCH((fill_nn_cells),itgg,cl_sparse.toKernel(),cells_nn.toKernel(),cells_nn_test.toKernel(),cells_nn_list.toKernel(),cells.size()); | 
| 285 |  | 
| 286 | 		sorted_to_not_sorted.resize(stop-start); | 
| 287 | 		non_sorted_to_sorted.resize(pl.size()); | 
| 288 |  | 
| 289 | 		auto ite = pl.getGPUIteratorTo(stop-start,64); | 
| 290 |  | 
| 291 | 		// Here we reorder the particles to improve coalescing access | 
| 292 | 		CUDA_LAUNCH((reorder_parts<decltype(pl_prp.toKernel()), | 
| 293 | 				      decltype(pl.toKernel()), | 
| 294 | 				      decltype(sorted_to_not_sorted.toKernel()), | 
| 295 | 				      cnt_type,shift_ph<0,cnt_type>>),ite,sorted_to_not_sorted.size(), | 
| 296 | 				                                                           pl_prp.toKernel(), | 
| 297 | 				                                                           pl_prp_out.toKernel(), | 
| 298 | 				                                                           pl.toKernel(), | 
| 299 | 				                                                           pl_out.toKernel(), | 
| 300 | 				                                                           sorted_to_not_sorted.toKernel(), | 
| 301 | 				                                                           non_sorted_to_sorted.toKernel(), | 
| 302 | 				                                                           static_cast<cnt_type *>(cells.template getDeviceBuffer<0>())); | 
| 303 |  | 
| 304 | 		if (opt == cl_construct_opt::Full) | 
| 305 | 		{ | 
| 306 | 			construct_domain_ids(mgpuContext,start,stop,g_m); | 
| 307 | 		} | 
| 308 |  | 
| 309 | 	#else | 
| 310 |  | 
			std::cout << "Error: "  <<  __FILE__ << ":"  << __LINE__ << " CellList_gpu.construct() must be compiled with the NVCC compiler, but it appears to have been compiled with the standard system compiler"  << std::endl;
| 312 |  | 
| 313 | 	#endif | 
| 314 | 	} | 
| 315 |  | 
	/*! \brief Construct the ids of the domain particles in the sorted array
	 *
	 * \param mgpuContext mgpu context
	 * \param start start of the particle range
	 * \param stop end of the particle range
	 * \param g_m ghost marker
	 *
	 */
| 321 | 	void construct_domain_ids(mgpu::ofp_context_t & mgpuContext, size_t start, size_t stop, size_t g_m) | 
| 322 | 	{ | 
| 323 | #ifdef __NVCC__ | 
| 324 | 		sorted_domain_particles_dg.resize(stop-start+1); | 
| 325 |  | 
| 326 | 		auto ite = sorted_domain_particles_dg.getGPUIterator(); | 
| 327 |  | 
| 328 | 		CUDA_LAUNCH((mark_domain_particles),ite,sorted_to_not_sorted.toKernel(),sorted_domain_particles_dg.toKernel(),g_m); | 
| 329 |  | 
		// scan the flags to get the positions in the output vector
| 331 | 		openfpm::scan((unsigned int *)sorted_domain_particles_dg.template getDeviceBuffer<0>(),sorted_domain_particles_dg.size(),(unsigned int *)sorted_domain_particles_dg.template getDeviceBuffer<0>(),mgpuContext); | 
| 332 |  | 
| 333 | 		sorted_domain_particles_dg.template deviceToHost<0>(sorted_domain_particles_dg.size()-1,sorted_domain_particles_dg.size()-1); | 
| 334 | 		auto sz = sorted_domain_particles_dg.template get<0>(sorted_domain_particles_dg.size()-1); | 
| 335 |  | 
| 336 | 		sorted_domain_particles_ids.resize(sz); | 
| 337 |  | 
| 338 | 		CUDA_LAUNCH((collect_domain_ghost_ids),ite,sorted_domain_particles_dg.toKernel(),sorted_domain_particles_ids.toKernel()); | 
| 339 | #endif | 
| 340 | 	} | 
| 341 |  | 
	/*! \brief Construct a dense cell-list
	 *
	 *
	 */
| 346 | 	template<typename vector, typename vector_prp, unsigned int ... prp> | 
| 347 | 	void construct_dense(vector & pl, | 
| 348 | 			   	   	   	 vector & pl_out, | 
| 349 | 			   	   	   	 vector_prp & pl_prp, | 
| 350 | 			   	   	   	 vector_prp & pl_prp_out, | 
| 351 | 			   	   	   	 mgpu::ofp_context_t & mgpuContext, | 
| 352 | 			   	   	   	 size_t g_m, | 
| 353 | 			   	   	   	 size_t start, | 
| 354 | 			   	   	   	 size_t stop, | 
| 355 | 			   	   	   	 cl_construct_opt opt = cl_construct_opt::Full) | 
| 356 | 	{ | 
| 357 | #ifdef __NVCC__ | 
| 358 |  | 
| 359 | 		CUDA_SAFE() | 
| 360 |  | 
		// Then we construct the cell ids
| 362 |  | 
| 363 | 		auto ite_gpu = pl.getGPUIteratorTo(stop-start-1); | 
| 364 |  | 
| 365 | 		cl_n.resize(this->gr_cell.size()+1); | 
| 366 | 		cl_n.template fill<0>(0); | 
| 367 | //		CUDA_SAFE(cudaMemset(cl_n.template getDeviceBuffer<0>(),0,cl_n.size()*sizeof(cnt_type))); | 
| 368 |  | 
| 369 | 		part_ids.resize(stop - start); | 
| 370 |  | 
| 371 | 		if (ite_gpu.wthr.x == 0 || pl.size() == 0 || stop == 0) | 
| 372 | 		{ | 
| 373 | 			// no particles | 
| 374 | 			starts.resize(cl_n.size()); | 
| 375 | 			starts.template fill<0>(0); | 
| 376 | 			return; | 
| 377 | 		} | 
| 378 |  | 
| 379 | 		CUDA_LAUNCH((subindex<false,dim,T,cnt_type,ids_type>),ite_gpu,div_c, | 
| 380 | 																		spacing_c, | 
| 381 | 																		off, | 
| 382 | 																		this->getTransform(), | 
| 383 | 																		pl.capacity(), | 
| 384 | 																		stop, | 
| 385 | 																		part_ids.capacity(), | 
| 386 | 																		start, | 
| 387 | 																		static_cast<T *>(pl.template getDeviceBuffer<0>()), | 
| 388 | 																		static_cast<cnt_type *>(cl_n.template getDeviceBuffer<0>()), | 
| 389 | 																		static_cast<cnt_type *>(part_ids.template getDeviceBuffer<0>())); | 
| 390 |  | 
		// scan: starts[i] becomes the index of the first particle of cell i
| 392 | 		starts.resize(cl_n.size()); | 
| 393 | 		openfpm::scan((cnt_type *)cl_n.template getDeviceBuffer<0>(), cl_n.size(), (cnt_type *)starts.template getDeviceBuffer<0>() , mgpuContext); | 
| 394 |  | 
| 395 | 		// now we construct the cells | 
| 396 |  | 
| 397 | 		cells.resize(stop-start); | 
| 398 | 		auto itgg = part_ids.getGPUIterator(); | 
| 399 |  | 
| 400 |  | 
| 401 | #ifdef MAKE_CELLLIST_DETERMINISTIC | 
| 402 |  | 
		CUDA_LAUNCH((fill_cells<dim,cnt_type,ids_type,shift_ph<0,cnt_type>>),itgg,0,
					part_ids.size(),
					static_cast<cnt_type *>(cells.template getDeviceBuffer<0>()));

		// sort

		mgpu::mergesort(static_cast<cnt_type *>(part_ids.template getDeviceBuffer<0>()),static_cast<cnt_type *>(cells.template getDeviceBuffer<0>()),pl.size(),mgpu::less_t<cnt_type>(),mgpuContext);

#else

		CUDA_LAUNCH((fill_cells<dim,cnt_type,ids_type,shift_ph<0,cnt_type>>),itgg,0,
					div_c,
					off,
					part_ids.size(),
					part_ids.capacity(),
					start,
					static_cast<cnt_type *>(starts.template getDeviceBuffer<0>()),
					static_cast<cnt_type *>(part_ids.template getDeviceBuffer<0>()),
					static_cast<cnt_type *>(cells.template getDeviceBuffer<0>()));
| 422 |  | 
| 423 | #endif | 
| 424 |  | 
| 425 |  | 
| 426 | 		sorted_to_not_sorted.resize(stop-start); | 
| 427 | 		non_sorted_to_sorted.resize(pl.size()); | 
| 428 |  | 
| 429 | 		auto ite = pl.getGPUIteratorTo(stop-start,64); | 
| 430 |  | 
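		// if no property is listed in prp... reorder all the properties,
		// otherwise reorder only the listed ones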
| 431 | 		if (sizeof...(prp) == 0) | 
| 432 | 		{ | 
| 433 | 			// Here we reorder the particles to improve coalescing access | 
| 434 | 			CUDA_LAUNCH((reorder_parts<decltype(pl_prp.toKernel()), | 
| 435 | 						  decltype(pl.toKernel()), | 
| 436 | 						  decltype(sorted_to_not_sorted.toKernel()), | 
| 437 | 						  cnt_type,shift_ph<0,cnt_type>>),ite,sorted_to_not_sorted.size(), | 
| 438 | 																			   pl_prp.toKernel(), | 
| 439 | 																			   pl_prp_out.toKernel(), | 
| 440 | 																			   pl.toKernel(), | 
| 441 | 																			   pl_out.toKernel(), | 
| 442 | 																			   sorted_to_not_sorted.toKernel(), | 
| 443 | 																			   non_sorted_to_sorted.toKernel(), | 
| 444 | 																			   static_cast<cnt_type *>(cells.template getDeviceBuffer<0>())); | 
| 445 | 		} | 
| 446 | 		else | 
| 447 | 		{ | 
| 448 | 			// Here we reorder the particles to improve coalescing access | 
| 449 | 			CUDA_LAUNCH((reorder_parts_wprp<decltype(pl_prp.toKernel()), | 
| 450 | 						  decltype(pl.toKernel()), | 
| 451 | 						  decltype(sorted_to_not_sorted.toKernel()), | 
| 452 | 						  cnt_type,shift_ph<0,cnt_type>,prp...>),ite,sorted_to_not_sorted.size(), | 
| 453 | 																			   pl_prp.toKernel(), | 
| 454 | 																			   pl_prp_out.toKernel(), | 
| 455 | 																			   pl.toKernel(), | 
| 456 | 																			   pl_out.toKernel(), | 
| 457 | 																			   sorted_to_not_sorted.toKernel(), | 
| 458 | 																			   non_sorted_to_sorted.toKernel(), | 
| 459 | 																			   static_cast<cnt_type *>(cells.template getDeviceBuffer<0>())); | 
| 460 | 		} | 
| 461 |  | 
| 462 | 		if (opt == cl_construct_opt::Full) | 
| 463 | 		{ | 
| 464 | 			construct_domain_ids(mgpuContext,start,stop,g_m); | 
| 465 | 		} | 
| 466 |  | 
| 467 | 	#else | 
| 468 |  | 
			std::cout << "Error: "  <<  __FILE__ << ":"  << __LINE__ << " CellList_gpu.construct() must be compiled with the NVCC compiler, but it appears to have been compiled with the standard system compiler"  << std::endl;
| 470 |  | 
| 471 | 	#endif | 
| 472 | 	} | 
| 473 |  | 
| 474 | public: | 
| 475 |  | 
| 476 | 	//! Indicate that this cell list is a gpu type cell-list | 
| 477 | 	typedef int yes_is_gpu_celllist; | 
| 478 |  | 
| 479 | 	//! the type of the space | 
| 480 | 	typedef T stype; | 
| 481 |  | 
| 482 | 	//! dimensions of space | 
| 483 | 	static const unsigned int dims = dim; | 
| 484 |  | 
| 485 | 	//! count type | 
| 486 | 	typedef cnt_type cnt_type_; | 
| 487 |  | 
| 488 | 	//! id type | 
| 489 | 	typedef ids_type ids_type_; | 
| 490 |  | 
| 491 | 	//! transform type | 
| 492 | 	typedef transform transform_; | 
| 493 |  | 
| 494 | 	//! is sparse | 
| 495 | 	typedef boost::mpl::bool_<is_sparse> is_sparse_; | 
| 496 |  | 
| 497 | 	/*! \brief Copy constructor | 
| 498 | 	 * | 
| 499 | 	 * \param clg Cell list to copy | 
| 500 | 	 * | 
| 501 | 	 */ | 
| 502 | 	CellList_gpu(const CellList_gpu<dim,T,Memory,transform,cnt_type,ids_type> & clg) | 
| 503 | 	{ | 
| 504 | 		this->operator=(clg); | 
| 505 | 	} | 
| 506 |  | 
	/*! \brief Constructor from a temporary
	 *
	 * \param clg Cell list to move from
	 *
	 */
| 512 | 	CellList_gpu(CellList_gpu<dim,T,Memory,transform,cnt_type,ids_type> && clg) | 
| 513 | 	{ | 
| 514 | 		this->operator=(clg); | 
| 515 | 	} | 
| 516 |  | 
| 517 | 	/*! \brief default constructor | 
| 518 | 	 * | 
| 519 | 	 * | 
| 520 | 	 */ | 
| 521 | 	CellList_gpu() | 
| 522 | 	{} | 
| 523 |  | 
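	/*! \brief Constructor
	 *
	 * \param box Domain where this cell list is living
	 * \param div grid size on each dimension
	 * \param pad padding cells
	 *
	 */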
| 524 | 	CellList_gpu(const Box<dim,T> & box, const size_t (&div)[dim], const size_t pad = 1) | 
| 525 | 	{ | 
| 526 | 		Initialize(box,div,pad); | 
| 527 | 	} | 
| 528 |  | 
| 529 |  | 
	/*! Initialize the cell list
	 *
	 * \param box Domain where this cell list is living
	 * \param div grid size on each dimension
	 * \param pad padding cells
	 *
	 */
| 538 | 	void Initialize(const Box<dim,T> & box, const size_t (&div)[dim], const size_t pad = 1) | 
| 539 | 	{ | 
| 540 | 		SpaceBox<dim,T> sbox(box); | 
| 541 |  | 
| 542 | 		// Initialize point transformation | 
| 543 |  | 
| 544 | 		Initialize(sbox,div,pad); | 
| 545 | 	} | 
| 546 |  | 
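	/*! \brief Set the extension (in cells) of the neighborhood to test
	 *
	 * \param n_NN neighborhood extension: the stencil becomes (2*n_NN+1)^dim cells
	 *
	 */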
| 547 | 	void setBoxNN(unsigned int n_NN) | 
| 548 | 	{ | 
| 549 | 		cells_nn_test_size = n_NN; | 
| 550 | 		construct_cell_nn_test(n_NN); | 
| 551 | 	} | 
| 552 |  | 
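	//! \brief Rebuild the neighborhood stencil using the current extension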
| 553 | 	void re_setBoxNN() | 
| 554 | 	{ | 
| 555 | 		construct_cell_nn_test(cells_nn_test_size); | 
| 556 | 	} | 
| 557 |  | 
	/*! Initialize the cell list
	 *
	 * \param box Domain where this cell list is living
	 * \param div grid size on each dimension
	 * \param pad padding cells
	 *
	 */
| 566 | 	void Initialize(const SpaceBox<dim,T> & box, const size_t (&div)[dim], const size_t pad = 1) | 
| 567 | 	{ | 
| 568 | 		Matrix<dim,T> mat; | 
| 569 | 		CellDecomposer_sm<dim,T,transform>::setDimensions(box,div, mat, pad); | 
| 570 |  | 
		// create the array that stores the number of particles in each cell and set it to 0
| 572 | 		InitializeStructures(this->gr_cell.getSize(),this->gr_cell.size(),pad); | 
| 573 | 	} | 
| 574 |  | 
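	/*! \brief Get the sorted-to-unsorted index map
	 *
	 * \return for each sorted particle index, the corresponding unsorted index
	 *
	 */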
| 575 | 	vector_cnt_type & getSortToNonSort() | 
| 576 | 	{ | 
| 577 | 		return sorted_to_not_sorted; | 
| 578 | 	} | 
| 579 |  | 
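	/*! \brief Get the unsorted-to-sorted index map
	 *
	 * \return for each unsorted particle index, the corresponding sorted index
	 *
	 */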
| 580 | 	vector_cnt_type & getNonSortToSort() | 
| 581 | 	{ | 
| 582 | 		return non_sorted_to_sorted; | 
| 583 | 	} | 
| 584 |  | 
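	/*! \brief Get the indices of the domain particles in the sorted vector
	 *
	 * \return the sorted indices of the domain particles
	 *
	 */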
| 585 | 	vector_cnt_type & getDomainSortIds() | 
| 586 | 	{ | 
| 587 | 		return sorted_domain_particles_ids; | 
| 588 | 	} | 
| 589 |  | 
| 590 |  | 
	/*! \brief Set the radius for getNNIteratorRadius
	 *
	 * \param radius neighborhood radius
	 *
	 */
| 596 | 	void setRadius(T radius) | 
| 597 | 	{ | 
| 598 | 		openfpm::vector<long int> nnc_rad_; | 
| 599 |  | 
| 600 | 		NNcalc_rad(radius,nnc_rad_,this->getCellBox(),this->getGrid()); | 
| 601 |  | 
| 602 | 		nnc_rad.resize(nnc_rad_.size(),0); | 
| 603 |  | 
| 604 | 		// copy to nnc_rad | 
| 605 |  | 
| 606 | 		for (unsigned int i = 0 ; i < nnc_rad_.size() ; i++) | 
| 607 | 		{nnc_rad.template get<0>(i) = nnc_rad_.template get<0>(i);} | 
| 608 |  | 
| 609 | 		nnc_rad.template hostToDevice<0>(); | 
| 610 | 	} | 
| 611 |  | 
	/*! \brief construct the cell-list from a list of particles
	 *
	 * \warning pl is assumed to already be in device memory
	 *
	 * \param pl particle positions
	 * \param pl_out reordered particle positions (output)
	 * \param pl_prp particle properties
	 * \param pl_prp_out reordered particle properties (output)
	 * \param mgpuContext mgpu context
	 * \param g_m ghost marker
	 * \param start first particle in the range to process
	 * \param stop last particle in the range to process (excluded)
	 * \param opt construction option
	 *
	 */
| 619 | 	template<typename vector, typename vector_prp, unsigned int ... prp> | 
| 620 | 	void construct(vector & pl, | 
| 621 | 				   vector & pl_out, | 
| 622 | 				   vector_prp & pl_prp, | 
| 623 | 				   vector_prp & pl_prp_out, | 
| 624 | 				   mgpu::ofp_context_t & mgpuContext, | 
| 625 | 				   size_t g_m = 0, | 
| 626 | 				   size_t start = 0, | 
| 627 | 				   size_t stop = (size_t)-1, | 
| 628 | 				   cl_construct_opt opt = cl_construct_opt::Full) | 
| 629 | 	{ | 
		// if stop is left to the default, set it to the number of particles
| 631 | 		if (stop == (size_t)-1) | 
| 632 | 		{stop = pl.size();} | 
| 633 |  | 
| 634 | 		if (is_sparse == false) {construct_dense<vector,vector_prp,prp...>(pl,pl_out,pl_prp,pl_prp_out,mgpuContext,g_m,start,stop,opt);} | 
| 635 | 		else {construct_sparse<vector,vector_prp,prp...>(pl,pl_out,pl_prp,pl_prp_out,mgpuContext,g_m,start,stop,opt);} | 
| 636 | 	} | 
| 637 |  | 
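	/*! \brief Get the kernel-side version of this cell-list
	 *
	 * \return a CellList_gpu_ker object that can be passed to CUDA kernels
	 *
	 */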
| 638 | 	CellList_gpu_ker<dim,T,cnt_type,ids_type,transform,is_sparse> toKernel() | 
| 639 | 	{ | 
| 640 | 		if (nnc_rad.size() == 0) | 
| 641 | 		{ | 
			// set the radius equal to the cell spacing in direction X
			// (nnc_rad must be initialized to something to avoid warnings)
| 644 | 			setRadius(this->getCellBox().getHigh(0)); | 
| 645 | 		} | 
| 646 |  | 
| 647 | 		return CellList_gpu_ker_selector<dim,T,cnt_type,ids_type,Memory,transform, | 
| 648 | 								  vector_cnt_type,openfpm::vector_gpu<aggregate<cnt_type,cnt_type>>, | 
| 649 | 								  decltype(cl_sparse),is_sparse> | 
| 650 | 		::get(starts, | 
| 651 | 				cells_nn, | 
| 652 | 				cells_nn_list, | 
| 653 | 				cl_sparse, | 
| 654 | 		    	sorted_to_not_sorted, | 
| 655 | 		    	sorted_domain_particles_ids, | 
| 656 | 		    	nnc_rad, | 
| 657 | 		        spacing_c, | 
| 658 | 		        div_c, | 
| 659 | 		        off, | 
| 660 | 		        this->getTransform(), | 
| 661 | 		        g_m); | 
| 662 | 	} | 
| 663 |  | 
| 664 | 	/*! \brief Clear the structure | 
| 665 | 	 * | 
| 666 | 	 * | 
| 667 | 	 */ | 
| 668 | 	void clear() | 
| 669 | 	{ | 
| 670 | 		cl_n.clear(); | 
| 671 | 		cells.clear(); | 
| 672 | 		starts.clear(); | 
| 673 | 		part_ids.clear(); | 
| 674 | 		sorted_to_not_sorted.clear(); | 
| 675 | 	} | 
| 676 |  | 
| 677 | 	///////////////////////////////////// | 
| 678 |  | 
| 679 | 	//! Ghost marker | 
| 680 | 	size_t g_m = 0; | 
| 681 |  | 
| 682 | 	/*! \brief return the ghost marker | 
| 683 | 	 * | 
| 684 | 	 * \return ghost marker | 
| 685 | 	 * | 
| 686 | 	 */ | 
| 687 | 	inline size_t get_gm() | 
| 688 | 	{ | 
| 689 | 		return g_m; | 
| 690 | 	} | 
| 691 |  | 
| 692 | 	/*! \brief Set the ghost marker | 
| 693 | 	 * | 
| 694 | 	 * \param g_m marker | 
| 695 | 	 * | 
| 696 | 	 */ | 
| 697 | 	inline void set_gm(size_t g_m) | 
| 698 | 	{ | 
| 699 | 		this->g_m = g_m; | 
| 700 | 	} | 
| 701 |  | 
| 702 | 	///////////////////////////////////// | 
| 703 |  | 
| 704 | 	/*! \brief Set the n_dec number | 
| 705 | 	 * | 
| 706 | 	 * \param n_dec | 
| 707 | 	 * | 
| 708 | 	 */ | 
| 709 | 	void set_ndec(size_t n_dec) | 
| 710 | 	{ | 
| 711 | 		this->n_dec = n_dec; | 
| 712 | 	} | 
| 713 |  | 
	/*! \brief Get the n_dec number
	 *
	 * \return n_dec
	 *
	 */
| 719 | 	size_t get_ndec() const | 
| 720 | 	{ | 
| 721 | 		return n_dec; | 
| 722 | 	} | 
| 723 |  | 
| 724 | 	///////////////////////////////////// | 
| 725 |  | 
	/*! \brief Transfer the cell-list information computed on the GPU back to the host (for debugging)
	 *
	 */
| 729 | 	void debug_deviceToHost() | 
| 730 | 	{ | 
| 731 | 		cl_n.template deviceToHost<0>(); | 
| 732 | 		cells.template deviceToHost<0>(); | 
| 733 | 		starts.template deviceToHost<0>(); | 
| 734 | 	} | 
| 735 |  | 
	/*! \brief Return the number of cells contained in this cell-list
	 *
	 * \return the number of cells
	 *
	 */
| 741 | 	size_t getNCells() | 
| 742 | 	{ | 
| 743 | 		return cl_n.size(); | 
| 744 | 	} | 
| 745 |  | 
	/*! \brief Return the number of elements in the cell
	 *
	 * \param i cell id
	 *
	 * \return the number of elements in the cell
	 *
	 */
| 751 | 	size_t getNelements(size_t i) | 
| 752 | 	{ | 
| 753 | 		return cl_n.template get<0>(i); | 
| 754 | 	} | 
| 755 |  | 
	/*! \brief Get an element in the cell
	 *
	 * \param cell cell id
	 * \param ele element id
	 *
	 * \return The element value
	 *
	 */
| 766 | 	inline auto get(size_t cell, size_t ele) -> decltype(cells.template get<0>(starts.template get<0>(cell)+ele)) | 
| 767 | 	{ | 
| 768 | 		return cells.template get<0>(starts.template get<0>(cell)+ele); | 
| 769 | 	} | 
| 770 |  | 
	/*! \brief Get an element in the cell
	 *
	 * \param cell cell id
	 * \param ele element id
	 *
	 * \return The element value
	 *
	 */
| 781 | 	inline auto get(size_t cell, size_t ele) const -> decltype(cells.template get<0>(starts.template get<0>(cell)+ele)) | 
| 782 | 	{ | 
| 783 | 		return cells.template get<0>(starts.template get<0>(cell)+ele); | 
| 784 | 	} | 
| 785 |  | 
	/*! \brief swap the information of the two cell-lists
	 *
	 * \param clg cell-list to swap with
	 *
	 */
| 791 | 	void swap(CellList_gpu<dim,T,Memory,transform,cnt_type,ids_type,is_sparse> & clg) | 
| 792 | 	{ | 
| 793 | 		((CellDecomposer_sm<dim,T,transform> *)this)->swap(clg); | 
| 794 | 		cl_n.swap(clg.cl_n); | 
| 795 | 		cells.swap(clg.cells); | 
| 796 | 		starts.swap(clg.starts); | 
| 797 | 		part_ids.swap(clg.part_ids); | 
| 798 | 		cl_sparse.swap(clg.cl_sparse); | 
| 799 | 		cells_nn.swap(clg.cells_nn); | 
| 800 | 		cells_nn_list.swap(clg.cells_nn_list); | 
| 801 | 		cells_nn_test.swap(clg.cells_nn_test); | 
| 802 | 		sorted_to_not_sorted.swap(clg.sorted_to_not_sorted); | 
| 803 | 		sorted_domain_particles_dg.swap(clg.sorted_domain_particles_dg); | 
| 804 | 		sorted_domain_particles_ids.swap(clg.sorted_domain_particles_ids); | 
| 805 | 		non_sorted_to_sorted.swap(clg.non_sorted_to_sorted); | 
| 806 |  | 
| 807 | 		spacing_c.swap(clg.spacing_c); | 
| 808 | 		div_c.swap(clg.div_c); | 
| 809 | 		off.swap(clg.off); | 
| 810 |  | 
| 811 | 		size_t g_m_tmp = g_m; | 
| 812 | 		g_m = clg.g_m; | 
| 813 | 		clg.g_m = g_m_tmp; | 
| 814 |  | 
| 815 | 		size_t n_dec_tmp = n_dec; | 
| 816 | 		n_dec = clg.n_dec; | 
| 817 | 		clg.n_dec = n_dec_tmp; | 
| 818 |  | 
| 819 | 		int cells_nn_test_size_tmp = cells_nn_test_size; | 
| 820 | 		cells_nn_test_size = clg.cells_nn_test_size; | 
| 821 | 		clg.cells_nn_test_size = cells_nn_test_size_tmp; | 
| 822 | 	} | 
| 823 |  | 
| 824 | 	CellList_gpu<dim,T,Memory,transform,cnt_type,ids_type,is_sparse> & | 
| 825 | 	operator=(const CellList_gpu<dim,T,Memory,transform,cnt_type,ids_type,is_sparse> & clg) | 
| 826 | 	{ | 
| 827 | 		*static_cast<CellDecomposer_sm<dim,T,transform> *>(this) = *static_cast<const CellDecomposer_sm<dim,T,transform> *>(&clg); | 
| 828 | 		cl_n = clg.cl_n; | 
| 829 | 		cells = clg.cells; | 
| 830 | 		starts = clg.starts; | 
| 831 | 		part_ids = clg.part_ids; | 
| 832 | 		cl_sparse = clg.cl_sparse; | 
| 833 | 		cells_nn = clg.cells_nn; | 
| 834 | 		cells_nn_list = clg.cells_nn_list; | 
| 835 | 		cells_nn_test = clg.cells_nn_test; | 
| 836 | 		sorted_to_not_sorted = clg.sorted_to_not_sorted; | 
| 837 | 		sorted_domain_particles_dg = clg.sorted_domain_particles_dg; | 
| 838 | 		sorted_domain_particles_ids = clg.sorted_domain_particles_ids; | 
| 839 | 		non_sorted_to_sorted = clg.non_sorted_to_sorted; | 
| 840 |  | 
| 841 | 		spacing_c = clg.spacing_c; | 
| 842 | 		div_c = clg.div_c; | 
| 843 | 		off = clg.off; | 
| 844 | 		g_m = clg.g_m; | 
| 845 | 		n_dec = clg.n_dec; | 
| 846 |  | 
| 847 | 		cells_nn_test_size = clg.cells_nn_test_size; | 
| 848 |  | 
| 849 | 		return *this; | 
| 850 | 	} | 
| 851 |  | 
| 852 | 	CellList_gpu<dim,T,Memory,transform,cnt_type,ids_type> & | 
| 853 | 	operator=(CellList_gpu<dim,T,Memory,transform,cnt_type,ids_type> && clg) | 
| 854 | 	{ | 
| 855 | 		static_cast<CellDecomposer_sm<dim,T,transform> *>(this)->swap(*static_cast<CellDecomposer_sm<dim,T,transform> *>(&clg)); | 
| 856 | 		cl_n.swap(clg.cl_n); | 
| 857 | 		cells.swap(clg.cells); | 
| 858 | 		starts.swap(clg.starts); | 
| 859 | 		part_ids.swap(clg.part_ids); | 
| 860 | 		cl_sparse.swap(clg.cl_sparse); | 
| 861 | 		cells_nn.swap(clg.cells_nn); | 
| 862 | 		cells_nn_list.swap(clg.cells_nn_list); | 
| 863 | 		cells_nn_test.swap(clg.cells_nn_test); | 
| 864 | 		sorted_to_not_sorted.swap(clg.sorted_to_not_sorted); | 
| 865 | 		sorted_domain_particles_dg.swap(clg.sorted_domain_particles_dg); | 
| 866 | 		sorted_domain_particles_ids.swap(clg.sorted_domain_particles_ids); | 
| 867 | 		non_sorted_to_sorted.swap(clg.non_sorted_to_sorted); | 
| 868 |  | 
| 869 | 		spacing_c = clg.spacing_c; | 
| 870 | 		div_c = clg.div_c; | 
| 871 | 		off = clg.off; | 
| 872 | 		g_m = clg.g_m; | 
| 873 | 		n_dec = clg.n_dec; | 
| 874 |  | 
| 875 | 		cells_nn_test_size = clg.cells_nn_test_size; | 
| 876 |  | 
| 877 | 		return *this; | 
| 878 | 	} | 
| 879 | }; | 
| 880 |  | 
// This is a transformation node for vector_distributed for the algorithm toKernel_transform
| 882 | template<template <typename> class layout_base, typename T> | 
| 883 | struct toKernel_transform<layout_base,T,4> | 
| 884 | { | 
| 885 | 	typedef CellList_gpu_ker<T::dims, | 
| 886 | 							 typename T::stype, | 
| 887 | 							 typename T::cnt_type_, | 
| 888 | 							 typename T::ids_type_, | 
| 889 | 							 typename T::transform_, | 
| 890 | 							 T::is_sparse_::value> type; | 
| 891 | }; | 
| 892 |  | 
#endif // CUDA_GPU
| 894 |  | 
| 895 | #endif /* OPENFPM_DATA_SRC_NN_CELLLIST_CELLLIST_GPU_HPP_ */ | 
| 896 |  |