map_vector_sparse_cuda_kernels.cuh source code [openfpm/openfpm_data/src/Vector/cuda/map_vector_sparse_cuda_kernels.cuh]

1	/*
2	* map_vector_sparse_cuda_kernels.cuh
3	*
4	* Created on: Jan 26, 2019
5	* Author: i-bird
6	*/
7
8	#ifndef MAP_VECTOR_SPARSE_CUDA_KERNELS_CUH_
9	#define MAP_VECTOR_SPARSE_CUDA_KERNELS_CUH_
10
11	#ifdef __NVCC__
12
13	#if CUDART_VERSION < 11000
14	#include "util/cuda/cub_old/util_type.cuh"
15	#include "util/cuda/cub_old/block/block_scan.cuh"
16	#include "util/cuda/moderngpu/operators.hxx"
17	#else
18	#if !defined(CUDA_ON_CPU)
19	#include "cub/util_type.cuh"
20	#include "cub/block/block_scan.cuh"
21	#include "util/cuda/moderngpu/operators.hxx"
22	#endif
23	#endif
24
25	#endif
26
27	template<typename type_t>
28	struct rightOperand_t : public std::binary_function<type_t, type_t, type_t> {
29	__device__ __host__ type_t operator()(type_t a, type_t b) const {
30	return b;
31	}
32	};
33
34	template<unsigned int prp>
35	struct sRight_
36	{
37	typedef boost::mpl::int_<prp> prop;
38
39	template<typename red_type> using op_red = rightOperand_t<red_type>;
40
41	template<typename red_type>
42	__device__ __host__ static red_type red(red_type & r1, red_type & r2)
43	{
44	return r2;
45	}
46
47	static bool is_special()
48	{
49	return false;
50	}
51
52	//! is not special reduction so it does not need it
53	template<typename seg_type, typename output_type>
54	__device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i)
55	{}
56	};
57
58	template<typename type_t>
59	struct leftOperand_t : public std::binary_function<type_t, type_t, type_t> {
60	__device__ __host__ type_t operator()(type_t a, type_t b) const {
61	return a;
62	}
63	};
64
65	template<unsigned int prp>
66	struct sLeft_
67	{
68	typedef boost::mpl::int_<prp> prop;
69
70	template<typename red_type> using op_red = leftOperand_t<red_type>;
71
72	template<typename red_type>
73	__device__ __host__ static red_type red(red_type & r1, red_type & r2)
74	{
75	return r1;
76	}
77
78	static bool is_special()
79	{
80	return false;
81	}
82
83	//! is not special reduction so it does not need it
84	template<typename seg_type, typename output_type>
85	__device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i)
86	{}
87	};
88
89	template<unsigned int prp>
90	struct sadd_
91	{
92	typedef boost::mpl::int_<prp> prop;
93
94	#ifdef __NVCC__
95	template<typename red_type> using op_red = mgpu::plus_t<red_type>;
96	#endif
97
98	template<typename red_type> __device__ __host__ static red_type red(red_type & r1, red_type & r2)
99	{
100	return r1 + r2;
101	}
102
103	static bool is_special()
104	{
105	return false;
106	}
107
108	//! is not special reduction so it does not need it
109	template<typename seg_type, typename output_type>
110	__device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i)
111	{}
112	};
113
114	#ifdef __NVCC__
115
116	template<typename vect_type>
117	__global__ void set_one_insert_buffer(vect_type vadd)
118	{
119	// points
120	const unsigned int p = blockIdx.x * blockDim.x + threadIdx.x;
121
122	if (p >= vadd.size())
123	{return;}
124
125	vadd.template get<`0`>(p) = `1`;
126	}
127
128	template<typename type_t, unsigned int blockLength>
129	struct plus_block_t : public std::binary_function<type_t, type_t, type_t> {
130	__device__ __host__ type_t operator()(type_t a, type_t b) const {
131	type_t res;
132	for (int i=`0`; i<blockLength; ++i)
133	{
134	res[i] = a[i] + b[i];
135	}
136	return res;
137	}
138	};
139
140	#endif
141
142	template<unsigned int prp, unsigned int blockLength>
143	struct sadd_block_
144	{
145	typedef boost::mpl::int_<prp> prop;
146
147	#ifdef __NVCC__
148	template<typename red_type> using op_red = plus_block_t<red_type, blockLength>;
149	#endif
150
151	template<typename red_type> __device__ __host__ static red_type red(red_type & r1, red_type & r2)
152	{
153	red_type res;
154	for (int i=`0`; i<blockLength; ++i)
155	{
156	res[i] = r1[i] + r2[i];
157	}
158	return res;
159	}
160
161	static bool is_special()
162	{
163	return false;
164	}
165
166	//! is not special reduction so it does not need it
167	template<typename seg_type, typename output_type>
168	__device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i)
169	{}
170	};
171
172	template<unsigned int prp>
173	struct smax_
174	{
175	typedef boost::mpl::int_<prp> prop;
176
177	#ifdef __NVCC__
178	template<typename red_type> using op_red = mgpu::maximum_t<red_type>;
179	#endif
180
181	template<typename red_type>
182	__device__ __host__ static red_type red(red_type & r1, red_type & r2)
183	{
184	return (r1 < r2)?r2:r1;
185	}
186
187	static bool is_special()
188	{
189	return false;
190	}
191
192	//! is not special reduction so it does not need it
193	template<typename seg_type, typename output_type>
194	__device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i)
195	{}
196	};
197
198	#ifdef __NVCC__
199
200	template<typename type_t, unsigned int blockLength>
201	struct maximum_block_t : public std::binary_function<type_t, type_t, type_t> {
202	MGPU_HOST_DEVICE type_t operator()(type_t a, type_t b) const {
203	type_t res;
204	for (int i=`0`; i<blockLength; ++i)
205	{
206	res[i] = max(a[i], b[i]);
207	}
208	return res;
209	}
210	};
211
212	#endif
213
214	template<unsigned int prp, unsigned int blockLength>
215	struct smax_block_
216	{
217	typedef boost::mpl::int_<prp> prop;
218
219	#ifdef __NVCC__
220	template<typename red_type> using op_red = maximum_block_t<red_type, blockLength>;
221	#endif
222
223	template<typename red_type>
224	__device__ __host__ static red_type red(red_type & r1, red_type & r2)
225	{
226	red_type res;
227	for (int i=`0`; i<blockLength; ++i)
228	{
229	res[i] = (r1[i] < r2[i])?r2[i]:r1[i];
230	}
231	return res;
232	}
233
234	static bool is_special()
235	{
236	return false;
237	}
238
239	//! is not special reduction so it does not need it
240	template<typename seg_type, typename output_type>
241	__device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i)
242	{}
243	};
244
245
246
247	template<unsigned int prp>
248	struct smin_
249	{
250	typedef boost::mpl::int_<prp> prop;
251
252	#ifdef __NVCC__
253	template<typename red_type> using op_red = mgpu::minimum_t<red_type>;
254	#endif
255
256	template<typename red_type> __device__ __host__ static red_type red(red_type & r1, red_type & r2)
257	{
258	return (r1 < r2)?r1:r2;
259	}
260
261	static bool is_special()
262	{
263	return false;
264	}
265
266	//! is not special reduction so it does not need it
267	template<typename seg_type, typename output_type>
268	__device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i)
269	{}
270	};
271
272	#ifdef __NVCC__
273
274	template<typename type_t, unsigned int blockLength>
275	struct minimum_block_t : public std::binary_function<type_t, type_t, type_t> {
276	MGPU_HOST_DEVICE type_t operator()(type_t a, type_t b) const {
277	type_t res;
278	for (int i=`0`; i<blockLength; ++i)
279	{
280	res[i] = min(a[i], b[i]);
281	}
282	return res;
283	}
284	};
285
286	#endif
287
288	template<unsigned int prp, unsigned int blockLength>
289	struct smin_block_
290	{
291	typedef boost::mpl::int_<prp> prop;
292
293	#ifdef __NVCC__
294	template<typename red_type> using op_red = minimum_block_t<red_type, blockLength>;
295	#endif
296
297	template<typename red_type>
298	__device__ __host__ static red_type red(red_type & r1, red_type & r2)
299	{
300	red_type res;
301	for (int i=`0`; i<blockLength; ++i)
302	{
303	res[i] = (r1[i] < r2[i])?r1[i]:r2[i];
304	}
305	return res;
306	}
307
308	static bool is_special()
309	{
310	return false;
311	}
312
313	//! is not special reduction so it does not need it
314	template<typename seg_type, typename output_type>
315	__device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i)
316	{}
317	};
318
319
320	#ifdef __NVCC__
321
322	template<typename type_t>
323	struct bitwiseOr_t : public std::binary_function<type_t, type_t, type_t> {
324	MGPU_HOST_DEVICE type_t operator()(type_t a, type_t b) const {
325	return a\|b;
326	}
327	};
328
329	template<unsigned int prp>
330	struct sBitwiseOr_
331	{
332	typedef boost::mpl::int_<prp> prop;
333
334	template<typename red_type> using op_red = bitwiseOr_t<red_type>;
335
336	template<typename red_type>
337	__device__ __host__ static red_type red(red_type & r1, red_type & r2)
338	{
339	return r1\|r2;
340	}
341
342	static bool is_special()
343	{
344	return false;
345	}
346
347	//! is not special reduction so it does not need it
348	template<typename seg_type, typename output_type>
349	__device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i)
350	{}
351	};
352
353
354	template<unsigned int prp>
355	struct sstart_
356	{
357	typedef boost::mpl::int_<prp> prop;
358
359	template<typename red_type> using op_red = mgpu::minimum_t<red_type>;
360
361	template<typename red_type> __device__ __host__ static red_type red(red_type & r1, red_type & r2)
362	{
363	return `0`;
364	}
365
366	static bool is_special()
367	{
368	return true;
369	}
370
371	//! is not special reduction so it does not need it
372	template<typename seg_type, typename output_type>
373	__device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i)
374	{
375	output.template get<`0`>(i) = seg_prev;
376	}
377	};
378
379	template<unsigned int prp>
380	struct sstop_
381	{
382	typedef boost::mpl::int_<prp> prop;
383
384	template<typename red_type> using op_red = mgpu::minimum_t<red_type>;
385
386	template<typename red_type> __device__ __host__ static red_type red(red_type & r1, red_type & r2)
387	{
388	return `0`;
389	}
390
391	static bool is_special()
392	{
393	return true;
394	}
395
396	//! is not special reduction so it does not need it
397	template<typename seg_type, typename output_type>
398	__device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i)
399	{
400	output.template get<`0`>(i) = seg_next;
401	}
402	};
403
404	template<unsigned int prp>
405	struct snum_
406	{
407	typedef boost::mpl::int_<prp> prop;
408
409	template<typename red_type> using op_red = mgpu::minimum_t<red_type>;
410
411	template<typename red_type> __device__ __host__ static red_type red(red_type & r1, red_type & r2)
412	{
413	return `0`;
414	}
415
416	static bool is_special()
417	{
418	return true;
419	}
420
421	//! is not special reduction so it does not need it
422	template<typename seg_type, typename output_type>
423	__device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i)
424	{
425	output.template get<`0`>(i) = seg_next - seg_prev;
426	}
427	};
428
429
430	template<typename vector_index_type>
431	__global__ void construct_insert_list_key_only(vector_index_type vit_block_data,
432	vector_index_type vit_block_n,
433	vector_index_type vit_block_scan,
434	vector_index_type vit_list_0,
435	vector_index_type vit_list_1,
436	int nslot)
437	{
438	int n_move = vit_block_n.template get<`0`>(blockIdx.x);
439	int n_block_move = vit_block_n.template get<`0`>(blockIdx.x) / blockDim.x;
440	int start = vit_block_scan.template get<`0`>(blockIdx.x);
441
442	int i = `0`;
443	for ( ; i < n_block_move ; i++)
444	{
445	vit_list_0.template get<`0`>(start + iblockDim.x + threadIdx.x) = vit_block_data.template get<`0`>(nslotblockIdx.x + i*blockDim.x + threadIdx.x);
446	vit_list_1.template get<`0`>(start + iblockDim.x + threadIdx.x) = nslotblockIdx.x + i*blockDim.x + threadIdx.x;
447	}
448
449	// move remaining
450	if (threadIdx.x < n_move - i*blockDim.x )
451	{
452	vit_list_0.template get<`0`>(start + iblockDim.x + threadIdx.x) = vit_block_data.template get<`0`>(nslotblockIdx.x + i*blockDim.x + threadIdx.x);
453	vit_list_1.template get<`0`>(start + iblockDim.x + threadIdx.x) = nslotblockIdx.x + i*blockDim.x + threadIdx.x;
454	}
455	}
456
457	template<typename vector_index_type>
458	__global__ void construct_insert_list_key_only_small_pool(vector_index_type vit_block_data,
459	vector_index_type vit_block_n,
460	vector_index_type vit_block_scan,
461	vector_index_type vit_list_0,
462	vector_index_type vit_list_1,
463	int nslot)
464	{
465	int p = blockIdx.x * blockDim.x + threadIdx.x;
466
467	if (p >= vit_block_data.size()) {return;}
468
469	int pool_id = p / nslot;
470	int thr_id = p % nslot;
471	int start = vit_block_scan.template get<`0`>(pool_id);
472	int n = vit_block_scan.template get<`0`>(pool_id+`1`) - start;
473
474	// move remaining
475	if (thr_id < n )
476	{
477	vit_list_0.template get<`0`>(start + thr_id) = vit_block_data.template get<`0`>(nslot*pool_id + thr_id);
478	vit_list_1.template get<`0`>(start + thr_id) = nslot*pool_id + thr_id;
479	}
480	}
481
482
483	template<typename vector_index_type>
484	__global__ void construct_remove_list(vector_index_type vit_block_data,
485	vector_index_type vit_block_n,
486	vector_index_type vit_block_scan,
487	vector_index_type vit_list_0,
488	vector_index_type vit_list_1,
489	int nslot)
490	{
491	int n_move = vit_block_n.template get<`0`>(blockIdx.x);
492	int n_block_move = vit_block_n.template get<`0`>(blockIdx.x) / blockDim.x;
493	int start = vit_block_scan.template get<`0`>(blockIdx.x);
494
495	int i = `0`;
496	for ( ; i < n_block_move ; i++)
497	{
498	vit_list_0.template get<`0`>(start + iblockDim.x + threadIdx.x) = vit_block_data.template get<`0`>(nslotblockIdx.x + i*blockDim.x + threadIdx.x);
499	vit_list_1.template get<`0`>(start + iblockDim.x + threadIdx.x) = start + iblockDim.x + threadIdx.x;
500	}
501
502	// move remaining
503	if (threadIdx.x < n_move - i*blockDim.x )
504	{
505	vit_list_0.template get<`0`>(start + iblockDim.x + threadIdx.x) = vit_block_data.template get<`0`>(nslotblockIdx.x + i*blockDim.x + threadIdx.x);
506	vit_list_1.template get<`0`>(start + iblockDim.x + threadIdx.x) = start + iblockDim.x + threadIdx.x;
507	}
508	}
509
510
511	template<typename e_type, typename v_reduce>
512	struct data_merger
513	{
514	e_type src1;
515	e_type src2;
516
517	e_type dst;
518
519	__device__ __host__ inline data_merger(const e_type & src1, const e_type & src2, const e_type & dst)
520	:src1(src1),src2(src2),dst(dst)
521	{
522	};
523
524
525	//! It call the copy function for each property
526	template<typename T>
527	__device__ __host__ inline void operator()(T& t) const
528	{
529	typedef typename boost::mpl::at<v_reduce,T>::type red_type;
530
531	dst.template get<red_type::prop::value>() = red_type::red(src1.template get<red_type::prop::value>(),src2.template get<red_type::prop::value>());
532	}
533	};
534
535	template<typename vector_index_type, typename vector_data_type, typename vector_index_type2, unsigned int block_dim, typename ... v_reduce>
536	__global__ void solve_conflicts(vector_index_type vct_index, vector_data_type vct_data,
537	vector_index_type merge_index, vector_data_type vct_add_data,
538	vector_index_type vct_index_out, vector_data_type vct_data_out,
539	vector_index_type2 vct_tot_out,
540	int base)
541	{
542	typedef typename std::remove_reference<decltype(vct_index.template get<`0`>(`0`))>::type index_type;
543
544	// Specialize BlockScan for a 1D block of 256 threads on type int
545	typedef cub::BlockScan<int, block_dim> BlockScan;
546	// Allocate shared memory for BlockScan
547	__shared__ typename BlockScan::TempStorage temp_storage;
548
549	int p = blockIdx.x * blockDim.x + threadIdx.x;
550
551	int scan = `0`;
552	int predicate = `0`;
553
554	if (p < vct_index.size())
555	{
556	index_type id_check = (p == vct_index.size() - `1`)?(index_type)-`1`:vct_index.template get<`0`>(p+`1`);
557	predicate = vct_index.template get<`0`>(p) != id_check;
558
559	scan = predicate;
560	}
561
562	// in shared memory scan
563	BlockScan(temp_storage).ExclusiveSum(scan, scan);
564
565	if (predicate == `1` && p < vct_index.size())
566	{
567	vct_index_out.template get<`0`>(blockIdx.x*block_dim + scan) = vct_index.template get<`0`>(p);
568
569	int index1 = merge_index.template get<`0`>(p);
570
571	auto e = vct_data_out.get(blockIdx.x*block_dim + scan);
572
573	if (index1 < base)
574	{
575	e = vct_data.get(index1);
576	vct_data_out.get(blockIdx.x*block_dim + scan) = e;
577	}
578	else
579	{
580	e = vct_add_data.get(index1 - base);
581	vct_data_out.get(blockIdx.x*block_dim + scan) = e;
582	}
583	}
584
585	__syncthreads();
586
587	if (predicate == `0` && p < vct_index.size())
588	{
589	//! at the border of the block the index must be copied
590	if (threadIdx.x == blockDim.x-`1`)
591	{vct_index_out.template get<`0`>(blockIdx.x*block_dim + scan) = vct_index.template get<`0`>(p);}
592
593	// we have to merge the data
594
595	typedef boost::mpl::vector<v_reduce ...> v_reduce_;
596
597	int index1 = merge_index.template get<`0`>(p);
598	int index2 = merge_index.template get<`0`>(p+`1`) - base;
599
600	data_merger<decltype(vct_data.get(p)),v_reduce_> dm(vct_data.get(index1),
601	vct_add_data.get(index2),
602	vct_data_out.get(blockIdx.x*block_dim + scan));
603
604	// data_merge
605	boost::mpl::for_each_ref<boost::mpl::range_c<int,`0`,sizeof...(v_reduce)>>(dm);
606	}
607
608	if ((threadIdx.x == blockDim.x - `1` \|\| p == vct_index.size() - `1`) && p < vct_index.size())
609	{
610	vct_tot_out.template get<`0`>(blockIdx.x) = scan + predicate;
611	vct_tot_out.template get<`2`>(blockIdx.x) = predicate;
612	}
613	}
614
615
616	template<typename vector_index_type, typename vector_index_type2, unsigned int block_dim>
617	__global__ void solve_conflicts_remove(vector_index_type vct_index,
618	vector_index_type merge_index,
619	vector_index_type vct_index_out,
620	vector_index_type vct_index_out_ps,
621	vector_index_type2 vct_tot_out,
622	int base)
623	{
624	typedef typename std::remove_reference<decltype(vct_index.template get<`0`>(`0`))>::type index_type;
625
626	// Specialize BlockScan for a 1D block of 256 threads on type int
627	typedef cub::BlockScan<int, block_dim> BlockScan;
628	// Allocate shared memory for BlockScan
629	__shared__ typename BlockScan::TempStorage temp_storage;
630
631	int p = blockIdx.x * blockDim.x + threadIdx.x;
632
633	int scan = `0`;
634	int predicate = `0`;
635	if (p < vct_index.size())
636	{
637	index_type id_check_n = (p == vct_index.size() - `1`)?(index_type)-`1`:vct_index.template get<`0`>(p+`1`);
638	index_type id_check_p = (p == `0`)?(index_type)-`1`:vct_index.template get<`0`>(p-`1`);
639	index_type id_check = vct_index.template get<`0`>(p);
640	predicate = id_check != id_check_p;
641	predicate &= id_check != id_check_n;
642	int mi = merge_index.template get<`0`>(p);
643	predicate &= (mi < base);
644
645	scan = predicate;
646	}
647	// in shared memory scan
648	BlockScan(temp_storage).ExclusiveSum(scan, scan);
649
650	if (predicate == `1` && p < vct_index.size())
651	{
652	vct_index_out.template get<`0`>(blockIdx.x*block_dim + scan) = vct_index.template get<`0`>(p);
653	vct_index_out_ps.template get<`0`>(blockIdx.x*block_dim + scan) = merge_index.template get<`0`>(p);
654	}
655
656	__syncthreads();
657
658	if ((threadIdx.x == blockDim.x - `1` \|\| p == vct_index.size() - `1`) && p < vct_index.size())
659	{
660	vct_tot_out.template get<`0`>(blockIdx.x) = scan + predicate;
661	}
662	}
663
664	template<typename vector_type, typename vector_type2, typename red_op>
665	__global__ void reduce_from_offset(vector_type segment_offset,
666	vector_type2 output,
667	typename std::remove_reference<decltype(segment_offset.template get<`1`>(`0`))>::type max_index)
668	{
669	int p = blockIdx.x * blockDim.x + threadIdx.x;
670
671	if (p >= segment_offset.size()) return;
672
673	typename std::remove_reference<decltype(segment_offset.template get<`1`>(`0`))>::type v;
674	if (p == segment_offset.size()-`1`)
675	{v = max_index;}
676	else
677	{v = segment_offset.template get<`1`>(p+`1`);}
678
679	red_op::set(v,segment_offset.template get<`1`>(p),output,p);
680	}
681
682	template<typename vector_index_type, typename vector_data_type, typename vector_index_type2>
683	__global__ void realign(vector_index_type vct_index, vector_data_type vct_data,
684	vector_index_type vct_index_out, vector_data_type vct_data_out,
685	vector_index_type2 vct_tot_out_scan)
686	{
687	int p = blockIdx.x * blockDim.x + threadIdx.x;
688
689	if (p >= vct_index.size()) return;
690
691	int tot = vct_tot_out_scan.template get<`0`>(blockIdx.x);
692
693	//! It is xorrect > not >=, The last thread in the block of solveConflict always copy the data
694	//! This mean that threadIdx.x == tot have always data to copy independently that the indexes are equal
695	//! or different
696	if (threadIdx.x > tot) // NOTE COMMENT UP !!!!!!!!!
697	{return;}
698
699	if (threadIdx.x == tot && vct_tot_out_scan.template get<`2`>(blockIdx.x) == `1`)
700	{return;}
701
702	// this branch exist if the previous block (last thread) had a predicate == 0 in resolve_conflict in that case
703	// the thread 0 of the next block does not have to produce any data
704	if (threadIdx.x == `0` && blockIdx.x != `0` && vct_tot_out_scan.template get<`2`>(blockIdx.x - `1`) == `0`)
705	{return;}
706
707	int ds = vct_tot_out_scan.template get<`1`>(blockIdx.x);
708
709	if (ds + threadIdx.x >= vct_index_out.size())
710	{return;}
711
712	vct_index_out.template get<`0`>(ds+threadIdx.x) = vct_index.template get<`0`>(p);
713
714	auto src = vct_data.get(p);
715	auto dst = vct_data_out.get(ds+threadIdx.x);
716
717	dst = src;
718	}
719
720	template<typename vector_index_type, typename vct_data_type, typename vector_index_type2>
721	__global__ void realign_remove(vector_index_type vct_index, vector_index_type vct_m_index, vct_data_type vct_data,
722	vector_index_type vct_index_out, vct_data_type vct_data_out,
723	vector_index_type2 vct_tot_out_scan)
724	{
725	int p = blockIdx.x * blockDim.x + threadIdx.x;
726
727	if (p >= vct_index.size()) return;
728
729	int tot = vct_tot_out_scan.template get<`0`>(blockIdx.x);
730
731	if (threadIdx.x >= tot)
732	{return;}
733
734	int ds = vct_tot_out_scan.template get<`1`>(blockIdx.x);
735
736	vct_index_out.template get<`0`>(ds+threadIdx.x) = vct_index.template get<`0`>(p);
737
738	int oi = vct_m_index.template get<`0`>(p);
739
740	auto src = vct_data.get(oi);
741	auto dst = vct_data_out.get(ds+threadIdx.x);
742
743	dst = src;
744	}
745
746	template<typename vector_index_type, typename vector_data_type>
747	__global__ void reorder_vector_data(vector_index_type vi, vector_data_type v_data, vector_data_type v_data_ord)
748	{
749	int p = blockIdx.x * blockDim.x + threadIdx.x;
750
751	if (p >= vi.size()) return;
752
753	// reorder based on v_ord the data vector
754
755	v_data_ord.get_o(p) = v_data.get_o(vi.template get<`0`>(p));
756	}
757
758	template<typename vector_index_type>
759	__global__ void reorder_create_index_map(vector_index_type vi, vector_index_type seg_in, vector_index_type seg_out)
760	{
761	int p = blockIdx.x * blockDim.x + threadIdx.x;
762
763	if (p >= vi.size()) return;
764
765	// reorder based on v_ord the data vector
766
767	seg_out.template get<`0`>(p) = seg_in.template get<`0`>(vi.template get<`0`>(p));
768	}
769
770
771	template<unsigned int prp, typename vector_type>
772	__global__ void set_indexes(vector_type vd, int off)
773	{
774	int p = blockIdx.x * blockDim.x + threadIdx.x;
775
776	if (p >= vd.size()) {return;}
777
778	vd.template get<prp>(p) = p + off;
779	}
780
781	#endif
782
783	#endif /* MAP_VECTOR_SPARSE_CUDA_KERNELS_CUH_ */
784

Browse the source code of openfpm/openfpm_data/src/Vector/cuda/map_vector_sparse_cuda_kernels.cuh