/*
 * map_vector_sparse_cuda_kernels.cuh
 *
 * Created on: Jan 26, 2019
 * Author: i-bird
 */

#ifndef MAP_VECTOR_SPARSE_CUDA_KERNELS_CUH_
#define MAP_VECTOR_SPARSE_CUDA_KERNELS_CUH_

#include <functional>
#include <type_traits>
#include <boost/mpl/int.hpp>
#include <boost/mpl/at.hpp>
#include <boost/mpl/vector.hpp>
#include <boost/mpl/range_c.hpp>

#ifdef __NVCC__

#if CUDART_VERSION < 11000
#include "util/cuda/cub_old/util_type.cuh"
#include "util/cuda/cub_old/block/block_scan.cuh"
#include "util/cuda/moderngpu/operators.hxx"
#else
	#if !defined(CUDA_ON_CPU)
	#include "cub/util_type.cuh"
	#include "cub/block/block_scan.cuh"
	#include "util/cuda/moderngpu/operators.hxx"
	#endif
#endif

#endif
template<typename type_t>
struct rightOperand_t : public std::binary_function<type_t, type_t, type_t> {
	__device__ __host__ type_t operator()(type_t a, type_t b) const {
		return b;
	}
};

template<unsigned int prp>
struct sRight_
{
	typedef boost::mpl::int_<prp> prop;

	template<typename red_type> using op_red = rightOperand_t<red_type>;

	template<typename red_type>
	__device__ __host__ static red_type red(red_type & r1, red_type & r2)
	{
		return r2;
	}

	static bool is_special()
	{
		return false;
	}

	//! Not a special reduction, so no action is required here
	template<typename seg_type, typename output_type>
	__device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i)
	{}
};

template<typename type_t>
struct leftOperand_t : public std::binary_function<type_t, type_t, type_t> {
	__device__ __host__ type_t operator()(type_t a, type_t b) const {
		return a;
	}
};

template<unsigned int prp>
struct sLeft_
{
	typedef boost::mpl::int_<prp> prop;

	template<typename red_type> using op_red = leftOperand_t<red_type>;

	template<typename red_type>
	__device__ __host__ static red_type red(red_type & r1, red_type & r2)
	{
		return r1;
	}

	static bool is_special()
	{
		return false;
	}

	//! Not a special reduction, so no action is required here
	template<typename seg_type, typename output_type>
	__device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i)
	{}
};
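
// Illustrative sketch (not part of the library API): sLeft_ and sRight_ implement
// "keep the existing value" and "take the inserted value" merge policies. They can
// be exercised directly on the host:
//
//   float old_v = 1.0f, new_v = 2.0f;
//   float keep = sLeft_<0>::red(old_v,new_v);    // 1.0f, the existing value wins
//   float repl = sRight_<0>::red(old_v,new_v);   // 2.0f, the inserted value wins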

template<unsigned int prp>
struct sadd_
{
	typedef boost::mpl::int_<prp> prop;

#ifdef __NVCC__
	template<typename red_type> using op_red = mgpu::plus_t<red_type>;
#endif

	template<typename red_type> __device__ __host__ static red_type red(red_type & r1, red_type & r2)
	{
		return r1 + r2;
	}

	static bool is_special()
	{
		return false;
	}

	//! Not a special reduction, so no action is required here
	template<typename seg_type, typename output_type>
	__device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i)
	{}
};
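
// Illustrative sketch: sadd_ accumulates conflicting insertions; two insertions
// on the same sparse index are merged by summing:
//
//   float r1 = 1.5f, r2 = 2.5f;
//   float sum = sadd_<0>::red(r1,r2);   // 4.0f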

#ifdef __NVCC__

template<typename vect_type>
__global__ void set_one_insert_buffer(vect_type vadd)
{
	// index of the element processed by this thread
	const unsigned int p = blockIdx.x * blockDim.x + threadIdx.x;

	if (p >= vadd.size())
	{return;}

	vadd.template get<0>(p) = 1;
}

template<typename type_t, unsigned int blockLength>
struct plus_block_t : public std::binary_function<type_t, type_t, type_t> {
	__device__ __host__ type_t operator()(type_t a, type_t b) const {
		type_t res;
		for (int i=0; i<blockLength; ++i)
		{
			res[i] = a[i] + b[i];
		}
		return res;
	}
};

#endif

template<unsigned int prp, unsigned int blockLength>
struct sadd_block_
{
	typedef boost::mpl::int_<prp> prop;

#ifdef __NVCC__
	template<typename red_type> using op_red = plus_block_t<red_type, blockLength>;
#endif

	template<typename red_type> __device__ __host__ static red_type red(red_type & r1, red_type & r2)
	{
		red_type res;
		for (int i=0; i<blockLength; ++i)
		{
			res[i] = r1[i] + r2[i];
		}
		return res;
	}

	static bool is_special()
	{
		return false;
	}

	//! Not a special reduction, so no action is required here
	template<typename seg_type, typename output_type>
	__device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i)
	{}
};
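
// Illustrative sketch (std::array stands in for the library's block type): the
// *_block_ variants reduce array-valued properties element by element, e.g. with
// blockLength = 4:
//
//   std::array<float,4> a = {1,2,3,4};
//   std::array<float,4> b = {4,3,2,1};
//   auto s = sadd_block_<0,4>::red(a,b);   // {5,5,5,5}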

template<unsigned int prp>
struct smax_
{
	typedef boost::mpl::int_<prp> prop;

#ifdef __NVCC__
	template<typename red_type> using op_red = mgpu::maximum_t<red_type>;
#endif

	template<typename red_type>
	__device__ __host__ static red_type red(red_type & r1, red_type & r2)
	{
		return (r1 < r2)?r2:r1;
	}

	static bool is_special()
	{
		return false;
	}

	//! Not a special reduction, so no action is required here
	template<typename seg_type, typename output_type>
	__device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i)
	{}
};

#ifdef __NVCC__

template<typename type_t, unsigned int blockLength>
struct maximum_block_t : public std::binary_function<type_t, type_t, type_t> {
	MGPU_HOST_DEVICE type_t operator()(type_t a, type_t b) const {
		type_t res;
		for (int i=0; i<blockLength; ++i)
		{
			res[i] = max(a[i], b[i]);
		}
		return res;
	}
};

#endif

template<unsigned int prp, unsigned int blockLength>
struct smax_block_
{
	typedef boost::mpl::int_<prp> prop;

#ifdef __NVCC__
	template<typename red_type> using op_red = maximum_block_t<red_type, blockLength>;
#endif

	template<typename red_type>
	__device__ __host__ static red_type red(red_type & r1, red_type & r2)
	{
		red_type res;
		for (int i=0; i<blockLength; ++i)
		{
			res[i] = (r1[i] < r2[i])?r2[i]:r1[i];
		}
		return res;
	}

	static bool is_special()
	{
		return false;
	}

	//! Not a special reduction, so no action is required here
	template<typename seg_type, typename output_type>
	__device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i)
	{}
};

template<unsigned int prp>
struct smin_
{
	typedef boost::mpl::int_<prp> prop;

#ifdef __NVCC__
	template<typename red_type> using op_red = mgpu::minimum_t<red_type>;
#endif

	template<typename red_type> __device__ __host__ static red_type red(red_type & r1, red_type & r2)
	{
		return (r1 < r2)?r1:r2;
	}

	static bool is_special()
	{
		return false;
	}

	//! Not a special reduction, so no action is required here
	template<typename seg_type, typename output_type>
	__device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i)
	{}
};

#ifdef __NVCC__

template<typename type_t, unsigned int blockLength>
struct minimum_block_t : public std::binary_function<type_t, type_t, type_t> {
	MGPU_HOST_DEVICE type_t operator()(type_t a, type_t b) const {
		type_t res;
		for (int i=0; i<blockLength; ++i)
		{
			res[i] = min(a[i], b[i]);
		}
		return res;
	}
};

#endif

template<unsigned int prp, unsigned int blockLength>
struct smin_block_
{
	typedef boost::mpl::int_<prp> prop;

#ifdef __NVCC__
	template<typename red_type> using op_red = minimum_block_t<red_type, blockLength>;
#endif

	template<typename red_type>
	__device__ __host__ static red_type red(red_type & r1, red_type & r2)
	{
		red_type res;
		for (int i=0; i<blockLength; ++i)
		{
			res[i] = (r1[i] < r2[i])?r1[i]:r2[i];
		}
		return res;
	}

	static bool is_special()
	{
		return false;
	}

	//! Not a special reduction, so no action is required here
	template<typename seg_type, typename output_type>
	__device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i)
	{}
};

#ifdef __NVCC__

template<typename type_t>
struct bitwiseOr_t : public std::binary_function<type_t, type_t, type_t> {
	MGPU_HOST_DEVICE type_t operator()(type_t a, type_t b) const {
		return a|b;
	}
};

template<unsigned int prp>
struct sBitwiseOr_
{
	typedef boost::mpl::int_<prp> prop;

	template<typename red_type> using op_red = bitwiseOr_t<red_type>;

	template<typename red_type>
	__device__ __host__ static red_type red(red_type & r1, red_type & r2)
	{
		return r1|r2;
	}

	static bool is_special()
	{
		return false;
	}

	//! Not a special reduction, so no action is required here
	template<typename seg_type, typename output_type>
	__device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i)
	{}
};
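
// Illustrative sketch: sBitwiseOr_ merges flag masks, e.g.
//
//   unsigned int f1 = 0x1, f2 = 0x4;
//   unsigned int f = sBitwiseOr_<0>::red(f1,f2);   // 0x5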

template<unsigned int prp>
struct sstart_
{
	typedef boost::mpl::int_<prp> prop;

	template<typename red_type> using op_red = mgpu::minimum_t<red_type>;

	//! dummy reduction: the result of a special reduction is produced by set()
	template<typename red_type> __device__ __host__ static red_type red(red_type & r1, red_type & r2)
	{
		return 0;
	}

	static bool is_special()
	{
		return true;
	}

	//! Special reduction: write the start offset of the segment
	template<typename seg_type, typename output_type>
	__device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i)
	{
		output.template get<0>(i) = seg_prev;
	}
};

template<unsigned int prp>
struct sstop_
{
	typedef boost::mpl::int_<prp> prop;

	template<typename red_type> using op_red = mgpu::minimum_t<red_type>;

	//! dummy reduction: the result of a special reduction is produced by set()
	template<typename red_type> __device__ __host__ static red_type red(red_type & r1, red_type & r2)
	{
		return 0;
	}

	static bool is_special()
	{
		return true;
	}

	//! Special reduction: write the end offset of the segment
	template<typename seg_type, typename output_type>
	__device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i)
	{
		output.template get<0>(i) = seg_next;
	}
};

template<unsigned int prp>
struct snum_
{
	typedef boost::mpl::int_<prp> prop;

	template<typename red_type> using op_red = mgpu::minimum_t<red_type>;

	//! dummy reduction: the result of a special reduction is produced by set()
	template<typename red_type> __device__ __host__ static red_type red(red_type & r1, red_type & r2)
	{
		return 0;
	}

	static bool is_special()
	{
		return true;
	}

	//! Special reduction: write the number of elements in the segment
	template<typename seg_type, typename output_type>
	__device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i)
	{
		output.template get<0>(i) = seg_next - seg_prev;
	}
};
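
// Worked example (illustrative): for the special reductions the result is not
// computed by red() (a dummy) but written by set() from the segment offsets,
// see reduce_from_offset below. With segment offsets [0, 3, 7] and max_index = 9:
//
//   sstart_  writes 0, 3, 7   (start of each segment)
//   sstop_   writes 3, 7, 9   (end of each segment)
//   snum_    writes 3, 4, 2   (number of elements in each segment)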

template<typename vector_index_type>
__global__ void construct_insert_list_key_only(vector_index_type vit_block_data,
		                                       vector_index_type vit_block_n,
		                                       vector_index_type vit_block_scan,
		                                       vector_index_type vit_list_0,
		                                       vector_index_type vit_list_1,
		                                       int nslot)
{
	// elements this block has to move, and the number of full passes of
	// blockDim.x threads required to move them
	int n_move = vit_block_n.template get<0>(blockIdx.x);
	int n_block_move = vit_block_n.template get<0>(blockIdx.x) / blockDim.x;
	int start = vit_block_scan.template get<0>(blockIdx.x);

	int i = 0;
	for ( ; i < n_block_move ; i++)
	{
		vit_list_0.template get<0>(start + i*blockDim.x + threadIdx.x) = vit_block_data.template get<0>(nslot*blockIdx.x + i*blockDim.x + threadIdx.x);
		vit_list_1.template get<0>(start + i*blockDim.x + threadIdx.x) = nslot*blockIdx.x + i*blockDim.x + threadIdx.x;
	}

	// move the remaining tail (fewer than blockDim.x elements)
	if (threadIdx.x < n_move - i*blockDim.x )
	{
		vit_list_0.template get<0>(start + i*blockDim.x + threadIdx.x) = vit_block_data.template get<0>(nslot*blockIdx.x + i*blockDim.x + threadIdx.x);
		vit_list_1.template get<0>(start + i*blockDim.x + threadIdx.x) = nslot*blockIdx.x + i*blockDim.x + threadIdx.x;
	}
}
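
// Launch sketch (hedged: vadd_keys, vadd_n, vadd_scan, out_keys, out_pos are
// hypothetical device vectors): one block is launched per insert pool of nslot
// entries; each block compacts its strided pool into the contiguous lists:
//
//   construct_insert_list_key_only<<<n_pools,threads>>>(vadd_keys.toKernel(),
//           vadd_n.toKernel(),vadd_scan.toKernel(),
//           out_keys.toKernel(),out_pos.toKernel(),nslot);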

template<typename vector_index_type>
__global__ void construct_insert_list_key_only_small_pool(vector_index_type vit_block_data,
		                                                  vector_index_type vit_block_n,
		                                                  vector_index_type vit_block_scan,
		                                                  vector_index_type vit_list_0,
		                                                  vector_index_type vit_list_1,
		                                                  int nslot)
{
	int p = blockIdx.x * blockDim.x + threadIdx.x;

	if (p >= vit_block_data.size()) {return;}

	int pool_id = p / nslot;
	int thr_id = p % nslot;
	int start = vit_block_scan.template get<0>(pool_id);
	int n = vit_block_scan.template get<0>(pool_id+1) - start;

	// copy only the slots of this pool that are actually filled
	if (thr_id < n )
	{
		vit_list_0.template get<0>(start + thr_id) = vit_block_data.template get<0>(nslot*pool_id + thr_id);
		vit_list_1.template get<0>(start + thr_id) = nslot*pool_id + thr_id;
	}
}

template<typename vector_index_type>
__global__ void construct_remove_list(vector_index_type vit_block_data,
		                              vector_index_type vit_block_n,
		                              vector_index_type vit_block_scan,
		                              vector_index_type vit_list_0,
		                              vector_index_type vit_list_1,
		                              int nslot)
{
	// elements this block has to move, and the number of full passes of
	// blockDim.x threads required to move them
	int n_move = vit_block_n.template get<0>(blockIdx.x);
	int n_block_move = vit_block_n.template get<0>(blockIdx.x) / blockDim.x;
	int start = vit_block_scan.template get<0>(blockIdx.x);

	int i = 0;
	for ( ; i < n_block_move ; i++)
	{
		vit_list_0.template get<0>(start + i*blockDim.x + threadIdx.x) = vit_block_data.template get<0>(nslot*blockIdx.x + i*blockDim.x + threadIdx.x);
		vit_list_1.template get<0>(start + i*blockDim.x + threadIdx.x) = start + i*blockDim.x + threadIdx.x;
	}

	// move the remaining tail (fewer than blockDim.x elements)
	if (threadIdx.x < n_move - i*blockDim.x )
	{
		vit_list_0.template get<0>(start + i*blockDim.x + threadIdx.x) = vit_block_data.template get<0>(nslot*blockIdx.x + i*blockDim.x + threadIdx.x);
		vit_list_1.template get<0>(start + i*blockDim.x + threadIdx.x) = start + i*blockDim.x + threadIdx.x;
	}
}

template<typename e_type, typename v_reduce>
struct data_merger
{
	e_type src1;
	e_type src2;

	e_type dst;

	__device__ __host__ inline data_merger(const e_type & src1, const e_type & src2, const e_type & dst)
	:src1(src1),src2(src2),dst(dst)
	{
	}

	//! Call the reduction function of each property in v_reduce
	template<typename T>
	__device__ __host__ inline void operator()(T& t) const
	{
		typedef typename boost::mpl::at<v_reduce,T>::type red_type;

		dst.template get<red_type::prop::value>() = red_type::red(src1.template get<red_type::prop::value>(),src2.template get<red_type::prop::value>());
	}
};
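
// Usage sketch (mirrors the call in solve_conflicts below): data_merger is applied
// to the compile-time list of reduction operators with for_each_ref, so each
// property is merged by its own operator:
//
//   typedef boost::mpl::vector<sadd_<0>,smax_<1>> v_reduce_;
//   data_merger<decltype(vct_data.get(p)),v_reduce_> dm(src1,src2,dst);
//   boost::mpl::for_each_ref<boost::mpl::range_c<int,0,2>>(dm);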

template<typename vector_index_type, typename vector_data_type, typename vector_index_type2, unsigned int block_dim, typename ... v_reduce>
__global__ void solve_conflicts(vector_index_type vct_index, vector_data_type vct_data,
		                        vector_index_type merge_index, vector_data_type vct_add_data,
		                        vector_index_type vct_index_out, vector_data_type vct_data_out,
		                        vector_index_type2 vct_tot_out,
		                        int base)
{
	typedef typename std::remove_reference<decltype(vct_index.template get<0>(0))>::type index_type;

	// Specialize BlockScan for a 1D block of block_dim threads operating on int
	typedef cub::BlockScan<int, block_dim> BlockScan;
	// Allocate shared memory for BlockScan
	__shared__ typename BlockScan::TempStorage temp_storage;

	int p = blockIdx.x * blockDim.x + threadIdx.x;

	int scan = 0;
	int predicate = 0;

	if (p < vct_index.size())
	{
		index_type id_check = (p == vct_index.size() - 1)?(index_type)-1:vct_index.template get<0>(p+1);
		predicate = vct_index.template get<0>(p) != id_check;

		scan = predicate;
	}

	// exclusive scan of the predicate in shared memory
	BlockScan(temp_storage).ExclusiveSum(scan, scan);

	if (predicate == 1 && p < vct_index.size())
	{
		vct_index_out.template get<0>(blockIdx.x*block_dim + scan) = vct_index.template get<0>(p);

		int index1 = merge_index.template get<0>(p);

		auto e = vct_data_out.get(blockIdx.x*block_dim + scan);

		if (index1 < base)
		{
			e = vct_data.get(index1);
			vct_data_out.get(blockIdx.x*block_dim + scan) = e;
		}
		else
		{
			e = vct_add_data.get(index1 - base);
			vct_data_out.get(blockIdx.x*block_dim + scan) = e;
		}
	}

	__syncthreads();

	if (predicate == 0 && p < vct_index.size())
	{
		//! at the border of the block the index must be copied
		if (threadIdx.x == blockDim.x-1)
		{vct_index_out.template get<0>(blockIdx.x*block_dim + scan) = vct_index.template get<0>(p);}

		// we have to merge the data

		typedef boost::mpl::vector<v_reduce ...> v_reduce_;

		int index1 = merge_index.template get<0>(p);
		int index2 = merge_index.template get<0>(p+1) - base;

		data_merger<decltype(vct_data.get(p)),v_reduce_> dm(vct_data.get(index1),
				                                            vct_add_data.get(index2),
				                                            vct_data_out.get(blockIdx.x*block_dim + scan));

		// merge each property with its reduction operator
		boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(v_reduce)>>(dm);
	}

	if ((threadIdx.x == blockDim.x - 1 || p == vct_index.size() - 1) && p < vct_index.size())
	{
		vct_tot_out.template get<0>(blockIdx.x) = scan + predicate;
		vct_tot_out.template get<2>(blockIdx.x) = predicate;
	}
}
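
// Worked example (illustrative): suppose the merged, sorted key vector is
//
//   vct_index (keys)            : 1 3 3 7      (the two 3s: old entry + new insertion)
//   predicate (key != next key) : 1 0 1 1      (the last key compares against a sentinel)
//   exclusive scan of predicate : 0 1 1 2
//
// Threads with predicate == 1 copy key and data to slot blockIdx.x*block_dim + scan;
// after the barrier the thread with predicate == 0 overwrites slot 1 with
// red(old value, inserted value) for every property in v_reduce. The block outputs
// keys [1, 3, 7] and vct_tot_out<0> = 3; realign (below) packs the per-block
// results together.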

template<typename vector_index_type, typename vector_index_type2, unsigned int block_dim>
__global__ void solve_conflicts_remove(vector_index_type vct_index,
		                               vector_index_type merge_index,
		                               vector_index_type vct_index_out,
		                               vector_index_type vct_index_out_ps,
		                               vector_index_type2 vct_tot_out,
		                               int base)
{
	typedef typename std::remove_reference<decltype(vct_index.template get<0>(0))>::type index_type;

	// Specialize BlockScan for a 1D block of block_dim threads operating on int
	typedef cub::BlockScan<int, block_dim> BlockScan;
	// Allocate shared memory for BlockScan
	__shared__ typename BlockScan::TempStorage temp_storage;

	int p = blockIdx.x * blockDim.x + threadIdx.x;

	int scan = 0;
	int predicate = 0;
	if (p < vct_index.size())
	{
		index_type id_check_n = (p == vct_index.size() - 1)?(index_type)-1:vct_index.template get<0>(p+1);
		index_type id_check_p = (p == 0)?(index_type)-1:vct_index.template get<0>(p-1);
		index_type id_check = vct_index.template get<0>(p);
		predicate = id_check != id_check_p;
		predicate &= id_check != id_check_n;
		int mi = merge_index.template get<0>(p);
		predicate &= (mi < base);

		scan = predicate;
	}
	// exclusive scan of the predicate in shared memory
	BlockScan(temp_storage).ExclusiveSum(scan, scan);

	if (predicate == 1 && p < vct_index.size())
	{
		vct_index_out.template get<0>(blockIdx.x*block_dim + scan) = vct_index.template get<0>(p);
		vct_index_out_ps.template get<0>(blockIdx.x*block_dim + scan) = merge_index.template get<0>(p);
	}

	__syncthreads();

	if ((threadIdx.x == blockDim.x - 1 || p == vct_index.size() - 1) && p < vct_index.size())
	{
		vct_tot_out.template get<0>(blockIdx.x) = scan + predicate;
	}
}
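
// Mini trace (illustrative): an element survives only if its key occurs exactly
// once in the merged list and it comes from the base vector (merge_index < base).
// A key that also appears in the remove list shows up twice and is dropped:
//
//   keys      : 1 3 3 7      merge_index : 0 1 4 2     (base = 4)
//   predicate : 1 0 0 1      -> surviving keys: 1 7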

template<typename vector_type, typename vector_type2, typename red_op>
__global__ void reduce_from_offset(vector_type segment_offset,
		                           vector_type2 output,
		                           typename std::remove_reference<decltype(segment_offset.template get<1>(0))>::type max_index)
{
	int p = blockIdx.x * blockDim.x + threadIdx.x;

	if (p >= segment_offset.size()) return;

	// the upper bound of segment p is the start of segment p+1,
	// or max_index for the last segment
	typename std::remove_reference<decltype(segment_offset.template get<1>(0))>::type v;
	if (p == segment_offset.size()-1)
	{v = max_index;}
	else
	{v = segment_offset.template get<1>(p+1);}

	red_op::set(v,segment_offset.template get<1>(p),output,p);
}
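
// Launch sketch (hedged: seg and out are hypothetical OpenFPM device vectors,
// with the segment offsets stored in property 1 of seg). With red_op = snum_<0>,
// offsets [0, 3, 7] and max_index = 9 this writes the counts [3, 4, 2] into
// property 0 of out:
//
//   reduce_from_offset<decltype(seg.toKernel()),decltype(out.toKernel()),snum_<0>>
//       <<<(seg.size()+127)/128,128>>>(seg.toKernel(),out.toKernel(),9);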

template<typename vector_index_type, typename vector_data_type, typename vector_index_type2>
__global__ void realign(vector_index_type vct_index, vector_data_type vct_data,
		                vector_index_type vct_index_out, vector_data_type vct_data_out,
		                vector_index_type2 vct_tot_out_scan)
{
	int p = blockIdx.x * blockDim.x + threadIdx.x;

	if (p >= vct_index.size()) return;

	int tot = vct_tot_out_scan.template get<0>(blockIdx.x);

	//! Note: > is correct here, not >=. The last thread in each block of solve_conflicts
	//! always copies its data, so threadIdx.x == tot always has data to copy,
	//! regardless of whether the indexes are equal or different
	if (threadIdx.x > tot) // see the note above
	{return;}

	if (threadIdx.x == tot && vct_tot_out_scan.template get<2>(blockIdx.x) == 1)
	{return;}

	// this branch exists when the last thread of the previous block had predicate == 0
	// in solve_conflicts; in that case thread 0 of this block must not produce any data
	if (threadIdx.x == 0 && blockIdx.x != 0 && vct_tot_out_scan.template get<2>(blockIdx.x - 1) == 0)
	{return;}

	int ds = vct_tot_out_scan.template get<1>(blockIdx.x);

	if (ds + threadIdx.x >= vct_index_out.size())
	{return;}

	vct_index_out.template get<0>(ds+threadIdx.x) = vct_index.template get<0>(p);

	auto src = vct_data.get(p);
	auto dst = vct_data_out.get(ds+threadIdx.x);

	dst = src;
}
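
// Note (illustrative): realign packs the per-block outputs of solve_conflicts into
// a contiguous vector. For each block, vct_tot_out_scan carries the number of
// produced elements (property 0), their exclusive scan, used as the destination
// offset ds (property 1), and the predicate of the block's last thread (property 2).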

template<typename vector_index_type, typename vct_data_type, typename vector_index_type2>
__global__ void realign_remove(vector_index_type vct_index, vector_index_type vct_m_index, vct_data_type vct_data,
		                       vector_index_type vct_index_out, vct_data_type vct_data_out,
		                       vector_index_type2 vct_tot_out_scan)
{
	int p = blockIdx.x * blockDim.x + threadIdx.x;

	if (p >= vct_index.size()) return;

	int tot = vct_tot_out_scan.template get<0>(blockIdx.x);

	if (threadIdx.x >= tot)
	{return;}

	int ds = vct_tot_out_scan.template get<1>(blockIdx.x);

	vct_index_out.template get<0>(ds+threadIdx.x) = vct_index.template get<0>(p);

	int oi = vct_m_index.template get<0>(p);

	auto src = vct_data.get(oi);
	auto dst = vct_data_out.get(ds+threadIdx.x);

	dst = src;
}

template<typename vector_index_type, typename vector_data_type>
__global__ void reorder_vector_data(vector_index_type vi, vector_data_type v_data, vector_data_type v_data_ord)
{
	int p = blockIdx.x * blockDim.x + threadIdx.x;

	if (p >= vi.size()) return;

	// gather: reorder the data vector according to the index map vi
	v_data_ord.get_o(p) = v_data.get_o(vi.template get<0>(p));
}

template<typename vector_index_type>
__global__ void reorder_create_index_map(vector_index_type vi, vector_index_type seg_in, vector_index_type seg_out)
{
	int p = blockIdx.x * blockDim.x + threadIdx.x;

	if (p >= vi.size()) return;

	// gather: apply the same index map vi to the segment indexes
	seg_out.template get<0>(p) = seg_in.template get<0>(vi.template get<0>(p));
}
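
// Worked example (illustrative): with the permutation vi = [2, 0, 1],
// reorder_vector_data gathers v_data_ord[p] = v_data[vi[p]], producing the order
// data[2], data[0], data[1]; reorder_create_index_map applies the same gather
// to a segment index map.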

template<unsigned int prp, typename vector_type>
__global__ void set_indexes(vector_type vd, int off)
{
	int p = blockIdx.x * blockDim.x + threadIdx.x;

	if (p >= vd.size()) {return;}

	vd.template get<prp>(p) = p + off;
}
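
// Usage sketch (hedged: vd is a hypothetical device vector): fills property prp
// with the shifted iota p + off, e.g. off = 10 over 4 elements gives [10,11,12,13]:
//
//   set_indexes<0><<<(vd.size()+127)/128,128>>>(vd.toKernel(),10);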

#endif

#endif /* MAP_VECTOR_SPARSE_CUDA_KERNELS_CUH_ */