1 | /* |
2 | * map_vector_sparse_cuda_kernels.cuh |
3 | * |
4 | * Created on: Jan 26, 2019 |
5 | * Author: i-bird |
6 | */ |
7 | |
8 | #ifndef MAP_VECTOR_SPARSE_CUDA_KERNELS_CUH_ |
9 | #define MAP_VECTOR_SPARSE_CUDA_KERNELS_CUH_ |
10 | |
11 | #ifdef __NVCC__ |
12 | |
13 | #if CUDART_VERSION < 11000 |
14 | #include "util/cuda/cub_old/util_type.cuh" |
15 | #include "util/cuda/cub_old/block/block_scan.cuh" |
16 | #include "util/cuda/moderngpu/operators.hxx" |
17 | #else |
18 | #if !defined(CUDA_ON_CPU) |
19 | #include "cub/util_type.cuh" |
20 | #include "cub/block/block_scan.cuh" |
21 | #include "util/cuda/moderngpu/operators.hxx" |
22 | #endif |
23 | #endif |
24 | |
25 | #endif |
26 | |
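//! Reduction operators for the sparse-vector kernels. Each s*_ functor below exposes:
//!  - op_red<T>   : the binary functor applied during segmented reductions
//!  - red(r1,r2)  : the same reduction applied directly to two values
//!  - is_special(): true when the result is derived from the segment offsets rather than
//!                  from the data, in which case set() writes the result for segment i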
27 | template<typename type_t> |
28 | struct rightOperand_t : public std::binary_function<type_t, type_t, type_t> { |
29 | __device__ __host__ type_t operator()(type_t a, type_t b) const { |
30 | return b; |
31 | } |
32 | }; |
33 | |
34 | template<unsigned int prp> |
35 | struct sRight_ |
36 | { |
37 | typedef boost::mpl::int_<prp> prop; |
38 | |
39 | template<typename red_type> using op_red = rightOperand_t<red_type>; |
40 | |
41 | template<typename red_type> |
42 | __device__ __host__ static red_type red(red_type & r1, red_type & r2) |
43 | { |
44 | return r2; |
45 | } |
46 | |
47 | static bool is_special() |
48 | { |
49 | return false; |
50 | } |
51 | |
	//! Not a special reduction, so set() is a no-op
53 | template<typename seg_type, typename output_type> |
54 | __device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i) |
55 | {} |
56 | }; |
57 | |
58 | template<typename type_t> |
59 | struct leftOperand_t : public std::binary_function<type_t, type_t, type_t> { |
60 | __device__ __host__ type_t operator()(type_t a, type_t b) const { |
61 | return a; |
62 | } |
63 | }; |
64 | |
65 | template<unsigned int prp> |
66 | struct sLeft_ |
67 | { |
68 | typedef boost::mpl::int_<prp> prop; |
69 | |
70 | template<typename red_type> using op_red = leftOperand_t<red_type>; |
71 | |
72 | template<typename red_type> |
73 | __device__ __host__ static red_type red(red_type & r1, red_type & r2) |
74 | { |
75 | return r1; |
76 | } |
77 | |
78 | static bool is_special() |
79 | { |
80 | return false; |
81 | } |
82 | |
	//! Not a special reduction, so set() is a no-op
84 | template<typename seg_type, typename output_type> |
85 | __device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i) |
86 | {} |
87 | }; |
88 | |
89 | template<unsigned int prp> |
90 | struct sadd_ |
91 | { |
92 | typedef boost::mpl::int_<prp> prop; |
93 | |
94 | #ifdef __NVCC__ |
95 | template<typename red_type> using op_red = mgpu::plus_t<red_type>; |
96 | #endif |
97 | |
98 | template<typename red_type> __device__ __host__ static red_type red(red_type & r1, red_type & r2) |
99 | { |
100 | return r1 + r2; |
101 | } |
102 | |
103 | static bool is_special() |
104 | { |
105 | return false; |
106 | } |
107 | |
	//! Not a special reduction, so set() is a no-op
109 | template<typename seg_type, typename output_type> |
110 | __device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i) |
111 | {} |
112 | }; |
113 | |
114 | #ifdef __NVCC__ |
115 | |
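//! Set property 0 of every element of vadd to 1 (one thread per element)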
116 | template<typename vect_type> |
117 | __global__ void set_one_insert_buffer(vect_type vadd) |
118 | { |
	// element index handled by this thread
120 | const unsigned int p = blockIdx.x * blockDim.x + threadIdx.x; |
121 | |
122 | if (p >= vadd.size()) |
123 | {return;} |
124 | |
125 | vadd.template get<0>(p) = 1; |
126 | } |
127 | |
128 | template<typename type_t, unsigned int blockLength> |
129 | struct plus_block_t : public std::binary_function<type_t, type_t, type_t> { |
130 | __device__ __host__ type_t operator()(type_t a, type_t b) const { |
131 | type_t res; |
132 | for (int i=0; i<blockLength; ++i) |
133 | { |
134 | res[i] = a[i] + b[i]; |
135 | } |
136 | return res; |
137 | } |
138 | }; |
139 | |
140 | #endif |
141 | |
142 | template<unsigned int prp, unsigned int blockLength> |
143 | struct sadd_block_ |
144 | { |
145 | typedef boost::mpl::int_<prp> prop; |
146 | |
147 | #ifdef __NVCC__ |
148 | template<typename red_type> using op_red = plus_block_t<red_type, blockLength>; |
149 | #endif |
150 | |
151 | template<typename red_type> __device__ __host__ static red_type red(red_type & r1, red_type & r2) |
152 | { |
153 | red_type res; |
154 | for (int i=0; i<blockLength; ++i) |
155 | { |
156 | res[i] = r1[i] + r2[i]; |
157 | } |
158 | return res; |
159 | } |
160 | |
161 | static bool is_special() |
162 | { |
163 | return false; |
164 | } |
165 | |
	//! Not a special reduction, so set() is a no-op
167 | template<typename seg_type, typename output_type> |
168 | __device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i) |
169 | {} |
170 | }; |
171 | |
172 | template<unsigned int prp> |
173 | struct smax_ |
174 | { |
175 | typedef boost::mpl::int_<prp> prop; |
176 | |
177 | #ifdef __NVCC__ |
178 | template<typename red_type> using op_red = mgpu::maximum_t<red_type>; |
179 | #endif |
180 | |
181 | template<typename red_type> |
182 | __device__ __host__ static red_type red(red_type & r1, red_type & r2) |
183 | { |
184 | return (r1 < r2)?r2:r1; |
185 | } |
186 | |
187 | static bool is_special() |
188 | { |
189 | return false; |
190 | } |
191 | |
	//! Not a special reduction, so set() is a no-op
193 | template<typename seg_type, typename output_type> |
194 | __device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i) |
195 | {} |
196 | }; |
197 | |
198 | #ifdef __NVCC__ |
199 | |
200 | template<typename type_t, unsigned int blockLength> |
201 | struct maximum_block_t : public std::binary_function<type_t, type_t, type_t> { |
202 | MGPU_HOST_DEVICE type_t operator()(type_t a, type_t b) const { |
203 | type_t res; |
204 | for (int i=0; i<blockLength; ++i) |
205 | { |
206 | res[i] = max(a[i], b[i]); |
207 | } |
208 | return res; |
209 | } |
210 | }; |
211 | |
212 | #endif |
213 | |
214 | template<unsigned int prp, unsigned int blockLength> |
215 | struct smax_block_ |
216 | { |
217 | typedef boost::mpl::int_<prp> prop; |
218 | |
219 | #ifdef __NVCC__ |
220 | template<typename red_type> using op_red = maximum_block_t<red_type, blockLength>; |
221 | #endif |
222 | |
223 | template<typename red_type> |
224 | __device__ __host__ static red_type red(red_type & r1, red_type & r2) |
225 | { |
226 | red_type res; |
227 | for (int i=0; i<blockLength; ++i) |
228 | { |
229 | res[i] = (r1[i] < r2[i])?r2[i]:r1[i]; |
230 | } |
231 | return res; |
232 | } |
233 | |
234 | static bool is_special() |
235 | { |
236 | return false; |
237 | } |
238 | |
	//! Not a special reduction, so set() is a no-op
240 | template<typename seg_type, typename output_type> |
241 | __device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i) |
242 | {} |
243 | }; |
244 | |
245 | |
246 | |
247 | template<unsigned int prp> |
248 | struct smin_ |
249 | { |
250 | typedef boost::mpl::int_<prp> prop; |
251 | |
252 | #ifdef __NVCC__ |
253 | template<typename red_type> using op_red = mgpu::minimum_t<red_type>; |
254 | #endif |
255 | |
256 | template<typename red_type> __device__ __host__ static red_type red(red_type & r1, red_type & r2) |
257 | { |
258 | return (r1 < r2)?r1:r2; |
259 | } |
260 | |
261 | static bool is_special() |
262 | { |
263 | return false; |
264 | } |
265 | |
	//! Not a special reduction, so set() is a no-op
267 | template<typename seg_type, typename output_type> |
268 | __device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i) |
269 | {} |
270 | }; |
271 | |
272 | #ifdef __NVCC__ |
273 | |
274 | template<typename type_t, unsigned int blockLength> |
275 | struct minimum_block_t : public std::binary_function<type_t, type_t, type_t> { |
276 | MGPU_HOST_DEVICE type_t operator()(type_t a, type_t b) const { |
277 | type_t res; |
278 | for (int i=0; i<blockLength; ++i) |
279 | { |
280 | res[i] = min(a[i], b[i]); |
281 | } |
282 | return res; |
283 | } |
284 | }; |
285 | |
286 | #endif |
287 | |
288 | template<unsigned int prp, unsigned int blockLength> |
289 | struct smin_block_ |
290 | { |
291 | typedef boost::mpl::int_<prp> prop; |
292 | |
293 | #ifdef __NVCC__ |
294 | template<typename red_type> using op_red = minimum_block_t<red_type, blockLength>; |
295 | #endif |
296 | |
297 | template<typename red_type> |
298 | __device__ __host__ static red_type red(red_type & r1, red_type & r2) |
299 | { |
300 | red_type res; |
301 | for (int i=0; i<blockLength; ++i) |
302 | { |
303 | res[i] = (r1[i] < r2[i])?r1[i]:r2[i]; |
304 | } |
305 | return res; |
306 | } |
307 | |
308 | static bool is_special() |
309 | { |
310 | return false; |
311 | } |
312 | |
	//! Not a special reduction, so set() is a no-op
314 | template<typename seg_type, typename output_type> |
315 | __device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i) |
316 | {} |
317 | }; |
318 | |
319 | |
320 | #ifdef __NVCC__ |
321 | |
322 | template<typename type_t> |
323 | struct bitwiseOr_t : public std::binary_function<type_t, type_t, type_t> { |
324 | MGPU_HOST_DEVICE type_t operator()(type_t a, type_t b) const { |
325 | return a|b; |
326 | } |
327 | }; |
328 | |
329 | template<unsigned int prp> |
330 | struct sBitwiseOr_ |
331 | { |
332 | typedef boost::mpl::int_<prp> prop; |
333 | |
334 | template<typename red_type> using op_red = bitwiseOr_t<red_type>; |
335 | |
336 | template<typename red_type> |
337 | __device__ __host__ static red_type red(red_type & r1, red_type & r2) |
338 | { |
339 | return r1|r2; |
340 | } |
341 | |
342 | static bool is_special() |
343 | { |
344 | return false; |
345 | } |
346 | |
	//! Not a special reduction, so set() is a no-op
348 | template<typename seg_type, typename output_type> |
349 | __device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i) |
350 | {} |
351 | }; |
352 | |
353 | |
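//! Special reductions: the following operators do not combine data values, the result is
//! produced by set() from the segment offsets (sstart_: segment start, sstop_: segment end,
//! snum_: number of elements in the segment)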
354 | template<unsigned int prp> |
355 | struct sstart_ |
356 | { |
357 | typedef boost::mpl::int_<prp> prop; |
358 | |
359 | template<typename red_type> using op_red = mgpu::minimum_t<red_type>; |
360 | |
361 | template<typename red_type> __device__ __host__ static red_type red(red_type & r1, red_type & r2) |
362 | { |
363 | return 0; |
364 | } |
365 | |
366 | static bool is_special() |
367 | { |
368 | return true; |
369 | } |
370 | |
	//! Special reduction: set() stores the start offset of the segment
372 | template<typename seg_type, typename output_type> |
373 | __device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i) |
374 | { |
375 | output.template get<0>(i) = seg_prev; |
376 | } |
377 | }; |
378 | |
379 | template<unsigned int prp> |
380 | struct sstop_ |
381 | { |
382 | typedef boost::mpl::int_<prp> prop; |
383 | |
384 | template<typename red_type> using op_red = mgpu::minimum_t<red_type>; |
385 | |
386 | template<typename red_type> __device__ __host__ static red_type red(red_type & r1, red_type & r2) |
387 | { |
388 | return 0; |
389 | } |
390 | |
391 | static bool is_special() |
392 | { |
393 | return true; |
394 | } |
395 | |
	//! Special reduction: set() stores the end offset of the segment
397 | template<typename seg_type, typename output_type> |
398 | __device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i) |
399 | { |
400 | output.template get<0>(i) = seg_next; |
401 | } |
402 | }; |
403 | |
404 | template<unsigned int prp> |
405 | struct snum_ |
406 | { |
407 | typedef boost::mpl::int_<prp> prop; |
408 | |
409 | template<typename red_type> using op_red = mgpu::minimum_t<red_type>; |
410 | |
411 | template<typename red_type> __device__ __host__ static red_type red(red_type & r1, red_type & r2) |
412 | { |
413 | return 0; |
414 | } |
415 | |
416 | static bool is_special() |
417 | { |
418 | return true; |
419 | } |
420 | |
	//! Special reduction: set() stores the number of elements in the segment
422 | template<typename seg_type, typename output_type> |
423 | __device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i) |
424 | { |
425 | output.template get<0>(i) = seg_next - seg_prev; |
426 | } |
427 | }; |
428 | |
429 | |
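/*! \brief Compact the per-pool insert buffers into a single key list
 *
 * Each block handles one pool: the keys stored in vit_block_data (nslot entries per pool)
 * are copied into vit_list_0 starting at the scanned offset vit_block_scan, while
 * vit_list_1 receives the source slot of each key
 */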
430 | template<typename vector_index_type> |
431 | __global__ void construct_insert_list_key_only(vector_index_type vit_block_data, |
432 | vector_index_type vit_block_n, |
433 | vector_index_type vit_block_scan, |
434 | vector_index_type vit_list_0, |
435 | vector_index_type vit_list_1, |
436 | int nslot) |
437 | { |
438 | int n_move = vit_block_n.template get<0>(blockIdx.x); |
439 | int n_block_move = vit_block_n.template get<0>(blockIdx.x) / blockDim.x; |
440 | int start = vit_block_scan.template get<0>(blockIdx.x); |
441 | |
442 | int i = 0; |
443 | for ( ; i < n_block_move ; i++) |
444 | { |
445 | vit_list_0.template get<0>(start + i*blockDim.x + threadIdx.x) = vit_block_data.template get<0>(nslot*blockIdx.x + i*blockDim.x + threadIdx.x); |
446 | vit_list_1.template get<0>(start + i*blockDim.x + threadIdx.x) = nslot*blockIdx.x + i*blockDim.x + threadIdx.x; |
447 | } |
448 | |
449 | // move remaining |
450 | if (threadIdx.x < n_move - i*blockDim.x ) |
451 | { |
452 | vit_list_0.template get<0>(start + i*blockDim.x + threadIdx.x) = vit_block_data.template get<0>(nslot*blockIdx.x + i*blockDim.x + threadIdx.x); |
453 | vit_list_1.template get<0>(start + i*blockDim.x + threadIdx.x) = nslot*blockIdx.x + i*blockDim.x + threadIdx.x; |
454 | } |
455 | } |
456 | |
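/*! \brief Same as construct_insert_list_key_only, but with one thread per slot
 *
 * Thread p handles slot p % nslot of pool p / nslot and copies the key only if that
 * slot is actually filled (intended for small pools)
 */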
457 | template<typename vector_index_type> |
458 | __global__ void construct_insert_list_key_only_small_pool(vector_index_type vit_block_data, |
459 | vector_index_type vit_block_n, |
460 | vector_index_type vit_block_scan, |
461 | vector_index_type vit_list_0, |
462 | vector_index_type vit_list_1, |
463 | int nslot) |
464 | { |
465 | int p = blockIdx.x * blockDim.x + threadIdx.x; |
466 | |
467 | if (p >= vit_block_data.size()) {return;} |
468 | |
469 | int pool_id = p / nslot; |
470 | int thr_id = p % nslot; |
471 | int start = vit_block_scan.template get<0>(pool_id); |
472 | int n = vit_block_scan.template get<0>(pool_id+1) - start; |
473 | |
	// copy only the slots of this pool that are actually filled
475 | if (thr_id < n ) |
476 | { |
477 | vit_list_0.template get<0>(start + thr_id) = vit_block_data.template get<0>(nslot*pool_id + thr_id); |
478 | vit_list_1.template get<0>(start + thr_id) = nslot*pool_id + thr_id; |
479 | } |
480 | } |
481 | |
482 | |
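/*! \brief Compact the per-pool remove buffers into a single key list
 *
 * Like construct_insert_list_key_only, but vit_list_1 stores the destination position
 * in the compacted list instead of the source slot
 */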
483 | template<typename vector_index_type> |
484 | __global__ void construct_remove_list(vector_index_type vit_block_data, |
485 | vector_index_type vit_block_n, |
486 | vector_index_type vit_block_scan, |
487 | vector_index_type vit_list_0, |
488 | vector_index_type vit_list_1, |
489 | int nslot) |
490 | { |
491 | int n_move = vit_block_n.template get<0>(blockIdx.x); |
492 | int n_block_move = vit_block_n.template get<0>(blockIdx.x) / blockDim.x; |
493 | int start = vit_block_scan.template get<0>(blockIdx.x); |
494 | |
495 | int i = 0; |
496 | for ( ; i < n_block_move ; i++) |
497 | { |
498 | vit_list_0.template get<0>(start + i*blockDim.x + threadIdx.x) = vit_block_data.template get<0>(nslot*blockIdx.x + i*blockDim.x + threadIdx.x); |
499 | vit_list_1.template get<0>(start + i*blockDim.x + threadIdx.x) = start + i*blockDim.x + threadIdx.x; |
500 | } |
501 | |
502 | // move remaining |
503 | if (threadIdx.x < n_move - i*blockDim.x ) |
504 | { |
505 | vit_list_0.template get<0>(start + i*blockDim.x + threadIdx.x) = vit_block_data.template get<0>(nslot*blockIdx.x + i*blockDim.x + threadIdx.x); |
506 | vit_list_1.template get<0>(start + i*blockDim.x + threadIdx.x) = start + i*blockDim.x + threadIdx.x; |
507 | } |
508 | } |
509 | |
510 | |
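/*! \brief Functor that merges two elements property by property
 *
 * For each entry of the v_reduce type list it applies the corresponding reduction
 * red_type::red to property red_type::prop of src1 and src2 and stores the result in dst
 */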
511 | template<typename e_type, typename v_reduce> |
512 | struct data_merger |
513 | { |
514 | e_type src1; |
515 | e_type src2; |
516 | |
517 | e_type dst; |
518 | |
519 | __device__ __host__ inline data_merger(const e_type & src1, const e_type & src2, const e_type & dst) |
520 | :src1(src1),src2(src2),dst(dst) |
521 | { |
522 | }; |
523 | |
524 | |
	//! It calls the reduction function for each property
526 | template<typename T> |
527 | __device__ __host__ inline void operator()(T& t) const |
528 | { |
529 | typedef typename boost::mpl::at<v_reduce,T>::type red_type; |
530 | |
531 | dst.template get<red_type::prop::value>() = red_type::red(src1.template get<red_type::prop::value>(),src2.template get<red_type::prop::value>()); |
532 | } |
533 | }; |
534 | |
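/*! \brief Resolve duplicated indexes produced by merging the old and the inserted elements
 *
 * One thread per merged element. An element is final when its index differs from the next
 * one; a block-wide exclusive scan of this predicate gives its output position. Final
 * elements copy their data from vct_data (merge_index < base) or vct_add_data, duplicated
 * ones are combined with the v_reduce operators. vct_tot_out receives, per block, the
 * number of produced elements (property 0) and the predicate of the last thread (property 2)
 */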
535 | template<typename vector_index_type, typename vector_data_type, typename vector_index_type2, unsigned int block_dim, typename ... v_reduce> |
536 | __global__ void solve_conflicts(vector_index_type vct_index, vector_data_type vct_data, |
537 | vector_index_type merge_index, vector_data_type vct_add_data, |
538 | vector_index_type vct_index_out, vector_data_type vct_data_out, |
539 | vector_index_type2 vct_tot_out, |
540 | int base) |
541 | { |
542 | typedef typename std::remove_reference<decltype(vct_index.template get<0>(0))>::type index_type; |
543 | |
	// Specialize BlockScan for a 1D block of block_dim threads on type int
545 | typedef cub::BlockScan<int, block_dim> BlockScan; |
546 | // Allocate shared memory for BlockScan |
547 | __shared__ typename BlockScan::TempStorage temp_storage; |
548 | |
549 | int p = blockIdx.x * blockDim.x + threadIdx.x; |
550 | |
551 | int scan = 0; |
552 | int predicate = 0; |
553 | |
554 | if (p < vct_index.size()) |
555 | { |
556 | index_type id_check = (p == vct_index.size() - 1)?(index_type)-1:vct_index.template get<0>(p+1); |
557 | predicate = vct_index.template get<0>(p) != id_check; |
558 | |
559 | scan = predicate; |
560 | } |
561 | |
	// block-wide exclusive scan in shared memory
563 | BlockScan(temp_storage).ExclusiveSum(scan, scan); |
564 | |
565 | if (predicate == 1 && p < vct_index.size()) |
566 | { |
567 | vct_index_out.template get<0>(blockIdx.x*block_dim + scan) = vct_index.template get<0>(p); |
568 | |
569 | int index1 = merge_index.template get<0>(p); |
570 | |
571 | auto e = vct_data_out.get(blockIdx.x*block_dim + scan); |
572 | |
573 | if (index1 < base) |
574 | { |
575 | e = vct_data.get(index1); |
576 | vct_data_out.get(blockIdx.x*block_dim + scan) = e; |
577 | } |
578 | else |
579 | { |
580 | e = vct_add_data.get(index1 - base); |
581 | vct_data_out.get(blockIdx.x*block_dim + scan) = e; |
582 | } |
583 | } |
584 | |
585 | __syncthreads(); |
586 | |
587 | if (predicate == 0 && p < vct_index.size()) |
588 | { |
589 | //! at the border of the block the index must be copied |
590 | if (threadIdx.x == blockDim.x-1) |
591 | {vct_index_out.template get<0>(blockIdx.x*block_dim + scan) = vct_index.template get<0>(p);} |
592 | |
593 | // we have to merge the data |
594 | |
595 | typedef boost::mpl::vector<v_reduce ...> v_reduce_; |
596 | |
597 | int index1 = merge_index.template get<0>(p); |
598 | int index2 = merge_index.template get<0>(p+1) - base; |
599 | |
600 | data_merger<decltype(vct_data.get(p)),v_reduce_> dm(vct_data.get(index1), |
601 | vct_add_data.get(index2), |
602 | vct_data_out.get(blockIdx.x*block_dim + scan)); |
603 | |
		// apply the reduction operators property by property
605 | boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(v_reduce)>>(dm); |
606 | } |
607 | |
608 | if ((threadIdx.x == blockDim.x - 1 || p == vct_index.size() - 1) && p < vct_index.size()) |
609 | { |
610 | vct_tot_out.template get<0>(blockIdx.x) = scan + predicate; |
611 | vct_tot_out.template get<2>(blockIdx.x) = predicate; |
612 | } |
613 | } |
614 | |
615 | |
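/*! \brief Resolve conflicts for the remove list
 *
 * Keeps only the elements that appear exactly once in the merged index vector and come
 * from the base vector (merge_index < base), compacting their indexes into vct_index_out
 * and their merge positions into vct_index_out_ps; vct_tot_out stores the per-block count
 */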
616 | template<typename vector_index_type, typename vector_index_type2, unsigned int block_dim> |
617 | __global__ void solve_conflicts_remove(vector_index_type vct_index, |
618 | vector_index_type merge_index, |
619 | vector_index_type vct_index_out, |
620 | vector_index_type vct_index_out_ps, |
621 | vector_index_type2 vct_tot_out, |
622 | int base) |
623 | { |
624 | typedef typename std::remove_reference<decltype(vct_index.template get<0>(0))>::type index_type; |
625 | |
	// Specialize BlockScan for a 1D block of block_dim threads on type int
627 | typedef cub::BlockScan<int, block_dim> BlockScan; |
628 | // Allocate shared memory for BlockScan |
629 | __shared__ typename BlockScan::TempStorage temp_storage; |
630 | |
631 | int p = blockIdx.x * blockDim.x + threadIdx.x; |
632 | |
633 | int scan = 0; |
634 | int predicate = 0; |
635 | if (p < vct_index.size()) |
636 | { |
637 | index_type id_check_n = (p == vct_index.size() - 1)?(index_type)-1:vct_index.template get<0>(p+1); |
638 | index_type id_check_p = (p == 0)?(index_type)-1:vct_index.template get<0>(p-1); |
639 | index_type id_check = vct_index.template get<0>(p); |
640 | predicate = id_check != id_check_p; |
641 | predicate &= id_check != id_check_n; |
642 | int mi = merge_index.template get<0>(p); |
643 | predicate &= (mi < base); |
644 | |
645 | scan = predicate; |
646 | } |
	// block-wide exclusive scan in shared memory
648 | BlockScan(temp_storage).ExclusiveSum(scan, scan); |
649 | |
650 | if (predicate == 1 && p < vct_index.size()) |
651 | { |
652 | vct_index_out.template get<0>(blockIdx.x*block_dim + scan) = vct_index.template get<0>(p); |
653 | vct_index_out_ps.template get<0>(blockIdx.x*block_dim + scan) = merge_index.template get<0>(p); |
654 | } |
655 | |
656 | __syncthreads(); |
657 | |
658 | if ((threadIdx.x == blockDim.x - 1 || p == vct_index.size() - 1) && p < vct_index.size()) |
659 | { |
660 | vct_tot_out.template get<0>(blockIdx.x) = scan + predicate; |
661 | } |
662 | } |
663 | |
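/*! \brief Apply a special reduction (sstart_, sstop_, snum_) from the segment offsets
 *
 * For each segment p, red_op::set is called with the offset of the next segment
 * (or max_index for the last one) and the offset of segment p
 */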
664 | template<typename vector_type, typename vector_type2, typename red_op> |
665 | __global__ void reduce_from_offset(vector_type segment_offset, |
666 | vector_type2 output, |
667 | typename std::remove_reference<decltype(segment_offset.template get<1>(0))>::type max_index) |
668 | { |
669 | int p = blockIdx.x * blockDim.x + threadIdx.x; |
670 | |
671 | if (p >= segment_offset.size()) return; |
672 | |
673 | typename std::remove_reference<decltype(segment_offset.template get<1>(0))>::type v; |
674 | if (p == segment_offset.size()-1) |
675 | {v = max_index;} |
676 | else |
677 | {v = segment_offset.template get<1>(p+1);} |
678 | |
679 | red_op::set(v,segment_offset.template get<1>(p),output,p); |
680 | } |
681 | |
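/*! \brief Compact the block-wise output of solve_conflicts into contiguous vectors
 *
 * vct_tot_out_scan carries, per block, the number of produced elements (property 0), the
 * scanned destination offset (property 1) and the predicate of the last thread (property 2)
 */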
682 | template<typename vector_index_type, typename vector_data_type, typename vector_index_type2> |
683 | __global__ void realign(vector_index_type vct_index, vector_data_type vct_data, |
684 | vector_index_type vct_index_out, vector_data_type vct_data_out, |
685 | vector_index_type2 vct_tot_out_scan) |
686 | { |
687 | int p = blockIdx.x * blockDim.x + threadIdx.x; |
688 | |
689 | if (p >= vct_index.size()) return; |
690 | |
691 | int tot = vct_tot_out_scan.template get<0>(blockIdx.x); |
692 | |
	//! It is correct to use > and not >=: the last thread in a block of solve_conflicts always copies its data.
	//! This means that threadIdx.x == tot always has data to copy, independently of whether the indexes are
	//! equal or different
	if (threadIdx.x > tot) // NOTE: see comment above
697 | {return;} |
698 | |
699 | if (threadIdx.x == tot && vct_tot_out_scan.template get<2>(blockIdx.x) == 1) |
700 | {return;} |
701 | |
	// This branch exists when the last thread of the previous block had predicate == 0 in solve_conflicts;
	// in that case thread 0 of this block must not produce any data
704 | if (threadIdx.x == 0 && blockIdx.x != 0 && vct_tot_out_scan.template get<2>(blockIdx.x - 1) == 0) |
705 | {return;} |
706 | |
707 | int ds = vct_tot_out_scan.template get<1>(blockIdx.x); |
708 | |
709 | if (ds + threadIdx.x >= vct_index_out.size()) |
710 | {return;} |
711 | |
712 | vct_index_out.template get<0>(ds+threadIdx.x) = vct_index.template get<0>(p); |
713 | |
714 | auto src = vct_data.get(p); |
715 | auto dst = vct_data_out.get(ds+threadIdx.x); |
716 | |
717 | dst = src; |
718 | } |
719 | |
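/*! \brief Compact the block-wise output of solve_conflicts_remove into contiguous vectors
 *
 * Like realign, but the data are gathered through the index map vct_m_index
 */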
720 | template<typename vector_index_type, typename vct_data_type, typename vector_index_type2> |
721 | __global__ void realign_remove(vector_index_type vct_index, vector_index_type vct_m_index, vct_data_type vct_data, |
722 | vector_index_type vct_index_out, vct_data_type vct_data_out, |
723 | vector_index_type2 vct_tot_out_scan) |
724 | { |
725 | int p = blockIdx.x * blockDim.x + threadIdx.x; |
726 | |
727 | if (p >= vct_index.size()) return; |
728 | |
729 | int tot = vct_tot_out_scan.template get<0>(blockIdx.x); |
730 | |
731 | if (threadIdx.x >= tot) |
732 | {return;} |
733 | |
734 | int ds = vct_tot_out_scan.template get<1>(blockIdx.x); |
735 | |
736 | vct_index_out.template get<0>(ds+threadIdx.x) = vct_index.template get<0>(p); |
737 | |
738 | int oi = vct_m_index.template get<0>(p); |
739 | |
740 | auto src = vct_data.get(oi); |
741 | auto dst = vct_data_out.get(ds+threadIdx.x); |
742 | |
743 | dst = src; |
744 | } |
745 | |
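//! Reorder the data vector according to the index map: v_data_ord[p] = v_data[vi[p]]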
746 | template<typename vector_index_type, typename vector_data_type> |
747 | __global__ void reorder_vector_data(vector_index_type vi, vector_data_type v_data, vector_data_type v_data_ord) |
748 | { |
749 | int p = blockIdx.x * blockDim.x + threadIdx.x; |
750 | |
751 | if (p >= vi.size()) return; |
752 | |
	// reorder the data vector according to the index map vi
754 | |
755 | v_data_ord.get_o(p) = v_data.get_o(vi.template get<0>(p)); |
756 | } |
757 | |
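//! Apply the permutation vi to the segment indexes: seg_out[p] = seg_in[vi[p]]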
758 | template<typename vector_index_type> |
759 | __global__ void reorder_create_index_map(vector_index_type vi, vector_index_type seg_in, vector_index_type seg_out) |
760 | { |
761 | int p = blockIdx.x * blockDim.x + threadIdx.x; |
762 | |
763 | if (p >= vi.size()) return; |
764 | |
	// remap the segment indexes through the permutation vi
766 | |
767 | seg_out.template get<0>(p) = seg_in.template get<0>(vi.template get<0>(p)); |
768 | } |
769 | |
770 | |
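//! Fill property prp of vd with consecutive indexes starting from off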
771 | template<unsigned int prp, typename vector_type> |
772 | __global__ void set_indexes(vector_type vd, int off) |
773 | { |
774 | int p = blockIdx.x * blockDim.x + threadIdx.x; |
775 | |
776 | if (p >= vd.size()) {return;} |
777 | |
778 | vd.template get<prp>(p) = p + off; |
779 | } |
780 | |
781 | #endif |
782 | |
783 | #endif /* MAP_VECTOR_SPARSE_CUDA_KERNELS_CUH_ */ |
784 | |