/*
 * map_vector_sparse.hpp
 *
 *  Created on: Jan 22, 2019
 *      Author: i-bird
 */

#ifndef MAP_VECTOR_SPARSE_HPP_
#define MAP_VECTOR_SPARSE_HPP_

#include "util/cuda_launch.hpp"
#include "Vector/map_vector.hpp"
#include "Vector/cuda/map_vector_sparse_cuda_ker.cuh"
#include "Vector/cuda/map_vector_sparse_cuda_kernels.cuh"
#include "util/cuda/ofp_context.hxx"
#include <iostream>
#include <limits>

#if defined(__NVCC__)
	#if !defined(CUDA_ON_CPU)
		#include "util/cuda/moderngpu/kernel_segreduce.hxx"
		#include "util/cuda/moderngpu/kernel_merge.hxx"
	#endif
	#include "util/cuda/kernels.cuh"
#endif

#include "util/cuda/scan_ofp.cuh"
#include "util/cuda/sort_ofp.cuh"

enum flush_type
{
	FLUSH_ON_HOST = 0,
	FLUSH_ON_DEVICE = 1,
	FLUSH_NO_DATA = 2
};

template<typename OfpmVectorT>
using ValueTypeOf = typename std::remove_reference<OfpmVectorT>::type::value_type;

namespace openfpm
{
	/*! \brief Functor that copies one element, for all properties, from host to device
	 *
	 */
	template<typename sg_type>
	struct htoD
	{
		//! encapsulated source object
		sg_type & sg;

		//! element to copy
		unsigned int lele;

		htoD(sg_type & sg, unsigned int lele)
		:sg(sg),lele(lele)
		{};


		//! It calls the copy function for each property
		template<typename T>
		__device__ __host__ inline void operator()(T& t) const
		{
			sg.template hostToDevice<T::value>(lele,lele);
		}
	};
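
	/* Usage sketch (illustrative, not part of the interface): htoD is meant to
	 * be driven by boost::mpl::for_each_ref over the property range of an
	 * element, exactly as resetBck() does further below for the background
	 * element:
	 *
	 *     htoD<decltype(vct_data)> trf(vct_data,vct_data.size()-1);
	 *     boost::mpl::for_each_ref<boost::mpl::range_c<int,0,T::max_prop>>(trf);
	 *
	 * each invocation of operator() copies one property of element lele from
	 * host to device.
	 */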

	constexpr int VECTOR_SPARSE_STANDARD = 1;
	constexpr int VECTOR_SPARSE_BLOCK = 2;

	template<typename reduction_type, unsigned int impl>
	struct cpu_block_process
	{
		template<typename encap_src, typename encap_dst>
		static inline void process(encap_src & src, encap_dst & dst)
		{
			dst = reduction_type::red(dst,src);
		}
	};

	template<typename reduction_type>
	struct cpu_block_process<reduction_type,VECTOR_SPARSE_BLOCK>
	{
		template<typename encap_src, typename encap_dst>
		static inline void process(encap_src & src, encap_dst & dst)
		{
			for (size_t i = 0 ; i < encap_src::size ; i++)
			{
				dst[i] = reduction_type::red(dst[i],src[i]);
			}
		}
	};
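
	/* Minimal sketch of the reduction-functor shape cpu_block_process expects
	 * (a hypothetical sum reduction; OpenFPM ships ready-made ones such as
	 * sadd_ in map_vector_sparse_cuda_kernels.cuh, this only shows the
	 * contract assumed here):
	 *
	 *     template<unsigned int prp>
	 *     struct my_sum
	 *     {
	 *         typedef boost::mpl::int_<prp> prop;         // property to reduce
	 *
	 *         template<typename red_t>                    // binary op used by the
	 *         struct op_red                               // GPU segreduce path
	 *         {
	 *             __device__ __host__ red_t operator()(const red_t & a, const red_t & b) const
	 *             {return a + b;}
	 *         };
	 *
	 *         template<typename red_t>
	 *         static red_t red(red_t & r1, red_t & r2)    // pairwise reduction (CPU path)
	 *         {return r1 + r2;}
	 *
	 *         static bool is_special()                    // false: handled by the
	 *         {return false;}                             // normal segreduce path
	 *     };
	 */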

	template<typename reduction_type>
	struct cpu_block_process<reduction_type,3>
	{
		template<typename encap_src, typename encap_dst,unsigned int N1>
		static inline void process(encap_src & src, encap_dst (& dst)[N1])
		{
			for (unsigned int j = 0 ; j < N1 ; j++)
			{
				for (size_t i = 0 ; i < encap_dst::size ; i++)
				{
					dst[j][i] = reduction_type::red(dst[j][i],src[j][i]);
				}
			}
		}

		template<unsigned int N1, unsigned int blockSize, typename encap_src, typename encap_dst>
		static inline void process_e(encap_src & src, encap_dst & dst)
		{
			for (unsigned int j = 0 ; j < N1 ; j++)
			{
				for (size_t i = 0 ; i < blockSize ; i++)
				{
					dst[j][i] = reduction_type::red(dst[j][i],src[j][i]);
				}
			}
		}
	};

	/*! \brief Functor switch to select the sparse vector implementation for the standard (scalar) and blocked cases
	 *
	 *
	 */
	template<unsigned int impl, typename block_functor>
	struct scalar_block_implementation_switch // Case for scalar implementations
	{
		template <unsigned int p, typename vector_index_type>
		static void extendSegments(vector_index_type & segments, size_t dataSize)
		{
#ifdef __NVCC__
			// Pass, as there is nothing to append for mgpu
#else // __NVCC__
			std::cout << __FILE__ << ":" << __LINE__ << " error: this file is supposed to be compiled with nvcc" << std::endl;
#endif // __NVCC__
		}

		template <unsigned int pSegment, typename vector_reduction, typename T, typename vector_data_type, typename vector_index_type , typename vector_index_type2>
		static void segreduce(vector_data_type & vector_data,
				vector_data_type & vector_data_unsorted,
				vector_index_type & vector_data_map,
				vector_index_type2 & segment_offset,
				vector_data_type & vector_data_red,
				block_functor & blf,
				mgpu::ofp_context_t & context)
		{
#ifdef __NVCC__
			typedef typename boost::mpl::at<vector_reduction, T>::type reduction_type;
			typedef typename boost::mpl::at<typename vector_data_type::value_type::type,typename reduction_type::prop>::type red_type;
			typedef typename reduction_type::template op_red<red_type> red_op;
			typedef typename boost::mpl::at<typename vector_index_type::value_type::type,boost::mpl::int_<0>>::type seg_type;
			red_type init;
			init = 0;

			assert((std::is_same<seg_type,int>::value == true));

			mgpu::segreduce(
					(red_type *)vector_data.template getDeviceBuffer<reduction_type::prop::value>(), vector_data.size(),
					(int *)segment_offset.template getDeviceBuffer<1>(), segment_offset.size(),
					(red_type *)vector_data_red.template getDeviceBuffer<reduction_type::prop::value>(),
					red_op(), init, context);
#else // __NVCC__
			std::cout << __FILE__ << ":" << __LINE__ << " error: this file is supposed to be compiled with nvcc" << std::endl;
#endif // __NVCC__
		}


		/*! \brief Merge all the data
		 *
		 * \param vct_index_old sorted vector of the old indexes
		 * \param vct_index_merge old and new indexes merged, with conflicts
		 * \param vct_index_merge_id indicates where each merged index comes from (if the number is bigger than vct_index_old.size()
		 *        the merged index comes from the new data)
		 * \param vct_index_out output indexes, merged new and old
		 * \param vct_index_dtmp temporary buffer vector used for intermediate calculation
		 * \param vct_data_old vector of old data
		 * \param vct_add_data data to add in its original form in the insert buffer
		 * \param vct_add_data_unique data to add (already reduced)
		 * \param vct_data_out reduced data vector, new + old
		 *
		 */
		template <
				typename vector_data_type,
				typename vector_index_type,
				typename vector_index_type2,
				typename vector_index_dtmp_type,
				typename Ti,
				typename ... v_reduce>
		static void solveConflicts(
				vector_index_type & vct_index_old,
				vector_index_type & vct_index_merge,
				vector_index_type & vct_index_merge_id,
				vector_index_type & vct_index_out,
				vector_index_dtmp_type & vct_index_dtmp,
				vector_index_type & data_map,
				vector_index_type2 & segments_new,
				vector_data_type & vct_data_old,
				vector_data_type & vct_add_data,
				vector_data_type & vct_add_data_unique,
				vector_data_type & vct_data_out,
				ite_gpu<1> & itew,
				block_functor & blf,
				mgpu::ofp_context_t & context
				)
		{
#ifdef __NVCC__

			CUDA_LAUNCH((solve_conflicts<
							decltype(vct_index_merge.toKernel()),
							decltype(vct_data_old.toKernel()),
							decltype(vct_index_dtmp.toKernel()),
							128,
							v_reduce ...
							>),
					itew,
					vct_index_merge.toKernel(),vct_data_old.toKernel(),
					vct_index_merge_id.toKernel(),vct_add_data_unique.toKernel(),
					vct_index_out.toKernel(),vct_data_out.toKernel(),
					vct_index_dtmp.toKernel(),
					vct_index_old.size());

			// we scan tmp3
			openfpm::scan(
					(Ti*)vct_index_dtmp.template getDeviceBuffer<0>(),
					vct_index_dtmp.size(),
					(Ti *)vct_index_dtmp.template getDeviceBuffer<1>(),
					context);

			// get the size to resize vct_index and vct_data
			vct_index_dtmp.template deviceToHost<0,1>(vct_index_dtmp.size()-1,vct_index_dtmp.size()-1);
			int size = vct_index_dtmp.template get<1>(vct_index_dtmp.size()-1) + vct_index_dtmp.template get<0>(vct_index_dtmp.size()-1);

			vct_index_old.resize(size);
			vct_data_old.resize(size);

			CUDA_LAUNCH(realign,itew,vct_index_out.toKernel(),vct_data_out.toKernel(),
					vct_index_old.toKernel(), vct_data_old.toKernel(),
					vct_index_dtmp.toKernel());


#else // __NVCC__
			std::cout << __FILE__ << ":" << __LINE__ << " error: this file is supposed to be compiled with nvcc" << std::endl;
#endif // __NVCC__
		}
	};


	template<typename block_functor>
	struct scalar_block_implementation_switch<2, block_functor> // Case for blocked implementations
	{
		template <unsigned int p, typename vector_index_type>
		static void extendSegments(vector_index_type & segments, size_t dataSize)
		{
#ifdef __NVCC__
			// Append trailing element to segment (marks end of last segment)
			segments.resize(segments.size()+1);
			segments.template get<p>(segments.size() - 1) = dataSize;
			segments.template hostToDevice<p>(segments.size() - 1, segments.size() - 1);
#else // __NVCC__
			std::cout << __FILE__ << ":" << __LINE__ << " error: this file is supposed to be compiled with nvcc" << std::endl;
#endif // __NVCC__
		}

		template <unsigned int pSegment, typename vector_reduction, typename T, typename vector_data_type, typename vector_index_type ,typename vector_index_type2>
		static void segreduce(vector_data_type & vector_data,
				vector_data_type & vector_data_unsorted,
				vector_index_type & vector_data_map,
				vector_index_type2 & segment_offset,
				vector_data_type & vector_data_red,
				block_functor & blf,
				mgpu::ofp_context_t & context)
		{

		}

		template <
				typename vector_data_type,
				typename vector_index_type,
				typename vector_index_type2,
				typename vector_index_dtmp_type,
				typename Ti,
				typename ... v_reduce>
		static void solveConflicts(
				vector_index_type & vct_index_old,
				vector_index_type & vct_index_merge,
				vector_index_type & vct_index_merge_id,
				vector_index_type & vct_index_out,
				vector_index_dtmp_type & vct_index_dtmp,
				vector_index_type & data_map,
				vector_index_type2 & segments_new,
				vector_data_type & vct_data,
				vector_data_type & vct_add_data,
				vector_data_type & vct_add_data_unique,
				vector_data_type & vct_data_out,
				ite_gpu<1> & itew,
				block_functor & blf,
				mgpu::ofp_context_t & context
				)
		{
#ifdef __NVCC__
			blf.template solve_conflicts<1,
					decltype(vct_index_merge),
					decltype(segments_new),
					decltype(vct_data),
					v_reduce ...>
					(vct_index_merge, vct_index_merge_id, segments_new, data_map,
					vct_data, vct_add_data,
					vct_index_old, vct_data_out,
					context);
			vct_data_out.swap(vct_data);

#else // __NVCC__
			std::cout << __FILE__ << ":" << __LINE__ << " error: this file is supposed to be compiled with nvcc" << std::endl;
#endif // __NVCC__
		}
	};

	template<typename Ti>
	struct reorder
	{
		Ti id;
		Ti id2;

		bool operator<(const reorder & t) const
		{
			return id < t.id;
		}
	};
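
	/* reorder pairs an inserted key (id) with its original position in the
	 * insert buffer (id2), so that sorting by id keeps track of where each
	 * datum came from. For example, inserting keys [7,3,7] produces
	 * [(7,0),(3,1),(7,2)], which sorts to [(3,1),(7,0),(7,2)]: equal ids end
	 * up adjacent (ready for reduction) and id2 still points back into
	 * vct_add_data. See flush_on_cpu() below.
	 */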

	template<typename reduction_type, typename vector_reduction, typename T,unsigned int impl, typename red_type>
	struct sparse_vector_reduction_cpu_impl
	{
		template<typename vector_data_type, typename vector_index_type,typename vector_index_type_reo>
		static inline void red(size_t & i, vector_data_type & vector_data_red,
				vector_data_type & vector_data,
				vector_index_type & vector_index,
				vector_index_type_reo & reorder_add_index_cpu)
		{
			size_t start = reorder_add_index_cpu.get(i).id;
			red_type red = vector_data.template get<reduction_type::prop::value>(i);

			size_t j = 1;
			for ( ; i+j < reorder_add_index_cpu.size() && reorder_add_index_cpu.get(i+j).id == start ; j++)
			{
				cpu_block_process<reduction_type,impl>::process(vector_data.template get<reduction_type::prop::value>(i+j),red);
				//reduction_type::red(red,vector_data.template get<reduction_type::prop::value>(i+j));
			}
			vector_data_red.add();
			vector_data_red.template get<reduction_type::prop::value>(vector_data_red.size()-1) = red;

			if (T::value == 0)
			{
				vector_index.add();
				vector_index.template get<0>(vector_index.size() - 1) = reorder_add_index_cpu.get(i).id;
			}

			i += j;
		}
	};


	template<typename reduction_type, typename vector_reduction, typename T,unsigned int impl, typename red_type, unsigned int N1>
	struct sparse_vector_reduction_cpu_impl<reduction_type,vector_reduction,T,impl,red_type[N1]>
	{
		template<typename vector_data_type, typename vector_index_type,typename vector_index_type_reo>
		static inline void red(size_t & i, vector_data_type & vector_data_red,
				vector_data_type & vector_data,
				vector_index_type & vector_index,
				vector_index_type_reo & reorder_add_index_cpu)
		{
			size_t start = reorder_add_index_cpu.get(i).id;
			red_type red[N1];

			for (size_t k = 0 ; k < N1 ; k++)
			{
				red[k] = vector_data.template get<reduction_type::prop::value>(i)[k];
			}

			size_t j = 1;
			for ( ; i+j < reorder_add_index_cpu.size() && reorder_add_index_cpu.get(i+j).id == start ; j++)
			{
				auto ev = vector_data.template get<reduction_type::prop::value>(i+j);
				cpu_block_process<reduction_type,impl+1>::process(ev,red);
				//reduction_type::red(red,vector_data.template get<reduction_type::prop::value>(i+j));
			}

			vector_data_red.add();

			for (size_t k = 0 ; k < N1 ; k++)
			{
				vector_data_red.template get<reduction_type::prop::value>(vector_data_red.size()-1)[k] = red[k];
			}

			if (T::value == 0)
			{
				vector_index.add();
				vector_index.template get<0>(vector_index.size() - 1) = reorder_add_index_cpu.get(i).id;
			}

			i += j;
		}
	};

	/*! \brief this class is a functor for the "for_each" algorithm
	 *
	 * This class is a functor for the "for_each" algorithm. For each
	 * element of the boost::vector the operator() is called.
	 * It is mainly used to copy one encap into another encap object
	 *
	 * \tparam encap source
	 * \tparam encap dst
	 *
	 */
	template<typename vector_data_type,
			typename vector_index_type,
			typename vector_index_type_reo,
			typename vector_reduction,
			unsigned int impl>
	struct sparse_vector_reduction_cpu
	{
		//! Vector in which to store the result of the reduction
		vector_data_type & vector_data_red;

		//! Vector with the data to reduce
		vector_data_type & vector_data;

		//! reordered vector of the indexes to add
		vector_index_type_reo & reorder_add_index_cpu;

		//! Index type vector
		vector_index_type & vector_index;

		/*! \brief constructor
		 *
		 * \param vector_data_red vector in which to store the result of the reduction
		 * \param vector_data vector with the data to reduce
		 * \param vector_index index type vector
		 * \param reorder_add_index_cpu reordered vector of the indexes to add
		 *
		 */
		inline sparse_vector_reduction_cpu(vector_data_type & vector_data_red,
				vector_data_type & vector_data,
				vector_index_type & vector_index,
				vector_index_type_reo & reorder_add_index_cpu)
		:vector_data_red(vector_data_red),vector_data(vector_data),vector_index(vector_index),reorder_add_index_cpu(reorder_add_index_cpu)
		{};

		//! It calls the reduction function for each property
		template<typename T>
		inline void operator()(T& t) const
		{
			typedef typename boost::mpl::at<vector_reduction, T>::type reduction_type;
			typedef typename boost::mpl::at<typename ValueTypeOf<vector_data_type>::type,typename reduction_type::prop>::type red_type;

			if (reduction_type::is_special() == false)
			{
				for (size_t i = 0 ; i < reorder_add_index_cpu.size() ; )
				{
					sparse_vector_reduction_cpu_impl<reduction_type,vector_reduction,T,impl,red_type>::red(i,vector_data_red,vector_data,vector_index,reorder_add_index_cpu);

/*					size_t start = reorder_add_index_cpu.get(i).id;
					red_type red = vector_data.template get<reduction_type::prop::value>(i);

					size_t j = 1;
					for ( ; i+j < reorder_add_index_cpu.size() && reorder_add_index_cpu.get(i+j).id == start ; j++)
					{
						cpu_block_process<reduction_type,impl>::process(vector_data.template get<reduction_type::prop::value>(i+j),red);
						//reduction_type::red(red,vector_data.template get<reduction_type::prop::value>(i+j));
					}
					vector_data_red.add();
					vector_data_red.template get<reduction_type::prop::value>(vector_data_red.size()-1) = red;

					if (T::value == 0)
					{
						vector_index.add();
						vector_index.template get<0>(vector_index.size() - 1) = reorder_add_index_cpu.get(i).id;
					}

					i += j;*/
				}
			}
		}
	};

	/*! \brief this class is a functor for the "for_each" algorithm
	 *
	 * This class is a functor for the "for_each" algorithm. For each
	 * element of the boost::vector the operator() is called.
	 * It is mainly used to copy one encap into another encap object
	 *
	 * \tparam encap source
	 * \tparam encap dst
	 *
	 */
	template<typename encap_src,
			typename encap_dst,
			typename vector_reduction>
	struct sparse_vector_reduction_solve_conflict_assign_cpu
	{
		//! source
		encap_src & src;

		//! destination
		encap_dst & dst;


		/*! \brief constructor
		 *
		 * \param src source encapsulated object
		 * \param dst destination encapsulated object
		 *
		 */
		inline sparse_vector_reduction_solve_conflict_assign_cpu(encap_src & src, encap_dst & dst)
		:src(src),dst(dst)
		{};

		//! It calls the copy function for each property
		template<typename T>
		inline void operator()(T& t) const
		{
			typedef typename boost::mpl::at<vector_reduction, T>::type reduction_type;

			dst.template get<reduction_type::prop::value>() = src.template get<reduction_type::prop::value>();
		}
	};


	template<unsigned int impl,typename vector_reduction, typename T,typename red_type>
	struct sparse_vector_reduction_solve_conflict_reduce_cpu_impl
	{
		template<typename encap_src, typename encap_dst>
		static inline void red(encap_src & src, encap_dst & dst)
		{
			typedef typename boost::mpl::at<vector_reduction, T>::type reduction_type;

			cpu_block_process<reduction_type,impl>::process(src.template get<reduction_type::prop::value>(),dst.template get<reduction_type::prop::value>());
		}
	};

	template<unsigned int impl, typename vector_reduction, typename T,typename red_type, unsigned int N1>
	struct sparse_vector_reduction_solve_conflict_reduce_cpu_impl<impl,vector_reduction,T,red_type[N1]>
	{
		template<typename encap_src, typename encap_dst>
		static inline void red(encap_src & src, encap_dst & dst)
		{
			typedef typename boost::mpl::at<vector_reduction, T>::type reduction_type;

			auto src_e = src.template get<reduction_type::prop::value>();
			auto dst_e = dst.template get<reduction_type::prop::value>();

			cpu_block_process<reduction_type,impl+1>::template process_e<N1,red_type::size>(src_e,dst_e);
		}
	};

	/*! \brief this class is a functor for the "for_each" algorithm
	 *
	 * This class is a functor for the "for_each" algorithm. For each
	 * element of the boost::vector the operator() is called.
	 * It is mainly used to reduce one encap into another encap object
	 *
	 * \tparam encap source
	 * \tparam encap dst
	 *
	 */
	template<typename encap_src,
			typename encap_dst,
			typename vector_reduction,
			unsigned int impl>
	struct sparse_vector_reduction_solve_conflict_reduce_cpu
	{
		//! source
		encap_src & src;

		//! destination
		encap_dst & dst;


		/*! \brief constructor
		 *
		 * \param src source encapsulated object
		 * \param dst destination encapsulated object
		 *
		 */
		inline sparse_vector_reduction_solve_conflict_reduce_cpu(encap_src & src, encap_dst & dst)
		:src(src),dst(dst)
		{};

		//! It calls the reduction function for each property
		template<typename T>
		inline void operator()(T& t) const
		{
			typedef typename boost::mpl::at<vector_reduction, T>::type reduction_type;
			typedef typename boost::mpl::at<typename encap_src::T_type::type, typename reduction_type::prop>::type red_type;

			sparse_vector_reduction_solve_conflict_reduce_cpu_impl<impl,vector_reduction,T,red_type>::red(src,dst);

//			cpu_block_process<reduction_type,impl>::process(src.template get<reduction_type::prop::value>(),dst.template get<reduction_type::prop::value>());
//			reduction_type::red(dst.template get<reduction_type::prop::value>(),src.template get<reduction_type::prop::value>());
		}
	};

	/*! \brief this class is a functor for the "for_each" algorithm
	 *
	 * This class is a functor for the "for_each" algorithm. For each
	 * element of the boost::vector the operator() is called.
	 * It is mainly used to run the segmented reduction for each property
	 *
	 * \tparam encap source
	 * \tparam encap dst
	 *
	 */
	template<typename vector_data_type,
			typename vector_index_type,
			typename vector_index_type2,
			typename vector_reduction,
			typename block_functor,
			unsigned int impl2, unsigned int pSegment=1>
	struct sparse_vector_reduction
	{
		//! Vector in which to store the result of the reduction
		vector_data_type & vector_data_red;

		//! new data
		vector_data_type & vector_data;

		//! new data in an unsorted way
		vector_data_type & vector_data_unsorted;

		//! segment of offsets
		vector_index_type2 & segment_offset;

		//! map of the data
		vector_index_type & vector_data_map;

		//! block functor
		block_functor & blf;

		//! gpu context
		mgpu::ofp_context_t & context;

		/*! \brief constructor
		 *
		 * \param vector_data_red vector in which to store the result of the reduction
		 * \param vector_data new data
		 * \param vector_data_unsorted new data in an unsorted way
		 * \param vector_data_map map of the data
		 * \param segment_offset segment of offsets
		 * \param blf block functor
		 * \param context gpu context
		 *
		 */
		inline sparse_vector_reduction(vector_data_type & vector_data_red,
				vector_data_type & vector_data,
				vector_data_type & vector_data_unsorted,
				vector_index_type & vector_data_map,
				vector_index_type2 & segment_offset,
				block_functor & blf,
				mgpu::ofp_context_t & context)
		:vector_data_red(vector_data_red),
		 vector_data(vector_data),
		 vector_data_unsorted(vector_data_unsorted),
		 segment_offset(segment_offset),
		 vector_data_map(vector_data_map),
		 blf(blf),
		 context(context)
		{};

		//! It calls the segmented reduction for each property
		template<typename T>
		inline void operator()(T& t) const
		{
#ifdef __NVCC__

			typedef typename boost::mpl::at<vector_reduction, T>::type reduction_type;
			typedef typename boost::mpl::at<typename ValueTypeOf<vector_data_type>::type,typename reduction_type::prop>::type red_type;
			if (reduction_type::is_special() == false)
			{
				scalar_block_implementation_switch<impl2, block_functor>::template segreduce<pSegment, vector_reduction, T>(
						vector_data,
						vector_data_unsorted,
						vector_data_map,
						segment_offset,
						vector_data_red,
						blf,
						context);
			}
#else
			std::cout << __FILE__ << ":" << __LINE__ << " error: this file is supposed to be compiled with nvcc" << std::endl;
#endif
		}
	};


	struct stub_block_functor
	{
		template<unsigned int pSegment, typename vector_reduction, typename T, typename vector_index_type, typename vector_data_type>
		static bool seg_reduce(vector_index_type & segments, vector_data_type & src, vector_data_type & dst)
		{
			return true;
		}

		template<typename vector_index_type, typename vector_data_type, typename ... v_reduce>
		static bool solve_conflicts(vector_index_type &keys, vector_index_type &merge_indices,
				vector_data_type &data1, vector_data_type &data2,
				vector_index_type &indices_tmp, vector_data_type &data_tmp,
				vector_index_type &keysOut, vector_data_type &dataOut,
				mgpu::ofp_context_t & context)
		{
			return true;
		}

		openfpm::vector_gpu<aggregate<unsigned int>> outputMap;

		openfpm::vector_gpu<aggregate<unsigned int>> & get_outputMap()
		{
			return outputMap;
		}

		const openfpm::vector_gpu<aggregate<unsigned int>> & get_outputMap() const
		{
			return outputMap;
		}
	};

	/*! \brief this class is a functor for the "for_each" algorithm
	 *
	 * This class is a functor for the "for_each" algorithm. For each
	 * element of the boost::vector the operator() is called.
	 * It is mainly used to run the special reductions for each property
	 *
	 * \tparam encap source
	 * \tparam encap dst
	 *
	 */
	template<typename vector_data_type, typename vector_index_type, typename vector_reduction>
	struct sparse_vector_special
	{
		//! Vector in which to store the result of the reduction
		vector_data_type & vector_data_red;

		//! Vector with the data to reduce
		vector_data_type & vector_data;

		//! segment of offsets
		vector_index_type & segment_offset;

		//! gpu context
		mgpu::ofp_context_t & context;

		/*! \brief constructor
		 *
		 * \param vector_data_red vector in which to store the result of the reduction
		 * \param vector_data vector with the data to reduce
		 * \param segment_offset segment of offsets
		 * \param context gpu context
		 *
		 */
		inline sparse_vector_special(vector_data_type & vector_data_red,
				vector_data_type & vector_data,
				vector_index_type & segment_offset,
				mgpu::ofp_context_t & context)
		:vector_data_red(vector_data_red),vector_data(vector_data),segment_offset(segment_offset),context(context)
		{};

		//! It calls the special reduction for each property
		template<typename T>
		inline void operator()(T& t) const
		{
#ifdef __NVCC__

			typedef typename boost::mpl::at<vector_reduction,T>::type reduction_type;

			// reduction type
			typedef typename boost::mpl::at<typename vector_data_type::value_type::type,typename reduction_type::prop>::type red_type;

			if (reduction_type::is_special() == true)
			{
				auto ite = segment_offset.getGPUIterator();

				CUDA_LAUNCH((reduce_from_offset<decltype(segment_offset.toKernel()),decltype(vector_data_red.toKernel()),reduction_type>),
						ite,segment_offset.toKernel(),vector_data_red.toKernel(),vector_data.size());
			}

#else
			std::cout << __FILE__ << ":" << __LINE__ << " error: this file is supposed to be compiled with nvcc" << std::endl;
#endif
		}
	};

	template<typename T,
			typename Ti = long int,
			typename Memory=HeapMemory,
			typename layout=typename memory_traits_lin<T>::type,
			template<typename> class layout_base=memory_traits_lin ,
			typename grow_p=grow_policy_double,
			unsigned int impl=vect_isel<T>::value,
			unsigned int impl2 = VECTOR_SPARSE_STANDARD,
			typename block_functor = stub_block_functor>
	class vector_sparse
	{
		vector<aggregate<Ti>,Memory,layout_base,grow_p> vct_index;
		vector<T,Memory,layout_base,grow_p,impl> vct_data;
		vector<aggregate<Ti>,Memory,layout_base,grow_p> vct_m_index;

		vector<aggregate<Ti>,Memory,layout_base,grow_p> vct_add_index;
		vector<aggregate<Ti>,Memory,layout_base,grow_p> vct_rem_index;
		vector<aggregate<Ti>,Memory,layout_base,grow_p> vct_nadd_index;
		vector<aggregate<Ti>,Memory,layout_base,grow_p> vct_nrem_index;
		vector<T,Memory,layout_base,grow_p> vct_add_data;
		vector<T,Memory,layout_base,grow_p> vct_add_data_reord;

		vector<aggregate<Ti>,Memory,layout_base,grow_p> vct_add_index_cont_0;
		vector<aggregate<Ti>,Memory,layout_base,grow_p> vct_add_index_cont_1;
		vector<T,Memory,layout_base,grow_p> vct_add_data_cont;
		vector<aggregate<Ti,Ti>,Memory,layout_base,grow_p> vct_add_index_unique;
		vector<aggregate<int,int>,Memory,layout_base,grow_p> segments_int;

		vector<T,Memory,layout_base,grow_p,impl> vct_add_data_unique;

		vector<aggregate<Ti>,Memory,layout_base,grow_p> vct_index_tmp4;
		vector<aggregate<Ti>,Memory,layout_base,grow_p> vct_index_tmp;
		vector<aggregate<Ti>,Memory,layout_base,grow_p> vct_index_tmp2;
		vector<aggregate<Ti>,Memory,layout_base,grow_p> vct_index_tmp3;
		vector<aggregate<Ti,Ti,Ti>,Memory,layout_base,grow_p> vct_index_dtmp;

		// segments map (This is used only in case of Blocked data)
		vector<aggregate<Ti>,Memory,layout_base,grow_p> vct_segment_index_map;

		block_functor blf;

		T bck;

		CudaMemory mem;

		openfpm::vector<reorder<Ti>> reorder_add_index_cpu;

		size_t max_ele;

		int n_gpu_add_block_slot = 0;
		int n_gpu_rem_block_slot = 0;

		/*! \brief Branch-free binary search for the element x
		 *
		 * \param x element to search
		 * \param id output, position in vct_index where x is (or would be)
		 *
		 * \return the index value found at that position, or -1 past the end
		 */
		template<bool prefetch>
		inline Ti _branchfree_search_nobck(Ti x, Ti & id) const
		{
			if (vct_index.size() == 0)	{id = 0; return -1;}
			const Ti *base = &vct_index.template get<0>(0);
			const Ti *end = (const Ti *)vct_index.template getPointer<0>() + vct_index.size();
			Ti n = vct_data.size()-1;
			while (n > 1)
			{
				Ti half = n / 2;
				if (prefetch)
				{
					__builtin_prefetch(base + half/2, 0, 0);
					__builtin_prefetch(base + half + half/2, 0, 0);
				}
				base = (base[half] < x) ? base+half : base;
				n -= half;
			}

			int off = (*base < x);
			id = base - &vct_index.template get<0>(0) + off;
			return (base + off != end)?*(base + off):-1;
		}
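
		/* Worked example (illustrative): with vct_index = [2,5,9] the loop
		 * halves n without branching on the comparison outcome, so searching
		 * x = 5 returns 5 and sets id = 1; searching x = 6 returns the next
		 * larger index 9 with id = 2; and a query past the end (x = 10)
		 * returns -1 with id = vct_index.size(). The final `off = (*base < x)`
		 * performs the last correction step of the branch-free lower bound.
		 */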

		/*! \brief Branch-free binary search for the element x
		 *
		 * \param x element to search
		 * \param id output, position of x in vct_index, or the position of the
		 *        background element (vct_data.size()-1) if x is not present
		 */
		template<bool prefetch>
		inline void _branchfree_search(Ti x, Ti & id) const
		{
			Ti v = _branchfree_search_nobck<prefetch>(x,id);
			id = (x == v)?id:vct_data.size()-1;
		}


		/*! \brief take the indexes from the insertion pools and create a continuous array
		 *
		 * \param vct_nadd_index number of insertions of each pool
		 * \param vct_add_index pool of inserts
		 * \param vct_add_cont_index output continuous array of inserted indexes
		 * \param vct_add_cont_index_map reference to the original indexes
		 * \param vct_add_data array of added data
		 * \param vct_add_data_cont continuous array of inserted data
		 * \param context mgpu context
		 *
		 */
		size_t make_continuos(vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_nadd_index,
				vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_add_index,
				vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_add_cont_index,
				vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_add_cont_index_map,
				vector<T,Memory,layout_base,grow_p> & vct_add_data,
				vector<T,Memory,layout_base,grow_p> & vct_add_data_cont,
				mgpu::ofp_context_t & context)
		{
#ifdef __NVCC__

			// Append a 0 element at the end of vct_nadd_index
			vct_nadd_index.resize(vct_nadd_index.size()+1);
			vct_nadd_index.template get<0>(vct_nadd_index.size()-1) = 0;
			vct_nadd_index.template hostToDevice<0>(vct_nadd_index.size()-1,vct_nadd_index.size()-1);

			// Merge the list of inserted points for each block
			vct_index_tmp4.resize(vct_nadd_index.size());

			openfpm::scan((Ti *)vct_nadd_index.template getDeviceBuffer<0>(),
					vct_nadd_index.size(),
					(Ti *)vct_index_tmp4.template getDeviceBuffer<0>() ,
					context);

			vct_index_tmp4.template deviceToHost<0>(vct_index_tmp4.size()-1,vct_index_tmp4.size()-1);
			size_t n_ele = vct_index_tmp4.template get<0>(vct_index_tmp4.size()-1);

			// we reuse vct_nadd_index
			vct_add_cont_index.resize(n_ele);
			vct_add_cont_index_map.resize(n_ele);

			if (impl2 == VECTOR_SPARSE_STANDARD)
			{
				vct_add_data_cont.resize(n_ele);
			}
			else
			{
				vct_segment_index_map.resize(n_ele);
			}

			if (n_gpu_add_block_slot >= 128)
			{
				ite_gpu<1> itew;
				itew.wthr.x = vct_nadd_index.size()-1;
				itew.wthr.y = 1;
				itew.wthr.z = 1;
				itew.thr.x = 128;
				itew.thr.y = 1;
				itew.thr.z = 1;

				CUDA_LAUNCH(construct_insert_list_key_only,itew,vct_add_index.toKernel(),
						vct_nadd_index.toKernel(),
						vct_index_tmp4.toKernel(),
						vct_add_cont_index.toKernel(),
						vct_add_cont_index_map.toKernel(),
						n_gpu_add_block_slot);
			}
			else
			{
				auto itew = vct_add_index.getGPUIterator();

				CUDA_LAUNCH(construct_insert_list_key_only_small_pool,itew,vct_add_index.toKernel(),
						vct_nadd_index.toKernel(),
						vct_index_tmp4.toKernel(),
						vct_add_cont_index.toKernel(),
						vct_add_cont_index_map.toKernel(),
						n_gpu_add_block_slot);
			}

			return n_ele;
#endif
			return 0;
		}
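
		/* Worked example (illustrative): with 4 GPU blocks that inserted
		 * [3,1,2,0] elements respectively, vct_nadd_index becomes [3,1,2,0,0]
		 * after the trailing zero is appended, and the exclusive scan writes
		 * [0,3,4,6,6] into vct_index_tmp4: entry i is where pool i starts in
		 * the continuous array, and the last entry (6) is the total n_ele.
		 */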

		/*! \brief sort the continuous array of inserted keys
		 *
		 * \param context modern gpu context
		 * \param vct_add_cont_index array of indexes (unsorted), sorted on output
		 * \param vct_add_cont_index_map reference to the original indexes
		 * \param vct_add_data_reord sorted data output
		 * \param vct_add_data_cont added data in a continuous unsorted array
		 *
		 */
		void reorder_indexes(vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_add_cont_index,
				vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_add_cont_index_map,
				vector<T,Memory,layout_base,grow_p> & vct_add_data_reord,
				vector<T,Memory,layout_base,grow_p> & vct_add_data_cont,
				mgpu::ofp_context_t & context)
		{
#ifdef __NVCC__
			ite_gpu<1> itew;
			itew.wthr.x = vct_nadd_index.size()-1;
			itew.wthr.y = 1;
			itew.wthr.z = 1;
			itew.thr.x = 128;
			itew.thr.y = 1;
			itew.thr.z = 1;

			size_t n_ele = vct_add_cont_index.size();

			n_gpu_add_block_slot = 0;

			// now we sort
			openfpm::sort(
					(Ti *)vct_add_cont_index.template getDeviceBuffer<0>(),
					(Ti *)vct_add_cont_index_map.template getDeviceBuffer<0>(),
					vct_add_cont_index.size(),
					mgpu::template less_t<Ti>(),
					context);

			auto ite = vct_add_cont_index.getGPUIterator();

			// Now we reorder the data vector according to the indexes

			if (impl2 == VECTOR_SPARSE_STANDARD)
			{
				vct_add_data_reord.resize(n_ele);
				CUDA_LAUNCH(reorder_vector_data,ite,vct_add_cont_index_map.toKernel(),vct_add_data_cont.toKernel(),vct_add_data_reord.toKernel());
			}

#endif
		}
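
		/* Worked example (illustrative): with continuous keys [7,3,7] and map
		 * [0,1,2], the key/value sort produces keys [3,7,7] and map [1,0,2];
		 * reorder_vector_data then gathers vct_add_data_cont through the map
		 * so that vct_add_data_reord[k] holds the datum originally inserted
		 * with the k-th smallest key. Duplicated keys stay adjacent, ready
		 * for the segmented reduction.
		 */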

		/*! \brief Merge indexes
		 *
		 * \param vct_add_index_sort sorted vector of the indexes to add
		 * \param vct_add_index_unique output vector of the unique indexes to add
		 * \param vct_merge_index output vector of the old and new indexes merged
		 * \param vct_merge_index_map provenance of each merged index (>= vct_index.size() means it comes from the new data)
		 * \param context mgpu context
		 *
		 */
		template<typename ... v_reduce>
		void merge_indexes(vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_add_index_sort,
				vector<aggregate<Ti,Ti>,Memory,layout_base,grow_p> & vct_add_index_unique,
				vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_merge_index,
				vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_merge_index_map,
				mgpu::ofp_context_t & context)
		{
#ifdef __NVCC__

			typedef boost::mpl::vector<v_reduce...> vv_reduce;

			auto ite = vct_add_index_sort.getGPUIterator();

			mem.allocate(sizeof(int));
			mem.fill(0);
			vct_add_index_unique.resize(vct_add_index_sort.size()+1);

			ite = vct_add_index_sort.getGPUIterator();

			vct_index_tmp4.resize(vct_add_index_sort.size()+1);

			CUDA_LAUNCH(
					(
					find_buffer_offsets_for_scan
							<0,
							decltype(vct_add_index_sort.toKernel()),
							decltype(vct_index_tmp4.toKernel())
							>
					),
					ite,
					vct_add_index_sort.toKernel(),
					vct_index_tmp4.toKernel());

			openfpm::scan((Ti *)vct_index_tmp4.template getDeviceBuffer<0>(),vct_index_tmp4.size(),(Ti *)vct_index_tmp4.template getDeviceBuffer<0>(),context);

			vct_index_tmp4.template deviceToHost<0>(vct_index_tmp4.size()-1,vct_index_tmp4.size()-1);
			int n_ele_unique = vct_index_tmp4.template get<0>(vct_index_tmp4.size()-1);

			vct_add_index_unique.resize(n_ele_unique);

			if (impl2 == VECTOR_SPARSE_STANDARD)
			{
				vct_add_data_unique.resize(n_ele_unique);
			}

			CUDA_LAUNCH(
					(construct_index_unique<0>),
					ite,
					vct_add_index_sort.toKernel(),
					vct_index_tmp4.toKernel(),
					vct_add_index_unique.toKernel());

			// Then we merge the two lists, vct_index and vct_add_index_unique

			// index to get the merge index
			vct_m_index.resize(vct_index.size());

			if (vct_m_index.size() != 0)
			{
				ite = vct_m_index.getGPUIterator();
				CUDA_LAUNCH((set_indexes<0>),ite,vct_m_index.toKernel(),0);
			}

			// after the merge we solve the last conflicts, running across the vector again
			// and emitting 1 when there is something to merge; we reorder the data array also

			vct_merge_index.resize(vct_index.size() + vct_add_index_unique.size());
			vct_merge_index_map.resize(vct_index.size() + vct_add_index_unique.size());
			vct_index_tmp3.resize(vct_index.size() + vct_add_index_unique.size());

			// Do not delete this reserve
			// Unfortunately all resize with DataBlocks are broken
			if (impl2 == VECTOR_SPARSE_STANDARD)
			{
				vct_add_data_cont.reserve(vct_index.size() + vct_add_index_unique.size()+1);
				vct_add_data_cont.resize(vct_index.size() + vct_add_index_unique.size());
			}

			ite = vct_add_index_unique.getGPUIterator();
			vct_index_tmp4.resize(vct_add_index_unique.size());
			CUDA_LAUNCH((set_indexes<0>),ite,vct_index_tmp4.toKernel(),vct_index.size());

			ite_gpu<1> itew;

			itew.wthr.x = vct_merge_index.size() / 128 + (vct_merge_index.size() % 128 != 0);
			itew.wthr.y = 1;
			itew.wthr.z = 1;
			itew.thr.x = 128;
			itew.thr.y = 1;
			itew.thr.z = 1;

			vct_index_dtmp.resize(itew.wthr.x);

			// we merge vct_index with vct_add_index_unique into vct_merge_index;
			// vct_merge_index contains the merged indexes
			mgpu::merge((Ti *)vct_index.template getDeviceBuffer<0>(),(Ti *)vct_m_index.template getDeviceBuffer<0>(),vct_index.size(),
					(Ti *)vct_add_index_unique.template getDeviceBuffer<0>(),(Ti *)vct_index_tmp4.template getDeviceBuffer<0>(),vct_add_index_unique.size(),
					(Ti *)vct_merge_index.template getDeviceBuffer<0>(),(Ti *)vct_merge_index_map.template getDeviceBuffer<0>(),mgpu::less_t<Ti>(),context);


#endif
		}
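
		/* Worked example (illustrative): with vct_index = [2,9] (tagged [0,1]
		 * via vct_m_index) and vct_add_index_unique = [5,9] (tagged [2,3],
		 * i.e. offset by vct_index.size()), the merge produces
		 * vct_merge_index = [2,5,9,9] and vct_merge_index_map = [0,2,1,3]:
		 * a map entry >= vct_index.size() marks an index coming from the new
		 * data, which is what solve_conflicts uses to decide whether to copy,
		 * reduce, or keep an element.
		 */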



		template<typename ... v_reduce>
		void merge_datas(vector<T,Memory,layout_base,grow_p> & vct_add_data_reord,
				vector<aggregate<Ti,Ti>,Memory,layout_base,grow_p> & segments_new,
				vector<T,Memory,layout_base,grow_p> & vct_add_data,
				vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_add_data_reord_map,
				mgpu::ofp_context_t & context)
		{
#ifdef __NVCC__
			ite_gpu<1> itew;
			itew.wthr.x = vct_index_tmp.size() / 128 + (vct_index_tmp.size() % 128 != 0);
			itew.wthr.y = 1;
			itew.wthr.z = 1;
			itew.thr.x = 128;
			itew.thr.y = 1;
			itew.thr.z = 1;

			typedef boost::mpl::vector<v_reduce...> vv_reduce;

			////////////////////////////////////////////////////////////////////////////////////////////////////

			// Now we can do a segmented reduction
			scalar_block_implementation_switch<impl2, block_functor>
					::template extendSegments<1>(vct_add_index_unique, vct_add_data_reord_map.size());

			if (impl2 == VECTOR_SPARSE_STANDARD)
			{
				sparse_vector_reduction<typename std::remove_reference<decltype(vct_add_data)>::type,
						decltype(vct_add_data_reord_map),
						decltype(vct_add_index_unique),vv_reduce,block_functor,impl2>
						svr(
								vct_add_data_unique,
								vct_add_data_reord,
								vct_add_data,
								vct_add_data_reord_map,
								vct_add_index_unique,
								blf,
								context);

				boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(v_reduce)>>(svr);
			}

			sparse_vector_special<typename std::remove_reference<decltype(vct_add_data)>::type,
					decltype(vct_add_index_unique),
					vv_reduce> svr2(vct_add_data_unique,vct_add_data_reord,vct_add_index_unique,context);
			boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(v_reduce)>>(svr2);

			//////////////////////////////////////////////////////////////////////////////////////////////////////

			// Now perform the right solve_conflicts according to impl2
			scalar_block_implementation_switch<impl2, block_functor>::template solveConflicts<
					decltype(vct_data),
					decltype(vct_index),
					decltype(segments_new),
					decltype(vct_index_dtmp),
					Ti,
					v_reduce ...
					>
					(
							vct_index,
							vct_index_tmp,
							vct_index_tmp2,
							vct_index_tmp3,
							vct_index_dtmp,
							vct_add_data_reord_map,
							segments_new,
							vct_data,
							vct_add_data,
							vct_add_data_unique,
							vct_add_data_cont,
							itew,
							blf,
							context
					);


#else
			std::cout << __FILE__ << ":" << __LINE__ << " error: you are supposed to compile this file with nvcc, if you want to use it with gpu" << std::endl;
#endif
		}

		template<typename ... v_reduce>
		void flush_on_gpu_insert(vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_add_index_cont_0,
				vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_add_index_cont_1,
				vector<T,Memory,layout_base,grow_p> & vct_add_data_reord,
				mgpu::ofp_context_t & context)
		{
#ifdef __NVCC__

			// To avoid the case where you never called setGPUInsertBuffer
			if (n_gpu_add_block_slot == 0 || vct_add_index.size() == 0)
			{
				return;
			}

			size_t n_ele = make_continuos(vct_nadd_index,vct_add_index,vct_add_index_cont_0,vct_add_index_cont_1,
					vct_add_data,vct_add_data_cont,context);

			// At this point we can check whether we have not inserted anything actually,
			// in this case, return without further ado...
			if (vct_add_index_cont_0.size() == 0)
			{return;}

			reorder_indexes(vct_add_index_cont_0,vct_add_index_cont_1,vct_add_data_reord,vct_add_data,context);

			merge_indexes<v_reduce ... >(vct_add_index_cont_0,vct_add_index_unique,
					vct_index_tmp,vct_index_tmp2,
					context);

			merge_datas<v_reduce ... >(vct_add_data_reord,vct_add_index_unique,vct_add_data,vct_add_index_cont_1,context);

#else
			std::cout << __FILE__ << ":" << __LINE__ << " error: you are supposed to compile this file with nvcc, if you want to use it with gpu" << std::endl;
#endif
		}


		void flush_on_gpu_remove(
				mgpu::ofp_context_t & context)
		{
#ifdef __NVCC__

			// Append a 0 element at the end of vct_nrem_index
			vct_nrem_index.resize(vct_nrem_index.size()+1);
			vct_nrem_index.template get<0>(vct_nrem_index.size()-1) = 0;
			vct_nrem_index.template hostToDevice<0>(vct_nrem_index.size()-1,vct_nrem_index.size()-1);

			// Merge the list of removed points for each block
			vct_index_tmp4.resize(vct_nrem_index.size());

			openfpm::scan((Ti *)vct_nrem_index.template getDeviceBuffer<0>(), vct_nrem_index.size(), (Ti *)vct_index_tmp4.template getDeviceBuffer<0>() , context);

			vct_index_tmp4.template deviceToHost<0>(vct_index_tmp4.size()-1,vct_index_tmp4.size()-1);
			size_t n_ele = vct_index_tmp4.template get<0>(vct_index_tmp4.size()-1);

			// we reuse the insert buffers
			vct_add_index_cont_0.resize(n_ele);
			vct_add_index_cont_1.resize(n_ele);

			ite_gpu<1> itew;
			itew.wthr.x = vct_nrem_index.size()-1;
			itew.wthr.y = 1;
			itew.wthr.z = 1;
			itew.thr.x = 128;
			itew.thr.y = 1;
			itew.thr.z = 1;

			CUDA_LAUNCH(construct_remove_list,itew,vct_rem_index.toKernel(),
					vct_nrem_index.toKernel(),
					vct_index_tmp4.toKernel(),
					vct_add_index_cont_0.toKernel(),
					vct_add_index_cont_1.toKernel(),
					n_gpu_rem_block_slot);

			// now we sort
			openfpm::sort((Ti *)vct_add_index_cont_0.template getDeviceBuffer<0>(),(Ti *)vct_add_index_cont_1.template getDeviceBuffer<0>(),
					vct_add_index_cont_0.size(), mgpu::template less_t<Ti>(), context);

			auto ite = vct_add_index_cont_0.getGPUIterator();

			mem.allocate(sizeof(int));
			mem.fill(0);
			vct_add_index_unique.resize(vct_add_index_cont_0.size()+1);

			ite = vct_add_index_cont_0.getGPUIterator();

			// produce the unique index list
			// Find the buffer bases
			CUDA_LAUNCH((find_buffer_offsets_zero<0,decltype(vct_add_index_cont_0.toKernel()),decltype(vct_add_index_unique.toKernel())>),
					ite,
					vct_add_index_cont_0.toKernel(),(int *)mem.getDevicePointer(),vct_add_index_unique.toKernel());

			mem.deviceToHost();
			int n_ele_unique = *(int *)mem.getPointer();

			vct_add_index_unique.resize(n_ele_unique);

			openfpm::sort((Ti *)vct_add_index_unique.template getDeviceBuffer<1>(),(Ti *)vct_add_index_unique.template getDeviceBuffer<0>(),
					vct_add_index_unique.size(),mgpu::template less_t<Ti>(),context);

			// Then we merge the two lists, vct_index and vct_add_index_unique

			// index to get the merge index
			vct_m_index.resize(vct_index.size() + vct_add_index_unique.size());

			ite = vct_m_index.getGPUIterator();
			CUDA_LAUNCH((set_indexes<0>),ite,vct_m_index.toKernel(),0);

			ite = vct_add_index_unique.getGPUIterator();
			CUDA_LAUNCH((set_indexes<1>),ite,vct_add_index_unique.toKernel(),vct_index.size());

			// after the merge we solve the last conflicts, running across the vector again
			// and emitting 1 when there is something to merge; we reorder the data array also

			vct_index_tmp.resize(vct_index.size() + vct_add_index_unique.size());
			vct_index_tmp2.resize(vct_index.size() + vct_add_index_unique.size());

			itew.wthr.x = vct_index_tmp.size() / 128 + (vct_index_tmp.size() % 128 != 0);
			itew.wthr.y = 1;
			itew.wthr.z = 1;
			itew.thr.x = 128;
			itew.thr.y = 1;
			itew.thr.z = 1;

			vct_index_dtmp.resize(itew.wthr.x);

			// we merge vct_index with vct_add_index_unique into vct_index_tmp;
			// vct_index_tmp2 contains the merging index
			mgpu::merge((Ti *)vct_index.template getDeviceBuffer<0>(),(Ti *)vct_m_index.template getDeviceBuffer<0>(),vct_index.size(),
					(Ti *)vct_add_index_unique.template getDeviceBuffer<0>(),(Ti *)vct_add_index_unique.template getDeviceBuffer<1>(),vct_add_index_unique.size(),
					(Ti *)vct_index_tmp.template getDeviceBuffer<0>(),(Ti *)vct_index_tmp2.template getDeviceBuffer<0>(),mgpu::less_t<Ti>(),context);

			vct_index_tmp3.resize(128*itew.wthr.x);

			CUDA_LAUNCH((solve_conflicts_remove<decltype(vct_index_tmp.toKernel()),decltype(vct_index_dtmp.toKernel()),128>),
					itew,
					vct_index_tmp.toKernel(),
					vct_index_tmp2.toKernel(),
					vct_index_tmp3.toKernel(),
					vct_m_index.toKernel(),
					vct_index_dtmp.toKernel(),
					vct_index.size());

			// we scan tmp3
			openfpm::scan((Ti*)vct_index_dtmp.template getDeviceBuffer<0>(),vct_index_dtmp.size(),(Ti *)vct_index_dtmp.template getDeviceBuffer<1>(),context);

			// get the size to resize vct_index and vct_data
			vct_index_dtmp.template deviceToHost<0,1>(vct_index_dtmp.size()-1,vct_index_dtmp.size()-1);
			int size = vct_index_dtmp.template get<1>(vct_index_dtmp.size()-1) + vct_index_dtmp.template get<0>(vct_index_dtmp.size()-1);

			vct_add_data_cont.resize(size);
			vct_index.resize(size);

			CUDA_LAUNCH(realign_remove,itew,vct_index_tmp3.toKernel(),vct_m_index.toKernel(),vct_data.toKernel(),
					vct_index.toKernel(),vct_add_data_cont.toKernel(),
					vct_index_dtmp.toKernel());

			vct_data.swap(vct_add_data_cont);

#else
			std::cout << __FILE__ << ":" << __LINE__ << " error: you are supposed to compile this file with nvcc, if you want to use it with gpu" << std::endl;
#endif
		}

		void resetBck()
		{
			// re-add background
			vct_data.resize(vct_data.size()+1);
			vct_data.get(vct_data.size()-1) = bck;

			htoD<decltype(vct_data)> trf(vct_data,vct_data.size()-1);
			boost::mpl::for_each_ref< boost::mpl::range_c<int,0,T::max_prop> >(trf);
		}

		template<typename ... v_reduce>
		void flush_on_gpu(vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_add_index_cont_0,
				vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_add_index_cont_1,
				vector<T,Memory,layout_base,grow_p> & vct_add_data_reord,
				mgpu::ofp_context_t & context)
		{
			flush_on_gpu_insert<v_reduce ... >(vct_add_index_cont_0,vct_add_index_cont_1,vct_add_data_reord,context);
		}

		template<typename ... v_reduce>
		void flush_on_cpu()
		{
			if (vct_add_index.size() == 0)
			{return;}

			// First copy the added indexes to reorder
			reorder_add_index_cpu.resize(vct_add_index.size());
			vct_add_data_cont.resize(vct_add_index.size());

			for (size_t i = 0 ; i < reorder_add_index_cpu.size() ; i++)
			{
				reorder_add_index_cpu.get(i).id = vct_add_index.template get<0>(i);
				reorder_add_index_cpu.get(i).id2 = i;
			}

			reorder_add_index_cpu.sort();

			// Copy the data
			for (size_t i = 0 ; i < reorder_add_index_cpu.size() ; i++)
			{
				vct_add_data_cont.get(i) = vct_add_data.get(reorder_add_index_cpu.get(i).id2);
			}

			typedef boost::mpl::vector<v_reduce...> vv_reduce;

			sparse_vector_reduction_cpu<decltype(vct_add_data),
					decltype(vct_add_index_unique),
					decltype(reorder_add_index_cpu),
					vv_reduce,
					impl2>
					svr(vct_add_data_unique,
							vct_add_data_cont,
							vct_add_index_unique,
							reorder_add_index_cpu);

			boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(v_reduce)>>(svr);

			// merge the data

			vector<T,Memory,layout_base,grow_p,impl> vct_data_tmp;
			vector<aggregate<Ti>,Memory,layout_base,grow_p> vct_index_tmp;

			vct_data_tmp.resize(vct_data.size() + vct_add_data_unique.size());
			vct_index_tmp.resize(vct_index.size() + vct_add_index_unique.size());

			Ti di = 0;
			Ti ai = 0;
			size_t i = 0;

			for ( ; i < vct_data_tmp.size() ; i++)
			{
				Ti id_a = (ai < vct_add_index_unique.size())?vct_add_index_unique.template get<0>(ai):std::numeric_limits<Ti>::max();
				Ti id_d = (di < vct_index.size())?vct_index.template get<0>(di):std::numeric_limits<Ti>::max();

				if ( id_a <= id_d )
				{
					vct_index_tmp.template get<0>(i) = id_a;

					if (id_a == id_d)
					{
						auto dst = vct_data_tmp.get(i);
						auto src = vct_add_data_unique.get(ai);

						sparse_vector_reduction_solve_conflict_assign_cpu<decltype(vct_data_tmp.get(i)),
								decltype(vct_add_data.get(ai)),
								vv_reduce>
								sva(src,dst);

						boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(v_reduce)>>(sva);
						ai++;

						dst = vct_data_tmp.get(i);
						src = vct_data.get(di);

						sparse_vector_reduction_solve_conflict_reduce_cpu<decltype(vct_data_tmp.get(i)),
								decltype(vct_data.get(di)),
								vv_reduce,
								impl2>
								svr(src,dst);
						boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(v_reduce)>>(svr);

						di++;

						vct_data_tmp.resize(vct_data_tmp.size()-1);
						vct_index_tmp.resize(vct_index_tmp.size()-1);
					}
					else
					{
						vct_index_tmp.template get<0>(i) = vct_add_index_unique.template get<0>(ai);
						vct_data_tmp.get(i) = vct_add_data_unique.get(ai);
						ai++;
					}
				}
				else
				{
					vct_index_tmp.template get<0>(i) = vct_index.template get<0>(di);
					vct_data_tmp.get(i) = vct_data.get(di);
					di++;
				}
			}

			vct_index.swap(vct_index_tmp);
			vct_data.swap(vct_data_tmp);

			vct_add_data.clear();
			vct_add_index.clear();
			vct_add_index_unique.clear();
			vct_add_data_unique.clear();
		}
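
		/* Worked example (illustrative): starting from vct_index = [4] and
		 * inserting keys [7,3,7] with an additive reduction, the reorder step
		 * yields [(3,1),(7,0),(7,2)], the reduction collapses the two entries
		 * with key 7 into one, and the merge loop interleaves the reduced
		 * keys {3,7} with the existing key 4, leaving vct_index = [3,4,7].
		 * Equal keys on both sides would instead be combined by the
		 * solve-conflict reduce functor.
		 */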

	public:

		vector_sparse()
		:max_ele(0)
		{
			vct_data.resize(1);
		}

		/*! \brief Get the indices buffer
		 *
		 * \return the reference to the indices buffer
		 */
		auto getIndexBuffer() -> decltype(vct_index)&
		{
			return vct_index;
		}

		/*! \brief Get the data buffer
		 *
		 * \return the reference to the data buffer
		 */
		auto getDataBuffer() -> decltype(vct_data)&
		{
			return vct_data;
		}

		/*! \brief Get the indices buffer
		 *
		 * \return the reference to the indices buffer
		 */
		auto getIndexBuffer() const -> const decltype(vct_index)&
		{
			return vct_index;
		}

		/*! \brief Get the data buffer
		 *
		 * \return the reference to the data buffer
		 */
		auto getDataBuffer() const -> const decltype(vct_data)&
		{
			return vct_data;
		}

		/*! \brief Get the sparse index
		 *
		 * Get the sparse index of the element id
		 *
		 * \note use get_index and get to retrieve the value index associated to the sparse index
		 *
		 * \param id Element to get
		 *
		 * \return the element value requested
		 *
		 */
		inline openfpm::sparse_index<Ti> get_sparse(Ti id) const
		{
			Ti di;
			this->_branchfree_search<false>(id,di);
			openfpm::sparse_index<Ti> sid;
			sid.id = di;

			return sid;
		}

		/*! \brief Get an element of the vector
		 *
		 * Get an element of the vector
		 *
		 * \tparam p Property to get
		 * \param id Element to get
		 *
		 * \return the element value requested
		 *
		 */
		template <unsigned int p>
		inline auto get(Ti id) const -> decltype(vct_data.template get<p>(id))
		{
			Ti di;
			this->_branchfree_search<false>(id,di);
			return vct_data.template get<p>(di);
		}

		/*! \brief Get an element of the vector
		 *
		 * Get an element of the vector
		 *
		 * \param id Element to get
		 *
		 * \return the element value requested
		 *
		 */
		inline auto get(Ti id) const -> decltype(vct_data.get(id))
		{
			Ti di;
			this->_branchfree_search<false>(id,di);
			return vct_data.get(di);
		}

		/*! \brief resize to n elements
		 *
		 * \param n elements
		 *
		 */
		void resize(size_t n)
		{
			max_ele = n;
		}

		/*! \brief Swap the index vector with iv
		 *
		 * \warning After using this function to move out the vector of the indexes, this object becomes useless and
		 *          must be destroyed
		 *
		 * \param iv vector to swap with
		 *
		 */
		void swapIndexVector(vector<aggregate<Ti>,Memory,layout_base,grow_p> & iv)
		{
			vct_index.swap(iv);
		}

		/*! \brief Get the background value (the value that get() returns when the element is not found)
		 *
		 * \return the background value for property p
		 *
		 */
		template <unsigned int p>
		auto getBackground() const -> decltype(vct_data.template get<p>(vct_data.size()-1))
		{
			return vct_data.template get<p>(vct_data.size()-1);
		}

		/*! \brief Get the background value (the value that get() returns when the element is not found)
		 *
		 * \return the background value
		 *
		 */
		auto getBackground() const -> decltype(vct_data.get(vct_data.size()-1))
		{
			return vct_data.get(vct_data.size()-1);
		}

		/*! \brief Set the background value for property p (the value that get() must return when the element is not found)
		 *
		 * \param bck_ background value
		 *
		 */
		template<unsigned int p>
		void setBackground(const typename boost::mpl::at<typename T::type, boost::mpl::int_<p>>::type & bck_)
		{
			meta_copy_d<typename boost::mpl::at<typename T::type, boost::mpl::int_<p>>::type,
						typename std::remove_reference<decltype(vct_data.template get<p>(vct_data.size()-1))>::type>
						::meta_copy_d_(bck_,vct_data.template get<p>(vct_data.size()-1));

			vct_data.template hostToDevice<p>(vct_data.size()-1,vct_data.size()-1);

			meta_copy<typename boost::mpl::at<typename T::type, boost::mpl::int_<p>>::type>
						::meta_copy_(bck_,bck.template get<p>());
		}

		/*! \brief It inserts an element in the sparse vector
		 *
		 * \tparam p property id
		 *
		 * \param ele element id
		 *
		 */
		template <unsigned int p>
		auto insert(Ti ele) -> decltype(vct_data.template get<p>(0))
		{
			vct_add_index.add();
			vct_add_index.template get<0>(vct_add_index.size()-1) = ele;
			vct_add_data.add();
			return vct_add_data.template get<p>(vct_add_data.size()-1);
		}

		/*! \brief It inserts an element in the sparse vector
		 *
		 * \tparam p property id
		 *
		 * \param ele element id
		 *
		 */
		template <unsigned int p>
		auto insertFlush(Ti ele) -> decltype(vct_data.template get<p>(0))
		{
			Ti di;

			// first we have to search if the block exists
			Ti v = _branchfree_search_nobck<true>(ele,di);

			if (v == ele)
			{
				// block exists
				return vct_data.template get<p>(di);
			}

			// It does not exist; we create it. di contains the index where the new block has to be created
			vct_index.insert(di);
			vct_data.insert(di);

			vct_index.template get<0>(di) = ele;

			return vct_data.template get<p>(di);
		}

		/*! \brief It inserts an element in the sparse vector
		 *
		 * \param ele element id
		 *
		 */
		auto insertFlush(Ti ele) -> decltype(vct_data.get(0))
		{
			Ti di;

			// first we have to search if the block exists
			Ti v = _branchfree_search_nobck<true>(ele,di);

			if (v == ele)
			{
				// block exists
				return vct_data.get(di);
			}

			// It does not exist; we create it. di contains the index where the new block has to be created
			vct_index.insert(di);
			vct_data.insert(di);

			vct_index.template get<0>(di) = ele;

			return vct_data.get(di);
		}

		/*! \brief It inserts an element in the sparse vector
		 *
		 * \param ele element id
		 *
		 */
		auto insert(Ti ele) -> decltype(vct_data.get(0))
		{
			vct_add_index.add();
			vct_add_index.template get<0>(vct_add_index.size()-1) = ele;
			vct_add_data.add();
			return vct_add_data.get(vct_add_data.size()-1);
		}
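
		/* Usage sketch (illustrative): insert() only stages the element in the
		 * insert buffer; the main index/data arrays are untouched until a flush.
		 * Assuming an additive reduction such as sadd_ from
		 * map_vector_sparse_cuda_kernels.cuh and an mgpu::ofp_context_t named
		 * context:
		 *
		 *     openfpm::vector_sparse<aggregate<float>> vs;
		 *     vs.template insert<0>(100) = 1.0f;    // staged
		 *     vs.template insert<0>(100) = 2.0f;    // staged, same key
		 *     // vs.template get<0>(100) still returns the background here
		 *     vs.template flush<sadd_<0>>(context,flush_type::FLUSH_ON_HOST);
		 *     // now vs.template get<0>(100) == 3.0f: duplicated keys were reduced
		 */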

		/*! \brief merge the added elements into the main data array, but save the insert buffer in vct_add_index_cont_0
		 *
		 * \param vct_add_index_cont_0 insert buffer
		 *
		 * \param opt options
		 *
		 */
		template<typename ... v_reduce>
		void flush_v(vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_add_index_cont_0,
				mgpu::ofp_context_t & context,
				flush_type opt = FLUSH_ON_HOST,
				int i = 0)
		{
			// Eliminate background
			vct_data.resize(vct_index.size());

			if (opt & flush_type::FLUSH_ON_DEVICE)
			{this->flush_on_gpu<v_reduce ... >(vct_add_index_cont_0,vct_add_index_cont_1,vct_add_data_reord,context);}
			else
			{this->flush_on_cpu<v_reduce ... >();}

			resetBck();
		}

		/*! \brief Merge the added elements into the main data array, saving the
		 *         reordered insert-data buffer in vct_add_data_reord
		 *
		 * \param vct_add_data_reord buffer in which the reordered insert data are saved
		 * \param context modern-gpu context
		 * \param opt flush options
		 *
		 */
		template<typename ... v_reduce>
		void flush_vd(vector<T,Memory,layout_base,grow_p> & vct_add_data_reord,
					  mgpu::ofp_context_t & context,
					  flush_type opt = FLUSH_ON_HOST)
		{
			// Eliminate background
			vct_data.resize(vct_index.size());

			if (opt & flush_type::FLUSH_ON_DEVICE)
			{this->flush_on_gpu<v_reduce ... >(vct_add_index_cont_0,vct_add_index_cont_1,vct_add_data_reord,context);}
			else
			{this->flush_on_cpu<v_reduce ... >();}

			resetBck();
		}

		/*! \brief Merge the added elements into the main data array
		 *
		 * \param context modern-gpu context
		 * \param opt flush options
		 *
		 */
		template<typename ... v_reduce>
		void flush(mgpu::ofp_context_t & context, flush_type opt = FLUSH_ON_HOST)
		{
			// Eliminate background
			vct_data.resize(vct_index.size());

			if (opt & flush_type::FLUSH_ON_DEVICE)
			{this->flush_on_gpu<v_reduce ... >(vct_add_index_cont_0,vct_add_index_cont_1,vct_add_data_reord,context);}
			else
			{this->flush_on_cpu<v_reduce ... >();}

			resetBck();
		}
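
		/* A minimal flush sketch (assumptions: sadd_ / smax_ from the included
		 * kernel headers are used as reductions, context is the mgpu context).
		 * One reduction functor is passed per property; staged inserts that hit
		 * the same index are combined with it:
		 *
		 * \code
		 * vs.template flush<sadd_<0>,smax_<1>>(context, flush_type::FLUSH_ON_DEVICE);
		 * \endcode
		 */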

		/*! \brief Merge the removed elements into the main data array, erasing them
		 *
		 * \param context modern-gpu context
		 * \param opt flush options
		 *
		 */
		void flush_remove(mgpu::ofp_context_t & context, flush_type opt = FLUSH_ON_HOST)
		{
			// Eliminate background
			vct_data.resize(vct_data.size()-1);

			if (opt & flush_type::FLUSH_ON_DEVICE)
			{this->flush_on_gpu_remove(context);}
			else
			{
				std::cerr << __FILE__ << ":" << __LINE__ << " error: flush_remove on CPU has not been implemented yet" << std::endl;
			}

			resetBck();
		}

		/*! \brief Return how many elements this map contains
		 *
		 * \return the number of elements
		 */
		size_t size()
		{
			return vct_index.size();
		}

		/*! \brief Return the sorted vector of the indexes
		 *
		 * \return the sorted vector of the indexes
		 */
		vector<aggregate<Ti>,Memory,layout_base,grow_p> &
		private_get_vct_index()
		{
			return vct_index;
		}

		/*! \brief Transfer from device to host
		 *
		 * \tparam prp set of properties to transfer to host
		 *
		 */
		template<unsigned int ... prp>
		void deviceToHost()
		{
			vct_index.template deviceToHost<0>();
			vct_data.template deviceToHost<prp...>();
		}

		/*! \brief Transfer from host to device
		 *
		 * \tparam prp set of properties to transfer to device
		 *
		 */
		template<unsigned int ... prp>
		void hostToDevice()
		{
			vct_index.template hostToDevice<0>();
			vct_data.template hostToDevice<prp...>();
		}

		/*! \brief toKernel function: transform this structure into one that can be used on GPU
		 *
		 * \return a structure that can be used on GPU
		 *
		 */
		vector_sparse_gpu_ker<T,Ti,layout_base> toKernel()
		{
			vector_sparse_gpu_ker<T,Ti,layout_base> mvsck(vct_index.toKernel(),vct_data.toKernel(),
														  vct_add_index.toKernel(),
														  vct_rem_index.toKernel(),vct_add_data.toKernel(),
														  vct_nadd_index.toKernel(),
														  vct_nrem_index.toKernel(),
														  n_gpu_add_block_slot,
														  n_gpu_rem_block_slot);

			return mvsck;
		}

		/*! \brief Set the GPU insert buffer for every block
		 *
		 * \param nblock number of blocks
		 * \param nslot number of slots free for each block
		 *
		 */
		void setGPUInsertBuffer(int nblock, int nslot)
		{
			vct_add_index.resize(nblock*nslot);
			vct_nadd_index.resize(nblock);
			vct_add_data.resize(nblock*nslot);
			n_gpu_add_block_slot = nslot;
			vct_nadd_index.template fill<0>(0);
		}
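
		/* Typical device-side insert cycle (a sketch following the OpenFPM test
		 * kernels; the kernel body, nblocks/nslots and the reduction functor
		 * sadd_ are illustrative assumptions):
		 *
		 * \code
		 * template<typename vsk_type>
		 * __global__ void fill(vsk_type vsk)
		 * {
		 *     vsk.init();                          // init the per-block insert counter
		 *     int p = blockIdx.x * blockDim.x + threadIdx.x;
		 *     vsk.template insert<0>(2*p) = 1.0f;  // stage one element per thread
		 *     vsk.flush_block_insert();            // publish the block counter
		 * }
		 *
		 * vs.setGPUInsertBuffer(nblocks, nslots);  // one slot per thread here
		 * CUDA_LAUNCH_DIM3(fill, nblocks, nslots, vs.toKernel());
		 * vs.template flush<sadd_<0>>(context, flush_type::FLUSH_ON_DEVICE);
		 * \endcode
		 */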

		/*! \brief If the add-index buffer and the add-data buffer have been filled
		 *         manually, this function must be called before flush
		 *
		 *
		 */
		void preFlush()
		{
#ifdef __NVCC__
			vct_nadd_index.resize(vct_add_index.size());

			if (vct_nadd_index.size() != 0)
			{
				auto ite = vct_nadd_index.getGPUIterator();
				CUDA_LAUNCH((set_one_insert_buffer),ite,vct_nadd_index.toKernel());
			}
			n_gpu_add_block_slot = 1;
#endif
		}
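
		/* Sketch of the manual-fill path that preFlush() supports (illustrative;
		 * n and the device-side fill step are placeholders):
		 *
		 * \code
		 * vs.private_get_vct_add_index().resize(n);
		 * vs.private_get_vct_add_data().resize(n);
		 * // ... fill the two buffers directly on the device ...
		 * vs.preFlush();   // marks one inserted element per slot
		 * vs.template flush<sadd_<0>>(context, flush_type::FLUSH_ON_DEVICE);
		 * \endcode
		 */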

		/*! \brief Get the GPU insert buffer
		 *
		 * \return the reference to the GPU insert buffer
		 */
		auto getGPUInsertBuffer() -> decltype(vct_add_data)&
		{
			return vct_add_data;
		}

		/*! \brief Set the GPU remove buffer for every block
		 *
		 * \param nblock number of blocks
		 * \param nslot number of slots free for each block
		 *
		 */
		void setGPURemoveBuffer(int nblock, int nslot)
		{
			vct_rem_index.resize(nblock*nslot);
			vct_nrem_index.resize(nblock);
			n_gpu_rem_block_slot = nslot;
			vct_nrem_index.template fill<0>(0);
		}
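
		/* Removal mirrors insertion (a sketch; remove() and flush_block_remove()
		 * are assumed to be the kernel-side counterparts of the insert calls):
		 *
		 * \code
		 * template<typename vsk_type>
		 * __global__ void erase(vsk_type vsk)
		 * {
		 *     int p = blockIdx.x * blockDim.x + threadIdx.x;
		 *     vsk.remove(2*p);           // mark the element for removal
		 *     vsk.flush_block_remove();
		 * }
		 *
		 * vs.setGPURemoveBuffer(nblocks, nslots);
		 * CUDA_LAUNCH_DIM3(erase, nblocks, nslots, vs.toKernel());
		 * vs.flush_remove(context, flush_type::FLUSH_ON_DEVICE);
		 * \endcode
		 */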

#ifdef CUDA_GPU

		/*! \brief Get iterator over the stored elements
		 *
		 * \return an iterator
		 *
		 */
		auto getGPUIterator() -> decltype(vct_index.getGPUIterator())
		{
			return vct_index.getGPUIterator();
		}

#endif

		/*! \brief Clear all the elements
		 *
		 *
		 */
		void clear()
		{
			vct_data.clear();
			vct_index.clear();
			vct_add_index.clear();
			vct_add_data.clear();

			// re-add background
			vct_data.resize(vct_data.size()+1);
			vct_data.get(vct_data.size()-1) = bck;

			htoD<decltype(vct_data)> trf(vct_data,vct_data.size()-1);
			boost::mpl::for_each_ref< boost::mpl::range_c<int,0,T::max_prop> >(trf);

			max_ele = 0;
			n_gpu_add_block_slot = 0;
			n_gpu_rem_block_slot = 0;
		}

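		/*! \brief Swap the content of this sparse vector with sp
		 *
		 * \param sp sparse vector to swap with
		 *
		 */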
		void swap(vector_sparse<T,Ti,Memory,layout,layout_base,grow_p,impl> & sp)
		{
			vct_data.swap(sp.vct_data);
			vct_index.swap(sp.vct_index);
			vct_add_index.swap(sp.vct_add_index);
			vct_add_data.swap(sp.vct_add_data);

			size_t max_ele_ = sp.max_ele;
			sp.max_ele = max_ele;
			this->max_ele = max_ele_;
		}

		vector<T,Memory,layout_base,grow_p> & private_get_vct_add_data()
		{
			return vct_add_data;
		}

		vector<aggregate<Ti>,Memory,layout_base,grow_p> & private_get_vct_add_index()
		{
			return vct_add_index;
		}

		const vector<aggregate<Ti>,Memory,layout_base,grow_p> & private_get_vct_add_index() const
		{
			return vct_add_index;
		}

		vector<aggregate<Ti>,Memory,layout_base,grow_p> & private_get_vct_nadd_index()
		{
			return vct_nadd_index;
		}

		const vector<aggregate<Ti>,Memory,layout_base,grow_p> & private_get_vct_nadd_index() const
		{
			return vct_nadd_index;
		}

		auto getSegmentToOutMap() -> decltype(blf.get_outputMap())
		{
			return blf.get_outputMap();
		}

		auto getSegmentToOutMap() const -> decltype(blf.get_outputMap())
		{
			return blf.get_outputMap();
		}

		/*! \brief Eliminate many internal temporary buffers. Call this between
		 *         flushes if you run out of memory
		 *
		 */
		void removeUnusedBuffers()
		{
			vct_add_data.resize(0);
			vct_add_data.shrink_to_fit();

			vct_add_data_reord.resize(0);
			vct_add_data_reord.shrink_to_fit();

			vct_add_data_cont.resize(0);
			vct_add_data_cont.shrink_to_fit();

			vct_add_data_unique.resize(0);
			vct_add_data_unique.shrink_to_fit();
		}

		/*! \brief Return the offsets of the segments for the merge indexes
		 *
		 *
		 */
		vector<aggregate<Ti,Ti>,Memory,layout_base,grow_p> & getSegmentToMergeIndexMap()
		{
			return vct_add_index_unique;
		}

		const vector<aggregate<Ti,Ti>,Memory,layout_base,grow_p> & getSegmentToMergeIndexMap() const
		{
			return vct_add_index_unique;
		}

		/*! \brief Return the mapping vector
		 *
		 * When we add new elements this vector contains, for each reordered new
		 * element, its position in the original insertion order
		 *
		 * For example, suppose the old vector contains
		 *
		 * Old: 5 10 35 50 66 79 (6 elements)
		 * New: 7 44 7 9 44 (5 elements) (reordered they are 7 7 9 44 44)
		 *
		 * The merged indexes (reordered) are
		 *
		 * 5 7 7 9 10 35 44 44 50 66 79
		 *
		 * The returned map contains 5 elements indicating, for each reordered new
		 * element, its insertion-order position:
		 *
		 * 0 2 3 1 4
		 * (7)(7)(9)(44)(44)
		 */
		vector<aggregate<Ti>,Memory,layout_base,grow_p> & getMappingVector()
		{
			return vct_add_index_cont_1;
		}

		/*! \brief Return the merge mapping vector
		 *
		 * When we add new elements this vector contains, for each position in the
		 * merged index list, where that element came from: values below the old
		 * size refer to the old vector, values from the old size onward to the
		 * reordered new elements
		 *
		 * For example, suppose the old vector contains
		 *
		 * Old: 5 10 35 50 66 79 (6 elements)
		 * New: 7 44 7 9 44 (5 elements) (reordered they are 7 7 9 44 44)
		 *
		 * The merged indexes (reordered) are
		 *
		 * 5 7 7 9 10 35 44 44 50 66 79
		 *
		 * The returned map contains 11 elements indicating the source position of
		 * each merged element (0-5 old, 6-10 reordered new):
		 *
		 * 0 6 7 8 1 2 9 10 3 4 5
		 * (5)(7)(7)(9)(10)(35)(44)(44)(50)(66)(79)
		 */
		vector<aggregate<Ti>,Memory,layout_base,grow_p> & getMergeIndexMapVector()
		{
			return vct_index_tmp2;
		}
	};


	template<typename T, unsigned int blockSwitch = VECTOR_SPARSE_STANDARD, typename block_functor = stub_block_functor, typename indexT = int>
	using vector_sparse_gpu = openfpm::vector_sparse<
		T,
		indexT,
		CudaMemory,
		typename memory_traits_inte<T>::type,
		memory_traits_inte,
		grow_policy_double,
		vect_isel<T>::value,
		blockSwitch,
		block_functor
	>;

	template<typename T, typename block_functor = stub_block_functor, typename indexT = long int>
	using vector_sparse_gpu_block = openfpm::vector_sparse<
		T,
		indexT,
		CudaMemory,
		typename memory_traits_inte<T>::type,
		memory_traits_inte,
		grow_policy_double,
		vect_isel<T>::value,
		VECTOR_SPARSE_BLOCK,
		block_functor
	>;
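
	/* Usage sketch for the aliases above (the aggregate type, nblocks/nslots
	 * and the kernel launch are illustrative assumptions):
	 *
	 * \code
	 * openfpm::vector_sparse_gpu<aggregate<float,float[2]>> vs;
	 * vs.template setBackground<0>(0.0f);
	 * vs.setGPUInsertBuffer(nblocks, nslots);
	 * // ... launch an insert kernel on vs.toKernel(), then flush ...
	 * \endcode
	 */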
}



#endif /* MAP_VECTOR_SPARSE_HPP_ */