1 | /* |
2 | * map_vector_sparse.hpp |
3 | * |
4 | * Created on: Jan 22, 2019 |
5 | * Author: i-bird |
6 | */ |
7 | |
8 | #ifndef MAP_VECTOR_SPARSE_HPP_ |
9 | #define MAP_VECTOR_SPARSE_HPP_ |
10 | |
11 | #include "util/cuda_launch.hpp" |
12 | #include "Vector/map_vector.hpp" |
13 | #include "Vector/cuda/map_vector_sparse_cuda_ker.cuh" |
14 | #include "Vector/cuda/map_vector_sparse_cuda_kernels.cuh" |
15 | #include "util/cuda/ofp_context.hxx" |
16 | #include <iostream> |
17 | #include <limits> |
18 | |
19 | #if defined(__NVCC__) |
20 | #if !defined(CUDA_ON_CPU) |
21 | #include "util/cuda/moderngpu/kernel_segreduce.hxx" |
22 | #include "util/cuda/moderngpu/kernel_merge.hxx" |
23 | #endif |
24 | #include "util/cuda/kernels.cuh" |
25 | #endif |
26 | |
27 | #include "util/cuda/scan_ofp.cuh" |
28 | #include "util/cuda/sort_ofp.cuh" |
29 | |
30 | enum flush_type |
31 | { |
32 | FLUSH_ON_HOST = 0, |
33 | FLUSH_ON_DEVICE = 1, |
34 | FLUSH_NO_DATA = 2 |
35 | }; |
36 | |
37 | template<typename OfpmVectorT> |
38 | using ValueTypeOf = typename std::remove_reference<OfpmVectorT>::type::value_type; |
39 | |
40 | namespace openfpm |
41 | { |
	// Copy all the properties of one element from host to device
43 | template<typename sg_type> |
44 | struct htoD |
45 | { |
		//! source object whose buffers are copied
		sg_type & sg;

		//! element to copy
		unsigned int lele;
50 | |
51 | htoD(sg_type & sg, unsigned int lele) |
52 | :sg(sg),lele(lele) |
53 | {}; |
54 | |
55 | |
		//! It calls the host-to-device copy for each property
57 | template<typename T> |
58 | __device__ __host__ inline void operator()(T& t) const |
59 | { |
60 | sg.template hostToDevice<T::value>(lele,lele); |
61 | } |
62 | }; |
63 | |
64 | constexpr int VECTOR_SPARSE_STANDARD = 1; |
65 | constexpr int VECTOR_SPARSE_BLOCK = 2; |
66 | |
67 | template<typename reduction_type, unsigned int impl> |
68 | struct cpu_block_process |
69 | { |
70 | template<typename encap_src, typename encap_dst> |
71 | static inline void process(encap_src & src, encap_dst & dst) |
72 | { |
73 | dst = reduction_type::red(dst,src); |
74 | } |
75 | }; |
76 | |
77 | template<typename reduction_type> |
78 | struct cpu_block_process<reduction_type,VECTOR_SPARSE_BLOCK> |
79 | { |
80 | template<typename encap_src, typename encap_dst> |
81 | static inline void process(encap_src & src, encap_dst & dst) |
82 | { |
83 | for (size_t i = 0 ; i < encap_src::size ; i++) |
84 | { |
85 | dst[i] = reduction_type::red(dst[i],src[i]); |
86 | } |
87 | } |
88 | }; |
89 | |
	//! Case for blocked implementations with array (multi-component) properties, selected with impl+1 (= VECTOR_SPARSE_BLOCK+1)
	template<typename reduction_type>
	struct cpu_block_process<reduction_type,3>
92 | { |
93 | template<typename encap_src, typename encap_dst,unsigned int N1> |
94 | static inline void process(encap_src & src, encap_dst (& dst)[N1]) |
95 | { |
96 | for (unsigned int j = 0 ; j < N1 ; j++) |
97 | { |
98 | for (size_t i = 0 ; i < encap_dst::size ; i++) |
99 | { |
					dst[j][i] = reduction_type::red(dst[j][i],src[j][i]);
101 | } |
102 | } |
103 | } |
104 | |
105 | template<unsigned int N1, unsigned int blockSize, typename encap_src, typename encap_dst> |
106 | static inline void process_e(encap_src & src, encap_dst & dst) |
107 | { |
108 | for (unsigned int j = 0 ; j < N1 ; j++) |
109 | { |
110 | for (size_t i = 0 ; i < blockSize ; i++) |
111 | { |
					dst[j][i] = reduction_type::red(dst[j][i],src[j][i]);
113 | } |
114 | } |
115 | } |
116 | }; |
117 | |
	/*! \brief Functor switch to select the sparse vector implementation, standard scalar or blocked
	 *
	 *
	 */
122 | template<unsigned int impl, typename block_functor> |
123 | struct scalar_block_implementation_switch // Case for scalar implementations |
124 | { |
125 | template <unsigned int p, typename vector_index_type> |
126 | static void extendSegments(vector_index_type & segments, size_t dataSize) |
127 | { |
128 | #ifdef __NVCC__ |
129 | // Pass as there is nothing to append for mgpu |
130 | #else // __NVCC__ |
131 | std::cout << __FILE__ << ":" << __LINE__ << " error: this file is supposed to be compiled with nvcc" << std::endl; |
132 | #endif // __NVCC__ |
133 | } |
134 | |
135 | template <unsigned int pSegment, typename vector_reduction, typename T, typename vector_data_type, typename vector_index_type , typename vector_index_type2> |
136 | static void segreduce(vector_data_type & vector_data, |
137 | vector_data_type & vector_data_unsorted, |
138 | vector_index_type & vector_data_map, |
139 | vector_index_type2 & segment_offset, |
140 | vector_data_type & vector_data_red, |
141 | block_functor & blf, |
142 | mgpu::ofp_context_t & context) |
143 | { |
144 | #ifdef __NVCC__ |
145 | typedef typename boost::mpl::at<vector_reduction, T>::type reduction_type; |
146 | typedef typename boost::mpl::at<typename vector_data_type::value_type::type,typename reduction_type::prop>::type red_type; |
147 | typedef typename reduction_type::template op_red<red_type> red_op; |
148 | typedef typename boost::mpl::at<typename vector_index_type::value_type::type,boost::mpl::int_<0>>::type seg_type; |
149 | red_type init; |
150 | init = 0; |
151 | |
152 | assert((std::is_same<seg_type,int>::value == true)); |
153 | |
154 | mgpu::segreduce( |
155 | (red_type *)vector_data.template getDeviceBuffer<reduction_type::prop::value>(), vector_data.size(), |
156 | (int *)segment_offset.template getDeviceBuffer<1>(), segment_offset.size(), |
157 | (red_type *)vector_data_red.template getDeviceBuffer<reduction_type::prop::value>(), |
158 | red_op(), init, context); |
159 | #else // __NVCC__ |
160 | std::cout << __FILE__ << ":" << __LINE__ << " error: this file is supposed to be compiled with nvcc" << std::endl; |
161 | #endif // __NVCC__ |
162 | } |
163 | |
164 | |
		/*! \brief Merge all the data
		 *
		 * \param vct_index_old sorted vector of the old indexes
		 * \param vct_index_merge old and new indexes merged, with conflicts
		 * \param vct_index_merge_id indicates where each merged index comes from (if the number is bigger than vct_index_old.size()
		 *        the merged index comes from the new data)
		 * \param vct_index_out output indexes, merged new and old
		 * \param vct_index_dtmp temporal buffer vector used for intermediate calculations
		 * \param vct_data_old vector of the old data
		 * \param vct_add_data data to add in its original form in the insert buffer
		 * \param vct_add_data_unique data to add (already reduced)
		 * \param vct_data_out reduced data vector, new + old
		 *
		 */
180 | template < |
181 | typename vector_data_type, |
182 | typename vector_index_type, |
183 | typename vector_index_type2, |
184 | typename vector_index_dtmp_type, |
185 | typename Ti, |
186 | typename ... v_reduce> |
187 | static void solveConflicts( |
188 | vector_index_type & vct_index_old, |
189 | vector_index_type & vct_index_merge, |
190 | vector_index_type & vct_index_merge_id, |
191 | vector_index_type & vct_index_out, |
192 | vector_index_dtmp_type & vct_index_dtmp, |
193 | vector_index_type & data_map, |
194 | vector_index_type2 & segments_new, |
195 | vector_data_type & vct_data_old, |
196 | vector_data_type & vct_add_data, |
197 | vector_data_type & vct_add_data_unique, |
198 | vector_data_type & vct_data_out, |
199 | ite_gpu<1> & itew, |
200 | block_functor & blf, |
201 | mgpu::ofp_context_t & context |
202 | ) |
203 | { |
204 | #ifdef __NVCC__ |
205 | |
206 | CUDA_LAUNCH((solve_conflicts< |
207 | decltype(vct_index_merge.toKernel()), |
208 | decltype(vct_data_old.toKernel()), |
209 | decltype(vct_index_dtmp.toKernel()), |
210 | 128, |
211 | v_reduce ... |
212 | >), |
213 | itew, |
214 | vct_index_merge.toKernel(),vct_data_old.toKernel(), |
215 | vct_index_merge_id.toKernel(),vct_add_data_unique.toKernel(), |
216 | vct_index_out.toKernel(),vct_data_out.toKernel(), |
217 | vct_index_dtmp.toKernel(), |
218 | vct_index_old.size()); |
219 | |
			// we scan vct_index_dtmp
221 | openfpm::scan( |
222 | (Ti*)vct_index_dtmp.template getDeviceBuffer<0>(), |
223 | vct_index_dtmp.size(), |
224 | (Ti *)vct_index_dtmp.template getDeviceBuffer<1>(), |
225 | context); |
226 | |
227 | // get the size to resize vct_index and vct_data |
228 | vct_index_dtmp.template deviceToHost<0,1>(vct_index_dtmp.size()-1,vct_index_dtmp.size()-1); |
229 | int size = vct_index_dtmp.template get<1>(vct_index_dtmp.size()-1) + vct_index_dtmp.template get<0>(vct_index_dtmp.size()-1); |
230 | |
231 | vct_index_old.resize(size); |
232 | vct_data_old.resize(size); |
233 | |
234 | CUDA_LAUNCH(realign,itew,vct_index_out.toKernel(),vct_data_out.toKernel(), |
235 | vct_index_old.toKernel(), vct_data_old.toKernel(), |
236 | vct_index_dtmp.toKernel()); |
237 | |
238 | |
239 | #else // __NVCC__ |
240 | std::cout << __FILE__ << ":" << __LINE__ << " error: this file is supposed to be compiled with nvcc" << std::endl; |
241 | #endif // __NVCC__ |
242 | } |
243 | }; |
244 | |
245 | |
246 | template<typename block_functor> |
247 | struct scalar_block_implementation_switch<2, block_functor> // Case for blocked implementations |
248 | { |
249 | template <unsigned int p, typename vector_index_type> |
250 | static void extendSegments(vector_index_type & segments, size_t dataSize) |
251 | { |
252 | #ifdef __NVCC__ |
253 | // Append trailing element to segment (marks end of last segment) |
254 | segments.resize(segments.size()+1); |
255 | segments.template get<p>(segments.size() - 1) = dataSize; |
256 | segments.template hostToDevice<p>(segments.size() - 1, segments.size() - 1); |
257 | #else // __NVCC__ |
258 | std::cout << __FILE__ << ":" << __LINE__ << " error: this file is supposed to be compiled with nvcc" << std::endl; |
259 | #endif // __NVCC__ |
260 | } |
261 | |
262 | template <unsigned int pSegment, typename vector_reduction, typename T, typename vector_data_type, typename vector_index_type ,typename vector_index_type2> |
263 | static void segreduce(vector_data_type & vector_data, |
264 | vector_data_type & vector_data_unsorted, |
265 | vector_index_type & vector_data_map, |
266 | vector_index_type2 & segment_offset, |
267 | vector_data_type & vector_data_red, |
268 | block_functor & blf, |
269 | mgpu::ofp_context_t & context) |
270 | { |
271 | |
272 | } |
273 | |
274 | template < |
275 | typename vector_data_type, |
276 | typename vector_index_type, |
277 | typename vector_index_type2, |
278 | typename vector_index_dtmp_type, |
279 | typename Ti, |
280 | typename ... v_reduce> |
281 | static void solveConflicts( |
282 | vector_index_type & vct_index_old, |
283 | vector_index_type & vct_index_merge, |
284 | vector_index_type & vct_index_merge_id, |
285 | vector_index_type & vct_index_out, |
286 | vector_index_dtmp_type & vct_index_dtmp, |
287 | vector_index_type & data_map, |
288 | vector_index_type2 & segments_new, |
289 | vector_data_type & vct_data, |
290 | vector_data_type & vct_add_data, |
291 | vector_data_type & vct_add_data_unique, |
292 | vector_data_type & vct_data_out, |
293 | ite_gpu<1> & itew, |
294 | block_functor & blf, |
295 | mgpu::ofp_context_t & context |
296 | ) |
297 | { |
298 | #ifdef __NVCC__ |
299 | blf.template solve_conflicts<1, |
300 | decltype(vct_index_merge), |
301 | decltype(segments_new), |
302 | decltype(vct_data), |
303 | v_reduce ...> |
304 | (vct_index_merge, vct_index_merge_id, segments_new, data_map, |
305 | vct_data, vct_add_data, |
306 | vct_index_old, vct_data_out, |
307 | context); |
308 | vct_data_out.swap(vct_data); |
309 | |
310 | #else // __NVCC__ |
311 | std::cout << __FILE__ << ":" << __LINE__ << " error: this file is supposed to be compiled with nvcc" << std::endl; |
312 | #endif // __NVCC__ |
313 | } |
314 | }; |
315 | |
	//! Pair (index, original position) used to sort the inserted indexes on the host
	template<typename Ti>
	struct reorder
	{
		//! index to sort by
		Ti id;
		//! original position before sorting
		Ti id2;

		bool operator<(const reorder & t) const
		{
			return id < t.id;
		}
	};
327 | |
	//! Reduce, on the host, a run of entries that share the same index
	template<typename reduction_type, typename vector_reduction, typename T,unsigned int impl, typename red_type>
329 | struct sparse_vector_reduction_cpu_impl |
330 | { |
331 | template<typename vector_data_type, typename vector_index_type,typename vector_index_type_reo> |
332 | static inline void red(size_t & i, vector_data_type & vector_data_red, |
333 | vector_data_type & vector_data, |
334 | vector_index_type & vector_index, |
335 | vector_index_type_reo & reorder_add_index_cpu) |
336 | { |
337 | size_t start = reorder_add_index_cpu.get(i).id; |
338 | red_type red = vector_data.template get<reduction_type::prop::value>(i); |
339 | |
340 | size_t j = 1; |
341 | for ( ; i+j < reorder_add_index_cpu.size() && reorder_add_index_cpu.get(i+j).id == start ; j++) |
342 | { |
343 | cpu_block_process<reduction_type,impl>::process(vector_data.template get<reduction_type::prop::value>(i+j),red); |
345 | } |
346 | vector_data_red.add(); |
347 | vector_data_red.template get<reduction_type::prop::value>(vector_data_red.size()-1) = red; |
348 | |
349 | if (T::value == 0) |
350 | { |
351 | vector_index.add(); |
352 | vector_index.template get<0>(vector_index.size() - 1) = reorder_add_index_cpu.get(i).id; |
353 | } |
354 | |
355 | i += j; |
356 | } |
357 | }; |
358 | |
359 | |
	//! Specialization of the host reduction for array (multi-component) properties
	template<typename reduction_type, typename vector_reduction, typename T,unsigned int impl, typename red_type, unsigned int N1>
361 | struct sparse_vector_reduction_cpu_impl<reduction_type,vector_reduction,T,impl,red_type[N1]> |
362 | { |
363 | template<typename vector_data_type, typename vector_index_type,typename vector_index_type_reo> |
364 | static inline void red(size_t & i, vector_data_type & vector_data_red, |
365 | vector_data_type & vector_data, |
366 | vector_index_type & vector_index, |
367 | vector_index_type_reo & reorder_add_index_cpu) |
368 | { |
369 | size_t start = reorder_add_index_cpu.get(i).id; |
370 | red_type red[N1]; |
371 | |
372 | for (size_t k = 0 ; k < N1 ; k++) |
373 | { |
374 | red[k] = vector_data.template get<reduction_type::prop::value>(i)[k]; |
375 | } |
376 | |
377 | size_t j = 1; |
378 | for ( ; i+j < reorder_add_index_cpu.size() && reorder_add_index_cpu.get(i+j).id == start ; j++) |
379 | { |
380 | auto ev = vector_data.template get<reduction_type::prop::value>(i+j); |
381 | cpu_block_process<reduction_type,impl+1>::process(ev,red); |
383 | } |
384 | |
385 | vector_data_red.add(); |
386 | |
387 | for (size_t k = 0 ; k < N1 ; k++) |
388 | { |
389 | vector_data_red.template get<reduction_type::prop::value>(vector_data_red.size()-1)[k] = red[k]; |
390 | } |
391 | |
392 | if (T::value == 0) |
393 | { |
394 | vector_index.add(); |
395 | vector_index.template get<0>(vector_index.size() - 1) = reorder_add_index_cpu.get(i).id; |
396 | } |
397 | |
398 | i += j; |
399 | } |
400 | }; |
401 | |
	/*! \brief this class is a functor for "for_each" algorithm
	 *
	 * This class is a functor for the "for_each" algorithm. For each
	 * element of the boost::vector the operator() is called.
	 * It is used to perform, on the host, the reduction of each property
	 *
	 * \tparam vector_data_type type of the data vector
	 * \tparam vector_index_type type of the index vector
	 *
	 */
412 | template<typename vector_data_type, |
413 | typename vector_index_type, |
414 | typename vector_index_type_reo, |
415 | typename vector_reduction, |
416 | unsigned int impl> |
417 | struct sparse_vector_reduction_cpu |
418 | { |
		//! Vector where the reduced data are stored
		vector_data_type & vector_data_red;

		//! Vector of the data to reduce
		vector_data_type & vector_data;
424 | |
425 | //! reorder vector index |
426 | vector_index_type_reo & reorder_add_index_cpu; |
427 | |
428 | //! Index type vector |
429 | vector_index_type & vector_index; |
430 | |
		/*! \brief constructor
		 *
		 * \param vector_data_red vector where the reduced data are stored
		 * \param vector_data vector of the data to reduce
		 * \param vector_index vector of the indexes
		 * \param reorder_add_index_cpu sorted (index, original position) pairs
		 *
		 */
437 | inline sparse_vector_reduction_cpu(vector_data_type & vector_data_red, |
438 | vector_data_type & vector_data, |
439 | vector_index_type & vector_index, |
440 | vector_index_type_reo & reorder_add_index_cpu) |
441 | :vector_data_red(vector_data_red),vector_data(vector_data),vector_index(vector_index),reorder_add_index_cpu(reorder_add_index_cpu) |
442 | {}; |
443 | |
		//! It calls the reduction function for each property
445 | template<typename T> |
446 | inline void operator()(T& t) const |
447 | { |
448 | typedef typename boost::mpl::at<vector_reduction, T>::type reduction_type; |
449 | typedef typename boost::mpl::at<typename ValueTypeOf<vector_data_type>::type,typename reduction_type::prop>::type red_type; |
450 | |
451 | if (reduction_type::is_special() == false) |
452 | { |
453 | for (size_t i = 0 ; i < reorder_add_index_cpu.size() ; ) |
454 | { |
455 | sparse_vector_reduction_cpu_impl<reduction_type,vector_reduction,T,impl,red_type>::red(i,vector_data_red,vector_data,vector_index,reorder_add_index_cpu); |
456 | |
476 | } |
477 | } |
478 | } |
479 | }; |
480 | |
	/*! \brief this class is a functor for "for_each" algorithm
	 *
	 * This class is a functor for the "for_each" algorithm. For each
	 * element of the boost::vector the operator() is called.
	 * It is mainly used to copy one encap into another encap object
	 *
	 * \tparam encap_src source encap
	 * \tparam encap_dst destination encap
	 *
	 */
491 | template<typename encap_src, |
492 | typename encap_dst, |
493 | typename vector_reduction> |
494 | struct sparse_vector_reduction_solve_conflict_assign_cpu |
495 | { |
496 | //! source |
497 | encap_src & src; |
498 | |
499 | //! destination |
500 | encap_dst & dst; |
501 | |
502 | |
		/*! \brief constructor
		 *
		 * \param src source encapsulated object
		 * \param dst destination encapsulated object
		 *
		 */
509 | inline sparse_vector_reduction_solve_conflict_assign_cpu(encap_src & src, encap_dst & dst) |
510 | :src(src),dst(dst) |
511 | {}; |
512 | |
		//! It calls the copy function for each property
514 | template<typename T> |
515 | inline void operator()(T& t) const |
516 | { |
517 | typedef typename boost::mpl::at<vector_reduction, T>::type reduction_type; |
518 | |
519 | dst.template get<reduction_type::prop::value>() = src.template get<reduction_type::prop::value>(); |
520 | } |
521 | }; |
522 | |
523 | |
	//! Reduce the conflicting element into the destination (scalar case)
	template<unsigned int impl,typename vector_reduction, typename T,typename red_type>
525 | struct sparse_vector_reduction_solve_conflict_reduce_cpu_impl |
526 | { |
527 | template<typename encap_src, typename encap_dst> |
528 | static inline void red(encap_src & src, encap_dst & dst) |
529 | { |
530 | typedef typename boost::mpl::at<vector_reduction, T>::type reduction_type; |
531 | |
532 | cpu_block_process<reduction_type,impl>::process(src.template get<reduction_type::prop::value>(),dst.template get<reduction_type::prop::value>()); |
533 | } |
534 | }; |
535 | |
	//! Specialization of the conflict reduction for array (multi-component) properties
	template<unsigned int impl, typename vector_reduction, typename T,typename red_type, unsigned int N1>
537 | struct sparse_vector_reduction_solve_conflict_reduce_cpu_impl<impl,vector_reduction,T,red_type[N1]> |
538 | { |
539 | template<typename encap_src, typename encap_dst> |
540 | static inline void red(encap_src & src, encap_dst & dst) |
541 | { |
542 | typedef typename boost::mpl::at<vector_reduction, T>::type reduction_type; |
543 | |
544 | auto src_e = src.template get<reduction_type::prop::value>(); |
545 | auto dst_e = dst.template get<reduction_type::prop::value>(); |
546 | |
547 | cpu_block_process<reduction_type,impl+1>::template process_e<N1,red_type::size>(src_e,dst_e); |
548 | } |
549 | }; |
550 | |
	/*! \brief this class is a functor for "for_each" algorithm
	 *
	 * This class is a functor for the "for_each" algorithm. For each
	 * element of the boost::vector the operator() is called.
	 * It is used to reduce the conflicting elements (the ones sharing the same index) into the destination encap
	 *
	 * \tparam encap_src source encap
	 * \tparam encap_dst destination encap
	 *
	 */
561 | template<typename encap_src, |
562 | typename encap_dst, |
563 | typename vector_reduction, |
564 | unsigned int impl> |
565 | struct sparse_vector_reduction_solve_conflict_reduce_cpu |
566 | { |
567 | //! source |
568 | encap_src & src; |
569 | |
570 | //! destination |
571 | encap_dst & dst; |
572 | |
573 | |
		/*! \brief constructor
		 *
		 * \param src source encapsulated object
		 * \param dst destination encapsulated object
		 *
		 */
580 | inline sparse_vector_reduction_solve_conflict_reduce_cpu(encap_src & src, encap_dst & dst) |
581 | :src(src),dst(dst) |
582 | {}; |
583 | |
		//! It calls the reduction function for each property
585 | template<typename T> |
586 | inline void operator()(T& t) const |
587 | { |
588 | typedef typename boost::mpl::at<vector_reduction, T>::type reduction_type; |
589 | typedef typename boost::mpl::at<typename encap_src::T_type::type, typename reduction_type::prop>::type red_type; |
590 | |
591 | sparse_vector_reduction_solve_conflict_reduce_cpu_impl<impl,vector_reduction,T,red_type>::red(src,dst); |
592 | |
595 | } |
596 | }; |
597 | |
	/*! \brief this class is a functor for "for_each" algorithm
	 *
	 * This class is a functor for the "for_each" algorithm. For each
	 * element of the boost::vector the operator() is called.
	 * It is used to dispatch the segmented reduction of each property
	 *
	 * \tparam vector_data_type type of the data vector
	 * \tparam vector_index_type type of the index vector
	 *
	 */
608 | template<typename vector_data_type, |
609 | typename vector_index_type, |
610 | typename vector_index_type2, |
611 | typename vector_reduction, |
612 | typename block_functor, |
613 | unsigned int impl2, unsigned int pSegment=1> |
614 | struct sparse_vector_reduction |
615 | { |
		//! Vector where the reduced data are stored
617 | vector_data_type & vector_data_red; |
618 | |
		//! new data
620 | vector_data_type & vector_data; |
621 | |
622 | //! new data in an unsorted way |
623 | vector_data_type & vector_data_unsorted; |
624 | |
625 | //! segment of offsets |
626 | vector_index_type2 & segment_offset; |
627 | |
628 | //! map of the data |
629 | vector_index_type & vector_data_map; |
630 | |
631 | //! block functor |
632 | block_functor & blf; |
633 | |
634 | //! gpu context |
635 | mgpu::ofp_context_t & context; |
636 | |
		/*! \brief constructor
		 *
		 * \param vector_data_red vector where the reduced data are stored
		 * \param vector_data new data
		 * \param vector_data_unsorted new data in an unsorted way
		 * \param vector_data_map map of the data
		 * \param segment_offset segment of offsets
		 * \param blf block functor
		 * \param context gpu context
		 *
		 */
643 | inline sparse_vector_reduction(vector_data_type & vector_data_red, |
644 | vector_data_type & vector_data, |
645 | vector_data_type & vector_data_unsorted, |
646 | vector_index_type & vector_data_map, |
647 | vector_index_type2 & segment_offset, |
648 | block_functor & blf, |
649 | mgpu::ofp_context_t & context) |
650 | :vector_data_red(vector_data_red), |
651 | vector_data(vector_data), |
652 | vector_data_unsorted(vector_data_unsorted), |
653 | segment_offset(segment_offset), |
654 | vector_data_map(vector_data_map), |
655 | blf(blf), |
656 | context(context) |
657 | {}; |
658 | |
		//! It calls the segmented reduction for each property
660 | template<typename T> |
661 | inline void operator()(T& t) const |
662 | { |
663 | #ifdef __NVCC__ |
664 | |
665 | typedef typename boost::mpl::at<vector_reduction, T>::type reduction_type; |
666 | typedef typename boost::mpl::at<typename ValueTypeOf<vector_data_type>::type,typename reduction_type::prop>::type red_type; |
667 | if (reduction_type::is_special() == false) |
668 | { |
669 | scalar_block_implementation_switch<impl2, block_functor>::template segreduce<pSegment, vector_reduction, T>( |
670 | vector_data, |
671 | vector_data_unsorted, |
672 | vector_data_map, |
673 | segment_offset, |
674 | vector_data_red, |
675 | blf, |
676 | context); |
677 | } |
678 | #else |
679 | std::cout << __FILE__ << ":" << __LINE__ << " error: this file is supposed to be compiled with nvcc" << std::endl; |
680 | #endif |
681 | } |
682 | }; |
683 | |
684 | |
	//! Do-nothing block functor used by the standard (non-blocked) implementation
	struct stub_block_functor
686 | { |
687 | template<unsigned int pSegment, typename vector_reduction, typename T, typename vector_index_type, typename vector_data_type> |
688 | static bool seg_reduce(vector_index_type & segments, vector_data_type & src, vector_data_type & dst) |
689 | { |
690 | return true; |
691 | } |
692 | |
693 | template<typename vector_index_type, typename vector_data_type, typename ... v_reduce> |
694 | static bool solve_conflicts(vector_index_type &keys, vector_index_type &merge_indices, |
695 | vector_data_type &data1, vector_data_type &data2, |
696 | vector_index_type &indices_tmp, vector_data_type &data_tmp, |
697 | vector_index_type &keysOut, vector_data_type &dataOut, |
698 | mgpu::ofp_context_t & context) |
699 | { |
700 | return true; |
701 | } |
702 | |
703 | openfpm::vector_gpu<aggregate<unsigned int>> outputMap; |
704 | |
705 | openfpm::vector_gpu<aggregate<unsigned int>> & get_outputMap() |
706 | { |
707 | return outputMap; |
708 | } |
709 | |
710 | const openfpm::vector_gpu<aggregate<unsigned int>> & get_outputMap() const |
711 | { |
712 | return outputMap; |
713 | } |
714 | }; |
715 | |
	/*! \brief this class is a functor for "for_each" algorithm
	 *
	 * This class is a functor for the "for_each" algorithm. For each
	 * element of the boost::vector the operator() is called.
	 * It is used to apply the special reductions, the ones computed directly from the segment offsets
	 *
	 * \tparam vector_data_type type of the data vector
	 * \tparam vector_index_type type of the index vector
	 *
	 */
726 | template<typename vector_data_type, typename vector_index_type, typename vector_reduction> |
727 | struct sparse_vector_special |
728 | { |
		//! Vector where the reduced data are stored
		vector_data_type & vector_data_red;

		//! Vector of the data to reduce
		vector_data_type & vector_data;
734 | |
735 | //! segment of offsets |
736 | vector_index_type & segment_offset; |
737 | |
738 | //! gpu context |
739 | mgpu::ofp_context_t & context; |
740 | |
		/*! \brief constructor
		 *
		 * \param vector_data_red vector where the reduced data are stored
		 * \param vector_data vector of the data to reduce
		 * \param segment_offset segment of offsets
		 * \param context gpu context
		 *
		 */
747 | inline sparse_vector_special(vector_data_type & vector_data_red, |
748 | vector_data_type & vector_data, |
749 | vector_index_type & segment_offset, |
750 | mgpu::ofp_context_t & context) |
751 | :vector_data_red(vector_data_red),vector_data(vector_data),segment_offset(segment_offset),context(context) |
752 | {}; |
753 | |
		//! It calls the special reduction for each property
755 | template<typename T> |
756 | inline void operator()(T& t) const |
757 | { |
758 | #ifdef __NVCC__ |
759 | |
760 | typedef typename boost::mpl::at<vector_reduction,T>::type reduction_type; |
761 | |
762 | // reduction type |
763 | typedef typename boost::mpl::at<typename vector_data_type::value_type::type,typename reduction_type::prop>::type red_type; |
764 | |
765 | if (reduction_type::is_special() == true) |
766 | { |
767 | auto ite = segment_offset.getGPUIterator(); |
768 | |
769 | CUDA_LAUNCH((reduce_from_offset<decltype(segment_offset.toKernel()),decltype(vector_data_red.toKernel()),reduction_type>), |
770 | ite,segment_offset.toKernel(),vector_data_red.toKernel(),vector_data.size()); |
771 | } |
772 | |
773 | #else |
774 | std::cout << __FILE__ << ":" << __LINE__ << " error: this file si supposed to be compiled with nvcc" << std::endl; |
775 | #endif |
776 | } |
777 | }; |
778 | |
779 | template<typename T, |
780 | typename Ti = long int, |
781 | typename Memory=HeapMemory, |
782 | typename layout=typename memory_traits_lin<T>::type, |
783 | template<typename> class layout_base=memory_traits_lin , |
784 | typename grow_p=grow_policy_double, |
785 | unsigned int impl=vect_isel<T>::value, |
786 | unsigned int impl2 = VECTOR_SPARSE_STANDARD, |
787 | typename block_functor = stub_block_functor> |
788 | class vector_sparse |
789 | { |
		//! sorted indexes of the non-background elements
		vector<aggregate<Ti>,Memory,layout_base,grow_p> vct_index;
		//! data associated to each index, plus one trailing background element
		vector<T,Memory,layout_base,grow_p,impl> vct_data;
		//! support vector for the merge positions
		vector<aggregate<Ti>,Memory,layout_base,grow_p> vct_m_index;

		//! indexes added since the last flush
		vector<aggregate<Ti>,Memory,layout_base,grow_p> vct_add_index;
		//! indexes removed since the last flush
		vector<aggregate<Ti>,Memory,layout_base,grow_p> vct_rem_index;
		//! number of insertions in each GPU insert pool
		vector<aggregate<Ti>,Memory,layout_base,grow_p> vct_nadd_index;
		//! number of removals in each GPU remove pool
		vector<aggregate<Ti>,Memory,layout_base,grow_p> vct_nrem_index;
		//! data added since the last flush
		vector<T,Memory,layout_base,grow_p> vct_add_data;
		//! added data reordered by index
		vector<T,Memory,layout_base,grow_p> vct_add_data_reord;
800 | |
801 | vector<aggregate<Ti>,Memory,layout_base,grow_p> vct_add_index_cont_0; |
802 | vector<aggregate<Ti>,Memory,layout_base,grow_p> vct_add_index_cont_1; |
803 | vector<T,Memory,layout_base,grow_p> vct_add_data_cont; |
804 | vector<aggregate<Ti,Ti>,Memory,layout_base,grow_p> vct_add_index_unique; |
805 | vector<aggregate<int,int>,Memory,layout_base,grow_p> segments_int; |
806 | |
807 | vector<T,Memory,layout_base,grow_p,impl> vct_add_data_unique; |
808 | |
809 | vector<aggregate<Ti>,Memory,layout_base,grow_p> vct_index_tmp4; |
810 | vector<aggregate<Ti>,Memory,layout_base,grow_p> vct_index_tmp; |
811 | vector<aggregate<Ti>,Memory,layout_base,grow_p> vct_index_tmp2; |
812 | vector<aggregate<Ti>,Memory,layout_base,grow_p> vct_index_tmp3; |
813 | vector<aggregate<Ti,Ti,Ti>,Memory,layout_base,grow_p> vct_index_dtmp; |
814 | |
815 | // segments map (This is used only in case of Blocked data) |
816 | vector<aggregate<Ti>,Memory,layout_base,grow_p> vct_segment_index_map; |
817 | |
		//! block functor
		block_functor blf;

		//! background value, returned when an index is not present
		T bck;

		//! scratch device memory (single counter)
		CudaMemory mem;

		//! (index, original position) pairs used by the host flush
		openfpm::vector<reorder<Ti>> reorder_add_index_cpu;

		//! maximum number of elements, set by resize()
		size_t max_ele;

		//! number of slots per GPU insert block (0 = insert buffer not set)
		int n_gpu_add_block_slot = 0;
		//! number of slots per GPU remove block
		int n_gpu_rem_block_slot = 0;
830 | |
		/*! \brief branch-free binary search of the element x
		 *
		 * \param x element to search
		 * \param id as output, the position of the lower bound of x in vct_index
		 *
		 * \return the index value found, or -1 if x falls past the last element
		 */
837 | template<bool prefetch> |
838 | inline Ti _branchfree_search_nobck(Ti x, Ti & id) const |
839 | { |
840 | if (vct_index.size() == 0) {id = 0; return -1;} |
841 | const Ti *base = &vct_index.template get<0>(0); |
842 | const Ti *end = (const Ti *)vct_index.template getPointer<0>() + vct_index.size(); |
			Ti n = vct_data.size()-1;	// vct_data carries one trailing background element, so this equals vct_index.size()
844 | while (n > 1) |
845 | { |
846 | Ti half = n / 2; |
847 | if (prefetch) |
848 | { |
849 | __builtin_prefetch(base + half/2, 0, 0); |
850 | __builtin_prefetch(base + half + half/2, 0, 0); |
851 | } |
852 | base = (base[half] < x) ? base+half : base; |
853 | n -= half; |
854 | } |
855 | |
856 | int off = (*base < x); |
857 | id = base - &vct_index.template get<0>(0) + off; |
858 | return (base + off != end)?*(base + off):-1; |
859 | } |
860 | |
		/*! \brief branch-free binary search of the element x
		 *
		 * \param x element to search
		 * \param id as output, the position of x in vct_index, or the background slot (vct_data.size()-1) if x is not present
		 */
867 | template<bool prefetch> |
868 | inline void _branchfree_search(Ti x, Ti & id) const |
869 | { |
870 | Ti v = _branchfree_search_nobck<prefetch>(x,id); |
871 | id = (x == v)?id:vct_data.size()-1; |
872 | } |
873 | |
874 | |
		/*! \brief take the indexes from the insertion pools and create a contiguous array
		 *
		 * \param vct_nadd_index number of insertions of each pool
		 * \param vct_add_index pool of inserts
		 * \param vct_add_cont_index output contiguous array of inserted indexes
		 * \param vct_add_cont_index_map map from each inserted index to its original position
		 * \param vct_add_data array of added data
		 * \param vct_add_data_cont contiguous array of inserted data
		 * \param context mgpu context
		 *
		 */
885 | size_t make_continuos(vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_nadd_index, |
886 | vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_add_index, |
887 | vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_add_cont_index, |
888 | vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_add_cont_index_map, |
889 | vector<T,Memory,layout_base,grow_p> & vct_add_data, |
890 | vector<T,Memory,layout_base,grow_p> & vct_add_data_cont, |
891 | mgpu::ofp_context_t & context) |
892 | { |
893 | #ifdef __NVCC__ |
894 | |
			// Append a trailing 0 to vct_nadd_index
896 | vct_nadd_index.resize(vct_nadd_index.size()+1); |
897 | vct_nadd_index.template get<0>(vct_nadd_index.size()-1) = 0; |
898 | vct_nadd_index.template hostToDevice<0>(vct_nadd_index.size()-1,vct_nadd_index.size()-1); |
899 | |
			// Compute, for each block, the offsets of the inserted points
901 | vct_index_tmp4.resize(vct_nadd_index.size()); |
902 | |
903 | openfpm::scan((Ti *)vct_nadd_index.template getDeviceBuffer<0>(), |
904 | vct_nadd_index.size(), |
905 | (Ti *)vct_index_tmp4.template getDeviceBuffer<0>() , |
906 | context); |
907 | |
908 | vct_index_tmp4.template deviceToHost<0>(vct_index_tmp4.size()-1,vct_index_tmp4.size()-1); |
909 | size_t n_ele = vct_index_tmp4.template get<0>(vct_index_tmp4.size()-1); |
910 | |
			// now that the total number of inserted elements is known, resize the output buffers
912 | vct_add_cont_index.resize(n_ele); |
913 | vct_add_cont_index_map.resize(n_ele); |
914 | |
915 | if (impl2 == VECTOR_SPARSE_STANDARD) |
916 | { |
917 | vct_add_data_cont.resize(n_ele); |
918 | } |
919 | else |
920 | { |
921 | vct_segment_index_map.resize(n_ele); |
922 | } |
923 | |
924 | if (n_gpu_add_block_slot >= 128) |
925 | { |
926 | ite_gpu<1> itew; |
927 | itew.wthr.x = vct_nadd_index.size()-1; |
928 | itew.wthr.y = 1; |
929 | itew.wthr.z = 1; |
930 | itew.thr.x = 128; |
931 | itew.thr.y = 1; |
932 | itew.thr.z = 1; |
933 | |
934 | CUDA_LAUNCH(construct_insert_list_key_only,itew,vct_add_index.toKernel(), |
935 | vct_nadd_index.toKernel(), |
936 | vct_index_tmp4.toKernel(), |
937 | vct_add_cont_index.toKernel(), |
938 | vct_add_cont_index_map.toKernel(), |
939 | n_gpu_add_block_slot); |
940 | } |
941 | else |
942 | { |
943 | auto itew = vct_add_index.getGPUIterator(); |
944 | |
945 | CUDA_LAUNCH(construct_insert_list_key_only_small_pool,itew,vct_add_index.toKernel(), |
946 | vct_nadd_index.toKernel(), |
947 | vct_index_tmp4.toKernel(), |
948 | vct_add_cont_index.toKernel(), |
949 | vct_add_cont_index_map.toKernel(), |
950 | n_gpu_add_block_slot); |
951 | } |
952 | |
953 | return n_ele; |
954 | #endif |
955 | return 0; |
956 | } |
957 | |
		/*! \brief sort the contiguous array of inserted keys
		 *
		 * \param context modern gpu context
		 * \param vct_add_cont_index array of indexes (unsorted), as output will be sorted
		 * \param vct_add_cont_index_map reference to the original indexes
		 * \param vct_add_data_reord sorted data output
		 * \param vct_add_data_cont added data in a contiguous unsorted array
		 *
		 */
967 | void reorder_indexes(vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_add_cont_index, |
968 | vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_add_cont_index_map, |
969 | vector<T,Memory,layout_base,grow_p> & vct_add_data_reord, |
970 | vector<T,Memory,layout_base,grow_p> & vct_add_data_cont, |
971 | mgpu::ofp_context_t & context) |
972 | { |
973 | #ifdef __NVCC__ |
981 | |
982 | size_t n_ele = vct_add_cont_index.size(); |
983 | |
984 | n_gpu_add_block_slot = 0; |
985 | |
986 | // now we sort |
987 | openfpm::sort( |
988 | (Ti *)vct_add_cont_index.template getDeviceBuffer<0>(), |
989 | (Ti *)vct_add_cont_index_map.template getDeviceBuffer<0>(), |
990 | vct_add_cont_index.size(), |
991 | mgpu::template less_t<Ti>(), |
992 | context); |
993 | |
994 | auto ite = vct_add_cont_index.getGPUIterator(); |
995 | |
996 | // Now we reorder the data vector accordingly to the indexes |
997 | |
998 | if (impl2 == VECTOR_SPARSE_STANDARD) |
999 | { |
1000 | vct_add_data_reord.resize(n_ele); |
1001 | CUDA_LAUNCH(reorder_vector_data,ite,vct_add_cont_index_map.toKernel(),vct_add_data_cont.toKernel(),vct_add_data_reord.toKernel()); |
1002 | } |
1003 | |
1004 | #endif |
1005 | } |
1006 | |
		/*! \brief Merge the sorted inserted indexes with the existing ones
		 *
		 * \param vct_add_index_sort sorted vector of the inserted indexes
		 * \param vct_add_index_unique as output, the unique inserted indexes with their segments
		 * \param vct_merge_index as output, old and new indexes merged (still with conflicts)
		 * \param vct_merge_index_map as output, the origin of each merged index
		 * \param context mgpu context
		 *
		 */
1013 | template<typename ... v_reduce> |
1014 | void merge_indexes(vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_add_index_sort, |
1015 | vector<aggregate<Ti,Ti>,Memory,layout_base,grow_p> & vct_add_index_unique, |
1016 | vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_merge_index, |
1017 | vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_merge_index_map, |
1018 | mgpu::ofp_context_t & context) |
1019 | { |
1020 | #ifdef __NVCC__ |
1021 | |
1022 | typedef boost::mpl::vector<v_reduce...> vv_reduce; |
1023 | |
1024 | auto ite = vct_add_index_sort.getGPUIterator(); |
1025 | |
1026 | mem.allocate(sizeof(int)); |
1027 | mem.fill(0); |
1028 | vct_add_index_unique.resize(vct_add_index_sort.size()+1); |
1029 | |
1031 | |
1032 | vct_index_tmp4.resize(vct_add_index_sort.size()+1); |
1033 | |
1034 | CUDA_LAUNCH( |
1035 | ( |
1036 | find_buffer_offsets_for_scan |
1037 | <0, |
1038 | decltype(vct_add_index_sort.toKernel()), |
1039 | decltype(vct_index_tmp4.toKernel()) |
1040 | > |
1041 | ), |
1042 | ite, |
1043 | vct_add_index_sort.toKernel(), |
1044 | vct_index_tmp4.toKernel()); |
1045 | |
1046 | openfpm::scan((Ti *)vct_index_tmp4.template getDeviceBuffer<0>(),vct_index_tmp4.size(),(Ti *)vct_index_tmp4.template getDeviceBuffer<0>(),context); |
1047 | |
1048 | vct_index_tmp4.template deviceToHost<0>(vct_index_tmp4.size()-1,vct_index_tmp4.size()-1); |
1049 | int n_ele_unique = vct_index_tmp4.template get<0>(vct_index_tmp4.size()-1); |
1050 | |
1051 | vct_add_index_unique.resize(n_ele_unique); |
1052 | |
1053 | if (impl2 == VECTOR_SPARSE_STANDARD) |
1054 | { |
1055 | vct_add_data_unique.resize(n_ele_unique); |
1056 | } |
1057 | |
1058 | CUDA_LAUNCH( |
1059 | (construct_index_unique<0>), |
1060 | ite, |
1061 | vct_add_index_sort.toKernel(), |
1062 | vct_index_tmp4.toKernel(), |
1063 | vct_add_index_unique.toKernel()); |
1064 | |
1066 | |
1067 | // Then we merge the two list vct_index and vct_add_index_unique |
1068 | |
1069 | // index to get merge index |
1070 | vct_m_index.resize(vct_index.size()); |
1071 | |
1072 | if (vct_m_index.size() != 0) |
1073 | { |
1074 | ite = vct_m_index.getGPUIterator(); |
1075 | CUDA_LAUNCH((set_indexes<0>),ite,vct_m_index.toKernel(),0); |
1076 | } |
1077 | |
			// after the merge we solve the remaining conflicts, running across the vector again and writing a 1 where there is something to merge;
			// we reorder the data array as well
1080 | |
1081 | vct_merge_index.resize(vct_index.size() + vct_add_index_unique.size()); |
1082 | vct_merge_index_map.resize(vct_index.size() + vct_add_index_unique.size()); |
1083 | vct_index_tmp3.resize(vct_index.size() + vct_add_index_unique.size()); |
1084 | |
1085 | // Do not delete this reserve |
1086 | // Unfortunately all resize with DataBlocks are broken |
1087 | if (impl2 == VECTOR_SPARSE_STANDARD) |
1088 | { |
1089 | vct_add_data_cont.reserve(vct_index.size() + vct_add_index_unique.size()+1); |
1090 | vct_add_data_cont.resize(vct_index.size() + vct_add_index_unique.size()); |
1091 | } |
1092 | |
1093 | ite = vct_add_index_unique.getGPUIterator(); |
1094 | vct_index_tmp4.resize(vct_add_index_unique.size()); |
1095 | CUDA_LAUNCH((set_indexes<0>),ite,vct_index_tmp4.toKernel(),vct_index.size()); |
1096 | |
1097 | ite_gpu<1> itew; |
1098 | |
1099 | itew.wthr.x = vct_merge_index.size() / 128 + (vct_merge_index.size() % 128 != 0); |
1100 | itew.wthr.y = 1; |
1101 | itew.wthr.z = 1; |
1102 | itew.thr.x = 128; |
1103 | itew.thr.y = 1; |
1104 | itew.thr.z = 1; |
1105 | |
1106 | vct_index_dtmp.resize(itew.wthr.x); |
1107 | |
			// we merge vct_index with vct_add_index_unique into vct_merge_index; vct_merge_index_map contains the origin of each merged index
			//
1110 | |
1111 | mgpu::merge((Ti *)vct_index.template getDeviceBuffer<0>(),(Ti *)vct_m_index.template getDeviceBuffer<0>(),vct_index.size(), |
1112 | (Ti *)vct_add_index_unique.template getDeviceBuffer<0>(),(Ti *)vct_index_tmp4.template getDeviceBuffer<0>(),vct_add_index_unique.size(), |
1113 | (Ti *)vct_merge_index.template getDeviceBuffer<0>(),(Ti *)vct_merge_index_map.template getDeviceBuffer<0>(),mgpu::less_t<Ti>(),context); |
1114 | |
1115 | |
1116 | #endif |
1117 | } |
1118 | |
1119 | |
1120 | |
		/*! \brief Reduce the new (sorted) data and solve the conflicts with the existing data
		 *
		 */
		template<typename ... v_reduce>
1122 | void merge_datas(vector<T,Memory,layout_base,grow_p> & vct_add_data_reord, |
1123 | vector<aggregate<Ti,Ti>,Memory,layout_base,grow_p> & segments_new, |
1124 | vector<T,Memory,layout_base,grow_p> & vct_add_data, |
1125 | vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_add_data_reord_map, |
1126 | mgpu::ofp_context_t & context) |
1127 | { |
1128 | #ifdef __NVCC__ |
1129 | ite_gpu<1> itew; |
1130 | itew.wthr.x = vct_index_tmp.size() / 128 + (vct_index_tmp.size() % 128 != 0); |
1131 | itew.wthr.y = 1; |
1132 | itew.wthr.z = 1; |
1133 | itew.thr.x = 128; |
1134 | itew.thr.y = 1; |
1135 | itew.thr.z = 1; |
1136 | |
1137 | typedef boost::mpl::vector<v_reduce...> vv_reduce; |
1138 | |
1139 | //////////////////////////////////////////////////////////////////////////////////////////////////// |
1140 | |
1141 | // Now we can do a segmented reduction |
1142 | scalar_block_implementation_switch<impl2, block_functor> |
1143 | ::template extendSegments<1>(vct_add_index_unique, vct_add_data_reord_map.size()); |
1144 | |
1145 | if (impl2 == VECTOR_SPARSE_STANDARD) |
1146 | { |
1147 | sparse_vector_reduction<typename std::remove_reference<decltype(vct_add_data)>::type, |
1148 | decltype(vct_add_data_reord_map), |
1149 | decltype(vct_add_index_unique),vv_reduce,block_functor,impl2> |
1150 | svr( |
1151 | vct_add_data_unique, |
1152 | vct_add_data_reord, |
1153 | vct_add_data, |
1154 | vct_add_data_reord_map, |
1155 | vct_add_index_unique, |
1156 | blf, |
1157 | context); |
1158 | |
1159 | boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(v_reduce)>>(svr); |
1160 | } |
1161 | |
1162 | sparse_vector_special<typename std::remove_reference<decltype(vct_add_data)>::type, |
1163 | decltype(vct_add_index_unique), |
1164 | vv_reduce> svr2(vct_add_data_unique,vct_add_data_reord,vct_add_index_unique,context); |
1165 | boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(v_reduce)>>(svr2); |
1166 | |
1167 | ////////////////////////////////////////////////////////////////////////////////////////////////////// |
1168 | |
1169 | // Now perform the right solve_conflicts according to impl2 |
1170 | scalar_block_implementation_switch<impl2, block_functor>::template solveConflicts< |
1171 | decltype(vct_data), |
1172 | decltype(vct_index), |
1173 | decltype(segments_new), |
1174 | decltype(vct_index_dtmp), |
1175 | Ti, |
1176 | v_reduce ... |
1177 | > |
1178 | ( |
1179 | vct_index, |
1180 | vct_index_tmp, |
1181 | vct_index_tmp2, |
1182 | vct_index_tmp3, |
1183 | vct_index_dtmp, |
1184 | vct_add_data_reord_map, |
1185 | segments_new, |
1186 | vct_data, |
1187 | vct_add_data, |
1188 | vct_add_data_unique, |
1189 | vct_add_data_cont, |
1190 | itew, |
1191 | blf, |
1192 | context |
1193 | ); |
1194 | |
1195 | |
1196 | #else |
1197 | std::cout << __FILE__ << ":" << __LINE__ << " error: you are supposed to compile this file with nvcc, if you want to use it with gpu" << std::endl; |
1198 | #endif |
1199 | } |
1200 | |
1201 | template<typename ... v_reduce> |
1202 | void flush_on_gpu_insert(vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_add_index_cont_0, |
1203 | vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_add_index_cont_1, |
1204 | vector<T,Memory,layout_base,grow_p> & vct_add_data_reord, |
1205 | mgpu::ofp_context_t & context) |
1206 | { |
1207 | #ifdef __NVCC__ |
1208 | |
1209 | // To avoid the case where you never called setGPUInsertBuffer |
1210 | if (n_gpu_add_block_slot == 0 || vct_add_index.size() == 0) |
1211 | { |
1212 | return; |
1213 | } |
1214 | |
1215 | size_t n_ele = make_continuos(vct_nadd_index,vct_add_index,vct_add_index_cont_0,vct_add_index_cont_1, |
1216 | vct_add_data,vct_add_data_cont,context); |
1217 | |
1218 | // At this point we can check whether we have not inserted anything actually, |
1219 | // in this case, return without further ado... |
1220 | if (vct_add_index_cont_0.size() == 0) |
1221 | {return;} |
1222 | |
1223 | reorder_indexes(vct_add_index_cont_0,vct_add_index_cont_1,vct_add_data_reord,vct_add_data,context); |
1224 | |
1225 | merge_indexes<v_reduce ... >(vct_add_index_cont_0,vct_add_index_unique, |
1226 | vct_index_tmp,vct_index_tmp2, |
1227 | context); |
1228 | |
1229 | merge_datas<v_reduce ... >(vct_add_data_reord,vct_add_index_unique,vct_add_data,vct_add_index_cont_1,context); |
1230 | |
1231 | #else |
1232 | std::cout << __FILE__ << ":" << __LINE__ << " error: you are supposed to compile this file with nvcc, if you want to use it with gpu" << std::endl; |
1233 | #endif |
1234 | } |
1235 | |
1236 | |
1237 | void flush_on_gpu_remove( |
1238 | mgpu::ofp_context_t & context) |
1239 | { |
1240 | #ifdef __NVCC__ |
1241 | |
			// Append a trailing 0 to vct_nrem_index
1243 | vct_nrem_index.resize(vct_nrem_index.size()+1); |
1244 | vct_nrem_index.template get<0>(vct_nrem_index.size()-1) = 0; |
1245 | vct_nrem_index.template hostToDevice<0>(vct_nrem_index.size()-1,vct_nrem_index.size()-1); |
1246 | |
			// Compute, for each block, the offsets of the removed points
1248 | vct_index_tmp4.resize(vct_nrem_index.size()); |
1249 | |
1250 | openfpm::scan((Ti *)vct_nrem_index.template getDeviceBuffer<0>(), vct_nrem_index.size(), (Ti *)vct_index_tmp4.template getDeviceBuffer<0>() , context); |
1251 | |
1252 | vct_index_tmp4.template deviceToHost<0>(vct_index_tmp4.size()-1,vct_index_tmp4.size()-1); |
1253 | size_t n_ele = vct_index_tmp4.template get<0>(vct_index_tmp4.size()-1); |
1254 | |
			// we reuse vct_add_index_cont_0 and vct_add_index_cont_1 as output buffers
1256 | vct_add_index_cont_0.resize(n_ele); |
1257 | vct_add_index_cont_1.resize(n_ele); |
1258 | |
1259 | ite_gpu<1> itew; |
1260 | itew.wthr.x = vct_nrem_index.size()-1; |
1261 | itew.wthr.y = 1; |
1262 | itew.wthr.z = 1; |
1263 | itew.thr.x = 128; |
1264 | itew.thr.y = 1; |
1265 | itew.thr.z = 1; |
1266 | |
1267 | CUDA_LAUNCH(construct_remove_list,itew,vct_rem_index.toKernel(), |
1268 | vct_nrem_index.toKernel(), |
1269 | vct_index_tmp4.toKernel(), |
1270 | vct_add_index_cont_0.toKernel(), |
1271 | vct_add_index_cont_1.toKernel(), |
1272 | n_gpu_rem_block_slot); |
1273 | |
1274 | // now we sort |
1275 | openfpm::sort((Ti *)vct_add_index_cont_0.template getDeviceBuffer<0>(),(Ti *)vct_add_index_cont_1.template getDeviceBuffer<0>(), |
1276 | vct_add_index_cont_0.size(), mgpu::template less_t<Ti>(), context); |
1277 | |
1278 | auto ite = vct_add_index_cont_0.getGPUIterator(); |
1279 | |
1280 | mem.allocate(sizeof(int)); |
1281 | mem.fill(0); |
1282 | vct_add_index_unique.resize(vct_add_index_cont_0.size()+1); |
1283 | |
1284 | ite = vct_add_index_cont_0.getGPUIterator(); |
1285 | |
1286 | // produce unique index list |
1287 | // Find the buffer bases |
1288 | CUDA_LAUNCH((find_buffer_offsets_zero<0,decltype(vct_add_index_cont_0.toKernel()),decltype(vct_add_index_unique.toKernel())>), |
1289 | ite, |
1290 | vct_add_index_cont_0.toKernel(),(int *)mem.getDevicePointer(),vct_add_index_unique.toKernel()); |
1291 | |
1292 | mem.deviceToHost(); |
1293 | int n_ele_unique = *(int *)mem.getPointer(); |
1294 | |
1295 | vct_add_index_unique.resize(n_ele_unique); |
1296 | |
1297 | openfpm::sort((Ti *)vct_add_index_unique.template getDeviceBuffer<1>(),(Ti *)vct_add_index_unique.template getDeviceBuffer<0>(), |
1298 | vct_add_index_unique.size(),mgpu::template less_t<Ti>(),context); |
1299 | |
1300 | // Then we merge the two list vct_index and vct_add_index_unique |
1301 | |
1302 | // index to get merge index |
1303 | vct_m_index.resize(vct_index.size() + vct_add_index_unique.size()); |
1304 | |
1305 | ite = vct_m_index.getGPUIterator(); |
1306 | CUDA_LAUNCH((set_indexes<0>),ite,vct_m_index.toKernel(),0); |
1307 | |
1308 | ite = vct_add_index_unique.getGPUIterator(); |
1309 | CUDA_LAUNCH((set_indexes<1>),ite,vct_add_index_unique.toKernel(),vct_index.size()); |
1310 | |
			// after the merge we solve the remaining conflicts, running across the vector again and writing a 1 where there is something to merge;
			// we reorder the data array as well
1313 | |
1314 | vct_index_tmp.resize(vct_index.size() + vct_add_index_unique.size()); |
1315 | vct_index_tmp2.resize(vct_index.size() + vct_add_index_unique.size()); |
1316 | |
1317 | itew.wthr.x = vct_index_tmp.size() / 128 + (vct_index_tmp.size() % 128 != 0); |
1318 | itew.wthr.y = 1; |
1319 | itew.wthr.z = 1; |
1320 | itew.thr.x = 128; |
1321 | itew.thr.y = 1; |
1322 | itew.thr.z = 1; |
1323 | |
1324 | vct_index_dtmp.resize(itew.wthr.x); |
1325 | |
			// we merge vct_index with vct_add_index_unique into vct_index_tmp; vct_index_tmp2 contains the merging index
			//
1328 | mgpu::merge((Ti *)vct_index.template getDeviceBuffer<0>(),(Ti *)vct_m_index.template getDeviceBuffer<0>(),vct_index.size(), |
1329 | (Ti *)vct_add_index_unique.template getDeviceBuffer<0>(),(Ti *)vct_add_index_unique.template getDeviceBuffer<1>(),vct_add_index_unique.size(), |
1330 | (Ti *)vct_index_tmp.template getDeviceBuffer<0>(),(Ti *)vct_index_tmp2.template getDeviceBuffer<0>(),mgpu::less_t<Ti>(),context); |
1331 | |
1332 | vct_index_tmp3.resize(128*itew.wthr.x); |
1333 | |
1334 | CUDA_LAUNCH((solve_conflicts_remove<decltype(vct_index_tmp.toKernel()),decltype(vct_index_dtmp.toKernel()),128>), |
1335 | itew, |
1336 | vct_index_tmp.toKernel(), |
1337 | vct_index_tmp2.toKernel(), |
1338 | vct_index_tmp3.toKernel(), |
1339 | vct_m_index.toKernel(), |
1340 | vct_index_dtmp.toKernel(), |
1341 | vct_index.size()); |
1342 | |
			// we scan vct_index_dtmp
1344 | openfpm::scan((Ti*)vct_index_dtmp.template getDeviceBuffer<0>(),vct_index_dtmp.size(),(Ti *)vct_index_dtmp.template getDeviceBuffer<1>(),context); |
1345 | |
1346 | // get the size to resize vct_index and vct_data |
1347 | vct_index_dtmp.template deviceToHost<0,1>(vct_index_dtmp.size()-1,vct_index_dtmp.size()-1); |
1348 | int size = vct_index_dtmp.template get<1>(vct_index_dtmp.size()-1) + vct_index_dtmp.template get<0>(vct_index_dtmp.size()-1); |
1349 | |
1350 | vct_add_data_cont.resize(size); |
1351 | vct_index.resize(size); |
1352 | |
1353 | CUDA_LAUNCH(realign_remove,itew,vct_index_tmp3.toKernel(),vct_m_index.toKernel(),vct_data.toKernel(), |
1354 | vct_index.toKernel(),vct_add_data_cont.toKernel(), |
1355 | vct_index_dtmp.toKernel()); |
1356 | |
1357 | vct_data.swap(vct_add_data_cont); |
1358 | |
1359 | #else |
1360 | std::cout << __FILE__ << ":" << __LINE__ << " error: you are suppose to compile this file with nvcc, if you want to use it with gpu" << std::endl; |
1361 | #endif |
1362 | } |
1363 | |
1364 | void resetBck() |
1365 | { |
1366 | // re-add background |
1367 | vct_data.resize(vct_data.size()+1); |
1368 | vct_data.get(vct_data.size()-1) = bck; |
1369 | |
1370 | htoD<decltype(vct_data)> trf(vct_data,vct_data.size()-1); |
1371 | boost::mpl::for_each_ref< boost::mpl::range_c<int,0,T::max_prop> >(trf); |
1372 | } |
1373 | |
1374 | template<typename ... v_reduce> |
1375 | void flush_on_gpu(vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_add_index_cont_0, |
1376 | vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_add_index_cont_1, |
1377 | vector<T,Memory,layout_base,grow_p> & vct_add_data_reord, |
1378 | mgpu::ofp_context_t & context) |
1379 | { |
1380 | flush_on_gpu_insert<v_reduce ... >(vct_add_index_cont_0,vct_add_index_cont_1,vct_add_data_reord,context); |
1381 | } |
1382 | |
1383 | template<typename ... v_reduce> |
1384 | void flush_on_cpu() |
1385 | { |
1386 | if (vct_add_index.size() == 0) |
1387 | {return;} |
1388 | |
1389 | // First copy the added index to reorder |
1390 | reorder_add_index_cpu.resize(vct_add_index.size()); |
1391 | vct_add_data_cont.resize(vct_add_index.size()); |
1392 | |
1393 | for (size_t i = 0 ; i < reorder_add_index_cpu.size() ; i++) |
1394 | { |
1395 | reorder_add_index_cpu.get(i).id = vct_add_index.template get<0>(i); |
1396 | reorder_add_index_cpu.get(i).id2 = i; |
1397 | } |
1398 | |
1399 | reorder_add_index_cpu.sort(); |
1400 | |
1401 | // Copy the data |
1402 | for (size_t i = 0 ; i < reorder_add_index_cpu.size() ; i++) |
1403 | { |
1404 | vct_add_data_cont.get(i) = vct_add_data.get(reorder_add_index_cpu.get(i).id2); |
1405 | } |
1406 | |
1407 | typedef boost::mpl::vector<v_reduce...> vv_reduce; |
1408 | |
1409 | sparse_vector_reduction_cpu<decltype(vct_add_data), |
1410 | decltype(vct_add_index_unique), |
1411 | decltype(reorder_add_index_cpu), |
1412 | vv_reduce, |
1413 | impl2> |
1414 | svr(vct_add_data_unique, |
1415 | vct_add_data_cont, |
1416 | vct_add_index_unique, |
1417 | reorder_add_index_cpu); |
1418 | |
1419 | boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(v_reduce)>>(svr); |
1420 | |
			// merge the data
1422 | |
1423 | vector<T,Memory,layout_base,grow_p,impl> vct_data_tmp; |
1424 | vector<aggregate<Ti>,Memory,layout_base,grow_p> vct_index_tmp; |
1425 | |
1426 | vct_data_tmp.resize(vct_data.size() + vct_add_data_unique.size()); |
1427 | vct_index_tmp.resize(vct_index.size() + vct_add_index_unique.size()); |
1428 | |
1429 | Ti di = 0; |
1430 | Ti ai = 0; |
1431 | size_t i = 0; |
1432 | |
1433 | for ( ; i < vct_data_tmp.size() ; i++) |
1434 | { |
1435 | Ti id_a = (ai < vct_add_index_unique.size())?vct_add_index_unique.template get<0>(ai):std::numeric_limits<Ti>::max(); |
1436 | Ti id_d = (di < vct_index.size())?vct_index.template get<0>(di):std::numeric_limits<Ti>::max(); |
1437 | |
1438 | if ( id_a <= id_d ) |
1439 | { |
1440 | vct_index_tmp.template get<0>(i) = id_a; |
1441 | |
1442 | if (id_a == id_d) |
1443 | { |
1444 | auto dst = vct_data_tmp.get(i); |
1445 | auto src = vct_add_data_unique.get(ai); |
1446 | |
1447 | sparse_vector_reduction_solve_conflict_assign_cpu<decltype(vct_data_tmp.get(i)), |
1448 | decltype(vct_add_data.get(ai)), |
1449 | vv_reduce> |
1450 | sva(src,dst); |
1451 | |
1452 | boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(v_reduce)>>(sva); |
1453 | ai++; |
1454 | |
1455 | dst = vct_data_tmp.get(i); |
1456 | src = vct_data.get(di); |
1457 | |
1458 | sparse_vector_reduction_solve_conflict_reduce_cpu<decltype(vct_data_tmp.get(i)), |
1459 | decltype(vct_data.get(di)), |
1460 | vv_reduce, |
1461 | impl2> |
1462 | svr(src,dst); |
1463 | boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(v_reduce)>>(svr); |
1464 | |
1465 | di++; |
1466 | |
1467 | vct_data_tmp.resize(vct_data_tmp.size()-1); |
1468 | vct_index_tmp.resize(vct_index_tmp.size()-1); |
1469 | } |
1470 | else |
1471 | { |
1472 | vct_index_tmp.template get<0>(i) = vct_add_index_unique.template get<0>(ai); |
1473 | vct_data_tmp.get(i) = vct_add_data_unique.get(ai); |
1474 | ai++; |
1475 | } |
1476 | } |
1477 | else |
1478 | { |
1479 | vct_index_tmp.template get<0>(i) = vct_index.template get<0>(di); |
1480 | vct_data_tmp.get(i) = vct_data.get(di); |
1481 | di++; |
1482 | } |
1483 | } |
1484 | |
1485 | vct_index.swap(vct_index_tmp); |
1486 | vct_data.swap(vct_data_tmp); |
1487 | |
1488 | vct_add_data.clear(); |
1489 | vct_add_index.clear(); |
1490 | vct_add_index_unique.clear(); |
1491 | vct_add_data_unique.clear(); |
1492 | } |
1493 | |
1494 | public: |
1495 | |
		vector_sparse()
		:max_ele(0)
		{
			// the last element of vct_data is reserved for the background value
			vct_data.resize(1);
		}
1501 | |
1502 | /*! \brief Get the indices buffer |
1503 | * |
1504 | * \return the reference to the indices buffer |
1505 | */ |
1506 | auto getIndexBuffer() -> decltype(vct_index)& |
1507 | { |
1508 | return vct_index; |
1509 | } |
1510 | |
1511 | /*! \brief Get the data buffer |
1512 | * |
1513 | * \return the reference to the data buffer |
1514 | */ |
1515 | auto getDataBuffer() -> decltype(vct_data)& |
1516 | { |
1517 | return vct_data; |
1518 | } |
1519 | |
1520 | /*! \brief Get the indices buffer |
1521 | * |
1522 | * \return the reference to the indices buffer |
1523 | */ |
1524 | auto getIndexBuffer() const -> const decltype(vct_index)& |
1525 | { |
1526 | return vct_index; |
1527 | } |
1528 | |
1529 | /*! \brief Get the data buffer |
1530 | * |
1531 | * \return the reference to the data buffer |
1532 | */ |
1533 | auto getDataBuffer() const -> const decltype(vct_data)& |
1534 | { |
1535 | return vct_data; |
1536 | } |
1537 | |
1538 | /*! \brief Get the sparse index |
1539 | * |
1540 | * Get the sparse index of the element id |
1541 | * |
	 * \note use get_index and get to retrieve the value associated to the sparse index
	 *
	 * \param id Element to get
	 *
	 * \return the sparse index of the element
1547 | * |
1548 | */ |
1549 | inline openfpm::sparse_index<Ti> get_sparse(Ti id) const |
1550 | { |
1551 | Ti di; |
1552 | this->_branchfree_search<false>(id,di); |
1553 | openfpm::sparse_index<Ti> sid; |
1554 | sid.id = di; |
1555 | |
1556 | return sid; |
1557 | } |
1558 | |
1559 | /*! \brief Get an element of the vector |
1560 | * |
1561 | * Get an element of the vector |
1562 | * |
1563 | * \tparam p Property to get |
1564 | * \param id Element to get |
1565 | * |
1566 | * \return the element value requested |
1567 | * |
1568 | */ |
1569 | template <unsigned int p> |
1570 | inline auto get(Ti id) const -> decltype(vct_data.template get<p>(id)) |
1571 | { |
1572 | Ti di; |
1573 | this->_branchfree_search<false>(id,di); |
1574 | return vct_data.template get<p>(di); |
1575 | } |
1576 | |
1577 | /*! \brief Get an element of the vector |
1578 | * |
1579 | * Get an element of the vector |
1580 | * |
	 * \param id Element to get
1583 | * |
1584 | * \return the element value requested |
1585 | * |
1586 | */ |
1587 | inline auto get(Ti id) const -> decltype(vct_data.get(id)) |
1588 | { |
1589 | Ti di; |
1590 | this->_branchfree_search<false>(id,di); |
1591 | return vct_data.get(di); |
1592 | } |
1593 | |
1594 | /*! \brief resize to n elements |
1595 | * |
1596 | * \param n elements |
1597 | * |
1598 | */ |
1599 | void resize(size_t n) |
1600 | { |
1601 | max_ele = n; |
1602 | } |
1603 | |
	/*! \brief Swap the index vector with iv
	 *
	 * \warning After using this function to move out the vector of the indexes, this object becomes
	 * invalid and must be destroyed
	 *
	 * \param iv the vector to swap the indexes with
1610 | * |
1611 | */ |
1612 | void swapIndexVector(vector<aggregate<Ti>,Memory,layout_base,grow_p> & iv) |
1613 | { |
1614 | vct_index.swap(iv); |
1615 | } |
1616 | |
	/*! \brief Get the background value (the value get must return when the element is not found)
	 *
	 * \return the background value of property p
1620 | * |
1621 | */ |
1622 | template <unsigned int p> |
1623 | auto getBackground() const -> decltype(vct_data.template get<p>(vct_data.size()-1)) |
1624 | { |
1625 | return vct_data.template get<p>(vct_data.size()-1); |
1626 | } |
1627 | |
	/*! \brief Get the background value (the value get must return when the element is not found)
	 *
	 * \return the background element
1631 | * |
1632 | */ |
1633 | auto getBackground() const -> decltype(vct_data.get(vct_data.size()-1)) |
1634 | { |
1635 | return vct_data.get(vct_data.size()-1); |
1636 | } |
1637 | |
	/*! \brief Set the background value (the value get must return when the element is not found)
	 *
	 * \tparam p property id
	 *
	 * \param bck_ the background value for property p
	 *
	 */
	template<unsigned int p>
	void setBackground(const typename boost::mpl::at<typename T::type, boost::mpl::int_<p>>::type & bck_)
1640 | { |
1641 | meta_copy_d<typename boost::mpl::at<typename T::type, boost::mpl::int_<p>>::type, |
1642 | typename std::remove_reference<decltype(vct_data.template get<p>(vct_data.size()-1))>::type> |
1643 | ::meta_copy_d_(bck_,vct_data.template get<p>(vct_data.size()-1)); |
1644 | |
1645 | vct_data.template hostToDevice<p>(vct_data.size()-1,vct_data.size()-1); |
1646 | |
1647 | meta_copy<typename boost::mpl::at<typename T::type, boost::mpl::int_<p>>::type> |
1648 | ::meta_copy_(bck_,bck.template get<p>()); |
1649 | } |
1650 | |
	/*! \brief Insert an element in the sparse vector (queued into the insert buffer until flush)
1652 | * |
1653 | * \tparam p property id |
1654 | * |
1655 | * \param ele element id |
1656 | * |
1657 | */ |
1658 | template <unsigned int p> |
1659 | auto insert(Ti ele) -> decltype(vct_data.template get<p>(0)) |
1660 | { |
1661 | vct_add_index.add(); |
1662 | vct_add_index.template get<0>(vct_add_index.size()-1) = ele; |
1663 | vct_add_data.add(); |
1664 | return vct_add_data.template get<p>(vct_add_data.size()-1); |
1665 | } |
1666 | |
	/*! \brief Insert an element in the sparse vector directly, bypassing the insert buffer
1668 | * |
1669 | * \tparam p property id |
1670 | * |
1671 | * \param ele element id |
1672 | * |
1673 | */ |
1674 | template <unsigned int p> |
1675 | auto insertFlush(Ti ele) -> decltype(vct_data.template get<p>(0)) |
1676 | { |
		Ti di;

		// first we have to search if the block exists
		Ti v = _branchfree_search_nobck(ele,di);
1681 | |
1682 | if (v == ele) |
1683 | { |
			// block exists
1685 | return vct_data.template get<p>(di); |
1686 | } |
1687 | |
		// It does not exist; create it. di contains the position where the new block must be created
		vct_index.insert(di);
		vct_data.insert(di);

		vct_index.template get<0>(di) = ele;

		return vct_data.template get<p>(di);
1693 | } |
1694 | |
	/*! \brief Insert an element in the sparse vector directly, bypassing the insert buffer
1696 | * |
1697 | * \param ele element id |
1698 | * |
1699 | */ |
1700 | auto insertFlush(Ti ele) -> decltype(vct_data.get(0)) |
1701 | { |
1702 | Ti di; |
1703 | |
		// first we have to search if the block exists
1705 | Ti v = _branchfree_search_nobck<true>(ele,di); |
1706 | |
1707 | if (v == ele) |
1708 | { |
			// block exists
1710 | return vct_data.get(di); |
1711 | } |
1712 | |
		// It does not exist; create it. di contains the position where the new block must be created
1714 | vct_index.insert(di); |
1715 | vct_data.insert(di); |
1716 | |
1717 | vct_index.template get<0>(di) = ele; |
1718 | |
1719 | return vct_data.get(di); |
1720 | } |
1721 | |
	/*! \brief Insert an element in the sparse vector (queued into the insert buffer until flush)
1723 | * |
1724 | * \param ele element id |
1725 | * |
1726 | */ |
1727 | auto insert(Ti ele) -> decltype(vct_data.get(0)) |
1728 | { |
1729 | vct_add_index.add(); |
1730 | vct_add_index.template get<0>(vct_add_index.size()-1) = ele; |
1731 | vct_add_data.add(); |
1732 | return vct_add_data.get(vct_add_data.size()-1); |
1733 | } |
1734 | |
	/*! \brief merge the added elements into the main data array, saving the insert buffer in vct_add_index_cont_0
	 *
	 * \param vct_add_index_cont_0 buffer that receives the insert indexes
	 *
	 * \param context modern-gpu context
	 *
	 * \param opt flush options
	 *
1741 | */ |
1742 | template<typename ... v_reduce> |
1743 | void flush_v(vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_add_index_cont_0, |
1744 | mgpu::ofp_context_t & context, |
1745 | flush_type opt = FLUSH_ON_HOST, |
1746 | int i = 0) |
1747 | { |
1748 | // Eliminate background |
1749 | vct_data.resize(vct_index.size()); |
1750 | |
1751 | if (opt & flush_type::FLUSH_ON_DEVICE) |
1752 | {this->flush_on_gpu<v_reduce ... >(vct_add_index_cont_0,vct_add_index_cont_1,vct_add_data_reord,context,i);} |
1753 | else |
1754 | {this->flush_on_cpu<v_reduce ... >();} |
1755 | |
1756 | resetBck(); |
1757 | } |
1758 | |
	/*! \brief merge the added elements into the main data array, saving the insert buffer in vct_add_data_reord
	 *
	 * \param vct_add_data_reord buffer that receives the reordered insert data
	 *
	 * \param context modern-gpu context
	 *
	 * \param opt flush options
	 *
1765 | */ |
1766 | template<typename ... v_reduce> |
1767 | void flush_vd(vector<T,Memory,layout_base,grow_p> & vct_add_data_reord, |
1768 | mgpu::ofp_context_t & context, |
1769 | flush_type opt = FLUSH_ON_HOST) |
1770 | { |
1771 | // Eliminate background |
1772 | vct_data.resize(vct_index.size()); |
1773 | |
1774 | if (opt & flush_type::FLUSH_ON_DEVICE) |
1775 | {this->flush_on_gpu<v_reduce ... >(vct_add_index_cont_0,vct_add_index_cont_1,vct_add_data_reord,context);} |
1776 | else |
1777 | {this->flush_on_cpu<v_reduce ... >();} |
1778 | |
1779 | resetBck(); |
1780 | } |
1781 | |
	/*! \brief merge the added elements into the main data array
	 *
	 * \param context modern-gpu context
	 *
	 * \param opt flush options
1785 | * |
1786 | */ |
1787 | template<typename ... v_reduce> |
1788 | void flush(mgpu::ofp_context_t & context, flush_type opt = FLUSH_ON_HOST) |
1789 | { |
1790 | // Eliminate background |
1791 | vct_data.resize(vct_index.size()); |
1792 | |
1793 | if (opt & flush_type::FLUSH_ON_DEVICE) |
1794 | {this->flush_on_gpu<v_reduce ... >(vct_add_index_cont_0,vct_add_index_cont_1,vct_add_data_reord,context);} |
1795 | else |
1796 | {this->flush_on_cpu<v_reduce ... >();} |
1797 | |
1798 | resetBck(); |
1799 | } |
1800 | |
	/*! \brief flush the removal buffer, erasing the flagged elements from the main data array
	 *
	 * \param context modern-gpu context
	 *
	 * \param opt flush options
1804 | * |
1805 | */ |
1806 | void flush_remove(mgpu::ofp_context_t & context, flush_type opt = FLUSH_ON_HOST) |
1807 | { |
1808 | vct_data.resize(vct_data.size()-1); |
1809 | |
1810 | if (opt & flush_type::FLUSH_ON_DEVICE) |
1811 | {this->flush_on_gpu_remove(context);} |
1812 | else |
1813 | { |
			std::cerr << __FILE__ << ":" << __LINE__ << " error: flush_remove on CPU is not implemented yet" << std::endl;
1815 | } |
1816 | |
1817 | resetBck(); |
1818 | } |
1819 | |
	/*! \brief Return how many elements are stored in this map
	 *
	 * \return the number of elements
1823 | */ |
1824 | size_t size() |
1825 | { |
1826 | return vct_index.size(); |
1827 | } |
1828 | |
1829 | /*! \brief Return the sorted vector of the indexes |
1830 | * |
	 * \return the sorted vector of the indexes
1832 | */ |
1833 | vector<aggregate<Ti>,Memory,layout_base,grow_p> & |
1834 | private_get_vct_index() |
1835 | { |
1836 | return vct_index; |
1837 | } |
1838 | |
1839 | /*! \brief Transfer from device to host |
1840 | * |
	 * \tparam prp set of properties to transfer to host
1842 | * |
1843 | */ |
1844 | template<unsigned int ... prp> |
1845 | void deviceToHost() |
1846 | { |
1847 | vct_index.template deviceToHost<0>(); |
1848 | vct_data.template deviceToHost<prp...>(); |
1849 | } |
1850 | |
1851 | /*! \brief Transfer from host to device |
1852 | * |
	 * \tparam prp set of properties to transfer to device
1854 | * |
1855 | */ |
1856 | template<unsigned int ... prp> |
1857 | void hostToDevice() |
1858 | { |
1859 | vct_index.template hostToDevice<0>(); |
1860 | vct_data.template hostToDevice<prp...>(); |
1861 | } |
1862 | |
	/*! \brief toKernel function: transforms this structure into one that can be used on the GPU
1864 | * |
1865 | * \return structure that can be used on GPU |
1866 | * |
1867 | */ |
1868 | vector_sparse_gpu_ker<T,Ti,layout_base> toKernel() |
1869 | { |
1870 | vector_sparse_gpu_ker<T,Ti,layout_base> mvsck(vct_index.toKernel(),vct_data.toKernel(), |
1871 | vct_add_index.toKernel(), |
1872 | vct_rem_index.toKernel(),vct_add_data.toKernel(), |
1873 | vct_nadd_index.toKernel(), |
1874 | vct_nrem_index.toKernel(), |
1875 | n_gpu_add_block_slot, |
1876 | n_gpu_rem_block_slot); |
1877 | |
1878 | return mvsck; |
1879 | } |
1880 | |
1881 | /*! \brief set the gpu insert buffer for every block |
1882 | * |
1883 | * \param nblock number of blocks |
1884 | * \param nslot number of slots free for each block |
1885 | * |
1886 | */ |
1887 | void setGPUInsertBuffer(int nblock, int nslot) |
1888 | { |
1889 | vct_add_index.resize(nblock*nslot); |
1890 | vct_nadd_index.resize(nblock); |
1891 | vct_add_data.resize(nblock*nslot); |
1892 | n_gpu_add_block_slot = nslot; |
1893 | vct_nadd_index.template fill<0>(0); |
1894 | } |
1895 | |
	/*! \brief If the added index buffer and the add-data buffer have been set manually, this
	 * function must be called before flush
1898 | * |
1899 | * |
1900 | */ |
1901 | void preFlush() |
1902 | { |
1903 | #ifdef __NVCC__ |
1904 | vct_nadd_index.resize(vct_add_index.size()); |
1905 | |
1906 | if (vct_nadd_index.size() != 0) |
1907 | { |
1908 | auto ite = vct_nadd_index.getGPUIterator(); |
1909 | CUDA_LAUNCH((set_one_insert_buffer),ite,vct_nadd_index.toKernel()); |
1910 | } |
1911 | n_gpu_add_block_slot = 1; |
1912 | #endif |
1913 | } |
1914 | |
1915 | /*! \brief Get the GPU insert buffer |
1916 | * |
1917 | * \return the reference to the GPU insert buffer |
1918 | */ |
1919 | auto getGPUInsertBuffer() -> decltype(vct_add_data)& |
1920 | { |
1921 | return vct_add_data; |
1922 | } |
1923 | |
1924 | /*! \brief set the gpu remove buffer for every block |
1925 | * |
1926 | * \param nblock number of blocks |
1927 | * \param nslot number of slots free for each block |
1928 | * |
1929 | */ |
1930 | void setGPURemoveBuffer(int nblock, int nslot) |
1931 | { |
1932 | vct_rem_index.resize(nblock*nslot); |
1933 | vct_nrem_index.resize(nblock); |
1934 | n_gpu_rem_block_slot = nslot; |
1935 | vct_nrem_index.template fill<0>(0); |
1936 | } |
1937 | |
1938 | #ifdef CUDA_GPU |
1939 | |
1940 | /*! \brief Get iterator over the stored elements |
1941 | * |
1942 | * \return an iterator |
1943 | * |
1944 | */ |
1945 | auto getGPUIterator() -> decltype(vct_index.getGPUIterator()) |
1946 | { |
1947 | return vct_index.getGPUIterator(); |
1948 | } |
1949 | |
1950 | #endif |
1951 | |
	/*! \brief Clear all the elements
1953 | * |
1954 | * |
1955 | */ |
1956 | void clear() |
1957 | { |
1958 | vct_data.clear(); |
1959 | vct_index.clear(); |
1960 | vct_add_index.clear(); |
1961 | vct_add_data.clear(); |
1962 | |
1963 | // re-add background |
1964 | vct_data.resize(vct_data.size()+1); |
1965 | vct_data.get(vct_data.size()-1) = bck; |
1966 | |
1967 | htoD<decltype(vct_data)> trf(vct_data,vct_data.size()-1); |
1968 | boost::mpl::for_each_ref< boost::mpl::range_c<int,0,T::max_prop> >(trf); |
1969 | |
1970 | max_ele = 0; |
1971 | n_gpu_add_block_slot = 0; |
1972 | n_gpu_rem_block_slot = 0; |
1973 | } |
1974 | |
	/*! \brief Swap the content of this sparse vector with sp
	 *
	 * \param sp the sparse vector to swap with
	 *
	 */
	void swap(vector_sparse<T,Ti,Memory,layout,layout_base,grow_p,impl> & sp)
1976 | { |
1977 | vct_data.swap(sp.vct_data); |
1978 | vct_index.swap(sp.vct_index); |
1979 | vct_add_index.swap(sp.vct_add_index); |
1980 | vct_add_data.swap(sp.vct_add_data); |
1981 | |
1982 | size_t max_ele_ = sp.max_ele; |
1983 | sp.max_ele = max_ele; |
1984 | this->max_ele = max_ele_; |
1985 | } |
1986 | |
	//! Return the internal insert-data buffer
	vector<T,Memory,layout_base,grow_p> & private_get_vct_add_data()
1988 | { |
1989 | return vct_add_data; |
1990 | } |
1991 | |
	//! Return the internal insert-index buffer
	vector<aggregate<Ti>,Memory,layout_base,grow_p> & private_get_vct_add_index()
1993 | { |
1994 | return vct_add_index; |
1995 | } |
1996 | |
	//! Return the internal insert-index buffer (const version)
	const vector<aggregate<Ti>,Memory,layout_base,grow_p> & private_get_vct_add_index() const
1998 | { |
1999 | return vct_add_index; |
2000 | } |
2001 | |
	//! Return the buffer with the number of queued insertions for each GPU block
	vector<aggregate<Ti>,Memory,layout_base,grow_p> & private_get_vct_nadd_index()
2003 | { |
2004 | return vct_nadd_index; |
2005 | } |
2006 | |
	//! Return the buffer with the number of queued insertions for each GPU block (const version)
	const vector<aggregate<Ti>,Memory,layout_base,grow_p> & private_get_vct_nadd_index() const
2008 | { |
2009 | return vct_nadd_index; |
2010 | } |
2011 | |
	//! Return the output map of the block functor
	auto getSegmentToOutMap() -> decltype(blf.get_outputMap())
2013 | { |
2014 | return blf.get_outputMap(); |
2015 | } |
2016 | |
	//! Return the output map of the block functor (const version)
	auto getSegmentToOutMap() const -> decltype(blf.get_outputMap())
2018 | { |
2019 | return blf.get_outputMap(); |
2020 | } |
2021 | |
	/*! \brief Eliminate the internal temporary buffers; you can use this between flushes if you run out of memory
2023 | * |
2024 | * |
2025 | */ |
2026 | void removeUnusedBuffers() |
2027 | { |
2028 | vct_add_data.resize(0); |
2029 | vct_add_data.shrink_to_fit(); |
2030 | |
2034 | vct_add_data_reord.resize(0); |
2035 | vct_add_data_reord.shrink_to_fit(); |
2036 | |
2037 | vct_add_data_cont.resize(0); |
2038 | vct_add_data_cont.shrink_to_fit(); |
2039 | |
2040 | vct_add_data_unique.resize(0); |
2041 | vct_add_data_unique.shrink_to_fit(); |
2042 | } |
2043 | |
	/*! \brief Return the offsets of the segments for the merge indexes
2045 | * |
2046 | * |
2047 | */ |
2048 | vector<aggregate<Ti,Ti>,Memory,layout_base,grow_p> & getSegmentToMergeIndexMap() |
2049 | { |
2050 | return vct_add_index_unique; |
2051 | } |
2052 | |
	const vector<aggregate<Ti,Ti>,Memory,layout_base,grow_p> & getSegmentToMergeIndexMap() const
2054 | { |
2055 | return vct_add_index_unique; |
2056 | } |
2057 | |
	/*! \brief Return the mapping vector
	 *
	 * When new elements are added, this vector contains, for each sorted new element,
	 * its original position in the insert buffer
	 *
	 * For example, suppose the old vector contains
	 *
	 * Old: 5 10 35 50 66 79 (6 elements)
	 * New: 7 44 7 9 44 (5 elements) (sorted: 7 7 9 44 44)
	 *
	 * The merged indexes (when reordered) are
	 *
	 * 5 7 7 9 10 35 44 44 50 66 79
	 *
	 * The returned map contains 5 elements, giving the insert-buffer position of each
	 * sorted new element:
	 *
	 * 0 2 3 1 4
	 * (7)(7)(9)(44)(44)
2075 | */ |
2076 | vector<aggregate<Ti>,Memory,layout_base,grow_p> & getMappingVector() |
2077 | { |
2078 | return vct_add_index_cont_1; |
2079 | } |
2080 | |
	/*! \brief Return the merge index map vector
	 *
	 * When new elements are added, this vector contains, for each element of the merged
	 * (reordered) index vector, its source position in the concatenation of the old
	 * elements followed by the sorted new elements
	 *
	 * For example, suppose the old vector contains
	 *
	 * Old: 5 10 35 50 66 79 (6 elements)
	 * New: 7 44 7 9 44 (5 elements) (sorted: 7 7 9 44 44)
	 *
	 * The merged indexes (when reordered) are
	 *
	 * 5 7 7 9 10 35 44 44 50 66 79
	 *
	 * The returned map contains 11 elements (old elements occupy source positions 0-5,
	 * sorted new elements positions 6-10):
	 *
	 * 0 6 7 8 1 2 9 10 3 4 5
	 * (5)(7)(7)(9)(10)(35)(44)(44)(50)(66)(79)
2098 | */ |
2099 | vector<aggregate<Ti>,Memory,layout_base,grow_p> & getMergeIndexMapVector() |
2100 | { |
2101 | return vct_index_tmp2; |
2102 | } |
2103 | }; |
2104 | |
2105 | |
2106 | template<typename T, unsigned int blockSwitch = VECTOR_SPARSE_STANDARD, typename block_functor = stub_block_functor, typename indexT = int> |
2107 | using vector_sparse_gpu = openfpm::vector_sparse< |
2108 | T, |
2109 | indexT, |
2110 | CudaMemory, |
2111 | typename memory_traits_inte<T>::type, |
2112 | memory_traits_inte, |
2113 | grow_policy_double, |
2114 | vect_isel<T>::value, |
2115 | blockSwitch, |
2116 | block_functor |
2117 | >; |
2118 | |
2119 | template<typename T, typename block_functor = stub_block_functor, typename indexT = long int> |
2120 | using vector_sparse_gpu_block = openfpm::vector_sparse< |
2121 | T, |
2122 | indexT, |
2123 | CudaMemory, |
2124 | typename memory_traits_inte<T>::type, |
2125 | memory_traits_inte, |
2126 | grow_policy_double, |
2127 | vect_isel<T>::value, |
2128 | VECTOR_SPARSE_BLOCK, |
2129 | block_functor |
2130 | >; |
2131 | } |
2132 | |
2133 | |
2134 | |
2135 | #endif /* MAP_VECTOR_SPARSE_HPP_ */ |
2136 | |