| 1 | /* | 
| 2 |  * map_vector_sparse.hpp | 
| 3 |  * | 
| 4 |  *  Created on: Jan 22, 2019 | 
| 5 |  *      Author: i-bird | 
| 6 |  */ | 
| 7 |  | 
| 8 | #ifndef MAP_VECTOR_SPARSE_HPP_ | 
| 9 | #define MAP_VECTOR_SPARSE_HPP_ | 
| 10 |  | 
| 11 | #include "util/cuda_launch.hpp" | 
| 12 | #include "Vector/map_vector.hpp" | 
| 13 | #include "Vector/cuda/map_vector_sparse_cuda_ker.cuh" | 
| 14 | #include "Vector/cuda/map_vector_sparse_cuda_kernels.cuh" | 
| 15 | #include "util/cuda/ofp_context.hxx" | 
| 16 | #include <iostream> | 
| 17 | #include <limits> | 
| 18 |  | 
| 19 | #if defined(__NVCC__) | 
| 20 |   #if !defined(CUDA_ON_CPU) | 
| 21 | 	#include "util/cuda/moderngpu/kernel_segreduce.hxx" | 
| 22 | 	#include "util/cuda/moderngpu/kernel_merge.hxx" | 
| 23 |   #endif | 
| 24 |  #include "util/cuda/kernels.cuh" | 
| 25 | #endif | 
| 26 |  | 
| 27 | #include "util/cuda/scan_ofp.cuh" | 
| 28 | #include "util/cuda/sort_ofp.cuh" | 
| 29 |  | 
//! Selects where/how a sparse-vector flush is performed
enum flush_type
{
	FLUSH_ON_HOST = 0,   //!< flush is executed on the host
	FLUSH_ON_DEVICE = 1, //!< flush is executed on the device (GPU)
	FLUSH_NO_DATA = 2    //!< flush without processing data (indexes only, presumably — confirm at call sites)
};
| 36 |  | 
//! Alias: value_type of an openfpm vector type, with any reference stripped first
template<typename OfpmVectorT>
using ValueTypeOf = typename std::remove_reference<OfpmVectorT>::type::value_type;
| 39 |  | 
| 40 | namespace openfpm | 
| 41 | { | 
| 42 | 	// All props | 
| 43 | 	template<typename sg_type> | 
| 44 | 	struct htoD | 
| 45 | 	{ | 
| 46 | 		//! encapsulated source object | 
| 47 | 		sg_type & sg; | 
| 48 |  | 
| 49 | 		unsigned int lele; | 
| 50 |  | 
| 51 | 		htoD(sg_type & sg, unsigned int lele) | 
| 52 | 		:sg(sg),lele(lele) | 
| 53 | 		{}; | 
| 54 |  | 
| 55 |  | 
| 56 | 		//! It call the copy function for each property | 
| 57 | 		template<typename T> | 
| 58 | 		__device__ __host__ inline void operator()(T& t) const | 
| 59 | 		{ | 
| 60 | 			sg.template hostToDevice<T::value>(lele,lele); | 
| 61 | 		} | 
| 62 | 	}; | 
| 63 |  | 
    //! Implementation tag: one scalar value per sparse element
    constexpr int VECTOR_SPARSE_STANDARD = 1;
    //! Implementation tag: a block of values per sparse element
    constexpr int VECTOR_SPARSE_BLOCK = 2;
| 66 |  | 
    /*! \brief Apply a reduction on the CPU (generic/scalar case)
     *
     * \tparam reduction_type reduction operation exposing a static red(a,b)
     * \tparam impl implementation selector (scalar when VECTOR_SPARSE_STANDARD)
     */
    template<typename reduction_type, unsigned int impl>
    struct cpu_block_process
    {
    	//! reduce src into dst: dst = red(dst,src)
    	template<typename encap_src, typename encap_dst>
    	static inline void process(encap_src & src, encap_dst & dst)
    	{
    		dst = reduction_type::red(dst,src);
    	}
    };
| 76 |  | 
| 77 |     template<typename reduction_type> | 
| 78 |     struct cpu_block_process<reduction_type,VECTOR_SPARSE_BLOCK> | 
| 79 |     { | 
| 80 |     	template<typename encap_src, typename encap_dst> | 
| 81 |     	static inline void process(encap_src & src, encap_dst & dst) | 
| 82 |     	{ | 
| 83 |     		for (size_t i = 0 ; i < encap_src::size ; i++) | 
| 84 |     		{ | 
| 85 |     			dst[i] = reduction_type::red(dst[i],src[i]); | 
| 86 |     		} | 
| 87 |     	} | 
| 88 |     }; | 
| 89 |  | 
| 90 |     template<typename reduction_type> | 
| 91 |     struct cpu_block_process<reduction_type,3> | 
| 92 |     { | 
| 93 |     	template<typename encap_src, typename encap_dst,unsigned int N1> | 
| 94 |     	static inline void process(encap_src & src, encap_dst (& dst)[N1]) | 
| 95 |     	{ | 
| 96 |     		for (unsigned int j = 0 ; j < N1 ; j++) | 
| 97 |     		{ | 
| 98 | 				for (size_t i = 0 ; i < encap_dst::size ; i++) | 
| 99 | 				{ | 
| 100 | 					dst[i] = reduction_type::red(dst[i][j],src[j][i]); | 
| 101 | 				} | 
| 102 |     		} | 
| 103 |     	} | 
| 104 |  | 
| 105 |     	template<unsigned int N1, unsigned int blockSize, typename encap_src, typename encap_dst> | 
| 106 |     	static inline void process_e(encap_src & src, encap_dst & dst) | 
| 107 |     	{ | 
| 108 |     		for (unsigned int j = 0 ; j < N1 ; j++) | 
| 109 |     		{ | 
| 110 | 				for (size_t i = 0 ; i < blockSize ; i++) | 
| 111 | 				{ | 
| 112 | 					dst[i] = reduction_type::red(dst[i][j],src[i][j]); | 
| 113 | 				} | 
| 114 |     		} | 
| 115 |     	} | 
| 116 |     }; | 
| 117 |  | 
    /*! \brief Functor switch to select the sparse-vector implementation: standard scalar vs blocked
     *
     * Primary template: the scalar case. Every method prints an error when
     * this header is compiled without nvcc.
     */
    template<unsigned int impl, typename block_functor>
    struct scalar_block_implementation_switch // Case for scalar implementations
    {
        /*! \brief Extend the segment vector (no-op in the scalar case)
         *
         * \param segments segment offsets
         * \param dataSize total number of data elements
         */
        template <unsigned int p, typename vector_index_type>
        static void extendSegments(vector_index_type & segments, size_t dataSize)
        {
#ifdef __NVCC__
            // Pass as there is nothing to append for mgpu
#else // __NVCC__
            std::cout << __FILE__ << ":"  << __LINE__ << " error: this file is supposed to be compiled with nvcc"  << std::endl;
#endif // __NVCC__
        }

        /*! \brief Segmented reduction of vector_data into vector_data_red (mgpu::segreduce)
         *
         * Reduces every segment described by segment_offset with the T-th
         * reduction of the vector_reduction list.
         *
         * \param vector_data input data
         * \param vector_data_unsorted input data in insertion order (unused in the scalar case)
         * \param vector_data_map data map (unused in the scalar case)
         * \param segment_offset segment start offsets, read from property 1
         * \param vector_data_red output vector receiving one reduced value per segment
         * \param blf block functor (unused in the scalar case)
         * \param context gpu context
         */
        template <unsigned int pSegment, typename vector_reduction, typename T, typename vector_data_type, typename vector_index_type , typename vector_index_type2>
        static void segreduce(vector_data_type & vector_data,
        		vector_data_type & vector_data_unsorted,
        		vector_index_type & vector_data_map,
                vector_index_type2 & segment_offset,
                vector_data_type & vector_data_red,
                block_functor & blf,
                mgpu::ofp_context_t & context)
        {
#ifdef __NVCC__
            typedef typename boost::mpl::at<vector_reduction, T>::type reduction_type;
            typedef typename boost::mpl::at<typename vector_data_type::value_type::type,typename reduction_type::prop>::type red_type;
            typedef typename reduction_type::template op_red<red_type> red_op;
            typedef typename boost::mpl::at<typename vector_index_type::value_type::type,boost::mpl::int_<0>>::type seg_type;
            red_type init;
            init = 0;

            // mgpu::segreduce consumes int segment offsets
            assert((std::is_same<seg_type,int>::value == true));

            mgpu::segreduce(
                    (red_type *)vector_data.template getDeviceBuffer<reduction_type::prop::value>(), vector_data.size(),
                    (int *)segment_offset.template getDeviceBuffer<1>(), segment_offset.size(),
                    (red_type *)vector_data_red.template getDeviceBuffer<reduction_type::prop::value>(),
                    red_op(), init, context);
#else // __NVCC__
    std::cout << __FILE__ << ":"  << __LINE__ << " error: this file is supposed to be compiled with nvcc"  << std::endl;
#endif // __NVCC__
        }


        /*! \brief Merge all data
         *
         * \param vct_index_old sorted vector of the old indexes
         * \param vct_index_merge indexes old and new merged with conflicts
         * \param vct_index_merge_id indicates where each merged index comes from (if the number is bigger
         *                           than vct_index_old.size() the merged index comes from the new data)
         * \param vct_index_out output indexes, merged new and old indexes
         * \param vct_index_dtmp temporary buffer vector used for intermediate calculation
         * \param data_map map of the data (unused in the scalar case)
         * \param segments_new new segments (unused in the scalar case)
         * \param vct_data_old old data
         * \param vct_add_data data to add in its original form in the insert buffer
         * \param vct_add_data_unique data to add (has been already reduced)
         * \param vct_data_out reduced data vector new + old
         * \param itew kernel launch configuration covering the merged range
         * \param blf block functor (unused in the scalar case)
         * \param context gpu context
         *
         */
        template <
                typename vector_data_type,
                typename vector_index_type,
                typename vector_index_type2,
                typename vector_index_dtmp_type,
                typename Ti,
                typename ... v_reduce>
        static void solveConflicts(
                vector_index_type & vct_index_old,
                vector_index_type & vct_index_merge,
                vector_index_type & vct_index_merge_id,
                vector_index_type & vct_index_out,
                vector_index_dtmp_type & vct_index_dtmp,
                vector_index_type & data_map,
                vector_index_type2 & segments_new,
                vector_data_type & vct_data_old,
                vector_data_type & vct_add_data,
                vector_data_type & vct_add_data_unique,
                vector_data_type & vct_data_out,
                ite_gpu<1> & itew,
                block_functor & blf,
                mgpu::ofp_context_t & context
                )
        {
#ifdef __NVCC__

            // resolve duplicated indexes between old and new data on the device
            CUDA_LAUNCH((solve_conflicts<
                        decltype(vct_index_merge.toKernel()),
                        decltype(vct_data_old.toKernel()),
                        decltype(vct_index_dtmp.toKernel()),
                        128,
                        v_reduce ...
                        >),
            			itew,
                                              vct_index_merge.toKernel(),vct_data_old.toKernel(),
                                              vct_index_merge_id.toKernel(),vct_add_data_unique.toKernel(),
                                              vct_index_out.toKernel(),vct_data_out.toKernel(),
                                              vct_index_dtmp.toKernel(),
                                              vct_index_old.size());

                // prefix-sum the per-chunk counters: property 0 -> property 1 of vct_index_dtmp
                openfpm::scan(
                        (Ti*)vct_index_dtmp.template getDeviceBuffer<0>(),
                        vct_index_dtmp.size(),
                        (Ti *)vct_index_dtmp.template getDeviceBuffer<1>(),
                        context);

                // get the size to resize vct_index and vct_data
                // (last scanned offset + last counter = total element count)
                vct_index_dtmp.template deviceToHost<0,1>(vct_index_dtmp.size()-1,vct_index_dtmp.size()-1);
                int size = vct_index_dtmp.template get<1>(vct_index_dtmp.size()-1) + vct_index_dtmp.template get<0>(vct_index_dtmp.size()-1);

                vct_index_old.resize(size);
                vct_data_old.resize(size);

                // compact the merged output back into the old index/data vectors
                CUDA_LAUNCH(realign,itew,vct_index_out.toKernel(),vct_data_out.toKernel(),
                                      vct_index_old.toKernel(), vct_data_old.toKernel(),
                                      vct_index_dtmp.toKernel());


#else // __NVCC__
            std::cout << __FILE__ << ":"  << __LINE__ << " error: this file is supposed to be compiled with nvcc"  << std::endl;
#endif // __NVCC__
        }
    };
| 244 |  | 
| 245 |  | 
    /*! \brief Functor switch: blocked implementation (impl == 2)
     *
     * Specialization for blocked sparse vectors; the blocked operations are
     * delegated to the block_functor.
     */
    template<typename block_functor>
    struct scalar_block_implementation_switch<2, block_functor> // Case for blocked implementations
    {
        /*! \brief Append a trailing offset so the last segment is bounded
         *
         * \param segments segment offsets (property p)
         * \param dataSize total number of data elements, stored as the closing
         *                 offset of the last segment and copied to the device
         */
        template <unsigned int p, typename vector_index_type>
        static void extendSegments(vector_index_type & segments, size_t dataSize)
        {
#ifdef __NVCC__
            // Append trailing element to segment (marks end of last segment)
            segments.resize(segments.size()+1);
            segments.template get<p>(segments.size() - 1) = dataSize;
            segments.template hostToDevice<p>(segments.size() - 1, segments.size() - 1);
#else // __NVCC__
            std::cout << __FILE__ << ":"  << __LINE__ << " error: this file is supposed to be compiled with nvcc"  << std::endl;
#endif // __NVCC__
        }

        //! Segmented reduction entry point for the blocked case
        template <unsigned int pSegment, typename vector_reduction, typename T, typename vector_data_type, typename vector_index_type ,typename vector_index_type2>
        static void segreduce(vector_data_type & vector_data,
        					  vector_data_type & vector_data_unsorted,
        					  vector_index_type & vector_data_map,
        					  vector_index_type2 & segment_offset,
        					  vector_data_type & vector_data_red,
        					  block_functor & blf,
        					  mgpu::ofp_context_t & context)
        {
        	// Intentionally empty. NOTE(review): presumably the blocked
        	// reduction is performed by the block functor inside
        	// solveConflicts below — confirm against the block_functor
        	// implementations.
        }

        /*! \brief Resolve index conflicts through the block functor
         *
         * \param vct_index_old output: resolved indexes
         * \param vct_index_merge merged indexes (old + new, with conflicts)
         * \param vct_index_merge_id origin of each merged index
         * \param vct_index_out merged index output buffer (unused here)
         * \param vct_index_dtmp temporary index buffer (unused here)
         * \param data_map map of the data
         * \param segments_new new segments
         * \param vct_data data vector; after the call it holds the resolved data
         * \param vct_add_data data to add in its original form
         * \param vct_add_data_unique reduced data to add (unused here)
         * \param vct_data_out scratch output vector, swapped with vct_data at the end
         * \param itew kernel launch configuration (unused here)
         * \param blf block functor performing the actual conflict resolution
         * \param context gpu context
         */
        template <
                typename vector_data_type,
                typename vector_index_type,
                typename vector_index_type2,
                typename vector_index_dtmp_type,
                typename Ti,
                typename ... v_reduce>
        static void solveConflicts(
                vector_index_type & vct_index_old,
                vector_index_type & vct_index_merge,
                vector_index_type & vct_index_merge_id,
                vector_index_type & vct_index_out,
                vector_index_dtmp_type & vct_index_dtmp,
                vector_index_type & data_map,
                vector_index_type2 & segments_new,
                vector_data_type & vct_data,
                vector_data_type & vct_add_data,
                vector_data_type & vct_add_data_unique,
                vector_data_type & vct_data_out,
                ite_gpu<1> & itew,
                block_functor & blf,
                mgpu::ofp_context_t & context
        )
        {
#ifdef __NVCC__
            blf.template solve_conflicts<1,
			            decltype(vct_index_merge),
			            decltype(segments_new),
			            decltype(vct_data),
			            v_reduce ...>
			            (vct_index_merge, vct_index_merge_id, segments_new, data_map,
			            vct_data, vct_add_data,
			            vct_index_old, vct_data_out,
			            context);
                // swap so that vct_data holds the resolved data written into vct_data_out
                vct_data_out.swap(vct_data);

#else // __NVCC__
            std::cout << __FILE__ << ":"  << __LINE__ << " error: this file is supposed to be compiled with nvcc"  << std::endl;
#endif // __NVCC__
        }
    };
| 315 |  | 
| 316 | 	template<typename Ti> | 
| 317 | 	struct reorder | 
| 318 | 	{ | 
| 319 | 		Ti id; | 
| 320 | 		Ti id2; | 
| 321 |  | 
| 322 | 		bool operator<(const reorder & t) const | 
| 323 | 		{ | 
| 324 | 			return id < t.id; | 
| 325 | 		} | 
| 326 | 	}; | 
| 327 |  | 
| 328 | 	template<typename reduction_type, typename vector_reduction, typename T,unsigned int impl, typename red_type> | 
| 329 | 	struct sparse_vector_reduction_cpu_impl | 
| 330 | 	{ | 
| 331 | 		template<typename vector_data_type, typename vector_index_type,typename vector_index_type_reo> | 
| 332 | 		static inline void red(size_t & i, vector_data_type & vector_data_red, | 
| 333 | 				   vector_data_type & vector_data, | 
| 334 | 				   vector_index_type & vector_index, | 
| 335 | 				   vector_index_type_reo & reorder_add_index_cpu) | 
| 336 | 		{ | 
| 337 | 			size_t start = reorder_add_index_cpu.get(i).id; | 
| 338 | 			red_type red = vector_data.template get<reduction_type::prop::value>(i); | 
| 339 |  | 
| 340 | 			size_t j = 1; | 
| 341 | 			for ( ; i+j < reorder_add_index_cpu.size() && reorder_add_index_cpu.get(i+j).id == start ; j++) | 
| 342 | 			{ | 
| 343 | 				cpu_block_process<reduction_type,impl>::process(vector_data.template get<reduction_type::prop::value>(i+j),red); | 
| 344 | 				//reduction_type::red(red,vector_data.template get<reduction_type::prop::value>(i+j)); | 
| 345 | 			} | 
| 346 | 			vector_data_red.add(); | 
| 347 | 			vector_data_red.template get<reduction_type::prop::value>(vector_data_red.size()-1) = red; | 
| 348 |  | 
| 349 | 			if (T::value == 0) | 
| 350 | 			{ | 
| 351 | 				vector_index.add(); | 
| 352 | 				vector_index.template get<0>(vector_index.size() - 1) = reorder_add_index_cpu.get(i).id; | 
| 353 | 			} | 
| 354 |  | 
| 355 | 			i += j; | 
| 356 | 		} | 
| 357 | 	}; | 
| 358 |  | 
| 359 |  | 
| 360 | 	template<typename reduction_type, typename vector_reduction, typename T,unsigned int impl, typename red_type, unsigned int N1> | 
| 361 | 	struct sparse_vector_reduction_cpu_impl<reduction_type,vector_reduction,T,impl,red_type[N1]> | 
| 362 | 	{ | 
| 363 | 		template<typename vector_data_type, typename vector_index_type,typename vector_index_type_reo> | 
| 364 | 		static inline void red(size_t & i, vector_data_type & vector_data_red, | 
| 365 | 				   vector_data_type & vector_data, | 
| 366 | 				   vector_index_type & vector_index, | 
| 367 | 				   vector_index_type_reo & reorder_add_index_cpu) | 
| 368 | 		{ | 
| 369 | 			size_t start = reorder_add_index_cpu.get(i).id; | 
| 370 | 			red_type red[N1]; | 
| 371 |  | 
| 372 | 			for (size_t k = 0 ; k < N1 ; k++) | 
| 373 | 			{ | 
| 374 | 				red[k] = vector_data.template get<reduction_type::prop::value>(i)[k]; | 
| 375 | 			} | 
| 376 |  | 
| 377 | 			size_t j = 1; | 
| 378 | 			for ( ; i+j < reorder_add_index_cpu.size() && reorder_add_index_cpu.get(i+j).id == start ; j++) | 
| 379 | 			{ | 
| 380 | 				auto ev = vector_data.template get<reduction_type::prop::value>(i+j); | 
| 381 | 				cpu_block_process<reduction_type,impl+1>::process(ev,red); | 
| 382 | 				//reduction_type::red(red,vector_data.template get<reduction_type::prop::value>(i+j)); | 
| 383 | 			} | 
| 384 |  | 
| 385 | 			vector_data_red.add(); | 
| 386 |  | 
| 387 | 			for (size_t k = 0 ; k < N1 ; k++) | 
| 388 | 			{ | 
| 389 | 				vector_data_red.template get<reduction_type::prop::value>(vector_data_red.size()-1)[k] = red[k]; | 
| 390 | 			} | 
| 391 |  | 
| 392 | 			if (T::value == 0) | 
| 393 | 			{ | 
| 394 | 				vector_index.add(); | 
| 395 | 				vector_index.template get<0>(vector_index.size() - 1) = reorder_add_index_cpu.get(i).id; | 
| 396 | 			} | 
| 397 |  | 
| 398 | 			i += j; | 
| 399 | 		} | 
| 400 | 	}; | 
| 401 |  | 
| 402 | 	/*! \brief this class is a functor for "for_each" algorithm | 
| 403 | 	 * | 
| 404 | 	 * This class is a functor for "for_each" algorithm. For each | 
| 405 | 	 * element of the boost::vector the operator() is called. | 
| 406 | 	 * Is mainly used to copy one encap into another encap object | 
| 407 | 	 * | 
| 408 | 	 * \tparam encap source | 
| 409 | 	 * \tparam encap dst | 
| 410 | 	 * | 
| 411 | 	 */ | 
| 412 | 	template<typename vector_data_type, | 
| 413 | 			typename vector_index_type, | 
| 414 | 	        typename vector_index_type_reo, | 
| 415 | 	        typename vector_reduction, | 
| 416 | 	        unsigned int impl> | 
| 417 | 	struct sparse_vector_reduction_cpu | 
| 418 | 	{ | 
| 419 | 		//! Vector in which to the reduction | 
| 420 | 		vector_data_type & vector_data_red; | 
| 421 |  | 
| 422 | 		//! Vector in which to the reduction | 
| 423 | 		vector_data_type & vector_data; | 
| 424 |  | 
| 425 | 		//! reorder vector index | 
| 426 | 		vector_index_type_reo & reorder_add_index_cpu; | 
| 427 |  | 
| 428 | 		//! Index type vector | 
| 429 | 		vector_index_type & vector_index; | 
| 430 |  | 
| 431 | 		/*! \brief constructor | 
| 432 | 		 * | 
| 433 | 		 * \param src source encapsulated object | 
| 434 | 		 * \param dst source encapsulated object | 
| 435 | 		 * | 
| 436 | 		 */ | 
| 437 | 		inline sparse_vector_reduction_cpu(vector_data_type & vector_data_red, | 
| 438 | 									   vector_data_type & vector_data, | 
| 439 | 									   vector_index_type & vector_index, | 
| 440 | 									   vector_index_type_reo & reorder_add_index_cpu) | 
| 441 | 		:vector_data_red(vector_data_red),vector_data(vector_data),vector_index(vector_index),reorder_add_index_cpu(reorder_add_index_cpu) | 
| 442 | 		{}; | 
| 443 |  | 
| 444 | 		//! It call the copy function for each property | 
| 445 | 		template<typename T> | 
| 446 | 		inline void operator()(T& t) const | 
| 447 | 		{ | 
| 448 |             typedef typename boost::mpl::at<vector_reduction, T>::type reduction_type; | 
| 449 |             typedef typename boost::mpl::at<typename ValueTypeOf<vector_data_type>::type,typename reduction_type::prop>::type red_type; | 
| 450 |  | 
| 451 |             if (reduction_type::is_special() == false) | 
| 452 | 			{ | 
| 453 |     			for (size_t i = 0 ; i < reorder_add_index_cpu.size() ; ) | 
| 454 |     			{ | 
| 455 |     				sparse_vector_reduction_cpu_impl<reduction_type,vector_reduction,T,impl,red_type>::red(i,vector_data_red,vector_data,vector_index,reorder_add_index_cpu); | 
| 456 |  | 
| 457 | /*    				size_t start = reorder_add_index_cpu.get(i).id; | 
| 458 |     				red_type red = vector_data.template get<reduction_type::prop::value>(i); | 
| 459 |  | 
| 460 |     				size_t j = 1; | 
| 461 |     				for ( ; i+j < reorder_add_index_cpu.size() && reorder_add_index_cpu.get(i+j).id == start ; j++) | 
| 462 |     				{ | 
| 463 |     					cpu_block_process<reduction_type,impl>::process(vector_data.template get<reduction_type::prop::value>(i+j),red); | 
| 464 |     					//reduction_type::red(red,vector_data.template get<reduction_type::prop::value>(i+j)); | 
| 465 |     				} | 
| 466 |     				vector_data_red.add(); | 
| 467 |     				vector_data_red.template get<reduction_type::prop::value>(vector_data_red.size()-1) = red; | 
| 468 |  | 
| 469 |     				if (T::value == 0) | 
| 470 |     				{ | 
| 471 |     					vector_index.add(); | 
| 472 |     					vector_index.template get<0>(vector_index.size() - 1) = reorder_add_index_cpu.get(i).id; | 
| 473 |     				} | 
| 474 |  | 
| 475 |     				i += j;*/ | 
| 476 |     			} | 
| 477 | 			} | 
| 478 | 		} | 
| 479 | 	}; | 
| 480 |  | 
| 481 | 	/*! \brief this class is a functor for "for_each" algorithm | 
| 482 | 	 * | 
| 483 | 	 * This class is a functor for "for_each" algorithm. For each | 
| 484 | 	 * element of the boost::vector the operator() is called. | 
| 485 | 	 * Is mainly used to copy one encap into another encap object | 
| 486 | 	 * | 
| 487 | 	 * \tparam encap source | 
| 488 | 	 * \tparam encap dst | 
| 489 | 	 * | 
| 490 | 	 */ | 
| 491 | 	template<typename encap_src, | 
| 492 | 			typename encap_dst, | 
| 493 | 	        typename vector_reduction> | 
| 494 | 	struct sparse_vector_reduction_solve_conflict_assign_cpu | 
| 495 | 	{ | 
| 496 | 		//! source | 
| 497 | 		encap_src & src; | 
| 498 |  | 
| 499 | 		//! destination | 
| 500 | 		encap_dst & dst; | 
| 501 |  | 
| 502 |  | 
| 503 | 		/*! \brief constructor | 
| 504 | 		 * | 
| 505 | 		 * \param src source encapsulated object | 
| 506 | 		 * \param dst source encapsulated object | 
| 507 | 		 * | 
| 508 | 		 */ | 
| 509 | 		inline sparse_vector_reduction_solve_conflict_assign_cpu(encap_src & src, encap_dst & dst) | 
| 510 | 		:src(src),dst(dst) | 
| 511 | 		{}; | 
| 512 |  | 
| 513 | 		//! It call the copy function for each property | 
| 514 | 		template<typename T> | 
| 515 | 		inline void operator()(T& t) const | 
| 516 | 		{ | 
| 517 |             typedef typename boost::mpl::at<vector_reduction, T>::type reduction_type; | 
| 518 |  | 
| 519 |             dst.template get<reduction_type::prop::value>() = src.template get<reduction_type::prop::value>(); | 
| 520 | 		} | 
| 521 | 	}; | 
| 522 |  | 
| 523 |  | 
| 524 | 	template<unsigned int impl,typename vector_reduction, typename T,typename red_type> | 
| 525 | 	struct sparse_vector_reduction_solve_conflict_reduce_cpu_impl | 
| 526 | 	{ | 
| 527 | 		template<typename encap_src, typename encap_dst> | 
| 528 | 		static inline void red(encap_src & src, encap_dst & dst) | 
| 529 | 		{ | 
| 530 | 			typedef typename boost::mpl::at<vector_reduction, T>::type reduction_type; | 
| 531 |  | 
| 532 | 			cpu_block_process<reduction_type,impl>::process(src.template get<reduction_type::prop::value>(),dst.template get<reduction_type::prop::value>()); | 
| 533 | 		} | 
| 534 | 	}; | 
| 535 |  | 
| 536 | 	template<unsigned int impl, typename vector_reduction, typename T,typename red_type, unsigned int N1> | 
| 537 | 	struct sparse_vector_reduction_solve_conflict_reduce_cpu_impl<impl,vector_reduction,T,red_type[N1]> | 
| 538 | 	{ | 
| 539 | 		template<typename encap_src, typename encap_dst> | 
| 540 | 		static inline void red(encap_src & src, encap_dst & dst) | 
| 541 | 		{ | 
| 542 |             typedef typename boost::mpl::at<vector_reduction, T>::type reduction_type; | 
| 543 |  | 
| 544 | 			auto src_e = src.template get<reduction_type::prop::value>(); | 
| 545 | 			auto dst_e = dst.template get<reduction_type::prop::value>(); | 
| 546 |  | 
| 547 | 			cpu_block_process<reduction_type,impl+1>::template process_e<N1,red_type::size>(src_e,dst_e); | 
| 548 | 		} | 
| 549 | 	}; | 
| 550 |  | 
| 551 | 	/*! \brief this class is a functor for "for_each" algorithm | 
| 552 | 	 * | 
| 553 | 	 * This class is a functor for "for_each" algorithm. For each | 
| 554 | 	 * element of the boost::vector the operator() is called. | 
| 555 | 	 * Is mainly used to copy one encap into another encap object | 
| 556 | 	 * | 
| 557 | 	 * \tparam encap source | 
| 558 | 	 * \tparam encap dst | 
| 559 | 	 * | 
| 560 | 	 */ | 
| 561 | 	template<typename encap_src, | 
| 562 | 			typename encap_dst, | 
| 563 | 	        typename vector_reduction, | 
| 564 | 	        unsigned int impl> | 
| 565 | 	struct sparse_vector_reduction_solve_conflict_reduce_cpu | 
| 566 | 	{ | 
| 567 | 		//! source | 
| 568 | 		encap_src & src; | 
| 569 |  | 
| 570 | 		//! destination | 
| 571 | 		encap_dst & dst; | 
| 572 |  | 
| 573 |  | 
| 574 | 		/*! \brief constructor | 
| 575 | 		 * | 
| 576 | 		 * \param src source encapsulated object | 
| 577 | 		 * \param dst source encapsulated object | 
| 578 | 		 * | 
| 579 | 		 */ | 
| 580 | 		inline sparse_vector_reduction_solve_conflict_reduce_cpu(encap_src & src, encap_dst & dst) | 
| 581 | 		:src(src),dst(dst) | 
| 582 | 		{}; | 
| 583 |  | 
| 584 | 		//! It call the copy function for each property | 
| 585 | 		template<typename T> | 
| 586 | 		inline void operator()(T& t) const | 
| 587 | 		{ | 
| 588 |             typedef typename boost::mpl::at<vector_reduction, T>::type reduction_type; | 
| 589 |             typedef typename boost::mpl::at<typename encap_src::T_type::type, typename reduction_type::prop>::type red_type; | 
| 590 |  | 
| 591 |             sparse_vector_reduction_solve_conflict_reduce_cpu_impl<impl,vector_reduction,T,red_type>::red(src,dst); | 
| 592 |  | 
| 593 | //            cpu_block_process<reduction_type,impl>::process(src.template get<reduction_type::prop::value>(),dst.template get<reduction_type::prop::value>()); | 
| 594 | //            reduction_type::red(dst.template get<reduction_type::prop::value>(),src.template get<reduction_type::prop::value>()); | 
| 595 | 		} | 
| 596 | 	}; | 
| 597 |  | 
	/*! \brief Functor that runs one segmented reduction per requested property
	 *
	 * For-each functor over the compile-time reduction list: for every
	 * non-special reduction it dispatches the segmented reduction to the
	 * scalar or blocked implementation switch. Requires nvcc.
	 *
	 * \tparam vector_data_type type of the data vectors
	 * \tparam vector_index_type type of the index/map vector
	 * \tparam vector_index_type2 type of the segment-offset vector
	 * \tparam vector_reduction boost::mpl list of reductions
	 * \tparam block_functor functor implementing the blocked operations
	 * \tparam impl2 implementation selector (scalar/blocked)
	 * \tparam pSegment property of segment_offset holding the offsets
	 */
	template<typename vector_data_type,
			typename vector_index_type,
	        typename vector_index_type2,
	        typename vector_reduction,
	        typename block_functor,
	        unsigned int impl2, unsigned int pSegment=1>
	struct sparse_vector_reduction
	{
		//! Vector where the reduced data are written
		vector_data_type & vector_data_red;

		//! new data
		vector_data_type & vector_data;

		//! new data in an unsorted way
		vector_data_type & vector_data_unsorted;

		//! segment of offsets
		vector_index_type2 & segment_offset;

		//! map of the data
		vector_index_type & vector_data_map;

		//! block functor
		block_functor & blf;

		//! gpu context
		mgpu::ofp_context_t & context;

		/*! \brief constructor
		 *
		 * \param vector_data_red output vector for the reduced data
		 * \param vector_data input data
		 * \param vector_data_unsorted input data in insertion order
		 * \param vector_data_map map of the data
		 * \param segment_offset segment offsets
		 * \param blf block functor
		 * \param context gpu context
		 *
		 */
		inline sparse_vector_reduction(vector_data_type & vector_data_red,
									   vector_data_type & vector_data,
									   vector_data_type & vector_data_unsorted,
									   vector_index_type & vector_data_map,
									   vector_index_type2 & segment_offset,
									   block_functor & blf,
									   mgpu::ofp_context_t & context)
		:vector_data_red(vector_data_red),
		 vector_data(vector_data),
		 vector_data_unsorted(vector_data_unsorted),
		 segment_offset(segment_offset),
		 vector_data_map(vector_data_map),
		 blf(blf),
		 context(context)
		{};

		//! Run the segmented reduction for the property selected by the T-th
		//! reduction (no-op for "special" reductions)
		template<typename T>
		inline void operator()(T& t) const
		{
#ifdef __NVCC__

            typedef typename boost::mpl::at<vector_reduction, T>::type reduction_type;
            typedef typename boost::mpl::at<typename ValueTypeOf<vector_data_type>::type,typename reduction_type::prop>::type red_type;
            if (reduction_type::is_special() == false)
			{
			    scalar_block_implementation_switch<impl2, block_functor>::template segreduce<pSegment, vector_reduction, T>(
			            vector_data,
			            vector_data_unsorted,
			            vector_data_map,
			            segment_offset,
			            vector_data_red,
			            blf,
			            context);
			}
#else
			std::cout << __FILE__ << ":"  << __LINE__ << " error: this file is supposed to be compiled with nvcc"  << std::endl;
#endif
		}
	};
| 683 |  | 
| 684 |  | 
	/*! \brief Do-nothing block functor
	 *
	 * Satisfies the block_functor interface where no blocked behaviour is
	 * required (presumably the scalar implementation path — confirm at the
	 * instantiation sites); every operation reports success without touching
	 * its arguments.
	 */
	struct stub_block_functor
	{
		//! no-op segmented reduction, always reports success
        template<unsigned int pSegment, typename vector_reduction, typename T, typename vector_index_type, typename vector_data_type>
		static bool seg_reduce(vector_index_type & segments, vector_data_type & src, vector_data_type &  dst)
		{
			return true;
		}

		//! no-op conflict resolution, always reports success
		template<typename vector_index_type, typename vector_data_type, typename ... v_reduce>
		static bool solve_conflicts(vector_index_type &keys, vector_index_type &merge_indices,
                                    vector_data_type &data1, vector_data_type &data2,
                                    vector_index_type &indices_tmp, vector_data_type &data_tmp,
                                    vector_index_type &keysOut, vector_data_type &dataOut,
                                    mgpu::ofp_context_t & context)
		{
			return true;
		}

		//! output map (never filled by this stub)
		openfpm::vector_gpu<aggregate<unsigned int>> outputMap;

		//! \return the output map
        openfpm::vector_gpu<aggregate<unsigned int>> & get_outputMap()
		{
        	return outputMap;
		}

		//! \return the output map (const)
        const openfpm::vector_gpu<aggregate<unsigned int>> & get_outputMap() const
		{
        	return outputMap;
		}
	};
| 715 |  | 
| 716 | 	/*! \brief this class is a functor for "for_each" algorithm | 
| 717 | 	 * | 
| 718 | 	 * This class is a functor for "for_each" algorithm. For each | 
| 719 | 	 * element of the boost::vector the operator() is called. | 
| 720 | 	 * Is mainly used to copy one encap into another encap object | 
| 721 | 	 * | 
| 722 | 	 * \tparam encap source | 
| 723 | 	 * \tparam encap dst | 
| 724 | 	 * | 
| 725 | 	 */ | 
| 726 | 	template<typename vector_data_type, typename vector_index_type, typename vector_reduction> | 
| 727 | 	struct sparse_vector_special | 
| 728 | 	{ | 
| 729 | 		//! Vector in which to the reduction | 
| 730 | 		vector_data_type & vector_data_red; | 
| 731 |  | 
| 732 | 		//! Vector in which to the reduction | 
| 733 | 		vector_data_type & vector_data; | 
| 734 |  | 
| 735 | 		//! segment of offsets | 
| 736 | 		vector_index_type & segment_offset; | 
| 737 |  | 
| 738 | 		//! gpu context | 
| 739 | 		mgpu::ofp_context_t & context; | 
| 740 |  | 
| 741 | 		/*! \brief constructor | 
| 742 | 		 * | 
| 743 | 		 * \param src source encapsulated object | 
| 744 | 		 * \param dst source encapsulated object | 
| 745 | 		 * | 
| 746 | 		 */ | 
| 747 | 		inline sparse_vector_special(vector_data_type & vector_data_red, | 
| 748 | 									   vector_data_type & vector_data, | 
| 749 | 									   vector_index_type & segment_offset, | 
| 750 | 									   mgpu::ofp_context_t & context) | 
| 751 | 		:vector_data_red(vector_data_red),vector_data(vector_data),segment_offset(segment_offset),context(context) | 
| 752 | 		{}; | 
| 753 |  | 
| 754 | 		//! It call the copy function for each property | 
| 755 | 		template<typename T> | 
| 756 | 		inline void operator()(T& t) const | 
| 757 | 		{ | 
| 758 | #ifdef __NVCC__ | 
| 759 |  | 
| 760 | 			typedef typename boost::mpl::at<vector_reduction,T>::type reduction_type; | 
| 761 |  | 
| 762 | 			// reduction type | 
| 763 | 			typedef typename boost::mpl::at<typename vector_data_type::value_type::type,typename reduction_type::prop>::type red_type; | 
| 764 |  | 
| 765 | 			if (reduction_type::is_special() == true) | 
| 766 | 			{ | 
| 767 | 				auto ite = segment_offset.getGPUIterator(); | 
| 768 |  | 
| 769 | 				CUDA_LAUNCH((reduce_from_offset<decltype(segment_offset.toKernel()),decltype(vector_data_red.toKernel()),reduction_type>), | 
| 770 | 															ite,segment_offset.toKernel(),vector_data_red.toKernel(),vector_data.size()); | 
| 771 | 			} | 
| 772 |  | 
| 773 | #else | 
| 774 | 			std::cout << __FILE__ << ":"  << __LINE__ << " error: this file si supposed to be compiled with nvcc"  << std::endl; | 
| 775 | #endif | 
| 776 | 		} | 
| 777 | 	}; | 
| 778 |  | 
| 779 | 	template<typename T, | 
| 780 | 			 typename Ti = long int, | 
| 781 | 			 typename Memory=HeapMemory, | 
| 782 | 			 typename layout=typename memory_traits_lin<T>::type, | 
| 783 | 			 template<typename> class layout_base=memory_traits_lin , | 
| 784 | 			 typename grow_p=grow_policy_double, | 
| 785 | 			 unsigned int impl=vect_isel<T>::value, | 
| 786 | 			 unsigned int impl2 = VECTOR_SPARSE_STANDARD, | 
| 787 | 			 typename block_functor = stub_block_functor> | 
| 788 | 	class vector_sparse | 
| 789 | 	{ | 
		//! sorted vector of the existing indexes
		vector<aggregate<Ti>,Memory,layout_base,grow_p> vct_index;
		//! data vector; resetBck() appends one extra background element at the end
		vector<T,Memory,layout_base,grow_p,impl> vct_data;
		//! support vector of merge positions
		vector<aggregate<Ti>,Memory,layout_base,grow_p> vct_m_index;

		//! indexes added since the last flush
		vector<aggregate<Ti>,Memory,layout_base,grow_p> vct_add_index;
		//! indexes removed since the last flush
		vector<aggregate<Ti>,Memory,layout_base,grow_p> vct_rem_index;
		//! number of insertions in each GPU insert pool/block
		vector<aggregate<Ti>,Memory,layout_base,grow_p> vct_nadd_index;
		//! number of removals in each GPU remove pool/block
		vector<aggregate<Ti>,Memory,layout_base,grow_p> vct_nrem_index;
		//! data added since the last flush
		vector<T,Memory,layout_base,grow_p> vct_add_data;
		//! added data reordered accordingly to the sorted added indexes
		vector<T,Memory,layout_base,grow_p> vct_add_data_reord;

		//! scratch: continuous array of added indexes
		vector<aggregate<Ti>,Memory,layout_base,grow_p> vct_add_index_cont_0;
		//! scratch: map from sorted indexes back to their original positions
		vector<aggregate<Ti>,Memory,layout_base,grow_p> vct_add_index_cont_1;
		//! scratch: continuous array of the added data
		vector<T,Memory,layout_base,grow_p> vct_add_data_cont;
		//! scratch: de-duplicated added indexes with their segment start
		vector<aggregate<Ti,Ti>,Memory,layout_base,grow_p> vct_add_index_unique;
		//! scratch: segment pairs (used by the blocked implementation)
		vector<aggregate<int,int>,Memory,layout_base,grow_p> segments_int;

		//! reduced (one entry per unique index) added data
		vector<T,Memory,layout_base,grow_p,impl> vct_add_data_unique;

		//! temporary index vectors used by the flush kernels
		vector<aggregate<Ti>,Memory,layout_base,grow_p> vct_index_tmp4;
		vector<aggregate<Ti>,Memory,layout_base,grow_p> vct_index_tmp;
		vector<aggregate<Ti>,Memory,layout_base,grow_p> vct_index_tmp2;
		vector<aggregate<Ti>,Memory,layout_base,grow_p> vct_index_tmp3;
		//! per-block counters / scanned counters used by solve_conflicts kernels
		vector<aggregate<Ti,Ti,Ti>,Memory,layout_base,grow_p> vct_index_dtmp;

		// segments map (This is used only in case of Blocked data)
		vector<aggregate<Ti>,Memory,layout_base,grow_p> vct_segment_index_map;

		//! functor implementing the blocked operations (stub_block_functor for scalars)
		block_functor blf;

		//! background element, re-appended to vct_data by resetBck()
		T bck;

		//! scratch device memory used to exchange single counters with the host
		CudaMemory mem;

		//! support vector to reorder the added elements in the CPU flush
		openfpm::vector<reorder<Ti>> reorder_add_index_cpu;

		// maximum number of elements -- NOTE(review): not used in this chunk, verify semantics
		size_t max_ele;

		//! slots per block in the GPU insert buffer (0 = insert buffer not set)
		int n_gpu_add_block_slot = 0;
		//! slots per block in the GPU remove buffer (0 = remove buffer not set)
		int n_gpu_rem_block_slot = 0;
| 830 |  | 
		/*! \brief branch-free binary search of x inside vct_index (no background fallback)
		 *
		 * \param x index value to search
		 * \param id in output the lower-bound position of x inside vct_index
		 *
		 * \return the value stored at that position, or -1 when the position falls past the end
		 */
		template<bool prefetch>
		inline Ti _branchfree_search_nobck(Ti x, Ti & id) const
		{
			if (vct_index.size() == 0)	{id = 0; return -1;}
			const Ti *base = &vct_index.template get<0>(0);
			const Ti *end = (const Ti *)vct_index.template getPointer<0>() + vct_index.size();
			// NOTE(review): n comes from vct_data, which carries one extra background
			// element at the end (see resetBck), so this presumably equals vct_index.size()
			// -- confirm the invariant holds on every call path
			Ti n = vct_data.size()-1;
			while (n > 1)
			{
				Ti half = n / 2;
				if (prefetch)
				{
					// prefetch both candidate halves of the next step to hide memory latency
					__builtin_prefetch(base + half/2, 0, 0);
					__builtin_prefetch(base + half + half/2, 0, 0);
				}
				// branchless bisection step: advance base when the probed key is below x
				base = (base[half] < x) ? base+half : base;
				n -= half;
			}

			int off = (*base < x);
			id = base - &vct_index.template get<0>(0) + off;
			// return the element at the final position, or -1 when it is one past the end
			return (base + off != end)?*(base + off):-1;
		}
| 860 |  | 
| 861 | 		/*! \brief get the element i | 
| 862 | 		 * | 
| 863 | 		 * search the element x | 
| 864 | 		 * | 
| 865 | 		 * \param i element i | 
| 866 | 		 */ | 
| 867 | 		template<bool prefetch> | 
| 868 | 		inline void _branchfree_search(Ti x, Ti & id) const | 
| 869 | 		{ | 
| 870 | 			Ti v = _branchfree_search_nobck<prefetch>(x,id); | 
| 871 | 			id = (x == v)?id:vct_data.size()-1; | 
| 872 | 		} | 
| 873 |  | 
| 874 |  | 
		/*! \brief take the indexes from the insertion pools and create a continuous array
		 *
		 * \param vct_nadd_index number of insertions of each pool
		 * \param vct_add_index pool of inserted indexes
		 * \param vct_add_cont_index output continuous array of inserted indexes
		 * \param vct_add_cont_index_map output map to the original position of each index
		 * \param vct_add_data array of added data
		 * \param vct_add_data_cont continuous array of inserted data
		 * \param context mgpu context
		 *
		 * \return the total number of inserted elements
		 *
		 */
		size_t make_continuos(vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_nadd_index,
							  vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_add_index,
							  vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_add_cont_index,
							  vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_add_cont_index_map,
							  vector<T,Memory,layout_base,grow_p> & vct_add_data,
							  vector<T,Memory,layout_base,grow_p> & vct_add_data_cont,
							  mgpu::ofp_context_t & context)
		{
#ifdef __NVCC__

			// Add 0 to the last element to vct_nadd_index
			vct_nadd_index.resize(vct_nadd_index.size()+1);
			vct_nadd_index.template get<0>(vct_nadd_index.size()-1) = 0;
			vct_nadd_index.template hostToDevice<0>(vct_nadd_index.size()-1,vct_nadd_index.size()-1);

			// Merge the list of inserted points for each block
			vct_index_tmp4.resize(vct_nadd_index.size());

			// scan the per-pool counters: vct_index_tmp4 becomes the output offset of each pool
			openfpm::scan((Ti *)vct_nadd_index.template getDeviceBuffer<0>(),
			            vct_nadd_index.size(),
			            (Ti *)vct_index_tmp4.template getDeviceBuffer<0>() ,
                        context);

			// the last scanned value is the total number of inserted elements
			vct_index_tmp4.template deviceToHost<0>(vct_index_tmp4.size()-1,vct_index_tmp4.size()-1);
			size_t n_ele = vct_index_tmp4.template get<0>(vct_index_tmp4.size()-1);

			// we reuse vct_nadd_index
			vct_add_cont_index.resize(n_ele);
			vct_add_cont_index_map.resize(n_ele);

			if (impl2 == VECTOR_SPARSE_STANDARD)
			{
				vct_add_data_cont.resize(n_ele);
			}
			else
			{
				vct_segment_index_map.resize(n_ele);
			}

			// choose the compaction kernel: one block per pool for large pools,
			// one thread per pool element for small ones
			if (n_gpu_add_block_slot >= 128)
			{
				ite_gpu<1> itew;
				itew.wthr.x = vct_nadd_index.size()-1;
				itew.wthr.y = 1;
				itew.wthr.z = 1;
				itew.thr.x = 128;
				itew.thr.y = 1;
				itew.thr.z = 1;

				CUDA_LAUNCH(construct_insert_list_key_only,itew,vct_add_index.toKernel(),
									vct_nadd_index.toKernel(),
									vct_index_tmp4.toKernel(),
									vct_add_cont_index.toKernel(),
									vct_add_cont_index_map.toKernel(),
									n_gpu_add_block_slot);
			}
			else
			{
				auto itew = vct_add_index.getGPUIterator();

				CUDA_LAUNCH(construct_insert_list_key_only_small_pool,itew,vct_add_index.toKernel(),
									vct_nadd_index.toKernel(),
									vct_index_tmp4.toKernel(),
									vct_add_cont_index.toKernel(),
									vct_add_cont_index_map.toKernel(),
									n_gpu_add_block_slot);
			}

			return n_ele;
#endif
			return 0;
		}
| 957 |  | 
| 958 | 		/*! \brief sort the continuos array of inserted key | 
| 959 | 		 * | 
| 960 | 		 * \param context modern gpu context | 
| 961 | 		 * \param vct_add_cont_index array of indexes (unsorted), as output will be sorted | 
| 962 | 		 * \param vct_add_cont_index_map reference to the original indexes | 
| 963 | 		 * \param vct_add_data_reord sorted data output | 
| 964 | 		 * \param vct_add_data_cont added data in a continuos unsorted array | 
| 965 | 		 * | 
| 966 | 		 */ | 
| 967 | 		void reorder_indexes(vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_add_cont_index, | 
| 968 | 							 vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_add_cont_index_map, | 
| 969 | 							 vector<T,Memory,layout_base,grow_p> & vct_add_data_reord, | 
| 970 | 							 vector<T,Memory,layout_base,grow_p> & vct_add_data_cont, | 
| 971 | 							 mgpu::ofp_context_t & context) | 
| 972 | 		{ | 
| 973 | #ifdef __NVCC__ | 
| 974 | 			ite_gpu<1> itew; | 
| 975 | 			itew.wthr.x = vct_nadd_index.size()-1; | 
| 976 | 			itew.wthr.y = 1; | 
| 977 | 			itew.wthr.z = 1; | 
| 978 | 			itew.thr.x = 128; | 
| 979 | 			itew.thr.y = 1; | 
| 980 | 			itew.thr.z = 1; | 
| 981 |  | 
| 982 | 			size_t n_ele = vct_add_cont_index.size(); | 
| 983 |  | 
| 984 | 			n_gpu_add_block_slot = 0; | 
| 985 |  | 
| 986 | 			// now we sort | 
| 987 | 			openfpm::sort( | 
| 988 | 			        (Ti *)vct_add_cont_index.template getDeviceBuffer<0>(), | 
| 989 |                     (Ti *)vct_add_cont_index_map.template getDeviceBuffer<0>(), | 
| 990 | 					vct_add_cont_index.size(), | 
| 991 | 					mgpu::template less_t<Ti>(), | 
| 992 |                     context); | 
| 993 |  | 
| 994 | 			auto ite = vct_add_cont_index.getGPUIterator(); | 
| 995 |  | 
| 996 | 			// Now we reorder the data vector accordingly to the indexes | 
| 997 |  | 
| 998 | 			if (impl2 == VECTOR_SPARSE_STANDARD) | 
| 999 | 			{ | 
| 1000 | 				vct_add_data_reord.resize(n_ele); | 
| 1001 | 				CUDA_LAUNCH(reorder_vector_data,ite,vct_add_cont_index_map.toKernel(),vct_add_data_cont.toKernel(),vct_add_data_reord.toKernel()); | 
| 1002 | 			} | 
| 1003 |  | 
| 1004 | #endif | 
| 1005 | 		} | 
| 1006 |  | 
| 1007 | 		/*! \brief Merge indexes | 
| 1008 | 		 * | 
| 1009 | 		 * \param | 
| 1010 | 		 * | 
| 1011 | 		 * | 
| 1012 | 		 */ | 
| 1013 | 		template<typename ... v_reduce> | 
| 1014 | 		void merge_indexes(vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_add_index_sort, | 
| 1015 | 						   vector<aggregate<Ti,Ti>,Memory,layout_base,grow_p> & vct_add_index_unique, | 
| 1016 | 				  	  	   vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_merge_index, | 
| 1017 | 				  	  	   vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_merge_index_map, | 
| 1018 | 				  	  	   mgpu::ofp_context_t & context) | 
| 1019 | 		{ | 
| 1020 | #ifdef __NVCC__ | 
| 1021 |  | 
| 1022 | 			typedef boost::mpl::vector<v_reduce...> vv_reduce; | 
| 1023 |  | 
| 1024 | 			auto ite = vct_add_index_sort.getGPUIterator(); | 
| 1025 |  | 
| 1026 | 			mem.allocate(sizeof(int)); | 
| 1027 | 			mem.fill(0); | 
| 1028 | 			vct_add_index_unique.resize(vct_add_index_sort.size()+1); | 
| 1029 |  | 
| 1030 | 			ite = vct_add_index_sort.getGPUIterator(); | 
| 1031 |  | 
| 1032 | 			vct_index_tmp4.resize(vct_add_index_sort.size()+1); | 
| 1033 |  | 
| 1034 | 			CUDA_LAUNCH( | 
| 1035 | 					( | 
| 1036 | 						find_buffer_offsets_for_scan | 
| 1037 | 								<0, | 
| 1038 | 								decltype(vct_add_index_sort.toKernel()), | 
| 1039 | 								decltype(vct_index_tmp4.toKernel()) | 
| 1040 | 								> | 
| 1041 | 					), | 
| 1042 | 					ite, | 
| 1043 | 					vct_add_index_sort.toKernel(), | 
| 1044 | 					vct_index_tmp4.toKernel()); | 
| 1045 |  | 
| 1046 | 			openfpm::scan((Ti *)vct_index_tmp4.template getDeviceBuffer<0>(),vct_index_tmp4.size(),(Ti *)vct_index_tmp4.template getDeviceBuffer<0>(),context); | 
| 1047 |  | 
| 1048 | 			vct_index_tmp4.template deviceToHost<0>(vct_index_tmp4.size()-1,vct_index_tmp4.size()-1); | 
| 1049 | 			int n_ele_unique = vct_index_tmp4.template get<0>(vct_index_tmp4.size()-1); | 
| 1050 |  | 
| 1051 | 			vct_add_index_unique.resize(n_ele_unique); | 
| 1052 |  | 
| 1053 | 			if (impl2 == VECTOR_SPARSE_STANDARD) | 
| 1054 | 			{ | 
| 1055 | 				vct_add_data_unique.resize(n_ele_unique); | 
| 1056 | 			} | 
| 1057 |  | 
| 1058 | 			CUDA_LAUNCH( | 
| 1059 | 					(construct_index_unique<0>), | 
| 1060 | 					ite, | 
| 1061 | 					vct_add_index_sort.toKernel(), | 
| 1062 | 					vct_index_tmp4.toKernel(), | 
| 1063 | 					vct_add_index_unique.toKernel()); | 
| 1064 |  | 
| 1065 | 			typedef boost::mpl::vector<v_reduce...> vv_reduce; | 
| 1066 |  | 
| 1067 | 			// Then we merge the two list vct_index and vct_add_index_unique | 
| 1068 |  | 
| 1069 | 			// index to get merge index | 
| 1070 | 			vct_m_index.resize(vct_index.size()); | 
| 1071 |  | 
| 1072 | 			if (vct_m_index.size() != 0) | 
| 1073 | 			{ | 
| 1074 | 				ite = vct_m_index.getGPUIterator(); | 
| 1075 | 				CUDA_LAUNCH((set_indexes<0>),ite,vct_m_index.toKernel(),0); | 
| 1076 | 			} | 
| 1077 |  | 
| 1078 | 			// after merge we solve the last conflicts, running across the vector again and spitting 1 when there is something to merge | 
| 1079 | 			// we reorder the data array also | 
| 1080 |  | 
| 1081 | 			vct_merge_index.resize(vct_index.size() + vct_add_index_unique.size()); | 
| 1082 | 			vct_merge_index_map.resize(vct_index.size() + vct_add_index_unique.size()); | 
| 1083 | 			vct_index_tmp3.resize(vct_index.size() + vct_add_index_unique.size()); | 
| 1084 |  | 
| 1085 | 			// Do not delete this reserve | 
| 1086 | 			// Unfortunately all resize with DataBlocks are broken | 
| 1087 | 			if (impl2 == VECTOR_SPARSE_STANDARD) | 
| 1088 | 			{ | 
| 1089 | 				vct_add_data_cont.reserve(vct_index.size() + vct_add_index_unique.size()+1); | 
| 1090 | 				vct_add_data_cont.resize(vct_index.size() + vct_add_index_unique.size()); | 
| 1091 | 			} | 
| 1092 |  | 
| 1093 | 			ite = vct_add_index_unique.getGPUIterator(); | 
| 1094 | 			vct_index_tmp4.resize(vct_add_index_unique.size()); | 
| 1095 | 			CUDA_LAUNCH((set_indexes<0>),ite,vct_index_tmp4.toKernel(),vct_index.size()); | 
| 1096 |  | 
| 1097 | 			ite_gpu<1> itew; | 
| 1098 |  | 
| 1099 | 			itew.wthr.x = vct_merge_index.size() / 128 + (vct_merge_index.size() % 128 != 0); | 
| 1100 | 			itew.wthr.y = 1; | 
| 1101 | 			itew.wthr.z = 1; | 
| 1102 | 			itew.thr.x = 128; | 
| 1103 | 			itew.thr.y = 1; | 
| 1104 | 			itew.thr.z = 1; | 
| 1105 |  | 
| 1106 | 			vct_index_dtmp.resize(itew.wthr.x); | 
| 1107 |  | 
| 1108 | 			// we merge with vct_index with vct_add_index_unique in vct_merge_index, vct_merge_index contain the merged indexes | 
| 1109 | 			// | 
| 1110 |  | 
| 1111 | 			mgpu::merge((Ti *)vct_index.template getDeviceBuffer<0>(),(Ti *)vct_m_index.template getDeviceBuffer<0>(),vct_index.size(), | 
| 1112 | 						(Ti *)vct_add_index_unique.template getDeviceBuffer<0>(),(Ti *)vct_index_tmp4.template getDeviceBuffer<0>(),vct_add_index_unique.size(), | 
| 1113 | 						(Ti *)vct_merge_index.template getDeviceBuffer<0>(),(Ti *)vct_merge_index_map.template getDeviceBuffer<0>(),mgpu::less_t<Ti>(),context); | 
| 1114 |  | 
| 1115 |  | 
| 1116 | #endif | 
| 1117 | 		} | 
| 1118 |  | 
| 1119 |  | 
| 1120 |  | 
		/*! \brief Reduce the added data per unique index and merge it with the existing data
		 *
		 * Runs the segmented reductions (normal ones via sparse_vector_reduction, the
		 * "special" ones via sparse_vector_special), then resolves conflicts between the
		 * existing data and the reduced added data according to impl2.
		 *
		 * \param vct_add_data_reord added data sorted accordingly to the sorted added indexes
		 * \param segments_new unique added indexes with their segment start
		 * \param vct_add_data added data in insertion order
		 * \param vct_add_data_reord_map map from sorted positions to the original positions
		 * \param context mgpu context
		 *
		 */
		template<typename ... v_reduce>
		void merge_datas(vector<T,Memory,layout_base,grow_p> & vct_add_data_reord,
						 vector<aggregate<Ti,Ti>,Memory,layout_base,grow_p> & segments_new,
						 vector<T,Memory,layout_base,grow_p> & vct_add_data,
						 vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_add_data_reord_map,
				  	  	   mgpu::ofp_context_t & context)
		{
#ifdef __NVCC__
			ite_gpu<1> itew;
			itew.wthr.x = vct_index_tmp.size() / 128 + (vct_index_tmp.size() % 128 != 0);
			itew.wthr.y = 1;
			itew.wthr.z = 1;
			itew.thr.x = 128;
			itew.thr.y = 1;
			itew.thr.z = 1;

			typedef boost::mpl::vector<v_reduce...> vv_reduce;

			////////////////////////////////////////////////////////////////////////////////////////////////////

			// Now we can do a segmented reduction
			scalar_block_implementation_switch<impl2, block_functor>
			        ::template extendSegments<1>(vct_add_index_unique, vct_add_data_reord_map.size());

			if (impl2 == VECTOR_SPARSE_STANDARD)
			{
				// apply every non-special reduction operation, one per entry of vv_reduce
				sparse_vector_reduction<typename std::remove_reference<decltype(vct_add_data)>::type,
								    decltype(vct_add_data_reord_map),
								    decltype(vct_add_index_unique),vv_reduce,block_functor,impl2>
			        svr(
			                vct_add_data_unique,
			                vct_add_data_reord,
			                vct_add_data,
			                vct_add_data_reord_map,
			                vct_add_index_unique,
			                blf,
			                context);

				boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(v_reduce)>>(svr);
			}

			// apply the "special" reduction operations
			sparse_vector_special<typename std::remove_reference<decltype(vct_add_data)>::type,
								  decltype(vct_add_index_unique),
								  vv_reduce> svr2(vct_add_data_unique,vct_add_data_reord,vct_add_index_unique,context);
			boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(v_reduce)>>(svr2);

			//////////////////////////////////////////////////////////////////////////////////////////////////////

            // Now perform the right solve_conflicts according to impl2
            scalar_block_implementation_switch<impl2, block_functor>::template solveConflicts<
                    decltype(vct_data),
                    decltype(vct_index),
                    decltype(segments_new),
                    decltype(vct_index_dtmp),
                    Ti,
                    v_reduce ...
                    >
                    (
                        vct_index,
                        vct_index_tmp,
                        vct_index_tmp2,
                        vct_index_tmp3,
                        vct_index_dtmp,
                        vct_add_data_reord_map,
                        segments_new,
                        vct_data,
                        vct_add_data,
                        vct_add_data_unique,
                        vct_add_data_cont,
                        itew,
                        blf,
                        context
                    );


#else
			std::cout << __FILE__ << ":"  << __LINE__ << " error: you are supposed to compile this file with nvcc, if you want to use it with gpu"  << std::endl;
#endif
		}
| 1200 |  | 
		/*! \brief Flush the device insert buffers into the sparse structure
		 *
		 * Pipeline: compact the per-block insert pools into a continuous array,
		 * sort it, merge the unique indexes with the existing ones, then reduce
		 * and merge the data.
		 *
		 * \param vct_add_index_cont_0 scratch vector for the continuous/sorted added indexes
		 * \param vct_add_index_cont_1 scratch vector for the map to the original positions
		 * \param vct_add_data_reord scratch vector for the reordered added data
		 * \param context mgpu context
		 *
		 */
		template<typename ... v_reduce>
		void flush_on_gpu_insert(vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_add_index_cont_0,
				  vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_add_index_cont_1,
				  vector<T,Memory,layout_base,grow_p> & vct_add_data_reord,
				  mgpu::ofp_context_t & context)
		{
#ifdef __NVCC__

			// To avoid the case where you never called setGPUInsertBuffer
			if (n_gpu_add_block_slot == 0 || vct_add_index.size() == 0)
			{
				return;
			}

			size_t n_ele = make_continuos(vct_nadd_index,vct_add_index,vct_add_index_cont_0,vct_add_index_cont_1,
										  vct_add_data,vct_add_data_cont,context);

            // At this point we can check whether we have not inserted anything actually,
            // in this case, return without further ado...
			if (vct_add_index_cont_0.size() == 0)
            {return;}

			reorder_indexes(vct_add_index_cont_0,vct_add_index_cont_1,vct_add_data_reord,vct_add_data,context);

			merge_indexes<v_reduce ... >(vct_add_index_cont_0,vct_add_index_unique,
										 vct_index_tmp,vct_index_tmp2,
										 context);

			merge_datas<v_reduce ... >(vct_add_data_reord,vct_add_index_unique,vct_add_data,vct_add_index_cont_1,context);

#else
			std::cout << __FILE__ << ":"  << __LINE__ << " error: you are supposed to compile this file with nvcc, if you want to use it with gpu"  << std::endl;
#endif
		}
| 1235 |  | 
| 1236 |  | 
		/*! \brief Flush the device remove buffers: erase the queued indexes from the structure
		 *
		 * Pipeline: compact the per-block remove pools into a continuous array, sort it,
		 * de-duplicate it, merge it with the existing indexes, then drop every index that
		 * appears in the remove list while realigning the data vector.
		 *
		 * \param context mgpu context
		 *
		 */
		void flush_on_gpu_remove(
				  mgpu::ofp_context_t & context)
		{
#ifdef __NVCC__

			// Add 0 to the last element to vct_nadd_index
			vct_nrem_index.resize(vct_nrem_index.size()+1);
			vct_nrem_index.template get<0>(vct_nrem_index.size()-1) = 0;
			vct_nrem_index.template hostToDevice<0>(vct_nrem_index.size()-1,vct_nrem_index.size()-1);

			// Merge the list of inserted points for each block
			vct_index_tmp4.resize(vct_nrem_index.size());

			// scan the per-pool counters: vct_index_tmp4 becomes the output offset of each pool
			openfpm::scan((Ti *)vct_nrem_index.template getDeviceBuffer<0>(), vct_nrem_index.size(), (Ti *)vct_index_tmp4.template getDeviceBuffer<0>() , context);

			// the last scanned value is the total number of removed elements
			vct_index_tmp4.template deviceToHost<0>(vct_index_tmp4.size()-1,vct_index_tmp4.size()-1);
			size_t n_ele = vct_index_tmp4.template get<0>(vct_index_tmp4.size()-1);

			// we reuse vct_nadd_index
			vct_add_index_cont_0.resize(n_ele);
			vct_add_index_cont_1.resize(n_ele);

			ite_gpu<1> itew;
			itew.wthr.x = vct_nrem_index.size()-1;
			itew.wthr.y = 1;
			itew.wthr.z = 1;
			itew.thr.x = 128;
			itew.thr.y = 1;
			itew.thr.z = 1;

			// compact the remove pools into a continuous list of indexes to remove
			CUDA_LAUNCH(construct_remove_list,itew,vct_rem_index.toKernel(),
										vct_nrem_index.toKernel(),
										vct_index_tmp4.toKernel(),
										vct_add_index_cont_0.toKernel(),
										vct_add_index_cont_1.toKernel(),
										n_gpu_rem_block_slot);

			// now we sort
			openfpm::sort((Ti *)vct_add_index_cont_0.template getDeviceBuffer<0>(),(Ti *)vct_add_index_cont_1.template getDeviceBuffer<0>(),
					vct_add_index_cont_0.size(), mgpu::template less_t<Ti>(), context);

			auto ite = vct_add_index_cont_0.getGPUIterator();

			// mem holds the counter of distinct removed indexes, filled by the kernel below
			mem.allocate(sizeof(int));
			mem.fill(0);
			vct_add_index_unique.resize(vct_add_index_cont_0.size()+1);

			ite = vct_add_index_cont_0.getGPUIterator();

			// produce unique index list
			// Find the buffer bases
			CUDA_LAUNCH((find_buffer_offsets_zero<0,decltype(vct_add_index_cont_0.toKernel()),decltype(vct_add_index_unique.toKernel())>),
					    ite,
					    vct_add_index_cont_0.toKernel(),(int *)mem.getDevicePointer(),vct_add_index_unique.toKernel());

			// read back the number of distinct removed indexes
			mem.deviceToHost();
			int n_ele_unique = *(int *)mem.getPointer();

			vct_add_index_unique.resize(n_ele_unique);

			// restore the key order (the unique list is produced unordered)
			openfpm::sort((Ti *)vct_add_index_unique.template getDeviceBuffer<1>(),(Ti *)vct_add_index_unique.template getDeviceBuffer<0>(),
							vct_add_index_unique.size(),mgpu::template less_t<Ti>(),context);

			// Then we merge the two list vct_index and vct_add_index_unique

			// index to get merge index
			vct_m_index.resize(vct_index.size() + vct_add_index_unique.size());

			ite = vct_m_index.getGPUIterator();
			CUDA_LAUNCH((set_indexes<0>),ite,vct_m_index.toKernel(),0);

			// tag the removed entries with positions >= vct_index.size() to identify them after the merge
			ite = vct_add_index_unique.getGPUIterator();
			CUDA_LAUNCH((set_indexes<1>),ite,vct_add_index_unique.toKernel(),vct_index.size());

			// after merge we solve the last conflicts, running across the vector again and spitting 1 when there is something to merge
			// we reorder the data array also

			vct_index_tmp.resize(vct_index.size() + vct_add_index_unique.size());
			vct_index_tmp2.resize(vct_index.size() + vct_add_index_unique.size());

			itew.wthr.x = vct_index_tmp.size() / 128 + (vct_index_tmp.size() % 128 != 0);
			itew.wthr.y = 1;
			itew.wthr.z = 1;
			itew.thr.x = 128;
			itew.thr.y = 1;
			itew.thr.z = 1;

			vct_index_dtmp.resize(itew.wthr.x);

			// we merge with vct_index with vct_add_index_unique in vct_index_tmp, vct_intex_tmp2 contain the merging index
			//
			mgpu::merge((Ti *)vct_index.template getDeviceBuffer<0>(),(Ti *)vct_m_index.template getDeviceBuffer<0>(),vct_index.size(),
						(Ti *)vct_add_index_unique.template getDeviceBuffer<0>(),(Ti *)vct_add_index_unique.template getDeviceBuffer<1>(),vct_add_index_unique.size(),
						(Ti *)vct_index_tmp.template getDeviceBuffer<0>(),(Ti *)vct_index_tmp2.template getDeviceBuffer<0>(),mgpu::less_t<Ti>(),context);

			vct_index_tmp3.resize(128*itew.wthr.x);

			// mark the surviving entries (those not matched by a removed index)
			CUDA_LAUNCH((solve_conflicts_remove<decltype(vct_index_tmp.toKernel()),decltype(vct_index_dtmp.toKernel()),128>),
										itew,
										vct_index_tmp.toKernel(),
										 vct_index_tmp2.toKernel(),
										 vct_index_tmp3.toKernel(),
										 vct_m_index.toKernel(),
										  vct_index_dtmp.toKernel(),
										  vct_index.size());

			// we scan tmp3
			openfpm::scan((Ti*)vct_index_dtmp.template getDeviceBuffer<0>(),vct_index_dtmp.size(),(Ti *)vct_index_dtmp.template getDeviceBuffer<1>(),context);

			// get the size to resize vct_index and vct_data
			vct_index_dtmp.template deviceToHost<0,1>(vct_index_dtmp.size()-1,vct_index_dtmp.size()-1);
			int size = vct_index_dtmp.template get<1>(vct_index_dtmp.size()-1) + vct_index_dtmp.template get<0>(vct_index_dtmp.size()-1);

			vct_add_data_cont.resize(size);
			vct_index.resize(size);

			// compact the surviving indexes and their data into the final vectors
			CUDA_LAUNCH(realign_remove,itew,vct_index_tmp3.toKernel(),vct_m_index.toKernel(),vct_data.toKernel(),
								  vct_index.toKernel(),vct_add_data_cont.toKernel(),
								  vct_index_dtmp.toKernel());

			vct_data.swap(vct_add_data_cont);

#else
			std::cout << __FILE__ << ":"  << __LINE__ << " error: you are suppose to compile this file with nvcc, if you want to use it with gpu"  << std::endl;
#endif
		}
| 1363 |  | 
| 1364 | 		void resetBck() | 
| 1365 | 		{ | 
| 1366 | 			// re-add background | 
| 1367 | 			vct_data.resize(vct_data.size()+1); | 
| 1368 | 			vct_data.get(vct_data.size()-1) = bck; | 
| 1369 |  | 
| 1370 | 			htoD<decltype(vct_data)> trf(vct_data,vct_data.size()-1); | 
| 1371 | 			boost::mpl::for_each_ref< boost::mpl::range_c<int,0,T::max_prop> >(trf); | 
| 1372 | 		} | 
| 1373 |  | 
		/*! \brief Flush the insert buffers on the GPU (thin forwarder)
		 *
		 * Simply forwards to flush_on_gpu_insert with the same reduction
		 * operator pack.
		 *
		 * \tparam v_reduce pack of reduction operators, one per flushed property
		 *
		 * \param vct_add_index_cont_0 temporary index buffer
		 * \param vct_add_index_cont_1 temporary index buffer
		 * \param vct_add_data_reord reordered inserted-data buffer
		 * \param context gpu context
		 */
		template<typename ... v_reduce>
		void flush_on_gpu(vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_add_index_cont_0,
						  vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_add_index_cont_1,
						  vector<T,Memory,layout_base,grow_p> & vct_add_data_reord,
						  mgpu::ofp_context_t & context)
		{
			flush_on_gpu_insert<v_reduce ... >(vct_add_index_cont_0,vct_add_index_cont_1,vct_add_data_reord,context);
		}
| 1382 |  | 
		/*! \brief Flush the insert buffers into the main index/data arrays on the CPU
		 *
		 * Steps: (1) sort the inserted indexes (keeping their original
		 * position in id2), (2) reorder the inserted data accordingly,
		 * (3) reduce duplicated inserted indexes with the v_reduce operators
		 * producing vct_add_index_unique / vct_add_data_unique, (4) merge the
		 * reduced insertions with the existing sorted vct_index / vct_data.
		 *
		 * \tparam v_reduce pack of reduction operators, one per flushed property
		 */
		template<typename ... v_reduce>
		void flush_on_cpu()
		{
			// nothing queued: nothing to do
			if (vct_add_index.size() == 0)
			{return;}

			// First copy the added index to reorder
			reorder_add_index_cpu.resize(vct_add_index.size());
			vct_add_data_cont.resize(vct_add_index.size());

			// id = inserted index, id2 = original insertion position
			for (size_t i = 0 ; i < reorder_add_index_cpu.size() ; i++)
			{
				reorder_add_index_cpu.get(i).id = vct_add_index.template get<0>(i);
				reorder_add_index_cpu.get(i).id2 = i;
			}

			// sort by inserted index (id2 travels with it)
			reorder_add_index_cpu.sort();

			// Copy the data in the sorted order
			for (size_t i = 0 ; i < reorder_add_index_cpu.size() ; i++)
			{
				vct_add_data_cont.get(i) = vct_add_data.get(reorder_add_index_cpu.get(i).id2);
			}

			typedef boost::mpl::vector<v_reduce...> vv_reduce;

			// reduce duplicated inserted indexes into the *_unique buffers
			sparse_vector_reduction_cpu<decltype(vct_add_data),
										decltype(vct_add_index_unique),
										decltype(reorder_add_index_cpu),
										vv_reduce,
										impl2>
			        svr(vct_add_data_unique,
			        	vct_add_data_cont,
			        	vct_add_index_unique,
			        	reorder_add_index_cpu);

			boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(v_reduce)>>(svr);

			// merge the the data

			// worst-case size: no index collision between old and new
			vector<T,Memory,layout_base,grow_p,impl> vct_data_tmp;
			vector<aggregate<Ti>,Memory,layout_base,grow_p> vct_index_tmp;

			vct_data_tmp.resize(vct_data.size() + vct_add_data_unique.size());
			vct_index_tmp.resize(vct_index.size() + vct_add_index_unique.size());

			// di: cursor in the old arrays, ai: cursor in the unique inserted arrays
			Ti di = 0;
			Ti ai = 0;
			size_t i = 0;

			// classic two-way sorted merge; when an index exists on both
			// sides the two values are combined and the output shrinks by one
			for ( ; i < vct_data_tmp.size() ; i++)
			{
				// exhausted sides yield a sentinel that always loses the comparison
				Ti id_a = (ai < vct_add_index_unique.size())?vct_add_index_unique.template get<0>(ai):std::numeric_limits<Ti>::max();
				Ti id_d = (di < vct_index.size())?vct_index.template get<0>(di):std::numeric_limits<Ti>::max();

				if (  id_a <= id_d )
				{
					vct_index_tmp.template get<0>(i) = id_a;

					if (id_a == id_d)
					{
						// collision: first assign the inserted (already
						// reduced) value, then reduce it with the old value
						auto dst = vct_data_tmp.get(i);
						auto src = vct_add_data_unique.get(ai);

						sparse_vector_reduction_solve_conflict_assign_cpu<decltype(vct_data_tmp.get(i)),
																		  decltype(vct_add_data.get(ai)),
																		  vv_reduce>
						sva(src,dst);

						boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(v_reduce)>>(sva);
						ai++;

						dst = vct_data_tmp.get(i);
						src = vct_data.get(di);

						sparse_vector_reduction_solve_conflict_reduce_cpu<decltype(vct_data_tmp.get(i)),
								  	  	  	  	  	  	  	  	  	  	  decltype(vct_data.get(di)),
								  	  	  	  	  	  	  	  	  	  	  vv_reduce,
								  	  	  	  	  	  	  	  	  	  	  impl2>
						svr(src,dst);
						boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(v_reduce)>>(svr);

						di++;

						// one output slot consumed for two input slots:
						// shrink the worst-case-sized output by one
						vct_data_tmp.resize(vct_data_tmp.size()-1);
						vct_index_tmp.resize(vct_index_tmp.size()-1);
					}
					else
					{
						// inserted index only: copy the reduced inserted value
						vct_index_tmp.template get<0>(i) = vct_add_index_unique.template get<0>(ai);
						vct_data_tmp.get(i) = vct_add_data_unique.get(ai);
						ai++;
					}
				}
				else
				{
					// old index only: copy the old value
					vct_index_tmp.template get<0>(i) = vct_index.template get<0>(di);
					vct_data_tmp.get(i) = vct_data.get(di);
					di++;
				}
			}

			// publish the merged arrays
			vct_index.swap(vct_index_tmp);
			vct_data.swap(vct_data_tmp);

			// the insert buffers have been consumed
			vct_add_data.clear();
			vct_add_index.clear();
			vct_add_index_unique.clear();
			vct_add_data_unique.clear();
		}
| 1493 |  | 
| 1494 | 	public: | 
| 1495 |  | 
		/*! \brief Default constructor
		 *
		 * Starts empty; one slot is reserved in vct_data for the background
		 * element (the value returned when an index is not found).
		 */
		vector_sparse()
		:max_ele(0)
		{
			vct_data.resize(1);
		}
| 1501 |  | 
        /*! \brief Get the indices buffer
        *
        * \return a reference to the (sorted) indices buffer vct_index
        */
        auto getIndexBuffer() -> decltype(vct_index)&
        {
            return vct_index;
        }
| 1510 |  | 
        /*! \brief Get the data buffer
         *
         * \return a reference to the data buffer vct_data (its last slot holds the background)
         */
        auto getDataBuffer() -> decltype(vct_data)&
        {
            return vct_data;
        }
| 1519 |  | 
        /*! \brief Get the indices buffer (const overload)
        *
        * \return a const reference to the (sorted) indices buffer vct_index
        */
        auto getIndexBuffer() const -> const decltype(vct_index)&
        {
            return vct_index;
        }
| 1528 |  | 
        /*! \brief Get the data buffer (const overload)
         *
         * \return a const reference to the data buffer vct_data (its last slot holds the background)
         */
        auto getDataBuffer() const -> const decltype(vct_data)&
        {
            return vct_data;
        }
| 1537 |  | 
		/*! \brief Get the sparse index
		 *
		 * Get the sparse index (position inside vct_index/vct_data) of the
		 * element id
		 *
		 * \note use get_index and get to retrieve the value index associated to the sparse index
		 *
		 * \param id Element to get
		 *
		 * \return the sparse index of id; if id is not stored the result is
		 *         whatever _branchfree_search<false> reports — presumably the
		 *         background slot, TODO confirm against its definition
		 *
		 */
		inline openfpm::sparse_index<Ti> get_sparse(Ti id) const
		{
			Ti di;
			this->_branchfree_search<false>(id,di);
			openfpm::sparse_index<Ti> sid;
			sid.id = di;

			return sid;
		}
| 1558 |  | 
		/*! \brief Get an element of the vector
		 *
		 * Get an element of the vector (search by sparse index)
		 *
		 * \tparam p Property to get
		 * \param id Element to get
		 *
		 * \return property p of the element; for an id that is not stored,
		 *         _branchfree_search<false> determines the slot returned —
		 *         presumably the background slot, TODO confirm
		 *
		 */
		template <unsigned int p>
		inline auto get(Ti id) const -> decltype(vct_data.template get<p>(id))
		{
			Ti di;
			this->_branchfree_search<false>(id,di);
			return vct_data.template get<p>(di);
		}
| 1576 |  | 
		/*! \brief Get an element of the vector
		 *
		 * Get an element of the vector (whole aggregate, search by sparse index)
		 *
		 * \param id Element to get
		 *
		 * \return the element; for an id that is not stored,
		 *         _branchfree_search<false> determines the slot returned —
		 *         presumably the background slot, TODO confirm
		 *
		 */
		inline auto get(Ti id) const -> decltype(vct_data.get(id))
		{
			Ti di;
			this->_branchfree_search<false>(id,di);
			return vct_data.get(di);
		}
| 1593 |  | 
		/*! \brief resize to n elements
		 *
		 * \param n elements
		 *
		 * \note only records the logical maximum number of elements
		 *       (max_ele); no storage is allocated here
		 *
		 */
		void resize(size_t n)
		{
			max_ele = n;
		}
| 1603 |  | 
		/*! \brief Swap the internal index vector with iv
		 *
		 * \warning After using this function to move out the vector of the indexes, this object become useless and
		 *          must be destroyed
		 *
		 * \param iv vector to exchange with the internal index vector
		 *
		 */
		void swapIndexVector(vector<aggregate<Ti>,Memory,layout_base,grow_p> & iv)
		{
			vct_index.swap(iv);
		}
| 1616 |  | 
		/*! \brief Get the background value (what get returns when the index is not found)
		 *
		 * \tparam p property
		 *
		 * \return reference to property p of the background element (last slot of vct_data)
		 *
		 */
		template <unsigned int p>
		auto getBackground() const -> decltype(vct_data.template get<p>(vct_data.size()-1))
		{
			return vct_data.template get<p>(vct_data.size()-1);
		}
| 1627 |  | 
		/*! \brief Get the background element (what get returns when the index is not found)
		 *
		 * \return reference to the background element (last slot of vct_data)
		 *
		 */
		auto getBackground() const -> decltype(vct_data.get(vct_data.size()-1))
		{
			return vct_data.get(vct_data.size()-1);
		}
| 1637 |  | 
	    /*! \brief Set the background value for property p (what get returns when the index is not found)
	     *
	     * The value is copied into the last slot of vct_data (host side),
	     * transferred to the device, and cached in the bck member.
	     *
	     * \tparam p property to set
	     * \param bck_ background value for property p
	     */
	    template<unsigned int p>
	    void setBackground(const typename boost::mpl::at<typename T::type, boost::mpl::int_<p>>::type & bck_)
	    {
	    	meta_copy_d<typename boost::mpl::at<typename T::type, boost::mpl::int_<p>>::type,
	    				typename std::remove_reference<decltype(vct_data.template get<p>(vct_data.size()-1))>::type>
	    				::meta_copy_d_(bck_,vct_data.template get<p>(vct_data.size()-1));

	    	// keep the device copy of the background slot in sync
	    	vct_data.template hostToDevice<p>(vct_data.size()-1,vct_data.size()-1);

	    	meta_copy<typename boost::mpl::at<typename T::type, boost::mpl::int_<p>>::type>
	    				::meta_copy_(bck_,bck.template get<p>());
	    }
| 1650 |  | 
| 1651 | 		/*! \brief It insert an element in the sparse vector | 
| 1652 | 		 * | 
| 1653 | 		 * \tparam p property id | 
| 1654 | 		 * | 
| 1655 | 		 * \param ele element id | 
| 1656 | 		 * | 
| 1657 | 		 */ | 
| 1658 | 		template <unsigned int p> | 
| 1659 | 		auto insert(Ti ele) -> decltype(vct_data.template get<p>(0)) | 
| 1660 | 		{ | 
| 1661 | 			vct_add_index.add(); | 
| 1662 | 			vct_add_index.template get<0>(vct_add_index.size()-1) = ele; | 
| 1663 | 			vct_add_data.add(); | 
| 1664 | 			return vct_add_data.template get<p>(vct_add_data.size()-1); | 
| 1665 | 		} | 
| 1666 |  | 
| 1667 | 		/*! \brief It insert an element in the sparse vector | 
| 1668 | 		 * | 
| 1669 | 		 * \tparam p property id | 
| 1670 | 		 * | 
| 1671 | 		 * \param ele element id | 
| 1672 | 		 * | 
| 1673 | 		 */ | 
| 1674 | 		template <unsigned int p> | 
| 1675 | 		auto insertFlush(Ti ele) -> decltype(vct_data.template get<p>(0)) | 
| 1676 | 		{ | 
| 1677 | 			size_t di; | 
| 1678 |  | 
| 1679 | 			// first we have to search if the block exist | 
| 1680 | 			Ti v = _branchfree_search_nobck(ele,di); | 
| 1681 |  | 
| 1682 | 			if (v == ele) | 
| 1683 | 			{ | 
| 1684 | 				// block exist | 
| 1685 | 				return vct_data.template get<p>(di); | 
| 1686 | 			} | 
| 1687 |  | 
| 1688 | 			// It does not exist, we create it di contain the index where we have to create the new block | 
| 1689 | 			vct_index.insert(di); | 
| 1690 | 			vct_data.isert(di); | 
| 1691 |  | 
| 1692 | 			return vct_data.template get<p>(di); | 
| 1693 | 		} | 
| 1694 |  | 
| 1695 | 		/*! \brief It insert an element in the sparse vector | 
| 1696 | 		 * | 
| 1697 | 		 * \param ele element id | 
| 1698 | 		 * | 
| 1699 | 		 */ | 
| 1700 | 		auto insertFlush(Ti ele) -> decltype(vct_data.get(0)) | 
| 1701 | 		{ | 
| 1702 | 			Ti di; | 
| 1703 |  | 
| 1704 | 			// first we have to search if the block exist | 
| 1705 | 			Ti v = _branchfree_search_nobck<true>(ele,di); | 
| 1706 |  | 
| 1707 | 			if (v == ele) | 
| 1708 | 			{ | 
| 1709 | 				// block exist | 
| 1710 | 				return vct_data.get(di); | 
| 1711 | 			} | 
| 1712 |  | 
| 1713 | 			// It does not exist, we create it di contain the index where we have to create the new block | 
| 1714 | 			vct_index.insert(di); | 
| 1715 | 			vct_data.insert(di); | 
| 1716 |  | 
| 1717 | 			vct_index.template get<0>(di) = ele; | 
| 1718 |  | 
| 1719 | 			return vct_data.get(di); | 
| 1720 | 		} | 
| 1721 |  | 
| 1722 | 		/*! \brief It insert an element in the sparse vector | 
| 1723 | 		 * | 
| 1724 | 		 * \param ele element id | 
| 1725 | 		 * | 
| 1726 | 		 */ | 
| 1727 | 		auto insert(Ti ele) -> decltype(vct_data.get(0)) | 
| 1728 | 		{ | 
| 1729 | 			vct_add_index.add(); | 
| 1730 | 			vct_add_index.template get<0>(vct_add_index.size()-1) = ele; | 
| 1731 | 			vct_add_data.add(); | 
| 1732 | 			return vct_add_data.get(vct_add_data.size()-1); | 
| 1733 | 		} | 
| 1734 |  | 
		/*! \brief merge the added element to the main data array but save the insert buffer in v
		 *
		 * \param vct_add_index_cont_0 insert buffer (indexes) handed to the gpu flush
		 *
		 * \param context gpu context
		 *
		 * \param opt flush on host (default) or device
		 *
		 * \param i forwarded to the gpu flush
		 *          NOTE(review): the flush_on_gpu overload visible in this file
		 *          takes no such argument — an overload accepting it must exist
		 *          elsewhere, verify
		 *
		 */
		template<typename ... v_reduce>
		void flush_v(vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_add_index_cont_0,
				     mgpu::ofp_context_t & context,
				     flush_type opt = FLUSH_ON_HOST,
				     int i = 0)
		{
			// Eliminate background (it is re-added by resetBck at the end)
			vct_data.resize(vct_index.size());

			if (opt & flush_type::FLUSH_ON_DEVICE)
			{this->flush_on_gpu<v_reduce ... >(vct_add_index_cont_0,vct_add_index_cont_1,vct_add_data_reord,context,i);}
			else
			{this->flush_on_cpu<v_reduce ... >();}

			resetBck();
		}
| 1758 |  | 
		/*! \brief merge the added element to the main data array but save the insert buffer in v
		 *
		 * \param vct_add_data_reord insert buffer (reordered data) handed to the gpu flush
		 *
		 * \param context gpu context
		 *
		 * \param opt flush on host (default) or device
		 *
		 */
		template<typename ... v_reduce>
		void flush_vd(vector<T,Memory,layout_base,grow_p> & vct_add_data_reord,
				     mgpu::ofp_context_t & context,
				     flush_type opt = FLUSH_ON_HOST)
		{
			// Eliminate background (it is re-added by resetBck at the end)
			vct_data.resize(vct_index.size());

			if (opt & flush_type::FLUSH_ON_DEVICE)
			{this->flush_on_gpu<v_reduce ... >(vct_add_index_cont_0,vct_add_index_cont_1,vct_add_data_reord,context);}
			else
			{this->flush_on_cpu<v_reduce ... >();}

			resetBck();
		}
| 1781 |  | 
		/*! \brief merge the added element to the main data array
		 *
		 * \tparam v_reduce pack of reduction operators, one per flushed property
		 *
		 * \param context gpu context
		 *
		 * \param opt flush on host (default) or device
		 *
		 */
		template<typename ... v_reduce>
		void flush(mgpu::ofp_context_t & context, flush_type opt = FLUSH_ON_HOST)
		{
			// Eliminate background (it is re-added by resetBck at the end)
			vct_data.resize(vct_index.size());

			if (opt & flush_type::FLUSH_ON_DEVICE)
			{this->flush_on_gpu<v_reduce ... >(vct_add_index_cont_0,vct_add_index_cont_1,vct_add_data_reord,context);}
			else
			{this->flush_on_cpu<v_reduce ... >();}

			resetBck();
		}
| 1800 |  | 
		/*! \brief merge the removed elements to the main data array
		 *
		 * \param context gpu context
		 *
		 * \param opt flush on host or device; only the device path is
		 *            implemented — the host path prints an error to stderr
		 *            and continues without removing anything
		 *
		 */
		void flush_remove(mgpu::ofp_context_t & context, flush_type opt = FLUSH_ON_HOST)
		{
			// drop the background slot (re-added by resetBck at the end)
			vct_data.resize(vct_data.size()-1);

			if (opt & flush_type::FLUSH_ON_DEVICE)
			{this->flush_on_gpu_remove(context);}
			else
			{
				std::cerr << __FILE__ << ":"  << __LINE__ << " error, flush_remove on CPU has not implemented yet" ;
			}

			resetBck();
		}
| 1819 |  | 
		/*! \brief Return how many element you have in this map
		 *
		 * \return the number of stored elements (size of vct_index; the
		 *         background slot in vct_data is not counted)
		 */
		size_t size()
		{
			return vct_index.size();
		}
| 1828 |  | 
		/*! \brief Return the sorted vector of the indexes
		 *
		 * \return a reference to the internal (sorted) index vector
		 */
		vector<aggregate<Ti>,Memory,layout_base,grow_p> &
		private_get_vct_index()
		{
			return vct_index;
		}
| 1838 |  | 
		/*! \brief Transfer from device to host
		 *
		 * \tparam prp set of properties to transfer to host
		 *
		 * \note the index buffer is always transferred in full
		 *
		 */
		template<unsigned int ... prp>
		void deviceToHost()
		{
			vct_index.template deviceToHost<0>();
			vct_data.template deviceToHost<prp...>();
		}
| 1850 |  | 
        /*! \brief Transfer from host to device
         *
         * \tparam prp set of properties to transfer to device
         *
         * \note the index buffer is always transferred in full
         *
         */
        template<unsigned int ... prp>
        void hostToDevice()
        {
            vct_index.template hostToDevice<0>();
            vct_data.template hostToDevice<prp...>();
        }
| 1862 |  | 
		/*! \brief toKernel function transform this structure into one that can be used on GPU
		 *
		 * The returned object wraps the device views of the main and
		 * insert/remove buffers plus the per-block slot counts.
		 *
		 * \return structure that can be used on GPU
		 *
		 */
		vector_sparse_gpu_ker<T,Ti,layout_base> toKernel()
		{
			vector_sparse_gpu_ker<T,Ti,layout_base> mvsck(vct_index.toKernel(),vct_data.toKernel(),
														  vct_add_index.toKernel(),
														  vct_rem_index.toKernel(),vct_add_data.toKernel(),
														  vct_nadd_index.toKernel(),
														  vct_nrem_index.toKernel(),
														  n_gpu_add_block_slot,
														  n_gpu_rem_block_slot);

			return mvsck;
		}
| 1880 |  | 
		/*! \brief set the gpu insert buffer for every block
		 *
		 * Sizes the insert buffers to nblock*nslot entries and zeroes the
		 * per-block insertion counters.
		 *
		 * \param nblock number of blocks
		 * \param nslot number of slots free for each block
		 *
		 * \note nblock*nslot is an int multiplication — presumably buffers
		 *       never approach INT_MAX entries, TODO confirm
		 *
		 */
		void setGPUInsertBuffer(int nblock, int nslot)
		{
			vct_add_index.resize(nblock*nslot);
			vct_nadd_index.resize(nblock);
			vct_add_data.resize(nblock*nslot);
			n_gpu_add_block_slot = nslot;
			vct_nadd_index.template fill<0>(0);
		}
| 1895 |  | 
		/*! \brief In case we manually set the added index buffer and the add data buffer we have to call this
		 *         function before flush
		 *
		 * Marks every slot of vct_nadd_index as containing one insertion
		 * (set_one_insert_buffer kernel) and sets one slot per block.
		 *
		 */
		void preFlush()
		{
#ifdef __NVCC__
			vct_nadd_index.resize(vct_add_index.size());

			if (vct_nadd_index.size() != 0)
			{
				auto ite = vct_nadd_index.getGPUIterator();
				CUDA_LAUNCH((set_one_insert_buffer),ite,vct_nadd_index.toKernel());
			}
			n_gpu_add_block_slot = 1;
#endif
		}
| 1914 |  | 
        /*! \brief Get the GPU insert buffer
         *
         * \return the reference to the GPU insert data buffer (vct_add_data)
         */
        auto getGPUInsertBuffer() -> decltype(vct_add_data)&
        {
            return vct_add_data;
        }
| 1923 |  | 
		/*! \brief set the gpu remove buffer for every block
		 *
		 * Sizes the remove buffer to nblock*nslot entries and zeroes the
		 * per-block removal counters.
		 *
		 * \param nblock number of blocks
		 * \param nslot number of slots free for each block
		 *
		 */
		void setGPURemoveBuffer(int nblock, int nslot)
		{
			vct_rem_index.resize(nblock*nslot);
			vct_nrem_index.resize(nblock);
			n_gpu_rem_block_slot = nslot;
			vct_nrem_index.template fill<0>(0);
		}
| 1937 |  | 
#ifdef CUDA_GPU

		/*! \brief Get iterator over the stored elements
		 *
		 * \return a GPU iterator over the index buffer (background excluded)
		 *
		 */
		auto getGPUIterator() -> decltype(vct_index.getGPUIterator())
		{
			return vct_index.getGPUIterator();
		}

#endif
| 1951 |  | 
| 1952 | 		/*! \brief Clear all from all the elements | 
| 1953 | 		 * | 
| 1954 | 		 * | 
| 1955 | 		 */ | 
| 1956 | 		void clear() | 
| 1957 | 		{ | 
| 1958 | 			vct_data.clear(); | 
| 1959 | 			vct_index.clear(); | 
| 1960 | 			vct_add_index.clear(); | 
| 1961 | 			vct_add_data.clear(); | 
| 1962 |  | 
| 1963 | 			// re-add background | 
| 1964 | 			vct_data.resize(vct_data.size()+1); | 
| 1965 | 			vct_data.get(vct_data.size()-1) = bck; | 
| 1966 |  | 
| 1967 | 			htoD<decltype(vct_data)> trf(vct_data,vct_data.size()-1); | 
| 1968 | 			boost::mpl::for_each_ref< boost::mpl::range_c<int,0,T::max_prop> >(trf); | 
| 1969 |  | 
| 1970 | 			max_ele = 0; | 
| 1971 | 			n_gpu_add_block_slot = 0; | 
| 1972 | 			n_gpu_rem_block_slot = 0; | 
| 1973 | 		} | 
| 1974 |  | 
| 1975 | 		void swap(vector_sparse<T,Ti,Memory,layout,layout_base,grow_p,impl> & sp) | 
| 1976 | 		{ | 
| 1977 | 			vct_data.swap(sp.vct_data); | 
| 1978 | 			vct_index.swap(sp.vct_index); | 
| 1979 | 			vct_add_index.swap(sp.vct_add_index); | 
| 1980 | 			vct_add_data.swap(sp.vct_add_data); | 
| 1981 |  | 
| 1982 | 			size_t max_ele_ = sp.max_ele; | 
| 1983 | 			sp.max_ele = max_ele; | 
| 1984 | 			this->max_ele = max_ele_; | 
| 1985 | 		} | 
| 1986 |  | 
		//! \return a reference to the insert data buffer
		vector<T,Memory,layout_base,grow_p> & private_get_vct_add_data()
		{
			return vct_add_data;
		}
| 1991 |  | 
		//! \return a reference to the insert index buffer
		vector<aggregate<Ti>,Memory,layout_base,grow_p> & private_get_vct_add_index()
		{
			return vct_add_index;
		}
| 1996 |  | 
		//! \return a const reference to the insert index buffer
		const vector<aggregate<Ti>,Memory,layout_base,grow_p> & private_get_vct_add_index() const
		{
			return vct_add_index;
		}
| 2001 |  | 
		//! \return a reference to the per-block insert counters
		vector<aggregate<Ti>,Memory,layout_base,grow_p> & private_get_vct_nadd_index()
		{
			return vct_nadd_index;
		}
| 2006 |  | 
		//! \return a const reference to the per-block insert counters
		const vector<aggregate<Ti>,Memory,layout_base,grow_p> & private_get_vct_nadd_index() const
		{
			return vct_nadd_index;
		}
| 2011 |  | 
		//! \return the segment-to-output map produced by the block functor blf
		auto getSegmentToOutMap() -> decltype(blf.get_outputMap())
		{
			return blf.get_outputMap();
		}
| 2016 |  | 
		//! \return the segment-to-output map produced by the block functor blf
		//! NOTE(review): const overload calls blf.get_outputMap() on a const blf —
		//! compiles only if that method is const-qualified, verify
		auto getSegmentToOutMap() const -> decltype(blf.get_outputMap())
		{
			return blf.get_outputMap();
		}
| 2021 |  | 
| 2022 | 		/*! \brief Eliminate many internal temporary buffer you can use this between flushes if you get some out of memory | 
| 2023 | 		 * | 
| 2024 | 		 * | 
| 2025 | 		 */ | 
| 2026 | 		void removeUnusedBuffers() | 
| 2027 | 		{ | 
| 2028 | 			vct_add_data.resize(0); | 
| 2029 | 			vct_add_data.shrink_to_fit(); | 
| 2030 |  | 
| 2031 | 			vct_add_data.resize(0); | 
| 2032 | 			vct_add_data.shrink_to_fit(); | 
| 2033 |  | 
| 2034 | 			vct_add_data_reord.resize(0); | 
| 2035 | 			vct_add_data_reord.shrink_to_fit(); | 
| 2036 |  | 
| 2037 | 			vct_add_data_cont.resize(0); | 
| 2038 | 			vct_add_data_cont.shrink_to_fit(); | 
| 2039 |  | 
| 2040 | 			vct_add_data_unique.resize(0); | 
| 2041 | 			vct_add_data_unique.shrink_to_fit(); | 
| 2042 | 		} | 
| 2043 |  | 
		/*! \brief Return the offsets of the segments for the merge indexes
		 *
		 * \return a reference to vct_add_index_unique
		 *
		 */
		vector<aggregate<Ti,Ti>,Memory,layout_base,grow_p> & getSegmentToMergeIndexMap()
		{
			return vct_add_index_unique;
		}
| 2052 |  | 
| 2053 | 		vector<aggregate<Ti,Ti>,Memory,layout_base,grow_p> & getSegmentToMergeIndexMap() const | 
| 2054 | 		{ | 
| 2055 | 			return vct_add_index_unique; | 
| 2056 | 		} | 
| 2057 |  | 
		/*! \brief Return the mapping vector
		 *
		 * When we add new elements this vector contain the merged old elements and new elements position
		 *
		 * For example the old vector contain
		 *
		 * Old: 5 10 35 50 66 79 (6 elements)
		 * New: 7 44 7 9 44      (5 elements)  (in order are 7 7 9 44 44)
		 *
		 * The merged indexes are (when reordered)
		 *
		 * 5 7 7 9 10 35 44 44 50 66 79
		 *
		 * The returned map contain 5 elements indicating the position of the reordered elements:
		 *
		 *  0  2  3   1   4
		 * (7)(7)(9)(44)(44)
		 *
		 * \return a reference to the mapping vector (vct_add_index_cont_1)
		 */
		vector<aggregate<Ti>,Memory,layout_base,grow_p> & getMappingVector()
		{
			return vct_add_index_cont_1;
		}
| 2080 |  | 
		/*! \brief Return the merge mapping vector
		 *
		 * When we add new elements this vector contain the merged old elements and new elements position
		 *
		 * For example the old vector contain
		 *
		 * Old: 5 10 35 50 66 79 (6 elements)
		 * New: 7 44 7 9 44      (5 elements)  (in order are 7 7 9 44 44)
		 *
		 * The merged indexes are (when reordered)
		 *
		 * 5 7 7 9 10 35 44 44 50 66 79
		 *
		 * The returned map contain 11 elements indicating the position of the reordered elements:
		 *
		 *  0  6  7  8   1   2   9  10   3   4   5
		 * (5)(7)(7)(9)(10)(35)(44)(44)(50)(66)(79)
		 *
		 * \return a reference to the merge mapping vector (vct_index_tmp2)
		 */
		vector<aggregate<Ti>,Memory,layout_base,grow_p> & getMergeIndexMapVector()
		{
			return vct_index_tmp2;
		}
| 2103 | 	}; | 
| 2104 |  | 
| 2105 |  | 
	/*! \brief Alias: sparse vector on the GPU (CudaMemory, interleaved memory layout)
	 *
	 * \tparam T aggregate of properties
	 * \tparam blockSwitch standard or block sparse vector (default: standard)
	 * \tparam block_functor functor used for block flushes
	 * \tparam indexT index type (default int)
	 */
	template<typename T, unsigned int blockSwitch = VECTOR_SPARSE_STANDARD, typename block_functor = stub_block_functor, typename indexT = int>
	using vector_sparse_gpu = openfpm::vector_sparse<
	        T,
	        indexT,
	        CudaMemory,
	        typename memory_traits_inte<T>::type,
	        memory_traits_inte,
            grow_policy_double,
            vect_isel<T>::value,
            blockSwitch,
            block_functor
            >;
| 2118 |  | 
	/*! \brief Alias: block sparse vector on the GPU (CudaMemory, interleaved memory layout)
	 *
	 * Same as vector_sparse_gpu but fixed to VECTOR_SPARSE_BLOCK and with a
	 * long int index type by default.
	 *
	 * \tparam T aggregate of properties
	 * \tparam block_functor functor used for block flushes
	 * \tparam indexT index type (default long int)
	 */
	template<typename T, typename block_functor = stub_block_functor, typename indexT = long int>
	using vector_sparse_gpu_block = openfpm::vector_sparse<
	        T,
	        indexT,
	        CudaMemory,
	        typename memory_traits_inte<T>::type,
	        memory_traits_inte,
            grow_policy_double,
            vect_isel<T>::value,
            VECTOR_SPARSE_BLOCK,
            block_functor
            >;
| 2131 | } | 
| 2132 |  | 
| 2133 |  | 
| 2134 |  | 
| 2135 | #endif /* MAP_VECTOR_SPARSE_HPP_ */ | 
| 2136 |  |