cudify_sequencial.hpp source code [openfpm/openfpm_devices/src/util/cudify/cudify_sequencial.hpp]

1	#ifndef CUDIFY_SEQUENCIAL_HPP_
2	#define CUDIFY_SEQUENCIAL_HPP_
3
4	#include "config.h"
5
6	#ifdef CUDA_ON_CPU
7
8	#include "cudify_hardware_common.hpp"
9
10	#ifdef HAVE_BOOST_CONTEXT
11
12	#define CUDIFY_ACTIVE
13
14	#include "util/cuda_util.hpp"
15	#include <boost/bind/bind.hpp>
16	#include <type_traits>
17	#ifdef HAVE_BOOST_CONTEXT
18	#include <boost/context/continuation.hpp>
19	#endif
20	#include <vector>
21	#include <string.h>
22
23
24	#ifndef CUDIFY_BOOST_CONTEXT_STACK_SIZE
25	#define CUDIFY_BOOST_CONTEXT_STACK_SIZE 8192
26	#endif
27
28	extern std::vector<void *>mem_stack;
29
30	extern dim3 threadIdx;
31	extern dim3 blockIdx;
32
33	extern dim3 blockDim;
34	extern dim3 gridDim;
35
36	extern std::vector<void *> mem_stack;
37	extern std::vector<boost::context::detail::fcontext_t> contexts;
38	extern void * par_glob;
39	extern boost::context::detail::fcontext_t main_ctx;
40
41	static void __syncthreads()
42	{
43	boost::context::detail::jump_fcontext(main_ctx,par_glob);
44	};
45
/*! \brief Stand-in for cudaDeviceSynchronize
 *
 * The body is empty: the call does nothing and returns nothing.
 */
static void cudaDeviceSynchronize()
{
}
48
/*! \brief Copy sz bytes from a "symbol" buffer into dev_mem
 *
 * Implemented as a plain memcpy.
 *
 * \param dev_mem destination buffer
 * \param global_cuda_error_array source buffer
 * \param sz number of bytes to copy
 */
static void cudaMemcpyFromSymbol(void * dev_mem,const unsigned char * global_cuda_error_array,size_t sz)
{
	const void * source = global_cuda_error_array;

	memcpy(dev_mem,source,sz);
}
53
/**
 * CUDA memory copy types
 */
enum cudaMemcpyKind
{
	cudaMemcpyHostToHost = 0,		/**< Host -> Host */
	cudaMemcpyHostToDevice = 1,		/**< Host -> Device */
	cudaMemcpyDeviceToHost = 2,		/**< Device -> Host */
	cudaMemcpyDeviceToDevice = 3,	/**< Device -> Device */
	cudaMemcpyDefault = 4			/**< Direction of the transfer is inferred from the pointer values. Requires unified virtual addressing */
};
65
66	extern int vct_atomic_add;
67	extern int vct_atomic_rem;
68
/*! \brief Copy sz bytes from mem into a "symbol" buffer at a byte offset
 *
 * Implemented as a plain memcpy into global_cuda_error_array + offset.
 *
 * \param global_cuda_error_array destination buffer
 * \param mem source buffer
 * \param sz number of bytes to copy
 * \param offset byte offset inside the destination buffer
 * \param unused not read by this implementation
 */
static void cudaMemcpyToSymbol(unsigned char * global_cuda_error_array,const void * mem,size_t sz,int offset,int unused)
{
	unsigned char * destination = &global_cuda_error_array[offset];

	memcpy(destination,mem,sz);
}
73
74	namespace cub
75	{
76	template<typename T, unsigned int dim>
77	class BlockScan
78	{
79	public:
80	typedef std::array<T,dim> TempStorage;
81
82	private:
83	TempStorage & tmp;
84
85	public:
86
87
88
89	BlockScan(TempStorage & tmp)
90	:tmp(tmp)
91	{};
92
93	void ExclusiveSum(T & in, T & out)
94	{
95	tmp[threadIdx.x] = in;
96
97	__syncthreads();
98
99	if (threadIdx.x == `0`)
100	{
101	T prec = tmp[`0`];
102	tmp[`0`] = `0`;
103	for (int i = `1` ; i < dim ; i++)
104	{
105	auto next = tmp[i-`1`] + prec;
106	prec = tmp[i];
107	tmp[i] = next;
108	}
109	}
110
111	__syncthreads();
112
113	out = tmp[threadIdx.x];
114	return;
115	}
116	};
117	}
118
119
/*! \brief Sequential replacement for atomicAdd
 *
 * Adds val to the value pointed to by address with a plain (non-atomic)
 * read-modify-write.
 *
 * \param address location to update
 * \param val amount to add
 *
 * \return the value stored at address before the addition
 */
template<typename T, typename T2>
static T atomicAdd(T * address, T2 val)
{
	T previous = *address;

	*address += val;

	return previous;
}
127
// Function qualifier used on some of the mgpu functors below; defined empty here
#define MGPU_HOST_DEVICE

namespace mgpu
{
	/*! \brief Functor returning a < b
	 *
	 * The member typedefs replace the std::binary_function base class
	 * (deprecated in C++11, removed in C++17) and expose the same nested
	 * type names.
	 */
	template<typename type_t>
	struct less_t {
		typedef type_t first_argument_type;
		typedef type_t second_argument_type;
		typedef bool result_type;

		bool operator()(type_t a, type_t b) const {
			return a < b;
		}
		//! Overload for operands of two different types
		template<typename type2_t, typename type3_t>
		bool operator()(type2_t a, type3_t b) const {
			return a < b;
		}
	};

	/* Disabled in the original source, kept for reference:
	template<typename type_t>
	struct less_equal_t : public std::binary_function<type_t, type_t, bool> {
		MGPU_HOST_DEVICE bool operator()(type_t a, type_t b) const {
			return a <= b;
		}
	};*/

	/*! \brief Functor returning a > b */
	template<typename type_t>
	struct greater_t {
		typedef type_t first_argument_type;
		typedef type_t second_argument_type;
		typedef bool result_type;

		MGPU_HOST_DEVICE bool operator()(type_t a, type_t b) const {
			return a > b;
		}
		//! Overload for operands of two different types
		template<typename type2_t, typename type3_t>
		MGPU_HOST_DEVICE bool operator()(type2_t a, type3_t b) const {
			return a > b;
		}
	};

	/* Disabled in the original source, kept for reference:
	template<typename type_t>
	struct greater_equal_t : public std::binary_function<type_t, type_t, bool> {
		MGPU_HOST_DEVICE bool operator()(type_t a, type_t b) const {
			return a >= b;
		}
	};
	template<typename type_t>
	struct equal_to_t : public std::binary_function<type_t, type_t, bool> {
		MGPU_HOST_DEVICE bool operator()(type_t a, type_t b) const {
			return a == b;
		}
	};
	template<typename type_t>
	struct not_equal_to_t : public std::binary_function<type_t, type_t, bool> {
		MGPU_HOST_DEVICE bool operator()(type_t a, type_t b) const {
			return a != b;
		}
	};*/

	////////////////////////////////////////////////////////////////////////////////
	// Device-side arithmetic operators.

	/*! \brief Functor returning a + b */
	template<typename type_t>
	struct plus_t {
		typedef type_t first_argument_type;
		typedef type_t second_argument_type;
		typedef type_t result_type;

		type_t operator()(type_t a, type_t b) const {
			return a + b;
		}
	};

	/* Disabled in the original source, kept for reference:
	template<typename type_t>
	struct minus_t : public std::binary_function<type_t, type_t, type_t> {
		MGPU_HOST_DEVICE type_t operator()(type_t a, type_t b) const {
			return a - b;
		}
	};

	template<typename type_t>
	struct multiplies_t : public std::binary_function<type_t, type_t, type_t> {
		MGPU_HOST_DEVICE type_t operator()(type_t a, type_t b) const {
			return a * b;
		}
	};*/

	/*! \brief Functor returning std::max(a, b) */
	template<typename type_t>
	struct maximum_t {
		typedef type_t first_argument_type;
		typedef type_t second_argument_type;
		typedef type_t result_type;

		type_t operator()(type_t a, type_t b) const {
			return std::max(a, b);
		}
	};

	/*! \brief Functor returning std::min(a, b) */
	template<typename type_t>
	struct minimum_t {
		typedef type_t first_argument_type;
		typedef type_t second_argument_type;
		typedef type_t result_type;

		type_t operator()(type_t a, type_t b) const {
			return std::min(a, b);
		}
	};
}
215
216
namespace mgpu
{
	/*! \brief Sequential segmented reduction
	 *
	 * Segment s covers input[segments[s]] .. input[segments[s+1]-1]; the
	 * last segment ends at count. Each non-empty segment is folded left to
	 * right with op, starting from its first element. An empty segment
	 * yields init instead of reading an element that does not belong to it
	 * (for a trailing empty segment that read would be past the input).
	 * Nothing is read or written when num_segments is not positive.
	 *
	 * \param input elements to reduce
	 * \param count number of elements in input
	 * \param segments start offset of every segment
	 * \param num_segments number of segments
	 * \param output receives one value per segment
	 * \param op binary reduction operator
	 * \param init value stored for empty segments
	 * \param context not used by this implementation
	 */
	template<typename input_it,
			 typename segments_it, typename output_it, typename op_t, typename type_t, typename context_t>
	void segreduce(input_it input, int count, segments_it segments,
					int num_segments, output_it output, op_t op, type_t init,
					context_t& context)
	{
		(void)context;

		for (int seg = 0 ; seg < num_segments ; seg++)
		{
			int first = segments[seg];
			// The last segment has no successor: it is bounded by count
			int last = (seg + 1 < num_segments) ? (int)segments[seg+1] : count;

			if (first >= last)
			{
				output[seg] = init;
				continue;
			}

			output[seg] = input[first];
			for (int j = first + 1 ; j < last ; j++)
			{
				output[seg] = op(output[seg],input[j]);
			}
		}
	}

	/*! \brief Sequential key-value merge of two sorted sequences
	 *
	 * An element of b is emitted before the current element of a only when
	 * comp(b_key, a_key) is true, so on ties the element of a comes first.
	 *
	 * \param a_keys,a_vals,a_count first sequence (keys, values, length)
	 * \param b_keys,b_vals,b_count second sequence (keys, values, length)
	 * \param c_keys,c_vals output keys and values, a_count + b_count elements are written
	 * \param comp ordering predicate on keys
	 * \param context not used by this implementation
	 */
	template<typename a_keys_it, typename a_vals_it,
			 typename b_keys_it, typename b_vals_it,
			 typename c_keys_it, typename c_vals_it,
			 typename comp_t, typename context_t>
	void merge(a_keys_it a_keys, a_vals_it a_vals, int a_count,
			   b_keys_it b_keys, b_vals_it b_vals, int b_count,
			   c_keys_it c_keys, c_vals_it c_vals, comp_t comp, context_t& context)
	{
		(void)context;

		int a_it = 0;
		int b_it = 0;
		int c_it = 0;

		// Both sequences still have elements: pick according to comp
		while (a_it < a_count && b_it < b_count)
		{
			if (comp(b_keys[b_it],a_keys[a_it]))
			{
				c_keys[c_it] = b_keys[b_it];
				c_vals[c_it] = b_vals[b_it];
				b_it++;
			}
			else
			{
				c_keys[c_it] = a_keys[a_it];
				c_vals[c_it] = a_vals[a_it];
				a_it++;
			}
			c_it++;
		}

		// Drain what is left of a
		for ( ; a_it < a_count ; a_it++, c_it++)
		{
			c_keys[c_it] = a_keys[a_it];
			c_vals[c_it] = a_vals[a_it];
		}

		// Drain what is left of b
		for ( ; b_it < b_count ; b_it++, c_it++)
		{
			c_keys[c_it] = b_keys[b_it];
			c_vals[c_it] = b_vals[b_it];
		}
	}
}
299
/*! \brief Wrapper initialization hook
 *
 * The body is empty: nothing is initialized by this implementation.
 */
static void init_wrappers()
{
}
302
/*! \brief Holder that stores a callable by value and invokes it on demand
 *
 * \tparam lambda_f type of the stored callable
 */
template<typename lambda_f>
struct Fun_enc
{
	//! Copy of the wrapped callable
	lambda_f Fn;

	/*! \brief Store a copy of the callable
	 *
	 * \param callable object to wrap
	 */
	Fun_enc(lambda_f callable)
	:Fn(callable)
	{
	}

	//! Invoke the stored callable with no arguments
	void run()
	{
		Fn();
	}
};
317
318	template<typename Fun_enc_type>
319	void launch_kernel(boost::context::detail::transfer_t par)
320	{
321	main_ctx = par.fctx;
322	par_glob = par.data;
323	Fun_enc_type * ptr = (Fun_enc_type *)par.data;
324
325	ptr->run();
326
327	boost::context::detail::jump_fcontext(par.fctx,`0`);
328	}
329
330	template<typename lambda_f, typename ite_type>
331	static void exe_kernel(lambda_f f, ite_type & ite)
332	{
333	if (ite.nthrs() == `0` \|\| ite.nblocks() == `0`) {return;}
334
335	if (mem_stack.size() < ite.nthrs())
336	{
337	int old_size = mem_stack.size();
338	mem_stack.resize(ite.nthrs());
339
340	for (int i = old_size ; i < mem_stack.size() ; i++)
341	{
342	mem_stack[i] = new char [`8192`];
343	}
344	}
345
346	// Resize contexts
347	contexts.resize(mem_stack.size());
348
349	Fun_enc<lambda_f> fe(f);
350
351	for (int i = `0` ; i < ite.wthr.z ; i++)
352	{
353	blockIdx.z = i;
354	for (int j = `0` ; j < ite.wthr.y ; j++)
355	{
356	blockIdx.y = j;
357	for (int k = `0` ; k < ite.wthr.x ; k++)
358	{
359	blockIdx.x = k;
360	int nc = `0`;
361	for (int it = `0` ; it < ite.thr.z ; it++)
362	{
363	for (int jt = `0` ; jt < ite.thr.y ; jt++)
364	{
365	for (int kt = `0` ; kt < ite.thr.x ; kt++)
366	{
367	contexts[nc] = boost::context::detail::make_fcontext((char *)mem_stack[nc]+CUDIFY_BOOST_CONTEXT_STACK_SIZE-`16`,CUDIFY_BOOST_CONTEXT_STACK_SIZE,launch_kernel<Fun_enc<lambda_f>>);;
368	nc++;
369	}
370	}
371	}
372
373	bool work_to_do = true;
374	while(work_to_do)
375	{
376	nc = `0`;
377	// Work threads
378	for (int it = `0` ; it < ite.thr.z ; it++)
379	{
380	threadIdx.z = it;
381	for (int jt = `0` ; jt < ite.thr.y ; jt++)
382	{
383	threadIdx.y = jt;
384	for (int kt = `0` ; kt < ite.thr.x ; kt++)
385	{
386	threadIdx.x = kt;
387	auto t = boost::context::detail::jump_fcontext(contexts[nc],&fe);
388	contexts[nc] = t.fctx;
389	work_to_do &= (t.data != `0`);
390	nc++;
391	}
392	}
393	}
394	}
395	}
396	}
397	}
398	}
399
400	template<typename lambda_f, typename ite_type>
401	static void exe_kernel_no_sync(lambda_f f, ite_type & ite)
402	{
403	for (int i = `0` ; i < ite.wthr.z ; i++)
404	{
405	blockIdx.z = i;
406	for (int j = `0` ; j < ite.wthr.y ; j++)
407	{
408	blockIdx.y = j;
409	for (int k = `0` ; k < ite.wthr.x ; k++)
410	{
411	blockIdx.x = k;
412	int fb = `0`;
413	// Work threads
414	for (int it = `0` ; it < ite.wthr.z ; it++)
415	{
416	threadIdx.z = it;
417	for (int jt = `0` ; jt < ite.wthr.y ; jt++)
418	{
419	threadIdx.y = jt;
420	for (int kt = `0` ; kt < ite.wthr.x ; kt++)
421	{
422	threadIdx.x = kt;
423	f();
424	}
425	}
426	}
427	}
428	}
429	}
430	}
431
432	#ifdef PRINT_CUDA_LAUNCHES
433
/* Launch cuda_call(__VA_ARGS__) through exe_kernel using the configuration
 * in ite, printing the kernel name first. gridDim and blockDim are set from
 * ite.wthr and ite.thr before the launch. The kernel is wrapped in a lambda
 * taking no arguments, which is how exe_kernel invokes it.
 */
#define CUDA_LAUNCH(cuda_call,ite, ...) \
		{\
		gridDim.x = ite.wthr.x;\
		gridDim.y = ite.wthr.y;\
		gridDim.z = ite.wthr.z;\
		\
		blockDim.x = ite.thr.x;\
		blockDim.y = ite.thr.y;\
		blockDim.z = ite.thr.z;\
		\
		CHECK_SE_CLASS1_PRE\
		\
		std::cout << "Launching: " << #cuda_call << std::endl;\
		\
		exe_kernel([&]() -> void {\
		\
		cuda_call(__VA_ARGS__);\
		\
		},ite);\
		\
		CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
		}
458
459
/* Launch cuda_call(__VA_ARGS__) through exe_kernel with an explicit grid
 * (wthr_) and block (thr_) size, printing the kernel name first. Both sizes
 * are converted to dim3 once; the copies feed the ite_gpu<1> handed to
 * exe_kernel as well as gridDim and blockDim.
 */
#define CUDA_LAUNCH_DIM3(cuda_call,wthr_,thr_, ...)\
		{\
		dim3 wthr__(wthr_);\
		dim3 thr__(thr_);\
		\
		ite_gpu<1> itg;\
		itg.wthr = wthr__;\
		itg.thr = thr__;\
		\
		gridDim.x = wthr__.x;\
		gridDim.y = wthr__.y;\
		gridDim.z = wthr__.z;\
		\
		blockDim.x = thr__.x;\
		blockDim.y = thr__.y;\
		blockDim.z = thr__.z;\
		\
		CHECK_SE_CLASS1_PRE\
		\
		std::cout << "Launching: " << #cuda_call << std::endl;\
		\
		exe_kernel([&]() -> void {\
		\
		cuda_call(__VA_ARGS__);\
		\
		},itg);\
		\
		CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
		}
493
/* Expands to nothing in this implementation */
#define CUDA_CHECK()
495
496	#else
497
/* Launch cuda_call(__VA_ARGS__) through exe_kernel using the configuration
 * in ite. blockDim and gridDim are set from ite.thr and ite.wthr before the
 * launch, and the call is bracketed by CHECK_SE_CLASS1_PRE and
 * CHECK_SE_CLASS1_POST.
 */
#define CUDA_LAUNCH(cuda_call,ite, ...) \
		{\
		blockDim.x = ite.thr.x;\
		blockDim.y = ite.thr.y;\
		blockDim.z = ite.thr.z;\
		\
		gridDim.x = ite.wthr.x;\
		gridDim.y = ite.wthr.y;\
		gridDim.z = ite.wthr.z;\
		\
		CHECK_SE_CLASS1_PRE\
		\
		exe_kernel([&]() -> void { cuda_call(__VA_ARGS__); },ite);\
		\
		CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
		}
519
520
/* Launch cuda_call(__VA_ARGS__) through exe_kernel with an explicit grid
 * (wthr_) and block (thr_) size. Each size argument is expanded exactly once,
 * into a local dim3 copy; the copies feed the ite_gpu<1> handed to
 * exe_kernel as well as gridDim and blockDim, so an argument with side
 * effects is not evaluated twice.
 */
#define CUDA_LAUNCH_DIM3(cuda_call,wthr_,thr_, ...)\
		{\
		dim3 wthr__(wthr_);\
		dim3 thr__(thr_);\
		\
		ite_gpu<1> itg;\
		itg.wthr = wthr__;\
		itg.thr = thr__;\
		\
		gridDim.x = wthr__.x;\
		gridDim.y = wthr__.y;\
		gridDim.z = wthr__.z;\
		\
		blockDim.x = thr__.x;\
		blockDim.y = thr__.y;\
		blockDim.z = thr__.z;\
		\
		CHECK_SE_CLASS1_PRE\
		\
		exe_kernel([&]() -> void {\
		\
		cuda_call(__VA_ARGS__);\
		\
		},itg);\
		\
		CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
		}
548
/* Expands to nothing in this implementation */
#define CUDA_CHECK()
550
551	#endif
552
553	#endif
554
555	#endif
556
557	#endif
558

Browse the source code of openfpm/openfpm_devices/src/util/cudify/cudify_sequencial.hpp