#ifndef CUDIFY_SEQUENCIAL_HPP_
#define CUDIFY_SEQUENCIAL_HPP_

#include "config.h"

#ifdef CUDA_ON_CPU

#include "cudify_hardware_common.hpp"

#ifdef HAVE_BOOST_CONTEXT

#define CUDIFY_ACTIVE

#include "util/cuda_util.hpp"
#include <boost/bind/bind.hpp>
#include <type_traits>
#include <boost/context/continuation.hpp>
#include <boost/context/detail/fcontext.hpp>
#include <vector>
#include <array>
#include <algorithm>
#include <string.h>

#ifndef CUDIFY_BOOST_CONTEXT_STACK_SIZE
#define CUDIFY_BOOST_CONTEXT_STACK_SIZE 8192
#endif
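
// Every emulated CUDA thread runs on its own Boost.Context fiber;
// CUDIFY_BOOST_CONTEXT_STACK_SIZE is the stack size (in bytes) reserved for
// each fiber. Define it before including this header if kernels need deeper
// per-thread stacks.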

extern std::vector<void *> mem_stack;

extern dim3 threadIdx;
extern dim3 blockIdx;

extern dim3 blockDim;
extern dim3 gridDim;

extern std::vector<boost::context::detail::fcontext_t> contexts;
extern void * par_glob;
extern boost::context::detail::fcontext_t main_ctx;

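/**
 * Emulated __syncthreads(): yield this thread-fiber back to the scheduler
 * context stored in main_ctx. exe_kernel() resumes every fiber of the block
 * once per scheduling round, so no fiber crosses the barrier before all of
 * them have reached it.
 */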
static void __syncthreads()
{
    boost::context::detail::jump_fcontext(main_ctx,par_glob);
}

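/**
 * Kernel launches in this backend run to completion on the calling host
 * thread, so device synchronization is a no-op.
 */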
static void cudaDeviceSynchronize()
{}

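/**
 * Emulated cudaMemcpyFromSymbol: "device" symbols live in host memory here,
 * so the copy is a plain memcpy.
 */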
static void cudaMemcpyFromSymbol(void * dev_mem,const unsigned char * global_cuda_error_array,size_t sz)
{
    memcpy(dev_mem,global_cuda_error_array,sz);
}

/**
 * CUDA memory copy types
 */
enum cudaMemcpyKind
{
    cudaMemcpyHostToHost     = 0,   /**< Host -> Host */
    cudaMemcpyHostToDevice   = 1,   /**< Host -> Device */
    cudaMemcpyDeviceToHost   = 2,   /**< Device -> Host */
    cudaMemcpyDeviceToDevice = 3,   /**< Device -> Device */
    cudaMemcpyDefault        = 4    /**< Direction of the transfer is inferred from the pointer values. Requires unified virtual addressing */
};

extern int vct_atomic_add;
extern int vct_atomic_rem;

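/**
 * Emulated cudaMemcpyToSymbol: copy sz bytes into the symbol at the given
 * byte offset. The last parameter stands in for the cudaMemcpyKind argument
 * of the real API and is ignored.
 */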
static void cudaMemcpyToSymbol(unsigned char * global_cuda_error_array,const void * mem,size_t sz,int offset,int unused)
{
    memcpy(global_cuda_error_array+offset,mem,sz);
}

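// Minimal stand-in for the cub library: only the BlockScan facility used in
// this codebase is reproduced. The shared-memory TempStorage of real cub
// becomes a plain std::array shared by the fibers of the emulated block.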
namespace cub
{
    template<typename T, unsigned int dim>
    class BlockScan
    {
    public:
        typedef std::array<T,dim> TempStorage;

    private:
        TempStorage & tmp;

    public:

        BlockScan(TempStorage & tmp)
        :tmp(tmp)
        {}

        void ExclusiveSum(T & in, T & out)
        {
            tmp[threadIdx.x] = in;

            __syncthreads();

            // thread 0 turns the gathered values into an exclusive prefix sum
            if (threadIdx.x == 0)
            {
                T prec = tmp[0];
                tmp[0] = 0;
                for (unsigned int i = 1 ; i < dim ; i++)
                {
                    auto next = tmp[i-1] + prec;
                    prec = tmp[i];
                    tmp[i] = next;
                }
            }

            __syncthreads();

            out = tmp[threadIdx.x];
        }
    };
}
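
// Usage sketch (inside an emulated kernel body, following the usual cub
// pattern; the kernel context and the block size 128 are illustrative, not
// part of this header):
//
//    __shared__ typename cub::BlockScan<int,128>::TempStorage tmp;
//    cub::BlockScan<int,128> scan(tmp);
//    int offset;
//    scan.ExclusiveSum(count, offset);  // offset = sum of count over threads
//                                       // 0 .. threadIdx.x-1 of the block
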
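/**
 * Emulated atomicAdd: fibers are scheduled cooperatively and only one runs at
 * a time, so a plain read-modify-write is already atomic. Like the CUDA
 * intrinsic, it returns the value held before the addition.
 */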
template<typename T, typename T2>
static T atomicAdd(T * address, T2 val)
{
    T old = *address;
    *address += val;
    return old;
}

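// moderngpu compatibility: on the CPU backend the host/device qualifier
// expands to nothing.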
#define MGPU_HOST_DEVICE

namespace mgpu
{
    template<typename type_t>
    struct less_t : public std::binary_function<type_t, type_t, bool> {
        bool operator()(type_t a, type_t b) const {
            return a < b;
        }
        template<typename type2_t, typename type3_t>
        bool operator()(type2_t a, type3_t b) const {
            return a < b;
        }
    };
/*    template<typename type_t>
    struct less_equal_t : public std::binary_function<type_t, type_t, bool> {
        MGPU_HOST_DEVICE bool operator()(type_t a, type_t b) const {
            return a <= b;
        }
    };*/
    template<typename type_t>
    struct greater_t : public std::binary_function<type_t, type_t, bool> {
        MGPU_HOST_DEVICE bool operator()(type_t a, type_t b) const {
            return a > b;
        }
        template<typename type2_t, typename type3_t>
        MGPU_HOST_DEVICE bool operator()(type2_t a, type3_t b) const {
            return a > b;
        }
    };
/*    template<typename type_t>
    struct greater_equal_t : public std::binary_function<type_t, type_t, bool> {
        MGPU_HOST_DEVICE bool operator()(type_t a, type_t b) const {
            return a >= b;
        }
    };
    template<typename type_t>
    struct equal_to_t : public std::binary_function<type_t, type_t, bool> {
        MGPU_HOST_DEVICE bool operator()(type_t a, type_t b) const {
            return a == b;
        }
    };
    template<typename type_t>
    struct not_equal_to_t : public std::binary_function<type_t, type_t, bool> {
        MGPU_HOST_DEVICE bool operator()(type_t a, type_t b) const {
            return a != b;
        }
    };*/

    ////////////////////////////////////////////////////////////////////////////////
    // Device-side arithmetic operators.

    template<typename type_t>
    struct plus_t : public std::binary_function<type_t, type_t, type_t> {
        type_t operator()(type_t a, type_t b) const {
            return a + b;
        }
    };

/*    template<typename type_t>
    struct minus_t : public std::binary_function<type_t, type_t, type_t> {
        MGPU_HOST_DEVICE type_t operator()(type_t a, type_t b) const {
            return a - b;
        }
    };

    template<typename type_t>
    struct multiplies_t : public std::binary_function<type_t, type_t, type_t> {
        MGPU_HOST_DEVICE type_t operator()(type_t a, type_t b) const {
            return a * b;
        }
    };*/

    template<typename type_t>
    struct maximum_t : public std::binary_function<type_t, type_t, type_t> {
        type_t operator()(type_t a, type_t b) const {
            return std::max(a, b);
        }
    };

    template<typename type_t>
    struct minimum_t : public std::binary_function<type_t, type_t, type_t> {
        type_t operator()(type_t a, type_t b) const {
            return std::min(a, b);
        }
    };
}


namespace mgpu
{
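    /**
     * Sequential segmented reduction: for each segment i, fold
     * input[segments[i]] .. input[segments[i+1]-1] with op into output[i];
     * the last segment extends to count. Segments are assumed non-empty;
     * init is accepted for mgpu interface compatibility but unused.
     */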
    template<typename input_it,
        typename segments_it, typename output_it, typename op_t, typename type_t, typename context_t>
    void segreduce(input_it input, int count, segments_it segments,
        int num_segments, output_it output, op_t op, type_t init,
        context_t& context)
    {
        int i = 0;
        for ( ; i < num_segments - 1; i++)
        {
            int j = segments[i];
            output[i] = input[j];
            ++j;
            for ( ; j < segments[i+1] ; j++)
            {
                output[i] = op(output[i],input[j]);
            }
        }

        // Last segment
        int j = segments[i];
        output[i] = input[j];
        ++j;
        for ( ; j < count ; j++)
        {
            output[i] = op(output[i],input[j]);
        }
    }

    // Key-value merge.
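    // Sequential two-pointer merge of the sorted ranges (a_keys,a_vals) and
    // (b_keys,b_vals) into (c_keys,c_vals). comp must be a strict weak
    // ordering; on equal keys the element from the a range is taken first,
    // matching the stability of mgpu::merge.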
    template<typename a_keys_it, typename a_vals_it,
        typename b_keys_it, typename b_vals_it,
        typename c_keys_it, typename c_vals_it,
        typename comp_t, typename context_t>
    void merge(a_keys_it a_keys, a_vals_it a_vals, int a_count,
        b_keys_it b_keys, b_vals_it b_vals, int b_count,
        c_keys_it c_keys, c_vals_it c_vals, comp_t comp, context_t& context)
    {
        int a_it = 0;
        int b_it = 0;
        int c_it = 0;

        while (a_it < a_count || b_it < b_count)
        {
            if (a_it < a_count)
            {
                if (b_it < b_count)
                {
                    if (comp(b_keys[b_it],a_keys[a_it]))
                    {
                        c_keys[c_it] = b_keys[b_it];
                        c_vals[c_it] = b_vals[b_it];
                        c_it++;
                        b_it++;
                    }
                    else
                    {
                        c_keys[c_it] = a_keys[a_it];
                        c_vals[c_it] = a_vals[a_it];
                        c_it++;
                        a_it++;
                    }
                }
                else
                {
                    c_keys[c_it] = a_keys[a_it];
                    c_vals[c_it] = a_vals[a_it];
                    c_it++;
                    a_it++;
                }
            }
            else
            {
                c_keys[c_it] = b_keys[b_it];
                c_vals[c_it] = b_vals[b_it];
                c_it++;
                b_it++;
            }
        }
    }
}
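
// Usage sketch for segreduce (values are illustrative): with
// input = {1,2,3,4}, count = 4, segments = {0,1}, num_segments = 2 and
// op = mgpu::plus_t<int>(), the result is output = {1, 2+3+4} = {1, 9};
// the context argument only exists to keep the mgpu call signature.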

static void init_wrappers()
{}

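/**
 * Wraps the kernel lambda in a named type, so that a pointer to it can travel
 * through the fiber's void * data slot and be invoked by
 * launch_kernel<Fun_enc<...>>().
 */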
template<typename lambda_f>
struct Fun_enc
{
    lambda_f Fn;

    Fun_enc(lambda_f Fn)
    :Fn(Fn)
    {}

    void run()
    {
        Fn();
    }
};

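/**
 * Fiber entry point. It records the caller context (main_ctx) and payload
 * (par_glob) for later __syncthreads() calls, runs the wrapped kernel body,
 * then jumps back to the scheduler with data = 0 to signal completion
 * (a __syncthreads() suspension passes a non-null pointer instead).
 */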
template<typename Fun_enc_type>
void launch_kernel(boost::context::detail::transfer_t par)
{
    main_ctx = par.fctx;
    par_glob = par.data;
    Fun_enc_type * ptr = (Fun_enc_type *)par.data;

    ptr->run();

    boost::context::detail::jump_fcontext(par.fctx,0);
}

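/**
 * Sequential kernel executor. Blocks run one after another; within a block
 * each CUDA thread gets its own fcontext fiber with a private stack. The
 * while loop is a cooperative round-robin scheduler: every pass resumes each
 * fiber once, and a fiber suspends either at a __syncthreads() barrier
 * (transfer data != 0) or at the end of the kernel body (data == 0), so the
 * loop runs once per barrier plus a final pass to completion.
 */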
template<typename lambda_f, typename ite_type>
static void exe_kernel(lambda_f f, ite_type & ite)
{
    if (ite.nthrs() == 0 || ite.nblocks() == 0) {return;}

    if (mem_stack.size() < ite.nthrs())
    {
        size_t old_size = mem_stack.size();
        mem_stack.resize(ite.nthrs());

        for (size_t i = old_size ; i < mem_stack.size() ; i++)
        {
            mem_stack[i] = new char[CUDIFY_BOOST_CONTEXT_STACK_SIZE];
        }
    }

    // Resize contexts
    contexts.resize(mem_stack.size());

    Fun_enc<lambda_f> fe(f);

    for (int i = 0 ; i < ite.wthr.z ; i++)
    {
        blockIdx.z = i;
        for (int j = 0 ; j < ite.wthr.y ; j++)
        {
            blockIdx.y = j;
            for (int k = 0 ; k < ite.wthr.x ; k++)
            {
                blockIdx.x = k;
                int nc = 0;
                for (int it = 0 ; it < ite.thr.z ; it++)
                {
                    for (int jt = 0 ; jt < ite.thr.y ; jt++)
                    {
                        for (int kt = 0 ; kt < ite.thr.x ; kt++)
                        {
                            contexts[nc] = boost::context::detail::make_fcontext((char *)mem_stack[nc]+CUDIFY_BOOST_CONTEXT_STACK_SIZE-16,CUDIFY_BOOST_CONTEXT_STACK_SIZE,launch_kernel<Fun_enc<lambda_f>>);
                            nc++;
                        }
                    }
                }

                bool work_to_do = true;
                while(work_to_do)
                {
                    nc = 0;
                    // Work threads
                    for (int it = 0 ; it < ite.thr.z ; it++)
                    {
                        threadIdx.z = it;
                        for (int jt = 0 ; jt < ite.thr.y ; jt++)
                        {
                            threadIdx.y = jt;
                            for (int kt = 0 ; kt < ite.thr.x ; kt++)
                            {
                                threadIdx.x = kt;
                                auto t = boost::context::detail::jump_fcontext(contexts[nc],&fe);
                                contexts[nc] = t.fctx;
                                work_to_do &= (t.data != 0);
                                nc++;
                            }
                        }
                    }
                }
            }
        }
    }
}

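/**
 * Fast path for kernels that never call __syncthreads(): each emulated thread
 * can run the body to completion directly, with no fiber creation or context
 * switching.
 */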
template<typename lambda_f, typename ite_type>
static void exe_kernel_no_sync(lambda_f f, ite_type & ite)
{
    for (int i = 0 ; i < ite.wthr.z ; i++)
    {
        blockIdx.z = i;
        for (int j = 0 ; j < ite.wthr.y ; j++)
        {
            blockIdx.y = j;
            for (int k = 0 ; k < ite.wthr.x ; k++)
            {
                blockIdx.x = k;
                // Work threads
                for (int it = 0 ; it < ite.thr.z ; it++)
                {
                    threadIdx.z = it;
                    for (int jt = 0 ; jt < ite.thr.y ; jt++)
                    {
                        threadIdx.y = jt;
                        for (int kt = 0 ; kt < ite.thr.x ; kt++)
                        {
                            threadIdx.x = kt;
                            f();
                        }
                    }
                }
            }
        }
    }
}

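// CUDA_LAUNCH replaces the CUDA <<<...>>> launch syntax: it loads gridDim and
// blockDim from the grid iterator, runs the kernel functor through
// exe_kernel() and brackets the call with the SE_CLASS1 error-check macros.
// A typical call site (my_kernel and its argument are illustrative):
//
//    auto ite = v.getGPUIterator();
//    CUDA_LAUNCH(my_kernel,ite,v.toKernel());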
#ifdef PRINT_CUDA_LAUNCHES

#define CUDA_LAUNCH(cuda_call,ite, ...)\
    {\
    gridDim.x = ite.wthr.x;\
    gridDim.y = ite.wthr.y;\
    gridDim.z = ite.wthr.z;\
    \
    blockDim.x = ite.thr.x;\
    blockDim.y = ite.thr.y;\
    blockDim.z = ite.thr.z;\
    \
    CHECK_SE_CLASS1_PRE\
    \
    std::cout << "Launching: " << #cuda_call << std::endl;\
    \
    exe_kernel([&]() -> void {\
    \
    cuda_call(__VA_ARGS__);\
    \
    },ite);\
    \
    CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
    }


#define CUDA_LAUNCH_DIM3(cuda_call,wthr_,thr_, ...)\
    {\
    dim3 wthr__(wthr_);\
    dim3 thr__(thr_);\
    \
    ite_gpu<1> itg;\
    itg.wthr = wthr_;\
    itg.thr = thr_;\
    \
    gridDim.x = wthr__.x;\
    gridDim.y = wthr__.y;\
    gridDim.z = wthr__.z;\
    \
    blockDim.x = thr__.x;\
    blockDim.y = thr__.y;\
    blockDim.z = thr__.z;\
    \
    CHECK_SE_CLASS1_PRE\
    std::cout << "Launching: " << #cuda_call << std::endl;\
    \
    exe_kernel([&]() -> void {\
    \
    cuda_call(__VA_ARGS__);\
    \
    },itg);\
    CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
    }

#define CUDA_CHECK()

#else

#define CUDA_LAUNCH(cuda_call,ite, ...) \
    {\
    gridDim.x = ite.wthr.x;\
    gridDim.y = ite.wthr.y;\
    gridDim.z = ite.wthr.z;\
    \
    blockDim.x = ite.thr.x;\
    blockDim.y = ite.thr.y;\
    blockDim.z = ite.thr.z;\
    \
    CHECK_SE_CLASS1_PRE\
    \
    exe_kernel([&]() -> void {\
    \
    cuda_call(__VA_ARGS__);\
    \
    },ite);\
    \
    CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
    }

#define CUDA_LAUNCH_DIM3(cuda_call,wthr_,thr_, ...)\
    {\
    dim3 wthr__(wthr_);\
    dim3 thr__(thr_);\
    \
    ite_gpu<1> itg;\
    itg.wthr = wthr_;\
    itg.thr = thr_;\
    \
    gridDim.x = wthr__.x;\
    gridDim.y = wthr__.y;\
    gridDim.z = wthr__.z;\
    \
    blockDim.x = thr__.x;\
    blockDim.y = thr__.y;\
    blockDim.z = thr__.z;\
    \
    CHECK_SE_CLASS1_PRE\
    \
    exe_kernel([&]() -> void {\
    \
    cuda_call(__VA_ARGS__);\
    \
    },itg);\
    \
    CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
    }

#define CUDA_CHECK()

#endif

#endif

#endif

#endif