1 | #ifndef CUDIFY_SEQUENCIAL_HPP_ |
2 | #define CUDIFY_SEQUENCIAL_HPP_ |
3 | |
4 | #include "config.h" |
5 | |
6 | #ifdef CUDA_ON_CPU |
7 | |
8 | #include "cudify_hardware_common.hpp" |
9 | |
10 | #ifdef HAVE_BOOST_CONTEXT |
11 | |
12 | #define CUDIFY_ACTIVE |
13 | |
14 | #include "util/cuda_util.hpp" |
15 | #include <boost/bind/bind.hpp> |
16 | #include <type_traits> |
#include <boost/context/continuation.hpp>
#include <boost/context/detail/fcontext.hpp>
#include <vector>
#include <array>
#include <algorithm>
#include <string.h>
22 | |
23 | |
24 | #ifndef CUDIFY_BOOST_CONTEXT_STACK_SIZE |
25 | #define CUDIFY_BOOST_CONTEXT_STACK_SIZE 8192 |
26 | #endif |
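
// Each emulated CUDA thread gets its own fiber stack of
// CUDIFY_BOOST_CONTEXT_STACK_SIZE bytes. Kernels with deeper call chains can
// raise it at compile time, e.g. (illustrative flag):
//
//   g++ -DCUDIFY_BOOST_CONTEXT_STACK_SIZE=32768 ...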
27 | |
// Emulated CUDA built-in variables: the sequential backend runs one block of
// fibers at a time, so plain globals are sufficient
extern dim3 threadIdx;
extern dim3 blockIdx;

extern dim3 blockDim;
extern dim3 gridDim;

// Per-thread fiber stacks and contexts, plus the scheduler's own context
extern std::vector<void *> mem_stack;
extern std::vector<boost::context::detail::fcontext_t> contexts;
extern void * par_glob;
extern boost::context::detail::fcontext_t main_ctx;
40 | |
/**
 * Emulated __syncthreads: yield this thread-fiber back to the scheduler,
 * which resumes the remaining fibers of the block before re-entering this
 * one. On resume, refresh the (consumed) scheduler context and payload from
 * the returned transfer
 */
static void __syncthreads()
{
    boost::context::detail::transfer_t t = boost::context::detail::jump_fcontext(main_ctx,par_glob);
    main_ctx = t.fctx;
    par_glob = t.data;
}
45 | |
/**
 * No-op: kernels launched through CUDA_LAUNCH run to completion synchronously
 */
static void cudaDeviceSynchronize()
{}
48 | |
/**
 * Emulated cudaMemcpyFromSymbol: on the CPU a device "symbol" is just host
 * memory, so the copy reduces to a memcpy
 */
static void cudaMemcpyFromSymbol(void * dev_mem,const unsigned char * global_cuda_error_array,size_t sz)
{
    memcpy(dev_mem,global_cuda_error_array,sz);
}
53 | |
54 | /** |
55 | * CUDA memory copy types |
56 | */ |
57 | enum cudaMemcpyKind |
58 | { |
59 | cudaMemcpyHostToHost = 0, /**< Host -> Host */ |
60 | cudaMemcpyHostToDevice = 1, /**< Host -> Device */ |
61 | cudaMemcpyDeviceToHost = 2, /**< Device -> Host */ |
62 | cudaMemcpyDeviceToDevice = 3, /**< Device -> Device */ |
63 | cudaMemcpyDefault = 4 /**< Direction of the transfer is inferred from the pointer values. Requires unified virtual addressing */ |
64 | }; |
65 | |
66 | extern int vct_atomic_add; |
67 | extern int vct_atomic_rem; |
68 | |
/**
 * Emulated cudaMemcpyToSymbol: plain memcpy into the host-side "symbol" at
 * the given byte offset (the last argument mirrors the CUDA signature and is
 * unused)
 */
static void cudaMemcpyToSymbol(unsigned char * global_cuda_error_array,const void * mem,size_t sz,int offset,int unused)
{
    memcpy(global_cuda_error_array+offset,mem,sz);
}
73 | |
74 | namespace cub |
75 | { |
    /**
     * Minimal sequential stand-in for cub::BlockScan. TempStorage holds one
     * element per thread of the block; thread 0 performs the scan while the
     * other fibers wait at __syncthreads()
     */
    template<typename T, unsigned int dim>
    class BlockScan
    {
        public:
        typedef std::array<T,dim> TempStorage;

        private:
        TempStorage & tmp;

        public:

        BlockScan(TempStorage & tmp)
        :tmp(tmp)
        {}

        void ExclusiveSum(T & in, T & out)
        {
            tmp[threadIdx.x] = in;

            __syncthreads();

            // Thread 0 turns tmp into the exclusive prefix sum in place
            if (threadIdx.x == 0)
            {
                T prec = tmp[0];
                tmp[0] = 0;
                for (unsigned int i = 1 ; i < dim ; i++)
                {
                    auto next = tmp[i-1] + prec;
                    prec = tmp[i];
                    tmp[i] = next;
                }
            }

            __syncthreads();

            out = tmp[threadIdx.x];
        }
    };
117 | } |
118 | |
119 | |
/**
 * Emulated atomicAdd: the sequential backend runs one fiber at a time, so a
 * plain read-modify-write is already race-free
 */
template<typename T, typename T2>
static T atomicAdd(T * address, T2 val)
{
    T old = *address;
    *address += val;
    return old;
}
127 | |
128 | #define MGPU_HOST_DEVICE |
129 | |
namespace mgpu
{
    // Note: these functors deliberately avoid std::binary_function, which was
    // deprecated in C++11 and removed in C++17
    template<typename type_t>
    struct less_t {
        bool operator()(type_t a, type_t b) const {
            return a < b;
        }
        template<typename type2_t, typename type3_t>
        bool operator()(type2_t a, type3_t b) const {
            return a < b;
        }
    };
/*    template<typename type_t>
    struct less_equal_t {
        MGPU_HOST_DEVICE bool operator()(type_t a, type_t b) const {
            return a <= b;
        }
    };*/
    template<typename type_t>
    struct greater_t {
        MGPU_HOST_DEVICE bool operator()(type_t a, type_t b) const {
            return a > b;
        }
        template<typename type2_t, typename type3_t>
        MGPU_HOST_DEVICE bool operator()(type2_t a, type3_t b) const {
            return a > b;
        }
    };
/*    template<typename type_t>
    struct greater_equal_t {
        MGPU_HOST_DEVICE bool operator()(type_t a, type_t b) const {
            return a >= b;
        }
    };
    template<typename type_t>
    struct equal_to_t {
        MGPU_HOST_DEVICE bool operator()(type_t a, type_t b) const {
            return a == b;
        }
    };
    template<typename type_t>
    struct not_equal_to_t {
        MGPU_HOST_DEVICE bool operator()(type_t a, type_t b) const {
            return a != b;
        }
    };*/

////////////////////////////////////////////////////////////////////////////////
// Device-side arithmetic operators.

    template<typename type_t>
    struct plus_t {
        type_t operator()(type_t a, type_t b) const {
            return a + b;
        }
    };

/*    template<typename type_t>
    struct minus_t {
        MGPU_HOST_DEVICE type_t operator()(type_t a, type_t b) const {
            return a - b;
        }
    };

    template<typename type_t>
    struct multiplies_t {
        MGPU_HOST_DEVICE type_t operator()(type_t a, type_t b) const {
            return a * b;
        }
    };*/

    template<typename type_t>
    struct maximum_t {
        type_t operator()(type_t a, type_t b) const {
            return std::max(a, b);
        }
    };

    template<typename type_t>
    struct minimum_t {
        type_t operator()(type_t a, type_t b) const {
            return std::min(a, b);
        }
    };
}
215 | |
216 | |
217 | namespace mgpu |
218 | { |
    /**
     * Sequential segmented reduction: segments[i] is the index of the first
     * element of segment i, so segment i spans [segments[i], segments[i+1])
     * and the last segment runs to count. Segments are assumed non-empty;
     * init is accepted for API compatibility but unused
     */
    template<typename input_it,
        typename segments_it, typename output_it, typename op_t, typename type_t, typename context_t>
    void segreduce(input_it input, int count, segments_it segments,
                    int num_segments, output_it output, op_t op, type_t init,
                    context_t& context)
    {
        if (num_segments <= 0)  {return;}

        int i = 0;
        for ( ; i < num_segments - 1; i++)
        {
            int j = segments[i];
            output[i] = input[j];
            ++j;
            for ( ; j < segments[i+1] ; j++)
            {
                output[i] = op(output[i],input[j]);
            }
        }

        // Last segment runs to the end of the input
        int j = segments[i];
        output[i] = input[j];
        ++j;
        for ( ; j < count ; j++)
        {
            output[i] = op(output[i],input[j]);
        }
    }
246 | |
    /**
     * Sequential key-value merge of two sorted ranges into c_keys/c_vals.
     * The merge is stable: on equal keys the element from the a range is
     * taken first
     */
    template<typename a_keys_it, typename a_vals_it,
        typename b_keys_it, typename b_vals_it,
        typename c_keys_it, typename c_vals_it,
        typename comp_t, typename context_t>
    void merge(a_keys_it a_keys, a_vals_it a_vals, int a_count,
                b_keys_it b_keys, b_vals_it b_vals, int b_count,
                c_keys_it c_keys, c_vals_it c_vals, comp_t comp, context_t& context)
255 | { |
256 | int a_it = 0; |
257 | int b_it = 0; |
258 | int c_it = 0; |
259 | |
260 | while (a_it < a_count || b_it < b_count) |
261 | { |
262 | if (a_it < a_count) |
263 | { |
264 | if (b_it < b_count) |
265 | { |
266 | if (comp(b_keys[b_it],a_keys[a_it])) |
267 | { |
268 | c_keys[c_it] = b_keys[b_it]; |
269 | c_vals[c_it] = b_vals[b_it]; |
270 | c_it++; |
271 | b_it++; |
272 | } |
273 | else |
274 | { |
275 | c_keys[c_it] = a_keys[a_it]; |
276 | c_vals[c_it] = a_vals[a_it]; |
277 | c_it++; |
278 | a_it++; |
279 | } |
280 | } |
281 | else |
282 | { |
283 | c_keys[c_it] = a_keys[a_it]; |
284 | c_vals[c_it] = a_vals[a_it]; |
285 | c_it++; |
286 | a_it++; |
287 | } |
288 | } |
289 | else |
290 | { |
291 | c_keys[c_it] = b_keys[b_it]; |
292 | c_vals[c_it] = b_vals[b_it]; |
293 | c_it++; |
294 | b_it++; |
295 | } |
296 | } |
297 | } |
298 | } |
299 | |
// Nothing to initialize for the sequential backend
static void init_wrappers()
{}
302 | |
/**
 * Type-erasure helper: stores the kernel lambda so that launch_kernel can
 * invoke it through a plain function pointer, as make_fcontext requires
 */
template<typename lambda_f>
struct Fun_enc
{
    lambda_f Fn;

    Fun_enc(lambda_f Fn)
    :Fn(Fn)
    {}

    void run()
    {
        Fn();
    }
};
317 | |
/**
 * Fiber entry point: record the scheduler context, run the encoded kernel
 * body, then jump back signalling completion with a null payload
 */
template<typename Fun_enc_type>
void launch_kernel(boost::context::detail::transfer_t par)
{
    main_ctx = par.fctx;
    par_glob = par.data;
    Fun_enc_type * ptr = (Fun_enc_type *)par.data;

    ptr->run();

    boost::context::detail::jump_fcontext(par.fctx,0);
}
329 | |
/**
 * Run the kernel body f over the block/thread iteration space described by
 * ite, giving every thread of a block its own fiber so that __syncthreads()
 * can suspend and resume it
 */
template<typename lambda_f, typename ite_type>
static void exe_kernel(lambda_f f, ite_type & ite)
{
    if (ite.nthrs() == 0 || ite.nblocks() == 0) {return;}

    // Lazily grow the pool of fiber stacks to one per thread of a block
    if (mem_stack.size() < ite.nthrs())
    {
        size_t old_size = mem_stack.size();
        mem_stack.resize(ite.nthrs());

        for (size_t i = old_size ; i < mem_stack.size() ; i++)
        {
            mem_stack[i] = new char [CUDIFY_BOOST_CONTEXT_STACK_SIZE];
        }
    }
345 | |
346 | // Resize contexts |
347 | contexts.resize(mem_stack.size()); |
348 | |
349 | Fun_enc<lambda_f> fe(f); |
350 | |
351 | for (int i = 0 ; i < ite.wthr.z ; i++) |
352 | { |
353 | blockIdx.z = i; |
354 | for (int j = 0 ; j < ite.wthr.y ; j++) |
355 | { |
356 | blockIdx.y = j; |
357 | for (int k = 0 ; k < ite.wthr.x ; k++) |
358 | { |
359 | blockIdx.x = k; |
                int nc = 0;
                // Create one fresh fiber context per thread of the block
                for (int it = 0 ; it < ite.thr.z ; it++)
                {
                    for (int jt = 0 ; jt < ite.thr.y ; jt++)
                    {
                        for (int kt = 0 ; kt < ite.thr.x ; kt++)
                        {
                            contexts[nc] = boost::context::detail::make_fcontext((char *)mem_stack[nc]+CUDIFY_BOOST_CONTEXT_STACK_SIZE-16,CUDIFY_BOOST_CONTEXT_STACK_SIZE,launch_kernel<Fun_enc<lambda_f>>);
                            nc++;
                        }
                    }
                }

                bool work_to_do = true;
                while(work_to_do)
                {
                    nc = 0;
                    // Resume every fiber once per round; as in CUDA, all threads
                    // of a block are expected to reach the same __syncthreads()
                    for (int it = 0 ; it < ite.thr.z ; it++)
                    {
                        threadIdx.z = it;
                        for (int jt = 0 ; jt < ite.thr.y ; jt++)
                        {
                            threadIdx.y = jt;
                            for (int kt = 0 ; kt < ite.thr.x ; kt++)
                            {
                                threadIdx.x = kt;
                                auto t = boost::context::detail::jump_fcontext(contexts[nc],&fe);
                                contexts[nc] = t.fctx;
                                work_to_do &= (t.data != 0);
                                nc++;
                            }
                        }
                    }
                }
395 | } |
396 | } |
397 | } |
398 | } |
399 | |
/**
 * Same iteration as exe_kernel but without fibers: valid only for kernels
 * that never call __syncthreads()
 */
template<typename lambda_f, typename ite_type>
static void exe_kernel_no_sync(lambda_f f, ite_type & ite)
{
    for (int i = 0 ; i < ite.wthr.z ; i++)
    {
        blockIdx.z = i;
        for (int j = 0 ; j < ite.wthr.y ; j++)
        {
            blockIdx.y = j;
            for (int k = 0 ; k < ite.wthr.x ; k++)
            {
                blockIdx.x = k;
                // Work threads: note the inner loops iterate over the thread
                // dimensions ite.thr, not the block dimensions ite.wthr
                for (int it = 0 ; it < ite.thr.z ; it++)
                {
                    threadIdx.z = it;
                    for (int jt = 0 ; jt < ite.thr.y ; jt++)
                    {
                        threadIdx.y = jt;
                        for (int kt = 0 ; kt < ite.thr.x ; kt++)
                        {
                            threadIdx.x = kt;
                            f();
                        }
                    }
                }
            }
        }
    }
}
431 | |
432 | #ifdef PRINT_CUDA_LAUNCHES |
433 | |
#define CUDA_LAUNCH(cuda_call,ite, ...)\
{\
gridDim.x = ite.wthr.x;\
gridDim.y = ite.wthr.y;\
gridDim.z = ite.wthr.z;\
\
blockDim.x = ite.thr.x;\
blockDim.y = ite.thr.y;\
blockDim.z = ite.thr.z;\
\
CHECK_SE_CLASS1_PRE\
\
std::cout << "Launching: " << #cuda_call << std::endl;\
\
exe_kernel(\
[&]() -> void {\
\
cuda_call(__VA_ARGS__);\
\
},ite);\
\
CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
}
458 | |
459 | |
#define CUDA_LAUNCH_DIM3(cuda_call,wthr_,thr_, ...)\
{\
dim3 wthr__(wthr_);\
dim3 thr__(thr_);\
\
ite_gpu<1> itg;\
itg.wthr = wthr__;\
itg.thr = thr__;\
\
gridDim.x = wthr__.x;\
gridDim.y = wthr__.y;\
gridDim.z = wthr__.z;\
\
blockDim.x = thr__.x;\
blockDim.y = thr__.y;\
blockDim.z = thr__.z;\
\
CHECK_SE_CLASS1_PRE\
std::cout << "Launching: " << #cuda_call << std::endl;\
\
exe_kernel(\
[&] () -> void {\
\
cuda_call(__VA_ARGS__);\
\
},itg);\
\
CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
}
493 | |
494 | #define CUDA_CHECK() |
495 | |
496 | #else |
497 | |
498 | #define CUDA_LAUNCH(cuda_call,ite, ...) \ |
499 | {\ |
500 | gridDim.x = ite.wthr.x;\ |
501 | gridDim.y = ite.wthr.y;\ |
502 | gridDim.z = ite.wthr.z;\ |
503 | \ |
504 | blockDim.x = ite.thr.x;\ |
505 | blockDim.y = ite.thr.y;\ |
506 | blockDim.z = ite.thr.z;\ |
507 | \ |
508 | CHECK_SE_CLASS1_PRE\ |
509 | \ |
510 | exe_kernel([&]() -> void {\ |
511 | \ |
512 | \ |
513 | cuda_call(__VA_ARGS__);\ |
514 | \ |
515 | },ite);\ |
516 | \ |
517 | CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\ |
518 | } |
519 | |
520 | |
521 | #define CUDA_LAUNCH_DIM3(cuda_call,wthr_,thr_, ...)\ |
522 | {\ |
523 | dim3 wthr__(wthr_);\ |
524 | dim3 thr__(thr_);\ |
525 | \ |
526 | ite_gpu<1> itg;\ |
itg.wthr = wthr__;\
itg.thr = thr__;\
529 | \ |
530 | gridDim.x = wthr__.x;\ |
531 | gridDim.y = wthr__.y;\ |
532 | gridDim.z = wthr__.z;\ |
533 | \ |
534 | blockDim.x = thr__.x;\ |
535 | blockDim.y = thr__.y;\ |
536 | blockDim.z = thr__.z;\ |
537 | \ |
538 | CHECK_SE_CLASS1_PRE\ |
539 | \ |
540 | exe_kernel([&]() -> void {\ |
541 | \ |
542 | cuda_call(__VA_ARGS__);\ |
543 | \ |
544 | },itg);\ |
545 | \ |
546 | CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\ |
547 | } |
548 | |
549 | #define CUDA_CHECK() |
550 | |
#endif /* PRINT_CUDA_LAUNCHES */
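
// Usage sketch (hypothetical kernel, vector and sizes), assuming the usual
// OpenFPM helpers:
//
//   auto ite = v.getGPUIterator();              // fills ite.wthr / ite.thr
//   CUDA_LAUNCH(my_kernel,ite,v.toKernel());
//
//   // or with explicit grid/block dimensions:
//   CUDA_LAUNCH_DIM3(my_kernel,dim3(n/64+1),dim3(64),v.toKernel());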
552 | |
#endif /* HAVE_BOOST_CONTEXT */
554 | |
#endif /* CUDA_ON_CPU */
556 | |
#endif /* CUDIFY_SEQUENCIAL_HPP_ */
558 | |