| 1 | #ifndef CUDIFY_SEQUENCIAL_HPP_ |
| 2 | #define CUDIFY_SEQUENCIAL_HPP_ |
| 3 | |
| 4 | #include "config.h" |
| 5 | |
| 6 | #ifdef CUDA_ON_CPU |
| 7 | |
| 8 | #include "cudify_hardware_common.hpp" |
| 9 | |
| 10 | #ifdef HAVE_BOOST_CONTEXT |
| 11 | |
| 12 | #define CUDIFY_ACTIVE |
| 13 | |
| 14 | #include "util/cuda_util.hpp" |
| 15 | #include <boost/bind/bind.hpp> |
| 16 | #include <type_traits> |
| 17 | #ifdef HAVE_BOOST_CONTEXT |
| 18 | #include <boost/context/continuation.hpp> |
| 19 | #endif |
| 20 | #include <vector> |
| 21 | #include <string.h> |
| 22 | |
| 23 | |
| 24 | #ifndef CUDIFY_BOOST_CONTEXT_STACK_SIZE |
| 25 | #define CUDIFY_BOOST_CONTEXT_STACK_SIZE 8192 |
| 26 | #endif |
| 27 | |
| 28 | extern std::vector<void *>mem_stack; |
| 29 | |
| 30 | extern dim3 threadIdx; |
| 31 | extern dim3 blockIdx; |
| 32 | |
| 33 | extern dim3 blockDim; |
| 34 | extern dim3 gridDim; |
| 35 | |
| 36 | extern std::vector<void *> mem_stack; |
| 37 | extern std::vector<boost::context::detail::fcontext_t> contexts; |
| 38 | extern void * par_glob; |
| 39 | extern boost::context::detail::fcontext_t main_ctx; |
| 40 | |
| 41 | static void __syncthreads() |
| 42 | { |
| 43 | boost::context::detail::jump_fcontext(main_ctx,par_glob); |
| 44 | }; |
| 45 | |
| 46 | static void cudaDeviceSynchronize() |
| 47 | {} |
| 48 | |
| 49 | static void cudaMemcpyFromSymbol(void * dev_mem,const unsigned char * global_cuda_error_array,size_t sz) |
| 50 | { |
| 51 | memcpy(dev_mem,global_cuda_error_array,sz); |
| 52 | } |
| 53 | |
| 54 | /** |
| 55 | * CUDA memory copy types |
| 56 | */ |
| 57 | enum cudaMemcpyKind |
| 58 | { |
| 59 | cudaMemcpyHostToHost = 0, /**< Host -> Host */ |
| 60 | cudaMemcpyHostToDevice = 1, /**< Host -> Device */ |
| 61 | cudaMemcpyDeviceToHost = 2, /**< Device -> Host */ |
| 62 | cudaMemcpyDeviceToDevice = 3, /**< Device -> Device */ |
| 63 | cudaMemcpyDefault = 4 /**< Direction of the transfer is inferred from the pointer values. Requires unified virtual addressing */ |
| 64 | }; |
| 65 | |
| 66 | extern int vct_atomic_add; |
| 67 | extern int vct_atomic_rem; |
| 68 | |
| 69 | static void cudaMemcpyToSymbol(unsigned char * global_cuda_error_array,const void * mem,size_t sz,int offset,int unused) |
| 70 | { |
| 71 | memcpy(global_cuda_error_array+offset,mem,sz); |
| 72 | } |
| 73 | |
| 74 | namespace cub |
| 75 | { |
| 76 | template<typename T, unsigned int dim> |
| 77 | class BlockScan |
| 78 | { |
| 79 | public: |
| 80 | typedef std::array<T,dim> TempStorage; |
| 81 | |
| 82 | private: |
| 83 | TempStorage & tmp; |
| 84 | |
| 85 | public: |
| 86 | |
| 87 | |
| 88 | |
| 89 | BlockScan(TempStorage & tmp) |
| 90 | :tmp(tmp) |
| 91 | {}; |
| 92 | |
| 93 | void ExclusiveSum(T & in, T & out) |
| 94 | { |
| 95 | tmp[threadIdx.x] = in; |
| 96 | |
| 97 | __syncthreads(); |
| 98 | |
| 99 | if (threadIdx.x == 0) |
| 100 | { |
| 101 | T prec = tmp[0]; |
| 102 | tmp[0] = 0; |
| 103 | for (int i = 1 ; i < dim ; i++) |
| 104 | { |
| 105 | auto next = tmp[i-1] + prec; |
| 106 | prec = tmp[i]; |
| 107 | tmp[i] = next; |
| 108 | } |
| 109 | } |
| 110 | |
| 111 | __syncthreads(); |
| 112 | |
| 113 | out = tmp[threadIdx.x]; |
| 114 | return; |
| 115 | } |
| 116 | }; |
| 117 | } |
| 118 | |
| 119 | |
| 120 | template<typename T, typename T2> |
| 121 | static T atomicAdd(T * address, T2 val) |
| 122 | { |
| 123 | T old = *address; |
| 124 | *address += val; |
| 125 | return old; |
| 126 | }; |
| 127 | |
| 128 | #define MGPU_HOST_DEVICE |
| 129 | |
| 130 | namespace mgpu |
| 131 | { |
| 132 | template<typename type_t> |
| 133 | struct less_t : public std::binary_function<type_t, type_t, bool> { |
| 134 | bool operator()(type_t a, type_t b) const { |
| 135 | return a < b; |
| 136 | } |
| 137 | template<typename type2_t, typename type3_t> |
| 138 | bool operator()(type2_t a, type3_t b) const { |
| 139 | return a < b; |
| 140 | } |
| 141 | }; |
| 142 | /* template<typename type_t> |
| 143 | struct less_equal_t : public std::binary_function<type_t, type_t, bool> { |
| 144 | MGPU_HOST_DEVICE bool operator()(type_t a, type_t b) const { |
| 145 | return a <= b; |
| 146 | } |
| 147 | };*/ |
| 148 | template<typename type_t> |
| 149 | struct greater_t : public std::binary_function<type_t, type_t, bool> { |
| 150 | MGPU_HOST_DEVICE bool operator()(type_t a, type_t b) const { |
| 151 | return a > b; |
| 152 | } |
| 153 | template<typename type2_t, typename type3_t> |
| 154 | MGPU_HOST_DEVICE bool operator()(type2_t a, type3_t b) const { |
| 155 | return a > b; |
| 156 | } |
| 157 | }; |
| 158 | /* template<typename type_t> |
| 159 | struct greater_equal_t : public std::binary_function<type_t, type_t, bool> { |
| 160 | MGPU_HOST_DEVICE bool operator()(type_t a, type_t b) const { |
| 161 | return a >= b; |
| 162 | } |
| 163 | }; |
| 164 | template<typename type_t> |
| 165 | struct equal_to_t : public std::binary_function<type_t, type_t, bool> { |
| 166 | MGPU_HOST_DEVICE bool operator()(type_t a, type_t b) const { |
| 167 | return a == b; |
| 168 | } |
| 169 | }; |
| 170 | template<typename type_t> |
| 171 | struct not_equal_to_t : public std::binary_function<type_t, type_t, bool> { |
| 172 | MGPU_HOST_DEVICE bool operator()(type_t a, type_t b) const { |
| 173 | return a != b; |
| 174 | } |
| 175 | };*/ |
| 176 | |
| 177 | //////////////////////////////////////////////////////////////////////////////// |
| 178 | // Device-side arithmetic operators. |
| 179 | |
| 180 | template<typename type_t> |
| 181 | struct plus_t : public std::binary_function<type_t, type_t, type_t> { |
| 182 | type_t operator()(type_t a, type_t b) const { |
| 183 | return a + b; |
| 184 | } |
| 185 | }; |
| 186 | |
| 187 | /* template<typename type_t> |
| 188 | struct minus_t : public std::binary_function<type_t, type_t, type_t> { |
| 189 | MGPU_HOST_DEVICE type_t operator()(type_t a, type_t b) const { |
| 190 | return a - b; |
| 191 | } |
| 192 | }; |
| 193 | |
| 194 | template<typename type_t> |
| 195 | struct multiplies_t : public std::binary_function<type_t, type_t, type_t> { |
| 196 | MGPU_HOST_DEVICE type_t operator()(type_t a, type_t b) const { |
| 197 | return a * b; |
| 198 | } |
| 199 | };*/ |
| 200 | |
| 201 | template<typename type_t> |
| 202 | struct maximum_t : public std::binary_function<type_t, type_t, type_t> { |
| 203 | type_t operator()(type_t a, type_t b) const { |
| 204 | return std::max(a, b); |
| 205 | } |
| 206 | }; |
| 207 | |
| 208 | template<typename type_t> |
| 209 | struct minimum_t : public std::binary_function<type_t, type_t, type_t> { |
| 210 | type_t operator()(type_t a, type_t b) const { |
| 211 | return std::min(a, b); |
| 212 | } |
| 213 | }; |
| 214 | } |
| 215 | |
| 216 | |
| 217 | namespace mgpu |
| 218 | { |
| 219 | template<typename input_it, |
| 220 | typename segments_it, typename output_it, typename op_t, typename type_t, typename context_t> |
| 221 | void segreduce(input_it input, int count, segments_it segments, |
| 222 | int num_segments, output_it output, op_t op, type_t init, |
| 223 | context_t& context) |
| 224 | { |
| 225 | int i = 0; |
| 226 | for ( ; i < num_segments - 1; i++) |
| 227 | { |
| 228 | int j = segments[i]; |
| 229 | output[i] = input[j]; |
| 230 | ++j; |
| 231 | for ( ; j < segments[i+1] ; j++) |
| 232 | { |
| 233 | output[i] = op(output[i],input[j]); |
| 234 | } |
| 235 | } |
| 236 | |
| 237 | // Last segment |
| 238 | int j = segments[i]; |
| 239 | output[i] = input[j]; |
| 240 | ++j; |
| 241 | for ( ; j < count ; j++) |
| 242 | { |
| 243 | output[i] = op(output[i],input[j]); |
| 244 | } |
| 245 | } |
| 246 | |
| 247 | // Key-value merge. |
| 248 | template<typename a_keys_it, typename a_vals_it, |
| 249 | typename b_keys_it, typename b_vals_it, |
| 250 | typename c_keys_it, typename c_vals_it, |
| 251 | typename comp_t, typename context_t> |
| 252 | void merge(a_keys_it a_keys, a_vals_it a_vals, int a_count, |
| 253 | b_keys_it b_keys, b_vals_it b_vals, int b_count, |
| 254 | c_keys_it c_keys, c_vals_it c_vals, comp_t comp, context_t& context) |
| 255 | { |
| 256 | int a_it = 0; |
| 257 | int b_it = 0; |
| 258 | int c_it = 0; |
| 259 | |
| 260 | while (a_it < a_count || b_it < b_count) |
| 261 | { |
| 262 | if (a_it < a_count) |
| 263 | { |
| 264 | if (b_it < b_count) |
| 265 | { |
| 266 | if (comp(b_keys[b_it],a_keys[a_it])) |
| 267 | { |
| 268 | c_keys[c_it] = b_keys[b_it]; |
| 269 | c_vals[c_it] = b_vals[b_it]; |
| 270 | c_it++; |
| 271 | b_it++; |
| 272 | } |
| 273 | else |
| 274 | { |
| 275 | c_keys[c_it] = a_keys[a_it]; |
| 276 | c_vals[c_it] = a_vals[a_it]; |
| 277 | c_it++; |
| 278 | a_it++; |
| 279 | } |
| 280 | } |
| 281 | else |
| 282 | { |
| 283 | c_keys[c_it] = a_keys[a_it]; |
| 284 | c_vals[c_it] = a_vals[a_it]; |
| 285 | c_it++; |
| 286 | a_it++; |
| 287 | } |
| 288 | } |
| 289 | else |
| 290 | { |
| 291 | c_keys[c_it] = b_keys[b_it]; |
| 292 | c_vals[c_it] = b_vals[b_it]; |
| 293 | c_it++; |
| 294 | b_it++; |
| 295 | } |
| 296 | } |
| 297 | } |
| 298 | } |
| 299 | |
| 300 | static void init_wrappers() |
| 301 | {} |
| 302 | |
| 303 | template<typename lambda_f> |
| 304 | struct Fun_enc |
| 305 | { |
| 306 | lambda_f Fn; |
| 307 | |
| 308 | Fun_enc(lambda_f Fn) |
| 309 | :Fn(Fn) |
| 310 | {} |
| 311 | |
| 312 | void run() |
| 313 | { |
| 314 | Fn(); |
| 315 | } |
| 316 | }; |
| 317 | |
| 318 | template<typename Fun_enc_type> |
| 319 | void launch_kernel(boost::context::detail::transfer_t par) |
| 320 | { |
| 321 | main_ctx = par.fctx; |
| 322 | par_glob = par.data; |
| 323 | Fun_enc_type * ptr = (Fun_enc_type *)par.data; |
| 324 | |
| 325 | ptr->run(); |
| 326 | |
| 327 | boost::context::detail::jump_fcontext(par.fctx,0); |
| 328 | } |
| 329 | |
| 330 | template<typename lambda_f, typename ite_type> |
| 331 | static void exe_kernel(lambda_f f, ite_type & ite) |
| 332 | { |
| 333 | if (ite.nthrs() == 0 || ite.nblocks() == 0) {return;} |
| 334 | |
| 335 | if (mem_stack.size() < ite.nthrs()) |
| 336 | { |
| 337 | int old_size = mem_stack.size(); |
| 338 | mem_stack.resize(ite.nthrs()); |
| 339 | |
| 340 | for (int i = old_size ; i < mem_stack.size() ; i++) |
| 341 | { |
| 342 | mem_stack[i] = new char [8192]; |
| 343 | } |
| 344 | } |
| 345 | |
| 346 | // Resize contexts |
| 347 | contexts.resize(mem_stack.size()); |
| 348 | |
| 349 | Fun_enc<lambda_f> fe(f); |
| 350 | |
| 351 | for (int i = 0 ; i < ite.wthr.z ; i++) |
| 352 | { |
| 353 | blockIdx.z = i; |
| 354 | for (int j = 0 ; j < ite.wthr.y ; j++) |
| 355 | { |
| 356 | blockIdx.y = j; |
| 357 | for (int k = 0 ; k < ite.wthr.x ; k++) |
| 358 | { |
| 359 | blockIdx.x = k; |
| 360 | int nc = 0; |
| 361 | for (int it = 0 ; it < ite.thr.z ; it++) |
| 362 | { |
| 363 | for (int jt = 0 ; jt < ite.thr.y ; jt++) |
| 364 | { |
| 365 | for (int kt = 0 ; kt < ite.thr.x ; kt++) |
| 366 | { |
| 367 | contexts[nc] = boost::context::detail::make_fcontext((char *)mem_stack[nc]+CUDIFY_BOOST_CONTEXT_STACK_SIZE-16,CUDIFY_BOOST_CONTEXT_STACK_SIZE,launch_kernel<Fun_enc<lambda_f>>);; |
| 368 | nc++; |
| 369 | } |
| 370 | } |
| 371 | } |
| 372 | |
| 373 | bool work_to_do = true; |
| 374 | while(work_to_do) |
| 375 | { |
| 376 | nc = 0; |
| 377 | // Work threads |
| 378 | for (int it = 0 ; it < ite.thr.z ; it++) |
| 379 | { |
| 380 | threadIdx.z = it; |
| 381 | for (int jt = 0 ; jt < ite.thr.y ; jt++) |
| 382 | { |
| 383 | threadIdx.y = jt; |
| 384 | for (int kt = 0 ; kt < ite.thr.x ; kt++) |
| 385 | { |
| 386 | threadIdx.x = kt; |
| 387 | auto t = boost::context::detail::jump_fcontext(contexts[nc],&fe); |
| 388 | contexts[nc] = t.fctx; |
| 389 | work_to_do &= (t.data != 0); |
| 390 | nc++; |
| 391 | } |
| 392 | } |
| 393 | } |
| 394 | } |
| 395 | } |
| 396 | } |
| 397 | } |
| 398 | } |
| 399 | |
| 400 | template<typename lambda_f, typename ite_type> |
| 401 | static void exe_kernel_no_sync(lambda_f f, ite_type & ite) |
| 402 | { |
| 403 | for (int i = 0 ; i < ite.wthr.z ; i++) |
| 404 | { |
| 405 | blockIdx.z = i; |
| 406 | for (int j = 0 ; j < ite.wthr.y ; j++) |
| 407 | { |
| 408 | blockIdx.y = j; |
| 409 | for (int k = 0 ; k < ite.wthr.x ; k++) |
| 410 | { |
| 411 | blockIdx.x = k; |
| 412 | int fb = 0; |
| 413 | // Work threads |
| 414 | for (int it = 0 ; it < ite.wthr.z ; it++) |
| 415 | { |
| 416 | threadIdx.z = it; |
| 417 | for (int jt = 0 ; jt < ite.wthr.y ; jt++) |
| 418 | { |
| 419 | threadIdx.y = jt; |
| 420 | for (int kt = 0 ; kt < ite.wthr.x ; kt++) |
| 421 | { |
| 422 | threadIdx.x = kt; |
| 423 | f(); |
| 424 | } |
| 425 | } |
| 426 | } |
| 427 | } |
| 428 | } |
| 429 | } |
| 430 | } |
| 431 | |
| 432 | #ifdef PRINT_CUDA_LAUNCHES |
| 433 | |
| 434 | #define CUDA_LAUNCH(cuda_call,ite, ...)\ |
| 435 | \ |
| 436 | gridDim.x = ite.wthr.x;\ |
| 437 | gridDim.y = ite.wthr.y;\ |
| 438 | gridDim.z = ite.wthr.z;\ |
| 439 | \ |
| 440 | blockDim.x = ite.thr.x;\ |
| 441 | blockDim.y = ite.thr.y;\ |
| 442 | blockDim.z = ite.thr.z;\ |
| 443 | \ |
| 444 | CHECK_SE_CLASS1_PRE\ |
| 445 | \ |
| 446 | std::cout << "Launching: " << #cuda_call << std::endl;\ |
| 447 | \ |
| 448 | exe_kernel(\ |
| 449 | [&](boost::context::fiber && main) -> void {\ |
| 450 | \ |
| 451 | \ |
| 452 | main_fib = main; |
| 453 | \ |
| 454 | cuda_call(__VA_ARGS__);\ |
| 455 | },ite);\ |
| 456 | CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\ |
| 457 | } |
| 458 | |
| 459 | |
| 460 | #define CUDA_LAUNCH_DIM3(cuda_call,wthr_,thr_, ...)\ |
| 461 | {\ |
| 462 | dim3 wthr__(wthr_);\ |
| 463 | dim3 thr__(thr_);\ |
| 464 | \ |
| 465 | ite_gpu<1> itg;\ |
| 466 | itg.wthr = wthr;\ |
| 467 | itg.thr = thr;\ |
| 468 | \ |
| 469 | gridDim.x = wthr__.x;\ |
| 470 | gridDim.y = wthr__.y;\ |
| 471 | gridDim.z = wthr__.z;\ |
| 472 | \ |
| 473 | blockDim.x = thr__.x;\ |
| 474 | blockDim.y = thr__.y;\ |
| 475 | blockDim.z = thr__.z;\ |
| 476 | \ |
| 477 | CHECK_SE_CLASS1_PRE\ |
| 478 | std::cout << "Launching: " << #cuda_call << std::endl;\ |
| 479 | \ |
| 480 | exe_kernel(\ |
| 481 | [&] (boost::context::fiber && main) -> void {\ |
| 482 | \ |
| 483 | \ |
| 484 | main_fib = std::move(main);\ |
| 485 | \ |
| 486 | cuda_call(__VA_ARGS__);\ |
| 487 | \ |
| 488 | return std::move(main_fib);\ |
| 489 | \ |
| 490 | });\ |
| 491 | CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\ |
| 492 | } |
| 493 | |
| 494 | #define CUDA_CHECK() |
| 495 | |
| 496 | #else |
| 497 | |
| 498 | #define CUDA_LAUNCH(cuda_call,ite, ...) \ |
| 499 | {\ |
| 500 | gridDim.x = ite.wthr.x;\ |
| 501 | gridDim.y = ite.wthr.y;\ |
| 502 | gridDim.z = ite.wthr.z;\ |
| 503 | \ |
| 504 | blockDim.x = ite.thr.x;\ |
| 505 | blockDim.y = ite.thr.y;\ |
| 506 | blockDim.z = ite.thr.z;\ |
| 507 | \ |
| 508 | CHECK_SE_CLASS1_PRE\ |
| 509 | \ |
| 510 | exe_kernel([&]() -> void {\ |
| 511 | \ |
| 512 | \ |
| 513 | cuda_call(__VA_ARGS__);\ |
| 514 | \ |
| 515 | },ite);\ |
| 516 | \ |
| 517 | CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\ |
| 518 | } |
| 519 | |
| 520 | |
| 521 | #define CUDA_LAUNCH_DIM3(cuda_call,wthr_,thr_, ...)\ |
| 522 | {\ |
| 523 | dim3 wthr__(wthr_);\ |
| 524 | dim3 thr__(thr_);\ |
| 525 | \ |
| 526 | ite_gpu<1> itg;\ |
| 527 | itg.wthr = wthr_;\ |
| 528 | itg.thr = thr_;\ |
| 529 | \ |
| 530 | gridDim.x = wthr__.x;\ |
| 531 | gridDim.y = wthr__.y;\ |
| 532 | gridDim.z = wthr__.z;\ |
| 533 | \ |
| 534 | blockDim.x = thr__.x;\ |
| 535 | blockDim.y = thr__.y;\ |
| 536 | blockDim.z = thr__.z;\ |
| 537 | \ |
| 538 | CHECK_SE_CLASS1_PRE\ |
| 539 | \ |
| 540 | exe_kernel([&]() -> void {\ |
| 541 | \ |
| 542 | cuda_call(__VA_ARGS__);\ |
| 543 | \ |
| 544 | },itg);\ |
| 545 | \ |
| 546 | CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\ |
| 547 | } |
| 548 | |
| 549 | #define CUDA_CHECK() |
| 550 | |
| 551 | #endif |
| 552 | |
| 553 | #endif |
| 554 | |
| 555 | #endif |
| 556 | |
| 557 | #endif |
| 558 | |