/*
 * ofp_context.hxx
 *
 *  Created on: Nov 15, 2018
 *      Author: i-bird
 */

#ifndef OFP_CONTEXT_HXX_
#define OFP_CONTEXT_HXX_

#include <iostream>
#include <string>

#ifdef CUDA_ON_CPU

namespace mgpu
{
	enum gpu_context_opt
	{
		no_print_props,//!< no_print_props
		print_props,   //!< print_props
		dummy          //!< dummy
	};

	struct context_t {};

	class ofp_context_t : public context_t
	{
	protected:

		std::string _props;

		openfpm::vector<aggregate<unsigned char>> tmem;

		// Kept as a template only to mirror the CUDA implementation;
		// in the CUDA_ON_CPU build this initialization is a no-op.
		template<int dummy_arg = 0>
		void init(int dev_num, gpu_context_opt opt)
		{}

	public:

		/*! \brief gpu context constructor
		 *
		 * \param opt options for this gpu context
		 * \param dev_num device number to associate with this context
		 * \param stream_ stream identifier (unused in the CUDA_ON_CPU build)
		 *
		 */
		ofp_context_t(gpu_context_opt opt = gpu_context_opt::no_print_props, int dev_num = 0, int stream_ = 0)
		{}

		~ofp_context_t()
		{}

		virtual const std::string& props() const
		{
			return _props;
		}

		virtual int ptx_version() const
		{
			return 0;
		}

		virtual int stream()
		{
			std::cout << __FILE__ << ":" << __LINE__ << " Not implemented" << std::endl;
			return 0;
		}

		// Alloc GPU memory.
		virtual void* alloc(size_t size, int space)
		{
			std::cout << __FILE__ << ":" << __LINE__ << " Not implemented" << std::endl;
			return nullptr;
		}

		virtual void free(void* p, int space)
		{
			std::cout << __FILE__ << ":" << __LINE__ << " Not implemented" << std::endl;
		}

		virtual void synchronize()
		{
			std::cout << __FILE__ << ":" << __LINE__ << " Not implemented" << std::endl;
		}

		virtual int event()
		{
			std::cout << __FILE__ << ":" << __LINE__ << " Not implemented" << std::endl;
			return 0;
		}

		virtual void timer_begin()
		{
			std::cout << __FILE__ << ":" << __LINE__ << " Not implemented" << std::endl;
		}

		virtual double timer_end()
		{
			std::cout << __FILE__ << ":" << __LINE__ << " Not implemented" << std::endl;
			return 0.0;
		}

		virtual int getDevice()
		{
			std::cout << __FILE__ << ":" << __LINE__ << " Not implemented" << std::endl;
			return 0;
		}
	};

}

#else

	#ifdef CUDA_GPU

	#ifdef __NVCC__

	#include "util/cuda/moderngpu/context.hxx"

	namespace mgpu
	{
		enum gpu_context_opt
		{
			no_print_props,//!< no_print_props
			print_props,   //!< print_props
			dummy          //!< dummy
		};


		////////////////////////////////////////////////////////////////////////////////
		// ofp_context_t is a trivial implementation of context_t. Users can
		// derive this type to provide a custom allocator (see the sketch after
		// the class below).

		class ofp_context_t : public context_t
		{
		protected:
			cudaDeviceProp _props;
			int _ptx_version;
			cudaStream_t _stream;

			cudaEvent_t _timer[2];
			cudaEvent_t _event;

			openfpm::vector_gpu<aggregate<unsigned char>> tmem;
			openfpm::vector_gpu<aggregate<unsigned char>> tmem2;
			openfpm::vector_gpu<aggregate<unsigned char>> tmem3;

			// Making this a template argument means we won't generate an instance
			// of dummy_k for each translation unit.
			template<int dummy_arg = 0>
			void init(int dev_num, gpu_context_opt opt)
			{
				cudaFuncAttributes attr;
				cudaError_t result = cudaFuncGetAttributes(&attr, dummy_k<0>);
				if(cudaSuccess != result) throw cuda_exception_t(result);
				_ptx_version = attr.ptxVersion;

				int num_dev;
				cudaGetDeviceCount(&num_dev);

				if (num_dev == 0) {return;}

				if (opt != gpu_context_opt::dummy)
				{
					cudaSetDevice(dev_num % num_dev);
				}

				int ord;
				cudaGetDevice(&ord);
				cudaGetDeviceProperties(&_props, ord);

				cudaEventCreate(&_timer[0]);
				cudaEventCreate(&_timer[1]);
				cudaEventCreate(&_event);
			}

		public:


			/*! \brief gpu context constructor
			 *
			 * \param opt options for this gpu context
			 * \param dev_num device number to associate with this context
			 * \param stream_ CUDA stream to use for this context (default stream if 0)
			 *
			 */
			ofp_context_t(gpu_context_opt opt = gpu_context_opt::no_print_props, int dev_num = 0, cudaStream_t stream_ = 0)
			:context_t(), _stream(stream_)
			{
				init(dev_num,opt);
				if(opt == gpu_context_opt::print_props)
				{
					printf("%s\n", device_prop_string(_props).c_str());
				}
			}
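
			// A minimal usage sketch (comment only, illustrative; assumes the
			// memory_space_t constants provided by moderngpu's context.hxx):
			//
			//   mgpu::ofp_context_t ctx(mgpu::gpu_context_opt::print_props);
			//   void * d_buf = ctx.alloc(1024, mgpu::memory_space_device);
			//   ctx.synchronize();
			//   ctx.free(d_buf, mgpu::memory_space_device);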

			~ofp_context_t()
			{
				cudaEventDestroy(_timer[0]);
				cudaEventDestroy(_timer[1]);
				cudaEventDestroy(_event);
			}

			virtual const cudaDeviceProp& props() const { return _props; }
			virtual int ptx_version() const { return _ptx_version; }
			virtual cudaStream_t stream() { return _stream; }

			// Alloc GPU memory.
			virtual void* alloc(size_t size, memory_space_t space)
			{
				void* p = nullptr;
				if(size)
				{
					cudaError_t result = (memory_space_device == space) ? cudaMalloc(&p, size) : cudaMallocHost(&p, size);
					if(cudaSuccess != result) throw cuda_exception_t(result);
				}
				return p;
			}

			virtual void free(void* p, memory_space_t space)
			{
				if(p)
				{
					cudaError_t result = (memory_space_device == space) ? cudaFree(p) : cudaFreeHost(p);
					if(cudaSuccess != result) throw cuda_exception_t(result);
				}
			}

			virtual void synchronize()
			{
				cudaError_t result = _stream ?
					cudaStreamSynchronize(_stream) :
					cudaDeviceSynchronize();
				if(cudaSuccess != result) throw cuda_exception_t(result);
			}

			virtual cudaEvent_t event()
			{
				return _event;
			}

			virtual void timer_begin()
			{
				cudaEventRecord(_timer[0], _stream);
			}

			virtual double timer_end()
			{
				cudaEventRecord(_timer[1], _stream);
				cudaEventSynchronize(_timer[1]);
				float ms;
				cudaEventElapsedTime(&ms, _timer[0], _timer[1]);
				return ms / 1.0e3;
			}

			virtual int getDevice()
			{
				int dev = 0;

				cudaGetDevice(&dev);

				return dev;
			}

			virtual int getNDevice()
			{
				int num_dev;
				cudaGetDeviceCount(&num_dev);

				return num_dev;
			}

			openfpm::vector_gpu<aggregate<unsigned char>> & getTemporalCUB()
			{
				return tmem;
			}

			openfpm::vector_gpu<aggregate<unsigned char>> & getTemporalCUB2()
			{
				return tmem2;
			}

			openfpm::vector_gpu<aggregate<unsigned char>> & getTemporalCUB3()
			{
				return tmem3;
			}
		};
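
		// As noted above, a custom allocator can be supplied by deriving from
		// ofp_context_t and overriding alloc()/free(). A minimal illustrative
		// sketch (comment only; the name pool_context_t is hypothetical):
		//
		//   class pool_context_t : public mgpu::ofp_context_t
		//   {
		//   public:
		//       virtual void* alloc(size_t size, mgpu::memory_space_t space)
		//       {
		//           // e.g. serve the request from a pre-allocated device pool
		//           // before falling back to the default cudaMalloc path
		//           return mgpu::ofp_context_t::alloc(size, space);
		//       }
		//
		//       virtual void free(void* p, mgpu::memory_space_t space)
		//       {
		//           mgpu::ofp_context_t::free(p, space);
		//       }
		//   };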

	}

	#else

	#include "util/cuda/moderngpu/context_reduced.hxx"

	namespace mgpu
	{
		enum gpu_context_opt
		{
			no_print_props,//!< no_print_props
			print_props,   //!< print_props
			dummy          //!< dummy
		};


		////////////////////////////////////////////////////////////////////////////////
		// ofp_context_t is a trivial implementation of context_t. Users can
		// derive this type to provide a custom allocator.

		class ofp_context_t : public context_t
		{
		protected:
			cudaDeviceProp _props;
			int _ptx_version;
			cudaStream_t _stream;

			cudaEvent_t _timer[2];
			cudaEvent_t _event;

			openfpm::vector<aggregate<unsigned char>> tmem;

			// Making this a template argument means we won't generate an instance
			// of this function for each translation unit. Without NVCC the PTX
			// version cannot be queried, so it is simply set to zero here.
			template<int dummy_arg = 0>
			void init(int dev_num, gpu_context_opt opt)
			{
				_ptx_version = 0;

				int num_dev;
				cudaGetDeviceCount(&num_dev);

				if (num_dev == 0) {return;}

				if (opt != gpu_context_opt::dummy)
				{
					cudaSetDevice(dev_num % num_dev);
				}

				int ord;
				cudaGetDevice(&ord);
				cudaGetDeviceProperties(&_props, ord);

				cudaEventCreate(&_timer[0]);
				cudaEventCreate(&_timer[1]);
				cudaEventCreate(&_event);
			}

		public:

			/*! \brief gpu context constructor
			 *
			 * \param opt options for this gpu context
			 * \param dev_num device number to associate with this context
			 * \param stream_ CUDA stream to use for this context (default stream if 0)
			 *
			 */
			ofp_context_t(gpu_context_opt opt = gpu_context_opt::no_print_props, int dev_num = 0, cudaStream_t stream_ = 0)
			:context_t(), _stream(stream_)
			{
				init(dev_num,opt);
				if(opt == gpu_context_opt::print_props)
				{
					printf("%s\n", device_prop_string(_props).c_str());
				}
			}

			~ofp_context_t()
			{
				cudaEventDestroy(_timer[0]);
				cudaEventDestroy(_timer[1]);
				cudaEventDestroy(_event);
			}

			virtual const cudaDeviceProp& props() const
			{
				return _props;
			}

			virtual int ptx_version() const
			{
				std::cout << __FILE__ << ":" << __LINE__ << " error: to use this function the class ofp_context_t must be compiled with NVCC" << std::endl;
				return 0;
			}

			virtual cudaStream_t stream() { return _stream; }

			// Alloc GPU memory.
			virtual void* alloc(size_t size, memory_space_t space)
			{
				void* p = nullptr;
				if(size)
				{
					cudaError_t result = (memory_space_device == space) ? cudaMalloc(&p, size) : cudaMallocHost(&p, size);
					if(cudaSuccess != result) throw cuda_exception_t(result);
				}
				return p;
			}

			virtual void free(void* p, memory_space_t space)
			{
				if(p)
				{
					cudaError_t result = (memory_space_device == space) ? cudaFree(p) : cudaFreeHost(p);
					if(cudaSuccess != result) throw cuda_exception_t(result);
				}
			}

			virtual void synchronize()
			{
				cudaError_t result = _stream ?
					cudaStreamSynchronize(_stream) :
					cudaDeviceSynchronize();
				if(cudaSuccess != result) throw cuda_exception_t(result);
			}

			virtual cudaEvent_t event()
			{
				return _event;
			}

			virtual void timer_begin()
			{
				cudaEventRecord(_timer[0], _stream);
			}

			virtual double timer_end()
			{
				cudaEventRecord(_timer[1], _stream);
				cudaEventSynchronize(_timer[1]);
				float ms;
				cudaEventElapsedTime(&ms, _timer[0], _timer[1]);
				return ms / 1.0e3;
			}

			virtual int getDevice()
			{
				int dev = 0;

				cudaGetDevice(&dev);

				return dev;
			}
		};

	}

	#endif

	#else

	namespace mgpu
	{

		enum gpu_context_opt
		{
			no_print_props,//!< no_print_props
			print_props,   //!< print_props
			dummy          //!< dummy
		};

		// Stub class for modern gpu

		struct ofp_context_t
		{
			ofp_context_t(gpu_context_opt opt = gpu_context_opt::no_print_props, int dev_num = 0)
			{}
		};
	}

	#endif

#endif


#endif /* OFP_CONTEXT_HXX_ */