1 | /* |
2 | * ofp_context.hxx |
3 | * |
4 | * Created on: Nov 15, 2018 |
5 | * Author: i-bird |
6 | */ |
7 | |
8 | #ifndef OFP_CONTEXT_HXX_ |
9 | #define OFP_CONTEXT_HXX_ |
10 | |
11 | #include <iostream> |
12 | |
13 | #ifdef CUDA_ON_CPU |
14 | |
15 | namespace mgpu |
16 | { |
17 | enum gpu_context_opt |
18 | { |
19 | no_print_props,//!< no_print_props |
20 | print_props, //!< print_props |
21 | dummy //!< dummy |
22 | }; |
23 | |
24 | struct context_t {}; |
25 | |
26 | class ofp_context_t : public context_t |
27 | { |
28 | protected: |
29 | |
30 | std::string _props; |
31 | |
32 | openfpm::vector<aggregate<unsigned char>> tmem; |
33 | |
		// No-op initialization: in the CUDA_ON_CPU build there is no device
		// to query or configure.
36 | template<int dummy_arg = 0> |
37 | void init(int dev_num, gpu_context_opt opt) |
38 | {} |
39 | |
40 | public: |
41 | |
		/*! \brief gpu context constructor
		 *
		 * \param opt options for this gpu context
		 * \param dev_num device number to select (unused in the CUDA_ON_CPU build)
		 * \param stream_ stream associated to this context (unused in the CUDA_ON_CPU build)
		 *
		 */
		ofp_context_t(gpu_context_opt opt = gpu_context_opt::no_print_props, int dev_num = 0, int stream_ = 0)
48 | {} |
49 | |
50 | ~ofp_context_t() |
51 | {} |
52 | |
53 | virtual const std::string& props() const |
54 | { |
55 | return _props; |
56 | } |
57 | |
58 | virtual int ptx_version() const |
59 | { |
60 | return 0; |
61 | } |
62 | |
63 | virtual int stream() |
64 | { |
65 | std::cout << __FILE__ << ":" << __LINE__ << " Not implemented" << std::endl; |
66 | return 0; |
67 | } |
68 | |
69 | // Alloc GPU memory. |
70 | virtual void* alloc(size_t size, int space) |
71 | { |
72 | std::cout << __FILE__ << ":" << __LINE__ << " Not implemented" << std::endl; |
73 | return NULL; |
74 | } |
75 | |
76 | virtual void free(void* p, int space) |
77 | { |
78 | std::cout << __FILE__ << ":" << __LINE__ << " Not implemented" << std::endl; |
79 | } |
80 | |
81 | virtual void synchronize() |
82 | { |
83 | std::cout << __FILE__ << ":" << __LINE__ << " Not implemented" << std::endl; |
84 | } |
85 | |
86 | virtual int event() |
87 | { |
88 | std::cout << __FILE__ << ":" << __LINE__ << " Not implemented" << std::endl; |
89 | return 0; |
90 | } |
91 | |
92 | virtual void timer_begin() |
93 | { |
94 | std::cout << __FILE__ << ":" << __LINE__ << " Not implemented" << std::endl; |
95 | } |
96 | |
97 | virtual double timer_end() |
98 | { |
99 | std::cout << __FILE__ << ":" << __LINE__ << " Not implemented" << std::endl; |
100 | return 0.0; |
101 | } |
102 | |
103 | virtual int getDevice() |
104 | { |
105 | std::cout << __FILE__ << ":" << __LINE__ << " Not implemented" << std::endl; |
106 | return 0; |
107 | } |
108 | }; |
109 | |
110 | } |
111 | |
112 | #else |
113 | |
114 | #ifdef CUDA_GPU |
115 | |
116 | #ifdef __NVCC__ |
117 | |
118 | #include "util/cuda/moderngpu/context.hxx" |
119 | |
120 | namespace mgpu |
121 | { |
122 | enum gpu_context_opt |
123 | { |
124 | no_print_props,//!< no_print_props |
125 | print_props, //!< print_props |
126 | dummy //!< dummy |
127 | }; |
128 | |
129 | |
130 | //////////////////////////////////////////////////////////////////////////////// |
131 | // standard_context_t is a trivial implementation of context_t. Users can |
132 | // derive this type to provide a custom allocator. |
133 | |
134 | class ofp_context_t : public context_t |
135 | { |
136 | protected: |
137 | cudaDeviceProp _props; |
138 | int _ptx_version; |
139 | cudaStream_t _stream; |
140 | |
141 | cudaEvent_t _timer[2]; |
142 | cudaEvent_t _event; |
143 | |
144 | openfpm::vector_gpu<aggregate<unsigned char>> tmem; |
145 | openfpm::vector_gpu<aggregate<unsigned char>> tmem2; |
146 | openfpm::vector_gpu<aggregate<unsigned char>> tmem3; |
147 | |
148 | // Making this a template argument means we won't generate an instance |
149 | // of dummy_k for each translation unit. |
150 | template<int dummy_arg = 0> |
151 | void init(int dev_num, gpu_context_opt opt) |
152 | { |
153 | cudaFuncAttributes attr; |
154 | cudaError_t result = cudaFuncGetAttributes(&attr, dummy_k<0>); |
155 | if(cudaSuccess != result) throw cuda_exception_t(result); |
156 | _ptx_version = attr.ptxVersion; |
157 | |
158 | int num_dev; |
159 | cudaGetDeviceCount(&num_dev); |
160 | |
161 | if (num_dev == 0) {return;} |
162 | |
163 | if (opt != gpu_context_opt::dummy) |
164 | { |
165 | cudaSetDevice(dev_num % num_dev); |
166 | } |
167 | |
168 | int ord; |
169 | cudaGetDevice(&ord); |
170 | cudaGetDeviceProperties(&_props, ord); |
171 | |
172 | cudaEventCreate(&_timer[0]); |
173 | cudaEventCreate(&_timer[1]); |
174 | cudaEventCreate(&_event); |
175 | } |
176 | |
177 | public: |
178 | |
179 | |
		/*! \brief gpu context constructor
		 *
		 * \param opt options for this gpu context
		 * \param dev_num device number to select (taken modulo the number of available devices)
		 * \param stream_ cuda stream associated to this context (0 = default stream)
		 *
		 */
		ofp_context_t(gpu_context_opt opt = gpu_context_opt::no_print_props, int dev_num = 0, cudaStream_t stream_ = 0)
186 | :context_t(), _stream(stream_) |
187 | { |
188 | init(dev_num,opt); |
189 | if(opt == gpu_context_opt::print_props) |
190 | { |
191 | printf("%s\n" , device_prop_string(_props).c_str()); |
192 | } |
193 | } |
194 | |
195 | ~ofp_context_t() |
196 | { |
197 | cudaEventDestroy(_timer[0]); |
198 | cudaEventDestroy(_timer[1]); |
199 | cudaEventDestroy(_event); |
200 | } |
201 | |
202 | virtual const cudaDeviceProp& props() const { return _props; } |
203 | virtual int ptx_version() const { return _ptx_version; } |
204 | virtual cudaStream_t stream() { return _stream; } |
205 | |
206 | // Alloc GPU memory. |
207 | virtual void* alloc(size_t size, memory_space_t space) |
208 | { |
209 | void* p = nullptr; |
210 | if(size) |
211 | { |
				cudaError_t result = (memory_space_device == space) ? cudaMalloc(&p, size) : cudaMallocHost(&p, size);
213 | if(cudaSuccess != result) throw cuda_exception_t(result); |
214 | } |
215 | return p; |
216 | } |
217 | |
218 | virtual void free(void* p, memory_space_t space) |
219 | { |
220 | if(p) |
221 | { |
222 | cudaError_t result = (memory_space_device == space) ? cudaFree(p) : cudaFreeHost(p); |
223 | if(cudaSuccess != result) throw cuda_exception_t(result); |
224 | } |
225 | } |
226 | |
227 | virtual void synchronize() |
228 | { |
229 | cudaError_t result = _stream ? |
230 | cudaStreamSynchronize(_stream) : |
231 | cudaDeviceSynchronize(); |
232 | if(cudaSuccess != result) throw cuda_exception_t(result); |
233 | } |
234 | |
235 | virtual cudaEvent_t event() |
236 | { |
237 | return _event; |
238 | } |
239 | |
		//! Start the timer: record an event on the context's stream
		virtual void timer_begin()
		{
			cudaEventRecord(_timer[0], _stream);
		}

		//! Stop the timer and return the time elapsed since timer_begin(), in seconds
		virtual double timer_end()
		{
			cudaEventRecord(_timer[1], _stream);
			cudaEventSynchronize(_timer[1]);
			float ms;
			cudaEventElapsedTime(&ms, _timer[0], _timer[1]);
			return ms / 1.0e3;
		}
253 | |
254 | virtual int getDevice() |
255 | { |
256 | int dev = 0; |
257 | |
258 | cudaGetDevice(&dev); |
259 | |
260 | return dev; |
261 | } |
262 | |
263 | virtual int getNDevice() |
264 | { |
265 | int num_dev; |
266 | cudaGetDeviceCount(&num_dev); |
267 | |
268 | return num_dev; |
269 | } |
270 | |
		//! Return the first temporal (scratch) buffer, intended as temporary storage for CUB calls
		openfpm::vector_gpu<aggregate<unsigned char>> & getTemporalCUB()
		{
			return tmem;
		}

		//! Return the second temporal (scratch) buffer
		openfpm::vector_gpu<aggregate<unsigned char>> & getTemporalCUB2()
		{
			return tmem2;
		}

		//! Return the third temporal (scratch) buffer
		openfpm::vector_gpu<aggregate<unsigned char>> & getTemporalCUB3()
		{
			return tmem3;
		}
285 | }; |
286 | |
287 | } |
288 | |
289 | #else |
290 | |
291 | #include "util/cuda/moderngpu/context_reduced.hxx" |
292 | |
293 | namespace mgpu |
294 | { |
295 | enum gpu_context_opt |
296 | { |
297 | no_print_props,//!< no_print_props |
298 | print_props, //!< print_props |
299 | dummy //!< dummy |
300 | }; |
301 | |
302 | |
303 | //////////////////////////////////////////////////////////////////////////////// |
304 | // standard_context_t is a trivial implementation of context_t. Users can |
305 | // derive this type to provide a custom allocator. |
306 | |
307 | class ofp_context_t : public context_t |
308 | { |
309 | protected: |
310 | cudaDeviceProp _props; |
311 | int _ptx_version; |
312 | cudaStream_t _stream; |
313 | |
314 | cudaEvent_t _timer[2]; |
315 | cudaEvent_t _event; |
316 | |
317 | openfpm::vector<aggregate<unsigned char>> tmem; |
318 | |
		// Without NVCC we cannot query the PTX version of the device code,
		// so the PTX version is simply set to 0 and only the device is selected.
		template<int dummy_arg = 0>
		void init(int dev_num, gpu_context_opt opt)
		{
			_ptx_version = 0;
327 | |
328 | int num_dev; |
329 | cudaGetDeviceCount(&num_dev); |
330 | |
331 | if (num_dev == 0) {return;} |
332 | |
333 | if (opt != gpu_context_opt::dummy) |
334 | { |
335 | cudaSetDevice(dev_num % num_dev); |
336 | } |
337 | |
338 | int ord; |
339 | cudaGetDevice(&ord); |
340 | cudaGetDeviceProperties(&_props, ord); |
341 | |
342 | cudaEventCreate(&_timer[0]); |
343 | cudaEventCreate(&_timer[1]); |
344 | cudaEventCreate(&_event); |
345 | } |
346 | |
347 | public: |
348 | |
		/*! \brief gpu context constructor
		 *
		 * \param opt options for this gpu context
		 * \param dev_num device number to select (taken modulo the number of available devices)
		 * \param stream_ cuda stream associated to this context (0 = default stream)
		 *
		 */
		ofp_context_t(gpu_context_opt opt = gpu_context_opt::no_print_props, int dev_num = 0, cudaStream_t stream_ = 0)
355 | :context_t(), _stream(stream_) |
356 | { |
357 | init(dev_num,opt); |
358 | if(opt == gpu_context_opt::print_props) |
359 | { |
360 | printf("%s\n" , device_prop_string(_props).c_str()); |
361 | } |
362 | } |
363 | |
364 | ~ofp_context_t() |
365 | { |
366 | cudaEventDestroy(_timer[0]); |
367 | cudaEventDestroy(_timer[1]); |
368 | cudaEventDestroy(_event); |
369 | } |
370 | |
371 | virtual const cudaDeviceProp& props() const |
372 | { |
373 | return _props; |
374 | } |
375 | |
376 | virtual int ptx_version() const |
377 | { |
			std::cout << __FILE__ << ":" << __LINE__ << " error: to use ptx_version() the class ofp_context_t must be compiled with NVCC" << std::endl;
379 | return 0; |
380 | } |
381 | |
382 | virtual cudaStream_t stream() { return _stream; } |
383 | |
384 | // Alloc GPU memory. |
385 | virtual void* alloc(size_t size, memory_space_t space) |
386 | { |
387 | void* p = nullptr; |
388 | if(size) |
389 | { |
				cudaError_t result = (memory_space_device == space) ? cudaMalloc(&p, size) : cudaMallocHost(&p, size);
391 | if(cudaSuccess != result) throw cuda_exception_t(result); |
392 | } |
393 | return p; |
394 | } |
395 | |
396 | virtual void free(void* p, memory_space_t space) |
397 | { |
398 | if(p) |
399 | { |
400 | cudaError_t result = (memory_space_device == space) ? cudaFree(p) : cudaFreeHost(p); |
401 | if(cudaSuccess != result) throw cuda_exception_t(result); |
402 | } |
403 | } |
404 | |
405 | virtual void synchronize() |
406 | { |
407 | cudaError_t result = _stream ? |
408 | cudaStreamSynchronize(_stream) : |
409 | cudaDeviceSynchronize(); |
410 | if(cudaSuccess != result) throw cuda_exception_t(result); |
411 | } |
412 | |
413 | virtual cudaEvent_t event() |
414 | { |
415 | return _event; |
416 | } |
417 | |
418 | virtual void timer_begin() |
419 | { |
420 | cudaEventRecord(_timer[0], _stream); |
421 | } |
422 | |
423 | virtual double timer_end() |
424 | { |
425 | cudaEventRecord(_timer[1], _stream); |
426 | cudaEventSynchronize(_timer[1]); |
427 | float ms; |
428 | cudaEventElapsedTime(&ms, _timer[0], _timer[1]); |
429 | return ms / 1.0e3; |
430 | } |
431 | |
432 | virtual int getDevice() |
433 | { |
434 | int dev = 0; |
435 | |
436 | cudaGetDevice(&dev); |
437 | |
438 | return dev; |
439 | } |
440 | }; |
441 | |
442 | } |
443 | |
#endif /* __NVCC__ */
445 | |
446 | #else |
447 | |
448 | namespace mgpu |
449 | { |
450 | |
451 | enum gpu_context_opt |
452 | { |
453 | no_print_props,//!< no_print_props |
454 | print_props, //!< print_props |
455 | dummy //!< dummy |
456 | }; |
457 | |
	// Stub context class used when neither CUDA_ON_CPU nor CUDA_GPU is defined
459 | |
460 | struct ofp_context_t |
461 | { |
		ofp_context_t(gpu_context_opt opt = gpu_context_opt::no_print_props, int dev_num = 0)
463 | {} |
464 | }; |
465 | } |
466 | |
#endif /* CUDA_GPU */
468 | |
#endif /* CUDA_ON_CPU */
470 | |
471 | |
472 | #endif /* OFP_CONTEXT_HXX_ */ |
473 | |