1/*
2 * SparseGrid_conv_opt.hpp
3 *
4 * Created on: Jul 19, 2020
5 * Author: i-bird
6 */
7
8#ifndef SPARSEGRID_CONV_OPT_HPP_
9#define SPARSEGRID_CONV_OPT_HPP_
10
11template<unsigned int l>
12union data_il
13{
14};
15
16template<>
17union data_il<8>
18{
19 typedef long int type;
20
21 unsigned char uc[8];
22 long int i;
23};
24
25template<>
26union data_il<4>
27{
28 typedef int type;
29
30 unsigned char uc[4];
31 int i;
32};
33
34template<>
35union data_il<2>
36{
37 typedef short int type;
38
39 unsigned char uc[2];
40 short int i;
41};
42
43template<>
44union data_il<1>
45{
46 typedef char type;
47
48 unsigned char uc[4];
49 char i;
50};
51
52
53template<unsigned int dim, unsigned int sz>
54struct ids_crs
55{
56 long int sumdm[dim];
57 long int sumdp[dim];
58
59 long int s2;
60 bool mask_row[sz];
61 int k;
62};
63
64
65
66template<unsigned int dim>
67struct conv_impl
68{
69 template<unsigned int prop_src, unsigned int prop_dst, unsigned int stencil_size , unsigned int N, typename SparseGridType, typename lambda_f, typename ... ArgsT >
70 void conv(int (& stencil)[N][3], grid_key_dx<3> & start, grid_key_dx<3> & stop, SparseGridType & grid , lambda_f func, ArgsT ... args)
71 {
72#ifndef __NVCC__
73 std::cout << __FILE__ << ":" << __LINE__ << " error conv operation not implemented for this dimension " << std::endl;
74#else
75 std::cout << __FILE__ << ":" << __LINE__ << " error conv is unsupported when compiled on NVCC " << std::endl;
76#endif
77 }
78
79 template<bool findNN, unsigned int prop_src, unsigned int prop_dst, unsigned int stencil_size, typename SparseGridType, typename lambda_f, typename ... ArgsT >
80 static void conv_cross(grid_key_dx<3> & start, grid_key_dx<3> & stop, SparseGridType & grid , lambda_f func, ArgsT ... args)
81 {
82#ifndef __NVCC__
83 std::cout << __FILE__ << ":" << __LINE__ << " error conv_cross operation not implemented for this dimension " << std::endl;
84#else
85 std::cout << __FILE__ << ":" << __LINE__ << " error conv_cross is unsupported when compiled on NVCC " << std::endl;
86#endif
87 }
88
89 template<bool findNN, typename NNType, unsigned int prop_src1, unsigned int prop_src2,
90 unsigned int prop_dst1, unsigned int prop_dst2,
91 unsigned int stencil_size , unsigned int N,
92 typename SparseGridType, typename lambda_f, typename ... ArgsT >
93 static void conv2(int (& stencil)[N][3], grid_key_dx<3> & start, grid_key_dx<3> & stop, SparseGridType & grid , lambda_f func, ArgsT ... args)
94 {
95#ifndef __NVCC__
96 std::cout << __FILE__ << ":" << __LINE__ << " error conv2 operation not implemented for this dimension " << std::endl;
97#else
98 std::cout << __FILE__ << ":" << __LINE__ << " error conv2 is unsupported when compiled on NVCC " << std::endl;
99#endif
100 }
101
102 template<bool findNN, unsigned int prop_src1, unsigned int prop_src2, unsigned int prop_dst1, unsigned int prop_dst2, unsigned int stencil_size, typename SparseGridType, typename lambda_f, typename ... ArgsT >
103 static void conv_cross2(grid_key_dx<3> & start, grid_key_dx<3> & stop, SparseGridType & grid , lambda_f func, ArgsT ... args)
104 {
105#ifndef __NVCC__
106 std::cout << __FILE__ << ":" << __LINE__ << " error conv_cross2 operation not implemented for this dimension " << std::endl;
107#else
108 std::cout << __FILE__ << ":" << __LINE__ << " error conv_cross2 is unsupported when compiled on NVCC " << std::endl;
109#endif
110 }
111};
112
113#if !defined(__NVCC__) || defined(CUDA_ON_CPU)
114
115
116template<unsigned int dir,int p, unsigned int prop_src1,typename chunk_type, typename vect_type, typename ids_type>
117void load_crs(vect_type & cs1, chunk_type & chunk, ids_type & ids)
118{
119 if (dir == 0 && p < 0)
120 {
121 Vc::Vector<typename vect_type::EntryType> cmd1(&chunk.template get<prop_src1>()[ids.s2]);
122
123 cs1 = cmd1;
124 cs1 = cs1.shifted(-1);
125 cs1[0] = chunk.template get<prop_src1>()[ids.sumdm[dir]];
126 }
127 else if (dir == 0 && p > 0)
128 {
129 Vc::Vector<typename vect_type::EntryType> cmd1(&chunk.template get<prop_src1>()[ids.s2]);
130
131 cs1 = cmd1;
132 cs1 = cs1.shifted(1);
133 cs1[Vc::Vector<typename vect_type::EntryType>::Size - 1] = chunk.template get<prop_src1>()[ids.sumdp[0]];
134 }
135 else if (p < 0)
136 {
137 cs1.load(&chunk.template get<prop_src1>()[ids.sumdm[dir]],Vc::Aligned);
138 }
139 else if (p > 0)
140 {
141 cs1.load(&chunk.template get<prop_src1>()[ids.sumdp[dir]],Vc::Aligned);
142 }
143 else
144 {
145 Vc::Vector<typename vect_type::EntryType> cmd1(&chunk.template get<prop_src1>()[ids.s2]);
146
147 cs1 = cmd1;
148 }
149}
150
151template<unsigned int prop_dst1,typename chunk_type, typename vect_type, typename ids_type>
152void store_crs(chunk_type & chunk, vect_type & res, ids_type & ids)
153{
154 Vc::Mask<typename vect_type::EntryType> m(&ids.mask_row[ids.k]);
155
156 res.store(&chunk.template get<prop_dst1>()[ids.s2],m,Vc::Aligned);
157}
158
159template<unsigned int prop_dst1,unsigned int comp, typename chunk_type, typename vect_type, typename ids_type>
160void store_crs_v(chunk_type & chunk, vect_type & res, ids_type & ids)
161{
162 Vc::Mask<typename vect_type::EntryType> m(&ids.mask_row[ids.k]);
163
164 res.store(&chunk.template get<prop_dst1>()[comp][ids.s2],m,Vc::Aligned);
165}
166
167template<unsigned int dir,int p, unsigned int comp, unsigned int prop_src1,typename chunk_type, typename vect_type, typename ids_type>
168void load_crs_v(vect_type & cs1, chunk_type & chunk, ids_type & ids)
169{
170 if (dir == 0 && p < 0)
171 {
172 Vc::Vector<typename vect_type::EntryType> cmd1(&chunk.template get<prop_src1>()[comp][ids.s2]);
173
174 cs1 = cmd1;
175 cs1 = cs1.shifted(-1);
176 cs1[0] = chunk.template get<prop_src1>()[comp][ids.sumdm[dir]];
177 }
178 else if (dir == 0 && p > 0)
179 {
180 Vc::Vector<typename vect_type::EntryType> cmd1(&chunk.template get<prop_src1>()[comp][ids.s2]);
181
182 cs1 = cmd1;
183 cs1 = cs1.shifted(1);
184 cs1[Vc::Vector<typename vect_type::EntryType>::Size - 1] = chunk.template get<prop_src1>()[comp][ids.sumdp[dir]];
185 }
186 else if (p < 0)
187 {
188 cs1.load(&chunk.template get<prop_src1>()[comp][ids.sumdm[dir]],Vc::Aligned);
189 }
190 else if (p > 0)
191 {
192 cs1.load(&chunk.template get<prop_src1>()[comp][ids.sumdp[dir]],Vc::Aligned);
193 }
194 else
195 {
196 Vc::Vector<typename vect_type::EntryType> cmd1(&chunk.template get<prop_src1>()[comp][ids.s2]);
197
198 cs1 = cmd1;
199 }
200}
201
202struct cross_stencil_v
203{
204 Vc::double_v xm;
205 Vc::double_v xp;
206 Vc::double_v ym;
207 Vc::double_v yp;
208 Vc::double_v zm;
209 Vc::double_v zp;
210};
211
212template<>
213struct conv_impl<3>
214{
215 template<bool findNN, typename NNtype, unsigned int prop_src, unsigned int prop_dst, unsigned int stencil_size , unsigned int N, typename SparseGridType, typename lambda_f, typename ... ArgsT >
216 static void conv(int (& stencil)[N][3], grid_key_dx<3> & start, grid_key_dx<3> & stop, SparseGridType & grid , lambda_f func, ArgsT ... args)
217 {
218 auto it = grid.template getBlockIterator<stencil_size>(start,stop);
219
220 typedef typename boost::mpl::at<typename SparseGridType::value_type::type, boost::mpl::int_<prop_src>>::type prop_type;
221
222 unsigned char mask[decltype(it)::sizeBlockBord];
223 unsigned char mask_sum[decltype(it)::sizeBlockBord];
224 unsigned char mask_unused[decltype(it)::sizeBlock];
225 __attribute__ ((aligned (32))) prop_type block_bord_src[decltype(it)::sizeBlockBord];
226 __attribute__ ((aligned (32))) prop_type block_bord_dst[decltype(it)::sizeBlock];
227
228 typedef typename boost::mpl::at<typename decltype(it)::stop_border_vmpl,boost::mpl::int_<0>>::type sz0;
229 typedef typename boost::mpl::at<typename decltype(it)::stop_border_vmpl,boost::mpl::int_<1>>::type sz1;
230 typedef typename boost::mpl::at<typename decltype(it)::stop_border_vmpl,boost::mpl::int_<2>>::type sz2;
231
232 while (it.isNext())
233 {
234 it.template loadBlockBorder<prop_src,NNtype,findNN>(block_bord_src,mask);
235
236 if (it.start_b(2) != stencil_size || it.start_b(1) != stencil_size || it.start_b(0) != stencil_size ||
237 it.stop_b(2) != sz2::value+stencil_size || it.stop_b(1) != sz1::value+stencil_size || it.stop_b(0) != sz0::value+stencil_size)
238 {
239 auto & header_mask = grid.private_get_header_mask();
240 auto & header_inf = grid.private_get_header_inf();
241
242 loadBlock_impl<prop_dst,0,3,typename decltype(it)::vector_blocks_exts_type, typename decltype(it)::vector_ext_type>::template loadBlock<decltype(it)::sizeBlock>(block_bord_dst,grid,it.getChunkId(),mask_unused);
243 }
244
245 // Sum the mask
246 for (int k = it.start_b(2) ; k < it.stop_b(2) ; k++)
247 {
248 for (int j = it.start_b(1) ; j < it.stop_b(1) ; j++)
249 {
250 int cc = it.LinB(it.start_b(0),j,k);
251 int c[N];
252
253 for (int s = 0 ; s < N ; s++)
254 {
255 c[s] = it.LinB(it.start_b(0)+stencil[s][0],j+stencil[s][1],k+stencil[s][2]);
256 }
257
258 for (int i = it.start_b(0) ; i < it.stop_b(0) ; i += sizeof(size_t))
259 {
260 size_t cmd = *(size_t *)&mask[cc];
261
262 if (cmd != 0)
263 {
264 size_t xm[N];
265
266 for (int s = 0 ; s < N ; s++)
267 {
268 xm[s] = *(size_t *)&mask[c[s]];
269 }
270
271 size_t sum = 0;
272 for (int s = 0 ; s < N ; s++)
273 {
274 sum += xm[s];
275 }
276
277 *(size_t *)&mask_sum[cc] = sum;
278 }
279
280 cc += sizeof(size_t);
281 for (int s = 0 ; s < N ; s++)
282 {
283 c[s] += sizeof(size_t);
284 }
285 }
286 }
287 }
288
289 for (int k = it.start_b(2) ; k < it.stop_b(2) ; k++)
290 {
291 for (int j = it.start_b(1) ; j < it.stop_b(1) ; j++)
292 {
293 int cc = it.LinB(it.start_b(0),j,k);
294 int c[N];
295
296 int cd = it.LinB_off(it.start_b(0),j,k);
297
298 for (int s = 0 ; s < N ; s++)
299 {
300 c[s] = it.LinB(it.start_b(0)+stencil[s][0],j+stencil[s][1],k+stencil[s][2]);
301 }
302
303 for (int i = it.start_b(0) ; i < it.stop_b(0) ; i += Vc::Vector<prop_type>::Size)
304 {
305 Vc::Mask<prop_type> cmp;
306
307 for (int s = 0 ; s < Vc::Vector<prop_type>::Size ; s++)
308 {
309 cmp[s] = (mask[cc+s] == true && i+s < it.stop_b(0));
310 }
311
312 // we do only if exist the point
313 if (Vc::none_of(cmp) == false)
314 {
315 Vc::Mask<prop_type> surround;
316
317 Vc::Vector<prop_type> xs[N+1];
318
319 xs[0] = Vc::Vector<prop_type>(&block_bord_src[cc],Vc::Unaligned);
320
321 for (int s = 1 ; s < N+1 ; s++)
322 {
323 xs[s] = Vc::Vector<prop_type>(&block_bord_src[c[s-1]],Vc::Unaligned);
324 }
325
326 auto res = func(xs, &mask_sum[cc], args ...);
327
328 res.store(&block_bord_dst[cd],cmp,Vc::Aligned);
329 }
330
331 cc += Vc::Vector<prop_type>::Size;
332 for (int s = 0 ; s < N ; s++)
333 {
334 c[s] += Vc::Vector<prop_type>::Size;
335 }
336 cd += Vc::Vector<prop_type>::Size;
337 }
338 }
339 }
340
341 it.template storeBlock<prop_dst>(block_bord_dst);
342
343 ++it;
344 }
345 }
346
347 template<bool findNN, unsigned int prop_src, unsigned int prop_dst, unsigned int stencil_size, typename SparseGridType, typename lambda_f, typename ... ArgsT >
348 static void conv_cross(grid_key_dx<3> & start, grid_key_dx<3> & stop, SparseGridType & grid , lambda_f func, ArgsT ... args)
349 {
350 auto it = grid.template getBlockIterator<1>(start,stop);
351
352 auto & datas = grid.private_get_data();
353 auto & headers = grid.private_get_header_mask();
354
355 typedef typename boost::mpl::at<typename decltype(it)::stop_border_vmpl,boost::mpl::int_<0>>::type sz0;
356 typedef typename boost::mpl::at<typename decltype(it)::stop_border_vmpl,boost::mpl::int_<1>>::type sz1;
357 typedef typename boost::mpl::at<typename decltype(it)::stop_border_vmpl,boost::mpl::int_<2>>::type sz2;
358
359 typedef typename SparseGridType::chunking_type chunking;
360
361 typedef typename boost::mpl::at<typename SparseGridType::value_type::type, boost::mpl::int_<prop_src>>::type prop_type;
362
363 while (it.isNext())
364 {
365 // Load
366 long int offset_jump[6];
367
368 size_t cid = it.getChunkId();
369
370 auto chunk = datas.get(cid);
371 auto & mask = headers.get(cid);
372
373 bool exist;
374 grid_key_dx<3> p = grid.getChunkPos(cid) + grid_key_dx<3>({-1,0,0});
375 long int r = grid.getChunk(p,exist);
376 offset_jump[0] = (r-cid)*decltype(it)::sizeBlock;
377
378 p = grid.getChunkPos(cid) + grid_key_dx<3>({1,0,0});
379 r = grid.getChunk(p,exist);
380 offset_jump[1] = (r-cid)*decltype(it)::sizeBlock;
381
382 p = grid.getChunkPos(cid) + grid_key_dx<3>({0,-1,0});
383 r = grid.getChunk(p,exist);
384 offset_jump[2] = (r-cid)*decltype(it)::sizeBlock;
385
386 p = grid.getChunkPos(cid) + grid_key_dx<3>({0,1,0});
387 r = grid.getChunk(p,exist);
388 offset_jump[3] = (r-cid)*decltype(it)::sizeBlock;
389
390 p = grid.getChunkPos(cid) + grid_key_dx<3>({0,0,-1});
391 r = grid.getChunk(p,exist);
392 offset_jump[4] = (r-cid)*decltype(it)::sizeBlock;
393
394 p = grid.getChunkPos(cid) + grid_key_dx<3>({0,0,1});
395 r = grid.getChunk(p,exist);
396 offset_jump[5] = (r-cid)*decltype(it)::sizeBlock;
397
398 // Load offset jumps
399
400 // construct a row mask
401
402 long int s2 = 0;
403
404 typedef typename boost::mpl::at<typename chunking::type,boost::mpl::int_<2>>::type sz;
405 typedef typename boost::mpl::at<typename chunking::type,boost::mpl::int_<1>>::type sy;
406 typedef typename boost::mpl::at<typename chunking::type,boost::mpl::int_<0>>::type sx;
407
408
409 bool mask_row[sx::value];
410
411 for (int k = 0 ; k < sx::value ; k++)
412 {
413 mask_row[k] = (k >= it.start(0) && k < it.stop(0))?true:false;
414 }
415
416 for (int v = it.start(2) ; v < it.stop(2) ; v++)
417 {
418 for (int j = it.start(1) ; j < it.stop(1) ; j++)
419 {
420 s2 = it.Lin(0,j,v);
421 for (int k = 0 ; k < sx::value ; k += Vc::Vector<prop_type>::Size)
422 {
423 // we do only id exist the point
424 if (*(int *)&mask.mask[s2] == 0) {s2 += Vc::Vector<prop_type>::Size; continue;}
425
426 data_il<Vc::Vector<prop_type>::Size> mxm;
427 data_il<Vc::Vector<prop_type>::Size> mxp;
428 data_il<Vc::Vector<prop_type>::Size> mym;
429 data_il<Vc::Vector<prop_type>::Size> myp;
430 data_il<Vc::Vector<prop_type>::Size> mzm;
431 data_il<Vc::Vector<prop_type>::Size> mzp;
432
433 cross_stencil_v cs;
434
435 Vc::Vector<prop_type> cmd(&chunk.template get<prop_src>()[s2]);
436
437 // Load x-1
438 long int sumxm = s2-1;
439 sumxm += (k==0)?offset_jump[0] + sx::value:0;
440
441 // Load x+1
442 long int sumxp = s2+Vc::Vector<prop_type>::Size;
443 sumxp += (k+Vc::Vector<prop_type>::Size == sx::value)?offset_jump[1] - sx::value:0;
444
445 long int sumym = (j == 0)?offset_jump[2] + (sy::value-1)*sx::value:-sx::value;
446 sumym += s2;
447 long int sumyp = (j == sy::value-1)?offset_jump[3] - (sy::value - 1)*sx::value:sx::value;
448 sumyp += s2;
449 long int sumzm = (v == 0)?offset_jump[4] + (sz::value-1)*sx::value*sy::value:-sx::value*sy::value;
450 sumzm += s2;
451 long int sumzp = (v == sz::value-1)?offset_jump[5] - (sz::value - 1)*sx::value*sy::value:sx::value*sy::value;
452 sumzp += s2;
453
454 if (Vc::Vector<prop_type>::Size == 2)
455 {
456 mxm.i = *(short int *)&mask.mask[s2];
457 mxm.i = mxm.i << 8;
458 mxm.i |= (short int)mask.mask[sumxm];
459
460 mxp.i = *(short int *)&mask.mask[s2];
461 mxp.i = mxp.i >> 8;
462 mxp.i |= ((short int)mask.mask[sumxp]) << (Vc::Vector<prop_type>::Size - 1)*8;
463
464 mym.i = *(short int *)&mask.mask[sumym];
465 myp.i = *(short int *)&mask.mask[sumyp];
466
467 mzm.i = *(short int *)&mask.mask[sumzm];
468 mzp.i = *(short int *)&mask.mask[sumzp];
469 }
470 else if (Vc::Vector<prop_type>::Size == 4)
471 {
472 mxm.i = *(int *)&mask.mask[s2];
473 mxm.i = mxm.i << 8;
474 mxm.i |= (int)mask.mask[sumxm];
475
476 mxp.i = *(int *)&mask.mask[s2];
477 mxp.i = mxp.i >> 8;
478 mxp.i |= ((int)mask.mask[sumxp]) << (Vc::Vector<prop_type>::Size - 1)*8;
479
480 mym.i = *(int *)&mask.mask[sumym];
481 myp.i = *(int *)&mask.mask[sumyp];
482
483 mzm.i = *(int *)&mask.mask[sumzm];
484 mzp.i = *(int *)&mask.mask[sumzp];
485 }
486 else
487 {
488 std::cout << __FILE__ << ":" << __LINE__ << " UNSUPPORTED" << std::endl;
489 }
490
491 cs.xm = cmd;
492 cs.xm = cs.xm.shifted(-1);
493 cs.xm[0] = chunk.template get<prop_src>()[sumxm];
494
495
496 cs.xp = cmd;
497 cs.xp = cs.xp.shifted(1);
498 cs.xp[Vc::Vector<prop_type>::Size - 1] = chunk.template get<prop_src>()[sumxp];
499
500 // Load y and z direction
501
502 cs.ym.load(&chunk.template get<prop_src>()[sumym],Vc::Aligned);
503 cs.yp.load(&chunk.template get<prop_src>()[sumyp],Vc::Aligned);
504 cs.zm.load(&chunk.template get<prop_src>()[sumzm],Vc::Aligned);
505 cs.zp.load(&chunk.template get<prop_src>()[sumzp],Vc::Aligned);
506
507 // Calculate
508
509 data_il<Vc::Vector<prop_type>::Size> tot_m;
510 tot_m.i = mxm.i + mxp.i + mym.i + myp.i + mzm.i + mzp.i;
511
512 Vc::Vector<prop_type> res = func(cmd,cs,tot_m.uc,args ... );
513
514 Vc::Mask<prop_type> m(&mask_row[k]);
515
516 res.store(&chunk.template get<prop_dst>()[s2],m,Vc::Aligned);
517
518 s2 += Vc::Vector<prop_type>::Size;
519 }
520 }
521 }
522
523 ++it;
524 }
525 }
526
527
528 template<bool findNN, typename NNType, unsigned int prop_src1, unsigned int prop_src2,
529 unsigned int prop_dst1, unsigned int prop_dst2,
530 unsigned int stencil_size , unsigned int N,
531 typename SparseGridType, typename lambda_f, typename ... ArgsT >
532 static void conv2(int (& stencil)[N][3], grid_key_dx<3> & start, grid_key_dx<3> & stop, SparseGridType & grid , lambda_f func, ArgsT ... args)
533 {
534 auto it = grid.template getBlockIterator<stencil_size>(start,stop);
535
536 typedef typename boost::mpl::at<typename SparseGridType::value_type::type, boost::mpl::int_<prop_src1>>::type prop_type;
537
538 unsigned char mask[decltype(it)::sizeBlockBord];
539 unsigned char mask_sum[decltype(it)::sizeBlockBord];
540 __attribute__ ((aligned (64))) prop_type block_bord_src1[decltype(it)::sizeBlockBord];
541 __attribute__ ((aligned (64))) prop_type block_bord_dst1[decltype(it)::sizeBlock+16];
542 __attribute__ ((aligned (64))) prop_type block_bord_src2[decltype(it)::sizeBlockBord];
543 __attribute__ ((aligned (64))) prop_type block_bord_dst2[decltype(it)::sizeBlock+16];
544
545 typedef typename boost::mpl::at<typename decltype(it)::stop_border_vmpl,boost::mpl::int_<0>>::type sz0;
546 typedef typename boost::mpl::at<typename decltype(it)::stop_border_vmpl,boost::mpl::int_<1>>::type sz1;
547 typedef typename boost::mpl::at<typename decltype(it)::stop_border_vmpl,boost::mpl::int_<2>>::type sz2;
548
549 while (it.isNext())
550 {
551 it.template loadBlockBorder<prop_src1,NNType,findNN>(block_bord_src1,mask);
552 it.template loadBlockBorder<prop_src2,NNType,findNN>(block_bord_src2,mask);
553
554 // Sum the mask
555 for (int k = it.start_b(2) ; k < it.stop_b(2) ; k++)
556 {
557 for (int j = it.start_b(1) ; j < it.stop_b(1) ; j++)
558 {
559 int cc = it.LinB(it.start_b(0),j,k);
560 int c[N];
561
562 for (int s = 0 ; s < N ; s++)
563 {
564 c[s] = it.LinB(it.start_b(0)+stencil[s][0],j+stencil[s][1],k+stencil[s][2]);
565 }
566
567 for (int i = it.start_b(0) ; i < it.stop_b(0) ; i += sizeof(size_t))
568 {
569 size_t cmd = *(size_t *)&mask[cc];
570
571 if (cmd == 0) {continue;}
572
573
574 size_t xm[N];
575
576 for (int s = 0 ; s < N ; s++)
577 {
578 xm[s] = *(size_t *)&mask[c[s]];
579 }
580
581 size_t sum = 0;
582 for (int s = 0 ; s < N ; s++)
583 {
584 sum += xm[s];
585 }
586
587 *(size_t *)&mask_sum[cc] = sum;
588
589 cc += sizeof(size_t);
590 for (int s = 0 ; s < N ; s++)
591 {
592 c[s] += sizeof(size_t);
593 }
594 }
595 }
596 }
597
598 for (int k = it.start_b(2) ; k < it.stop_b(2) ; k++)
599 {
600 for (int j = it.start_b(1) ; j < it.stop_b(1) ; j++)
601 {
602 int cc = it.LinB(it.start_b(0),j,k);
603 int c[N];
604
605 int cd = it.LinB_off(it.start_b(0),j,k);
606
607 for (int s = 0 ; s < N ; s++)
608 {
609 c[s] = it.LinB(it.start_b(0)+stencil[s][0],j+stencil[s][1],k+stencil[s][2]);
610 }
611
612 for (int i = it.start_b(0) ; i < it.stop_b(0) ; i += Vc::Vector<prop_type>::Size)
613 {
614 Vc::Mask<prop_type> cmp;
615
616 for (int s = 0 ; s < Vc::Vector<prop_type>::Size ; s++)
617 {
618 cmp[s] = (mask[cc+s] == true);
619 }
620
621 // we do only id exist the point
622 if (Vc::none_of(cmp) == true) {continue;}
623
624 Vc::Mask<prop_type> surround;
625
626 Vc::Vector<prop_type> xs1[N+1];
627 Vc::Vector<prop_type> xs2[N+1];
628
629 xs1[0] = Vc::Vector<prop_type>(&block_bord_src1[cc],Vc::Unaligned);
630 xs2[0] = Vc::Vector<prop_type>(&block_bord_src2[cc],Vc::Unaligned);
631
632 for (int s = 1 ; s < N+1 ; s++)
633 {
634 xs1[s] = Vc::Vector<prop_type>(&block_bord_src1[c[s-1]],Vc::Unaligned);
635 xs2[s] = Vc::Vector<prop_type>(&block_bord_src2[c[s-1]],Vc::Unaligned);
636 }
637
638 Vc::Vector<prop_type> vo1;
639 Vc::Vector<prop_type> vo2;
640
641 func(vo1, vo2, xs1, xs2, &mask_sum[cc], args ...);
642
643 vo1.store(&block_bord_dst1[cd],cmp,Vc::Unaligned);
644 vo2.store(&block_bord_dst2[cd],cmp,Vc::Unaligned);
645
646 cc += Vc::Vector<prop_type>::Size;
647 for (int s = 0 ; s < N ; s++)
648 {
649 c[s] += Vc::Vector<prop_type>::Size;
650 }
651 cd += Vc::Vector<prop_type>::Size;
652 }
653 }
654 }
655
656 it.template storeBlock<prop_dst1>(block_bord_dst1);
657 it.template storeBlock<prop_dst2>(block_bord_dst2);
658
659 ++it;
660 }
661 }
662
663 template<bool findNN, unsigned int prop_src1, unsigned int prop_src2, unsigned int prop_dst1, unsigned int prop_dst2, unsigned int stencil_size, typename SparseGridType, typename lambda_f, typename ... ArgsT >
664 static void conv_cross2(grid_key_dx<3> & start, grid_key_dx<3> & stop, SparseGridType & grid , lambda_f func, ArgsT ... args)
665 {
666 auto it = grid.template getBlockIterator<stencil_size>(start,stop);
667
668 auto & datas = grid.private_get_data();
669 auto & headers = grid.private_get_header_mask();
670
671 typedef typename boost::mpl::at<typename decltype(it)::stop_border_vmpl,boost::mpl::int_<0>>::type sz0;
672 typedef typename boost::mpl::at<typename decltype(it)::stop_border_vmpl,boost::mpl::int_<1>>::type sz1;
673 typedef typename boost::mpl::at<typename decltype(it)::stop_border_vmpl,boost::mpl::int_<2>>::type sz2;
674
675 typedef typename SparseGridType::chunking_type chunking;
676
677 typedef typename boost::mpl::at<typename SparseGridType::value_type::type, boost::mpl::int_<prop_src1>>::type prop_type;
678
679 while (it.isNext())
680 {
681 // Load
682 long int offset_jump[6];
683
684 size_t cid = it.getChunkId();
685
686 auto chunk = datas.get(cid);
687 auto & mask = headers.get(cid);
688
689 bool exist;
690 grid_key_dx<3> p = grid.getChunkPos(cid) + grid_key_dx<3>({-1,0,0});
691 long int r = grid.getChunk(p,exist);
692 offset_jump[0] = (r-cid)*decltype(it)::sizeBlock;
693
694 p = grid.getChunkPos(cid) + grid_key_dx<3>({1,0,0});
695 r = grid.getChunk(p,exist);
696 offset_jump[1] = (r-cid)*decltype(it)::sizeBlock;
697
698 p = grid.getChunkPos(cid) + grid_key_dx<3>({0,-1,0});
699 r = grid.getChunk(p,exist);
700 offset_jump[2] = (r-cid)*decltype(it)::sizeBlock;
701
702 p = grid.getChunkPos(cid) + grid_key_dx<3>({0,1,0});
703 r = grid.getChunk(p,exist);
704 offset_jump[3] = (r-cid)*decltype(it)::sizeBlock;
705
706 p = grid.getChunkPos(cid) + grid_key_dx<3>({0,0,-1});
707 r = grid.getChunk(p,exist);
708 offset_jump[4] = (r-cid)*decltype(it)::sizeBlock;
709
710 p = grid.getChunkPos(cid) + grid_key_dx<3>({0,0,1});
711 r = grid.getChunk(p,exist);
712 offset_jump[5] = (r-cid)*decltype(it)::sizeBlock;
713
714 // Load offset jumps
715
716 // construct a row mask
717
718 long int s2 = 0;
719
720 typedef typename boost::mpl::at<typename chunking::type,boost::mpl::int_<2>>::type sz;
721 typedef typename boost::mpl::at<typename chunking::type,boost::mpl::int_<1>>::type sy;
722 typedef typename boost::mpl::at<typename chunking::type,boost::mpl::int_<0>>::type sx;
723
724
725 bool mask_row[sx::value];
726
727 for (int k = 0 ; k < sx::value ; k++)
728 {
729 mask_row[k] = (k >= it.start(0) && k < it.stop(0))?true:false;
730 }
731
732 for (int v = it.start(2) ; v < it.stop(2) ; v++)
733 {
734 for (int j = it.start(1) ; j < it.stop(1) ; j++)
735 {
736 s2 = it.Lin(0,j,v);
737 for (int k = 0 ; k < sx::value ; k += Vc::Vector<prop_type>::Size)
738 {
739 // we do only id exist the point
740 if (*(int *)&mask.mask[s2] == 0) {s2 += Vc::Vector<prop_type>::Size; continue;}
741
742 data_il<4> mxm;
743 data_il<4> mxp;
744 data_il<4> mym;
745 data_il<4> myp;
746 data_il<4> mzm;
747 data_il<4> mzp;
748
749 cross_stencil_v cs1;
750 cross_stencil_v cs2;
751
752 Vc::Vector<prop_type> cmd1(&chunk.template get<prop_src1>()[s2]);
753 Vc::Vector<prop_type> cmd2(&chunk.template get<prop_src2>()[s2]);
754
755 // Load x-1
756 long int sumxm = s2-1;
757 sumxm += (k==0)?offset_jump[0] + sx::value:0;
758
759 // Load x+1
760 long int sumxp = s2+Vc::Vector<prop_type>::Size;
761 sumxp += (k+Vc::Vector<prop_type>::Size == sx::value)?offset_jump[1] - sx::value:0;
762
763 long int sumym = (j == 0)?offset_jump[2] + (sy::value-1)*sx::value:-sx::value;
764 sumym += s2;
765 long int sumyp = (j == sy::value-1)?offset_jump[3] - (sy::value - 1)*sx::value:sx::value;
766 sumyp += s2;
767 long int sumzm = (v == 0)?offset_jump[4] + (sz::value-1)*sx::value*sy::value:-sx::value*sy::value;
768 sumzm += s2;
769 long int sumzp = (v == sz::value-1)?offset_jump[5] - (sz::value - 1)*sx::value*sy::value:sx::value*sy::value;
770 sumzp += s2;
771
772 if (Vc::Vector<prop_type>::Size == 2)
773 {
774 mxm.i = *(short int *)&mask.mask[s2];
775 mxm.i = mxm.i << 8;
776 mxm.i |= (short int)mask.mask[sumxm];
777
778 mxp.i = *(short int *)&mask.mask[s2];
779 mxp.i = mxp.i >> 8;
780 mxp.i |= ((short int)mask.mask[sumxp]) << (Vc::Vector<prop_type>::Size - 1)*8;
781
782 mym.i = *(short int *)&mask.mask[sumym];
783 myp.i = *(short int *)&mask.mask[sumyp];
784
785 mzm.i = *(short int *)&mask.mask[sumzm];
786 mzp.i = *(short int *)&mask.mask[sumzp];
787 }
788 else if (Vc::Vector<prop_type>::Size == 4)
789 {
790 mxm.i = *(int *)&mask.mask[s2];
791 mxm.i = mxm.i << 8;
792 mxm.i |= (int)mask.mask[sumxm];
793
794 mxp.i = *(int *)&mask.mask[s2];
795 mxp.i = mxp.i >> 8;
796 mxp.i |= ((int)mask.mask[sumxp]) << (Vc::Vector<prop_type>::Size - 1)*8;
797
798 mym.i = *(int *)&mask.mask[sumym];
799 myp.i = *(int *)&mask.mask[sumyp];
800
801 mzm.i = *(int *)&mask.mask[sumzm];
802 mzp.i = *(int *)&mask.mask[sumzp];
803 }
804 else
805 {
806 std::cout << __FILE__ << ":" << __LINE__ << " UNSUPPORTED" << std::endl;
807 }
808
809 cs1.xm = cmd1;
810 cs1.xm = cs1.xm.shifted(-1);
811 cs1.xm[0] = chunk.template get<prop_src1>()[sumxm];
812
813 cs2.xm = cmd2;
814 cs2.xm = cs2.xm.shifted(-1);
815 cs2.xm[0] = chunk.template get<prop_src2>()[sumxm];
816
817 cs1.xp = cmd1;
818 cs1.xp = cs1.xp.shifted(1);
819 cs1.xp[Vc::Vector<prop_type>::Size - 1] = chunk.template get<prop_src1>()[sumxp];
820
821 cs2.xp = cmd2;
822 cs2.xp = cs2.xp.shifted(1);
823 cs2.xp[Vc::Vector<prop_type>::Size - 1] = chunk.template get<prop_src2>()[sumxp];
824
825 // Load y and z direction
826
827 cs1.ym.load(&chunk.template get<prop_src1>()[sumym],Vc::Aligned);
828 cs1.yp.load(&chunk.template get<prop_src1>()[sumyp],Vc::Aligned);
829 cs1.zm.load(&chunk.template get<prop_src1>()[sumzm],Vc::Aligned);
830 cs1.zp.load(&chunk.template get<prop_src1>()[sumzp],Vc::Aligned);
831
832 cs2.ym.load(&chunk.template get<prop_src2>()[sumym],Vc::Aligned);
833 cs2.yp.load(&chunk.template get<prop_src2>()[sumyp],Vc::Aligned);
834 cs2.zm.load(&chunk.template get<prop_src2>()[sumzm],Vc::Aligned);
835 cs2.zp.load(&chunk.template get<prop_src2>()[sumzp],Vc::Aligned);
836
837 // Calculate
838
839 data_il<4> tot_m;
840 tot_m.i = mxm.i + mxp.i + mym.i + myp.i + mzm.i + mzp.i;
841
842 Vc::Vector<prop_type> res1;
843 Vc::Vector<prop_type> res2;
844
845 func(res1,res2,cmd1,cmd2,cs1,cs2,tot_m.uc,args ... );
846
847 Vc::Mask<prop_type> m(&mask_row[k]);
848
849 res1.store(&chunk.template get<prop_dst1>()[s2],m,Vc::Aligned);
850 res2.store(&chunk.template get<prop_dst2>()[s2],m,Vc::Aligned);
851
852 s2 += Vc::Vector<prop_type>::Size;
853 }
854 }
855 }
856
857 ++it;
858 }
859 }
860
861 template<bool findNN, unsigned int stencil_size, typename prop_type, typename SparseGridType, typename lambda_f, typename ... ArgsT >
862 static void conv_cross_ids(grid_key_dx<3> & start, grid_key_dx<3> & stop, SparseGridType & grid , lambda_f func, ArgsT ... args)
863 {
864 auto it = grid.template getBlockIterator<stencil_size>(start,stop);
865
866 auto & datas = grid.private_get_data();
867 auto & headers = grid.private_get_header_mask();
868
869 typedef typename boost::mpl::at<typename decltype(it)::stop_border_vmpl,boost::mpl::int_<0>>::type sz0;
870 typedef typename boost::mpl::at<typename decltype(it)::stop_border_vmpl,boost::mpl::int_<1>>::type sz1;
871 typedef typename boost::mpl::at<typename decltype(it)::stop_border_vmpl,boost::mpl::int_<2>>::type sz2;
872
873 typedef typename SparseGridType::chunking_type chunking;
874
875 while (it.isNext())
876 {
877 // Load
878 long int offset_jump[6];
879
880 size_t cid = it.getChunkId();
881
882 auto chunk = datas.get(cid);
883 auto & mask = headers.get(cid);
884
885 bool exist;
886 grid_key_dx<3> p = grid.getChunkPos(cid) + grid_key_dx<3>({-1,0,0});
887 long int r = grid.getChunk(p,exist);
888 offset_jump[0] = (r-cid)*decltype(it)::sizeBlock;
889
890 p = grid.getChunkPos(cid) + grid_key_dx<3>({1,0,0});
891 r = grid.getChunk(p,exist);
892 offset_jump[1] = (r-cid)*decltype(it)::sizeBlock;
893
894 p = grid.getChunkPos(cid) + grid_key_dx<3>({0,-1,0});
895 r = grid.getChunk(p,exist);
896 offset_jump[2] = (r-cid)*decltype(it)::sizeBlock;
897
898 p = grid.getChunkPos(cid) + grid_key_dx<3>({0,1,0});
899 r = grid.getChunk(p,exist);
900 offset_jump[3] = (r-cid)*decltype(it)::sizeBlock;
901
902 p = grid.getChunkPos(cid) + grid_key_dx<3>({0,0,-1});
903 r = grid.getChunk(p,exist);
904 offset_jump[4] = (r-cid)*decltype(it)::sizeBlock;
905
906 p = grid.getChunkPos(cid) + grid_key_dx<3>({0,0,1});
907 r = grid.getChunk(p,exist);
908 offset_jump[5] = (r-cid)*decltype(it)::sizeBlock;
909
910 // Load offset jumps
911
912 // construct a row mask
913
914 long int s2 = 0;
915
916 typedef typename boost::mpl::at<typename chunking::type,boost::mpl::int_<2>>::type sz;
917 typedef typename boost::mpl::at<typename chunking::type,boost::mpl::int_<1>>::type sy;
918 typedef typename boost::mpl::at<typename chunking::type,boost::mpl::int_<0>>::type sx;
919
920 ids_crs<3,sx::value> ids;
921
922 for (int k = 0 ; k < sx::value ; k++)
923 {
924 ids.mask_row[k] = (k >= it.start(0) && k < it.stop(0))?true:false;
925 }
926
927 for (int v = it.start(2) ; v < it.stop(2) ; v++)
928 {
929 for (int j = it.start(1) ; j < it.stop(1) ; j++)
930 {
931 s2 = it.Lin(0,j,v);
932 for (int k = 0 ; k < sx::value ; k += Vc::Vector<prop_type>::Size)
933 {
934 // we do only id exist the point
935 if (*(int *)&mask.mask[s2] == 0) {s2 += Vc::Vector<prop_type>::Size; continue;}
936
937 data_il<4> mxm;
938 data_il<4> mxp;
939 data_il<4> mym;
940 data_il<4> myp;
941 data_il<4> mzm;
942 data_il<4> mzp;
943
944 ids.k = k;
945
946 // Load x-1
947 ids.sumdm[0] = s2-1;
948 ids.sumdm[0] += (k==0)?offset_jump[0] + sx::value:0;
949
950 // Load x+1
951 ids.sumdp[0] = s2+Vc::Vector<prop_type>::Size;
952 ids.sumdp[0] += (k+Vc::Vector<prop_type>::Size == sx::value)?offset_jump[1] - sx::value:0;
953
954 ids.sumdm[1] = (j == 0)?offset_jump[2] + (sy::value-1)*sx::value:-sx::value;
955 ids.sumdm[1] += s2;
956 ids.sumdp[1] = (j == sy::value-1)?offset_jump[3] - (sy::value - 1)*sx::value:sx::value;
957 ids.sumdp[1] += s2;
958 ids.sumdm[2] = (v == 0)?offset_jump[4] + (sz::value-1)*sx::value*sy::value:-sx::value*sy::value;
959 ids.sumdm[2] += s2;
960 ids.sumdp[2] = (v == sz::value-1)?offset_jump[5] - (sz::value - 1)*sx::value*sy::value:sx::value*sy::value;
961 ids.sumdp[2] += s2;
962
963 ids.s2 = s2;
964
965 if (Vc::Vector<prop_type>::Size == 2)
966 {
967 mxm.i = *(short int *)&mask.mask[s2];
968 mxm.i = mxm.i << 8;
969 mxm.i |= (short int)mask.mask[ids.sumdm[0]];
970
971 mxp.i = *(short int *)&mask.mask[s2];
972 mxp.i = mxp.i >> 8;
973 mxp.i |= ((short int)mask.mask[ids.sumdp[0]]) << (Vc::Vector<prop_type>::Size - 1)*8;
974
975 mym.i = *(short int *)&mask.mask[ids.sumdm[1]];
976 myp.i = *(short int *)&mask.mask[ids.sumdp[1]];
977
978 mzm.i = *(short int *)&mask.mask[ids.sumdm[2]];
979 mzp.i = *(short int *)&mask.mask[ids.sumdp[2]];
980 }
981 else if (Vc::Vector<prop_type>::Size == 4)
982 {
983 mxm.i = *(int *)&mask.mask[s2];
984 mxm.i = mxm.i << 8;
985 mxm.i |= (int)mask.mask[ids.sumdm[0]];
986
987 mxp.i = *(int *)&mask.mask[s2];
988 mxp.i = mxp.i >> 8;
989 mxp.i |= ((int)mask.mask[ids.sumdp[0]]) << (Vc::Vector<prop_type>::Size - 1)*8;
990
991 mym.i = *(int *)&mask.mask[ids.sumdm[1]];
992 myp.i = *(int *)&mask.mask[ids.sumdp[1]];
993
994 mzm.i = *(int *)&mask.mask[ids.sumdm[2]];
995 mzp.i = *(int *)&mask.mask[ids.sumdp[2]];
996 }
997 else
998 {
999 std::cout << __FILE__ << ":" << __LINE__ << " UNSUPPORTED" << std::endl;
1000 }
1001
1002 // Calculate
1003
1004 data_il<4> tot_m;
1005 tot_m.i = mxm.i + mxp.i + mym.i + myp.i + mzm.i + mzp.i;
1006
1007 func(chunk,ids,tot_m.uc,args ... );
1008
1009 s2 += Vc::Vector<prop_type>::Size;
1010 }
1011 }
1012 }
1013
1014 ++it;
1015 }
1016 }
1017
1018};
1019
1020#endif
1021
1022
1023#endif /* SPARSEGRID_CONV_OPT_HPP_ */
1024