1 | /* |
2 | * compute_optimal_device_grid.hpp |
3 | * |
4 | * Created on: Oct 1, 2017 |
5 | * Author: i-bird |
6 | */ |
7 | |
8 | #ifndef OPENFPM_DATA_SRC_UTIL_COMPUTE_OPTIMAL_DEVICE_GRID_HPP_ |
9 | #define OPENFPM_DATA_SRC_UTIL_COMPUTE_OPTIMAL_DEVICE_GRID_HPP_ |
10 | |
11 | |
12 | /*! \brief Calculate an optimal decomposition in grids and threads for GPU or parallel accelerators |
13 | * |
14 | * First it factorize the grid size in each direction in prime factors. It try to get |
15 | * a multiple of two for the block size that does not pass the maximum block size. |
16 | * If the block size result too small (smaller than min_block_size) it try to construct a block |
17 | * non multiple of two but bigger than min_block_size and smaller than max_block_size. |
18 | * The min_block_size is constrain weakly imposed. When impossible (or difficult) to find |
19 | * a block size in the range the min_block_size constrain can be broken, but the max_block_size |
20 | * is never broken |
21 | * |
22 | * the function guarantee that |
23 | * |
24 | * sz[0] = dg.threads.x*dg.grids.x |
25 | * sz[1] = dg.threads.y*dg.grids.y |
26 | * sz[2]*...sz[dim-1] = dg.threads.z*dg.grids.z |
27 | * |
28 | * |
29 | * |
30 | * \param dg device grid |
31 | * \param sz size of the grid |
32 | * \param max_block_size maximum block size |
33 | * \param min_block_size minimum block size |
34 | * |
35 | */ |
36 | template<unsigned int dim> |
37 | void calculate_optimal_device_grid(device_grid<dim> & dg, |
38 | size_t (& sz)[dim], |
39 | size_t max_block_size, |
40 | size_t min_block_size) |
41 | { |
42 | |
43 | if (dim == 0) {return ;} |
44 | |
45 | size_t tot_block = 1; |
46 | |
47 | // Here we calculate the factors for each grid dimension and prioritize the |
48 | // factors by 2 for the blocks |
49 | |
50 | // Get the factors for x |
51 | std::vector<unsigned long int> x; |
52 | openfpm::math::getFactorization(sz[0],x); |
53 | |
54 | dg.threads.x = 1; |
55 | size_t jx = 0; |
56 | |
57 | while(jx < x.size() && x[jx] == 2 && tot_block < max_block_size) |
58 | { |
59 | if (tot_block * 2 > max_block_size) |
60 | {break;} |
61 | |
62 | dg.threads.x *= 2; |
63 | tot_block *= 2; |
64 | |
65 | jx++; |
66 | } |
67 | |
68 | // if we already reach the maximum block size we finished |
69 | if (tot_block * 2 > max_block_size) |
70 | { |
71 | dg.threads.y = 1; |
72 | dg.threads.z = 1; |
73 | |
74 | dg.grids.x = 1; |
75 | for (; jx < x.size() ; jx++) |
76 | {dg.grids.x *= x[jx];} |
77 | |
78 | if (dim >= 2) |
79 | {dg.grids.y = sz[1];} |
80 | else |
81 | {dg.grids.y = 1;} |
82 | |
83 | dg.grids.z = 1; |
84 | for (size_t k = 2 ; k < dim ; k++) |
85 | // coverty[dead_error_line] |
86 | {dg.grids.z *= sz[k];} |
87 | |
88 | return; |
89 | } |
90 | |
91 | |
92 | // Get the factors for y |
93 | std::vector<unsigned long int> y; |
94 | size_t jy = 0; |
95 | dg.threads.y = 1; |
96 | |
97 | if (dim >= 2) |
98 | { |
99 | openfpm::math::getFactorization(sz[1],y); |
100 | |
101 | while(jy < y.size() && y[jy] == 2 && tot_block < max_block_size) |
102 | { |
103 | if (tot_block * 2 > max_block_size) |
104 | {break;} |
105 | |
106 | dg.threads.y *= 2; |
107 | tot_block *= 2; |
108 | |
109 | jy++; |
110 | } |
111 | |
112 | // if we already reach the maximum block size we finished |
113 | if (tot_block * 2 > max_block_size) |
114 | { |
115 | dg.threads.z = 1; |
116 | |
117 | dg.grids.x = 1; |
118 | for (; jx < x.size() ; jx++) |
119 | {dg.grids.x *= x[jx];} |
120 | |
121 | dg.grids.y = 1; |
122 | for (; jy < y.size() ; jy++) |
123 | {dg.grids.y *= y[jy];} |
124 | |
125 | dg.grids.z = 1; |
126 | for (size_t k = 2 ; k < dim ; k++) |
127 | {dg.grids.z *= sz[k];} |
128 | |
129 | return; |
130 | } |
131 | } |
132 | |
133 | // Get the factors for z |
134 | std::vector<unsigned long int> z; |
135 | |
136 | size_t jz = 0; |
137 | dg.threads.z = 1; |
138 | |
139 | if (dim >= 3) |
140 | { |
141 | openfpm::math::getFactorization(sz[2],z); |
142 | |
143 | while(jz < z.size() && z[jz] == 2 && tot_block < max_block_size) |
144 | { |
145 | if (tot_block * 2 > max_block_size) |
146 | {break;} |
147 | |
148 | dg.threads.z *= 2; |
149 | tot_block *= 2; |
150 | |
151 | jz++; |
152 | } |
153 | |
154 | // if we already reach the maximum block size we finished |
155 | if (tot_block * 2 > max_block_size) |
156 | { |
157 | dg.grids.x = 1; |
158 | for (; jx < x.size() ; jx++) |
159 | {dg.grids.x *= x[jx];} |
160 | |
161 | dg.grids.y = 1; |
162 | for (; jy < y.size() ; jy++) |
163 | {dg.grids.y *= y[jy];} |
164 | |
165 | dg.grids.z = 1; |
166 | for (; jz < z.size() ; jz++) |
167 | {dg.grids.z *= z[jz];} |
168 | |
169 | for (size_t k = 3 ; k < dim ; k++) |
170 | // coverty[dead_error_line] |
171 | {dg.grids.z *= sz[k];} |
172 | |
173 | return; |
174 | } |
175 | } |
176 | |
177 | if (tot_block >= min_block_size) |
178 | {return;} |
179 | |
180 | // Calculate the grids from the threads configuration |
181 | |
182 | dg.grids.x = 1; |
183 | for (size_t k = jx ; k < x.size() ; k++) |
184 | {dg.grids.x *= x[k];} |
185 | |
186 | dg.grids.y = 1; |
187 | for (size_t k = jy ; k < y.size() ; k++) |
188 | {dg.grids.y *= y[k];} |
189 | |
190 | dg.grids.z = 1; |
191 | for (size_t k = jz ; k < z.size() ; k++) |
192 | {dg.grids.z *= z[k];} |
193 | |
194 | std::vector<unsigned long int> * ptr_xyz[3]; |
195 | ptr_xyz[0] = &x; |
196 | ptr_xyz[1] = &y; |
197 | ptr_xyz[2] = &z; |
198 | |
199 | size_t * jj[3]; |
200 | jj[0] = &jx; |
201 | jj[1] = &jy; |
202 | jj[2] = &jz; |
203 | |
204 | while (tot_block < min_block_size && (jx < x.size() || jy < y.size() || jz < z.size() ) ) |
205 | { |
206 | size_t best_fact = std::numeric_limits<size_t>::max(); |
207 | size_t k_best = 0; |
208 | |
209 | for (size_t k = 0 ; k < dim ; k++) |
210 | { |
211 | if (*jj[k] < ptr_xyz[k]->size() && ptr_xyz[k]->operator[](*jj[k]) < best_fact ) |
212 | { |
213 | best_fact = ptr_xyz[k]->operator[](*jj[k]); |
214 | k_best = k; |
215 | } |
216 | } |
217 | |
218 | // The maximum block size cannot be passed |
219 | if (tot_block * best_fact > max_block_size) |
220 | {break;} |
221 | |
222 | if (k_best == 0) |
223 | { |
224 | dg.threads.x *= best_fact; |
225 | dg.grids.x /= best_fact; |
226 | } |
227 | else if (k_best == 1) |
228 | { |
229 | dg.threads.y *= best_fact; |
230 | dg.grids.y /= best_fact; |
231 | } |
232 | /* coverty[dead_error_line] */ |
233 | else if (k_best == 2) |
234 | { |
235 | dg.threads.z *= best_fact; |
236 | dg.grids.z /= best_fact; |
237 | } |
238 | |
239 | tot_block *= best_fact; |
240 | |
241 | (*jj[k_best])++; |
242 | } |
243 | } |
244 | |
245 | |
246 | #endif /* OPENFPM_DATA_SRC_UTIL_COMPUTE_OPTIMAL_DEVICE_GRID_HPP_ */ |
247 | |