| 1 | /* |
| 2 | * compute_optimal_device_grid.hpp |
| 3 | * |
| 4 | * Created on: Oct 1, 2017 |
| 5 | * Author: i-bird |
| 6 | */ |
| 7 | |
| 8 | #ifndef OPENFPM_DATA_SRC_UTIL_COMPUTE_OPTIMAL_DEVICE_GRID_HPP_ |
| 9 | #define OPENFPM_DATA_SRC_UTIL_COMPUTE_OPTIMAL_DEVICE_GRID_HPP_ |
| 10 | |
| 11 | |
| 12 | /*! \brief Calculate an optimal decomposition in grids and threads for GPU or parallel accelerators |
| 13 | * |
| 14 | * First it factorize the grid size in each direction in prime factors. It try to get |
| 15 | * a multiple of two for the block size that does not pass the maximum block size. |
| 16 | * If the block size result too small (smaller than min_block_size) it try to construct a block |
| 17 | * non multiple of two but bigger than min_block_size and smaller than max_block_size. |
| 18 | * The min_block_size is constrain weakly imposed. When impossible (or difficult) to find |
| 19 | * a block size in the range the min_block_size constrain can be broken, but the max_block_size |
| 20 | * is never broken |
| 21 | * |
| 22 | * the function guarantee that |
| 23 | * |
| 24 | * sz[0] = dg.threads.x*dg.grids.x |
| 25 | * sz[1] = dg.threads.y*dg.grids.y |
| 26 | * sz[2]*...sz[dim-1] = dg.threads.z*dg.grids.z |
| 27 | * |
| 28 | * |
| 29 | * |
| 30 | * \param dg device grid |
| 31 | * \param sz size of the grid |
| 32 | * \param max_block_size maximum block size |
| 33 | * \param min_block_size minimum block size |
| 34 | * |
| 35 | */ |
| 36 | template<unsigned int dim> |
| 37 | void calculate_optimal_device_grid(device_grid<dim> & dg, |
| 38 | size_t (& sz)[dim], |
| 39 | size_t max_block_size, |
| 40 | size_t min_block_size) |
| 41 | { |
| 42 | |
| 43 | if (dim == 0) {return ;} |
| 44 | |
| 45 | size_t tot_block = 1; |
| 46 | |
| 47 | // Here we calculate the factors for each grid dimension and prioritize the |
| 48 | // factors by 2 for the blocks |
| 49 | |
| 50 | // Get the factors for x |
| 51 | std::vector<unsigned long int> x; |
| 52 | openfpm::math::getFactorization(sz[0],x); |
| 53 | |
| 54 | dg.threads.x = 1; |
| 55 | size_t jx = 0; |
| 56 | |
| 57 | while(jx < x.size() && x[jx] == 2 && tot_block < max_block_size) |
| 58 | { |
| 59 | if (tot_block * 2 > max_block_size) |
| 60 | {break;} |
| 61 | |
| 62 | dg.threads.x *= 2; |
| 63 | tot_block *= 2; |
| 64 | |
| 65 | jx++; |
| 66 | } |
| 67 | |
| 68 | // if we already reach the maximum block size we finished |
| 69 | if (tot_block * 2 > max_block_size) |
| 70 | { |
| 71 | dg.threads.y = 1; |
| 72 | dg.threads.z = 1; |
| 73 | |
| 74 | dg.grids.x = 1; |
| 75 | for (; jx < x.size() ; jx++) |
| 76 | {dg.grids.x *= x[jx];} |
| 77 | |
| 78 | if (dim >= 2) |
| 79 | {dg.grids.y = sz[1];} |
| 80 | else |
| 81 | {dg.grids.y = 1;} |
| 82 | |
| 83 | dg.grids.z = 1; |
| 84 | for (size_t k = 2 ; k < dim ; k++) |
| 85 | // coverty[dead_error_line] |
| 86 | {dg.grids.z *= sz[k];} |
| 87 | |
| 88 | return; |
| 89 | } |
| 90 | |
| 91 | |
| 92 | // Get the factors for y |
| 93 | std::vector<unsigned long int> y; |
| 94 | size_t jy = 0; |
| 95 | dg.threads.y = 1; |
| 96 | |
| 97 | if (dim >= 2) |
| 98 | { |
| 99 | openfpm::math::getFactorization(sz[1],y); |
| 100 | |
| 101 | while(jy < y.size() && y[jy] == 2 && tot_block < max_block_size) |
| 102 | { |
| 103 | if (tot_block * 2 > max_block_size) |
| 104 | {break;} |
| 105 | |
| 106 | dg.threads.y *= 2; |
| 107 | tot_block *= 2; |
| 108 | |
| 109 | jy++; |
| 110 | } |
| 111 | |
| 112 | // if we already reach the maximum block size we finished |
| 113 | if (tot_block * 2 > max_block_size) |
| 114 | { |
| 115 | dg.threads.z = 1; |
| 116 | |
| 117 | dg.grids.x = 1; |
| 118 | for (; jx < x.size() ; jx++) |
| 119 | {dg.grids.x *= x[jx];} |
| 120 | |
| 121 | dg.grids.y = 1; |
| 122 | for (; jy < y.size() ; jy++) |
| 123 | {dg.grids.y *= y[jy];} |
| 124 | |
| 125 | dg.grids.z = 1; |
| 126 | for (size_t k = 2 ; k < dim ; k++) |
| 127 | {dg.grids.z *= sz[k];} |
| 128 | |
| 129 | return; |
| 130 | } |
| 131 | } |
| 132 | |
| 133 | // Get the factors for z |
| 134 | std::vector<unsigned long int> z; |
| 135 | |
| 136 | size_t jz = 0; |
| 137 | dg.threads.z = 1; |
| 138 | |
| 139 | if (dim >= 3) |
| 140 | { |
| 141 | openfpm::math::getFactorization(sz[2],z); |
| 142 | |
| 143 | while(jz < z.size() && z[jz] == 2 && tot_block < max_block_size) |
| 144 | { |
| 145 | if (tot_block * 2 > max_block_size) |
| 146 | {break;} |
| 147 | |
| 148 | dg.threads.z *= 2; |
| 149 | tot_block *= 2; |
| 150 | |
| 151 | jz++; |
| 152 | } |
| 153 | |
| 154 | // if we already reach the maximum block size we finished |
| 155 | if (tot_block * 2 > max_block_size) |
| 156 | { |
| 157 | dg.grids.x = 1; |
| 158 | for (; jx < x.size() ; jx++) |
| 159 | {dg.grids.x *= x[jx];} |
| 160 | |
| 161 | dg.grids.y = 1; |
| 162 | for (; jy < y.size() ; jy++) |
| 163 | {dg.grids.y *= y[jy];} |
| 164 | |
| 165 | dg.grids.z = 1; |
| 166 | for (; jz < z.size() ; jz++) |
| 167 | {dg.grids.z *= z[jz];} |
| 168 | |
| 169 | for (size_t k = 3 ; k < dim ; k++) |
| 170 | // coverty[dead_error_line] |
| 171 | {dg.grids.z *= sz[k];} |
| 172 | |
| 173 | return; |
| 174 | } |
| 175 | } |
| 176 | |
| 177 | if (tot_block >= min_block_size) |
| 178 | {return;} |
| 179 | |
| 180 | // Calculate the grids from the threads configuration |
| 181 | |
| 182 | dg.grids.x = 1; |
| 183 | for (size_t k = jx ; k < x.size() ; k++) |
| 184 | {dg.grids.x *= x[k];} |
| 185 | |
| 186 | dg.grids.y = 1; |
| 187 | for (size_t k = jy ; k < y.size() ; k++) |
| 188 | {dg.grids.y *= y[k];} |
| 189 | |
| 190 | dg.grids.z = 1; |
| 191 | for (size_t k = jz ; k < z.size() ; k++) |
| 192 | {dg.grids.z *= z[k];} |
| 193 | |
| 194 | std::vector<unsigned long int> * ptr_xyz[3]; |
| 195 | ptr_xyz[0] = &x; |
| 196 | ptr_xyz[1] = &y; |
| 197 | ptr_xyz[2] = &z; |
| 198 | |
| 199 | size_t * jj[3]; |
| 200 | jj[0] = &jx; |
| 201 | jj[1] = &jy; |
| 202 | jj[2] = &jz; |
| 203 | |
| 204 | while (tot_block < min_block_size && (jx < x.size() || jy < y.size() || jz < z.size() ) ) |
| 205 | { |
| 206 | size_t best_fact = std::numeric_limits<size_t>::max(); |
| 207 | size_t k_best = 0; |
| 208 | |
| 209 | for (size_t k = 0 ; k < dim ; k++) |
| 210 | { |
| 211 | if (*jj[k] < ptr_xyz[k]->size() && ptr_xyz[k]->operator[](*jj[k]) < best_fact ) |
| 212 | { |
| 213 | best_fact = ptr_xyz[k]->operator[](*jj[k]); |
| 214 | k_best = k; |
| 215 | } |
| 216 | } |
| 217 | |
| 218 | // The maximum block size cannot be passed |
| 219 | if (tot_block * best_fact > max_block_size) |
| 220 | {break;} |
| 221 | |
| 222 | if (k_best == 0) |
| 223 | { |
| 224 | dg.threads.x *= best_fact; |
| 225 | dg.grids.x /= best_fact; |
| 226 | } |
| 227 | else if (k_best == 1) |
| 228 | { |
| 229 | dg.threads.y *= best_fact; |
| 230 | dg.grids.y /= best_fact; |
| 231 | } |
| 232 | /* coverty[dead_error_line] */ |
| 233 | else if (k_best == 2) |
| 234 | { |
| 235 | dg.threads.z *= best_fact; |
| 236 | dg.grids.z /= best_fact; |
| 237 | } |
| 238 | |
| 239 | tot_block *= best_fact; |
| 240 | |
| 241 | (*jj[k_best])++; |
| 242 | } |
| 243 | } |
| 244 | |
| 245 | |
| 246 | #endif /* OPENFPM_DATA_SRC_UTIL_COMPUTE_OPTIMAL_DEVICE_GRID_HPP_ */ |
| 247 | |