compute_optimal_device_grid.hpp source code [openfpm/openfpm_data/src/util/compute_optimal_device_grid.hpp]

1	/*
2	* compute_optimal_device_grid.hpp
3	*
4	* Created on: Oct 1, 2017
5	* Author: i-bird
6	*/
7
8	#ifndef OPENFPM_DATA_SRC_UTIL_COMPUTE_OPTIMAL_DEVICE_GRID_HPP_
9	#define OPENFPM_DATA_SRC_UTIL_COMPUTE_OPTIMAL_DEVICE_GRID_HPP_
10
11
12	/! \brief Calculate an optimal decomposition in grids and threads for GPU or parallel accelerators*
13	*
14	* First it factorize the grid size in each direction in prime factors. It try to get
15	* a multiple of two for the block size that does not pass the maximum block size.
16	* If the block size result too small (smaller than min_block_size) it try to construct a block
17	* non multiple of two but bigger than min_block_size and smaller than max_block_size.
18	* The min_block_size is constrain weakly imposed. When impossible (or difficult) to find
19	* a block size in the range the min_block_size constrain can be broken, but the max_block_size
20	* is never broken
21	*
22	* the function guarantee that
23	*
24	* sz[0] = dg.threads.x*dg.grids.x
25	* sz[1] = dg.threads.y*dg.grids.y
26	* sz[2]...sz[dim-1] = dg.threads.zdg.grids.z
27	*
28	*
29	*
30	* \param dg device grid
31	* \param sz size of the grid
32	* \param max_block_size maximum block size
33	* \param min_block_size minimum block size
34	*
35	*/
36	template<unsigned int dim>
37	void calculate_optimal_device_grid(device_grid<dim> & dg,
38	size_t (& sz)[dim],
39	size_t max_block_size,
40	size_t min_block_size)
41	{
42
43	if (dim == `0`) {return ;}
44
45	size_t tot_block = `1`;
46
47	// Here we calculate the factors for each grid dimension and prioritize the
48	// factors by 2 for the blocks
49
50	// Get the factors for x
51	std::vector<unsigned long int> x;
52	openfpm::math::getFactorization(sz[`0`],x);
53
54	dg.threads.x = `1`;
55	size_t jx = `0`;
56
57	while(jx < x.size() && x [jx] == `2` && tot_block < max_block_size)
58	{
59	if (tot_block * `2` > max_block_size)
60	{break;}
61
62	dg.threads.x *= `2`;
63	tot_block *= `2`;
64
65	jx++;
66	}
67
68	// if we already reach the maximum block size we finished
69	if (tot_block * `2` > max_block_size)
70	{
71	dg.threads.y = `1`;
72	dg.threads.z = `1`;
73
74	dg.grids.x = `1`;
75	for (; jx < x.size() ; jx++)
76	{dg.grids.x *= x [jx];}
77
78	if (dim >= `2`)
79	{dg.grids.y = sz[`1`];}
80	else
81	{dg.grids.y = `1`;}
82
83	dg.grids.z = `1`;
84	for (size_t k = `2` ; k < dim ; k++)
85	// coverty[dead_error_line]
86	{dg.grids.z *= sz[k];}
87
88	return;
89	}
90
91
92	// Get the factors for y
93	std::vector<unsigned long int> y;
94	size_t jy = `0`;
95	dg.threads.y = `1`;
96
97	if (dim >= `2`)
98	{
99	openfpm::math::getFactorization(sz[`1`],y);
100
101	while(jy < y.size() && y [jy] == `2` && tot_block < max_block_size)
102	{
103	if (tot_block * `2` > max_block_size)
104	{break;}
105
106	dg.threads.y *= `2`;
107	tot_block *= `2`;
108
109	jy++;
110	}
111
112	// if we already reach the maximum block size we finished
113	if (tot_block * `2` > max_block_size)
114	{
115	dg.threads.z = `1`;
116
117	dg.grids.x = `1`;
118	for (; jx < x.size() ; jx++)
119	{dg.grids.x *= x [jx];}
120
121	dg.grids.y = `1`;
122	for (; jy < y.size() ; jy++)
123	{dg.grids.y *= y [jy];}
124
125	dg.grids.z = `1`;
126	for (size_t k = `2` ; k < dim ; k++)
127	{dg.grids.z *= sz[k];}
128
129	return;
130	}
131	}
132
133	// Get the factors for z
134	std::vector<unsigned long int> z;
135
136	size_t jz = `0`;
137	dg.threads.z = `1`;
138
139	if (dim >= `3`)
140	{
141	openfpm::math::getFactorization(sz[`2`],z);
142
143	while(jz < z.size() && z [jz] == `2` && tot_block < max_block_size)
144	{
145	if (tot_block * `2` > max_block_size)
146	{break;}
147
148	dg.threads.z *= `2`;
149	tot_block *= `2`;
150
151	jz++;
152	}
153
154	// if we already reach the maximum block size we finished
155	if (tot_block * `2` > max_block_size)
156	{
157	dg.grids.x = `1`;
158	for (; jx < x.size() ; jx++)
159	{dg.grids.x *= x [jx];}
160
161	dg.grids.y = `1`;
162	for (; jy < y.size() ; jy++)
163	{dg.grids.y *= y [jy];}
164
165	dg.grids.z = `1`;
166	for (; jz < z.size() ; jz++)
167	{dg.grids.z *= z [jz];}
168
169	for (size_t k = `3` ; k < dim ; k++)
170	// coverty[dead_error_line]
171	{dg.grids.z *= sz[k];}
172
173	return;
174	}
175	}
176
177	if (tot_block >= min_block_size)
178	{return;}
179
180	// Calculate the grids from the threads configuration
181
182	dg.grids.x = `1`;
183	for (size_t k = jx ; k < x.size() ; k++)
184	{dg.grids.x *= x [k];}
185
186	dg.grids.y = `1`;
187	for (size_t k = jy ; k < y.size() ; k++)
188	{dg.grids.y *= y [k];}
189
190	dg.grids.z = `1`;
191	for (size_t k = jz ; k < z.size() ; k++)
192	{dg.grids.z *= z [k];}
193
194	std::vector<unsigned long int> * ptr_xyz[`3`];
195	ptr_xyz[`0`] = &x;
196	ptr_xyz[`1`] = &y;
197	ptr_xyz[`2`] = &z;
198
199	size_t * jj[`3`];
200	jj[`0`] = &jx;
201	jj[`1`] = &jy;
202	jj[`2`] = &jz;
203
204	while (tot_block < min_block_size && (jx < x.size() \|\| jy < y.size() \|\| jz < z.size() ) )
205	{
206	size_t best_fact = std::numeric_limits<size_t>::max();
207	size_t k_best = `0`;
208
209	for (size_t k = `0` ; k < dim ; k++)
210	{
211	if (jj[k] < ptr_xyz[k]->size() && ptr_xyz[k]->operator[](jj[k]) < best_fact )
212	{
213	best_fact = ptr_xyz[k]->operator[](*jj[k]);
214	k_best = k;
215	}
216	}
217
218	// The maximum block size cannot be passed
219	if (tot_block * best_fact > max_block_size)
220	{break;}
221
222	if (k_best == `0`)
223	{
224	dg.threads.x *= best_fact;
225	dg.grids.x /= best_fact;
226	}
227	else if (k_best == `1`)
228	{
229	dg.threads.y *= best_fact;
230	dg.grids.y /= best_fact;
231	}
232	/ coverty[dead_error_line] /
233	else if (k_best == `2`)
234	{
235	dg.threads.z *= best_fact;
236	dg.grids.z /= best_fact;
237	}
238
239	tot_block *= best_fact;
240
241	(*jj[k_best])++;
242	}
243	}
244
245
246	#endif /* OPENFPM_DATA_SRC_UTIL_COMPUTE_OPTIMAL_DEVICE_GRID_HPP_ */
247

Browse the source code of openfpm/openfpm_data/src/util/compute_optimal_device_grid.hpp