| 1 | // Optimizations for random number functions, x86 version -*- C++ -*- | 
| 2 |  | 
| 3 | // Copyright (C) 2012-2019 Free Software Foundation, Inc. | 
| 4 | // | 
| 5 | // This file is part of the GNU ISO C++ Library.  This library is free | 
| 6 | // software; you can redistribute it and/or modify it under the | 
| 7 | // terms of the GNU General Public License as published by the | 
| 8 | // Free Software Foundation; either version 3, or (at your option) | 
| 9 | // any later version. | 
| 10 |  | 
| 11 | // This library is distributed in the hope that it will be useful, | 
| 12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of | 
| 13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | 
| 14 | // GNU General Public License for more details. | 
| 15 |  | 
| 16 | // Under Section 7 of GPL version 3, you are granted additional | 
| 17 | // permissions described in the GCC Runtime Library Exception, version | 
| 18 | // 3.1, as published by the Free Software Foundation. | 
| 19 |  | 
| 20 | // You should have received a copy of the GNU General Public License and | 
| 21 | // a copy of the GCC Runtime Library Exception along with this program; | 
| 22 | // see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see | 
| 23 | // <http://www.gnu.org/licenses/>. | 
| 24 |  | 
| 25 | /** @file bits/opt_random.h | 
| 26 |  *  This is an internal header file, included by other library headers. | 
| 27 |  *  Do not attempt to use it directly. @headername{random} | 
| 28 |  */ | 
| 29 |  | 
| 30 | #ifndef _BITS_OPT_RANDOM_H | 
| 31 | #define _BITS_OPT_RANDOM_H 1 | 
| 32 |  | 
| 33 | #ifdef __SSE3__ | 
| 34 | #include <pmmintrin.h> | 
| 35 | #endif | 
| 36 |  | 
| 37 |  | 
| 38 | #pragma GCC system_header | 
| 39 |  | 
| 40 |  | 
| 41 | namespace std _GLIBCXX_VISIBILITY(default) | 
| 42 | { | 
| 43 | _GLIBCXX_BEGIN_NAMESPACE_VERSION | 
| 44 |  | 
| 45 | #ifdef __SSE3__ | 
| 46 |   template<> | 
| 47 |     template<typename _UniformRandomNumberGenerator> | 
| 48 |       void | 
| 49 |       normal_distribution<double>:: | 
| 50 |       __generate(typename normal_distribution<double>::result_type* __f, | 
| 51 | 		 typename normal_distribution<double>::result_type* __t, | 
| 52 | 		 _UniformRandomNumberGenerator& __urng, | 
| 53 | 		 const param_type& __param) | 
| 54 |       { | 
| 55 | 	typedef uint64_t __uctype; | 
| 56 |  | 
| 57 | 	if (__f == __t) | 
| 58 | 	  return; | 
| 59 |  | 
| 60 | 	if (_M_saved_available) | 
| 61 | 	  { | 
| 62 | 	    _M_saved_available = false; | 
| 63 | 	    *__f++ = _M_saved * __param.stddev() + __param.mean(); | 
| 64 |  | 
| 65 | 	    if (__f == __t) | 
| 66 | 	      return; | 
| 67 | 	  } | 
| 68 |  | 
| 69 | 	constexpr uint64_t __maskval = 0xfffffffffffffull; | 
| 70 | 	static const __m128i __mask = _mm_set1_epi64x(__maskval); | 
| 71 | 	static const __m128i __two = _mm_set1_epi64x(0x4000000000000000ull); | 
| 72 | 	static const __m128d __three = _mm_set1_pd(3.0); | 
| 73 | 	const __m128d __av = _mm_set1_pd(__param.mean()); | 
| 74 |  | 
| 75 | 	const __uctype __urngmin = __urng.min(); | 
| 76 | 	const __uctype __urngmax = __urng.max(); | 
| 77 | 	const __uctype __urngrange = __urngmax - __urngmin; | 
| 78 | 	const __uctype __uerngrange = __urngrange + 1; | 
| 79 |  | 
| 80 | 	while (__f + 1 < __t) | 
| 81 | 	  { | 
| 82 | 	    double __le; | 
| 83 | 	    __m128d __x; | 
| 84 | 	    do | 
| 85 | 	      { | 
| 86 |                 union | 
| 87 |                 { | 
| 88 |                   __m128i __i; | 
| 89 |                   __m128d __d; | 
| 90 | 		} __v; | 
| 91 |  | 
| 92 | 		if (__urngrange > __maskval) | 
| 93 | 		  { | 
| 94 | 		    if (__detail::_Power_of_2(__uerngrange)) | 
| 95 | 		      __v.__i = _mm_and_si128(_mm_set_epi64x(__urng(), | 
| 96 | 							     __urng()), | 
| 97 | 					      __mask); | 
| 98 | 		    else | 
| 99 | 		      { | 
| 100 | 			const __uctype __uerange = __maskval + 1; | 
| 101 | 			const __uctype __scaling = __urngrange / __uerange; | 
| 102 | 			const __uctype __past = __uerange * __scaling; | 
| 103 | 			uint64_t __v1; | 
| 104 | 			do | 
| 105 | 			  __v1 = __uctype(__urng()) - __urngmin; | 
| 106 | 			while (__v1 >= __past); | 
| 107 | 			__v1 /= __scaling; | 
| 108 | 			uint64_t __v2; | 
| 109 | 			do | 
| 110 | 			  __v2 = __uctype(__urng()) - __urngmin; | 
| 111 | 			while (__v2 >= __past); | 
| 112 | 			__v2 /= __scaling; | 
| 113 |  | 
| 114 | 			__v.__i = _mm_set_epi64x(__v1, __v2); | 
| 115 | 		      } | 
| 116 | 		  } | 
| 117 | 		else if (__urngrange == __maskval) | 
| 118 | 		  __v.__i = _mm_set_epi64x(__urng(), __urng()); | 
| 119 | 		else if ((__urngrange + 2) * __urngrange >= __maskval | 
| 120 | 			 && __detail::_Power_of_2(__uerngrange)) | 
| 121 | 		  { | 
| 122 | 		    uint64_t __v1 = __urng() * __uerngrange + __urng(); | 
| 123 | 		    uint64_t __v2 = __urng() * __uerngrange + __urng(); | 
| 124 |  | 
| 125 | 		    __v.__i = _mm_and_si128(_mm_set_epi64x(__v1, __v2), | 
| 126 | 					    __mask); | 
| 127 | 		  } | 
| 128 | 		else | 
| 129 | 		  { | 
| 130 | 		    size_t __nrng = 2; | 
| 131 | 		    __uctype __high = __maskval / __uerngrange / __uerngrange; | 
| 132 | 		    while (__high > __uerngrange) | 
| 133 | 		      { | 
| 134 | 			++__nrng; | 
| 135 | 			__high /= __uerngrange; | 
| 136 | 		      } | 
| 137 | 		    const __uctype __highrange = __high + 1; | 
| 138 | 		    const __uctype __scaling = __urngrange / __highrange; | 
| 139 | 		    const __uctype __past = __highrange * __scaling; | 
| 140 | 		    __uctype __tmp; | 
| 141 |  | 
| 142 | 		    uint64_t __v1; | 
| 143 | 		    do | 
| 144 | 		      { | 
| 145 | 			do | 
| 146 | 			  __tmp = __uctype(__urng()) - __urngmin; | 
| 147 | 			while (__tmp >= __past); | 
| 148 | 			__v1 = __tmp / __scaling; | 
| 149 | 			for (size_t __cnt = 0; __cnt < __nrng; ++__cnt) | 
| 150 | 			  { | 
| 151 | 			    __tmp = __v1; | 
| 152 | 			    __v1 *= __uerngrange; | 
| 153 | 			    __v1 += __uctype(__urng()) - __urngmin; | 
| 154 | 			  } | 
| 155 | 		      } | 
| 156 | 		    while (__v1 > __maskval || __v1 < __tmp); | 
| 157 |  | 
| 158 | 		    uint64_t __v2; | 
| 159 | 		    do | 
| 160 | 		      { | 
| 161 | 			do | 
| 162 | 			  __tmp = __uctype(__urng()) - __urngmin; | 
| 163 | 			while (__tmp >= __past); | 
| 164 | 			__v2 = __tmp / __scaling; | 
| 165 | 			for (size_t __cnt = 0; __cnt < __nrng; ++__cnt) | 
| 166 | 			  { | 
| 167 | 			    __tmp = __v2; | 
| 168 | 			    __v2 *= __uerngrange; | 
| 169 | 			    __v2 += __uctype(__urng()) - __urngmin; | 
| 170 | 			  } | 
| 171 | 		      } | 
| 172 | 		    while (__v2 > __maskval || __v2 < __tmp); | 
| 173 |  | 
| 174 | 		    __v.__i = _mm_set_epi64x(__v1, __v2); | 
| 175 | 		  } | 
| 176 |  | 
| 177 | 		__v.__i = _mm_or_si128(__v.__i, __two); | 
| 178 | 		__x = _mm_sub_pd(__v.__d, __three); | 
| 179 | 		__m128d __m = _mm_mul_pd(__x, __x); | 
| 180 | 		__le = _mm_cvtsd_f64(_mm_hadd_pd (__m, __m)); | 
| 181 |               } | 
| 182 |             while (__le == 0.0 || __le >= 1.0); | 
| 183 |  | 
| 184 |             double __mult = (std::sqrt(-2.0 * std::log(__le) / __le) | 
| 185 |                              * __param.stddev()); | 
| 186 |  | 
| 187 |             __x = _mm_add_pd(_mm_mul_pd(__x, _mm_set1_pd(__mult)), __av); | 
| 188 |  | 
| 189 |             _mm_storeu_pd(__f, __x); | 
| 190 |             __f += 2; | 
| 191 |           } | 
| 192 |  | 
| 193 |         if (__f != __t) | 
| 194 |           { | 
| 195 |             result_type __x, __y, __r2; | 
| 196 |  | 
| 197 |             __detail::_Adaptor<_UniformRandomNumberGenerator, result_type> | 
| 198 |               __aurng(__urng); | 
| 199 |  | 
| 200 |             do | 
| 201 |               { | 
| 202 |                 __x = result_type(2.0) * __aurng() - 1.0; | 
| 203 |                 __y = result_type(2.0) * __aurng() - 1.0; | 
| 204 |                 __r2 = __x * __x + __y * __y; | 
| 205 |               } | 
| 206 |             while (__r2 > 1.0 || __r2 == 0.0); | 
| 207 |  | 
| 208 |             const result_type __mult = std::sqrt(-2 * std::log(__r2) / __r2); | 
| 209 |             _M_saved = __x * __mult; | 
| 210 |             _M_saved_available = true; | 
| 211 |             *__f = __y * __mult * __param.stddev() + __param.mean(); | 
| 212 |           } | 
| 213 |       } | 
| 214 | #endif | 
| 215 |  | 
| 216 |  | 
| 217 | _GLIBCXX_END_NAMESPACE_VERSION | 
| 218 | } // namespace | 
| 219 |  | 
| 220 |  | 
| 221 | #endif // _BITS_OPT_RANDOM_H | 
| 222 |  |