TensorRandom.h (12385B)
1 // This file is part of Eigen, a lightweight C++ template library 2 // for linear algebra. 3 // 4 // Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com> 5 // Copyright (C) 2018 Mehdi Goli <eigen@codeplay.com> Codeplay Software Ltd. 6 // 7 // This Source Code Form is subject to the terms of the Mozilla 8 // Public License v. 2.0. If a copy of the MPL was not distributed 9 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 10 11 #ifndef EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H 12 #define EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H 13 14 namespace Eigen { 15 namespace internal { 16 17 namespace { 18 19 EIGEN_DEVICE_FUNC uint64_t get_random_seed() { 20 #if defined(EIGEN_GPU_COMPILE_PHASE) 21 // We don't support 3d kernels since we currently only use 1 and 22 // 2d kernels. 23 gpu_assert(threadIdx.z == 0); 24 return blockIdx.x * blockDim.x + threadIdx.x 25 + gridDim.x * blockDim.x * (blockIdx.y * blockDim.y + threadIdx.y); 26 #else 27 // Rely on Eigen's random implementation. 28 return random<uint64_t>(); 29 #endif 30 } 31 32 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE unsigned PCG_XSH_RS_generator(uint64_t* state, uint64_t stream) { 33 // TODO: Unify with the implementation in the non blocking thread pool. 34 uint64_t current = *state; 35 // Update the internal state 36 *state = current * 6364136223846793005ULL + (stream << 1 | 1); 37 // Generate the random output (using the PCG-XSH-RS scheme) 38 return static_cast<unsigned>((current ^ (current >> 22)) >> (22 + (current >> 61))); 39 } 40 41 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE uint64_t PCG_XSH_RS_state(uint64_t seed) { 42 seed = seed ? seed : get_random_seed(); 43 return seed * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL; 44 } 45 46 } // namespace 47 48 49 template <typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE 50 T RandomToTypeUniform(uint64_t* state, uint64_t stream) { 51 unsigned rnd = PCG_XSH_RS_generator(state, stream); 52 return static_cast<T>(rnd); 53 } 54 55 56 template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE 57 Eigen::half RandomToTypeUniform<Eigen::half>(uint64_t* state, uint64_t stream) { 58 // Generate 10 random bits for the mantissa, merge with exponent. 59 unsigned rnd = PCG_XSH_RS_generator(state, stream); 60 const uint16_t half_bits = static_cast<uint16_t>(rnd & 0x3ffu) | (static_cast<uint16_t>(15) << 10); 61 Eigen::half result = Eigen::numext::bit_cast<Eigen::half>(half_bits); 62 // Return the final result 63 return result - Eigen::half(1.0f); 64 } 65 66 template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE 67 Eigen::bfloat16 RandomToTypeUniform<Eigen::bfloat16>(uint64_t* state, uint64_t stream) { 68 69 // Generate 7 random bits for the mantissa, merge with exponent. 70 unsigned rnd = PCG_XSH_RS_generator(state, stream); 71 const uint16_t half_bits = static_cast<uint16_t>(rnd & 0x7fu) | (static_cast<uint16_t>(127) << 7); 72 Eigen::bfloat16 result = Eigen::numext::bit_cast<Eigen::bfloat16>(half_bits); 73 // Return the final result 74 return result - Eigen::bfloat16(1.0f); 75 } 76 77 template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE 78 float RandomToTypeUniform<float>(uint64_t* state, uint64_t stream) { 79 typedef union { 80 uint32_t raw; 81 float fp; 82 } internal; 83 internal result; 84 // Generate 23 random bits for the mantissa mantissa 85 const unsigned rnd = PCG_XSH_RS_generator(state, stream); 86 result.raw = rnd & 0x7fffffu; 87 // Set the exponent 88 result.raw |= (static_cast<uint32_t>(127) << 23); 89 // Return the final result 90 return result.fp - 1.0f; 91 } 92 93 template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE 94 double RandomToTypeUniform<double>(uint64_t* state, uint64_t stream) { 95 typedef union { 96 uint64_t raw; 97 double dp; 98 } internal; 99 internal result; 100 result.raw = 0; 101 // Generate 52 random bits for the mantissa 102 // First generate the upper 20 bits 103 unsigned rnd1 = PCG_XSH_RS_generator(state, stream) & 0xfffffu; 104 // The generate the lower 32 bits 105 unsigned rnd2 = PCG_XSH_RS_generator(state, stream); 106 result.raw = (static_cast<uint64_t>(rnd1) << 32) | rnd2; 107 // Set the exponent 108 result.raw |= (static_cast<uint64_t>(1023) << 52); 109 // Return the final result 110 return result.dp - 1.0; 111 } 112 113 template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE 114 std::complex<float> RandomToTypeUniform<std::complex<float> >(uint64_t* state, uint64_t stream) { 115 return std::complex<float>(RandomToTypeUniform<float>(state, stream), 116 RandomToTypeUniform<float>(state, stream)); 117 } 118 template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE 119 std::complex<double> RandomToTypeUniform<std::complex<double> >(uint64_t* state, uint64_t stream) { 120 return std::complex<double>(RandomToTypeUniform<double>(state, stream), 121 RandomToTypeUniform<double>(state, stream)); 122 } 123 124 template <typename T> class UniformRandomGenerator { 125 public: 126 static const bool PacketAccess = true; 127 128 // Uses the given "seed" if non-zero, otherwise uses a random seed. 129 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator( 130 uint64_t seed = 0) { 131 m_state = PCG_XSH_RS_state(seed); 132 #ifdef EIGEN_USE_SYCL 133 // In SYCL it is not possible to build PCG_XSH_RS_state in one step. 134 // Therefor, we need two step to initializate the m_state. 135 // IN SYCL, the constructor of the functor is s called on the CPU 136 // and we get the clock seed here from the CPU. However, This seed is 137 //the same for all the thread. As unlike CUDA, the thread.ID, BlockID, etc is not a global function. 138 // and only available on the Operator() function (which is called on the GPU). 139 // Thus for CUDA (((CLOCK + global_thread_id)* 6364136223846793005ULL) + 0xda3e39cb94b95bdbULL) is passed to each thread 140 // but for SYCL ((CLOCK * 6364136223846793005ULL) + 0xda3e39cb94b95bdbULL) is passed to each thread and each thread adds 141 // the (global_thread_id* 6364136223846793005ULL) for itself only once, in order to complete the construction 142 // similar to CUDA Therefore, the thread Id injection is not available at this stage. 143 //However when the operator() is called the thread ID will be avilable. So inside the opeator, 144 // we add the thrreadID, BlockId,... (which is equivalent of i) 145 //to the seed and construct the unique m_state per thead similar to cuda. 146 m_exec_once =false; 147 #endif 148 } 149 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator( 150 const UniformRandomGenerator& other) { 151 m_state = other.m_state; 152 #ifdef EIGEN_USE_SYCL 153 m_exec_once =other.m_exec_once; 154 #endif 155 } 156 157 template<typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE 158 T operator()(Index i) const { 159 #ifdef EIGEN_USE_SYCL 160 if(!m_exec_once) { 161 // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread 162 // The (i * 6364136223846793005ULL) is the remaining part of the PCG_XSH_RS_state on the GPU side 163 m_state += (i * 6364136223846793005ULL); 164 m_exec_once =true; 165 } 166 #endif 167 T result = RandomToTypeUniform<T>(&m_state, i); 168 return result; 169 } 170 171 template<typename Packet, typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE 172 Packet packetOp(Index i) const { 173 const int packetSize = internal::unpacket_traits<Packet>::size; 174 EIGEN_ALIGN_MAX T values[packetSize]; 175 #ifdef EIGEN_USE_SYCL 176 if(!m_exec_once) { 177 // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread 178 m_state += (i * 6364136223846793005ULL); 179 m_exec_once =true; 180 } 181 #endif 182 EIGEN_UNROLL_LOOP 183 for (int j = 0; j < packetSize; ++j) { 184 values[j] = RandomToTypeUniform<T>(&m_state, i); 185 } 186 return internal::pload<Packet>(values); 187 } 188 189 private: 190 mutable uint64_t m_state; 191 #ifdef EIGEN_USE_SYCL 192 mutable bool m_exec_once; 193 #endif 194 }; 195 196 template <typename Scalar> 197 struct functor_traits<UniformRandomGenerator<Scalar> > { 198 enum { 199 // Rough estimate for floating point, multiplied by ceil(sizeof(T) / sizeof(float)). 200 Cost = 12 * NumTraits<Scalar>::AddCost * 201 ((sizeof(Scalar) + sizeof(float) - 1) / sizeof(float)), 202 PacketAccess = UniformRandomGenerator<Scalar>::PacketAccess 203 }; 204 }; 205 206 207 208 template <typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE 209 T RandomToTypeNormal(uint64_t* state, uint64_t stream) { 210 // Use the ratio of uniform method to generate numbers following a normal 211 // distribution. See for example Numerical Recipes chapter 7.3.9 for the 212 // details. 213 T u, v, q; 214 do { 215 u = RandomToTypeUniform<T>(state, stream); 216 v = T(1.7156) * (RandomToTypeUniform<T>(state, stream) - T(0.5)); 217 const T x = u - T(0.449871); 218 const T y = numext::abs(v) + T(0.386595); 219 q = x*x + y * (T(0.196)*y - T(0.25472)*x); 220 } while (q > T(0.27597) && 221 (q > T(0.27846) || v*v > T(-4) * numext::log(u) * u*u)); 222 223 return v/u; 224 } 225 226 template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE 227 std::complex<float> RandomToTypeNormal<std::complex<float> >(uint64_t* state, uint64_t stream) { 228 return std::complex<float>(RandomToTypeNormal<float>(state, stream), 229 RandomToTypeNormal<float>(state, stream)); 230 } 231 template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE 232 std::complex<double> RandomToTypeNormal<std::complex<double> >(uint64_t* state, uint64_t stream) { 233 return std::complex<double>(RandomToTypeNormal<double>(state, stream), 234 RandomToTypeNormal<double>(state, stream)); 235 } 236 237 238 template <typename T> class NormalRandomGenerator { 239 public: 240 static const bool PacketAccess = true; 241 242 // Uses the given "seed" if non-zero, otherwise uses a random seed. 243 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator(uint64_t seed = 0) { 244 m_state = PCG_XSH_RS_state(seed); 245 #ifdef EIGEN_USE_SYCL 246 // In SYCL it is not possible to build PCG_XSH_RS_state in one step. 247 // Therefor, we need two steps to initializate the m_state. 248 // IN SYCL, the constructor of the functor is s called on the CPU 249 // and we get the clock seed here from the CPU. However, This seed is 250 //the same for all the thread. As unlike CUDA, the thread.ID, BlockID, etc is not a global function. 251 // and only available on the Operator() function (which is called on the GPU). 252 // Therefore, the thread Id injection is not available at this stage. However when the operator() 253 //is called the thread ID will be avilable. So inside the opeator, 254 // we add the thrreadID, BlockId,... (which is equivalent of i) 255 //to the seed and construct the unique m_state per thead similar to cuda. 256 m_exec_once =false; 257 #endif 258 } 259 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator( 260 const NormalRandomGenerator& other) { 261 m_state = other.m_state; 262 #ifdef EIGEN_USE_SYCL 263 m_exec_once=other.m_exec_once; 264 #endif 265 } 266 267 template<typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE 268 T operator()(Index i) const { 269 #ifdef EIGEN_USE_SYCL 270 if(!m_exec_once) { 271 // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread 272 m_state += (i * 6364136223846793005ULL); 273 m_exec_once =true; 274 } 275 #endif 276 T result = RandomToTypeNormal<T>(&m_state, i); 277 return result; 278 } 279 280 template<typename Packet, typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE 281 Packet packetOp(Index i) const { 282 const int packetSize = internal::unpacket_traits<Packet>::size; 283 EIGEN_ALIGN_MAX T values[packetSize]; 284 #ifdef EIGEN_USE_SYCL 285 if(!m_exec_once) { 286 // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread 287 m_state += (i * 6364136223846793005ULL); 288 m_exec_once =true; 289 } 290 #endif 291 EIGEN_UNROLL_LOOP 292 for (int j = 0; j < packetSize; ++j) { 293 values[j] = RandomToTypeNormal<T>(&m_state, i); 294 } 295 return internal::pload<Packet>(values); 296 } 297 298 private: 299 mutable uint64_t m_state; 300 #ifdef EIGEN_USE_SYCL 301 mutable bool m_exec_once; 302 #endif 303 }; 304 305 306 template <typename Scalar> 307 struct functor_traits<NormalRandomGenerator<Scalar> > { 308 enum { 309 // On average, we need to generate about 3 random numbers 310 // 15 mul, 8 add, 1.5 logs 311 Cost = 3 * functor_traits<UniformRandomGenerator<Scalar> >::Cost + 312 15 * NumTraits<Scalar>::AddCost + 8 * NumTraits<Scalar>::AddCost + 313 3 * functor_traits<scalar_log_op<Scalar> >::Cost / 2, 314 PacketAccess = NormalRandomGenerator<Scalar>::PacketAccess 315 }; 316 }; 317 318 319 } // end namespace internal 320 } // end namespace Eigen 321 322 #endif // EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H