cart-elc

Source code for CART-ELC
git clone git://git.laack.co/cart-elc.git
Log | Files | Refs | README | LICENSE

TensorRandom.h (12385B)


      1 // This file is part of Eigen, a lightweight C++ template library
      2 // for linear algebra.
      3 //
      4 // Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
      5 // Copyright (C) 2018 Mehdi Goli <eigen@codeplay.com> Codeplay Software Ltd.
      6 //
      7 // This Source Code Form is subject to the terms of the Mozilla
      8 // Public License v. 2.0. If a copy of the MPL was not distributed
      9 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
     10 
     11 #ifndef EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H
     12 #define EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H
     13 
     14 namespace Eigen {
     15 namespace internal {
     16 
     17 namespace {
     18 
     19 EIGEN_DEVICE_FUNC uint64_t get_random_seed() {
     20 #if defined(EIGEN_GPU_COMPILE_PHASE)
     21   // We don't support 3d kernels since we currently only use 1 and
     22   // 2d kernels.
     23   gpu_assert(threadIdx.z == 0);
     24   return blockIdx.x * blockDim.x + threadIdx.x 
     25          + gridDim.x * blockDim.x * (blockIdx.y * blockDim.y + threadIdx.y);
     26 #else
     27   // Rely on Eigen's random implementation.
     28   return random<uint64_t>();
     29 #endif
     30 }
     31 
     32 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE unsigned PCG_XSH_RS_generator(uint64_t* state, uint64_t stream) {
     33   // TODO: Unify with the implementation in the non blocking thread pool.
     34   uint64_t current = *state;
     35   // Update the internal state
     36   *state = current * 6364136223846793005ULL + (stream << 1 | 1);
     37   // Generate the random output (using the PCG-XSH-RS scheme)
     38   return static_cast<unsigned>((current ^ (current >> 22)) >> (22 + (current >> 61)));
     39 }
     40 
     41 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE uint64_t PCG_XSH_RS_state(uint64_t seed) {
     42   seed = seed ? seed : get_random_seed();
     43   return seed * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL;
     44 }
     45 
     46 }  // namespace
     47 
     48 
     49 template <typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     50 T RandomToTypeUniform(uint64_t* state, uint64_t stream) {
     51   unsigned rnd = PCG_XSH_RS_generator(state, stream);
     52   return static_cast<T>(rnd);
     53 }
     54 
     55 
     56 template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     57 Eigen::half RandomToTypeUniform<Eigen::half>(uint64_t* state, uint64_t stream) {
     58   // Generate 10 random bits for the mantissa, merge with exponent.
     59   unsigned rnd = PCG_XSH_RS_generator(state, stream);
     60   const uint16_t half_bits = static_cast<uint16_t>(rnd & 0x3ffu) | (static_cast<uint16_t>(15) << 10);
     61   Eigen::half result = Eigen::numext::bit_cast<Eigen::half>(half_bits);
     62   // Return the final result
     63   return result - Eigen::half(1.0f);
     64 }
     65 
     66 template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     67 Eigen::bfloat16 RandomToTypeUniform<Eigen::bfloat16>(uint64_t* state, uint64_t stream) {
     68 
     69   // Generate 7 random bits for the mantissa, merge with exponent.
     70   unsigned rnd = PCG_XSH_RS_generator(state, stream);
     71   const uint16_t half_bits = static_cast<uint16_t>(rnd & 0x7fu) | (static_cast<uint16_t>(127) << 7);
     72   Eigen::bfloat16 result = Eigen::numext::bit_cast<Eigen::bfloat16>(half_bits);
     73   // Return the final result
     74   return result - Eigen::bfloat16(1.0f);
     75 }
     76 
     77 template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     78 float RandomToTypeUniform<float>(uint64_t* state, uint64_t stream) {
     79   typedef union {
     80     uint32_t raw;
     81     float fp;
     82   } internal;
     83   internal result;
     84   // Generate 23 random bits for the mantissa mantissa
     85   const unsigned rnd = PCG_XSH_RS_generator(state, stream);
     86   result.raw = rnd & 0x7fffffu;
     87   // Set the exponent
     88   result.raw |= (static_cast<uint32_t>(127) << 23);
     89   // Return the final result
     90   return result.fp - 1.0f;
     91 }
     92 
     93 template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     94 double RandomToTypeUniform<double>(uint64_t* state, uint64_t stream) {
     95   typedef union {
     96     uint64_t raw;
     97     double dp;
     98   } internal;
     99   internal result;
    100   result.raw = 0;
    101   // Generate 52 random bits for the mantissa
    102   // First generate the upper 20 bits
    103   unsigned rnd1 = PCG_XSH_RS_generator(state, stream) & 0xfffffu;
    104   // The generate the lower 32 bits
    105   unsigned rnd2 = PCG_XSH_RS_generator(state, stream);
    106   result.raw = (static_cast<uint64_t>(rnd1) << 32) | rnd2;
    107   // Set the exponent
    108   result.raw |= (static_cast<uint64_t>(1023) << 52);
    109   // Return the final result
    110   return result.dp - 1.0;
    111 }
    112 
    113 template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    114 std::complex<float> RandomToTypeUniform<std::complex<float> >(uint64_t* state, uint64_t stream) {
    115   return std::complex<float>(RandomToTypeUniform<float>(state, stream),
    116                              RandomToTypeUniform<float>(state, stream));
    117 }
    118 template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    119 std::complex<double> RandomToTypeUniform<std::complex<double> >(uint64_t* state, uint64_t stream) {
    120   return std::complex<double>(RandomToTypeUniform<double>(state, stream),
    121                               RandomToTypeUniform<double>(state, stream));
    122 }
    123 
    124 template <typename T> class UniformRandomGenerator {
    125  public:
    126   static const bool PacketAccess = true;
    127 
    128   // Uses the given "seed" if non-zero, otherwise uses a random seed.
    129   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator(
    130       uint64_t seed = 0) {
    131     m_state = PCG_XSH_RS_state(seed);
    132     #ifdef EIGEN_USE_SYCL
    133     // In SYCL it is not possible to build PCG_XSH_RS_state in one step.
    134     // Therefor, we need two step to initializate the m_state.
    135     // IN SYCL, the constructor of the functor is s called on the CPU
    136     // and we get the clock seed here from the CPU. However, This seed is
    137     //the same for all the thread. As unlike CUDA, the thread.ID, BlockID, etc is not a global function.
    138     // and only  available on the Operator() function (which is called on the GPU).
    139     // Thus for CUDA (((CLOCK  + global_thread_id)* 6364136223846793005ULL) + 0xda3e39cb94b95bdbULL) is passed to each thread
    140     // but for SYCL ((CLOCK * 6364136223846793005ULL) + 0xda3e39cb94b95bdbULL) is passed to each thread and each thread adds
    141     // the  (global_thread_id* 6364136223846793005ULL) for itself only once, in order to complete the construction
    142     // similar to CUDA Therefore, the thread Id injection is not available at this stage.
    143     //However when the operator() is called the thread ID will be avilable. So inside the opeator,
    144     // we add the thrreadID, BlockId,... (which is equivalent of i)
    145     //to the seed and construct the unique m_state per thead similar to cuda.
    146     m_exec_once =false;
    147    #endif
    148   }
    149   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator(
    150       const UniformRandomGenerator& other) {
    151     m_state = other.m_state;
    152     #ifdef EIGEN_USE_SYCL
    153      m_exec_once =other.m_exec_once;
    154     #endif
    155   }
    156 
    157   template<typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    158   T operator()(Index i) const {
    159     #ifdef EIGEN_USE_SYCL
    160       if(!m_exec_once) {
    161       // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread
    162       // The (i * 6364136223846793005ULL) is the remaining part of the PCG_XSH_RS_state on the GPU side
    163        m_state += (i * 6364136223846793005ULL);
    164        m_exec_once =true;
    165       }
    166     #endif
    167     T result = RandomToTypeUniform<T>(&m_state, i);
    168     return result;
    169   }
    170 
    171   template<typename Packet, typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    172   Packet packetOp(Index i) const {
    173     const int packetSize = internal::unpacket_traits<Packet>::size;
    174     EIGEN_ALIGN_MAX T values[packetSize];
    175       #ifdef EIGEN_USE_SYCL
    176       if(!m_exec_once) {
    177       // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread
    178        m_state += (i * 6364136223846793005ULL);
    179        m_exec_once =true;
    180       }
    181     #endif
    182     EIGEN_UNROLL_LOOP
    183     for (int j = 0; j < packetSize; ++j) {
    184       values[j] = RandomToTypeUniform<T>(&m_state, i);
    185     }
    186     return internal::pload<Packet>(values);
    187   }
    188 
    189  private:
    190   mutable uint64_t m_state;
    191   #ifdef EIGEN_USE_SYCL
    192   mutable bool m_exec_once;
    193   #endif
    194 };
    195 
    196 template <typename Scalar>
    197 struct functor_traits<UniformRandomGenerator<Scalar> > {
    198   enum {
    199     // Rough estimate for floating point, multiplied by ceil(sizeof(T) / sizeof(float)).
    200     Cost = 12 * NumTraits<Scalar>::AddCost *
    201            ((sizeof(Scalar) + sizeof(float) - 1) / sizeof(float)),
    202     PacketAccess = UniformRandomGenerator<Scalar>::PacketAccess
    203   };
    204 };
    205 
    206 
    207 
    208 template <typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    209 T RandomToTypeNormal(uint64_t* state, uint64_t stream) {
    210   // Use the ratio of uniform method to generate numbers following a normal
    211   // distribution. See for example Numerical Recipes chapter 7.3.9 for the
    212   // details.
    213   T u, v, q;
    214   do {
    215     u = RandomToTypeUniform<T>(state, stream);
    216     v = T(1.7156) * (RandomToTypeUniform<T>(state, stream) - T(0.5));
    217     const T x = u - T(0.449871);
    218     const T y = numext::abs(v) + T(0.386595);
    219     q = x*x + y * (T(0.196)*y - T(0.25472)*x);
    220   } while (q > T(0.27597) &&
    221            (q > T(0.27846) || v*v > T(-4) * numext::log(u) * u*u));
    222 
    223   return v/u;
    224 }
    225 
    226 template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    227 std::complex<float> RandomToTypeNormal<std::complex<float> >(uint64_t* state, uint64_t stream) {
    228   return std::complex<float>(RandomToTypeNormal<float>(state, stream),
    229                              RandomToTypeNormal<float>(state, stream));
    230 }
    231 template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    232 std::complex<double> RandomToTypeNormal<std::complex<double> >(uint64_t* state, uint64_t stream) {
    233   return std::complex<double>(RandomToTypeNormal<double>(state, stream),
    234                               RandomToTypeNormal<double>(state, stream));
    235 }
    236 
    237 
    238 template <typename T> class NormalRandomGenerator {
    239  public:
    240   static const bool PacketAccess = true;
    241 
    242   // Uses the given "seed" if non-zero, otherwise uses a random seed.
    243   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator(uint64_t seed = 0) {
    244     m_state = PCG_XSH_RS_state(seed);
    245     #ifdef EIGEN_USE_SYCL
    246     // In SYCL it is not possible to build PCG_XSH_RS_state in one step.
    247     // Therefor, we need two steps to initializate the m_state.
    248     // IN SYCL, the constructor of the functor is s called on the CPU
    249     // and we get the clock seed here from the CPU. However, This seed is
    250     //the same for all the thread. As unlike CUDA, the thread.ID, BlockID, etc is not a global function.
    251     // and only  available on the Operator() function (which is called on the GPU).
    252     // Therefore, the thread Id injection is not available at this stage. However when the operator()
    253     //is called the thread ID will be avilable. So inside the opeator,
    254     // we add the thrreadID, BlockId,... (which is equivalent of i)
    255     //to the seed and construct the unique m_state per thead similar to cuda.
    256     m_exec_once =false;
    257    #endif
    258   }
    259   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator(
    260       const NormalRandomGenerator& other) {
    261     m_state = other.m_state;
    262 #ifdef EIGEN_USE_SYCL
    263     m_exec_once=other.m_exec_once;
    264 #endif
    265   }
    266 
    267  template<typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    268   T operator()(Index i) const {
    269     #ifdef EIGEN_USE_SYCL
    270     if(!m_exec_once) {
    271       // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread
    272       m_state += (i * 6364136223846793005ULL);
    273       m_exec_once =true;
    274     }
    275     #endif
    276     T result = RandomToTypeNormal<T>(&m_state, i);
    277     return result;
    278   }
    279 
    280   template<typename Packet, typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    281   Packet packetOp(Index i) const {
    282     const int packetSize = internal::unpacket_traits<Packet>::size;
    283     EIGEN_ALIGN_MAX T values[packetSize];
    284     #ifdef EIGEN_USE_SYCL
    285     if(!m_exec_once) {
    286       // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread
    287       m_state += (i * 6364136223846793005ULL);
    288       m_exec_once =true;
    289     }
    290     #endif
    291     EIGEN_UNROLL_LOOP
    292     for (int j = 0; j < packetSize; ++j) {
    293       values[j] = RandomToTypeNormal<T>(&m_state, i);
    294     }
    295     return internal::pload<Packet>(values);
    296   }
    297 
    298  private:
    299   mutable uint64_t m_state;
    300    #ifdef EIGEN_USE_SYCL
    301   mutable bool m_exec_once;
    302   #endif
    303 };
    304 
    305 
    306 template <typename Scalar>
    307 struct functor_traits<NormalRandomGenerator<Scalar> > {
    308   enum {
    309     // On average, we need to generate about 3 random numbers
    310     // 15 mul, 8 add, 1.5 logs
    311     Cost = 3 * functor_traits<UniformRandomGenerator<Scalar> >::Cost +
    312            15 * NumTraits<Scalar>::AddCost + 8 * NumTraits<Scalar>::AddCost +
    313            3 * functor_traits<scalar_log_op<Scalar> >::Cost / 2,
    314     PacketAccess = NormalRandomGenerator<Scalar>::PacketAccess
    315   };
    316 };
    317 
    318 
    319 } // end namespace internal
    320 } // end namespace Eigen
    321 
    322 #endif // EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H