cart-elc

Source code for CART-ELC
git clone git://git.laack.co/cart-elc.git
Log | Files | Refs | README | LICENSE

TensorReductionGpu.h (40719B)


      1 // This file is part of Eigen, a lightweight C++ template library
      2 // for linear algebra.
      3 //
      4 // Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
      5 //
      6 // This Source Code Form is subject to the terms of the Mozilla
      7 // Public License v. 2.0. If a copy of the MPL was not distributed
      8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
      9 
     10 #ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H
     11 #define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H
     12 
     13 namespace Eigen {
     14 namespace internal {
     15 
     16 
     17 #if defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)
     18 // Full reducers for GPU, don't vectorize for now
     19 
     20 // Reducer function that enables multiple gpu thread to safely accumulate at the same
     21 // output address. It basically reads the current value of the output variable, and
     22 // attempts to update it with the new value. If in the meantime another gpu thread
     23 // updated the content of the output address it will try again.
     24 template <typename T, typename R>
     25 __device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) {
     26 #if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
     27   if (sizeof(T) == 4)
     28   {
     29     unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
     30     unsigned int newval = oldval;
     31     reducer.reduce(accum, reinterpret_cast<T*>(&newval));
     32     if (newval == oldval) {
     33       return;
     34     }
     35     unsigned int readback;
     36     while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {
     37       oldval = readback;
     38       newval = oldval;
     39       reducer.reduce(accum, reinterpret_cast<T*>(&newval));
     40       if (newval == oldval) {
     41         return;
     42       }
     43     }
     44   }
     45   else if (sizeof(T) == 8) {
     46     unsigned long long oldval = *reinterpret_cast<unsigned long long*>(output);
     47     unsigned long long newval = oldval;
     48     reducer.reduce(accum, reinterpret_cast<T*>(&newval));
     49     if (newval == oldval) {
     50       return;
     51     }
     52     unsigned long long readback;
     53     while ((readback = atomicCAS((unsigned long long*)output, oldval, newval)) != oldval) {
     54       oldval = readback;
     55       newval = oldval;
     56       reducer.reduce(accum, reinterpret_cast<T*>(&newval));
     57       if (newval == oldval) {
     58         return;
     59       }
     60     }
     61   }
     62   else {
     63     gpu_assert(0 && "Wordsize not supported");
     64   }
     65 #else // EIGEN_CUDA_ARCH >= 300
     66   gpu_assert(0 && "Shouldn't be called on unsupported device");
     67 #endif // EIGEN_CUDA_ARCH >= 300
     68 }
     69 
     70 // We extend atomicExch to support extra data types
     71 template <typename Type>
     72 __device__ inline Type atomicExchCustom(Type* address, Type val) {
     73   return atomicExch(address, val);
     74 }
     75 
     76 template <>
     77 __device__ inline double atomicExchCustom(double* address, double val) {
     78   unsigned long long int* address_as_ull = reinterpret_cast<unsigned long long int*>(address);
     79   return __longlong_as_double(atomicExch(address_as_ull, __double_as_longlong(val)));
     80 }
     81 
     82 #ifdef EIGEN_HAS_GPU_FP16
     83 template <typename R>
     84 __device__ inline void atomicReduce(half2* output, half2 accum, R& reducer) {
     85   unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
     86   unsigned int newval = oldval;
     87   reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
     88   if (newval == oldval) {
     89     return;
     90   }
     91   unsigned int readback;
     92   while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {
     93     oldval = readback;
     94     newval = oldval;
     95     reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
     96     if (newval == oldval) {
     97       return;
     98     }
     99   }
    100 }
    101 // reduction should be associative since reduction is not atomic in wide vector but atomic in half2 operations
    102 template <typename R>
    103 __device__ inline void atomicReduce(Packet4h2* output, Packet4h2 accum, R& reducer) {
    104   half2* houtput=reinterpret_cast<half2*>(output);
    105   half2* haccum=reinterpret_cast<half2*>(&accum);
    106   for(int i=0;i<4;++i){
    107     atomicReduce(houtput+i,*(haccum+i),reducer);
    108   }
    109 }
    110 #endif  // EIGEN_HAS_GPU_FP16
    111 
    112 template <>
    113 __device__ inline void atomicReduce(float* output, float accum, SumReducer<float>&) {
    114 #if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
    115   atomicAdd(output, accum);
    116 #else // EIGEN_CUDA_ARCH >= 300
    117   gpu_assert(0 && "Shouldn't be called on unsupported device");
    118 #endif // EIGEN_CUDA_ARCH >= 300
    119 }
    120 
    121 
    122 template <typename CoeffType, typename Index>
    123 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitKernel(const CoeffType val, Index num_preserved_coeffs, CoeffType* output) {
    124   const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
    125   const Index num_threads = blockDim.x * gridDim.x;
    126   for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
    127     output[i] = val;
    128   }
    129 }
    130 
    131 
    132 template <int BlockSize, int NumPerThread, typename Self,
    133           typename Reducer, typename Index>
    134 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs,
    135                                     typename Self::CoeffReturnType* output, unsigned int* semaphore) {
    136 #if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
    137   // Initialize the output value
    138   const Index first_index = blockIdx.x * BlockSize * NumPerThread + threadIdx.x;
    139   if (gridDim.x == 1) {
    140     if (first_index == 0) {
    141       *output = reducer.initialize();
    142     }
    143   }
    144   else {
    145     if (threadIdx.x == 0) {
    146       unsigned int block = atomicCAS(semaphore, 0u, 1u);
    147       if (block == 0) {
    148         // We're the first block to run, initialize the output value
    149         atomicExchCustom(output, reducer.initialize());
    150         __threadfence();
    151         atomicExch(semaphore, 2u);
    152       }
    153       else {
    154         // Wait for the first block to initialize the output value.
    155         // Use atomicCAS here to ensure that the reads aren't cached
    156         unsigned int val;
    157         do {
    158           val = atomicCAS(semaphore, 2u, 2u);
    159         }
    160         while (val < 2u);
    161       }
    162     }
    163   }
    164 
    165   __syncthreads();
    166 
    167   eigen_assert(gridDim.x == 1 || *semaphore >= 2u);
    168 
    169   typename Self::CoeffReturnType accum = reducer.initialize();
    170   Index max_iter = numext::mini<Index>(num_coeffs - first_index, NumPerThread*BlockSize);
    171   for (Index i = 0; i < max_iter; i+=BlockSize) {
    172     const Index index = first_index + i;
    173     eigen_assert(index < num_coeffs);
    174     typename Self::CoeffReturnType val = input.m_impl.coeff(index);
    175     reducer.reduce(val, &accum);
    176   }
    177 
    178 #pragma unroll
    179   for (int offset = warpSize/2; offset > 0; offset /= 2) {
    180   #if defined(EIGEN_HIPCC)
    181     // use std::is_floating_point to determine the type of reduced_val 
    182     // This is needed because when Type == double, hipcc will give a "call to __shfl_down is ambguous" error 
    183     // and list the float and int versions of __shfl_down as the candidate functions. 
    184     if (std::is_floating_point<typename Self::CoeffReturnType>::value) {
    185       reducer.reduce(__shfl_down(static_cast<float>(accum), offset, warpSize), &accum);
    186     } else {
    187       reducer.reduce(__shfl_down(static_cast<int>(accum), offset, warpSize), &accum);
    188     }
    189   #elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
    190     reducer.reduce(__shfl_down(accum, offset, warpSize), &accum);
    191   #else
    192     reducer.reduce(__shfl_down_sync(0xFFFFFFFF, accum, offset, warpSize), &accum);
    193   #endif
    194   }
    195 
    196   if ((threadIdx.x & (warpSize - 1)) == 0) {
    197     atomicReduce(output, accum, reducer);
    198   }
    199 
    200   if (gridDim.x > 1 && threadIdx.x == 0) {
    201     // Let the last block reset the semaphore
    202     atomicInc(semaphore, gridDim.x + 1);
    203 #if defined(EIGEN_HIPCC)
    204     __threadfence_system();
    205 #endif
    206   }
    207 #else // EIGEN_CUDA_ARCH >= 300
    208   gpu_assert(0 && "Shouldn't be called on unsupported device");
    209 #endif // EIGEN_CUDA_ARCH >= 300
    210 }
    211 
    212 
    213 #ifdef EIGEN_HAS_GPU_FP16
    214 template <typename Self,
    215           typename Reducer, typename Index>
    216 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitFullReduxKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs,
    217                                                       packet_traits<Eigen::half>::type* scratch) {
    218   eigen_assert(blockDim.x == 1);
    219   eigen_assert(gridDim.x == 1);
    220   typedef packet_traits<Eigen::half>::type packet_type;
    221   Index packet_remainder =
    222       num_coeffs % Index(unpacket_traits<packet_type>::size);
    223   if (packet_remainder != 0) {
    224     half2* h2scratch = reinterpret_cast<half2*>(scratch);
    225     for (Index i = num_coeffs - packet_remainder; i + 2 <= num_coeffs; i += 2) {
    226       *h2scratch =
    227           __halves2half2(input.m_impl.coeff(i), input.m_impl.coeff(i + 1));
    228       h2scratch++;
    229     }
    230     if ((num_coeffs & 1) != 0) {
    231       half lastCoeff = input.m_impl.coeff(num_coeffs - 1);
    232       *h2scratch = __halves2half2(lastCoeff, reducer.initialize());
    233     }
    234   } else {
    235     *scratch = reducer.template initializePacket<packet_type>();
    236   }
    237 }
    238 
    239 template <typename Self,
    240           typename Reducer, typename Index>
    241 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half* output) {
    242   const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
    243   const Index num_threads = blockDim.x * gridDim.x;
    244   typedef typename packet_traits<Eigen::half>::type PacketType;
    245 
    246   const Index num_packets =
    247       num_coeffs / Index(unpacket_traits<PacketType>::size);
    248   PacketType* p_output = reinterpret_cast<PacketType*>(output);
    249   for (Index i = thread_id; i < num_packets; i += num_threads) {
    250     p_output[i] = reducer.template initializePacket<PacketType>();
    251   }
    252   Index packet_remainder =
    253       num_coeffs % Index(unpacket_traits<PacketType>::size);
    254   if (thread_id < packet_remainder) {
    255     output[num_coeffs - packet_remainder + thread_id] = reducer.initialize();
    256   }
    257 }
    258 
    259 template <int BlockSize, int NumPerThread, typename Self,
    260           typename Reducer, typename Index>
    261 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs,
    262                                     half* output, packet_traits<Eigen::half>::type* scratch) {
    263   typedef typename packet_traits<Eigen::half>::type PacketType;
    264   const int packet_width = unpacket_traits<PacketType>::size;
    265   eigen_assert(NumPerThread % packet_width == 0);
    266   const Index first_index =
    267       blockIdx.x * BlockSize * NumPerThread + packet_width * threadIdx.x;
    268 
    269   // Initialize the output value if it wasn't initialized by the ReductionInitKernel
    270 
    271   if (gridDim.x == 1) {
    272     if (first_index == 0) {
    273       int rem = num_coeffs % packet_width;
    274       if (rem != 0) {
    275         half2* p_scratch = reinterpret_cast<half2*>(scratch);
    276         *scratch = reducer.template initializePacket<PacketType>();
    277         for (int i = 0; i < rem / 2; i++) {
    278           *p_scratch = __halves2half2(
    279               input.m_impl.coeff(num_coeffs - packet_width + 2 * i),
    280               input.m_impl.coeff(num_coeffs - packet_width + 2 * i + 1));
    281           p_scratch++;
    282         }
    283         if ((num_coeffs & 1) != 0) {
    284           half last = input.m_impl.coeff(num_coeffs - 1);
    285           *p_scratch = __halves2half2(last, reducer.initialize());
    286         }
    287       } else {
    288         *scratch = reducer.template initializePacket<PacketType>();
    289       }
    290     }
    291     __syncthreads();
    292   }
    293 
    294   PacketType accum = reducer.template initializePacket<PacketType>();
    295   const Index max_iter =
    296       numext::mini<Index>((num_coeffs - first_index) / packet_width,
    297                           NumPerThread * BlockSize / packet_width);
    298   for (Index i = 0; i < max_iter; i += BlockSize) {
    299     const Index index = first_index + packet_width * i;
    300     eigen_assert(index + packet_width < num_coeffs);
    301     PacketType val = input.m_impl.template packet<Unaligned>(index);
    302     reducer.reducePacket(val, &accum);
    303   }
    304 
    305 #pragma unroll
    306   for (int offset = warpSize/2; offset > 0; offset /= 2) {
    307   #if defined(EIGEN_HIPCC)
    308     PacketType r1;
    309     half2* hr = reinterpret_cast<half2*>(&r1);
    310     half2* hacc = reinterpret_cast<half2*>(&accum);
    311     for (int i = 0; i < packet_width / 2; i++) {
    312       // FIXME : remove this workaround once we have native half/half2 support for __shfl_down
    313       union { int i; half2 h; } wka_in, wka_out;
    314       wka_in.h = hacc[i];
    315       wka_out.i = __shfl_down(wka_in.i, offset, warpSize);
    316       hr[i] = wka_out.h;
    317     }
    318     reducer.reducePacket(r1, &accum);
    319   #elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
    320     PacketType r1;
    321     half2* hr = reinterpret_cast<half2*>(&r1);
    322     half2* hacc = reinterpret_cast<half2*>(&accum);
    323     for (int i = 0; i < packet_width / 2; i++) {
    324       hr[i] = __shfl_down(hacc[i], offset, warpSize);
    325     }
    326     reducer.reducePacket(r1, &accum);
    327   #else
    328     PacketType r1;
    329     half2* hr = reinterpret_cast<half2*>(&r1);
    330     half2* hacc = reinterpret_cast<half2*>(&accum);
    331     for (int i = 0; i < packet_width / 2; i++) {
    332       hr[i] = __shfl_down_sync(0xFFFFFFFF, hacc[i], (unsigned)offset, warpSize);
    333     }
    334     reducer.reducePacket(r1, &accum);
    335 
    336   #endif
    337   }
    338 
    339   if ((threadIdx.x & (warpSize - 1)) == 0) {
    340     atomicReduce(scratch, accum, reducer);
    341   }
    342 
    343   __syncthreads();
    344   half2* rv1 = reinterpret_cast<half2*>(scratch);
    345   if (packet_width > 2) {
    346     reducer.reducePacket(rv1[2], rv1);
    347     reducer.reducePacket(rv1[3], rv1 + 1);
    348     reducer.reducePacket(rv1[1], rv1);
    349   }
    350   if (gridDim.x == 1) {
    351     if (first_index == 0) {
    352       half tmp = __low2half(*rv1);
    353       reducer.reduce(__high2half(*rv1), &tmp);
    354       *output = tmp;
    355     }
    356   }
    357 }
    358 
    359 template <typename Op>
    360 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionCleanupKernelHalfFloat(Op reducer, half* output, packet_traits<Eigen::half>::type* scratch) {
    361   eigen_assert(threadIdx.x == 1);
    362   half2* pscratch = reinterpret_cast<half2*>(scratch);
    363   half tmp = __float2half(0.f);
    364   typedef packet_traits<Eigen::half>::type packet_type;
    365   for (int i = 0; i < unpacket_traits<packet_type>::size; i += 2) {
    366     reducer.reduce(__low2half(*pscratch), &tmp);
    367     reducer.reduce(__high2half(*pscratch), &tmp);
    368     pscratch++;
    369   }
    370   *output = tmp;
    371 }
    372 
    373 #endif // EIGEN_HAS_GPU_FP16
    374 
    375 template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
    376 struct FullReductionLauncher {
    377   static void run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index) {
    378     gpu_assert(false && "Should only be called on doubles, floats and half floats");
    379   }
    380 };
    381 
    382 // Specialization for float and double
    383 template <typename Self, typename Op, typename OutputType, bool PacketAccess>
    384 struct FullReductionLauncher<
    385     Self, Op, OutputType, PacketAccess,
    386     typename internal::enable_if<
    387       internal::is_same<float, OutputType>::value ||
    388       internal::is_same<double, OutputType>::value,
    389     void>::type> {
    390   static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs) {
    391 
    392     typedef typename Self::Index Index;
    393     const int block_size = 256;
    394     const int num_per_thread = 128;
    395     const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
    396 
    397     unsigned int* semaphore = NULL;
    398     if (num_blocks > 1) {
    399       semaphore = device.semaphore();
    400     }
    401 
    402     LAUNCH_GPU_KERNEL((FullReductionKernel<block_size, num_per_thread, Self, Op, Index>),
    403                        num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, semaphore);
    404   }
    405 };
    406 
    407 #ifdef EIGEN_HAS_GPU_FP16
    408 template <typename Self, typename Op>
    409 struct FullReductionLauncher<Self, Op, Eigen::half, false> {
    410   static void run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index) {
    411     gpu_assert(false && "Should not be called since there is no packet accessor");
    412   }
    413 };
    414 
    415 template <typename Self, typename Op>
    416 struct FullReductionLauncher<Self, Op, Eigen::half, true> {
    417   static void run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs) {
    418     typedef typename Self::Index Index;
    419     typedef typename packet_traits<Eigen::half>::type PacketType;
    420 
    421     const int block_size = 256;
    422     const int num_per_thread = 128;
    423     const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
    424     PacketType* scratch = static_cast<PacketType*>(device.scratchpad());
    425     // half2* scratch = static_cast<half2*>(device.scratchpad());
    426 
    427     if (num_blocks > 1) {
    428       // We initialize the output and the scrathpad outside the reduction kernel when we can't be sure that there
    429       // won't be a race conditions between multiple thread blocks.
    430       LAUNCH_GPU_KERNEL((ReductionInitFullReduxKernelHalfFloat<Self, Op, Index>),
    431                          1, 1, 0, device, reducer, self, num_coeffs, scratch);
    432     }
    433 
    434     LAUNCH_GPU_KERNEL((FullReductionKernelHalfFloat<block_size, num_per_thread, Self, Op, Index>),
    435                        num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, scratch);
    436 
    437     if (num_blocks > 1) {
    438       LAUNCH_GPU_KERNEL((ReductionCleanupKernelHalfFloat<Op>),
    439                          1, 1, 0, device, reducer, output, scratch);
    440     }
    441   }
    442 };
    443 #endif // EIGEN_HAS_GPU_FP16
    444 
    445 
    446 template <typename Self, typename Op, bool Vectorizable>
    447 struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
    448   // Unfortunately nvidia doesn't support well exotic types such as complex,
    449   // so reduce the scope of the optimized version of the code to the simple cases
    450   // of doubles, floats and half floats
    451 #ifdef EIGEN_HAS_GPU_FP16
    452   static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful &&
    453       (internal::is_same<typename Self::CoeffReturnType, float>::value ||
    454        internal::is_same<typename Self::CoeffReturnType, double>::value ||
    455        (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
    456 #else // EIGEN_HAS_GPU_FP16
    457   static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful &&
    458                                                 (internal::is_same<typename Self::CoeffReturnType, float>::value ||
    459                                                  internal::is_same<typename Self::CoeffReturnType, double>::value);
    460 #endif // EIGEN_HAS_GPU_FP16
    461 
    462   template <typename OutputType>
    463   static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) {
    464     gpu_assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
    465     const Index num_coeffs = array_prod(self.m_impl.dimensions());
    466     // Don't crash when we're called with an input tensor of size 0.
    467     if (num_coeffs == 0) {
    468       return;
    469     }
    470 
    471     FullReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs);
    472   }
    473 };
    474 
    475 
    476 template <int NumPerThread, typename Self,
    477           typename Reducer, typename Index>
    478 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
    479                                          typename Self::CoeffReturnType* output) {
    480 #if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
    481   typedef typename Self::CoeffReturnType Type;
    482   eigen_assert(blockDim.y == 1);
    483   eigen_assert(blockDim.z == 1);
    484   eigen_assert(gridDim.y == 1);
    485   eigen_assert(gridDim.z == 1);
    486 
    487   const int unroll_times = 16;
    488   eigen_assert(NumPerThread % unroll_times == 0);
    489 
    490   const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread);
    491   const Index num_input_blocks = input_col_blocks * num_preserved_coeffs;
    492 
    493   const Index num_threads = blockDim.x * gridDim.x;
    494   const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
    495 
    496   // Initialize the output values if they weren't initialized by the ReductionInitKernel
    497   if (gridDim.x == 1) {
    498     for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
    499       output[i] = reducer.initialize();
    500     }
    501     __syncthreads();
    502   }
    503 
    504   for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) {
    505     const Index row = i / input_col_blocks;
    506 
    507     if (row < num_preserved_coeffs) {
    508       const Index col_block = i % input_col_blocks;
    509       const Index col_begin = col_block * blockDim.x * NumPerThread + threadIdx.x;
    510 
    511       Type reduced_val = reducer.initialize();
    512 
    513       for (Index j = 0; j < NumPerThread; j += unroll_times) {
    514         const Index last_col = col_begin + blockDim.x * (j + unroll_times - 1);
    515         if (last_col >= num_coeffs_to_reduce) {
    516           for (Index col = col_begin + blockDim.x * j; col < num_coeffs_to_reduce; col += blockDim.x) {
    517             const Type val = input.m_impl.coeff(row * num_coeffs_to_reduce + col);
    518             reducer.reduce(val, &reduced_val);
    519           }
    520           break;
    521         } else {
    522           // Faster version of the loop with no branches after unrolling.
    523 #pragma unroll
    524           for (int k = 0; k < unroll_times; ++k) {
    525             const Index col = col_begin + blockDim.x * (j + k);
    526             reducer.reduce(input.m_impl.coeff(row * num_coeffs_to_reduce + col), &reduced_val);
    527           }
    528         }
    529       }
    530 
    531 #pragma unroll
    532       for (int offset = warpSize/2; offset > 0; offset /= 2) {
    533       #if defined(EIGEN_HIPCC)
    534         // use std::is_floating_point to determine the type of reduced_val 
    535        // This is needed because when Type == double, hipcc will give a "call to __shfl_down is ambguous" error 
    536        // and list the float and int versions of __shfl_down as the candidate functions. 
    537         if (std::is_floating_point<Type>::value) {
    538           reducer.reduce(__shfl_down(static_cast<float>(reduced_val), offset), &reduced_val);
    539         } else {
    540           reducer.reduce(__shfl_down(static_cast<int>(reduced_val), offset), &reduced_val);
    541         }
    542       #elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
    543         reducer.reduce(__shfl_down(reduced_val, offset), &reduced_val);
    544       #else
    545         reducer.reduce(__shfl_down_sync(0xFFFFFFFF, reduced_val, offset), &reduced_val);
    546       #endif
    547       }
    548 
    549       if ((threadIdx.x & (warpSize - 1)) == 0) {
    550         atomicReduce(&(output[row]), reduced_val, reducer);
    551       }
    552     }
    553   }
    554 #else // EIGEN_CUDA_ARCH >= 300
    555   gpu_assert(0 && "Shouldn't be called on unsupported device");
    556 #endif // EIGEN_CUDA_ARCH >= 300
    557 }
    558 
    559 #ifdef EIGEN_HAS_GPU_FP16
    560 
    561 template <int NumPerThread, typename Self,
    562           typename Reducer, typename Index>
    563 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
    564                                               half* output) {
    565   eigen_assert(blockDim.y == 1);
    566   eigen_assert(blockDim.z == 1);
    567   eigen_assert(gridDim.y == 1);
    568   eigen_assert(gridDim.z == 1);
    569 
    570   typedef typename packet_traits<Eigen::half>::type PacketType;
    571   const int packet_width = unpacket_traits<PacketType>::size;
    572   const int unroll_times = 16 / packet_width;
    573   eigen_assert(NumPerThread % unroll_times == 0);
    574   eigen_assert(unroll_times % 2 == 0);
    575 
    576   const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread * 2);
    577   const Index num_input_blocks = divup<Index>(input_col_blocks * num_preserved_coeffs, 2);
    578 
    579   const Index num_threads = blockDim.x * gridDim.x;
    580   const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
    581 
    582   // Initialize the output values if they weren't initialized by the ReductionInitKernel
    583   if (gridDim.x == 1) {
    584     Index i = packet_width * thread_id;
    585     for (; i + packet_width <= num_preserved_coeffs;
    586          i += packet_width * num_threads) {
    587       PacketType* poutput = reinterpret_cast<PacketType*>(output + i);
    588       *poutput = reducer.template initializePacket<PacketType>();
    589     }
    590     if (i < num_preserved_coeffs) {
    591       output[i] = reducer.initialize();
    592     }
    593     __syncthreads();
    594   }
    595 
    596   for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) {
    597     const Index row = 2 * (i / input_col_blocks);  // everybody takes 2 rows
    598 
    599     if (row + 1 < num_preserved_coeffs) {
    600       const Index col_block = i % input_col_blocks;
    601       const Index col_begin =
    602           packet_width * (col_block * blockDim.x * NumPerThread + threadIdx.x);
    603 
    604       PacketType reduced_val1 = reducer.template initializePacket<PacketType>();
    605       PacketType reduced_val2 = reducer.template initializePacket<PacketType>();
    606 
    607       for (Index j = 0; j < NumPerThread; j += unroll_times) {
    608         const Index last_col =
    609             col_begin + blockDim.x * (j + unroll_times - 1) * packet_width;
    610         if (last_col >= num_coeffs_to_reduce) {
    611           Index col = col_begin + blockDim.x * j;
    612           for (; col + packet_width <= num_coeffs_to_reduce;
    613                col += blockDim.x) {
    614             const PacketType val1 = input.m_impl.template packet<Unaligned>(
    615                 row * num_coeffs_to_reduce + col);
    616             reducer.reducePacket(val1, &reduced_val1);
    617             const PacketType val2 = input.m_impl.template packet<Unaligned>(
    618                 (row + 1) * num_coeffs_to_reduce + col);
    619             reducer.reducePacket(val2, &reduced_val2);
    620           }
    621           if (col < num_coeffs_to_reduce) {
    622             PacketType r1 = reducer.template initializePacket<PacketType>();
    623             PacketType r2 = reducer.template initializePacket<PacketType>();
    624             half2* hr1 = reinterpret_cast<half2*>(&r1);
    625             half2* hr2 = reinterpret_cast<half2*>(&r2);
    626             while (col + 1 < num_coeffs_to_reduce) {
    627               *hr1 = __halves2half2(
    628                   input.m_impl.coeff(row * num_coeffs_to_reduce + col),
    629                   input.m_impl.coeff(row * num_coeffs_to_reduce + col + 1));
    630               *hr2 = __halves2half2(
    631                   input.m_impl.coeff((row + 1) * num_coeffs_to_reduce + col),
    632                   input.m_impl.coeff((row + 1) * num_coeffs_to_reduce + col +
    633                                      1));
    634               hr1++;
    635               hr2++;
    636               col += 2;
    637             }
    638             if (col < num_coeffs_to_reduce) {
    639               // Peel;
    640               const half last1 =
    641                   input.m_impl.coeff(row * num_coeffs_to_reduce + col);
    642               *hr1 = __halves2half2(last1, reducer.initialize());
    643               const half last2 =
    644                   input.m_impl.coeff((row + 1) * num_coeffs_to_reduce + col);
    645               *hr2 = __halves2half2(last2, reducer.initialize());
    646             }
    647             reducer.reducePacket(r1, &reduced_val1);
    648             reducer.reducePacket(r2, &reduced_val2);
    649           }
    650           break;
    651         } else {
    652           // Faster version of the loop with no branches after unrolling.
    653 #pragma unroll
    654           for (int k = 0; k < unroll_times; ++k) {
    655             const Index col = col_begin + blockDim.x * (j + k) * packet_width;
    656             reducer.reducePacket(input.m_impl.template packet<Unaligned>(
    657                                      row * num_coeffs_to_reduce + col),
    658                                  &reduced_val1);
    659             reducer.reducePacket(input.m_impl.template packet<Unaligned>(
    660                                      (row + 1) * num_coeffs_to_reduce + col),
    661                                  &reduced_val2);
    662           }
    663         }
    664       }
    665 
    666 #pragma unroll
    667       for (int offset = warpSize/2; offset > 0; offset /= 2) {
    668       #if defined(EIGEN_HIPCC)
    669         PacketType r1;
    670         PacketType r2;
    671         half2* hr1 = reinterpret_cast<half2*>(&r1);
    672         half2* hr2 = reinterpret_cast<half2*>(&r2);
    673         half2* rv1 = reinterpret_cast<half2*>(&reduced_val1);
    674         half2* rv2 = reinterpret_cast<half2*>(&reduced_val2);
    675         for (int i = 0; i < packet_width / 2; i++) {
    676 	  // FIXME : remove this workaround once we have native half/half2 support for __shfl_down
    677 	  union { int i; half2 h; } wka_in1, wka_out1;
    678 	  wka_in1.h = rv1[i];
    679 	  wka_out1.i = __shfl_down(wka_in1.i, offset, warpSize);
    680 	  hr1[i] = wka_out1.h;
    681 
    682 	  union { int i; half2 h; } wka_in2, wka_out2;
    683 	  wka_in2.h = rv2[i];
    684 	  wka_out2.i = __shfl_down(wka_in2.i, offset, warpSize);
    685 	  hr2[i] = wka_out2.h;
    686         }
    687         reducer.reducePacket(r1, &reduced_val1);
    688         reducer.reducePacket(r2, &reduced_val2);
    689       #elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
    690         PacketType r1;
    691         PacketType r2;
    692         half2* hr1 = reinterpret_cast<half2*>(&r1);
    693         half2* hr2 = reinterpret_cast<half2*>(&r2);
    694         half2* rv1 = reinterpret_cast<half2*>(&reduced_val1);
    695         half2* rv2 = reinterpret_cast<half2*>(&reduced_val2);
    696         for (int i = 0; i < packet_width / 2; i++) {
    697           hr1[i] = __shfl_down(rv1[i], offset, warpSize);
    698           hr2[i] = __shfl_down(rv2[i], offset, warpSize);
    699         }
    700         reducer.reducePacket(r1, &reduced_val1);
    701         reducer.reducePacket(r2, &reduced_val2);
    702       #else
    703         PacketType r1;
    704         PacketType r2;
    705         half2* hr1 = reinterpret_cast<half2*>(&r1);
    706         half2* hr2 = reinterpret_cast<half2*>(&r2);
    707         half2* rr1 = reinterpret_cast<half2*>(&reduced_val1);
    708         half2* rr2 = reinterpret_cast<half2*>(&reduced_val2);
    709         for (int i = 0; i < packet_width / 2; i++) {
    710           hr1[i] =
    711               __shfl_down_sync(0xFFFFFFFF, rr1[i], (unsigned)offset, warpSize);
    712           hr2[i] =
    713               __shfl_down_sync(0xFFFFFFFF, rr2[i], (unsigned)offset, warpSize);
    714         }
    715         reducer.reducePacket(r1, &reduced_val1);
    716         reducer.reducePacket(r2, &reduced_val2);
    717 
    718       #endif
    719       }
    720       half2* rv1 = reinterpret_cast<half2*>(&reduced_val1);
    721       half2* rv2 = reinterpret_cast<half2*>(&reduced_val2);
    722       half2 val;
    723       if (packet_width > 2) {
    724         reducer.reducePacket(rv1[2], rv1);
    725         reducer.reducePacket(rv1[3], rv1 + 1);
    726         reducer.reducePacket(rv1[1], rv1);
    727         reducer.reducePacket(rv2[2], rv2);
    728         reducer.reducePacket(rv2[3], rv2 + 1);
    729         reducer.reducePacket(rv2[1], rv2);
    730       }
    731       half val1 = __low2half(*rv1);
    732       reducer.reduce(__high2half(*rv1), &val1);
    733       half val2 = __low2half(*rv2);
    734       reducer.reduce(__high2half(*rv2), &val2);
    735       val = __halves2half2(val1, val2);
    736       if ((threadIdx.x & (warpSize - 1)) == 0) {
    737         half* loc = output + row;
    738         atomicReduce((half2*)loc, val, reducer);
    739       }
    740     }
    741   }
    742 }
    743 
    744 #endif // EIGEN_HAS_GPU_FP16
    745 
    746 template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
    747 struct InnerReductionLauncher {
    748   static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index, typename Self::Index) {
    749     gpu_assert(false && "Should only be called to reduce doubles, floats and half floats on a gpu device");
    750     return true;
    751   }
    752 };
    753 
    754 // Specialization for float and double
    755 template <typename Self, typename Op, typename OutputType, bool PacketAccess>
    756 struct InnerReductionLauncher<
    757   Self, Op, OutputType, PacketAccess,
    758   typename internal::enable_if<
    759     internal::is_same<float, OutputType>::value ||
    760     internal::is_same<double, OutputType>::value,
    761   void>::type> {
    762   static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
    763     typedef typename Self::Index Index;
    764 
    765     const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
    766     const int block_size = 256;
    767     const int num_per_thread = 128;
    768     const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
    769     const int max_blocks = device.getNumGpuMultiProcessors() *
    770                            device.maxGpuThreadsPerMultiProcessor() / block_size;
    771     const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
    772 
    773     if (num_blocks > 1) {
    774       // We initialize the outputs outside the reduction kernel when we can't be sure that there
    775       // won't be a race conditions between multiple thread blocks.
    776       const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
    777       const int max_blocks = device.getNumGpuMultiProcessors() *
    778                            device.maxGpuThreadsPerMultiProcessor() / 1024;
    779       const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
    780       LAUNCH_GPU_KERNEL((ReductionInitKernel<OutputType, Index>),
    781                          num_blocks, 1024, 0, device, reducer.initialize(),
    782                          num_preserved_vals, output);
    783     }
    784 
    785     LAUNCH_GPU_KERNEL((InnerReductionKernel<num_per_thread, Self, Op, Index>),
    786                        num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
    787 
    788     return false;
    789   }
    790 };
    791 
    792 #ifdef EIGEN_HAS_GPU_FP16
    793 template <typename Self, typename Op>
    794 struct InnerReductionLauncher<Self, Op, Eigen::half, false> {
    795   static bool run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index, typename Self::Index) {
    796     gpu_assert(false && "Should not be called since there is no packet accessor");
    797     return true;
    798   }
    799 };
    800 
    801 template <typename Self, typename Op>
    802 struct InnerReductionLauncher<Self, Op, Eigen::half, true> {
    803   static bool run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
    804     typedef typename Self::Index Index;
    805 
    806     if (num_preserved_vals % 2 != 0) {
    807       // Not supported yet, revert to the slower code path
    808       return true;
    809     }
    810 
    811     const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
    812     const int block_size = /*256*/128;
    813     const int num_per_thread = /*128*/64;
    814     const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
    815     const int max_blocks = device.getNumGpuMultiProcessors() *
    816                            device.maxGpuThreadsPerMultiProcessor() / block_size;
    817     const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
    818 
    819     if (num_blocks > 1) {
    820       // We initialize the outputs outside the reduction kernel when we can't be sure that there
    821       // won't be a race conditions between multiple thread blocks.
    822       LAUNCH_GPU_KERNEL((ReductionInitKernelHalfFloat<Self, Op, Index>),
    823                          1, 1, 0, device, reducer, self, num_preserved_vals, output);
    824     }
    825 
    826     LAUNCH_GPU_KERNEL((InnerReductionKernelHalfFloat<num_per_thread, Self, Op, Index>),
    827                        num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
    828 
    829     return false;
    830   }
    831 };
    832 #endif // EIGEN_HAS_GPU_FP16
    833 
    834 
    835 template <typename Self, typename Op>
    836 struct InnerReducer<Self, Op, GpuDevice> {
    837   // Unfortunately nvidia doesn't support well exotic types such as complex,
    838   // so reduce the scope of the optimized version of the code to the simple case
    839   // of floats and half floats.
    840 #ifdef EIGEN_HAS_GPU_FP16
    841   static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful &&
    842       (internal::is_same<typename Self::CoeffReturnType, float>::value ||
    843        internal::is_same<typename Self::CoeffReturnType, double>::value ||
    844        (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
    845 #else // EIGEN_HAS_GPU_FP16
    846   static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful &&
    847                                                  (internal::is_same<typename Self::CoeffReturnType, float>::value ||
    848                                                   internal::is_same<typename Self::CoeffReturnType, double>::value);
    849 #endif // EIGEN_HAS_GPU_FP16
    850 
    851   template <typename OutputType>
    852   static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
    853     gpu_assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
    854     const Index num_coeffs = array_prod(self.m_impl.dimensions());
    855     // Don't crash when we're called with an input tensor of size 0.
    856     if (num_coeffs == 0) {
    857       return true;
    858     }
    859     // It's faster to use the usual code.
    860     if (num_coeffs_to_reduce <= 128) {
    861       return true;
    862     }
    863 
    864     return InnerReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs_to_reduce, num_preserved_vals);
    865   }
    866 };
    867 
    868 template <int NumPerThread, typename Self,
    869           typename Reducer, typename Index>
    870 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void OuterReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
    871                                      typename Self::CoeffReturnType* output) {
    872   const Index num_threads = blockDim.x * gridDim.x;
    873   const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
    874   // Initialize the output values if they weren't initialized by the ReductionInitKernel
    875   if (gridDim.x == 1) {
    876     for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
    877       output[i] = reducer.initialize();
    878     }
    879     __syncthreads();
    880   }
    881 
    882   // Do the reduction.
    883   const Index max_iter = num_preserved_coeffs * divup<Index>(num_coeffs_to_reduce, NumPerThread);
    884   for (Index i = thread_id; i < max_iter; i += num_threads) {
    885     const Index input_col = i % num_preserved_coeffs;
    886     const Index input_row = (i / num_preserved_coeffs) * NumPerThread;
    887     typename Self::CoeffReturnType reduced_val = reducer.initialize();
    888     const Index max_row = numext::mini(input_row + NumPerThread, num_coeffs_to_reduce);
    889     for (Index j = input_row; j < max_row; j++) {
    890       typename Self::CoeffReturnType val = input.m_impl.coeff(j * num_preserved_coeffs + input_col);
    891       reducer.reduce(val, &reduced_val);
    892     }
    893     atomicReduce(&(output[input_col]), reduced_val, reducer);
    894   }
    895 }
    896 
    897 
    898 template <typename Self, typename Op>
    899 struct OuterReducer<Self, Op, GpuDevice> {
    900   // Unfortunately nvidia doesn't support well exotic types such as complex,
    901   // so reduce the scope of the optimized version of the code to the simple case
    902   // of floats.
    903   static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful &&
    904                                                  (internal::is_same<typename Self::CoeffReturnType, float>::value ||
    905                                                   internal::is_same<typename Self::CoeffReturnType, double>::value);
    906   template <typename Device, typename OutputType>
    907   static
    908     #if !defined(EIGEN_HIPCC)
    909     // FIXME :  leaving this EIGEN_DEVICE_FUNC in, results in the following runtime error
    910     //          (in the cxx11_tensor_reduction_gpu test)
    911     //
    912     // terminate called after throwing an instance of 'std::runtime_error'
    913     //   what():  No device code available for function: _ZN5Eigen8internal20OuterReductionKernelIL...
    914     //
    915     // don't know why this happens (and why is it a runtime error instead of a compile time error)
    916     //
    917     // this will be fixed by HIP PR#457
    918     EIGEN_DEVICE_FUNC
    919     #endif
    920     bool run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) {
    921     gpu_assert(false && "Should only be called to reduce doubles or floats on a gpu device");
    922     return true;
    923   }
    924 
    925   static bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
    926     typedef typename Self::Index Index;
    927 
    928     // It's faster to use the usual code.
    929     if (num_coeffs_to_reduce <= 32) {
    930       return true;
    931     }
    932 
    933     const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
    934     const int block_size = 256;
    935     const int num_per_thread = 16;
    936     const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
    937     const int max_blocks = device.getNumGpuMultiProcessors() *
    938                            device.maxGpuThreadsPerMultiProcessor() / block_size;
    939     const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
    940 
    941     if (num_blocks > 1) {
    942       // We initialize the outputs in the reduction kernel itself when we don't have to worry
    943       // about race conditions between multiple thread blocks.
    944       const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
    945       const int max_blocks = device.getNumGpuMultiProcessors() *
    946                              device.maxGpuThreadsPerMultiProcessor() / 1024;
    947       const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
    948       LAUNCH_GPU_KERNEL((ReductionInitKernel<float, Index>),
    949                          num_blocks, 1024, 0, device, reducer.initialize(),
    950                          num_preserved_vals, output);
    951     }
    952 
    953     LAUNCH_GPU_KERNEL((OuterReductionKernel<num_per_thread, Self, Op, Index>),
    954                        num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
    955 
    956     return false;
    957   }
    958 };
    959 
    960 #endif // defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)
    961 
    962 
    963 } // end namespace internal
    964 } // end namespace Eigen
    965 
    966 #endif // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H