TensorReductionGpu.h (40719B)
1 // This file is part of Eigen, a lightweight C++ template library 2 // for linear algebra. 3 // 4 // Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> 5 // 6 // This Source Code Form is subject to the terms of the Mozilla 7 // Public License v. 2.0. If a copy of the MPL was not distributed 8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 9 10 #ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H 11 #define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H 12 13 namespace Eigen { 14 namespace internal { 15 16 17 #if defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC) 18 // Full reducers for GPU, don't vectorize for now 19 20 // Reducer function that enables multiple gpu thread to safely accumulate at the same 21 // output address. It basically reads the current value of the output variable, and 22 // attempts to update it with the new value. If in the meantime another gpu thread 23 // updated the content of the output address it will try again. 24 template <typename T, typename R> 25 __device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) { 26 #if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300) 27 if (sizeof(T) == 4) 28 { 29 unsigned int oldval = *reinterpret_cast<unsigned int*>(output); 30 unsigned int newval = oldval; 31 reducer.reduce(accum, reinterpret_cast<T*>(&newval)); 32 if (newval == oldval) { 33 return; 34 } 35 unsigned int readback; 36 while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) { 37 oldval = readback; 38 newval = oldval; 39 reducer.reduce(accum, reinterpret_cast<T*>(&newval)); 40 if (newval == oldval) { 41 return; 42 } 43 } 44 } 45 else if (sizeof(T) == 8) { 46 unsigned long long oldval = *reinterpret_cast<unsigned long long*>(output); 47 unsigned long long newval = oldval; 48 reducer.reduce(accum, reinterpret_cast<T*>(&newval)); 49 if (newval == oldval) { 50 return; 51 } 52 unsigned long long readback; 53 while ((readback = atomicCAS((unsigned long long*)output, oldval, newval)) != oldval) { 54 oldval = readback; 55 newval = oldval; 56 reducer.reduce(accum, reinterpret_cast<T*>(&newval)); 57 if (newval == oldval) { 58 return; 59 } 60 } 61 } 62 else { 63 gpu_assert(0 && "Wordsize not supported"); 64 } 65 #else // EIGEN_CUDA_ARCH >= 300 66 gpu_assert(0 && "Shouldn't be called on unsupported device"); 67 #endif // EIGEN_CUDA_ARCH >= 300 68 } 69 70 // We extend atomicExch to support extra data types 71 template <typename Type> 72 __device__ inline Type atomicExchCustom(Type* address, Type val) { 73 return atomicExch(address, val); 74 } 75 76 template <> 77 __device__ inline double atomicExchCustom(double* address, double val) { 78 unsigned long long int* address_as_ull = reinterpret_cast<unsigned long long int*>(address); 79 return __longlong_as_double(atomicExch(address_as_ull, __double_as_longlong(val))); 80 } 81 82 #ifdef EIGEN_HAS_GPU_FP16 83 template <typename R> 84 __device__ inline void atomicReduce(half2* output, half2 accum, R& reducer) { 85 unsigned int oldval = *reinterpret_cast<unsigned int*>(output); 86 unsigned int newval = oldval; 87 reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval)); 88 if (newval == oldval) { 89 return; 90 } 91 unsigned int readback; 92 while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) { 93 oldval = readback; 94 newval = oldval; 95 reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval)); 96 if (newval == oldval) { 97 return; 98 } 99 } 100 } 101 // reduction should be associative since reduction is not atomic in wide vector but atomic in half2 operations 102 template <typename R> 103 __device__ inline void atomicReduce(Packet4h2* output, Packet4h2 accum, R& reducer) { 104 half2* houtput=reinterpret_cast<half2*>(output); 105 half2* haccum=reinterpret_cast<half2*>(&accum); 106 for(int i=0;i<4;++i){ 107 atomicReduce(houtput+i,*(haccum+i),reducer); 108 } 109 } 110 #endif // EIGEN_HAS_GPU_FP16 111 112 template <> 113 __device__ inline void atomicReduce(float* output, float accum, SumReducer<float>&) { 114 #if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300) 115 atomicAdd(output, accum); 116 #else // EIGEN_CUDA_ARCH >= 300 117 gpu_assert(0 && "Shouldn't be called on unsupported device"); 118 #endif // EIGEN_CUDA_ARCH >= 300 119 } 120 121 122 template <typename CoeffType, typename Index> 123 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitKernel(const CoeffType val, Index num_preserved_coeffs, CoeffType* output) { 124 const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x; 125 const Index num_threads = blockDim.x * gridDim.x; 126 for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) { 127 output[i] = val; 128 } 129 } 130 131 132 template <int BlockSize, int NumPerThread, typename Self, 133 typename Reducer, typename Index> 134 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs, 135 typename Self::CoeffReturnType* output, unsigned int* semaphore) { 136 #if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300) 137 // Initialize the output value 138 const Index first_index = blockIdx.x * BlockSize * NumPerThread + threadIdx.x; 139 if (gridDim.x == 1) { 140 if (first_index == 0) { 141 *output = reducer.initialize(); 142 } 143 } 144 else { 145 if (threadIdx.x == 0) { 146 unsigned int block = atomicCAS(semaphore, 0u, 1u); 147 if (block == 0) { 148 // We're the first block to run, initialize the output value 149 atomicExchCustom(output, reducer.initialize()); 150 __threadfence(); 151 atomicExch(semaphore, 2u); 152 } 153 else { 154 // Wait for the first block to initialize the output value. 155 // Use atomicCAS here to ensure that the reads aren't cached 156 unsigned int val; 157 do { 158 val = atomicCAS(semaphore, 2u, 2u); 159 } 160 while (val < 2u); 161 } 162 } 163 } 164 165 __syncthreads(); 166 167 eigen_assert(gridDim.x == 1 || *semaphore >= 2u); 168 169 typename Self::CoeffReturnType accum = reducer.initialize(); 170 Index max_iter = numext::mini<Index>(num_coeffs - first_index, NumPerThread*BlockSize); 171 for (Index i = 0; i < max_iter; i+=BlockSize) { 172 const Index index = first_index + i; 173 eigen_assert(index < num_coeffs); 174 typename Self::CoeffReturnType val = input.m_impl.coeff(index); 175 reducer.reduce(val, &accum); 176 } 177 178 #pragma unroll 179 for (int offset = warpSize/2; offset > 0; offset /= 2) { 180 #if defined(EIGEN_HIPCC) 181 // use std::is_floating_point to determine the type of reduced_val 182 // This is needed because when Type == double, hipcc will give a "call to __shfl_down is ambguous" error 183 // and list the float and int versions of __shfl_down as the candidate functions. 184 if (std::is_floating_point<typename Self::CoeffReturnType>::value) { 185 reducer.reduce(__shfl_down(static_cast<float>(accum), offset, warpSize), &accum); 186 } else { 187 reducer.reduce(__shfl_down(static_cast<int>(accum), offset, warpSize), &accum); 188 } 189 #elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000 190 reducer.reduce(__shfl_down(accum, offset, warpSize), &accum); 191 #else 192 reducer.reduce(__shfl_down_sync(0xFFFFFFFF, accum, offset, warpSize), &accum); 193 #endif 194 } 195 196 if ((threadIdx.x & (warpSize - 1)) == 0) { 197 atomicReduce(output, accum, reducer); 198 } 199 200 if (gridDim.x > 1 && threadIdx.x == 0) { 201 // Let the last block reset the semaphore 202 atomicInc(semaphore, gridDim.x + 1); 203 #if defined(EIGEN_HIPCC) 204 __threadfence_system(); 205 #endif 206 } 207 #else // EIGEN_CUDA_ARCH >= 300 208 gpu_assert(0 && "Shouldn't be called on unsupported device"); 209 #endif // EIGEN_CUDA_ARCH >= 300 210 } 211 212 213 #ifdef EIGEN_HAS_GPU_FP16 214 template <typename Self, 215 typename Reducer, typename Index> 216 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitFullReduxKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, 217 packet_traits<Eigen::half>::type* scratch) { 218 eigen_assert(blockDim.x == 1); 219 eigen_assert(gridDim.x == 1); 220 typedef packet_traits<Eigen::half>::type packet_type; 221 Index packet_remainder = 222 num_coeffs % Index(unpacket_traits<packet_type>::size); 223 if (packet_remainder != 0) { 224 half2* h2scratch = reinterpret_cast<half2*>(scratch); 225 for (Index i = num_coeffs - packet_remainder; i + 2 <= num_coeffs; i += 2) { 226 *h2scratch = 227 __halves2half2(input.m_impl.coeff(i), input.m_impl.coeff(i + 1)); 228 h2scratch++; 229 } 230 if ((num_coeffs & 1) != 0) { 231 half lastCoeff = input.m_impl.coeff(num_coeffs - 1); 232 *h2scratch = __halves2half2(lastCoeff, reducer.initialize()); 233 } 234 } else { 235 *scratch = reducer.template initializePacket<packet_type>(); 236 } 237 } 238 239 template <typename Self, 240 typename Reducer, typename Index> 241 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half* output) { 242 const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x; 243 const Index num_threads = blockDim.x * gridDim.x; 244 typedef typename packet_traits<Eigen::half>::type PacketType; 245 246 const Index num_packets = 247 num_coeffs / Index(unpacket_traits<PacketType>::size); 248 PacketType* p_output = reinterpret_cast<PacketType*>(output); 249 for (Index i = thread_id; i < num_packets; i += num_threads) { 250 p_output[i] = reducer.template initializePacket<PacketType>(); 251 } 252 Index packet_remainder = 253 num_coeffs % Index(unpacket_traits<PacketType>::size); 254 if (thread_id < packet_remainder) { 255 output[num_coeffs - packet_remainder + thread_id] = reducer.initialize(); 256 } 257 } 258 259 template <int BlockSize, int NumPerThread, typename Self, 260 typename Reducer, typename Index> 261 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, 262 half* output, packet_traits<Eigen::half>::type* scratch) { 263 typedef typename packet_traits<Eigen::half>::type PacketType; 264 const int packet_width = unpacket_traits<PacketType>::size; 265 eigen_assert(NumPerThread % packet_width == 0); 266 const Index first_index = 267 blockIdx.x * BlockSize * NumPerThread + packet_width * threadIdx.x; 268 269 // Initialize the output value if it wasn't initialized by the ReductionInitKernel 270 271 if (gridDim.x == 1) { 272 if (first_index == 0) { 273 int rem = num_coeffs % packet_width; 274 if (rem != 0) { 275 half2* p_scratch = reinterpret_cast<half2*>(scratch); 276 *scratch = reducer.template initializePacket<PacketType>(); 277 for (int i = 0; i < rem / 2; i++) { 278 *p_scratch = __halves2half2( 279 input.m_impl.coeff(num_coeffs - packet_width + 2 * i), 280 input.m_impl.coeff(num_coeffs - packet_width + 2 * i + 1)); 281 p_scratch++; 282 } 283 if ((num_coeffs & 1) != 0) { 284 half last = input.m_impl.coeff(num_coeffs - 1); 285 *p_scratch = __halves2half2(last, reducer.initialize()); 286 } 287 } else { 288 *scratch = reducer.template initializePacket<PacketType>(); 289 } 290 } 291 __syncthreads(); 292 } 293 294 PacketType accum = reducer.template initializePacket<PacketType>(); 295 const Index max_iter = 296 numext::mini<Index>((num_coeffs - first_index) / packet_width, 297 NumPerThread * BlockSize / packet_width); 298 for (Index i = 0; i < max_iter; i += BlockSize) { 299 const Index index = first_index + packet_width * i; 300 eigen_assert(index + packet_width < num_coeffs); 301 PacketType val = input.m_impl.template packet<Unaligned>(index); 302 reducer.reducePacket(val, &accum); 303 } 304 305 #pragma unroll 306 for (int offset = warpSize/2; offset > 0; offset /= 2) { 307 #if defined(EIGEN_HIPCC) 308 PacketType r1; 309 half2* hr = reinterpret_cast<half2*>(&r1); 310 half2* hacc = reinterpret_cast<half2*>(&accum); 311 for (int i = 0; i < packet_width / 2; i++) { 312 // FIXME : remove this workaround once we have native half/half2 support for __shfl_down 313 union { int i; half2 h; } wka_in, wka_out; 314 wka_in.h = hacc[i]; 315 wka_out.i = __shfl_down(wka_in.i, offset, warpSize); 316 hr[i] = wka_out.h; 317 } 318 reducer.reducePacket(r1, &accum); 319 #elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000 320 PacketType r1; 321 half2* hr = reinterpret_cast<half2*>(&r1); 322 half2* hacc = reinterpret_cast<half2*>(&accum); 323 for (int i = 0; i < packet_width / 2; i++) { 324 hr[i] = __shfl_down(hacc[i], offset, warpSize); 325 } 326 reducer.reducePacket(r1, &accum); 327 #else 328 PacketType r1; 329 half2* hr = reinterpret_cast<half2*>(&r1); 330 half2* hacc = reinterpret_cast<half2*>(&accum); 331 for (int i = 0; i < packet_width / 2; i++) { 332 hr[i] = __shfl_down_sync(0xFFFFFFFF, hacc[i], (unsigned)offset, warpSize); 333 } 334 reducer.reducePacket(r1, &accum); 335 336 #endif 337 } 338 339 if ((threadIdx.x & (warpSize - 1)) == 0) { 340 atomicReduce(scratch, accum, reducer); 341 } 342 343 __syncthreads(); 344 half2* rv1 = reinterpret_cast<half2*>(scratch); 345 if (packet_width > 2) { 346 reducer.reducePacket(rv1[2], rv1); 347 reducer.reducePacket(rv1[3], rv1 + 1); 348 reducer.reducePacket(rv1[1], rv1); 349 } 350 if (gridDim.x == 1) { 351 if (first_index == 0) { 352 half tmp = __low2half(*rv1); 353 reducer.reduce(__high2half(*rv1), &tmp); 354 *output = tmp; 355 } 356 } 357 } 358 359 template <typename Op> 360 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionCleanupKernelHalfFloat(Op reducer, half* output, packet_traits<Eigen::half>::type* scratch) { 361 eigen_assert(threadIdx.x == 1); 362 half2* pscratch = reinterpret_cast<half2*>(scratch); 363 half tmp = __float2half(0.f); 364 typedef packet_traits<Eigen::half>::type packet_type; 365 for (int i = 0; i < unpacket_traits<packet_type>::size; i += 2) { 366 reducer.reduce(__low2half(*pscratch), &tmp); 367 reducer.reduce(__high2half(*pscratch), &tmp); 368 pscratch++; 369 } 370 *output = tmp; 371 } 372 373 #endif // EIGEN_HAS_GPU_FP16 374 375 template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void> 376 struct FullReductionLauncher { 377 static void run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index) { 378 gpu_assert(false && "Should only be called on doubles, floats and half floats"); 379 } 380 }; 381 382 // Specialization for float and double 383 template <typename Self, typename Op, typename OutputType, bool PacketAccess> 384 struct FullReductionLauncher< 385 Self, Op, OutputType, PacketAccess, 386 typename internal::enable_if< 387 internal::is_same<float, OutputType>::value || 388 internal::is_same<double, OutputType>::value, 389 void>::type> { 390 static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs) { 391 392 typedef typename Self::Index Index; 393 const int block_size = 256; 394 const int num_per_thread = 128; 395 const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread); 396 397 unsigned int* semaphore = NULL; 398 if (num_blocks > 1) { 399 semaphore = device.semaphore(); 400 } 401 402 LAUNCH_GPU_KERNEL((FullReductionKernel<block_size, num_per_thread, Self, Op, Index>), 403 num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, semaphore); 404 } 405 }; 406 407 #ifdef EIGEN_HAS_GPU_FP16 408 template <typename Self, typename Op> 409 struct FullReductionLauncher<Self, Op, Eigen::half, false> { 410 static void run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index) { 411 gpu_assert(false && "Should not be called since there is no packet accessor"); 412 } 413 }; 414 415 template <typename Self, typename Op> 416 struct FullReductionLauncher<Self, Op, Eigen::half, true> { 417 static void run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs) { 418 typedef typename Self::Index Index; 419 typedef typename packet_traits<Eigen::half>::type PacketType; 420 421 const int block_size = 256; 422 const int num_per_thread = 128; 423 const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread); 424 PacketType* scratch = static_cast<PacketType*>(device.scratchpad()); 425 // half2* scratch = static_cast<half2*>(device.scratchpad()); 426 427 if (num_blocks > 1) { 428 // We initialize the output and the scrathpad outside the reduction kernel when we can't be sure that there 429 // won't be a race conditions between multiple thread blocks. 430 LAUNCH_GPU_KERNEL((ReductionInitFullReduxKernelHalfFloat<Self, Op, Index>), 431 1, 1, 0, device, reducer, self, num_coeffs, scratch); 432 } 433 434 LAUNCH_GPU_KERNEL((FullReductionKernelHalfFloat<block_size, num_per_thread, Self, Op, Index>), 435 num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, scratch); 436 437 if (num_blocks > 1) { 438 LAUNCH_GPU_KERNEL((ReductionCleanupKernelHalfFloat<Op>), 439 1, 1, 0, device, reducer, output, scratch); 440 } 441 } 442 }; 443 #endif // EIGEN_HAS_GPU_FP16 444 445 446 template <typename Self, typename Op, bool Vectorizable> 447 struct FullReducer<Self, Op, GpuDevice, Vectorizable> { 448 // Unfortunately nvidia doesn't support well exotic types such as complex, 449 // so reduce the scope of the optimized version of the code to the simple cases 450 // of doubles, floats and half floats 451 #ifdef EIGEN_HAS_GPU_FP16 452 static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful && 453 (internal::is_same<typename Self::CoeffReturnType, float>::value || 454 internal::is_same<typename Self::CoeffReturnType, double>::value || 455 (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess)); 456 #else // EIGEN_HAS_GPU_FP16 457 static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful && 458 (internal::is_same<typename Self::CoeffReturnType, float>::value || 459 internal::is_same<typename Self::CoeffReturnType, double>::value); 460 #endif // EIGEN_HAS_GPU_FP16 461 462 template <typename OutputType> 463 static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) { 464 gpu_assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats"); 465 const Index num_coeffs = array_prod(self.m_impl.dimensions()); 466 // Don't crash when we're called with an input tensor of size 0. 467 if (num_coeffs == 0) { 468 return; 469 } 470 471 FullReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs); 472 } 473 }; 474 475 476 template <int NumPerThread, typename Self, 477 typename Reducer, typename Index> 478 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs, 479 typename Self::CoeffReturnType* output) { 480 #if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300) 481 typedef typename Self::CoeffReturnType Type; 482 eigen_assert(blockDim.y == 1); 483 eigen_assert(blockDim.z == 1); 484 eigen_assert(gridDim.y == 1); 485 eigen_assert(gridDim.z == 1); 486 487 const int unroll_times = 16; 488 eigen_assert(NumPerThread % unroll_times == 0); 489 490 const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread); 491 const Index num_input_blocks = input_col_blocks * num_preserved_coeffs; 492 493 const Index num_threads = blockDim.x * gridDim.x; 494 const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x; 495 496 // Initialize the output values if they weren't initialized by the ReductionInitKernel 497 if (gridDim.x == 1) { 498 for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) { 499 output[i] = reducer.initialize(); 500 } 501 __syncthreads(); 502 } 503 504 for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) { 505 const Index row = i / input_col_blocks; 506 507 if (row < num_preserved_coeffs) { 508 const Index col_block = i % input_col_blocks; 509 const Index col_begin = col_block * blockDim.x * NumPerThread + threadIdx.x; 510 511 Type reduced_val = reducer.initialize(); 512 513 for (Index j = 0; j < NumPerThread; j += unroll_times) { 514 const Index last_col = col_begin + blockDim.x * (j + unroll_times - 1); 515 if (last_col >= num_coeffs_to_reduce) { 516 for (Index col = col_begin + blockDim.x * j; col < num_coeffs_to_reduce; col += blockDim.x) { 517 const Type val = input.m_impl.coeff(row * num_coeffs_to_reduce + col); 518 reducer.reduce(val, &reduced_val); 519 } 520 break; 521 } else { 522 // Faster version of the loop with no branches after unrolling. 523 #pragma unroll 524 for (int k = 0; k < unroll_times; ++k) { 525 const Index col = col_begin + blockDim.x * (j + k); 526 reducer.reduce(input.m_impl.coeff(row * num_coeffs_to_reduce + col), &reduced_val); 527 } 528 } 529 } 530 531 #pragma unroll 532 for (int offset = warpSize/2; offset > 0; offset /= 2) { 533 #if defined(EIGEN_HIPCC) 534 // use std::is_floating_point to determine the type of reduced_val 535 // This is needed because when Type == double, hipcc will give a "call to __shfl_down is ambguous" error 536 // and list the float and int versions of __shfl_down as the candidate functions. 537 if (std::is_floating_point<Type>::value) { 538 reducer.reduce(__shfl_down(static_cast<float>(reduced_val), offset), &reduced_val); 539 } else { 540 reducer.reduce(__shfl_down(static_cast<int>(reduced_val), offset), &reduced_val); 541 } 542 #elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000 543 reducer.reduce(__shfl_down(reduced_val, offset), &reduced_val); 544 #else 545 reducer.reduce(__shfl_down_sync(0xFFFFFFFF, reduced_val, offset), &reduced_val); 546 #endif 547 } 548 549 if ((threadIdx.x & (warpSize - 1)) == 0) { 550 atomicReduce(&(output[row]), reduced_val, reducer); 551 } 552 } 553 } 554 #else // EIGEN_CUDA_ARCH >= 300 555 gpu_assert(0 && "Shouldn't be called on unsupported device"); 556 #endif // EIGEN_CUDA_ARCH >= 300 557 } 558 559 #ifdef EIGEN_HAS_GPU_FP16 560 561 template <int NumPerThread, typename Self, 562 typename Reducer, typename Index> 563 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs, 564 half* output) { 565 eigen_assert(blockDim.y == 1); 566 eigen_assert(blockDim.z == 1); 567 eigen_assert(gridDim.y == 1); 568 eigen_assert(gridDim.z == 1); 569 570 typedef typename packet_traits<Eigen::half>::type PacketType; 571 const int packet_width = unpacket_traits<PacketType>::size; 572 const int unroll_times = 16 / packet_width; 573 eigen_assert(NumPerThread % unroll_times == 0); 574 eigen_assert(unroll_times % 2 == 0); 575 576 const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread * 2); 577 const Index num_input_blocks = divup<Index>(input_col_blocks * num_preserved_coeffs, 2); 578 579 const Index num_threads = blockDim.x * gridDim.x; 580 const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x; 581 582 // Initialize the output values if they weren't initialized by the ReductionInitKernel 583 if (gridDim.x == 1) { 584 Index i = packet_width * thread_id; 585 for (; i + packet_width <= num_preserved_coeffs; 586 i += packet_width * num_threads) { 587 PacketType* poutput = reinterpret_cast<PacketType*>(output + i); 588 *poutput = reducer.template initializePacket<PacketType>(); 589 } 590 if (i < num_preserved_coeffs) { 591 output[i] = reducer.initialize(); 592 } 593 __syncthreads(); 594 } 595 596 for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) { 597 const Index row = 2 * (i / input_col_blocks); // everybody takes 2 rows 598 599 if (row + 1 < num_preserved_coeffs) { 600 const Index col_block = i % input_col_blocks; 601 const Index col_begin = 602 packet_width * (col_block * blockDim.x * NumPerThread + threadIdx.x); 603 604 PacketType reduced_val1 = reducer.template initializePacket<PacketType>(); 605 PacketType reduced_val2 = reducer.template initializePacket<PacketType>(); 606 607 for (Index j = 0; j < NumPerThread; j += unroll_times) { 608 const Index last_col = 609 col_begin + blockDim.x * (j + unroll_times - 1) * packet_width; 610 if (last_col >= num_coeffs_to_reduce) { 611 Index col = col_begin + blockDim.x * j; 612 for (; col + packet_width <= num_coeffs_to_reduce; 613 col += blockDim.x) { 614 const PacketType val1 = input.m_impl.template packet<Unaligned>( 615 row * num_coeffs_to_reduce + col); 616 reducer.reducePacket(val1, &reduced_val1); 617 const PacketType val2 = input.m_impl.template packet<Unaligned>( 618 (row + 1) * num_coeffs_to_reduce + col); 619 reducer.reducePacket(val2, &reduced_val2); 620 } 621 if (col < num_coeffs_to_reduce) { 622 PacketType r1 = reducer.template initializePacket<PacketType>(); 623 PacketType r2 = reducer.template initializePacket<PacketType>(); 624 half2* hr1 = reinterpret_cast<half2*>(&r1); 625 half2* hr2 = reinterpret_cast<half2*>(&r2); 626 while (col + 1 < num_coeffs_to_reduce) { 627 *hr1 = __halves2half2( 628 input.m_impl.coeff(row * num_coeffs_to_reduce + col), 629 input.m_impl.coeff(row * num_coeffs_to_reduce + col + 1)); 630 *hr2 = __halves2half2( 631 input.m_impl.coeff((row + 1) * num_coeffs_to_reduce + col), 632 input.m_impl.coeff((row + 1) * num_coeffs_to_reduce + col + 633 1)); 634 hr1++; 635 hr2++; 636 col += 2; 637 } 638 if (col < num_coeffs_to_reduce) { 639 // Peel; 640 const half last1 = 641 input.m_impl.coeff(row * num_coeffs_to_reduce + col); 642 *hr1 = __halves2half2(last1, reducer.initialize()); 643 const half last2 = 644 input.m_impl.coeff((row + 1) * num_coeffs_to_reduce + col); 645 *hr2 = __halves2half2(last2, reducer.initialize()); 646 } 647 reducer.reducePacket(r1, &reduced_val1); 648 reducer.reducePacket(r2, &reduced_val2); 649 } 650 break; 651 } else { 652 // Faster version of the loop with no branches after unrolling. 653 #pragma unroll 654 for (int k = 0; k < unroll_times; ++k) { 655 const Index col = col_begin + blockDim.x * (j + k) * packet_width; 656 reducer.reducePacket(input.m_impl.template packet<Unaligned>( 657 row * num_coeffs_to_reduce + col), 658 &reduced_val1); 659 reducer.reducePacket(input.m_impl.template packet<Unaligned>( 660 (row + 1) * num_coeffs_to_reduce + col), 661 &reduced_val2); 662 } 663 } 664 } 665 666 #pragma unroll 667 for (int offset = warpSize/2; offset > 0; offset /= 2) { 668 #if defined(EIGEN_HIPCC) 669 PacketType r1; 670 PacketType r2; 671 half2* hr1 = reinterpret_cast<half2*>(&r1); 672 half2* hr2 = reinterpret_cast<half2*>(&r2); 673 half2* rv1 = reinterpret_cast<half2*>(&reduced_val1); 674 half2* rv2 = reinterpret_cast<half2*>(&reduced_val2); 675 for (int i = 0; i < packet_width / 2; i++) { 676 // FIXME : remove this workaround once we have native half/half2 support for __shfl_down 677 union { int i; half2 h; } wka_in1, wka_out1; 678 wka_in1.h = rv1[i]; 679 wka_out1.i = __shfl_down(wka_in1.i, offset, warpSize); 680 hr1[i] = wka_out1.h; 681 682 union { int i; half2 h; } wka_in2, wka_out2; 683 wka_in2.h = rv2[i]; 684 wka_out2.i = __shfl_down(wka_in2.i, offset, warpSize); 685 hr2[i] = wka_out2.h; 686 } 687 reducer.reducePacket(r1, &reduced_val1); 688 reducer.reducePacket(r2, &reduced_val2); 689 #elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000 690 PacketType r1; 691 PacketType r2; 692 half2* hr1 = reinterpret_cast<half2*>(&r1); 693 half2* hr2 = reinterpret_cast<half2*>(&r2); 694 half2* rv1 = reinterpret_cast<half2*>(&reduced_val1); 695 half2* rv2 = reinterpret_cast<half2*>(&reduced_val2); 696 for (int i = 0; i < packet_width / 2; i++) { 697 hr1[i] = __shfl_down(rv1[i], offset, warpSize); 698 hr2[i] = __shfl_down(rv2[i], offset, warpSize); 699 } 700 reducer.reducePacket(r1, &reduced_val1); 701 reducer.reducePacket(r2, &reduced_val2); 702 #else 703 PacketType r1; 704 PacketType r2; 705 half2* hr1 = reinterpret_cast<half2*>(&r1); 706 half2* hr2 = reinterpret_cast<half2*>(&r2); 707 half2* rr1 = reinterpret_cast<half2*>(&reduced_val1); 708 half2* rr2 = reinterpret_cast<half2*>(&reduced_val2); 709 for (int i = 0; i < packet_width / 2; i++) { 710 hr1[i] = 711 __shfl_down_sync(0xFFFFFFFF, rr1[i], (unsigned)offset, warpSize); 712 hr2[i] = 713 __shfl_down_sync(0xFFFFFFFF, rr2[i], (unsigned)offset, warpSize); 714 } 715 reducer.reducePacket(r1, &reduced_val1); 716 reducer.reducePacket(r2, &reduced_val2); 717 718 #endif 719 } 720 half2* rv1 = reinterpret_cast<half2*>(&reduced_val1); 721 half2* rv2 = reinterpret_cast<half2*>(&reduced_val2); 722 half2 val; 723 if (packet_width > 2) { 724 reducer.reducePacket(rv1[2], rv1); 725 reducer.reducePacket(rv1[3], rv1 + 1); 726 reducer.reducePacket(rv1[1], rv1); 727 reducer.reducePacket(rv2[2], rv2); 728 reducer.reducePacket(rv2[3], rv2 + 1); 729 reducer.reducePacket(rv2[1], rv2); 730 } 731 half val1 = __low2half(*rv1); 732 reducer.reduce(__high2half(*rv1), &val1); 733 half val2 = __low2half(*rv2); 734 reducer.reduce(__high2half(*rv2), &val2); 735 val = __halves2half2(val1, val2); 736 if ((threadIdx.x & (warpSize - 1)) == 0) { 737 half* loc = output + row; 738 atomicReduce((half2*)loc, val, reducer); 739 } 740 } 741 } 742 } 743 744 #endif // EIGEN_HAS_GPU_FP16 745 746 template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void> 747 struct InnerReductionLauncher { 748 static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index, typename Self::Index) { 749 gpu_assert(false && "Should only be called to reduce doubles, floats and half floats on a gpu device"); 750 return true; 751 } 752 }; 753 754 // Specialization for float and double 755 template <typename Self, typename Op, typename OutputType, bool PacketAccess> 756 struct InnerReductionLauncher< 757 Self, Op, OutputType, PacketAccess, 758 typename internal::enable_if< 759 internal::is_same<float, OutputType>::value || 760 internal::is_same<double, OutputType>::value, 761 void>::type> { 762 static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) { 763 typedef typename Self::Index Index; 764 765 const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals; 766 const int block_size = 256; 767 const int num_per_thread = 128; 768 const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread); 769 const int max_blocks = device.getNumGpuMultiProcessors() * 770 device.maxGpuThreadsPerMultiProcessor() / block_size; 771 const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks); 772 773 if (num_blocks > 1) { 774 // We initialize the outputs outside the reduction kernel when we can't be sure that there 775 // won't be a race conditions between multiple thread blocks. 776 const int dyn_blocks = divup<int>(num_preserved_vals, 1024); 777 const int max_blocks = device.getNumGpuMultiProcessors() * 778 device.maxGpuThreadsPerMultiProcessor() / 1024; 779 const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks); 780 LAUNCH_GPU_KERNEL((ReductionInitKernel<OutputType, Index>), 781 num_blocks, 1024, 0, device, reducer.initialize(), 782 num_preserved_vals, output); 783 } 784 785 LAUNCH_GPU_KERNEL((InnerReductionKernel<num_per_thread, Self, Op, Index>), 786 num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output); 787 788 return false; 789 } 790 }; 791 792 #ifdef EIGEN_HAS_GPU_FP16 793 template <typename Self, typename Op> 794 struct InnerReductionLauncher<Self, Op, Eigen::half, false> { 795 static bool run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index, typename Self::Index) { 796 gpu_assert(false && "Should not be called since there is no packet accessor"); 797 return true; 798 } 799 }; 800 801 template <typename Self, typename Op> 802 struct InnerReductionLauncher<Self, Op, Eigen::half, true> { 803 static bool run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) { 804 typedef typename Self::Index Index; 805 806 if (num_preserved_vals % 2 != 0) { 807 // Not supported yet, revert to the slower code path 808 return true; 809 } 810 811 const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals; 812 const int block_size = /*256*/128; 813 const int num_per_thread = /*128*/64; 814 const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread); 815 const int max_blocks = device.getNumGpuMultiProcessors() * 816 device.maxGpuThreadsPerMultiProcessor() / block_size; 817 const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks); 818 819 if (num_blocks > 1) { 820 // We initialize the outputs outside the reduction kernel when we can't be sure that there 821 // won't be a race conditions between multiple thread blocks. 822 LAUNCH_GPU_KERNEL((ReductionInitKernelHalfFloat<Self, Op, Index>), 823 1, 1, 0, device, reducer, self, num_preserved_vals, output); 824 } 825 826 LAUNCH_GPU_KERNEL((InnerReductionKernelHalfFloat<num_per_thread, Self, Op, Index>), 827 num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output); 828 829 return false; 830 } 831 }; 832 #endif // EIGEN_HAS_GPU_FP16 833 834 835 template <typename Self, typename Op> 836 struct InnerReducer<Self, Op, GpuDevice> { 837 // Unfortunately nvidia doesn't support well exotic types such as complex, 838 // so reduce the scope of the optimized version of the code to the simple case 839 // of floats and half floats. 840 #ifdef EIGEN_HAS_GPU_FP16 841 static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful && 842 (internal::is_same<typename Self::CoeffReturnType, float>::value || 843 internal::is_same<typename Self::CoeffReturnType, double>::value || 844 (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess)); 845 #else // EIGEN_HAS_GPU_FP16 846 static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful && 847 (internal::is_same<typename Self::CoeffReturnType, float>::value || 848 internal::is_same<typename Self::CoeffReturnType, double>::value); 849 #endif // EIGEN_HAS_GPU_FP16 850 851 template <typename OutputType> 852 static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) { 853 gpu_assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats"); 854 const Index num_coeffs = array_prod(self.m_impl.dimensions()); 855 // Don't crash when we're called with an input tensor of size 0. 856 if (num_coeffs == 0) { 857 return true; 858 } 859 // It's faster to use the usual code. 860 if (num_coeffs_to_reduce <= 128) { 861 return true; 862 } 863 864 return InnerReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs_to_reduce, num_preserved_vals); 865 } 866 }; 867 868 template <int NumPerThread, typename Self, 869 typename Reducer, typename Index> 870 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void OuterReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs, 871 typename Self::CoeffReturnType* output) { 872 const Index num_threads = blockDim.x * gridDim.x; 873 const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x; 874 // Initialize the output values if they weren't initialized by the ReductionInitKernel 875 if (gridDim.x == 1) { 876 for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) { 877 output[i] = reducer.initialize(); 878 } 879 __syncthreads(); 880 } 881 882 // Do the reduction. 883 const Index max_iter = num_preserved_coeffs * divup<Index>(num_coeffs_to_reduce, NumPerThread); 884 for (Index i = thread_id; i < max_iter; i += num_threads) { 885 const Index input_col = i % num_preserved_coeffs; 886 const Index input_row = (i / num_preserved_coeffs) * NumPerThread; 887 typename Self::CoeffReturnType reduced_val = reducer.initialize(); 888 const Index max_row = numext::mini(input_row + NumPerThread, num_coeffs_to_reduce); 889 for (Index j = input_row; j < max_row; j++) { 890 typename Self::CoeffReturnType val = input.m_impl.coeff(j * num_preserved_coeffs + input_col); 891 reducer.reduce(val, &reduced_val); 892 } 893 atomicReduce(&(output[input_col]), reduced_val, reducer); 894 } 895 } 896 897 898 template <typename Self, typename Op> 899 struct OuterReducer<Self, Op, GpuDevice> { 900 // Unfortunately nvidia doesn't support well exotic types such as complex, 901 // so reduce the scope of the optimized version of the code to the simple case 902 // of floats. 903 static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful && 904 (internal::is_same<typename Self::CoeffReturnType, float>::value || 905 internal::is_same<typename Self::CoeffReturnType, double>::value); 906 template <typename Device, typename OutputType> 907 static 908 #if !defined(EIGEN_HIPCC) 909 // FIXME : leaving this EIGEN_DEVICE_FUNC in, results in the following runtime error 910 // (in the cxx11_tensor_reduction_gpu test) 911 // 912 // terminate called after throwing an instance of 'std::runtime_error' 913 // what(): No device code available for function: _ZN5Eigen8internal20OuterReductionKernelIL... 914 // 915 // don't know why this happens (and why is it a runtime error instead of a compile time error) 916 // 917 // this will be fixed by HIP PR#457 918 EIGEN_DEVICE_FUNC 919 #endif 920 bool run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) { 921 gpu_assert(false && "Should only be called to reduce doubles or floats on a gpu device"); 922 return true; 923 } 924 925 static bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) { 926 typedef typename Self::Index Index; 927 928 // It's faster to use the usual code. 929 if (num_coeffs_to_reduce <= 32) { 930 return true; 931 } 932 933 const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals; 934 const int block_size = 256; 935 const int num_per_thread = 16; 936 const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread); 937 const int max_blocks = device.getNumGpuMultiProcessors() * 938 device.maxGpuThreadsPerMultiProcessor() / block_size; 939 const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks); 940 941 if (num_blocks > 1) { 942 // We initialize the outputs in the reduction kernel itself when we don't have to worry 943 // about race conditions between multiple thread blocks. 944 const int dyn_blocks = divup<int>(num_preserved_vals, 1024); 945 const int max_blocks = device.getNumGpuMultiProcessors() * 946 device.maxGpuThreadsPerMultiProcessor() / 1024; 947 const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks); 948 LAUNCH_GPU_KERNEL((ReductionInitKernel<float, Index>), 949 num_blocks, 1024, 0, device, reducer.initialize(), 950 num_preserved_vals, output); 951 } 952 953 LAUNCH_GPU_KERNEL((OuterReductionKernel<num_per_thread, Self, Op, Index>), 954 num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output); 955 956 return false; 957 } 958 }; 959 960 #endif // defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC) 961 962 963 } // end namespace internal 964 } // end namespace Eigen 965 966 #endif // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H