TensorDeviceGpu.h (12837B)
1 // This file is part of Eigen, a lightweight C++ template library 2 // for linear algebra. 3 // 4 // Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> 5 // 6 // This Source Code Form is subject to the terms of the Mozilla 7 // Public License v. 2.0. If a copy of the MPL was not distributed 8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 9 10 #if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_GPU_H) 11 #define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_GPU_H 12 13 // This header file container defines fo gpu* macros which will resolve to 14 // their equivalent hip* or cuda* versions depending on the compiler in use 15 // A separate header (included at the end of this file) will undefine all 16 #include "TensorGpuHipCudaDefines.h" 17 18 namespace Eigen { 19 20 static const int kGpuScratchSize = 1024; 21 22 // This defines an interface that GPUDevice can take to use 23 // HIP / CUDA streams underneath. 24 class StreamInterface { 25 public: 26 virtual ~StreamInterface() {} 27 28 virtual const gpuStream_t& stream() const = 0; 29 virtual const gpuDeviceProp_t& deviceProperties() const = 0; 30 31 // Allocate memory on the actual device where the computation will run 32 virtual void* allocate(size_t num_bytes) const = 0; 33 virtual void deallocate(void* buffer) const = 0; 34 35 // Return a scratchpad buffer of size 1k 36 virtual void* scratchpad() const = 0; 37 38 // Return a semaphore. The semaphore is initially initialized to 0, and 39 // each kernel using it is responsible for resetting to 0 upon completion 40 // to maintain the invariant that the semaphore is always equal to 0 upon 41 // each kernel start. 42 virtual unsigned int* semaphore() const = 0; 43 }; 44 45 class GpuDeviceProperties { 46 public: 47 GpuDeviceProperties() : 48 initialized_(false), first_(true), device_properties_(nullptr) {} 49 50 ~GpuDeviceProperties() { 51 if (device_properties_) { 52 delete[] device_properties_; 53 } 54 } 55 56 EIGEN_STRONG_INLINE const gpuDeviceProp_t& get(int device) const { 57 return device_properties_[device]; 58 } 59 60 EIGEN_STRONG_INLINE bool isInitialized() const { 61 return initialized_; 62 } 63 64 void initialize() { 65 if (!initialized_) { 66 // Attempts to ensure proper behavior in the case of multiple threads 67 // calling this function simultaneously. This would be trivial to 68 // implement if we could use std::mutex, but unfortunately mutex don't 69 // compile with nvcc, so we resort to atomics and thread fences instead. 70 // Note that if the caller uses a compiler that doesn't support c++11 we 71 // can't ensure that the initialization is thread safe. 72 if (first_.exchange(false)) { 73 // We're the first thread to reach this point. 74 int num_devices; 75 gpuError_t status = gpuGetDeviceCount(&num_devices); 76 if (status != gpuSuccess) { 77 std::cerr << "Failed to get the number of GPU devices: " 78 << gpuGetErrorString(status) 79 << std::endl; 80 gpu_assert(status == gpuSuccess); 81 } 82 device_properties_ = new gpuDeviceProp_t[num_devices]; 83 for (int i = 0; i < num_devices; ++i) { 84 status = gpuGetDeviceProperties(&device_properties_[i], i); 85 if (status != gpuSuccess) { 86 std::cerr << "Failed to initialize GPU device #" 87 << i 88 << ": " 89 << gpuGetErrorString(status) 90 << std::endl; 91 gpu_assert(status == gpuSuccess); 92 } 93 } 94 95 std::atomic_thread_fence(std::memory_order_release); 96 initialized_ = true; 97 } else { 98 // Wait for the other thread to inititialize the properties. 99 while (!initialized_) { 100 std::atomic_thread_fence(std::memory_order_acquire); 101 std::this_thread::sleep_for(std::chrono::milliseconds(1000)); 102 } 103 } 104 } 105 } 106 107 private: 108 volatile bool initialized_; 109 std::atomic<bool> first_; 110 gpuDeviceProp_t* device_properties_; 111 }; 112 113 EIGEN_ALWAYS_INLINE const GpuDeviceProperties& GetGpuDeviceProperties() { 114 static GpuDeviceProperties* deviceProperties = new GpuDeviceProperties(); 115 if (!deviceProperties->isInitialized()) { 116 deviceProperties->initialize(); 117 } 118 return *deviceProperties; 119 } 120 121 EIGEN_ALWAYS_INLINE const gpuDeviceProp_t& GetGpuDeviceProperties(int device) { 122 return GetGpuDeviceProperties().get(device); 123 } 124 125 static const gpuStream_t default_stream = gpuStreamDefault; 126 127 class GpuStreamDevice : public StreamInterface { 128 public: 129 // Use the default stream on the current device 130 GpuStreamDevice() : stream_(&default_stream), scratch_(NULL), semaphore_(NULL) { 131 gpuGetDevice(&device_); 132 } 133 // Use the default stream on the specified device 134 GpuStreamDevice(int device) : stream_(&default_stream), device_(device), scratch_(NULL), semaphore_(NULL) {} 135 // Use the specified stream. Note that it's the 136 // caller responsibility to ensure that the stream can run on 137 // the specified device. If no device is specified the code 138 // assumes that the stream is associated to the current gpu device. 139 GpuStreamDevice(const gpuStream_t* stream, int device = -1) 140 : stream_(stream), device_(device), scratch_(NULL), semaphore_(NULL) { 141 if (device < 0) { 142 gpuGetDevice(&device_); 143 } else { 144 int num_devices; 145 gpuError_t err = gpuGetDeviceCount(&num_devices); 146 EIGEN_UNUSED_VARIABLE(err) 147 gpu_assert(err == gpuSuccess); 148 gpu_assert(device < num_devices); 149 device_ = device; 150 } 151 } 152 153 virtual ~GpuStreamDevice() { 154 if (scratch_) { 155 deallocate(scratch_); 156 } 157 } 158 159 const gpuStream_t& stream() const { return *stream_; } 160 const gpuDeviceProp_t& deviceProperties() const { 161 return GetGpuDeviceProperties(device_); 162 } 163 virtual void* allocate(size_t num_bytes) const { 164 gpuError_t err = gpuSetDevice(device_); 165 EIGEN_UNUSED_VARIABLE(err) 166 gpu_assert(err == gpuSuccess); 167 void* result; 168 err = gpuMalloc(&result, num_bytes); 169 gpu_assert(err == gpuSuccess); 170 gpu_assert(result != NULL); 171 return result; 172 } 173 virtual void deallocate(void* buffer) const { 174 gpuError_t err = gpuSetDevice(device_); 175 EIGEN_UNUSED_VARIABLE(err) 176 gpu_assert(err == gpuSuccess); 177 gpu_assert(buffer != NULL); 178 err = gpuFree(buffer); 179 gpu_assert(err == gpuSuccess); 180 } 181 182 virtual void* scratchpad() const { 183 if (scratch_ == NULL) { 184 scratch_ = allocate(kGpuScratchSize + sizeof(unsigned int)); 185 } 186 return scratch_; 187 } 188 189 virtual unsigned int* semaphore() const { 190 if (semaphore_ == NULL) { 191 char* scratch = static_cast<char*>(scratchpad()) + kGpuScratchSize; 192 semaphore_ = reinterpret_cast<unsigned int*>(scratch); 193 gpuError_t err = gpuMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_); 194 EIGEN_UNUSED_VARIABLE(err) 195 gpu_assert(err == gpuSuccess); 196 } 197 return semaphore_; 198 } 199 200 private: 201 const gpuStream_t* stream_; 202 int device_; 203 mutable void* scratch_; 204 mutable unsigned int* semaphore_; 205 }; 206 207 struct GpuDevice { 208 // The StreamInterface is not owned: the caller is 209 // responsible for its initialization and eventual destruction. 210 explicit GpuDevice(const StreamInterface* stream) : stream_(stream), max_blocks_(INT_MAX) { 211 eigen_assert(stream); 212 } 213 explicit GpuDevice(const StreamInterface* stream, int num_blocks) : stream_(stream), max_blocks_(num_blocks) { 214 eigen_assert(stream); 215 } 216 // TODO(bsteiner): This is an internal API, we should not expose it. 217 EIGEN_STRONG_INLINE const gpuStream_t& stream() const { 218 return stream_->stream(); 219 } 220 221 EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { 222 return stream_->allocate(num_bytes); 223 } 224 225 EIGEN_STRONG_INLINE void deallocate(void* buffer) const { 226 stream_->deallocate(buffer); 227 } 228 229 EIGEN_STRONG_INLINE void* allocate_temp(size_t num_bytes) const { 230 return stream_->allocate(num_bytes); 231 } 232 233 EIGEN_STRONG_INLINE void deallocate_temp(void* buffer) const { 234 stream_->deallocate(buffer); 235 } 236 237 template<typename Type> 238 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Type get(Type data) const { 239 return data; 240 } 241 242 EIGEN_STRONG_INLINE void* scratchpad() const { 243 return stream_->scratchpad(); 244 } 245 246 EIGEN_STRONG_INLINE unsigned int* semaphore() const { 247 return stream_->semaphore(); 248 } 249 250 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const { 251 #ifndef EIGEN_GPU_COMPILE_PHASE 252 gpuError_t err = gpuMemcpyAsync(dst, src, n, gpuMemcpyDeviceToDevice, 253 stream_->stream()); 254 EIGEN_UNUSED_VARIABLE(err) 255 gpu_assert(err == gpuSuccess); 256 #else 257 EIGEN_UNUSED_VARIABLE(dst); 258 EIGEN_UNUSED_VARIABLE(src); 259 EIGEN_UNUSED_VARIABLE(n); 260 eigen_assert(false && "The default device should be used instead to generate kernel code"); 261 #endif 262 } 263 264 EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const { 265 gpuError_t err = 266 gpuMemcpyAsync(dst, src, n, gpuMemcpyHostToDevice, stream_->stream()); 267 EIGEN_UNUSED_VARIABLE(err) 268 gpu_assert(err == gpuSuccess); 269 } 270 271 EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const { 272 gpuError_t err = 273 gpuMemcpyAsync(dst, src, n, gpuMemcpyDeviceToHost, stream_->stream()); 274 EIGEN_UNUSED_VARIABLE(err) 275 gpu_assert(err == gpuSuccess); 276 } 277 278 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const { 279 #ifndef EIGEN_GPU_COMPILE_PHASE 280 gpuError_t err = gpuMemsetAsync(buffer, c, n, stream_->stream()); 281 EIGEN_UNUSED_VARIABLE(err) 282 gpu_assert(err == gpuSuccess); 283 #else 284 eigen_assert(false && "The default device should be used instead to generate kernel code"); 285 #endif 286 } 287 288 EIGEN_STRONG_INLINE size_t numThreads() const { 289 // FIXME 290 return 32; 291 } 292 293 EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { 294 // FIXME 295 return 48*1024; 296 } 297 298 EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const { 299 // We won't try to take advantage of the l2 cache for the time being, and 300 // there is no l3 cache on hip/cuda devices. 301 return firstLevelCacheSize(); 302 } 303 304 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const { 305 #ifndef EIGEN_GPU_COMPILE_PHASE 306 gpuError_t err = gpuStreamSynchronize(stream_->stream()); 307 if (err != gpuSuccess) { 308 std::cerr << "Error detected in GPU stream: " 309 << gpuGetErrorString(err) 310 << std::endl; 311 gpu_assert(err == gpuSuccess); 312 } 313 #else 314 gpu_assert(false && "The default device should be used instead to generate kernel code"); 315 #endif 316 } 317 318 EIGEN_STRONG_INLINE int getNumGpuMultiProcessors() const { 319 return stream_->deviceProperties().multiProcessorCount; 320 } 321 EIGEN_STRONG_INLINE int maxGpuThreadsPerBlock() const { 322 return stream_->deviceProperties().maxThreadsPerBlock; 323 } 324 EIGEN_STRONG_INLINE int maxGpuThreadsPerMultiProcessor() const { 325 return stream_->deviceProperties().maxThreadsPerMultiProcessor; 326 } 327 EIGEN_STRONG_INLINE int sharedMemPerBlock() const { 328 return stream_->deviceProperties().sharedMemPerBlock; 329 } 330 EIGEN_STRONG_INLINE int majorDeviceVersion() const { 331 return stream_->deviceProperties().major; 332 } 333 EIGEN_STRONG_INLINE int minorDeviceVersion() const { 334 return stream_->deviceProperties().minor; 335 } 336 337 EIGEN_STRONG_INLINE int maxBlocks() const { 338 return max_blocks_; 339 } 340 341 // This function checks if the GPU runtime recorded an error for the 342 // underlying stream device. 343 inline bool ok() const { 344 #ifdef EIGEN_GPUCC 345 gpuError_t error = gpuStreamQuery(stream_->stream()); 346 return (error == gpuSuccess) || (error == gpuErrorNotReady); 347 #else 348 return false; 349 #endif 350 } 351 352 private: 353 const StreamInterface* stream_; 354 int max_blocks_; 355 }; 356 357 #if defined(EIGEN_HIPCC) 358 359 #define LAUNCH_GPU_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \ 360 hipLaunchKernelGGL(kernel, dim3(gridsize), dim3(blocksize), (sharedmem), (device).stream(), __VA_ARGS__); \ 361 gpu_assert(hipGetLastError() == hipSuccess); 362 363 #else 364 365 #define LAUNCH_GPU_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \ 366 (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__); \ 367 gpu_assert(cudaGetLastError() == cudaSuccess); 368 369 #endif 370 371 // FIXME: Should be device and kernel specific. 372 #ifdef EIGEN_GPUCC 373 static EIGEN_DEVICE_FUNC inline void setGpuSharedMemConfig(gpuSharedMemConfig config) { 374 #ifndef EIGEN_GPU_COMPILE_PHASE 375 gpuError_t status = gpuDeviceSetSharedMemConfig(config); 376 EIGEN_UNUSED_VARIABLE(status) 377 gpu_assert(status == gpuSuccess); 378 #else 379 EIGEN_UNUSED_VARIABLE(config) 380 #endif 381 } 382 #endif 383 384 } // end namespace Eigen 385 386 // undefine all the gpu* macros we defined at the beginning of the file 387 #include "TensorGpuHipCudaUndefines.h" 388 389 #endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_GPU_H