cart-elc

Source code for CART-ELC
git clone git://git.laack.co/cart-elc.git
Log | Files | Refs | README | LICENSE

TensorDeviceGpu.h (12837B)


      1 // This file is part of Eigen, a lightweight C++ template library
      2 // for linear algebra.
      3 //
      4 // Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
      5 //
      6 // This Source Code Form is subject to the terms of the Mozilla
      7 // Public License v. 2.0. If a copy of the MPL was not distributed
      8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
      9 
     10 #if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_GPU_H)
     11 #define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_GPU_H
     12 
     13 // This header file container defines fo gpu* macros which will resolve to
     14 // their equivalent hip* or cuda* versions depending on the compiler in use
     15 // A separate header (included at the end of this file) will undefine all 
     16 #include "TensorGpuHipCudaDefines.h"
     17 
     18 namespace Eigen {
     19 
     20 static const int kGpuScratchSize = 1024;
     21 
     22 // This defines an interface that GPUDevice can take to use
     23 // HIP / CUDA streams underneath.
     24 class StreamInterface {
     25  public:
     26   virtual ~StreamInterface() {}
     27 
     28   virtual const gpuStream_t& stream() const = 0;
     29   virtual const gpuDeviceProp_t& deviceProperties() const = 0;
     30 
     31   // Allocate memory on the actual device where the computation will run
     32   virtual void* allocate(size_t num_bytes) const = 0;
     33   virtual void deallocate(void* buffer) const = 0;
     34 
     35   // Return a scratchpad buffer of size 1k
     36   virtual void* scratchpad() const = 0;
     37 
     38   // Return a semaphore. The semaphore is initially initialized to 0, and
     39   // each kernel using it is responsible for resetting to 0 upon completion
     40   // to maintain the invariant that the semaphore is always equal to 0 upon
     41   // each kernel start.
     42   virtual unsigned int* semaphore() const = 0;
     43 };
     44 
     45 class GpuDeviceProperties {
     46  public:
     47   GpuDeviceProperties() : 
     48       initialized_(false), first_(true), device_properties_(nullptr) {}
     49  
     50   ~GpuDeviceProperties() {
     51     if (device_properties_) {
     52       delete[] device_properties_;
     53     }
     54   }
     55   
     56   EIGEN_STRONG_INLINE const gpuDeviceProp_t& get(int device) const {
     57     return device_properties_[device];
     58   }
     59 
     60   EIGEN_STRONG_INLINE bool isInitialized() const {
     61     return initialized_;
     62   }
     63 
     64   void initialize() {
     65     if (!initialized_) {
     66       // Attempts to ensure proper behavior in the case of multiple threads
     67       // calling this function simultaneously. This would be trivial to
     68       // implement if we could use std::mutex, but unfortunately mutex don't
     69       // compile with nvcc, so we resort to atomics and thread fences instead.
     70       // Note that if the caller uses a compiler that doesn't support c++11 we
     71       // can't ensure that the initialization is thread safe.
     72       if (first_.exchange(false)) {
     73         // We're the first thread to reach this point.
     74         int num_devices;
     75         gpuError_t status = gpuGetDeviceCount(&num_devices);
     76         if (status != gpuSuccess) {
     77           std::cerr << "Failed to get the number of GPU devices: "
     78                     << gpuGetErrorString(status)
     79                     << std::endl;
     80           gpu_assert(status == gpuSuccess);
     81         }
     82         device_properties_ = new gpuDeviceProp_t[num_devices];
     83         for (int i = 0; i < num_devices; ++i) {
     84           status = gpuGetDeviceProperties(&device_properties_[i], i);
     85           if (status != gpuSuccess) {
     86             std::cerr << "Failed to initialize GPU device #"
     87                       << i
     88                       << ": "
     89                       << gpuGetErrorString(status)
     90                       << std::endl;
     91             gpu_assert(status == gpuSuccess);
     92           }
     93         }
     94 
     95         std::atomic_thread_fence(std::memory_order_release);
     96         initialized_ = true;
     97       } else {
     98         // Wait for the other thread to inititialize the properties.
     99         while (!initialized_) {
    100           std::atomic_thread_fence(std::memory_order_acquire);
    101           std::this_thread::sleep_for(std::chrono::milliseconds(1000));
    102         }
    103       }
    104     }
    105   }
    106 
    107  private:
    108   volatile bool initialized_;
    109   std::atomic<bool> first_;
    110   gpuDeviceProp_t* device_properties_;
    111 };
    112 
    113 EIGEN_ALWAYS_INLINE const GpuDeviceProperties& GetGpuDeviceProperties() {
    114   static GpuDeviceProperties* deviceProperties = new GpuDeviceProperties();
    115   if (!deviceProperties->isInitialized()) {
    116     deviceProperties->initialize();
    117   }
    118   return *deviceProperties;
    119 }
    120 
    121 EIGEN_ALWAYS_INLINE const gpuDeviceProp_t& GetGpuDeviceProperties(int device) {
    122   return GetGpuDeviceProperties().get(device);
    123 }
    124 
    125 static const gpuStream_t default_stream = gpuStreamDefault;
    126 
    127 class GpuStreamDevice : public StreamInterface {
    128  public:
    129   // Use the default stream on the current device
    130   GpuStreamDevice() : stream_(&default_stream), scratch_(NULL), semaphore_(NULL) {
    131     gpuGetDevice(&device_);
    132   }
    133   // Use the default stream on the specified device
    134   GpuStreamDevice(int device) : stream_(&default_stream), device_(device), scratch_(NULL), semaphore_(NULL) {}
    135   // Use the specified stream. Note that it's the
    136   // caller responsibility to ensure that the stream can run on
    137   // the specified device. If no device is specified the code
    138   // assumes that the stream is associated to the current gpu device.
    139   GpuStreamDevice(const gpuStream_t* stream, int device = -1)
    140       : stream_(stream), device_(device), scratch_(NULL), semaphore_(NULL) {
    141     if (device < 0) {
    142       gpuGetDevice(&device_);
    143     } else {
    144       int num_devices;
    145       gpuError_t err = gpuGetDeviceCount(&num_devices);
    146       EIGEN_UNUSED_VARIABLE(err)
    147       gpu_assert(err == gpuSuccess);
    148       gpu_assert(device < num_devices);
    149       device_ = device;
    150     }
    151   }
    152 
    153   virtual ~GpuStreamDevice() {
    154     if (scratch_) {
    155       deallocate(scratch_);
    156     }
    157   }
    158 
    159   const gpuStream_t& stream() const { return *stream_; }
    160   const gpuDeviceProp_t& deviceProperties() const {
    161     return GetGpuDeviceProperties(device_);
    162   }
    163   virtual void* allocate(size_t num_bytes) const {
    164     gpuError_t err = gpuSetDevice(device_);
    165     EIGEN_UNUSED_VARIABLE(err)
    166     gpu_assert(err == gpuSuccess);
    167     void* result;
    168     err = gpuMalloc(&result, num_bytes);
    169     gpu_assert(err == gpuSuccess);
    170     gpu_assert(result != NULL);
    171     return result;
    172   }
    173   virtual void deallocate(void* buffer) const {
    174     gpuError_t err = gpuSetDevice(device_);
    175     EIGEN_UNUSED_VARIABLE(err)
    176     gpu_assert(err == gpuSuccess);
    177     gpu_assert(buffer != NULL);
    178     err = gpuFree(buffer);
    179     gpu_assert(err == gpuSuccess);
    180   }
    181 
    182   virtual void* scratchpad() const {
    183     if (scratch_ == NULL) {
    184       scratch_ = allocate(kGpuScratchSize + sizeof(unsigned int));
    185     }
    186     return scratch_;
    187   }
    188 
    189   virtual unsigned int* semaphore() const {
    190     if (semaphore_ == NULL) {
    191       char* scratch = static_cast<char*>(scratchpad()) + kGpuScratchSize;
    192       semaphore_ = reinterpret_cast<unsigned int*>(scratch);
    193       gpuError_t err = gpuMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_);
    194       EIGEN_UNUSED_VARIABLE(err)
    195       gpu_assert(err == gpuSuccess);
    196     }
    197     return semaphore_;
    198   }
    199 
    200  private:
    201   const gpuStream_t* stream_;
    202   int device_;
    203   mutable void* scratch_;
    204   mutable unsigned int* semaphore_;
    205 };
    206 
    207 struct GpuDevice {
    208   // The StreamInterface is not owned: the caller is
    209   // responsible for its initialization and eventual destruction.
    210   explicit GpuDevice(const StreamInterface* stream) : stream_(stream), max_blocks_(INT_MAX) {
    211     eigen_assert(stream);
    212   }
    213   explicit GpuDevice(const StreamInterface* stream, int num_blocks) : stream_(stream), max_blocks_(num_blocks) {
    214     eigen_assert(stream);
    215   }
    216   // TODO(bsteiner): This is an internal API, we should not expose it.
    217   EIGEN_STRONG_INLINE const gpuStream_t& stream() const {
    218     return stream_->stream();
    219   }
    220 
    221   EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
    222     return stream_->allocate(num_bytes);
    223   }
    224 
    225   EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
    226     stream_->deallocate(buffer);
    227   }
    228 
    229   EIGEN_STRONG_INLINE void* allocate_temp(size_t num_bytes) const {
    230     return stream_->allocate(num_bytes);
    231   }
    232 
    233   EIGEN_STRONG_INLINE void deallocate_temp(void* buffer) const {
    234     stream_->deallocate(buffer);
    235   }
    236 
    237   template<typename Type>
    238   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Type get(Type data) const { 
    239     return data;
    240   }
    241 
    242   EIGEN_STRONG_INLINE void* scratchpad() const {
    243     return stream_->scratchpad();
    244   }
    245 
    246   EIGEN_STRONG_INLINE unsigned int* semaphore() const {
    247     return stream_->semaphore();
    248   }
    249 
    250   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
    251 #ifndef EIGEN_GPU_COMPILE_PHASE
    252     gpuError_t err = gpuMemcpyAsync(dst, src, n, gpuMemcpyDeviceToDevice,
    253                                       stream_->stream());
    254     EIGEN_UNUSED_VARIABLE(err)
    255     gpu_assert(err == gpuSuccess);
    256 #else
    257     EIGEN_UNUSED_VARIABLE(dst);
    258     EIGEN_UNUSED_VARIABLE(src);
    259     EIGEN_UNUSED_VARIABLE(n);
    260     eigen_assert(false && "The default device should be used instead to generate kernel code");
    261 #endif
    262   }
    263 
    264   EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
    265     gpuError_t err =
    266         gpuMemcpyAsync(dst, src, n, gpuMemcpyHostToDevice, stream_->stream());
    267     EIGEN_UNUSED_VARIABLE(err)
    268     gpu_assert(err == gpuSuccess);
    269   }
    270 
    271   EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const {
    272     gpuError_t err =
    273         gpuMemcpyAsync(dst, src, n, gpuMemcpyDeviceToHost, stream_->stream());
    274     EIGEN_UNUSED_VARIABLE(err)
    275     gpu_assert(err == gpuSuccess);
    276   }
    277 
    278   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
    279 #ifndef EIGEN_GPU_COMPILE_PHASE
    280     gpuError_t err = gpuMemsetAsync(buffer, c, n, stream_->stream());
    281     EIGEN_UNUSED_VARIABLE(err)
    282     gpu_assert(err == gpuSuccess);
    283 #else
    284   eigen_assert(false && "The default device should be used instead to generate kernel code");
    285 #endif
    286   }
    287 
    288   EIGEN_STRONG_INLINE size_t numThreads() const {
    289     // FIXME
    290     return 32;
    291   }
    292 
    293   EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
    294     // FIXME
    295     return 48*1024;
    296   }
    297 
    298   EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
    299     // We won't try to take advantage of the l2 cache for the time being, and
    300     // there is no l3 cache on hip/cuda devices.
    301     return firstLevelCacheSize();
    302   }
    303 
    304   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const {
    305 #ifndef EIGEN_GPU_COMPILE_PHASE
    306     gpuError_t err = gpuStreamSynchronize(stream_->stream());
    307     if (err != gpuSuccess) {
    308       std::cerr << "Error detected in GPU stream: "
    309                 << gpuGetErrorString(err)
    310                 << std::endl;
    311       gpu_assert(err == gpuSuccess);
    312     }
    313 #else
    314     gpu_assert(false && "The default device should be used instead to generate kernel code");
    315 #endif
    316   }
    317 
    318   EIGEN_STRONG_INLINE int getNumGpuMultiProcessors() const {
    319     return stream_->deviceProperties().multiProcessorCount;
    320   }
    321   EIGEN_STRONG_INLINE int maxGpuThreadsPerBlock() const {
    322     return stream_->deviceProperties().maxThreadsPerBlock;
    323   }
    324   EIGEN_STRONG_INLINE int maxGpuThreadsPerMultiProcessor() const {
    325     return stream_->deviceProperties().maxThreadsPerMultiProcessor;
    326   }
    327   EIGEN_STRONG_INLINE int sharedMemPerBlock() const {
    328     return stream_->deviceProperties().sharedMemPerBlock;
    329   }
    330   EIGEN_STRONG_INLINE int majorDeviceVersion() const {
    331     return stream_->deviceProperties().major;
    332   }
    333   EIGEN_STRONG_INLINE int minorDeviceVersion() const {
    334     return stream_->deviceProperties().minor;
    335   }
    336 
    337   EIGEN_STRONG_INLINE int maxBlocks() const {
    338     return max_blocks_;
    339   }
    340 
    341   // This function checks if the GPU runtime recorded an error for the
    342   // underlying stream device.
    343   inline bool ok() const {
    344 #ifdef EIGEN_GPUCC
    345     gpuError_t error = gpuStreamQuery(stream_->stream());
    346     return (error == gpuSuccess) || (error == gpuErrorNotReady);
    347 #else
    348     return false;
    349 #endif
    350   }
    351 
    352  private:
    353   const StreamInterface* stream_;
    354   int max_blocks_;
    355 };
    356 
    357 #if defined(EIGEN_HIPCC)
    358 
    359 #define LAUNCH_GPU_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...)             \
    360   hipLaunchKernelGGL(kernel, dim3(gridsize), dim3(blocksize), (sharedmem), (device).stream(), __VA_ARGS__); \
    361   gpu_assert(hipGetLastError() == hipSuccess);
    362 
    363 #else
    364  
    365 #define LAUNCH_GPU_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...)             \
    366   (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__);   \
    367   gpu_assert(cudaGetLastError() == cudaSuccess);
    368 
    369 #endif
    370  
    371 // FIXME: Should be device and kernel specific.
    372 #ifdef EIGEN_GPUCC
    373 static EIGEN_DEVICE_FUNC inline void setGpuSharedMemConfig(gpuSharedMemConfig config) {
    374 #ifndef EIGEN_GPU_COMPILE_PHASE
    375   gpuError_t status = gpuDeviceSetSharedMemConfig(config);
    376   EIGEN_UNUSED_VARIABLE(status)
    377   gpu_assert(status == gpuSuccess);
    378 #else
    379   EIGEN_UNUSED_VARIABLE(config)
    380 #endif
    381 }
    382 #endif
    383 
    384 }  // end namespace Eigen
    385 
    386 // undefine all the gpu* macros we defined at the beginning of the file
    387 #include "TensorGpuHipCudaUndefines.h"
    388 
    389 #endif  // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_GPU_H