cart-elc

Source code for CART-ELC
git clone git://git.laack.co/cart-elc.git
Log | Files | Refs | README | LICENSE

gpu_common.h (5456B)


      1 #ifndef EIGEN_TEST_GPU_COMMON_H
      2 #define EIGEN_TEST_GPU_COMMON_H
      3 
      4 #ifdef EIGEN_USE_HIP
      5   #include <hip/hip_runtime.h>
      6   #include <hip/hip_runtime_api.h>
      7 #else
      8   #include <cuda.h>
      9   #include <cuda_runtime.h>
     10   #include <cuda_runtime_api.h>
     11 #endif
     12 
     13 #include <iostream>
     14 
     15 #define EIGEN_USE_GPU
     16 #include <unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h>
     17 
     18 #if !defined(__CUDACC__) && !defined(__HIPCC__)
     19 dim3 threadIdx, blockDim, blockIdx;
     20 #endif
     21 
     22 template<typename Kernel, typename Input, typename Output>
     23 void run_on_cpu(const Kernel& ker, int n, const Input& in, Output& out)
     24 {
     25   for(int i=0; i<n; i++)
     26     ker(i, in.data(), out.data());
     27 }
     28 
     29 
     30 template<typename Kernel, typename Input, typename Output>
     31 __global__
     32 EIGEN_HIP_LAUNCH_BOUNDS_1024
     33 void run_on_gpu_meta_kernel(const Kernel ker, int n, const Input* in, Output* out)
     34 {
     35   int i = threadIdx.x + blockIdx.x*blockDim.x;
     36   if(i<n) {
     37     ker(i, in, out);
     38   }
     39 }
     40 
     41 
     42 template<typename Kernel, typename Input, typename Output>
     43 void run_on_gpu(const Kernel& ker, int n, const Input& in, Output& out)
     44 {
     45   typename Input::Scalar*  d_in;
     46   typename Output::Scalar* d_out;
     47   std::ptrdiff_t in_bytes  = in.size()  * sizeof(typename Input::Scalar);
     48   std::ptrdiff_t out_bytes = out.size() * sizeof(typename Output::Scalar);
     49   
     50   gpuMalloc((void**)(&d_in),  in_bytes);
     51   gpuMalloc((void**)(&d_out), out_bytes);
     52   
     53   gpuMemcpy(d_in,  in.data(),  in_bytes,  gpuMemcpyHostToDevice);
     54   gpuMemcpy(d_out, out.data(), out_bytes, gpuMemcpyHostToDevice);
     55   
     56   // Simple and non-optimal 1D mapping assuming n is not too large
     57   // That's only for unit testing!
     58   dim3 Blocks(128);
     59   dim3 Grids( (n+int(Blocks.x)-1)/int(Blocks.x) );
     60 
     61   gpuDeviceSynchronize();
     62   
     63 #ifdef EIGEN_USE_HIP
     64   hipLaunchKernelGGL(HIP_KERNEL_NAME(run_on_gpu_meta_kernel<Kernel,
     65 				     typename std::decay<decltype(*d_in)>::type,
     66 				     typename std::decay<decltype(*d_out)>::type>), 
     67 		     dim3(Grids), dim3(Blocks), 0, 0, ker, n, d_in, d_out);
     68 #else
     69   run_on_gpu_meta_kernel<<<Grids,Blocks>>>(ker, n, d_in, d_out);
     70 #endif
     71   // Pre-launch errors.
     72   gpuError_t err = gpuGetLastError();
     73   if (err != gpuSuccess) {
     74     printf("%s: %s\n", gpuGetErrorName(err), gpuGetErrorString(err));
     75     gpu_assert(false);
     76   }
     77   
     78   // Kernel execution errors.
     79   err = gpuDeviceSynchronize();
     80   if (err != gpuSuccess) {
     81     printf("%s: %s\n", gpuGetErrorName(err), gpuGetErrorString(err));
     82     gpu_assert(false);
     83   }
     84   
     85   
     86   // check inputs have not been modified
     87   gpuMemcpy(const_cast<typename Input::Scalar*>(in.data()),  d_in,  in_bytes,  gpuMemcpyDeviceToHost);
     88   gpuMemcpy(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost);
     89   
     90   gpuFree(d_in);
     91   gpuFree(d_out);
     92 }
     93 
     94 
     95 template<typename Kernel, typename Input, typename Output>
     96 void run_and_compare_to_gpu(const Kernel& ker, int n, const Input& in, Output& out)
     97 {
     98   Input  in_ref,  in_gpu;
     99   Output out_ref, out_gpu;
    100   #if !defined(EIGEN_GPU_COMPILE_PHASE)
    101   in_ref = in_gpu = in;
    102   out_ref = out_gpu = out;
    103   #else
    104   EIGEN_UNUSED_VARIABLE(in);
    105   EIGEN_UNUSED_VARIABLE(out);
    106   #endif
    107   run_on_cpu (ker, n, in_ref,  out_ref);
    108   run_on_gpu(ker, n, in_gpu, out_gpu);
    109   #if !defined(EIGEN_GPU_COMPILE_PHASE)
    110   VERIFY_IS_APPROX(in_ref, in_gpu);
    111   VERIFY_IS_APPROX(out_ref, out_gpu);
    112   #endif
    113 }
    114 
    115 struct compile_time_device_info {
    116   EIGEN_DEVICE_FUNC
    117   void operator()(int i, const int* /*in*/, int* info) const
    118   {
    119     if (i == 0) {
    120       EIGEN_UNUSED_VARIABLE(info)
    121       #if defined(__CUDA_ARCH__)
    122       info[0] = int(__CUDA_ARCH__ +0);
    123       #endif
    124       #if defined(EIGEN_HIP_DEVICE_COMPILE)
    125       info[1] = int(EIGEN_HIP_DEVICE_COMPILE +0);
    126       #endif
    127     }
    128   }
    129 };
    130 
    131 void ei_test_init_gpu()
    132 {
    133   int device = 0;
    134   gpuDeviceProp_t deviceProp;
    135   gpuGetDeviceProperties(&deviceProp, device);
    136 
    137   ArrayXi dummy(1), info(10);
    138   info = -1;
    139   run_on_gpu(compile_time_device_info(),10,dummy,info);
    140 
    141 
    142   std::cout << "GPU compile-time info:\n";
    143   
    144   #ifdef EIGEN_CUDACC
    145   std::cout << "  EIGEN_CUDACC:                 " << int(EIGEN_CUDACC) << "\n";
    146   #endif
    147   
    148   #ifdef EIGEN_CUDA_SDK_VER
    149   std::cout << "  EIGEN_CUDA_SDK_VER:             " << int(EIGEN_CUDA_SDK_VER) << "\n";
    150   #endif
    151 
    152   #ifdef EIGEN_COMP_NVCC
    153   std::cout << "  EIGEN_COMP_NVCC:             " << int(EIGEN_COMP_NVCC) << "\n";
    154   #endif
    155   
    156   #ifdef EIGEN_HIPCC
    157   std::cout << "  EIGEN_HIPCC:                 " << int(EIGEN_HIPCC) << "\n";
    158   #endif
    159 
    160   std::cout << "  EIGEN_CUDA_ARCH:             " << info[0] << "\n";  
    161   std::cout << "  EIGEN_HIP_DEVICE_COMPILE:    " << info[1] << "\n";
    162 
    163   std::cout << "GPU device info:\n";
    164   std::cout << "  name:                        " << deviceProp.name << "\n";
    165   std::cout << "  capability:                  " << deviceProp.major << "." << deviceProp.minor << "\n";
    166   std::cout << "  multiProcessorCount:         " << deviceProp.multiProcessorCount << "\n";
    167   std::cout << "  maxThreadsPerMultiProcessor: " << deviceProp.maxThreadsPerMultiProcessor << "\n";
    168   std::cout << "  warpSize:                    " << deviceProp.warpSize << "\n";
    169   std::cout << "  regsPerBlock:                " << deviceProp.regsPerBlock << "\n";
    170   std::cout << "  concurrentKernels:           " << deviceProp.concurrentKernels << "\n";
    171   std::cout << "  clockRate:                   " << deviceProp.clockRate << "\n";
    172   std::cout << "  canMapHostMemory:            " << deviceProp.canMapHostMemory << "\n";
    173   std::cout << "  computeMode:                 " << deviceProp.computeMode << "\n";
    174 }
    175 
    176 #endif // EIGEN_TEST_GPU_COMMON_H