gpu_common.h (5456B)
1 #ifndef EIGEN_TEST_GPU_COMMON_H 2 #define EIGEN_TEST_GPU_COMMON_H 3 4 #ifdef EIGEN_USE_HIP 5 #include <hip/hip_runtime.h> 6 #include <hip/hip_runtime_api.h> 7 #else 8 #include <cuda.h> 9 #include <cuda_runtime.h> 10 #include <cuda_runtime_api.h> 11 #endif 12 13 #include <iostream> 14 15 #define EIGEN_USE_GPU 16 #include <unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h> 17 18 #if !defined(__CUDACC__) && !defined(__HIPCC__) 19 dim3 threadIdx, blockDim, blockIdx; 20 #endif 21 22 template<typename Kernel, typename Input, typename Output> 23 void run_on_cpu(const Kernel& ker, int n, const Input& in, Output& out) 24 { 25 for(int i=0; i<n; i++) 26 ker(i, in.data(), out.data()); 27 } 28 29 30 template<typename Kernel, typename Input, typename Output> 31 __global__ 32 EIGEN_HIP_LAUNCH_BOUNDS_1024 33 void run_on_gpu_meta_kernel(const Kernel ker, int n, const Input* in, Output* out) 34 { 35 int i = threadIdx.x + blockIdx.x*blockDim.x; 36 if(i<n) { 37 ker(i, in, out); 38 } 39 } 40 41 42 template<typename Kernel, typename Input, typename Output> 43 void run_on_gpu(const Kernel& ker, int n, const Input& in, Output& out) 44 { 45 typename Input::Scalar* d_in; 46 typename Output::Scalar* d_out; 47 std::ptrdiff_t in_bytes = in.size() * sizeof(typename Input::Scalar); 48 std::ptrdiff_t out_bytes = out.size() * sizeof(typename Output::Scalar); 49 50 gpuMalloc((void**)(&d_in), in_bytes); 51 gpuMalloc((void**)(&d_out), out_bytes); 52 53 gpuMemcpy(d_in, in.data(), in_bytes, gpuMemcpyHostToDevice); 54 gpuMemcpy(d_out, out.data(), out_bytes, gpuMemcpyHostToDevice); 55 56 // Simple and non-optimal 1D mapping assuming n is not too large 57 // That's only for unit testing! 58 dim3 Blocks(128); 59 dim3 Grids( (n+int(Blocks.x)-1)/int(Blocks.x) ); 60 61 gpuDeviceSynchronize(); 62 63 #ifdef EIGEN_USE_HIP 64 hipLaunchKernelGGL(HIP_KERNEL_NAME(run_on_gpu_meta_kernel<Kernel, 65 typename std::decay<decltype(*d_in)>::type, 66 typename std::decay<decltype(*d_out)>::type>), 67 dim3(Grids), dim3(Blocks), 0, 0, ker, n, d_in, d_out); 68 #else 69 run_on_gpu_meta_kernel<<<Grids,Blocks>>>(ker, n, d_in, d_out); 70 #endif 71 // Pre-launch errors. 72 gpuError_t err = gpuGetLastError(); 73 if (err != gpuSuccess) { 74 printf("%s: %s\n", gpuGetErrorName(err), gpuGetErrorString(err)); 75 gpu_assert(false); 76 } 77 78 // Kernel execution errors. 79 err = gpuDeviceSynchronize(); 80 if (err != gpuSuccess) { 81 printf("%s: %s\n", gpuGetErrorName(err), gpuGetErrorString(err)); 82 gpu_assert(false); 83 } 84 85 86 // check inputs have not been modified 87 gpuMemcpy(const_cast<typename Input::Scalar*>(in.data()), d_in, in_bytes, gpuMemcpyDeviceToHost); 88 gpuMemcpy(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost); 89 90 gpuFree(d_in); 91 gpuFree(d_out); 92 } 93 94 95 template<typename Kernel, typename Input, typename Output> 96 void run_and_compare_to_gpu(const Kernel& ker, int n, const Input& in, Output& out) 97 { 98 Input in_ref, in_gpu; 99 Output out_ref, out_gpu; 100 #if !defined(EIGEN_GPU_COMPILE_PHASE) 101 in_ref = in_gpu = in; 102 out_ref = out_gpu = out; 103 #else 104 EIGEN_UNUSED_VARIABLE(in); 105 EIGEN_UNUSED_VARIABLE(out); 106 #endif 107 run_on_cpu (ker, n, in_ref, out_ref); 108 run_on_gpu(ker, n, in_gpu, out_gpu); 109 #if !defined(EIGEN_GPU_COMPILE_PHASE) 110 VERIFY_IS_APPROX(in_ref, in_gpu); 111 VERIFY_IS_APPROX(out_ref, out_gpu); 112 #endif 113 } 114 115 struct compile_time_device_info { 116 EIGEN_DEVICE_FUNC 117 void operator()(int i, const int* /*in*/, int* info) const 118 { 119 if (i == 0) { 120 EIGEN_UNUSED_VARIABLE(info) 121 #if defined(__CUDA_ARCH__) 122 info[0] = int(__CUDA_ARCH__ +0); 123 #endif 124 #if defined(EIGEN_HIP_DEVICE_COMPILE) 125 info[1] = int(EIGEN_HIP_DEVICE_COMPILE +0); 126 #endif 127 } 128 } 129 }; 130 131 void ei_test_init_gpu() 132 { 133 int device = 0; 134 gpuDeviceProp_t deviceProp; 135 gpuGetDeviceProperties(&deviceProp, device); 136 137 ArrayXi dummy(1), info(10); 138 info = -1; 139 run_on_gpu(compile_time_device_info(),10,dummy,info); 140 141 142 std::cout << "GPU compile-time info:\n"; 143 144 #ifdef EIGEN_CUDACC 145 std::cout << " EIGEN_CUDACC: " << int(EIGEN_CUDACC) << "\n"; 146 #endif 147 148 #ifdef EIGEN_CUDA_SDK_VER 149 std::cout << " EIGEN_CUDA_SDK_VER: " << int(EIGEN_CUDA_SDK_VER) << "\n"; 150 #endif 151 152 #ifdef EIGEN_COMP_NVCC 153 std::cout << " EIGEN_COMP_NVCC: " << int(EIGEN_COMP_NVCC) << "\n"; 154 #endif 155 156 #ifdef EIGEN_HIPCC 157 std::cout << " EIGEN_HIPCC: " << int(EIGEN_HIPCC) << "\n"; 158 #endif 159 160 std::cout << " EIGEN_CUDA_ARCH: " << info[0] << "\n"; 161 std::cout << " EIGEN_HIP_DEVICE_COMPILE: " << info[1] << "\n"; 162 163 std::cout << "GPU device info:\n"; 164 std::cout << " name: " << deviceProp.name << "\n"; 165 std::cout << " capability: " << deviceProp.major << "." << deviceProp.minor << "\n"; 166 std::cout << " multiProcessorCount: " << deviceProp.multiProcessorCount << "\n"; 167 std::cout << " maxThreadsPerMultiProcessor: " << deviceProp.maxThreadsPerMultiProcessor << "\n"; 168 std::cout << " warpSize: " << deviceProp.warpSize << "\n"; 169 std::cout << " regsPerBlock: " << deviceProp.regsPerBlock << "\n"; 170 std::cout << " concurrentKernels: " << deviceProp.concurrentKernels << "\n"; 171 std::cout << " clockRate: " << deviceProp.clockRate << "\n"; 172 std::cout << " canMapHostMemory: " << deviceProp.canMapHostMemory << "\n"; 173 std::cout << " computeMode: " << deviceProp.computeMode << "\n"; 174 } 175 176 #endif // EIGEN_TEST_GPU_COMMON_H