cxx11_tensor_contract_gpu.cu (7350B)
1 // This file is part of Eigen, a lightweight C++ template library 2 // for linear algebra. 3 // 4 // Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> 5 // Copyright (C) 2014 Navdeep Jaitly <ndjaitly@google.com> 6 // 7 // This Source Code Form is subject to the terms of the Mozilla 8 // Public License v. 2.0. If a copy of the MPL was not distributed 9 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 10 11 #define EIGEN_TEST_NO_LONGDOUBLE 12 #define EIGEN_TEST_NO_COMPLEX 13 14 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int 15 #define EIGEN_USE_GPU 16 17 #include "main.h" 18 #include <unsupported/Eigen/CXX11/Tensor> 19 20 #include <unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h> 21 22 using Eigen::Tensor; 23 typedef Tensor<float, 1>::DimensionPair DimPair; 24 25 template<int DataLayout> 26 void test_gpu_contraction(int m_size, int k_size, int n_size) 27 { 28 std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl; 29 // with these dimensions, the output has 300 * 140 elements, which is 30 // more than 30 * 1024, which is the number of threads in blocks on 31 // a 15 SM GK110 GPU 32 Tensor<float, 2, DataLayout> t_left(m_size, k_size); 33 Tensor<float, 2, DataLayout> t_right(k_size, n_size); 34 Tensor<float, 2, DataLayout> t_result(m_size, n_size); 35 Tensor<float, 2, DataLayout> t_result_gpu(m_size, n_size); 36 Eigen::array<DimPair, 1> dims(DimPair(1, 0)); 37 38 t_left.setRandom(); 39 t_right.setRandom(); 40 41 std::size_t t_left_bytes = t_left.size() * sizeof(float); 42 std::size_t t_right_bytes = t_right.size() * sizeof(float); 43 std::size_t t_result_bytes = t_result.size() * sizeof(float); 44 45 float* d_t_left; 46 float* d_t_right; 47 float* d_t_result; 48 49 gpuMalloc((void**)(&d_t_left), t_left_bytes); 50 gpuMalloc((void**)(&d_t_right), t_right_bytes); 51 gpuMalloc((void**)(&d_t_result), t_result_bytes); 52 53 gpuMemcpy(d_t_left, t_left.data(), t_left_bytes, gpuMemcpyHostToDevice); 54 gpuMemcpy(d_t_right, t_right.data(), t_right_bytes, gpuMemcpyHostToDevice); 55 56 Eigen::GpuStreamDevice stream; 57 Eigen::GpuDevice gpu_device(&stream); 58 59 Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > 60 gpu_t_left(d_t_left, Eigen::array<int, 2>(m_size, k_size)); 61 Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > 62 gpu_t_right(d_t_right, Eigen::array<int, 2>(k_size, n_size)); 63 Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > 64 gpu_t_result(d_t_result, Eigen::array<int, 2>(m_size, n_size)); 65 66 67 gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims); 68 t_result = t_left.contract(t_right, dims); 69 70 gpuMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, gpuMemcpyDeviceToHost); 71 for (DenseIndex i = 0; i < t_result.size(); i++) { 72 if (fabs(t_result(i) - t_result_gpu(i)) < 1e-4f) { 73 continue; 74 } 75 if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), 1e-4f)) { 76 continue; 77 } 78 std::cout << "mismatch detected at index " << i << ": " << t_result(i) 79 << " vs " << t_result_gpu(i) << std::endl; 80 assert(false); 81 } 82 83 gpuFree((void*)d_t_left); 84 gpuFree((void*)d_t_right); 85 gpuFree((void*)d_t_result); 86 } 87 88 89 template<int DataLayout> 90 void test_scalar(int m_size, int k_size, int n_size) 91 { 92 std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl; 93 // with these dimensions, the output has 300 * 140 elements, which is 94 // more than 30 * 1024, which is the number of threads in blocks on 95 // a 15 SM GK110 GPU 96 Tensor<float, 2, DataLayout> t_left(m_size, k_size); 97 Tensor<float, 2, DataLayout> t_right(k_size, n_size); 98 Tensor<float, 0, DataLayout> t_result; 99 Tensor<float, 0, DataLayout> t_result_gpu; 100 Eigen::array<DimPair, 2> dims(DimPair(0, 0), DimPair(1, 1)); 101 102 t_left.setRandom(); 103 t_right.setRandom(); 104 105 std::size_t t_left_bytes = t_left.size() * sizeof(float); 106 std::size_t t_right_bytes = t_right.size() * sizeof(float); 107 std::size_t t_result_bytes = sizeof(float); 108 109 float* d_t_left; 110 float* d_t_right; 111 float* d_t_result; 112 113 gpuMalloc((void**)(&d_t_left), t_left_bytes); 114 gpuMalloc((void**)(&d_t_right), t_right_bytes); 115 gpuMalloc((void**)(&d_t_result), t_result_bytes); 116 117 gpuMemcpy(d_t_left, t_left.data(), t_left_bytes, gpuMemcpyHostToDevice); 118 gpuMemcpy(d_t_right, t_right.data(), t_right_bytes, gpuMemcpyHostToDevice); 119 120 Eigen::GpuStreamDevice stream; 121 Eigen::GpuDevice gpu_device(&stream); 122 123 Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > 124 gpu_t_left(d_t_left, m_size, k_size); 125 Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > 126 gpu_t_right(d_t_right, k_size, n_size); 127 Eigen::TensorMap<Eigen::Tensor<float, 0, DataLayout> > 128 gpu_t_result(d_t_result); 129 130 gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims); 131 t_result = t_left.contract(t_right, dims); 132 133 gpuMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, gpuMemcpyDeviceToHost); 134 if (fabs(t_result() - t_result_gpu()) > 1e-4f && 135 !Eigen::internal::isApprox(t_result(), t_result_gpu(), 1e-4f)) { 136 std::cout << "mismatch detected: " << t_result() 137 << " vs " << t_result_gpu() << std::endl; 138 assert(false); 139 } 140 141 gpuFree((void*)d_t_left); 142 gpuFree((void*)d_t_right); 143 gpuFree((void*)d_t_result); 144 } 145 146 147 template<int DataLayout> 148 void test_gpu_contraction_m() { 149 for (int k = 32; k < 256; k++) { 150 test_gpu_contraction<ColMajor>(k, 128, 128); 151 test_gpu_contraction<RowMajor>(k, 128, 128); 152 } 153 } 154 155 template<int DataLayout> 156 void test_gpu_contraction_k() { 157 for (int k = 32; k < 256; k++) { 158 test_gpu_contraction<ColMajor>(128, k, 128); 159 test_gpu_contraction<RowMajor>(128, k, 128); 160 } 161 } 162 163 template<int DataLayout> 164 void test_gpu_contraction_n() { 165 for (int k = 32; k < 256; k++) { 166 test_gpu_contraction<ColMajor>(128, 128, k); 167 test_gpu_contraction<RowMajor>(128, 128, k); 168 } 169 } 170 171 172 template<int DataLayout> 173 void test_gpu_contraction_sizes() { 174 int m_sizes[] = { 31, 39, 63, 64, 65, 175 127, 129, 255, 257 , 511, 176 512, 513, 1023, 1024, 1025}; 177 178 int n_sizes[] = { 31, 39, 63, 64, 65, 179 127, 129, 255, 257, 511, 180 512, 513, 1023, 1024, 1025}; 181 182 int k_sizes[] = { 31, 39, 63, 64, 65, 183 95, 96, 127, 129, 255, 184 257, 511, 512, 513, 1023, 185 1024, 1025}; 186 187 for (int i = 0; i < 15; i++) { 188 for (int j = 0; j < 15; j++) { 189 for (int k = 0; k < 17; k++) { 190 test_gpu_contraction<DataLayout>(m_sizes[i], n_sizes[j], k_sizes[k]); 191 } 192 } 193 } 194 } 195 196 EIGEN_DECLARE_TEST(cxx11_tensor_contract_gpu) 197 { 198 CALL_SUBTEST_1(test_gpu_contraction<ColMajor>(128, 128, 128)); 199 CALL_SUBTEST_1(test_gpu_contraction<RowMajor>(128, 128, 128)); 200 201 CALL_SUBTEST_1(test_scalar<ColMajor>(128, 128, 128)); 202 CALL_SUBTEST_1(test_scalar<RowMajor>(128, 128, 128)); 203 204 CALL_SUBTEST_2(test_gpu_contraction_m<ColMajor>()); 205 CALL_SUBTEST_3(test_gpu_contraction_m<RowMajor>()); 206 207 CALL_SUBTEST_4(test_gpu_contraction_k<ColMajor>()); 208 CALL_SUBTEST_5(test_gpu_contraction_k<RowMajor>()); 209 210 CALL_SUBTEST_6(test_gpu_contraction_n<ColMajor>()); 211 CALL_SUBTEST_7(test_gpu_contraction_n<RowMajor>()); 212 213 #if !defined(EIGEN_USE_HIP) 214 // disable these subtests for HIP 215 CALL_SUBTEST_8(test_gpu_contraction_sizes<ColMajor>()); 216 CALL_SUBTEST_9(test_gpu_contraction_sizes<RowMajor>()); 217 #endif 218 }