cxx11_tensor_device.cu (13495B)
1 // This file is part of Eigen, a lightweight C++ template library 2 // for linear algebra. 3 // 4 // Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> 5 // 6 // This Source Code Form is subject to the terms of the Mozilla 7 // Public License v. 2.0. If a copy of the MPL was not distributed 8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 9 10 #define EIGEN_TEST_NO_LONGDOUBLE 11 #define EIGEN_TEST_NO_COMPLEX 12 13 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int 14 #define EIGEN_USE_GPU 15 16 #include "main.h" 17 #include <unsupported/Eigen/CXX11/Tensor> 18 19 #include <unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h> 20 21 using Eigen::Tensor; 22 using Eigen::RowMajor; 23 24 // Context for evaluation on cpu 25 struct CPUContext { 26 CPUContext(const Eigen::Tensor<float, 3>& in1, Eigen::Tensor<float, 3>& in2, Eigen::Tensor<float, 3>& out) : in1_(in1), in2_(in2), out_(out), kernel_1d_(2), kernel_2d_(2,2), kernel_3d_(2,2,2) { 27 kernel_1d_(0) = 3.14f; 28 kernel_1d_(1) = 2.7f; 29 30 kernel_2d_(0,0) = 3.14f; 31 kernel_2d_(1,0) = 2.7f; 32 kernel_2d_(0,1) = 0.2f; 33 kernel_2d_(1,1) = 7.0f; 34 35 kernel_3d_(0,0,0) = 3.14f; 36 kernel_3d_(0,1,0) = 2.7f; 37 kernel_3d_(0,0,1) = 0.2f; 38 kernel_3d_(0,1,1) = 7.0f; 39 kernel_3d_(1,0,0) = -1.0f; 40 kernel_3d_(1,1,0) = -0.3f; 41 kernel_3d_(1,0,1) = -0.7f; 42 kernel_3d_(1,1,1) = -0.5f; 43 } 44 45 const Eigen::DefaultDevice& device() const { return cpu_device_; } 46 47 const Eigen::Tensor<float, 3>& in1() const { return in1_; } 48 const Eigen::Tensor<float, 3>& in2() const { return in2_; } 49 Eigen::Tensor<float, 3>& out() { return out_; } 50 const Eigen::Tensor<float, 1>& kernel1d() const { return kernel_1d_; } 51 const Eigen::Tensor<float, 2>& kernel2d() const { return kernel_2d_; } 52 const Eigen::Tensor<float, 3>& kernel3d() const { return kernel_3d_; } 53 54 private: 55 const Eigen::Tensor<float, 3>& in1_; 56 const Eigen::Tensor<float, 3>& in2_; 57 Eigen::Tensor<float, 3>& out_; 58 59 Eigen::Tensor<float, 1> kernel_1d_; 60 Eigen::Tensor<float, 2> kernel_2d_; 61 Eigen::Tensor<float, 3> kernel_3d_; 62 63 Eigen::DefaultDevice cpu_device_; 64 }; 65 66 67 // Context for evaluation on GPU 68 struct GPUContext { 69 GPUContext(const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in1, Eigen::TensorMap<Eigen::Tensor<float, 3> >& in2, Eigen::TensorMap<Eigen::Tensor<float, 3> >& out) : in1_(in1), in2_(in2), out_(out), gpu_device_(&stream_) { 70 assert(gpuMalloc((void**)(&kernel_1d_), 2*sizeof(float)) == gpuSuccess); 71 float kernel_1d_val[] = {3.14f, 2.7f}; 72 assert(gpuMemcpy(kernel_1d_, kernel_1d_val, 2*sizeof(float), gpuMemcpyHostToDevice) == gpuSuccess); 73 74 assert(gpuMalloc((void**)(&kernel_2d_), 4*sizeof(float)) == gpuSuccess); 75 float kernel_2d_val[] = {3.14f, 2.7f, 0.2f, 7.0f}; 76 assert(gpuMemcpy(kernel_2d_, kernel_2d_val, 4*sizeof(float), gpuMemcpyHostToDevice) == gpuSuccess); 77 78 assert(gpuMalloc((void**)(&kernel_3d_), 8*sizeof(float)) == gpuSuccess); 79 float kernel_3d_val[] = {3.14f, -1.0f, 2.7f, -0.3f, 0.2f, -0.7f, 7.0f, -0.5f}; 80 assert(gpuMemcpy(kernel_3d_, kernel_3d_val, 8*sizeof(float), gpuMemcpyHostToDevice) == gpuSuccess); 81 } 82 ~GPUContext() { 83 assert(gpuFree(kernel_1d_) == gpuSuccess); 84 assert(gpuFree(kernel_2d_) == gpuSuccess); 85 assert(gpuFree(kernel_3d_) == gpuSuccess); 86 } 87 88 const Eigen::GpuDevice& device() const { return gpu_device_; } 89 90 const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in1() const { return in1_; } 91 const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in2() const { return in2_; } 92 Eigen::TensorMap<Eigen::Tensor<float, 3> >& out() { return out_; } 93 Eigen::TensorMap<Eigen::Tensor<float, 1> > kernel1d() const { return Eigen::TensorMap<Eigen::Tensor<float, 1> >(kernel_1d_, 2); } 94 Eigen::TensorMap<Eigen::Tensor<float, 2> > kernel2d() const { return Eigen::TensorMap<Eigen::Tensor<float, 2> >(kernel_2d_, 2, 2); } 95 Eigen::TensorMap<Eigen::Tensor<float, 3> > kernel3d() const { return Eigen::TensorMap<Eigen::Tensor<float, 3> >(kernel_3d_, 2, 2, 2); } 96 97 private: 98 const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in1_; 99 const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in2_; 100 Eigen::TensorMap<Eigen::Tensor<float, 3> >& out_; 101 102 float* kernel_1d_; 103 float* kernel_2d_; 104 float* kernel_3d_; 105 106 Eigen::GpuStreamDevice stream_; 107 Eigen::GpuDevice gpu_device_; 108 }; 109 110 111 // The actual expression to evaluate 112 template <typename Context> 113 void test_contextual_eval(Context* context) 114 { 115 context->out().device(context->device()) = context->in1() + context->in2() * 3.14f + context->in1().constant(2.718f); 116 } 117 118 template <typename Context> 119 void test_forced_contextual_eval(Context* context) 120 { 121 context->out().device(context->device()) = (context->in1() + context->in2()).eval() * 3.14f + context->in1().constant(2.718f); 122 } 123 124 template <typename Context> 125 void test_compound_assignment(Context* context) 126 { 127 context->out().device(context->device()) = context->in1().constant(2.718f); 128 context->out().device(context->device()) += context->in1() + context->in2() * 3.14f; 129 } 130 131 132 template <typename Context> 133 void test_contraction(Context* context) 134 { 135 Eigen::array<std::pair<int, int>, 2> dims; 136 dims[0] = std::make_pair(1, 1); 137 dims[1] = std::make_pair(2, 2); 138 139 Eigen::array<int, 2> shape(40, 50*70); 140 141 Eigen::DSizes<int, 2> indices(0,0); 142 Eigen::DSizes<int, 2> sizes(40,40); 143 144 context->out().reshape(shape).slice(indices, sizes).device(context->device()) = context->in1().contract(context->in2(), dims); 145 } 146 147 148 template <typename Context> 149 void test_1d_convolution(Context* context) 150 { 151 Eigen::DSizes<int, 3> indices(0,0,0); 152 Eigen::DSizes<int, 3> sizes(40,49,70); 153 154 Eigen::array<int, 1> dims(1); 155 context->out().slice(indices, sizes).device(context->device()) = context->in1().convolve(context->kernel1d(), dims); 156 } 157 158 template <typename Context> 159 void test_2d_convolution(Context* context) 160 { 161 Eigen::DSizes<int, 3> indices(0,0,0); 162 Eigen::DSizes<int, 3> sizes(40,49,69); 163 164 Eigen::array<int, 2> dims(1,2); 165 context->out().slice(indices, sizes).device(context->device()) = context->in1().convolve(context->kernel2d(), dims); 166 } 167 168 template <typename Context> 169 void test_3d_convolution(Context* context) 170 { 171 Eigen::DSizes<int, 3> indices(0,0,0); 172 Eigen::DSizes<int, 3> sizes(39,49,69); 173 174 Eigen::array<int, 3> dims(0,1,2); 175 context->out().slice(indices, sizes).device(context->device()) = context->in1().convolve(context->kernel3d(), dims); 176 } 177 178 179 void test_cpu() { 180 Eigen::Tensor<float, 3> in1(40,50,70); 181 Eigen::Tensor<float, 3> in2(40,50,70); 182 Eigen::Tensor<float, 3> out(40,50,70); 183 184 in1 = in1.random() + in1.constant(10.0f); 185 in2 = in2.random() + in2.constant(10.0f); 186 187 CPUContext context(in1, in2, out); 188 test_contextual_eval(&context); 189 for (int i = 0; i < 40; ++i) { 190 for (int j = 0; j < 50; ++j) { 191 for (int k = 0; k < 70; ++k) { 192 VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f); 193 } 194 } 195 } 196 197 test_forced_contextual_eval(&context); 198 for (int i = 0; i < 40; ++i) { 199 for (int j = 0; j < 50; ++j) { 200 for (int k = 0; k < 70; ++k) { 201 VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) + in2(i,j,k)) * 3.14f + 2.718f); 202 } 203 } 204 } 205 206 test_compound_assignment(&context); 207 for (int i = 0; i < 40; ++i) { 208 for (int j = 0; j < 50; ++j) { 209 for (int k = 0; k < 70; ++k) { 210 VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f); 211 } 212 } 213 } 214 215 test_contraction(&context); 216 for (int i = 0; i < 40; ++i) { 217 for (int j = 0; j < 40; ++j) { 218 const float result = out(i,j,0); 219 float expected = 0; 220 for (int k = 0; k < 50; ++k) { 221 for (int l = 0; l < 70; ++l) { 222 expected += in1(i, k, l) * in2(j, k, l); 223 } 224 } 225 VERIFY_IS_APPROX(expected, result); 226 } 227 } 228 229 test_1d_convolution(&context); 230 for (int i = 0; i < 40; ++i) { 231 for (int j = 0; j < 49; ++j) { 232 for (int k = 0; k < 70; ++k) { 233 VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f)); 234 } 235 } 236 } 237 238 test_2d_convolution(&context); 239 for (int i = 0; i < 40; ++i) { 240 for (int j = 0; j < 49; ++j) { 241 for (int k = 0; k < 69; ++k) { 242 const float result = out(i,j,k); 243 const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f) + 244 (in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f); 245 if (fabs(expected) < 1e-4f && fabs(result) < 1e-4f) { 246 continue; 247 } 248 VERIFY_IS_APPROX(expected, result); 249 } 250 } 251 } 252 253 test_3d_convolution(&context); 254 for (int i = 0; i < 39; ++i) { 255 for (int j = 0; j < 49; ++j) { 256 for (int k = 0; k < 69; ++k) { 257 const float result = out(i,j,k); 258 const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f + 259 in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f) + 260 (in1(i+1,j,k) * -1.0f + in1(i+1,j+1,k) * -0.3f + 261 in1(i+1,j,k+1) * -0.7f + in1(i+1,j+1,k+1) * -0.5f); 262 if (fabs(expected) < 1e-4f && fabs(result) < 1e-4f) { 263 continue; 264 } 265 VERIFY_IS_APPROX(expected, result); 266 } 267 } 268 } 269 } 270 271 void test_gpu() { 272 Eigen::Tensor<float, 3> in1(40,50,70); 273 Eigen::Tensor<float, 3> in2(40,50,70); 274 Eigen::Tensor<float, 3> out(40,50,70); 275 in1 = in1.random() + in1.constant(10.0f); 276 in2 = in2.random() + in2.constant(10.0f); 277 278 std::size_t in1_bytes = in1.size() * sizeof(float); 279 std::size_t in2_bytes = in2.size() * sizeof(float); 280 std::size_t out_bytes = out.size() * sizeof(float); 281 282 float* d_in1; 283 float* d_in2; 284 float* d_out; 285 gpuMalloc((void**)(&d_in1), in1_bytes); 286 gpuMalloc((void**)(&d_in2), in2_bytes); 287 gpuMalloc((void**)(&d_out), out_bytes); 288 289 gpuMemcpy(d_in1, in1.data(), in1_bytes, gpuMemcpyHostToDevice); 290 gpuMemcpy(d_in2, in2.data(), in2_bytes, gpuMemcpyHostToDevice); 291 292 Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in1(d_in1, 40,50,70); 293 Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in2(d_in2, 40,50,70); 294 Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_out(d_out, 40,50,70); 295 296 GPUContext context(gpu_in1, gpu_in2, gpu_out); 297 test_contextual_eval(&context); 298 assert(gpuMemcpy(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost) == gpuSuccess); 299 for (int i = 0; i < 40; ++i) { 300 for (int j = 0; j < 50; ++j) { 301 for (int k = 0; k < 70; ++k) { 302 VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f); 303 } 304 } 305 } 306 307 test_forced_contextual_eval(&context); 308 assert(gpuMemcpy(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost) == gpuSuccess); 309 for (int i = 0; i < 40; ++i) { 310 for (int j = 0; j < 50; ++j) { 311 for (int k = 0; k < 70; ++k) { 312 VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) + in2(i,j,k)) * 3.14f + 2.718f); 313 } 314 } 315 } 316 317 test_compound_assignment(&context); 318 assert(gpuMemcpy(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost) == gpuSuccess); 319 for (int i = 0; i < 40; ++i) { 320 for (int j = 0; j < 50; ++j) { 321 for (int k = 0; k < 70; ++k) { 322 VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f); 323 } 324 } 325 } 326 327 test_contraction(&context); 328 assert(gpuMemcpy(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost) == gpuSuccess); 329 for (int i = 0; i < 40; ++i) { 330 for (int j = 0; j < 40; ++j) { 331 const float result = out(i,j,0); 332 float expected = 0; 333 for (int k = 0; k < 50; ++k) { 334 for (int l = 0; l < 70; ++l) { 335 expected += in1(i, k, l) * in2(j, k, l); 336 } 337 } 338 VERIFY_IS_APPROX(expected, result); 339 } 340 } 341 342 test_1d_convolution(&context); 343 assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, context.device().stream()) == gpuSuccess); 344 assert(gpuStreamSynchronize(context.device().stream()) == gpuSuccess); 345 for (int i = 0; i < 40; ++i) { 346 for (int j = 0; j < 49; ++j) { 347 for (int k = 0; k < 70; ++k) { 348 VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f)); 349 } 350 } 351 } 352 353 test_2d_convolution(&context); 354 assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, context.device().stream()) == gpuSuccess); 355 assert(gpuStreamSynchronize(context.device().stream()) == gpuSuccess); 356 for (int i = 0; i < 40; ++i) { 357 for (int j = 0; j < 49; ++j) { 358 for (int k = 0; k < 69; ++k) { 359 const float result = out(i,j,k); 360 const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f + 361 in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f); 362 VERIFY_IS_APPROX(expected, result); 363 } 364 } 365 } 366 367 #if !defined(EIGEN_USE_HIP) 368 // disable this test on the HIP platform 369 // 3D tensor convolutions seem to hang on the HIP platform 370 371 test_3d_convolution(&context); 372 assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, context.device().stream()) == gpuSuccess); 373 assert(gpuStreamSynchronize(context.device().stream()) == gpuSuccess); 374 for (int i = 0; i < 39; ++i) { 375 for (int j = 0; j < 49; ++j) { 376 for (int k = 0; k < 69; ++k) { 377 const float result = out(i,j,k); 378 const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f + 379 in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f + 380 in1(i+1,j,k) * -1.0f + in1(i+1,j+1,k) * -0.3f + 381 in1(i+1,j,k+1) * -0.7f + in1(i+1,j+1,k+1) * -0.5f); 382 VERIFY_IS_APPROX(expected, result); 383 } 384 } 385 } 386 387 #endif 388 389 } 390 391 392 EIGEN_DECLARE_TEST(cxx11_tensor_device) 393 { 394 CALL_SUBTEST_1(test_cpu()); 395 CALL_SUBTEST_2(test_gpu()); 396 }