cart-elc

Source code for CART-ELC
git clone git://git.laack.co/cart-elc.git
Log | Files | Refs | README | LICENSE

cxx11_tensor_device.cu (13495B)


      1 // This file is part of Eigen, a lightweight C++ template library
      2 // for linear algebra.
      3 //
      4 // Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
      5 //
      6 // This Source Code Form is subject to the terms of the Mozilla
      7 // Public License v. 2.0. If a copy of the MPL was not distributed
      8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
      9 
     10 #define EIGEN_TEST_NO_LONGDOUBLE
     11 #define EIGEN_TEST_NO_COMPLEX
     12 
     13 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
     14 #define EIGEN_USE_GPU
     15 
     16 #include "main.h"
     17 #include <unsupported/Eigen/CXX11/Tensor>
     18 
     19 #include <unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h>
     20 
     21 using Eigen::Tensor;
     22 using Eigen::RowMajor;
     23 
     24 // Context for evaluation on cpu
     25 struct CPUContext {
     26   CPUContext(const Eigen::Tensor<float, 3>& in1, Eigen::Tensor<float, 3>& in2, Eigen::Tensor<float, 3>& out) : in1_(in1), in2_(in2), out_(out), kernel_1d_(2), kernel_2d_(2,2), kernel_3d_(2,2,2) {
     27     kernel_1d_(0) = 3.14f;
     28     kernel_1d_(1) = 2.7f;
     29 
     30     kernel_2d_(0,0) = 3.14f;
     31     kernel_2d_(1,0) = 2.7f;
     32     kernel_2d_(0,1) = 0.2f;
     33     kernel_2d_(1,1) = 7.0f;
     34 
     35     kernel_3d_(0,0,0) = 3.14f;
     36     kernel_3d_(0,1,0) = 2.7f;
     37     kernel_3d_(0,0,1) = 0.2f;
     38     kernel_3d_(0,1,1) = 7.0f;
     39     kernel_3d_(1,0,0) = -1.0f;
     40     kernel_3d_(1,1,0) = -0.3f;
     41     kernel_3d_(1,0,1) = -0.7f;
     42     kernel_3d_(1,1,1) = -0.5f;
     43   }
     44 
     45   const Eigen::DefaultDevice& device() const { return cpu_device_; }
     46 
     47   const Eigen::Tensor<float, 3>& in1() const { return in1_; }
     48   const Eigen::Tensor<float, 3>& in2() const { return in2_; }
     49   Eigen::Tensor<float, 3>& out() { return out_; }
     50   const Eigen::Tensor<float, 1>& kernel1d() const { return kernel_1d_; }
     51   const Eigen::Tensor<float, 2>& kernel2d() const { return kernel_2d_; }
     52   const Eigen::Tensor<float, 3>& kernel3d() const { return kernel_3d_; }
     53 
     54  private:
     55   const Eigen::Tensor<float, 3>& in1_;
     56   const Eigen::Tensor<float, 3>& in2_;
     57   Eigen::Tensor<float, 3>& out_;
     58 
     59   Eigen::Tensor<float, 1> kernel_1d_;
     60   Eigen::Tensor<float, 2> kernel_2d_;
     61   Eigen::Tensor<float, 3> kernel_3d_;
     62 
     63   Eigen::DefaultDevice cpu_device_;
     64 };
     65 
     66 
     67 // Context for evaluation on GPU
     68 struct GPUContext {
     69   GPUContext(const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in1, Eigen::TensorMap<Eigen::Tensor<float, 3> >& in2, Eigen::TensorMap<Eigen::Tensor<float, 3> >& out) : in1_(in1), in2_(in2), out_(out), gpu_device_(&stream_) {
     70     assert(gpuMalloc((void**)(&kernel_1d_), 2*sizeof(float)) == gpuSuccess);
     71     float kernel_1d_val[] = {3.14f, 2.7f};
     72     assert(gpuMemcpy(kernel_1d_, kernel_1d_val, 2*sizeof(float), gpuMemcpyHostToDevice) == gpuSuccess);
     73 
     74     assert(gpuMalloc((void**)(&kernel_2d_), 4*sizeof(float)) == gpuSuccess);
     75     float kernel_2d_val[] = {3.14f, 2.7f, 0.2f, 7.0f};
     76     assert(gpuMemcpy(kernel_2d_, kernel_2d_val, 4*sizeof(float), gpuMemcpyHostToDevice) == gpuSuccess);
     77 
     78     assert(gpuMalloc((void**)(&kernel_3d_), 8*sizeof(float)) == gpuSuccess);
     79     float kernel_3d_val[] = {3.14f, -1.0f, 2.7f, -0.3f, 0.2f, -0.7f, 7.0f, -0.5f};
     80     assert(gpuMemcpy(kernel_3d_, kernel_3d_val, 8*sizeof(float), gpuMemcpyHostToDevice) == gpuSuccess);
     81   }
     82   ~GPUContext() {
     83     assert(gpuFree(kernel_1d_) == gpuSuccess);
     84     assert(gpuFree(kernel_2d_) == gpuSuccess);
     85     assert(gpuFree(kernel_3d_) == gpuSuccess);
     86   }
     87 
     88   const Eigen::GpuDevice& device() const { return gpu_device_; }
     89 
     90   const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in1() const { return in1_; }
     91   const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in2() const { return in2_; }
     92   Eigen::TensorMap<Eigen::Tensor<float, 3> >& out() { return out_; }
     93   Eigen::TensorMap<Eigen::Tensor<float, 1> > kernel1d() const { return Eigen::TensorMap<Eigen::Tensor<float, 1> >(kernel_1d_, 2); }
     94   Eigen::TensorMap<Eigen::Tensor<float, 2> > kernel2d() const { return Eigen::TensorMap<Eigen::Tensor<float, 2> >(kernel_2d_, 2, 2); }
     95   Eigen::TensorMap<Eigen::Tensor<float, 3> > kernel3d() const { return Eigen::TensorMap<Eigen::Tensor<float, 3> >(kernel_3d_, 2, 2, 2); }
     96 
     97  private:
     98   const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in1_;
     99   const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in2_;
    100   Eigen::TensorMap<Eigen::Tensor<float, 3> >& out_;
    101 
    102   float* kernel_1d_;
    103   float* kernel_2d_;
    104   float* kernel_3d_;
    105 
    106   Eigen::GpuStreamDevice stream_;
    107   Eigen::GpuDevice gpu_device_;
    108 };
    109 
    110 
    111 // The actual expression to evaluate
    112 template <typename Context>
    113 void test_contextual_eval(Context* context)
    114 {
    115   context->out().device(context->device()) = context->in1() + context->in2() * 3.14f + context->in1().constant(2.718f);
    116 }
    117 
    118 template <typename Context>
    119 void test_forced_contextual_eval(Context* context)
    120 {
    121   context->out().device(context->device()) = (context->in1() + context->in2()).eval() * 3.14f + context->in1().constant(2.718f);
    122 }
    123 
    124 template <typename Context>
    125 void test_compound_assignment(Context* context)
    126 {
    127   context->out().device(context->device()) = context->in1().constant(2.718f);
    128   context->out().device(context->device()) += context->in1() + context->in2() * 3.14f;
    129 }
    130 
    131 
    132 template <typename Context>
    133 void test_contraction(Context* context)
    134 {
    135   Eigen::array<std::pair<int, int>, 2> dims;
    136   dims[0] = std::make_pair(1, 1);
    137   dims[1] = std::make_pair(2, 2);
    138 
    139   Eigen::array<int, 2> shape(40, 50*70);
    140 
    141   Eigen::DSizes<int, 2> indices(0,0);
    142   Eigen::DSizes<int, 2> sizes(40,40);
    143 
    144   context->out().reshape(shape).slice(indices, sizes).device(context->device()) = context->in1().contract(context->in2(), dims);
    145 }
    146 
    147 
    148 template <typename Context>
    149 void test_1d_convolution(Context* context)
    150 {
    151   Eigen::DSizes<int, 3> indices(0,0,0);
    152   Eigen::DSizes<int, 3> sizes(40,49,70);
    153 
    154   Eigen::array<int, 1> dims(1);
    155   context->out().slice(indices, sizes).device(context->device()) = context->in1().convolve(context->kernel1d(), dims);
    156 }
    157 
    158 template <typename Context>
    159 void test_2d_convolution(Context* context)
    160 {
    161   Eigen::DSizes<int, 3> indices(0,0,0);
    162   Eigen::DSizes<int, 3> sizes(40,49,69);
    163 
    164   Eigen::array<int, 2> dims(1,2);
    165   context->out().slice(indices, sizes).device(context->device()) = context->in1().convolve(context->kernel2d(), dims);
    166 }
    167 
    168 template <typename Context>
    169 void test_3d_convolution(Context* context)
    170 {
    171   Eigen::DSizes<int, 3> indices(0,0,0);
    172   Eigen::DSizes<int, 3> sizes(39,49,69);
    173 
    174   Eigen::array<int, 3> dims(0,1,2);
    175   context->out().slice(indices, sizes).device(context->device()) = context->in1().convolve(context->kernel3d(), dims);
    176 }
    177 
    178 
    179 void test_cpu() {
    180   Eigen::Tensor<float, 3> in1(40,50,70);
    181   Eigen::Tensor<float, 3> in2(40,50,70);
    182   Eigen::Tensor<float, 3> out(40,50,70);
    183 
    184   in1 = in1.random() + in1.constant(10.0f);
    185   in2 = in2.random() + in2.constant(10.0f);
    186 
    187   CPUContext context(in1, in2, out);
    188   test_contextual_eval(&context);
    189   for (int i = 0; i < 40; ++i) {
    190     for (int j = 0; j < 50; ++j) {
    191       for (int k = 0; k < 70; ++k) {
    192         VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f);
    193       }
    194     }
    195   }
    196 
    197   test_forced_contextual_eval(&context);
    198   for (int i = 0; i < 40; ++i) {
    199     for (int j = 0; j < 50; ++j) {
    200       for (int k = 0; k < 70; ++k) {
    201         VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) + in2(i,j,k)) * 3.14f + 2.718f);
    202       }
    203     }
    204   }
    205 
    206   test_compound_assignment(&context);
    207   for (int i = 0; i < 40; ++i) {
    208     for (int j = 0; j < 50; ++j) {
    209       for (int k = 0; k < 70; ++k) {
    210         VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f);
    211       }
    212     }
    213   }
    214 
    215   test_contraction(&context);
    216   for (int i = 0; i < 40; ++i) {
    217     for (int j = 0; j < 40; ++j) {
    218       const float result = out(i,j,0);
    219       float expected = 0;
    220       for (int k = 0; k < 50; ++k) {
    221         for (int l = 0; l < 70; ++l) {
    222           expected += in1(i, k, l) * in2(j, k, l);
    223         }
    224       }
    225       VERIFY_IS_APPROX(expected, result);
    226     }
    227   }
    228 
    229   test_1d_convolution(&context);
    230   for (int i = 0; i < 40; ++i) {
    231     for (int j = 0; j < 49; ++j) {
    232       for (int k = 0; k < 70; ++k) {
    233         VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f));
    234       }
    235     }
    236   }
    237 
    238   test_2d_convolution(&context);
    239   for (int i = 0; i < 40; ++i) {
    240     for (int j = 0; j < 49; ++j) {
    241       for (int k = 0; k < 69; ++k) {
    242         const float result = out(i,j,k);
    243         const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f) +
    244                                (in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f);
    245         if (fabs(expected) < 1e-4f && fabs(result) < 1e-4f) {
    246           continue;
    247         }
    248         VERIFY_IS_APPROX(expected, result);
    249       }
    250     }
    251   }
    252 
    253   test_3d_convolution(&context);
    254   for (int i = 0; i < 39; ++i) {
    255     for (int j = 0; j < 49; ++j) {
    256       for (int k = 0; k < 69; ++k) {
    257         const float result = out(i,j,k);
    258         const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f +
    259                                 in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f) +
    260                                (in1(i+1,j,k) * -1.0f + in1(i+1,j+1,k) * -0.3f +
    261                                 in1(i+1,j,k+1) * -0.7f + in1(i+1,j+1,k+1) * -0.5f);
    262         if (fabs(expected) < 1e-4f && fabs(result) < 1e-4f) {
    263           continue;
    264         }
    265         VERIFY_IS_APPROX(expected, result);
    266       }
    267     }
    268   }
    269 }
    270 
    271 void test_gpu() {
    272   Eigen::Tensor<float, 3> in1(40,50,70);
    273   Eigen::Tensor<float, 3> in2(40,50,70);
    274   Eigen::Tensor<float, 3> out(40,50,70);
    275   in1 = in1.random() + in1.constant(10.0f);
    276   in2 = in2.random() + in2.constant(10.0f);
    277 
    278   std::size_t in1_bytes = in1.size() * sizeof(float);
    279   std::size_t in2_bytes = in2.size() * sizeof(float);
    280   std::size_t out_bytes = out.size() * sizeof(float);
    281 
    282   float* d_in1;
    283   float* d_in2;
    284   float* d_out;
    285   gpuMalloc((void**)(&d_in1), in1_bytes);
    286   gpuMalloc((void**)(&d_in2), in2_bytes);
    287   gpuMalloc((void**)(&d_out), out_bytes);
    288 
    289   gpuMemcpy(d_in1, in1.data(), in1_bytes, gpuMemcpyHostToDevice);
    290   gpuMemcpy(d_in2, in2.data(), in2_bytes, gpuMemcpyHostToDevice);
    291 
    292   Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in1(d_in1, 40,50,70);
    293   Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in2(d_in2, 40,50,70);
    294   Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_out(d_out, 40,50,70);
    295 
    296   GPUContext context(gpu_in1, gpu_in2, gpu_out);
    297   test_contextual_eval(&context);
    298   assert(gpuMemcpy(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost) == gpuSuccess);
    299   for (int i = 0; i < 40; ++i) {
    300     for (int j = 0; j < 50; ++j) {
    301       for (int k = 0; k < 70; ++k) {
    302         VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f);
    303       }
    304     }
    305   }
    306 
    307   test_forced_contextual_eval(&context);
    308   assert(gpuMemcpy(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost) == gpuSuccess);
    309   for (int i = 0; i < 40; ++i) {
    310     for (int j = 0; j < 50; ++j) {
    311       for (int k = 0; k < 70; ++k) {
    312         VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) + in2(i,j,k)) * 3.14f + 2.718f);
    313       }
    314     }
    315   }
    316 
    317   test_compound_assignment(&context);
    318   assert(gpuMemcpy(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost) == gpuSuccess);
    319   for (int i = 0; i < 40; ++i) {
    320     for (int j = 0; j < 50; ++j) {
    321       for (int k = 0; k < 70; ++k) {
    322         VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f);
    323       }
    324     }
    325   }
    326 
    327   test_contraction(&context);
    328   assert(gpuMemcpy(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost) == gpuSuccess);
    329   for (int i = 0; i < 40; ++i) {
    330     for (int j = 0; j < 40; ++j) {
    331       const float result = out(i,j,0);
    332       float expected = 0;
    333       for (int k = 0; k < 50; ++k) {
    334         for (int l = 0; l < 70; ++l) {
    335           expected += in1(i, k, l) * in2(j, k, l);
    336         }
    337       }
    338       VERIFY_IS_APPROX(expected, result);
    339     }
    340   }
    341 
    342   test_1d_convolution(&context);
    343   assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, context.device().stream()) == gpuSuccess);
    344   assert(gpuStreamSynchronize(context.device().stream()) == gpuSuccess);
    345   for (int i = 0; i < 40; ++i) {
    346     for (int j = 0; j < 49; ++j) {
    347       for (int k = 0; k < 70; ++k) {
    348         VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f));
    349       }
    350     }
    351   }
    352 
    353   test_2d_convolution(&context);
    354   assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, context.device().stream()) == gpuSuccess);
    355   assert(gpuStreamSynchronize(context.device().stream()) == gpuSuccess);
    356   for (int i = 0; i < 40; ++i) {
    357     for (int j = 0; j < 49; ++j) {
    358       for (int k = 0; k < 69; ++k) {
    359         const float result = out(i,j,k);
    360         const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f +
    361                                 in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f);
    362         VERIFY_IS_APPROX(expected, result);
    363       }
    364     }
    365   }
    366 
    367 #if !defined(EIGEN_USE_HIP)
    368 // disable this test on the HIP platform
    369 // 3D tensor convolutions seem to hang on the HIP platform
    370 
    371   test_3d_convolution(&context);
    372   assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, context.device().stream()) == gpuSuccess);
    373   assert(gpuStreamSynchronize(context.device().stream()) == gpuSuccess);
    374   for (int i = 0; i < 39; ++i) {
    375     for (int j = 0; j < 49; ++j) {
    376       for (int k = 0; k < 69; ++k) {
    377        const float result = out(i,j,k);
    378         const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f +
    379                                 in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f +
    380                                 in1(i+1,j,k) * -1.0f + in1(i+1,j+1,k) * -0.3f +
    381                                 in1(i+1,j,k+1) * -0.7f + in1(i+1,j+1,k+1) * -0.5f);
    382         VERIFY_IS_APPROX(expected, result);
    383       }
    384     }
    385   }
    386 
    387 #endif
    388  
    389 }
    390 
    391 
// Test entry point: runs the host-device checks as subtest 1 and the GPU
// checks as subtest 2.
EIGEN_DECLARE_TEST(cxx11_tensor_device)
{
  CALL_SUBTEST_1(test_cpu());
  CALL_SUBTEST_2(test_gpu());
}