cart-elc

Source code for CART-ELC
git clone git://git.laack.co/cart-elc.git
Log | Files | Refs | README | LICENSE

cxx11_tensor_gpu.cu (58446B)


      1 // This file is part of Eigen, a lightweight C++ template library
      2 // for linear algebra.
      3 //
      4 // Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
      5 //
      6 // This Source Code Form is subject to the terms of the Mozilla
      7 // Public License v. 2.0. If a copy of the MPL was not distributed
      8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
      9 
     10 #define EIGEN_TEST_NO_LONGDOUBLE
     11 #define EIGEN_TEST_NO_COMPLEX
     12 
     13 #define EIGEN_USE_GPU
     14 
     15 #include "main.h"
     16 #include <unsupported/Eigen/CXX11/Tensor>
     17 
     18 #include <unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h>
     19 
     20 #define EIGEN_GPU_TEST_C99_MATH  EIGEN_HAS_CXX11
     21 
     22 using Eigen::Tensor;
     23 
     24 void test_gpu_nullary() {
     25   Tensor<float, 1, 0, int> in1(2);
     26   Tensor<float, 1, 0, int> in2(2);
     27   in1.setRandom();
     28   in2.setRandom();
     29 
     30   std::size_t tensor_bytes = in1.size() * sizeof(float);
     31 
     32   float* d_in1;
     33   float* d_in2;
     34   gpuMalloc((void**)(&d_in1), tensor_bytes);
     35   gpuMalloc((void**)(&d_in2), tensor_bytes);
     36   gpuMemcpy(d_in1, in1.data(), tensor_bytes, gpuMemcpyHostToDevice);
     37   gpuMemcpy(d_in2, in2.data(), tensor_bytes, gpuMemcpyHostToDevice);
     38 
     39   Eigen::GpuStreamDevice stream;
     40   Eigen::GpuDevice gpu_device(&stream);
     41 
     42   Eigen::TensorMap<Eigen::Tensor<float, 1, 0, int>, Eigen::Aligned> gpu_in1(
     43       d_in1, 2);
     44   Eigen::TensorMap<Eigen::Tensor<float, 1, 0, int>, Eigen::Aligned> gpu_in2(
     45       d_in2, 2);
     46 
     47   gpu_in1.device(gpu_device) = gpu_in1.constant(3.14f);
     48   gpu_in2.device(gpu_device) = gpu_in2.random();
     49 
     50   Tensor<float, 1, 0, int> new1(2);
     51   Tensor<float, 1, 0, int> new2(2);
     52 
     53   assert(gpuMemcpyAsync(new1.data(), d_in1, tensor_bytes, gpuMemcpyDeviceToHost,
     54                          gpu_device.stream()) == gpuSuccess);
     55   assert(gpuMemcpyAsync(new2.data(), d_in2, tensor_bytes, gpuMemcpyDeviceToHost,
     56                          gpu_device.stream()) == gpuSuccess);
     57 
     58   assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
     59 
     60   for (int i = 0; i < 2; ++i) {
     61     VERIFY_IS_APPROX(new1(i), 3.14f);
     62     VERIFY_IS_NOT_EQUAL(new2(i), in2(i));
     63   }
     64 
     65   gpuFree(d_in1);
     66   gpuFree(d_in2);
     67 }
     68 
     69 void test_gpu_elementwise_small() {
     70   Tensor<float, 1> in1(Eigen::array<Eigen::DenseIndex, 1>(2));
     71   Tensor<float, 1> in2(Eigen::array<Eigen::DenseIndex, 1>(2));
     72   Tensor<float, 1> out(Eigen::array<Eigen::DenseIndex, 1>(2));
     73   in1.setRandom();
     74   in2.setRandom();
     75 
     76   std::size_t in1_bytes = in1.size() * sizeof(float);
     77   std::size_t in2_bytes = in2.size() * sizeof(float);
     78   std::size_t out_bytes = out.size() * sizeof(float);
     79 
     80   float* d_in1;
     81   float* d_in2;
     82   float* d_out;
     83   gpuMalloc((void**)(&d_in1), in1_bytes);
     84   gpuMalloc((void**)(&d_in2), in2_bytes);
     85   gpuMalloc((void**)(&d_out), out_bytes);
     86 
     87   gpuMemcpy(d_in1, in1.data(), in1_bytes, gpuMemcpyHostToDevice);
     88   gpuMemcpy(d_in2, in2.data(), in2_bytes, gpuMemcpyHostToDevice);
     89 
     90   Eigen::GpuStreamDevice stream;
     91   Eigen::GpuDevice gpu_device(&stream);
     92 
     93   Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in1(
     94       d_in1, Eigen::array<Eigen::DenseIndex, 1>(2));
     95   Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in2(
     96       d_in2, Eigen::array<Eigen::DenseIndex, 1>(2));
     97   Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_out(
     98       d_out, Eigen::array<Eigen::DenseIndex, 1>(2));
     99 
    100   gpu_out.device(gpu_device) = gpu_in1 + gpu_in2;
    101 
    102   assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost,
    103                          gpu_device.stream()) == gpuSuccess);
    104   assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
    105 
    106   for (int i = 0; i < 2; ++i) {
    107     VERIFY_IS_APPROX(
    108         out(Eigen::array<Eigen::DenseIndex, 1>(i)),
    109         in1(Eigen::array<Eigen::DenseIndex, 1>(i)) + in2(Eigen::array<Eigen::DenseIndex, 1>(i)));
    110   }
    111 
    112   gpuFree(d_in1);
    113   gpuFree(d_in2);
    114   gpuFree(d_out);
    115 }
    116 
    117 void test_gpu_elementwise()
    118 {
    119   Tensor<float, 3> in1(Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
    120   Tensor<float, 3> in2(Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
    121   Tensor<float, 3> in3(Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
    122   Tensor<float, 3> out(Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
    123   in1.setRandom();
    124   in2.setRandom();
    125   in3.setRandom();
    126 
    127   std::size_t in1_bytes = in1.size() * sizeof(float);
    128   std::size_t in2_bytes = in2.size() * sizeof(float);
    129   std::size_t in3_bytes = in3.size() * sizeof(float);
    130   std::size_t out_bytes = out.size() * sizeof(float);
    131 
    132   float* d_in1;
    133   float* d_in2;
    134   float* d_in3;
    135   float* d_out;
    136   gpuMalloc((void**)(&d_in1), in1_bytes);
    137   gpuMalloc((void**)(&d_in2), in2_bytes);
    138   gpuMalloc((void**)(&d_in3), in3_bytes);
    139   gpuMalloc((void**)(&d_out), out_bytes);
    140 
    141   gpuMemcpy(d_in1, in1.data(), in1_bytes, gpuMemcpyHostToDevice);
    142   gpuMemcpy(d_in2, in2.data(), in2_bytes, gpuMemcpyHostToDevice);
    143   gpuMemcpy(d_in3, in3.data(), in3_bytes, gpuMemcpyHostToDevice);
    144 
    145   Eigen::GpuStreamDevice stream;
    146   Eigen::GpuDevice gpu_device(&stream);
    147 
    148   Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in1(d_in1, Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
    149   Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in2(d_in2, Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
    150   Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in3(d_in3, Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
    151   Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_out(d_out, Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
    152 
    153   gpu_out.device(gpu_device) = gpu_in1 + gpu_in2 * gpu_in3;
    154 
    155   assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
    156   assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
    157 
    158   for (int i = 0; i < 72; ++i) {
    159     for (int j = 0; j < 53; ++j) {
    160       for (int k = 0; k < 97; ++k) {
    161         VERIFY_IS_APPROX(out(Eigen::array<Eigen::DenseIndex, 3>(i,j,k)), in1(Eigen::array<Eigen::DenseIndex, 3>(i,j,k)) + in2(Eigen::array<Eigen::DenseIndex, 3>(i,j,k)) * in3(Eigen::array<Eigen::DenseIndex, 3>(i,j,k)));
    162       }
    163     }
    164   }
    165 
    166   gpuFree(d_in1);
    167   gpuFree(d_in2);
    168   gpuFree(d_in3);
    169   gpuFree(d_out);
    170 }
    171 
    172 void test_gpu_props() {
    173   Tensor<float, 1> in1(200);
    174   Tensor<bool, 1> out(200);
    175   in1.setRandom();
    176 
    177   std::size_t in1_bytes = in1.size() * sizeof(float);
    178   std::size_t out_bytes = out.size() * sizeof(bool);
    179 
    180   float* d_in1;
    181   bool* d_out;
    182   gpuMalloc((void**)(&d_in1), in1_bytes);
    183   gpuMalloc((void**)(&d_out), out_bytes);
    184 
    185   gpuMemcpy(d_in1, in1.data(), in1_bytes, gpuMemcpyHostToDevice);
    186 
    187   Eigen::GpuStreamDevice stream;
    188   Eigen::GpuDevice gpu_device(&stream);
    189 
    190   Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in1(
    191       d_in1, 200);
    192   Eigen::TensorMap<Eigen::Tensor<bool, 1>, Eigen::Aligned> gpu_out(
    193       d_out, 200);
    194 
    195   gpu_out.device(gpu_device) = (gpu_in1.isnan)();
    196 
    197   assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost,
    198                          gpu_device.stream()) == gpuSuccess);
    199   assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
    200 
    201   for (int i = 0; i < 200; ++i) {
    202     VERIFY_IS_EQUAL(out(i), (std::isnan)(in1(i)));
    203   }
    204 
    205   gpuFree(d_in1);
    206   gpuFree(d_out);
    207 }
    208 
    209 void test_gpu_reduction()
    210 {
    211   Tensor<float, 4> in1(72,53,97,113);
    212   Tensor<float, 2> out(72,97);
    213   in1.setRandom();
    214 
    215   std::size_t in1_bytes = in1.size() * sizeof(float);
    216   std::size_t out_bytes = out.size() * sizeof(float);
    217 
    218   float* d_in1;
    219   float* d_out;
    220   gpuMalloc((void**)(&d_in1), in1_bytes);
    221   gpuMalloc((void**)(&d_out), out_bytes);
    222 
    223   gpuMemcpy(d_in1, in1.data(), in1_bytes, gpuMemcpyHostToDevice);
    224 
    225   Eigen::GpuStreamDevice stream;
    226   Eigen::GpuDevice gpu_device(&stream);
    227 
    228   Eigen::TensorMap<Eigen::Tensor<float, 4> > gpu_in1(d_in1, 72,53,97,113);
    229   Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_out(d_out, 72,97);
    230 
    231   array<Eigen::DenseIndex, 2> reduction_axis;
    232   reduction_axis[0] = 1;
    233   reduction_axis[1] = 3;
    234 
    235   gpu_out.device(gpu_device) = gpu_in1.maximum(reduction_axis);
    236 
    237   assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
    238   assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
    239 
    240   for (int i = 0; i < 72; ++i) {
    241     for (int j = 0; j < 97; ++j) {
    242       float expected = 0;
    243       for (int k = 0; k < 53; ++k) {
    244         for (int l = 0; l < 113; ++l) {
    245           expected =
    246               std::max<float>(expected, in1(i, k, j, l));
    247         }
    248       }
    249       VERIFY_IS_APPROX(out(i,j), expected);
    250     }
    251   }
    252 
    253   gpuFree(d_in1);
    254   gpuFree(d_out);
    255 }
    256 
    257 template<int DataLayout>
    258 void test_gpu_contraction()
    259 {
    260   // with these dimensions, the output has 300 * 140 elements, which is
    261   // more than 30 * 1024, which is the number of threads in blocks on
    262   // a 15 SM GK110 GPU
    263   Tensor<float, 4, DataLayout> t_left(6, 50, 3, 31);
    264   Tensor<float, 5, DataLayout> t_right(Eigen::array<Eigen::DenseIndex, 5>(3, 31, 7, 20, 1));
    265   Tensor<float, 5, DataLayout> t_result(Eigen::array<Eigen::DenseIndex, 5>(6, 50, 7, 20, 1));
    266 
    267   t_left.setRandom();
    268   t_right.setRandom();
    269 
    270   std::size_t t_left_bytes = t_left.size()  * sizeof(float);
    271   std::size_t t_right_bytes = t_right.size() * sizeof(float);
    272   std::size_t t_result_bytes = t_result.size() * sizeof(float);
    273 
    274   float* d_t_left;
    275   float* d_t_right;
    276   float* d_t_result;
    277 
    278   gpuMalloc((void**)(&d_t_left), t_left_bytes);
    279   gpuMalloc((void**)(&d_t_right), t_right_bytes);
    280   gpuMalloc((void**)(&d_t_result), t_result_bytes);
    281 
    282   gpuMemcpy(d_t_left, t_left.data(), t_left_bytes, gpuMemcpyHostToDevice);
    283   gpuMemcpy(d_t_right, t_right.data(), t_right_bytes, gpuMemcpyHostToDevice);
    284 
    285   Eigen::GpuStreamDevice stream;
    286   Eigen::GpuDevice gpu_device(&stream);
    287 
    288   Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_t_left(d_t_left, 6, 50, 3, 31);
    289   Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_t_right(d_t_right, 3, 31, 7, 20, 1);
    290   Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_t_result(d_t_result, 6, 50, 7, 20, 1);
    291 
    292   typedef Eigen::Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> > MapXf;
    293   MapXf m_left(t_left.data(), 300, 93);
    294   MapXf m_right(t_right.data(), 93, 140);
    295   Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(300, 140);
    296 
    297   typedef Tensor<float, 1>::DimensionPair DimPair;
    298   Eigen::array<DimPair, 2> dims;
    299   dims[0] = DimPair(2, 0);
    300   dims[1] = DimPair(3, 1);
    301 
    302   m_result = m_left * m_right;
    303   gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims);
    304 
    305   gpuMemcpy(t_result.data(), d_t_result, t_result_bytes, gpuMemcpyDeviceToHost);
    306 
    307   for (DenseIndex i = 0; i < t_result.size(); i++) {
    308     if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4f) {
    309       std::cout << "mismatch detected at index " << i << ": " << t_result.data()[i] << " vs " <<  m_result.data()[i] << std::endl;
    310       assert(false);
    311     }
    312   }
    313 
    314   gpuFree(d_t_left);
    315   gpuFree(d_t_right);
    316   gpuFree(d_t_result);
    317 }
    318 
    319 template<int DataLayout>
    320 void test_gpu_convolution_1d()
    321 {
    322   Tensor<float, 4, DataLayout> input(74,37,11,137);
    323   Tensor<float, 1, DataLayout> kernel(4);
    324   Tensor<float, 4, DataLayout> out(74,34,11,137);
    325   input = input.constant(10.0f) + input.random();
    326   kernel = kernel.constant(7.0f) + kernel.random();
    327 
    328   std::size_t input_bytes = input.size() * sizeof(float);
    329   std::size_t kernel_bytes = kernel.size() * sizeof(float);
    330   std::size_t out_bytes = out.size() * sizeof(float);
    331 
    332   float* d_input;
    333   float* d_kernel;
    334   float* d_out;
    335   gpuMalloc((void**)(&d_input), input_bytes);
    336   gpuMalloc((void**)(&d_kernel), kernel_bytes);
    337   gpuMalloc((void**)(&d_out), out_bytes);
    338 
    339   gpuMemcpy(d_input, input.data(), input_bytes, gpuMemcpyHostToDevice);
    340   gpuMemcpy(d_kernel, kernel.data(), kernel_bytes, gpuMemcpyHostToDevice);
    341 
    342   Eigen::GpuStreamDevice stream;
    343   Eigen::GpuDevice gpu_device(&stream);
    344 
    345   Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_input(d_input, 74,37,11,137);
    346   Eigen::TensorMap<Eigen::Tensor<float, 1, DataLayout> > gpu_kernel(d_kernel, 4);
    347   Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_out(d_out, 74,34,11,137);
    348 
    349   Eigen::array<Eigen::DenseIndex, 1> dims(1);
    350   gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
    351 
    352   assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
    353   assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
    354 
    355   for (int i = 0; i < 74; ++i) {
    356     for (int j = 0; j < 34; ++j) {
    357       for (int k = 0; k < 11; ++k) {
    358         for (int l = 0; l < 137; ++l) {
    359           const float result = out(i,j,k,l);
    360           const float expected = input(i,j+0,k,l) * kernel(0) + input(i,j+1,k,l) * kernel(1) +
    361                                  input(i,j+2,k,l) * kernel(2) + input(i,j+3,k,l) * kernel(3);
    362           VERIFY_IS_APPROX(result, expected);
    363         }
    364       }
    365     }
    366   }
    367 
    368   gpuFree(d_input);
    369   gpuFree(d_kernel);
    370   gpuFree(d_out);
    371 }
    372 
    373 void test_gpu_convolution_inner_dim_col_major_1d()
    374 {
    375   Tensor<float, 4, ColMajor> input(74,9,11,7);
    376   Tensor<float, 1, ColMajor> kernel(4);
    377   Tensor<float, 4, ColMajor> out(71,9,11,7);
    378   input = input.constant(10.0f) + input.random();
    379   kernel = kernel.constant(7.0f) + kernel.random();
    380 
    381   std::size_t input_bytes = input.size() * sizeof(float);
    382   std::size_t kernel_bytes = kernel.size() * sizeof(float);
    383   std::size_t out_bytes = out.size() * sizeof(float);
    384 
    385   float* d_input;
    386   float* d_kernel;
    387   float* d_out;
    388   gpuMalloc((void**)(&d_input), input_bytes);
    389   gpuMalloc((void**)(&d_kernel), kernel_bytes);
    390   gpuMalloc((void**)(&d_out), out_bytes);
    391 
    392   gpuMemcpy(d_input, input.data(), input_bytes, gpuMemcpyHostToDevice);
    393   gpuMemcpy(d_kernel, kernel.data(), kernel_bytes, gpuMemcpyHostToDevice);
    394 
    395   Eigen::GpuStreamDevice stream;
    396   Eigen::GpuDevice gpu_device(&stream);
    397 
    398   Eigen::TensorMap<Eigen::Tensor<float, 4, ColMajor> > gpu_input(d_input,74,9,11,7);
    399   Eigen::TensorMap<Eigen::Tensor<float, 1, ColMajor> > gpu_kernel(d_kernel,4);
    400   Eigen::TensorMap<Eigen::Tensor<float, 4, ColMajor> > gpu_out(d_out,71,9,11,7);
    401 
    402   Eigen::array<Eigen::DenseIndex, 1> dims(0);
    403   gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
    404 
    405   assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
    406   assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
    407 
    408   for (int i = 0; i < 71; ++i) {
    409     for (int j = 0; j < 9; ++j) {
    410       for (int k = 0; k < 11; ++k) {
    411         for (int l = 0; l < 7; ++l) {
    412           const float result = out(i,j,k,l);
    413           const float expected = input(i+0,j,k,l) * kernel(0) + input(i+1,j,k,l) * kernel(1) +
    414                                  input(i+2,j,k,l) * kernel(2) + input(i+3,j,k,l) * kernel(3);
    415           VERIFY_IS_APPROX(result, expected);
    416         }
    417       }
    418     }
    419   }
    420 
    421   gpuFree(d_input);
    422   gpuFree(d_kernel);
    423   gpuFree(d_out);
    424 }
    425 
    426 void test_gpu_convolution_inner_dim_row_major_1d()
    427 {
    428   Tensor<float, 4, RowMajor> input(7,9,11,74);
    429   Tensor<float, 1, RowMajor> kernel(4);
    430   Tensor<float, 4, RowMajor> out(7,9,11,71);
    431   input = input.constant(10.0f) + input.random();
    432   kernel = kernel.constant(7.0f) + kernel.random();
    433 
    434   std::size_t input_bytes = input.size() * sizeof(float);
    435   std::size_t kernel_bytes = kernel.size() * sizeof(float);
    436   std::size_t out_bytes = out.size() * sizeof(float);
    437 
    438   float* d_input;
    439   float* d_kernel;
    440   float* d_out;
    441   gpuMalloc((void**)(&d_input), input_bytes);
    442   gpuMalloc((void**)(&d_kernel), kernel_bytes);
    443   gpuMalloc((void**)(&d_out), out_bytes);
    444 
    445   gpuMemcpy(d_input, input.data(), input_bytes, gpuMemcpyHostToDevice);
    446   gpuMemcpy(d_kernel, kernel.data(), kernel_bytes, gpuMemcpyHostToDevice);
    447 
    448   Eigen::GpuStreamDevice stream;
    449   Eigen::GpuDevice gpu_device(&stream);
    450 
    451   Eigen::TensorMap<Eigen::Tensor<float, 4, RowMajor> > gpu_input(d_input, 7,9,11,74);
    452   Eigen::TensorMap<Eigen::Tensor<float, 1, RowMajor> > gpu_kernel(d_kernel, 4);
    453   Eigen::TensorMap<Eigen::Tensor<float, 4, RowMajor> > gpu_out(d_out, 7,9,11,71);
    454 
    455   Eigen::array<Eigen::DenseIndex, 1> dims(3);
    456   gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
    457 
    458   assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
    459   assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
    460 
    461   for (int i = 0; i < 7; ++i) {
    462     for (int j = 0; j < 9; ++j) {
    463       for (int k = 0; k < 11; ++k) {
    464         for (int l = 0; l < 71; ++l) {
    465           const float result = out(i,j,k,l);
    466           const float expected = input(i,j,k,l+0) * kernel(0) + input(i,j,k,l+1) * kernel(1) +
    467                                  input(i,j,k,l+2) * kernel(2) + input(i,j,k,l+3) * kernel(3);
    468           VERIFY_IS_APPROX(result, expected);
    469         }
    470       }
    471     }
    472   }
    473 
    474   gpuFree(d_input);
    475   gpuFree(d_kernel);
    476   gpuFree(d_out);
    477 }
    478 
    479 template<int DataLayout>
    480 void test_gpu_convolution_2d()
    481 {
    482   Tensor<float, 4, DataLayout> input(74,37,11,137);
    483   Tensor<float, 2, DataLayout> kernel(3,4);
    484   Tensor<float, 4, DataLayout> out(74,35,8,137);
    485   input = input.constant(10.0f) + input.random();
    486   kernel = kernel.constant(7.0f) + kernel.random();
    487 
    488   std::size_t input_bytes = input.size() * sizeof(float);
    489   std::size_t kernel_bytes = kernel.size() * sizeof(float);
    490   std::size_t out_bytes = out.size() * sizeof(float);
    491 
    492   float* d_input;
    493   float* d_kernel;
    494   float* d_out;
    495   gpuMalloc((void**)(&d_input), input_bytes);
    496   gpuMalloc((void**)(&d_kernel), kernel_bytes);
    497   gpuMalloc((void**)(&d_out), out_bytes);
    498 
    499   gpuMemcpy(d_input, input.data(), input_bytes, gpuMemcpyHostToDevice);
    500   gpuMemcpy(d_kernel, kernel.data(), kernel_bytes, gpuMemcpyHostToDevice);
    501 
    502   Eigen::GpuStreamDevice stream;
    503   Eigen::GpuDevice gpu_device(&stream);
    504 
    505   Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_input(d_input,74,37,11,137);
    506   Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > gpu_kernel(d_kernel,3,4);
    507   Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_out(d_out,74,35,8,137);
    508 
    509   Eigen::array<Eigen::DenseIndex, 2> dims(1,2);
    510   gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
    511 
    512   assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
    513   assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
    514 
    515   for (int i = 0; i < 74; ++i) {
    516     for (int j = 0; j < 35; ++j) {
    517       for (int k = 0; k < 8; ++k) {
    518         for (int l = 0; l < 137; ++l) {
    519           const float result = out(i,j,k,l);
    520           const float expected = input(i,j+0,k+0,l) * kernel(0,0) +
    521                                  input(i,j+1,k+0,l) * kernel(1,0) +
    522                                  input(i,j+2,k+0,l) * kernel(2,0) +
    523                                  input(i,j+0,k+1,l) * kernel(0,1) +
    524                                  input(i,j+1,k+1,l) * kernel(1,1) +
    525                                  input(i,j+2,k+1,l) * kernel(2,1) +
    526                                  input(i,j+0,k+2,l) * kernel(0,2) +
    527                                  input(i,j+1,k+2,l) * kernel(1,2) +
    528                                  input(i,j+2,k+2,l) * kernel(2,2) +
    529                                  input(i,j+0,k+3,l) * kernel(0,3) +
    530                                  input(i,j+1,k+3,l) * kernel(1,3) +
    531                                  input(i,j+2,k+3,l) * kernel(2,3);
    532           VERIFY_IS_APPROX(result, expected);
    533         }
    534       }
    535     }
    536   }
    537 
    538   gpuFree(d_input);
    539   gpuFree(d_kernel);
    540   gpuFree(d_out);
    541 }
    542 
    543 template<int DataLayout>
    544 void test_gpu_convolution_3d()
    545 {
    546   Tensor<float, 5, DataLayout> input(Eigen::array<Eigen::DenseIndex, 5>(74,37,11,137,17));
    547   Tensor<float, 3, DataLayout> kernel(3,4,2);
    548   Tensor<float, 5, DataLayout> out(Eigen::array<Eigen::DenseIndex, 5>(74,35,8,136,17));
    549   input = input.constant(10.0f) + input.random();
    550   kernel = kernel.constant(7.0f) + kernel.random();
    551 
    552   std::size_t input_bytes = input.size() * sizeof(float);
    553   std::size_t kernel_bytes = kernel.size() * sizeof(float);
    554   std::size_t out_bytes = out.size() * sizeof(float);
    555 
    556   float* d_input;
    557   float* d_kernel;
    558   float* d_out;
    559   gpuMalloc((void**)(&d_input), input_bytes);
    560   gpuMalloc((void**)(&d_kernel), kernel_bytes);
    561   gpuMalloc((void**)(&d_out), out_bytes);
    562 
    563   gpuMemcpy(d_input, input.data(), input_bytes, gpuMemcpyHostToDevice);
    564   gpuMemcpy(d_kernel, kernel.data(), kernel_bytes, gpuMemcpyHostToDevice);
    565 
    566   Eigen::GpuStreamDevice stream;    
    567   Eigen::GpuDevice gpu_device(&stream);
    568 
    569   Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_input(d_input,74,37,11,137,17);
    570   Eigen::TensorMap<Eigen::Tensor<float, 3, DataLayout> > gpu_kernel(d_kernel,3,4,2);
    571   Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_out(d_out,74,35,8,136,17);
    572 
    573   Eigen::array<Eigen::DenseIndex, 3> dims(1,2,3);
    574   gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
    575 
    576   assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
    577   assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
    578 
    579   for (int i = 0; i < 74; ++i) {
    580     for (int j = 0; j < 35; ++j) {
    581       for (int k = 0; k < 8; ++k) {
    582         for (int l = 0; l < 136; ++l) {
    583           for (int m = 0; m < 17; ++m) {
    584             const float result = out(i,j,k,l,m);
    585             const float expected = input(i,j+0,k+0,l+0,m) * kernel(0,0,0) +
    586                                    input(i,j+1,k+0,l+0,m) * kernel(1,0,0) +
    587                                    input(i,j+2,k+0,l+0,m) * kernel(2,0,0) +
    588                                    input(i,j+0,k+1,l+0,m) * kernel(0,1,0) +
    589                                    input(i,j+1,k+1,l+0,m) * kernel(1,1,0) +
    590                                    input(i,j+2,k+1,l+0,m) * kernel(2,1,0) +
    591                                    input(i,j+0,k+2,l+0,m) * kernel(0,2,0) +
    592                                    input(i,j+1,k+2,l+0,m) * kernel(1,2,0) +
    593                                    input(i,j+2,k+2,l+0,m) * kernel(2,2,0) +
    594                                    input(i,j+0,k+3,l+0,m) * kernel(0,3,0) +
    595                                    input(i,j+1,k+3,l+0,m) * kernel(1,3,0) +
    596                                    input(i,j+2,k+3,l+0,m) * kernel(2,3,0) +
    597                                    input(i,j+0,k+0,l+1,m) * kernel(0,0,1) +
    598                                    input(i,j+1,k+0,l+1,m) * kernel(1,0,1) +
    599                                    input(i,j+2,k+0,l+1,m) * kernel(2,0,1) +
    600                                    input(i,j+0,k+1,l+1,m) * kernel(0,1,1) +
    601                                    input(i,j+1,k+1,l+1,m) * kernel(1,1,1) +
    602                                    input(i,j+2,k+1,l+1,m) * kernel(2,1,1) +
    603                                    input(i,j+0,k+2,l+1,m) * kernel(0,2,1) +
    604                                    input(i,j+1,k+2,l+1,m) * kernel(1,2,1) +
    605                                    input(i,j+2,k+2,l+1,m) * kernel(2,2,1) +
    606                                    input(i,j+0,k+3,l+1,m) * kernel(0,3,1) +
    607                                    input(i,j+1,k+3,l+1,m) * kernel(1,3,1) +
    608                                    input(i,j+2,k+3,l+1,m) * kernel(2,3,1);
    609             VERIFY_IS_APPROX(result, expected);
    610           }
    611         }
    612       }
    613     }
    614   }
    615 
    616   gpuFree(d_input);
    617   gpuFree(d_kernel);
    618   gpuFree(d_out);
    619 }
    620 
    621 
    622 #if EIGEN_GPU_TEST_C99_MATH
    623 template <typename Scalar>
    624 void test_gpu_lgamma(const Scalar stddev)
    625 {
    626   Tensor<Scalar, 2> in(72,97);
    627   in.setRandom();
    628   in *= in.constant(stddev);
    629   Tensor<Scalar, 2> out(72,97);
    630   out.setZero();
    631 
    632   std::size_t bytes = in.size() * sizeof(Scalar);
    633 
    634   Scalar* d_in;
    635   Scalar* d_out;
    636   gpuMalloc((void**)(&d_in), bytes);
    637   gpuMalloc((void**)(&d_out), bytes);
    638 
    639   gpuMemcpy(d_in, in.data(), bytes, gpuMemcpyHostToDevice);
    640 
    641   Eigen::GpuStreamDevice stream;
    642   Eigen::GpuDevice gpu_device(&stream);
    643 
    644   Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_in(d_in, 72, 97);
    645   Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 72, 97);
    646 
    647   gpu_out.device(gpu_device) = gpu_in.lgamma();
    648 
    649   assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
    650   assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
    651 
    652   for (int i = 0; i < 72; ++i) {
    653     for (int j = 0; j < 97; ++j) {
    654       VERIFY_IS_APPROX(out(i,j), (std::lgamma)(in(i,j)));
    655     }
    656   }
    657 
    658   gpuFree(d_in);
    659   gpuFree(d_out);
    660 }
    661 #endif
    662 
    663 template <typename Scalar>
    664 void test_gpu_digamma()
    665 {
    666   Tensor<Scalar, 1> in(7);
    667   Tensor<Scalar, 1> out(7);
    668   Tensor<Scalar, 1> expected_out(7);
    669   out.setZero();
    670 
    671   in(0) = Scalar(1);
    672   in(1) = Scalar(1.5);
    673   in(2) = Scalar(4);
    674   in(3) = Scalar(-10.5);
    675   in(4) = Scalar(10000.5);
    676   in(5) = Scalar(0);
    677   in(6) = Scalar(-1);
    678 
    679   expected_out(0) = Scalar(-0.5772156649015329);
    680   expected_out(1) = Scalar(0.03648997397857645);
    681   expected_out(2) = Scalar(1.2561176684318);
    682   expected_out(3) = Scalar(2.398239129535781);
    683   expected_out(4) = Scalar(9.210340372392849);
    684   expected_out(5) = std::numeric_limits<Scalar>::infinity();
    685   expected_out(6) = std::numeric_limits<Scalar>::infinity();
    686 
    687   std::size_t bytes = in.size() * sizeof(Scalar);
    688 
    689   Scalar* d_in;
    690   Scalar* d_out;
    691   gpuMalloc((void**)(&d_in), bytes);
    692   gpuMalloc((void**)(&d_out), bytes);
    693 
    694   gpuMemcpy(d_in, in.data(), bytes, gpuMemcpyHostToDevice);
    695 
    696   Eigen::GpuStreamDevice stream;
    697   Eigen::GpuDevice gpu_device(&stream);
    698 
    699   Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in(d_in, 7);
    700   Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 7);
    701 
    702   gpu_out.device(gpu_device) = gpu_in.digamma();
    703 
    704   assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
    705   assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
    706 
    707   for (int i = 0; i < 5; ++i) {
    708     VERIFY_IS_APPROX(out(i), expected_out(i));
    709   }
    710   for (int i = 5; i < 7; ++i) {
    711     VERIFY_IS_EQUAL(out(i), expected_out(i));
    712   }
    713 
    714   gpuFree(d_in);
    715   gpuFree(d_out);
    716 }
    717 
    718 template <typename Scalar>
    719 void test_gpu_zeta()
    720 {
    721   Tensor<Scalar, 1> in_x(6);
    722   Tensor<Scalar, 1> in_q(6);
    723   Tensor<Scalar, 1> out(6);
    724   Tensor<Scalar, 1> expected_out(6);
    725   out.setZero();
    726 
    727   in_x(0) = Scalar(1);
    728   in_x(1) = Scalar(1.5);
    729   in_x(2) = Scalar(4);
    730   in_x(3) = Scalar(-10.5);
    731   in_x(4) = Scalar(10000.5);
    732   in_x(5) = Scalar(3);
    733   
    734   in_q(0) = Scalar(1.2345);
    735   in_q(1) = Scalar(2);
    736   in_q(2) = Scalar(1.5);
    737   in_q(3) = Scalar(3);
    738   in_q(4) = Scalar(1.0001);
    739   in_q(5) = Scalar(-2.5);
    740 
    741   expected_out(0) = std::numeric_limits<Scalar>::infinity();
    742   expected_out(1) = Scalar(1.61237534869);
    743   expected_out(2) = Scalar(0.234848505667);
    744   expected_out(3) = Scalar(1.03086757337e-5);
    745   expected_out(4) = Scalar(0.367879440865);
    746   expected_out(5) = Scalar(0.054102025820864097);
    747 
    748   std::size_t bytes = in_x.size() * sizeof(Scalar);
    749 
    750   Scalar* d_in_x;
    751   Scalar* d_in_q;
    752   Scalar* d_out;
    753   gpuMalloc((void**)(&d_in_x), bytes);
    754   gpuMalloc((void**)(&d_in_q), bytes);
    755   gpuMalloc((void**)(&d_out), bytes);
    756 
    757   gpuMemcpy(d_in_x, in_x.data(), bytes, gpuMemcpyHostToDevice);
    758   gpuMemcpy(d_in_q, in_q.data(), bytes, gpuMemcpyHostToDevice);
    759   
    760   Eigen::GpuStreamDevice stream;
    761   Eigen::GpuDevice gpu_device(&stream);
    762 
    763   Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 6);
    764   Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_q(d_in_q, 6);
    765   Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 6);
    766 
    767   gpu_out.device(gpu_device) = gpu_in_x.zeta(gpu_in_q);
    768 
    769   assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
    770   assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
    771 
    772   VERIFY_IS_EQUAL(out(0), expected_out(0));
    773   VERIFY((std::isnan)(out(3)));
    774 
    775   for (int i = 1; i < 6; ++i) {
    776     if (i != 3) {
    777       VERIFY_IS_APPROX(out(i), expected_out(i));
    778     }
    779   }
    780 
    781   gpuFree(d_in_x);
    782   gpuFree(d_in_q);
    783   gpuFree(d_out);
    784 }
    785 
    786 template <typename Scalar>
    787 void test_gpu_polygamma()
    788 {
    789   Tensor<Scalar, 1> in_x(7);
    790   Tensor<Scalar, 1> in_n(7);
    791   Tensor<Scalar, 1> out(7);
    792   Tensor<Scalar, 1> expected_out(7);
    793   out.setZero();
    794 
    795   in_n(0) = Scalar(1);
    796   in_n(1) = Scalar(1);
    797   in_n(2) = Scalar(1);
    798   in_n(3) = Scalar(17);
    799   in_n(4) = Scalar(31);
    800   in_n(5) = Scalar(28);
    801   in_n(6) = Scalar(8);
    802   
    803   in_x(0) = Scalar(2);
    804   in_x(1) = Scalar(3);
    805   in_x(2) = Scalar(25.5);
    806   in_x(3) = Scalar(4.7);
    807   in_x(4) = Scalar(11.8);
    808   in_x(5) = Scalar(17.7);
    809   in_x(6) = Scalar(30.2);
    810 
    811   expected_out(0) = Scalar(0.644934066848);
    812   expected_out(1) = Scalar(0.394934066848);
    813   expected_out(2) = Scalar(0.0399946696496);
    814   expected_out(3) = Scalar(293.334565435);
    815   expected_out(4) = Scalar(0.445487887616);
    816   expected_out(5) = Scalar(-2.47810300902e-07);
    817   expected_out(6) = Scalar(-8.29668781082e-09);
    818 
    819   std::size_t bytes = in_x.size() * sizeof(Scalar);
    820 
    821   Scalar* d_in_x;
    822   Scalar* d_in_n;
    823   Scalar* d_out;
    824   gpuMalloc((void**)(&d_in_x), bytes);
    825   gpuMalloc((void**)(&d_in_n), bytes);
    826   gpuMalloc((void**)(&d_out), bytes);
    827 
    828   gpuMemcpy(d_in_x, in_x.data(), bytes, gpuMemcpyHostToDevice);
    829   gpuMemcpy(d_in_n, in_n.data(), bytes, gpuMemcpyHostToDevice);
    830   
    831   Eigen::GpuStreamDevice stream;
    832   Eigen::GpuDevice gpu_device(&stream);
    833 
    834   Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 7);
    835   Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_n(d_in_n, 7);
    836   Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 7);
    837 
    838   gpu_out.device(gpu_device) = gpu_in_n.polygamma(gpu_in_x);
    839 
    840   assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
    841   assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
    842 
    843   for (int i = 0; i < 7; ++i) {
    844     VERIFY_IS_APPROX(out(i), expected_out(i));
    845   }
    846 
    847   gpuFree(d_in_x);
    848   gpuFree(d_in_n);
    849   gpuFree(d_out);
    850 }
    851 
    852 template <typename Scalar>
    853 void test_gpu_igamma()
    854 {
    855   Tensor<Scalar, 2> a(6, 6);
    856   Tensor<Scalar, 2> x(6, 6);
    857   Tensor<Scalar, 2> out(6, 6);
    858   out.setZero();
    859 
    860   Scalar a_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};
    861   Scalar x_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};
    862 
    863   for (int i = 0; i < 6; ++i) {
    864     for (int j = 0; j < 6; ++j) {
    865       a(i, j) = a_s[i];
    866       x(i, j) = x_s[j];
    867     }
    868   }
    869 
    870   Scalar nan = std::numeric_limits<Scalar>::quiet_NaN();
    871   Scalar igamma_s[][6] = {{0.0, nan, nan, nan, nan, nan},
    872                           {0.0, 0.6321205588285578, 0.7768698398515702,
    873                            0.9816843611112658, 9.999500016666262e-05, 1.0},
    874                           {0.0, 0.4275932955291202, 0.608374823728911,
    875                            0.9539882943107686, 7.522076445089201e-07, 1.0},
    876                           {0.0, 0.01898815687615381, 0.06564245437845008,
    877                            0.5665298796332909, 4.166333347221828e-18, 1.0},
    878                           {0.0, 0.9999780593618628, 0.9999899967080838,
    879                            0.9999996219837988, 0.9991370418689945, 1.0},
    880                           {0.0, 0.0, 0.0, 0.0, 0.0, 0.5042041932513908}};
    881 
    882 
    883 
    884   std::size_t bytes = a.size() * sizeof(Scalar);
    885 
    886   Scalar* d_a;
    887   Scalar* d_x;
    888   Scalar* d_out;
    889   assert(gpuMalloc((void**)(&d_a), bytes) == gpuSuccess);
    890   assert(gpuMalloc((void**)(&d_x), bytes) == gpuSuccess);
    891   assert(gpuMalloc((void**)(&d_out), bytes) == gpuSuccess);
    892 
    893   gpuMemcpy(d_a, a.data(), bytes, gpuMemcpyHostToDevice);
    894   gpuMemcpy(d_x, x.data(), bytes, gpuMemcpyHostToDevice);
    895 
    896   Eigen::GpuStreamDevice stream;
    897   Eigen::GpuDevice gpu_device(&stream);
    898 
    899   Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_a(d_a, 6, 6);
    900   Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_x(d_x, 6, 6);
    901   Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 6, 6);
    902 
    903   gpu_out.device(gpu_device) = gpu_a.igamma(gpu_x);
    904 
    905   assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
    906   assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
    907 
    908   for (int i = 0; i < 6; ++i) {
    909     for (int j = 0; j < 6; ++j) {
    910       if ((std::isnan)(igamma_s[i][j])) {
    911         VERIFY((std::isnan)(out(i, j)));
    912       } else {
    913         VERIFY_IS_APPROX(out(i, j), igamma_s[i][j]);
    914       }
    915     }
    916   }
    917 
    918   gpuFree(d_a);
    919   gpuFree(d_x);
    920   gpuFree(d_out);
    921 }
    922 
    923 template <typename Scalar>
    924 void test_gpu_igammac()
    925 {
    926   Tensor<Scalar, 2> a(6, 6);
    927   Tensor<Scalar, 2> x(6, 6);
    928   Tensor<Scalar, 2> out(6, 6);
    929   out.setZero();
    930 
    931   Scalar a_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};
    932   Scalar x_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};
    933 
    934   for (int i = 0; i < 6; ++i) {
    935     for (int j = 0; j < 6; ++j) {
    936       a(i, j) = a_s[i];
    937       x(i, j) = x_s[j];
    938     }
    939   }
    940 
    941   Scalar nan = std::numeric_limits<Scalar>::quiet_NaN();
    942   Scalar igammac_s[][6] = {{nan, nan, nan, nan, nan, nan},
    943                            {1.0, 0.36787944117144233, 0.22313016014842982,
    944                             0.018315638888734182, 0.9999000049998333, 0.0},
    945                            {1.0, 0.5724067044708798, 0.3916251762710878,
    946                             0.04601170568923136, 0.9999992477923555, 0.0},
    947                            {1.0, 0.9810118431238462, 0.9343575456215499,
    948                             0.4334701203667089, 1.0, 0.0},
    949                            {1.0, 2.1940638138146658e-05, 1.0003291916285e-05,
    950                             3.7801620118431334e-07, 0.0008629581310054535,
    951                             0.0},
    952                            {1.0, 1.0, 1.0, 1.0, 1.0, 0.49579580674813944}};
    953 
    954   std::size_t bytes = a.size() * sizeof(Scalar);
    955 
    956   Scalar* d_a;
    957   Scalar* d_x;
    958   Scalar* d_out;
    959   gpuMalloc((void**)(&d_a), bytes);
    960   gpuMalloc((void**)(&d_x), bytes);
    961   gpuMalloc((void**)(&d_out), bytes);
    962 
    963   gpuMemcpy(d_a, a.data(), bytes, gpuMemcpyHostToDevice);
    964   gpuMemcpy(d_x, x.data(), bytes, gpuMemcpyHostToDevice);
    965 
    966   Eigen::GpuStreamDevice stream;
    967   Eigen::GpuDevice gpu_device(&stream);
    968 
    969   Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_a(d_a, 6, 6);
    970   Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_x(d_x, 6, 6);
    971   Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 6, 6);
    972 
    973   gpu_out.device(gpu_device) = gpu_a.igammac(gpu_x);
    974 
    975   assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
    976   assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
    977 
    978   for (int i = 0; i < 6; ++i) {
    979     for (int j = 0; j < 6; ++j) {
    980       if ((std::isnan)(igammac_s[i][j])) {
    981         VERIFY((std::isnan)(out(i, j)));
    982       } else {
    983         VERIFY_IS_APPROX(out(i, j), igammac_s[i][j]);
    984       }
    985     }
    986   }
    987 
    988   gpuFree(d_a);
    989   gpuFree(d_x);
    990   gpuFree(d_out);
    991 }
    992 
    993 #if EIGEN_GPU_TEST_C99_MATH
    994 template <typename Scalar>
    995 void test_gpu_erf(const Scalar stddev)
    996 {
    997   Tensor<Scalar, 2> in(72,97);
    998   in.setRandom();
    999   in *= in.constant(stddev);
   1000   Tensor<Scalar, 2> out(72,97);
   1001   out.setZero();
   1002 
   1003   std::size_t bytes = in.size() * sizeof(Scalar);
   1004 
   1005   Scalar* d_in;
   1006   Scalar* d_out;
   1007   assert(gpuMalloc((void**)(&d_in), bytes) == gpuSuccess);
   1008   assert(gpuMalloc((void**)(&d_out), bytes) == gpuSuccess);
   1009 
   1010   gpuMemcpy(d_in, in.data(), bytes, gpuMemcpyHostToDevice);
   1011 
   1012   Eigen::GpuStreamDevice stream;
   1013   Eigen::GpuDevice gpu_device(&stream);
   1014 
   1015   Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_in(d_in, 72, 97);
   1016   Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 72, 97);
   1017 
   1018   gpu_out.device(gpu_device) = gpu_in.erf();
   1019 
   1020   assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
   1021   assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
   1022 
   1023   for (int i = 0; i < 72; ++i) {
   1024     for (int j = 0; j < 97; ++j) {
   1025       VERIFY_IS_APPROX(out(i,j), (std::erf)(in(i,j)));
   1026     }
   1027   }
   1028 
   1029   gpuFree(d_in);
   1030   gpuFree(d_out);
   1031 }
   1032 
   1033 template <typename Scalar>
   1034 void test_gpu_erfc(const Scalar stddev)
   1035 {
   1036   Tensor<Scalar, 2> in(72,97);
   1037   in.setRandom();
   1038   in *= in.constant(stddev);
   1039   Tensor<Scalar, 2> out(72,97);
   1040   out.setZero();
   1041 
   1042   std::size_t bytes = in.size() * sizeof(Scalar);
   1043 
   1044   Scalar* d_in;
   1045   Scalar* d_out;
   1046   gpuMalloc((void**)(&d_in), bytes);
   1047   gpuMalloc((void**)(&d_out), bytes);
   1048 
   1049   gpuMemcpy(d_in, in.data(), bytes, gpuMemcpyHostToDevice);
   1050 
   1051   Eigen::GpuStreamDevice stream;
   1052   Eigen::GpuDevice gpu_device(&stream);
   1053 
   1054   Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_in(d_in, 72, 97);
   1055   Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 72, 97);
   1056 
   1057   gpu_out.device(gpu_device) = gpu_in.erfc();
   1058 
   1059   assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
   1060   assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
   1061 
   1062   for (int i = 0; i < 72; ++i) {
   1063     for (int j = 0; j < 97; ++j) {
   1064       VERIFY_IS_APPROX(out(i,j), (std::erfc)(in(i,j)));
   1065     }
   1066   }
   1067 
   1068   gpuFree(d_in);
   1069   gpuFree(d_out);
   1070 }
   1071 #endif
   1072 template <typename Scalar>
   1073 void test_gpu_ndtri()
   1074 {
   1075   Tensor<Scalar, 1> in_x(8);
   1076   Tensor<Scalar, 1> out(8);
   1077   Tensor<Scalar, 1> expected_out(8);
   1078   out.setZero();
   1079 
   1080   in_x(0) = Scalar(1);
   1081   in_x(1) = Scalar(0.);
   1082   in_x(2) = Scalar(0.5);
   1083   in_x(3) = Scalar(0.2);
   1084   in_x(4) = Scalar(0.8);
   1085   in_x(5) = Scalar(0.9);
   1086   in_x(6) = Scalar(0.1);
   1087   in_x(7) = Scalar(0.99);
   1088   in_x(8) = Scalar(0.01);
   1089 
   1090   expected_out(0) = std::numeric_limits<Scalar>::infinity();
   1091   expected_out(1) = -std::numeric_limits<Scalar>::infinity();
   1092   expected_out(2) = Scalar(0.0);
   1093   expected_out(3) = Scalar(-0.8416212335729142);
   1094   expected_out(4) = Scalar(0.8416212335729142);
   1095   expected_out(5) = Scalar(1.2815515655446004);
   1096   expected_out(6) = Scalar(-1.2815515655446004);
   1097   expected_out(7) = Scalar(2.3263478740408408);
   1098   expected_out(8) = Scalar(-2.3263478740408408);
   1099 
   1100   std::size_t bytes = in_x.size() * sizeof(Scalar);
   1101 
   1102   Scalar* d_in_x;
   1103   Scalar* d_out;
   1104   gpuMalloc((void**)(&d_in_x), bytes);
   1105   gpuMalloc((void**)(&d_out), bytes);
   1106 
   1107   gpuMemcpy(d_in_x, in_x.data(), bytes, gpuMemcpyHostToDevice);
   1108 
   1109   Eigen::GpuStreamDevice stream;
   1110   Eigen::GpuDevice gpu_device(&stream);
   1111 
   1112   Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 6);
   1113   Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 6);
   1114 
   1115   gpu_out.device(gpu_device) = gpu_in_x.ndtri();
   1116 
   1117   assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
   1118   assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
   1119 
   1120   VERIFY_IS_EQUAL(out(0), expected_out(0));
   1121   VERIFY((std::isnan)(out(3)));
   1122 
   1123   for (int i = 1; i < 6; ++i) {
   1124     if (i != 3) {
   1125       VERIFY_IS_APPROX(out(i), expected_out(i));
   1126     }
   1127   }
   1128 
   1129   gpuFree(d_in_x);
   1130   gpuFree(d_out);
   1131 }
   1132 
   1133 template <typename Scalar>
   1134 void test_gpu_betainc()
   1135 {
   1136   Tensor<Scalar, 1> in_x(125);
   1137   Tensor<Scalar, 1> in_a(125);
   1138   Tensor<Scalar, 1> in_b(125);
   1139   Tensor<Scalar, 1> out(125);
   1140   Tensor<Scalar, 1> expected_out(125);
   1141   out.setZero();
   1142 
   1143   Scalar nan = std::numeric_limits<Scalar>::quiet_NaN();
   1144 
   1145   Array<Scalar, 1, Dynamic> x(125);
   1146   Array<Scalar, 1, Dynamic> a(125);
   1147   Array<Scalar, 1, Dynamic> b(125);
   1148   Array<Scalar, 1, Dynamic> v(125);
   1149 
   1150   a << 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
   1151       0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
   1152       0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
   1153       0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
   1154       0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
   1155       0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
   1156       0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
   1157       0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
   1158       0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
   1159       0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
   1160       0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999,
   1161       0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999,
   1162       0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 31.62177660168379,
   1163       31.62177660168379, 31.62177660168379, 31.62177660168379,
   1164       31.62177660168379, 31.62177660168379, 31.62177660168379,
   1165       31.62177660168379, 31.62177660168379, 31.62177660168379,
   1166       31.62177660168379, 31.62177660168379, 31.62177660168379,
   1167       31.62177660168379, 31.62177660168379, 31.62177660168379,
   1168       31.62177660168379, 31.62177660168379, 31.62177660168379,
   1169       31.62177660168379, 31.62177660168379, 31.62177660168379,
   1170       31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999,
   1171       999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999,
   1172       999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999,
   1173       999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999;
   1174 
   1175   b << 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379, 0.03062277660168379,
   1176       0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.999,
   1177       0.999, 0.999, 0.999, 0.999, 31.62177660168379, 31.62177660168379,
   1178       31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999,
   1179       999.999, 999.999, 999.999, 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379,
   1180       0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
   1181       0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999, 31.62177660168379,
   1182       31.62177660168379, 31.62177660168379, 31.62177660168379,
   1183       31.62177660168379, 999.999, 999.999, 999.999, 999.999, 999.999, 0.0, 0.0,
   1184       0.0, 0.0, 0.0, 0.03062277660168379, 0.03062277660168379,
   1185       0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.999,
   1186       0.999, 0.999, 0.999, 0.999, 31.62177660168379, 31.62177660168379,
   1187       31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999,
   1188       999.999, 999.999, 999.999, 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379,
   1189       0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
   1190       0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999, 31.62177660168379,
   1191       31.62177660168379, 31.62177660168379, 31.62177660168379,
   1192       31.62177660168379, 999.999, 999.999, 999.999, 999.999, 999.999, 0.0, 0.0,
   1193       0.0, 0.0, 0.0, 0.03062277660168379, 0.03062277660168379,
   1194       0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.999,
   1195       0.999, 0.999, 0.999, 0.999, 31.62177660168379, 31.62177660168379,
   1196       31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999,
   1197       999.999, 999.999, 999.999;
   1198 
   1199   x << -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8,
   1200       1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5,
   1201       0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2,
   1202       0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1,
   1203       0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1,
   1204       -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8,
   1205       1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5,
   1206       0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2,
   1207       0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1;
   1208 
   1209   v << nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
   1210       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
   1211       nan, nan, 0.47972119876364683, 0.5, 0.5202788012363533, nan, nan,
   1212       0.9518683957740043, 0.9789663010413743, 0.9931729188073435, nan, nan,
   1213       0.999995949033062, 0.9999999999993698, 0.9999999999999999, nan, nan,
   1214       0.9999999999999999, 0.9999999999999999, 0.9999999999999999, nan, nan, nan,
   1215       nan, nan, nan, nan, 0.006827081192655869, 0.0210336989586256,
   1216       0.04813160422599567, nan, nan, 0.20014344256217678, 0.5000000000000001,
   1217       0.7998565574378232, nan, nan, 0.9991401428435834, 0.999999999698403,
   1218       0.9999999999999999, nan, nan, 0.9999999999999999, 0.9999999999999999,
   1219       0.9999999999999999, nan, nan, nan, nan, nan, nan, nan,
   1220       1.0646600232370887e-25, 6.301722877826246e-13, 4.050966937974938e-06, nan,
   1221       nan, 7.864342668429763e-23, 3.015969667594166e-10, 0.0008598571564165444,
   1222       nan, nan, 6.031987710123844e-08, 0.5000000000000007, 0.9999999396801229,
   1223       nan, nan, 0.9999999999999999, 0.9999999999999999, 0.9999999999999999, nan,
   1224       nan, nan, nan, nan, nan, nan, 0.0, 7.029920380986636e-306,
   1225       2.2450728208591345e-101, nan, nan, 0.0, 9.275871147869727e-302,
   1226       1.2232913026152827e-97, nan, nan, 0.0, 3.0891393081932924e-252,
   1227       2.9303043666183996e-60, nan, nan, 2.248913486879199e-196,
   1228       0.5000000000004947, 0.9999999999999999, nan;
   1229 
   1230   for (int i = 0; i < 125; ++i) {
   1231     in_x(i) = x(i);
   1232     in_a(i) = a(i);
   1233     in_b(i) = b(i);
   1234     expected_out(i) = v(i);
   1235   }
   1236 
   1237   std::size_t bytes = in_x.size() * sizeof(Scalar);
   1238 
   1239   Scalar* d_in_x;
   1240   Scalar* d_in_a;
   1241   Scalar* d_in_b;
   1242   Scalar* d_out;
   1243   gpuMalloc((void**)(&d_in_x), bytes);
   1244   gpuMalloc((void**)(&d_in_a), bytes);
   1245   gpuMalloc((void**)(&d_in_b), bytes);
   1246   gpuMalloc((void**)(&d_out), bytes);
   1247 
   1248   gpuMemcpy(d_in_x, in_x.data(), bytes, gpuMemcpyHostToDevice);
   1249   gpuMemcpy(d_in_a, in_a.data(), bytes, gpuMemcpyHostToDevice);
   1250   gpuMemcpy(d_in_b, in_b.data(), bytes, gpuMemcpyHostToDevice);
   1251 
   1252   Eigen::GpuStreamDevice stream;
   1253   Eigen::GpuDevice gpu_device(&stream);
   1254 
   1255   Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 125);
   1256   Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_a(d_in_a, 125);
   1257   Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_b(d_in_b, 125);
   1258   Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 125);
   1259 
   1260   gpu_out.device(gpu_device) = betainc(gpu_in_a, gpu_in_b, gpu_in_x);
   1261 
   1262   assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
   1263   assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
   1264 
   1265   for (int i = 1; i < 125; ++i) {
   1266     if ((std::isnan)(expected_out(i))) {
   1267       VERIFY((std::isnan)(out(i)));
   1268     } else {
   1269       VERIFY_IS_APPROX(out(i), expected_out(i));
   1270     }
   1271   }
   1272 
   1273   gpuFree(d_in_x);
   1274   gpuFree(d_in_a);
   1275   gpuFree(d_in_b);
   1276   gpuFree(d_out);
   1277 }
   1278 
   1279 template <typename Scalar>
   1280 void test_gpu_i0e()
   1281 {
   1282   Tensor<Scalar, 1> in_x(21);
   1283   Tensor<Scalar, 1> out(21);
   1284   Tensor<Scalar, 1> expected_out(21);
   1285   out.setZero();
   1286 
   1287   Array<Scalar, 1, Dynamic> in_x_array(21);
   1288   Array<Scalar, 1, Dynamic> expected_out_array(21);
   1289 
   1290   in_x_array << -20.0, -18.0, -16.0, -14.0, -12.0, -10.0, -8.0, -6.0, -4.0,
   1291       -2.0, 0.0, 2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0, 18.0, 20.0;
   1292 
   1293   expected_out_array << 0.0897803118848, 0.0947062952128, 0.100544127361,
   1294       0.107615251671, 0.116426221213, 0.127833337163, 0.143431781857,
   1295       0.16665743264, 0.207001921224, 0.308508322554, 1.0, 0.308508322554,
   1296       0.207001921224, 0.16665743264, 0.143431781857, 0.127833337163,
   1297       0.116426221213, 0.107615251671, 0.100544127361, 0.0947062952128,
   1298       0.0897803118848;
   1299 
   1300   for (int i = 0; i < 21; ++i) {
   1301     in_x(i) = in_x_array(i);
   1302     expected_out(i) = expected_out_array(i);
   1303   }
   1304 
   1305   std::size_t bytes = in_x.size() * sizeof(Scalar);
   1306 
   1307   Scalar* d_in;
   1308   Scalar* d_out;
   1309   gpuMalloc((void**)(&d_in), bytes);
   1310   gpuMalloc((void**)(&d_out), bytes);
   1311 
   1312   gpuMemcpy(d_in, in_x.data(), bytes, gpuMemcpyHostToDevice);
   1313 
   1314   Eigen::GpuStreamDevice stream;
   1315   Eigen::GpuDevice gpu_device(&stream);
   1316 
   1317   Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in(d_in, 21);
   1318   Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 21);
   1319 
   1320   gpu_out.device(gpu_device) = gpu_in.bessel_i0e();
   1321 
   1322   assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost,
   1323                          gpu_device.stream()) == gpuSuccess);
   1324   assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
   1325 
   1326   for (int i = 0; i < 21; ++i) {
   1327     VERIFY_IS_APPROX(out(i), expected_out(i));
   1328   }
   1329 
   1330   gpuFree(d_in);
   1331   gpuFree(d_out);
   1332 }
   1333 
   1334 template <typename Scalar>
   1335 void test_gpu_i1e()
   1336 {
   1337   Tensor<Scalar, 1> in_x(21);
   1338   Tensor<Scalar, 1> out(21);
   1339   Tensor<Scalar, 1> expected_out(21);
   1340   out.setZero();
   1341 
   1342   Array<Scalar, 1, Dynamic> in_x_array(21);
   1343   Array<Scalar, 1, Dynamic> expected_out_array(21);
   1344 
   1345   in_x_array << -20.0, -18.0, -16.0, -14.0, -12.0, -10.0, -8.0, -6.0, -4.0,
   1346       -2.0, 0.0, 2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0, 18.0, 20.0;
   1347 
   1348   expected_out_array << -0.0875062221833, -0.092036796872, -0.0973496147565,
   1349       -0.103697667463, -0.11146429929, -0.121262681384, -0.134142493293,
   1350       -0.152051459309, -0.178750839502, -0.215269289249, 0.0, 0.215269289249,
   1351       0.178750839502, 0.152051459309, 0.134142493293, 0.121262681384,
   1352       0.11146429929, 0.103697667463, 0.0973496147565, 0.092036796872,
   1353       0.0875062221833;
   1354 
   1355   for (int i = 0; i < 21; ++i) {
   1356     in_x(i) = in_x_array(i);
   1357     expected_out(i) = expected_out_array(i);
   1358   }
   1359 
   1360   std::size_t bytes = in_x.size() * sizeof(Scalar);
   1361 
   1362   Scalar* d_in;
   1363   Scalar* d_out;
   1364   gpuMalloc((void**)(&d_in), bytes);
   1365   gpuMalloc((void**)(&d_out), bytes);
   1366 
   1367   gpuMemcpy(d_in, in_x.data(), bytes, gpuMemcpyHostToDevice);
   1368 
   1369   Eigen::GpuStreamDevice stream;
   1370   Eigen::GpuDevice gpu_device(&stream);
   1371 
   1372   Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in(d_in, 21);
   1373   Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 21);
   1374 
   1375   gpu_out.device(gpu_device) = gpu_in.bessel_i1e();
   1376 
   1377   assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost,
   1378                          gpu_device.stream()) == gpuSuccess);
   1379   assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
   1380 
   1381   for (int i = 0; i < 21; ++i) {
   1382     VERIFY_IS_APPROX(out(i), expected_out(i));
   1383   }
   1384 
   1385   gpuFree(d_in);
   1386   gpuFree(d_out);
   1387 }
   1388 
   1389 template <typename Scalar>
   1390 void test_gpu_igamma_der_a()
   1391 {
   1392   Tensor<Scalar, 1> in_x(30);
   1393   Tensor<Scalar, 1> in_a(30);
   1394   Tensor<Scalar, 1> out(30);
   1395   Tensor<Scalar, 1> expected_out(30);
   1396   out.setZero();
   1397 
   1398   Array<Scalar, 1, Dynamic> in_a_array(30);
   1399   Array<Scalar, 1, Dynamic> in_x_array(30);
   1400   Array<Scalar, 1, Dynamic> expected_out_array(30);
   1401 
   1402   // See special_functions.cpp for the Python code that generates the test data.
   1403 
   1404   in_a_array << 0.01, 0.01, 0.01, 0.01, 0.01, 0.1, 0.1, 0.1, 0.1, 0.1, 1.0, 1.0,
   1405       1.0, 1.0, 1.0, 10.0, 10.0, 10.0, 10.0, 10.0, 100.0, 100.0, 100.0, 100.0,
   1406       100.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0;
   1407 
   1408   in_x_array << 1.25668890405e-26, 1.17549435082e-38, 1.20938905072e-05,
   1409       1.17549435082e-38, 1.17549435082e-38, 5.66572070696e-16, 0.0132865061065,
   1410       0.0200034203853, 6.29263709118e-17, 1.37160367764e-06, 0.333412038288,
   1411       1.18135687766, 0.580629033777, 0.170631439426, 0.786686768458,
   1412       7.63873279537, 13.1944344379, 11.896042354, 10.5830172417, 10.5020942233,
   1413       92.8918587747, 95.003720371, 86.3715926467, 96.0330217672, 82.6389930677,
   1414       968.702906754, 969.463546828, 1001.79726022, 955.047416547, 1044.27458568;
   1415 
   1416   expected_out_array << -32.7256441441, -36.4394150514, -9.66467612263,
   1417       -36.4394150514, -36.4394150514, -1.0891900302, -2.66351229645,
   1418       -2.48666868596, -0.929700494428, -3.56327722764, -0.455320135314,
   1419       -0.391437214323, -0.491352055991, -0.350454834292, -0.471773162921,
   1420       -0.104084440522, -0.0723646747909, -0.0992828975532, -0.121638215446,
   1421       -0.122619605294, -0.0317670267286, -0.0359974812869, -0.0154359225363,
   1422       -0.0375775365921, -0.00794899153653, -0.00777303219211, -0.00796085782042,
   1423       -0.0125850719397, -0.00455500206958, -0.00476436993148;
   1424 
   1425   for (int i = 0; i < 30; ++i) {
   1426     in_x(i) = in_x_array(i);
   1427     in_a(i) = in_a_array(i);
   1428     expected_out(i) = expected_out_array(i);
   1429   }
   1430 
   1431   std::size_t bytes = in_x.size() * sizeof(Scalar);
   1432 
   1433   Scalar* d_a;
   1434   Scalar* d_x;
   1435   Scalar* d_out;
   1436   gpuMalloc((void**)(&d_a), bytes);
   1437   gpuMalloc((void**)(&d_x), bytes);
   1438   gpuMalloc((void**)(&d_out), bytes);
   1439 
   1440   gpuMemcpy(d_a, in_a.data(), bytes, gpuMemcpyHostToDevice);
   1441   gpuMemcpy(d_x, in_x.data(), bytes, gpuMemcpyHostToDevice);
   1442 
   1443   Eigen::GpuStreamDevice stream;
   1444   Eigen::GpuDevice gpu_device(&stream);
   1445 
   1446   Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_a(d_a, 30);
   1447   Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_x(d_x, 30);
   1448   Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 30);
   1449 
   1450   gpu_out.device(gpu_device) = gpu_a.igamma_der_a(gpu_x);
   1451 
   1452   assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost,
   1453                          gpu_device.stream()) == gpuSuccess);
   1454   assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
   1455 
   1456   for (int i = 0; i < 30; ++i) {
   1457     VERIFY_IS_APPROX(out(i), expected_out(i));
   1458   }
   1459 
   1460   gpuFree(d_a);
   1461   gpuFree(d_x);
   1462   gpuFree(d_out);
   1463 }
   1464 
   1465 template <typename Scalar>
   1466 void test_gpu_gamma_sample_der_alpha()
   1467 {
   1468   Tensor<Scalar, 1> in_alpha(30);
   1469   Tensor<Scalar, 1> in_sample(30);
   1470   Tensor<Scalar, 1> out(30);
   1471   Tensor<Scalar, 1> expected_out(30);
   1472   out.setZero();
   1473 
   1474   Array<Scalar, 1, Dynamic> in_alpha_array(30);
   1475   Array<Scalar, 1, Dynamic> in_sample_array(30);
   1476   Array<Scalar, 1, Dynamic> expected_out_array(30);
   1477 
   1478   // See special_functions.cpp for the Python code that generates the test data.
   1479 
   1480   in_alpha_array << 0.01, 0.01, 0.01, 0.01, 0.01, 0.1, 0.1, 0.1, 0.1, 0.1, 1.0,
   1481       1.0, 1.0, 1.0, 1.0, 10.0, 10.0, 10.0, 10.0, 10.0, 100.0, 100.0, 100.0,
   1482       100.0, 100.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0;
   1483 
   1484   in_sample_array << 1.25668890405e-26, 1.17549435082e-38, 1.20938905072e-05,
   1485       1.17549435082e-38, 1.17549435082e-38, 5.66572070696e-16, 0.0132865061065,
   1486       0.0200034203853, 6.29263709118e-17, 1.37160367764e-06, 0.333412038288,
   1487       1.18135687766, 0.580629033777, 0.170631439426, 0.786686768458,
   1488       7.63873279537, 13.1944344379, 11.896042354, 10.5830172417, 10.5020942233,
   1489       92.8918587747, 95.003720371, 86.3715926467, 96.0330217672, 82.6389930677,
   1490       968.702906754, 969.463546828, 1001.79726022, 955.047416547, 1044.27458568;
   1491 
   1492   expected_out_array << 7.42424742367e-23, 1.02004297287e-34, 0.0130155240738,
   1493       1.02004297287e-34, 1.02004297287e-34, 1.96505168277e-13, 0.525575786243,
   1494       0.713903991771, 2.32077561808e-14, 0.000179348049886, 0.635500453302,
   1495       1.27561284917, 0.878125852156, 0.41565819538, 1.03606488534,
   1496       0.885964824887, 1.16424049334, 1.10764479598, 1.04590810812,
   1497       1.04193666963, 0.965193152414, 0.976217589464, 0.93008035061,
   1498       0.98153216096, 0.909196397698, 0.98434963993, 0.984738050206,
   1499       1.00106492525, 0.97734200649, 1.02198794179;
   1500 
   1501   for (int i = 0; i < 30; ++i) {
   1502     in_alpha(i) = in_alpha_array(i);
   1503     in_sample(i) = in_sample_array(i);
   1504     expected_out(i) = expected_out_array(i);
   1505   }
   1506 
   1507   std::size_t bytes = in_alpha.size() * sizeof(Scalar);
   1508 
   1509   Scalar* d_alpha;
   1510   Scalar* d_sample;
   1511   Scalar* d_out;
   1512   gpuMalloc((void**)(&d_alpha), bytes);
   1513   gpuMalloc((void**)(&d_sample), bytes);
   1514   gpuMalloc((void**)(&d_out), bytes);
   1515 
   1516   gpuMemcpy(d_alpha, in_alpha.data(), bytes, gpuMemcpyHostToDevice);
   1517   gpuMemcpy(d_sample, in_sample.data(), bytes, gpuMemcpyHostToDevice);
   1518 
   1519   Eigen::GpuStreamDevice stream;
   1520   Eigen::GpuDevice gpu_device(&stream);
   1521 
   1522   Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_alpha(d_alpha, 30);
   1523   Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_sample(d_sample, 30);
   1524   Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 30);
   1525 
   1526   gpu_out.device(gpu_device) = gpu_alpha.gamma_sample_der_alpha(gpu_sample);
   1527 
   1528   assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost,
   1529                          gpu_device.stream()) == gpuSuccess);
   1530   assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
   1531 
   1532   for (int i = 0; i < 30; ++i) {
   1533     VERIFY_IS_APPROX(out(i), expected_out(i));
   1534   }
   1535 
   1536   gpuFree(d_alpha);
   1537   gpuFree(d_sample);
   1538   gpuFree(d_out);
   1539 }
   1540 
   1541 EIGEN_DECLARE_TEST(cxx11_tensor_gpu)
   1542 {
   1543   CALL_SUBTEST_1(test_gpu_nullary());
   1544   CALL_SUBTEST_1(test_gpu_elementwise_small());
   1545   CALL_SUBTEST_1(test_gpu_elementwise());
   1546   CALL_SUBTEST_1(test_gpu_props());
   1547   CALL_SUBTEST_1(test_gpu_reduction());
   1548   CALL_SUBTEST_2(test_gpu_contraction<ColMajor>());
   1549   CALL_SUBTEST_2(test_gpu_contraction<RowMajor>());
   1550   CALL_SUBTEST_3(test_gpu_convolution_1d<ColMajor>());
   1551   CALL_SUBTEST_3(test_gpu_convolution_1d<RowMajor>());
   1552   CALL_SUBTEST_3(test_gpu_convolution_inner_dim_col_major_1d());
   1553   CALL_SUBTEST_3(test_gpu_convolution_inner_dim_row_major_1d());
   1554   CALL_SUBTEST_3(test_gpu_convolution_2d<ColMajor>());
   1555   CALL_SUBTEST_3(test_gpu_convolution_2d<RowMajor>());
   1556 #if !defined(EIGEN_USE_HIP)
   1557 // disable these tests on HIP for now.
   1558 // they hang..need to investigate and fix
   1559   CALL_SUBTEST_3(test_gpu_convolution_3d<ColMajor>());
   1560   CALL_SUBTEST_3(test_gpu_convolution_3d<RowMajor>());
   1561 #endif
   1562 
   1563 #if EIGEN_GPU_TEST_C99_MATH
   1564   // std::erf, std::erfc, and so on where only added in c++11. We use them
   1565   // as a golden reference to validate the results produced by Eigen. Therefore
   1566   // we can only run these tests if we use a c++11 compiler.
   1567   CALL_SUBTEST_4(test_gpu_lgamma<float>(1.0f));
   1568   CALL_SUBTEST_4(test_gpu_lgamma<float>(100.0f));
   1569   CALL_SUBTEST_4(test_gpu_lgamma<float>(0.01f));
   1570   CALL_SUBTEST_4(test_gpu_lgamma<float>(0.001f));
   1571 
   1572   CALL_SUBTEST_4(test_gpu_lgamma<double>(1.0));
   1573   CALL_SUBTEST_4(test_gpu_lgamma<double>(100.0));
   1574   CALL_SUBTEST_4(test_gpu_lgamma<double>(0.01));
   1575   CALL_SUBTEST_4(test_gpu_lgamma<double>(0.001));
   1576 
   1577   CALL_SUBTEST_4(test_gpu_erf<float>(1.0f));
   1578   CALL_SUBTEST_4(test_gpu_erf<float>(100.0f));
   1579   CALL_SUBTEST_4(test_gpu_erf<float>(0.01f));
   1580   CALL_SUBTEST_4(test_gpu_erf<float>(0.001f));
   1581 
   1582   CALL_SUBTEST_4(test_gpu_erfc<float>(1.0f));
   1583   // CALL_SUBTEST(test_gpu_erfc<float>(100.0f));
   1584   CALL_SUBTEST_4(test_gpu_erfc<float>(5.0f)); // GPU erfc lacks precision for large inputs
   1585   CALL_SUBTEST_4(test_gpu_erfc<float>(0.01f));
   1586   CALL_SUBTEST_4(test_gpu_erfc<float>(0.001f));
   1587 
   1588   CALL_SUBTEST_4(test_gpu_erf<double>(1.0));
   1589   CALL_SUBTEST_4(test_gpu_erf<double>(100.0));
   1590   CALL_SUBTEST_4(test_gpu_erf<double>(0.01));
   1591   CALL_SUBTEST_4(test_gpu_erf<double>(0.001));
   1592 
   1593   CALL_SUBTEST_4(test_gpu_erfc<double>(1.0));
   1594   // CALL_SUBTEST(test_gpu_erfc<double>(100.0));
   1595   CALL_SUBTEST_4(test_gpu_erfc<double>(5.0)); // GPU erfc lacks precision for large inputs
   1596   CALL_SUBTEST_4(test_gpu_erfc<double>(0.01));
   1597   CALL_SUBTEST_4(test_gpu_erfc<double>(0.001));
   1598 
   1599 #if !defined(EIGEN_USE_HIP)
   1600 // disable these tests on HIP for now.
   1601 
   1602   CALL_SUBTEST_5(test_gpu_ndtri<float>());
   1603   CALL_SUBTEST_5(test_gpu_ndtri<double>());
   1604 
   1605   CALL_SUBTEST_5(test_gpu_digamma<float>());
   1606   CALL_SUBTEST_5(test_gpu_digamma<double>());
   1607 
   1608   CALL_SUBTEST_5(test_gpu_polygamma<float>());
   1609   CALL_SUBTEST_5(test_gpu_polygamma<double>());
   1610 
   1611   CALL_SUBTEST_5(test_gpu_zeta<float>());
   1612   CALL_SUBTEST_5(test_gpu_zeta<double>());
   1613 #endif
   1614 
   1615   CALL_SUBTEST_5(test_gpu_igamma<float>());
   1616   CALL_SUBTEST_5(test_gpu_igammac<float>());
   1617 
   1618   CALL_SUBTEST_5(test_gpu_igamma<double>());
   1619   CALL_SUBTEST_5(test_gpu_igammac<double>());
   1620 
   1621 #if !defined(EIGEN_USE_HIP)
   1622 // disable these tests on HIP for now.
   1623   CALL_SUBTEST_6(test_gpu_betainc<float>());
   1624   CALL_SUBTEST_6(test_gpu_betainc<double>());
   1625 
   1626   CALL_SUBTEST_6(test_gpu_i0e<float>());
   1627   CALL_SUBTEST_6(test_gpu_i0e<double>());
   1628 
   1629   CALL_SUBTEST_6(test_gpu_i1e<float>());
   1630   CALL_SUBTEST_6(test_gpu_i1e<double>());
   1631 
   1632   CALL_SUBTEST_6(test_gpu_i1e<float>());
   1633   CALL_SUBTEST_6(test_gpu_i1e<double>());
   1634 
   1635   CALL_SUBTEST_6(test_gpu_igamma_der_a<float>());
   1636   CALL_SUBTEST_6(test_gpu_igamma_der_a<double>());
   1637 
   1638   CALL_SUBTEST_6(test_gpu_gamma_sample_der_alpha<float>());
   1639   CALL_SUBTEST_6(test_gpu_gamma_sample_der_alpha<double>());
   1640 #endif
   1641 
   1642 #endif
   1643 }