cart-elc

Source code for CART-ELC
git clone git://git.laack.co/cart-elc.git
Log | Files | Refs | README | LICENSE

cxx11_tensor_of_float16_gpu.cu (21223B)


      1 // This file is part of Eigen, a lightweight C++ template library
      2 // for linear algebra.
      3 //
      4 // Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
      5 //
      6 // This Source Code Form is subject to the terms of the Mozilla
      7 // Public License v. 2.0. If a copy of the MPL was not distributed
      8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
      9 
     10 #define EIGEN_TEST_NO_LONGDOUBLE
     11 #define EIGEN_TEST_NO_COMPLEX
     12 
     13 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
     14 #define EIGEN_USE_GPU
     15 
     16 #include "main.h"
     17 #include <unsupported/Eigen/CXX11/Tensor>
     18 
     19 
     20 using Eigen::Tensor;
     21 
     22 template<typename>
     23 void test_gpu_numext() {
     24   Eigen::GpuStreamDevice stream;
     25   Eigen::GpuDevice gpu_device(&stream);
     26   int num_elem = 101;
     27 
     28   float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
     29   bool* d_res_half = (bool*)gpu_device.allocate(num_elem * sizeof(bool));
     30   bool* d_res_float = (bool*)gpu_device.allocate(num_elem * sizeof(bool));
     31 
     32   Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(
     33       d_float, num_elem);
     34   Eigen::TensorMap<Eigen::Tensor<bool, 1>, Eigen::Aligned> gpu_res_half(
     35       d_res_half, num_elem);
     36   Eigen::TensorMap<Eigen::Tensor<bool, 1>, Eigen::Aligned> gpu_res_float(
     37       d_res_float, num_elem);
     38 
     39   gpu_float.device(gpu_device) = gpu_float.random() - gpu_float.constant(0.5f);
     40   gpu_res_float.device(gpu_device) = gpu_float.unaryExpr(Eigen::internal::scalar_isnan_op<float>());
     41   gpu_res_half.device(gpu_device) = gpu_float.cast<Eigen::half>().unaryExpr(Eigen::internal::scalar_isnan_op<Eigen::half>());
     42 
     43   Tensor<bool, 1> half_prec(num_elem);
     44   Tensor<bool, 1> full_prec(num_elem);
     45   gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(bool));
     46   gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(bool));
     47   gpu_device.synchronize();
     48 
     49   for (int i = 0; i < num_elem; ++i) {
     50     std::cout << "Checking numext " << i << std::endl;
     51     VERIFY_IS_EQUAL(full_prec(i), half_prec(i));
     52   }
     53 
     54   gpu_device.deallocate(d_float);
     55   gpu_device.deallocate(d_res_half);
     56   gpu_device.deallocate(d_res_float);
     57 }
     58 
     59 
     60 #ifdef EIGEN_HAS_GPU_FP16
     61 
     62 template<typename>
     63 void test_gpu_conversion() {
     64   Eigen::GpuStreamDevice stream;
     65   Eigen::GpuDevice gpu_device(&stream);
     66   int num_elem = 101;
     67 
     68   float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
     69   Eigen::half* d_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
     70   float* d_conv = (float*)gpu_device.allocate(num_elem * sizeof(float));
     71 
     72   Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(
     73       d_float, num_elem);
     74   Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_half(
     75       d_half, num_elem);
     76   Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_conv(
     77       d_conv, num_elem);
     78 
     79   gpu_float.device(gpu_device) = gpu_float.random();
     80   gpu_half.device(gpu_device) = gpu_float.cast<Eigen::half>();
     81   gpu_conv.device(gpu_device) = gpu_half.cast<float>();
     82 
     83   Tensor<float, 1> initial(num_elem);
     84   Tensor<float, 1> final(num_elem);
     85   gpu_device.memcpyDeviceToHost(initial.data(), d_float, num_elem*sizeof(float));
     86   gpu_device.memcpyDeviceToHost(final.data(), d_conv, num_elem*sizeof(float));
     87 
     88   for (int i = 0; i < num_elem; ++i) {
     89     VERIFY_IS_APPROX(initial(i), final(i));
     90   }
     91 
     92   gpu_device.deallocate(d_float);
     93   gpu_device.deallocate(d_half);
     94   gpu_device.deallocate(d_conv);
     95 }
     96 
     97 template<typename>
     98 void test_gpu_unary() {
     99   Eigen::GpuStreamDevice stream;
    100   Eigen::GpuDevice gpu_device(&stream);
    101   int num_elem = 101;
    102 
    103   float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
    104   float* d_res_half = (float*)gpu_device.allocate(num_elem * sizeof(float));
    105   float* d_res_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
    106 
    107   Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(
    108       d_float, num_elem);
    109   Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_half(
    110       d_res_half, num_elem);
    111   Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_float(
    112       d_res_float, num_elem);
    113 
    114   gpu_float.device(gpu_device) = gpu_float.random() - gpu_float.constant(0.5f);
    115   gpu_res_float.device(gpu_device) = gpu_float.abs();
    116   gpu_res_half.device(gpu_device) = gpu_float.cast<Eigen::half>().abs().cast<float>();
    117 
    118   Tensor<float, 1> half_prec(num_elem);
    119   Tensor<float, 1> full_prec(num_elem);
    120   gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(float));
    121   gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float));
    122   gpu_device.synchronize();
    123 
    124   for (int i = 0; i < num_elem; ++i) {
    125     std::cout << "Checking unary " << i << std::endl;
    126     VERIFY_IS_APPROX(full_prec(i), half_prec(i));
    127   }
    128 
    129   gpu_device.deallocate(d_float);
    130   gpu_device.deallocate(d_res_half);
    131   gpu_device.deallocate(d_res_float);
    132 }
    133 
    134 template<typename>
    135 void test_gpu_elementwise() {
    136   Eigen::GpuStreamDevice stream;
    137   Eigen::GpuDevice gpu_device(&stream);
    138   int num_elem = 101;
    139 
    140   float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
    141   float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
    142   float* d_res_half = (float*)gpu_device.allocate(num_elem * sizeof(float));
    143   float* d_res_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
    144 
    145   Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float1(
    146       d_float1, num_elem);
    147   Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float2(
    148       d_float2, num_elem);
    149   Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_half(
    150       d_res_half, num_elem);
    151   Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_float(
    152       d_res_float, num_elem);
    153 
    154   gpu_float1.device(gpu_device) = gpu_float1.random();
    155   gpu_float2.device(gpu_device) = gpu_float2.random();
    156   gpu_res_float.device(gpu_device) = (gpu_float1 + gpu_float2) * gpu_float1;
    157   gpu_res_half.device(gpu_device) = ((gpu_float1.cast<Eigen::half>() + gpu_float2.cast<Eigen::half>()) * gpu_float1.cast<Eigen::half>()).cast<float>();
    158 
    159   Tensor<float, 1> half_prec(num_elem);
    160   Tensor<float, 1> full_prec(num_elem);
    161   gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(float));
    162   gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float));
    163   gpu_device.synchronize();
    164 
    165   for (int i = 0; i < num_elem; ++i) {
    166     std::cout << "Checking elemwise " << i << ": full prec = " << full_prec(i) << " vs half prec = " << half_prec(i) << std::endl;
    167     VERIFY_IS_APPROX(static_cast<Eigen::half>(full_prec(i)), static_cast<Eigen::half>(half_prec(i)));
    168   }
    169 
    170   gpu_device.deallocate(d_float1);
    171   gpu_device.deallocate(d_float2);
    172   gpu_device.deallocate(d_res_half);
    173   gpu_device.deallocate(d_res_float);
    174 }
    175 
    176 template<typename>
    177 void test_gpu_trancendental() {
    178   Eigen::GpuStreamDevice stream;
    179   Eigen::GpuDevice gpu_device(&stream);
    180   int num_elem = 101;
    181 
    182   float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
    183   float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
    184   float* d_float3 = (float*)gpu_device.allocate(num_elem * sizeof(float));
    185   Eigen::half* d_res1_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
    186   Eigen::half* d_res1_float = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
    187   Eigen::half* d_res2_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
    188   Eigen::half* d_res2_float = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
    189   Eigen::half* d_res3_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
    190   Eigen::half* d_res3_float = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
    191 
    192   Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float1(d_float1, num_elem);
    193   Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float2(d_float2, num_elem);
    194   Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float3(d_float3, num_elem);
    195   Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res1_half(d_res1_half, num_elem);
    196   Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res1_float(d_res1_float, num_elem);
    197   Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res2_half(d_res2_half, num_elem);
    198   Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res2_float(d_res2_float, num_elem);
    199   Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res3_half(d_res3_half, num_elem);
    200   Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res3_float(d_res3_float, num_elem);
    201   Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res4_half(d_res3_half, num_elem);
    202   Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res4_float(d_res3_float, num_elem);
    203 
    204   gpu_float1.device(gpu_device) = gpu_float1.random() - gpu_float1.constant(0.5f);
    205   gpu_float2.device(gpu_device) = gpu_float2.random() + gpu_float1.constant(0.5f);
    206   gpu_float3.device(gpu_device) = gpu_float3.random();
    207   gpu_res1_float.device(gpu_device) = gpu_float1.exp().cast<Eigen::half>();
    208   gpu_res2_float.device(gpu_device) = gpu_float2.log().cast<Eigen::half>();
    209   gpu_res3_float.device(gpu_device) = gpu_float3.log1p().cast<Eigen::half>();
    210   gpu_res4_float.device(gpu_device) = gpu_float3.expm1().cast<Eigen::half>();
    211 
    212   gpu_res1_half.device(gpu_device) = gpu_float1.cast<Eigen::half>();
    213   gpu_res1_half.device(gpu_device) = gpu_res1_half.exp();
    214 
    215   gpu_res2_half.device(gpu_device) = gpu_float2.cast<Eigen::half>();
    216   gpu_res2_half.device(gpu_device) = gpu_res2_half.log();
    217 
    218   gpu_res3_half.device(gpu_device) = gpu_float3.cast<Eigen::half>();
    219   gpu_res3_half.device(gpu_device) = gpu_res3_half.log1p();
    220 
    221   gpu_res3_half.device(gpu_device) = gpu_float3.cast<Eigen::half>();
    222   gpu_res3_half.device(gpu_device) = gpu_res3_half.expm1();
    223 
    224   Tensor<float, 1> input1(num_elem);
    225   Tensor<Eigen::half, 1> half_prec1(num_elem);
    226   Tensor<Eigen::half, 1> full_prec1(num_elem);
    227   Tensor<float, 1> input2(num_elem);
    228   Tensor<Eigen::half, 1> half_prec2(num_elem);
    229   Tensor<Eigen::half, 1> full_prec2(num_elem);
    230   Tensor<float, 1> input3(num_elem);
    231   Tensor<Eigen::half, 1> half_prec3(num_elem);
    232   Tensor<Eigen::half, 1> full_prec3(num_elem);
    233   gpu_device.memcpyDeviceToHost(input1.data(), d_float1, num_elem*sizeof(float));
    234   gpu_device.memcpyDeviceToHost(input2.data(), d_float2, num_elem*sizeof(float));
    235   gpu_device.memcpyDeviceToHost(input3.data(), d_float3, num_elem*sizeof(float));
    236   gpu_device.memcpyDeviceToHost(half_prec1.data(), d_res1_half, num_elem*sizeof(Eigen::half));
    237   gpu_device.memcpyDeviceToHost(full_prec1.data(), d_res1_float, num_elem*sizeof(Eigen::half));
    238   gpu_device.memcpyDeviceToHost(half_prec2.data(), d_res2_half, num_elem*sizeof(Eigen::half));
    239   gpu_device.memcpyDeviceToHost(full_prec2.data(), d_res2_float, num_elem*sizeof(Eigen::half));
    240   gpu_device.memcpyDeviceToHost(half_prec3.data(), d_res3_half, num_elem*sizeof(Eigen::half));
    241   gpu_device.memcpyDeviceToHost(full_prec3.data(), d_res3_float, num_elem*sizeof(Eigen::half));
    242   gpu_device.synchronize();
    243 
    244   for (int i = 0; i < num_elem; ++i) {
    245     std::cout << "Checking elemwise exp " << i << " input = " << input1(i) << " full = " << full_prec1(i) << " half = " << half_prec1(i) << std::endl;
    246     VERIFY_IS_APPROX(full_prec1(i), half_prec1(i));
    247   }
    248   for (int i = 0; i < num_elem; ++i) {
    249     std::cout << "Checking elemwise log " << i << " input = " << input2(i) << " full = " << full_prec2(i) << " half = " << half_prec2(i) << std::endl;
    250     if(std::abs(input2(i)-1.f)<0.05f) // log lacks accuracy nearby 1
    251       VERIFY_IS_APPROX(full_prec2(i)+Eigen::half(0.1f), half_prec2(i)+Eigen::half(0.1f));
    252     else
    253       VERIFY_IS_APPROX(full_prec2(i), half_prec2(i));
    254   }
    255   for (int i = 0; i < num_elem; ++i) {
    256     std::cout << "Checking elemwise plog1 " << i << " input = " << input3(i) << " full = " << full_prec3(i) << " half = " << half_prec3(i) << std::endl;
    257     VERIFY_IS_APPROX(full_prec3(i), half_prec3(i));
    258   }
    259   gpu_device.deallocate(d_float1);
    260   gpu_device.deallocate(d_float2);
    261   gpu_device.deallocate(d_float3);
    262   gpu_device.deallocate(d_res1_half);
    263   gpu_device.deallocate(d_res1_float);
    264   gpu_device.deallocate(d_res2_half);
    265   gpu_device.deallocate(d_res2_float);
    266   gpu_device.deallocate(d_res3_float);
    267   gpu_device.deallocate(d_res3_half);
    268 }
    269 
    270 template<typename>
    271 void test_gpu_contractions() {
    272   Eigen::GpuStreamDevice stream;
    273   Eigen::GpuDevice gpu_device(&stream);
    274   int rows = 23;
    275   int cols = 23;
    276   int num_elem = rows*cols;
    277 
    278   float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
    279   float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
    280   Eigen::half* d_res_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
    281   Eigen::half* d_res_float = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
    282 
    283   Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float1(
    284       d_float1, rows, cols);
    285   Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float2(
    286       d_float2, rows, cols);
    287   Eigen::TensorMap<Eigen::Tensor<Eigen::half, 2>, Eigen::Aligned> gpu_res_half(
    288       d_res_half, rows, cols);
    289   Eigen::TensorMap<Eigen::Tensor<Eigen::half, 2>, Eigen::Aligned> gpu_res_float(
    290       d_res_float, rows, cols);
    291 
    292   gpu_float1.device(gpu_device) = gpu_float1.random() - gpu_float1.constant(0.5f);
    293   gpu_float2.device(gpu_device) = gpu_float2.random() - gpu_float2.constant(0.5f);
    294 
    295   typedef Tensor<float, 2>::DimensionPair DimPair;
    296   Eigen::array<DimPair, 1> dims(DimPair(1, 0));
    297   gpu_res_float.device(gpu_device) = gpu_float1.contract(gpu_float2, dims).cast<Eigen::half>();
    298   gpu_res_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().contract(gpu_float2.cast<Eigen::half>(), dims);
    299 
    300   Tensor<Eigen::half, 2> half_prec(rows, cols);
    301   Tensor<Eigen::half, 2> full_prec(rows, cols);
    302   gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(Eigen::half));
    303   gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(Eigen::half));
    304   gpu_device.synchronize();
    305 
    306   for (int i = 0; i < rows; ++i) {
    307     for (int j = 0; j < cols; ++j) {
    308       std::cout << "Checking contract " << i << " " << j << full_prec(i, j) << " " << half_prec(i, j) << std::endl;
    309       if (numext::abs(full_prec(i, j) - half_prec(i, j)) > Eigen::half(1e-2f)) {
    310         VERIFY_IS_APPROX(full_prec(i, j), half_prec(i, j));
    311       }
    312     }
    313   }
    314 
    315   gpu_device.deallocate(d_float1);
    316   gpu_device.deallocate(d_float2);
    317   gpu_device.deallocate(d_res_half);
    318   gpu_device.deallocate(d_res_float);
    319 }
    320 
    321 template<typename>
    322 void test_gpu_reductions(int size1, int size2, int redux) {
    323 
    324    std::cout << "Reducing " << size1 << " by " << size2
    325              << " tensor along dim " << redux << std::endl;
    326 
    327   Eigen::GpuStreamDevice stream;
    328   Eigen::GpuDevice gpu_device(&stream);
    329   int num_elem = size1*size2;
    330   int result_size = (redux == 1 ? size1 : size2);
    331 
    332   float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
    333   Eigen::half* d_res_half = (Eigen::half*)gpu_device.allocate(result_size * sizeof(Eigen::half));
    334   Eigen::half* d_res_float = (Eigen::half*)gpu_device.allocate(result_size * sizeof(Eigen::half));
    335 
    336   Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float(
    337       d_float, size1, size2);
    338   Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res_half(
    339       d_res_half, result_size);
    340   Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res_float(
    341       d_res_float, result_size);
    342 
    343   gpu_float.device(gpu_device) = gpu_float.random() * 2.0f;
    344 
    345   Eigen::array<int, 1> redux_dim = {redux};
    346   gpu_res_float.device(gpu_device) = gpu_float.sum(redux_dim).cast<Eigen::half>();
    347   gpu_res_half.device(gpu_device) = gpu_float.cast<Eigen::half>().sum(redux_dim);
    348 
    349   Tensor<Eigen::half, 1> half_prec(result_size);
    350   Tensor<Eigen::half, 1> full_prec(result_size);
    351   gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, result_size*sizeof(Eigen::half));
    352   gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, result_size*sizeof(Eigen::half));
    353   gpu_device.synchronize();
    354 
    355   for (int i = 0; i < result_size; ++i) {
    356     std::cout << "EXPECTED " << full_prec(i) << " GOT " << half_prec(i) << std::endl;
    357     VERIFY_IS_APPROX(full_prec(i), half_prec(i));
    358   }
    359 
    360   gpu_device.deallocate(d_float);
    361   gpu_device.deallocate(d_res_half);
    362   gpu_device.deallocate(d_res_float);
    363 }
    364 
    365 template<typename>
    366 void test_gpu_reductions() {
    367   test_gpu_reductions<void>(13, 13, 0);
    368   test_gpu_reductions<void>(13, 13, 1);
    369 
    370   test_gpu_reductions<void>(35, 36, 0);
    371   test_gpu_reductions<void>(35, 36, 1);
    372 
    373   test_gpu_reductions<void>(36, 35, 0);
    374   test_gpu_reductions<void>(36, 35, 1);
    375 }
    376 
    377 template<typename>
    378 void test_gpu_full_reductions() {
    379   Eigen::GpuStreamDevice stream;
    380   Eigen::GpuDevice gpu_device(&stream);
    381   int size = 13;
    382   int num_elem = size*size;
    383 
    384   float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
    385   Eigen::half* d_res_half = (Eigen::half*)gpu_device.allocate(1 * sizeof(Eigen::half));
    386   Eigen::half* d_res_float = (Eigen::half*)gpu_device.allocate(1 * sizeof(Eigen::half));
    387 
    388   Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float(
    389       d_float, size, size);
    390   Eigen::TensorMap<Eigen::Tensor<Eigen::half, 0>, Eigen::Aligned> gpu_res_half(
    391       d_res_half);
    392   Eigen::TensorMap<Eigen::Tensor<Eigen::half, 0>, Eigen::Aligned> gpu_res_float(
    393       d_res_float);
    394 
    395   gpu_float.device(gpu_device) = gpu_float.random();
    396 
    397   gpu_res_float.device(gpu_device) = gpu_float.sum().cast<Eigen::half>();
    398   gpu_res_half.device(gpu_device) = gpu_float.cast<Eigen::half>().sum();
    399 
    400   Tensor<Eigen::half, 0> half_prec;
    401   Tensor<Eigen::half, 0> full_prec;
    402   gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, sizeof(Eigen::half));
    403   gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, sizeof(Eigen::half));
    404   gpu_device.synchronize();
    405 
    406   VERIFY_IS_APPROX(full_prec(), half_prec());
    407 
    408   gpu_res_float.device(gpu_device) = gpu_float.maximum().cast<Eigen::half>();
    409   gpu_res_half.device(gpu_device) = gpu_float.cast<Eigen::half>().maximum();
    410   gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, sizeof(Eigen::half));
    411   gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, sizeof(Eigen::half));
    412   gpu_device.synchronize();
    413 
    414   VERIFY_IS_APPROX(full_prec(), half_prec());
    415 
    416   gpu_device.deallocate(d_float);
    417   gpu_device.deallocate(d_res_half);
    418   gpu_device.deallocate(d_res_float);
    419 }
    420 
    421 template<typename>
    422 void test_gpu_forced_evals() {
    423 
    424   Eigen::GpuStreamDevice stream;
    425   Eigen::GpuDevice gpu_device(&stream);
    426   int num_elem = 101;
    427 
    428   float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
    429   float* d_res_half1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
    430   float* d_res_half2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
    431   float* d_res_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
    432 
    433   Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(
    434       d_float, num_elem);
    435   Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_half1(
    436       d_res_half1, num_elem);
    437   Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Unaligned> gpu_res_half2(
    438       d_res_half2, num_elem);
    439   Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_float(
    440       d_res_float, num_elem);
    441 
    442   Eigen::array<int, 1> no_bcast;
    443   no_bcast[0] = 1;
    444 
    445   gpu_float.device(gpu_device) = gpu_float.random() - gpu_float.constant(0.5f);
    446   gpu_res_float.device(gpu_device) = gpu_float.abs();
    447   gpu_res_half1.device(gpu_device) = gpu_float.cast<Eigen::half>().abs().eval().cast<float>();
    448   gpu_res_half2.device(gpu_device) = gpu_float.cast<Eigen::half>().abs().broadcast(no_bcast).eval().cast<float>();
    449 
    450   Tensor<float, 1> half_prec1(num_elem);
    451   Tensor<float, 1> half_prec2(num_elem);
    452   Tensor<float, 1> full_prec(num_elem);
    453   gpu_device.memcpyDeviceToHost(half_prec1.data(), d_res_half1, num_elem*sizeof(float));
    454   gpu_device.memcpyDeviceToHost(half_prec2.data(), d_res_half2, num_elem*sizeof(float));
    455   gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float));
    456   gpu_device.synchronize();
    457 
    458   for (int i = 0; i < num_elem; ++i) {
    459     std::cout << "Checking forced eval " << i << full_prec(i) << " vs " << half_prec1(i) << " vs " << half_prec2(i) << std::endl;
    460     VERIFY_IS_APPROX(full_prec(i), half_prec1(i));
    461     VERIFY_IS_APPROX(full_prec(i), half_prec2(i));
    462   }
    463 
    464   gpu_device.deallocate(d_float);
    465   gpu_device.deallocate(d_res_half1);
    466   gpu_device.deallocate(d_res_half2);
    467   gpu_device.deallocate(d_res_float);
    468 }
    469 #endif
    470 
    471 
    472 EIGEN_DECLARE_TEST(cxx11_tensor_of_float16_gpu)
    473 {
    474   CALL_SUBTEST_1(test_gpu_numext<void>());
    475 
    476 #ifdef EIGEN_HAS_GPU_FP16
    477   CALL_SUBTEST_1(test_gpu_conversion<void>());
    478   CALL_SUBTEST_1(test_gpu_unary<void>());
    479   CALL_SUBTEST_1(test_gpu_elementwise<void>());
    480   CALL_SUBTEST_1(test_gpu_trancendental<void>());
    481   CALL_SUBTEST_2(test_gpu_contractions<void>());
    482   CALL_SUBTEST_3(test_gpu_reductions<void>());
    483   CALL_SUBTEST_4(test_gpu_full_reductions<void>());
    484   CALL_SUBTEST_5(test_gpu_forced_evals<void>());
    485 #else
    486   std::cout << "Half floats are not supported by this version of gpu: skipping the test" << std::endl;
    487 #endif
    488 }