cart-elc

Source code for CART-ELC
git clone git://git.laack.co/cart-elc.git
Log | Files | Refs | README | LICENSE

cxx11_tensor_executor.cpp (30675B)


      1 // This file is part of Eigen, a lightweight C++ template library
      2 // for linear algebra.
      3 //
      4 // Copyright (C) 2018 Eugene Zhulenev <ezhulenev@google.com>
      5 //
      6 // This Source Code Form is subject to the terms of the Mozilla
      7 // Public License v. 2.0. If a copy of the MPL was not distributed
      8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
      9 
     10 #define EIGEN_USE_THREADS
     11 
     12 #include "main.h"
     13 
     14 #include <Eigen/CXX11/Tensor>
     15 
     16 using Eigen::Tensor;
     17 using Eigen::RowMajor;
     18 using Eigen::ColMajor;
     19 using Eigen::internal::TiledEvaluation;
     20 
     21 // A set of tests to verify that different TensorExecutor strategies yield the
     22 // same results for all the ops, supporting tiled evaluation.
     23 
     24 // Default assignment that does no use block evaluation or vectorization.
     25 // We assume that default coefficient evaluation is well tested and correct.
     26 template <typename Dst, typename Expr>
     27 static void DefaultAssign(Dst& dst, Expr expr) {
     28   using Assign = Eigen::TensorAssignOp<Dst, const Expr>;
     29   using Executor =
     30       Eigen::internal::TensorExecutor<const Assign, DefaultDevice,
     31                                       /*Vectorizable=*/false,
     32                                       /*Tiling=*/TiledEvaluation::Off>;
     33 
     34   Executor::run(Assign(dst, expr), DefaultDevice());
     35 }
     36 
     37 // Assignment with specified device and tiling strategy.
     38 template <bool Vectorizable, TiledEvaluation Tiling, typename Device,
     39           typename Dst, typename Expr>
     40 static void DeviceAssign(Device& d, Dst& dst, Expr expr) {
     41   using Assign = Eigen::TensorAssignOp<Dst, const Expr>;
     42   using Executor = Eigen::internal::TensorExecutor<const Assign, Device,
     43                                                    Vectorizable, Tiling>;
     44 
     45   Executor::run(Assign(dst, expr), d);
     46 }
     47 
     48 template <int NumDims>
     49 static array<Index, NumDims> RandomDims(int min_dim = 1, int max_dim = 20) {
     50   array<Index, NumDims> dims;
     51   for (int i = 0; i < NumDims; ++i) {
     52     dims[i] = internal::random<int>(min_dim, max_dim);
     53   }
     54   return dims;
     55 }
     56 
     57 template <typename T, int NumDims, typename Device, bool Vectorizable,
     58           TiledEvaluation Tiling, int Layout>
     59 static void test_execute_unary_expr(Device d)
     60 {
     61   static constexpr int Options = 0 | Layout;
     62 
     63   // Pick a large enough tensor size to bypass small tensor block evaluation
     64   // optimization.
     65   auto dims = RandomDims<NumDims>(50 / NumDims, 100 / NumDims);
     66 
     67   Tensor<T, NumDims, Options, Index> src(dims);
     68   Tensor<T, NumDims, Options, Index> dst(dims);
     69 
     70   src.setRandom();
     71   const auto expr = src.square();
     72 
     73   using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
     74   using Executor =
     75       internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;
     76 
     77   Executor::run(Assign(dst, expr), d);
     78 
     79   for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
     80     T square = src.coeff(i) * src.coeff(i);
     81     VERIFY_IS_EQUAL(square, dst.coeff(i));
     82   }
     83 }
     84 
     85 template <typename T, int NumDims, typename Device, bool Vectorizable,
     86           TiledEvaluation Tiling, int Layout>
     87 static void test_execute_binary_expr(Device d)
     88 {
     89   static constexpr int Options = 0 | Layout;
     90 
     91   // Pick a large enough tensor size to bypass small tensor block evaluation
     92   // optimization.
     93   auto dims = RandomDims<NumDims>(50 / NumDims, 100 / NumDims);
     94 
     95   Tensor<T, NumDims, Options, Index> lhs(dims);
     96   Tensor<T, NumDims, Options, Index> rhs(dims);
     97   Tensor<T, NumDims, Options, Index> dst(dims);
     98 
     99   lhs.setRandom();
    100   rhs.setRandom();
    101 
    102   const auto expr = lhs + rhs;
    103 
    104   using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
    105   using Executor =
    106       internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;
    107 
    108   Executor::run(Assign(dst, expr), d);
    109 
    110   for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
    111     T sum = lhs.coeff(i) + rhs.coeff(i);
    112     VERIFY_IS_EQUAL(sum, dst.coeff(i));
    113   }
    114 }
    115 
    116 template <typename T, int NumDims, typename Device, bool Vectorizable,
    117           TiledEvaluation Tiling, int Layout>
    118 static void test_execute_broadcasting(Device d)
    119 {
    120   static constexpr int Options = 0 | Layout;
    121 
    122   auto dims = RandomDims<NumDims>(1, 10);
    123   Tensor<T, NumDims, Options, Index> src(dims);
    124   src.setRandom();
    125 
    126   const auto broadcasts = RandomDims<NumDims>(1, 7);
    127   const auto expr = src.broadcast(broadcasts);
    128 
    129   // We assume that broadcasting on a default device is tested and correct, so
    130   // we can rely on it to verify correctness of tensor executor and tiling.
    131   Tensor<T, NumDims, Options, Index> golden;
    132   golden = expr;
    133 
    134   // Now do the broadcasting using configured tensor executor.
    135   Tensor<T, NumDims, Options, Index> dst(golden.dimensions());
    136 
    137   using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
    138   using Executor =
    139       internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;
    140 
    141   Executor::run(Assign(dst, expr), d);
    142 
    143   for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
    144     VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
    145   }
    146 }
    147 
    148 template <typename T, int NumDims, typename Device, bool Vectorizable,
    149           TiledEvaluation Tiling, int Layout>
    150 static void test_execute_chipping_rvalue(Device d)
    151 {
    152   auto dims = RandomDims<NumDims>(1, 10);
    153   Tensor<T, NumDims, Layout, Index> src(dims);
    154   src.setRandom();
    155 
    156 #define TEST_CHIPPING(CHIP_DIM)                                           \
    157   if (NumDims > (CHIP_DIM)) {                                             \
    158     const auto offset = internal::random<Index>(0, dims[(CHIP_DIM)] - 1); \
    159     const auto expr = src.template chip<(CHIP_DIM)>(offset);              \
    160                                                                           \
    161     Tensor<T, NumDims - 1, Layout, Index> golden;                         \
    162     golden = expr;                                                        \
    163                                                                           \
    164     Tensor<T, NumDims - 1, Layout, Index> dst(golden.dimensions());       \
    165                                                                           \
    166     using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;   \
    167     using Executor = internal::TensorExecutor<const Assign, Device,       \
    168                                               Vectorizable, Tiling>;      \
    169                                                                           \
    170     Executor::run(Assign(dst, expr), d);                                  \
    171                                                                           \
    172     for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {            \
    173       VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));                     \
    174     }                                                                     \
    175   }
    176 
    177   TEST_CHIPPING(0)
    178   TEST_CHIPPING(1)
    179   TEST_CHIPPING(2)
    180   TEST_CHIPPING(3)
    181   TEST_CHIPPING(4)
    182   TEST_CHIPPING(5)
    183 
    184 #undef TEST_CHIPPING
    185 }
    186 
    187 template <typename T, int NumDims, typename Device, bool Vectorizable,
    188     TiledEvaluation Tiling, int Layout>
    189 static void test_execute_chipping_lvalue(Device d)
    190 {
    191   auto dims = RandomDims<NumDims>(1, 10);
    192 
    193 #define TEST_CHIPPING(CHIP_DIM)                                             \
    194   if (NumDims > (CHIP_DIM)) {                                               \
    195     /* Generate random data that we'll assign to the chipped tensor dim. */ \
    196     array<Index, NumDims - 1> src_dims;                                     \
    197     for (int i = 0; i < NumDims - 1; ++i) {                                 \
    198       int dim = i < (CHIP_DIM) ? i : i + 1;                                 \
    199       src_dims[i] = dims[dim];                                              \
    200     }                                                                       \
    201                                                                             \
    202     Tensor<T, NumDims - 1, Layout, Index> src(src_dims);                    \
    203     src.setRandom();                                                        \
    204                                                                             \
    205     const auto offset = internal::random<Index>(0, dims[(CHIP_DIM)] - 1);   \
    206                                                                             \
    207     Tensor<T, NumDims, Layout, Index> random(dims);                         \
    208     random.setZero();                                                       \
    209                                                                             \
    210     Tensor<T, NumDims, Layout, Index> golden(dims);                         \
    211     golden = random;                                                        \
    212     golden.template chip<(CHIP_DIM)>(offset) = src;                         \
    213                                                                             \
    214     Tensor<T, NumDims, Layout, Index> dst(dims);                            \
    215     dst = random;                                                           \
    216     auto expr = dst.template chip<(CHIP_DIM)>(offset);                      \
    217                                                                             \
    218     using Assign = TensorAssignOp<decltype(expr), const decltype(src)>;     \
    219     using Executor = internal::TensorExecutor<const Assign, Device,         \
    220                                               Vectorizable, Tiling>;        \
    221                                                                             \
    222     Executor::run(Assign(expr, src), d);                                    \
    223                                                                             \
    224     for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {              \
    225       VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));                       \
    226     }                                                                       \
    227   }
    228 
    229   TEST_CHIPPING(0)
    230   TEST_CHIPPING(1)
    231   TEST_CHIPPING(2)
    232   TEST_CHIPPING(3)
    233   TEST_CHIPPING(4)
    234   TEST_CHIPPING(5)
    235 
    236 #undef TEST_CHIPPING
    237 }
    238 
    239 template <typename T, int NumDims, typename Device, bool Vectorizable,
    240           TiledEvaluation Tiling, int Layout>
    241 static void test_execute_shuffle_rvalue(Device d)
    242 {
    243   static constexpr int Options = 0 | Layout;
    244 
    245   auto dims = RandomDims<NumDims>(1, 10);
    246   Tensor<T, NumDims, Options, Index> src(dims);
    247   src.setRandom();
    248 
    249   DSizes<Index, NumDims> shuffle;
    250   for (int i = 0; i < NumDims; ++i) shuffle[i] = i;
    251 
    252   // Test all possible shuffle permutations.
    253   do {
    254     DSizes<Index, NumDims> shuffled_dims;
    255     for (int i = 0; i < NumDims; ++i) {
    256       shuffled_dims[i] = dims[shuffle[i]];
    257     }
    258 
    259     const auto expr = src.shuffle(shuffle);
    260 
    261     // We assume that shuffling on a default device is tested and correct, so
    262     // we can rely on it to verify correctness of tensor executor and tiling.
    263     Tensor<T, NumDims, Options, Index> golden(shuffled_dims);
    264     DefaultAssign(golden, expr);
    265 
    266     // Now do the shuffling using configured tensor executor.
    267     Tensor<T, NumDims, Options, Index> dst(shuffled_dims);
    268     DeviceAssign<Vectorizable, Tiling>(d, dst, expr);
    269 
    270     for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
    271       VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
    272     }
    273 
    274   } while (std::next_permutation(&shuffle[0], &shuffle[0] + NumDims));
    275 }
    276 
    277 template <typename T, int NumDims, typename Device, bool Vectorizable,
    278           TiledEvaluation Tiling, int Layout>
    279 static void test_execute_shuffle_lvalue(Device d)
    280 {
    281   static constexpr int Options = 0 | Layout;
    282 
    283   auto dims = RandomDims<NumDims>(5, 10);
    284   Tensor<T, NumDims, Options, Index> src(dims);
    285   src.setRandom();
    286 
    287   DSizes<Index, NumDims> shuffle;
    288   for (int i = 0; i < NumDims; ++i) shuffle[i] = i;
    289 
    290   // Test all possible shuffle permutations.
    291   do {
    292     DSizes<Index, NumDims> shuffled_dims;
    293     for (int i = 0; i < NumDims; ++i) shuffled_dims[shuffle[i]] = dims[i];
    294 
    295     // We assume that shuffling on a default device is tested and correct, so
    296     // we can rely on it to verify correctness of tensor executor and tiling.
    297     Tensor<T, NumDims, Options, Index> golden(shuffled_dims);
    298     auto golden_shuffle = golden.shuffle(shuffle);
    299     DefaultAssign(golden_shuffle, src);
    300 
    301     // Now do the shuffling using configured tensor executor.
    302     Tensor<T, NumDims, Options, Index> dst(shuffled_dims);
    303     auto dst_shuffle = dst.shuffle(shuffle);
    304     DeviceAssign<Vectorizable, Tiling>(d, dst_shuffle, src);
    305 
    306     for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
    307       VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
    308     }
    309 
    310   } while (std::next_permutation(&shuffle[0], &shuffle[0] + NumDims));
    311 }
    312 
    313 template <typename T, int NumDims, typename Device, bool Vectorizable,
    314     TiledEvaluation Tiling, int Layout>
    315 static void test_execute_reshape(Device d)
    316 {
    317   static_assert(NumDims >= 2, "NumDims must be greater or equal than 2");
    318 
    319   static constexpr int ReshapedDims = NumDims - 1;
    320   static constexpr int Options = 0 | Layout;
    321 
    322   auto dims = RandomDims<NumDims>(5, 10);
    323   Tensor<T, NumDims, Options, Index> src(dims);
    324   src.setRandom();
    325 
    326   // Multiple 0th dimension and then shuffle.
    327   std::vector<Index> shuffle;
    328   for (int i = 0; i < ReshapedDims; ++i) shuffle.push_back(i);
    329   std::shuffle(shuffle.begin(), shuffle.end(), std::mt19937());
    330 
    331   DSizes<Index, ReshapedDims> reshaped_dims;
    332   reshaped_dims[shuffle[0]] = dims[0] * dims[1];
    333   for (int i = 1; i < ReshapedDims; ++i) reshaped_dims[shuffle[i]] = dims[i + 1];
    334 
    335   Tensor<T, ReshapedDims, Options, Index> golden = src.reshape(reshaped_dims);
    336 
    337   // Now reshape using configured tensor executor.
    338   Tensor<T, ReshapedDims, Options, Index> dst(golden.dimensions());
    339 
    340   auto expr = src.reshape(reshaped_dims);
    341 
    342   using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
    343   using Executor =
    344       internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;
    345 
    346   Executor::run(Assign(dst, expr), d);
    347 
    348   for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
    349     VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
    350   }
    351 }
    352 
    353 template <typename T, int NumDims, typename Device, bool Vectorizable,
    354           TiledEvaluation Tiling, int Layout>
    355 static void test_execute_slice_rvalue(Device d)
    356 {
    357   static_assert(NumDims >= 2, "NumDims must be greater or equal than 2");
    358   static constexpr int Options = 0 | Layout;
    359 
    360   auto dims = RandomDims<NumDims>(5, 10);
    361   Tensor<T, NumDims, Options, Index> src(dims);
    362   src.setRandom();
    363 
    364   // Pick a random slice of src tensor.
    365   auto slice_start = DSizes<Index, NumDims>(RandomDims<NumDims>());
    366   auto slice_size = DSizes<Index, NumDims>(RandomDims<NumDims>());
    367 
    368   // Make sure that slice start + size do not overflow tensor dims.
    369   for (int i = 0; i < NumDims; ++i) {
    370     slice_start[i] = numext::mini(dims[i] - 1, slice_start[i]);
    371     slice_size[i] = numext::mini(slice_size[i], dims[i] - slice_start[i]);
    372   }
    373 
    374   Tensor<T, NumDims, Options, Index> golden =
    375       src.slice(slice_start, slice_size);
    376 
    377   // Now reshape using configured tensor executor.
    378   Tensor<T, NumDims, Options, Index> dst(golden.dimensions());
    379 
    380   auto expr = src.slice(slice_start, slice_size);
    381 
    382   using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
    383   using Executor =
    384       internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;
    385 
    386   Executor::run(Assign(dst, expr), d);
    387 
    388   for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
    389     VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
    390   }
    391 }
    392 
    393 template <typename T, int NumDims, typename Device, bool Vectorizable,
    394     TiledEvaluation Tiling, int Layout>
    395 static void test_execute_slice_lvalue(Device d)
    396 {
    397   static_assert(NumDims >= 2, "NumDims must be greater or equal than 2");
    398   static constexpr int Options = 0 | Layout;
    399 
    400   auto dims = RandomDims<NumDims>(5, 10);
    401   Tensor<T, NumDims, Options, Index> src(dims);
    402   src.setRandom();
    403 
    404   // Pick a random slice of src tensor.
    405   auto slice_start = DSizes<Index, NumDims>(RandomDims<NumDims>(1, 10));
    406   auto slice_size = DSizes<Index, NumDims>(RandomDims<NumDims>(1, 10));
    407 
    408   // Make sure that slice start + size do not overflow tensor dims.
    409   for (int i = 0; i < NumDims; ++i) {
    410     slice_start[i] = numext::mini(dims[i] - 1, slice_start[i]);
    411     slice_size[i] = numext::mini(slice_size[i], dims[i] - slice_start[i]);
    412   }
    413 
    414   Tensor<T, NumDims, Options, Index> slice(slice_size);
    415   slice.setRandom();
    416 
    417   // Assign a slice using default executor.
    418   Tensor<T, NumDims, Options, Index> golden = src;
    419   golden.slice(slice_start, slice_size) = slice;
    420 
    421   // And using configured execution strategy.
    422   Tensor<T, NumDims, Options, Index> dst = src;
    423   auto expr = dst.slice(slice_start, slice_size);
    424 
    425   using Assign = TensorAssignOp<decltype(expr), const decltype(slice)>;
    426   using Executor =
    427       internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;
    428 
    429   Executor::run(Assign(expr, slice), d);
    430 
    431   for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
    432     VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
    433   }
    434 }
    435 
    436 template <typename T, int NumDims, typename Device, bool Vectorizable,
    437     TiledEvaluation Tiling, int Layout>
    438 static void test_execute_broadcasting_of_forced_eval(Device d)
    439 {
    440   static constexpr int Options = 0 | Layout;
    441 
    442   auto dims = RandomDims<NumDims>(1, 10);
    443   Tensor<T, NumDims, Options, Index> src(dims);
    444   src.setRandom();
    445 
    446   const auto broadcasts = RandomDims<NumDims>(1, 7);
    447   const auto expr = src.square().eval().broadcast(broadcasts);
    448 
    449   // We assume that broadcasting on a default device is tested and correct, so
    450   // we can rely on it to verify correctness of tensor executor and tiling.
    451   Tensor<T, NumDims, Options, Index> golden;
    452   golden = expr;
    453 
    454   // Now do the broadcasting using configured tensor executor.
    455   Tensor<T, NumDims, Options, Index> dst(golden.dimensions());
    456 
    457   using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
    458   using Executor =
    459       internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;
    460 
    461   Executor::run(Assign(dst, expr), d);
    462 
    463   for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
    464     VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
    465   }
    466 }
    467 
    468 template<typename T, int NumDims>
    469 struct DummyGenerator {
    470   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
    471   T operator()(const array <Index, NumDims>& dims) const {
    472     T result = static_cast<T>(0);
    473     for (int i = 0; i < NumDims; ++i) {
    474       result += static_cast<T>((i + 1) * dims[i]);
    475     }
    476     return result;
    477   }
    478 };
    479 
    480 template <typename T, int NumDims, typename Device, bool Vectorizable,
    481     TiledEvaluation Tiling, int Layout>
    482 static void test_execute_generator_op(Device d)
    483 {
    484   static constexpr int Options = 0 | Layout;
    485 
    486   auto dims = RandomDims<NumDims>(20, 30);
    487   Tensor<T, NumDims, Options, Index> src(dims);
    488   src.setRandom();
    489 
    490   const auto expr = src.generate(DummyGenerator<T, NumDims>());
    491 
    492   // We assume that generator on a default device is tested and correct, so
    493   // we can rely on it to verify correctness of tensor executor and tiling.
    494   Tensor<T, NumDims, Options, Index> golden;
    495   golden = expr;
    496 
    497   // Now do the broadcasting using configured tensor executor.
    498   Tensor<T, NumDims, Options, Index> dst(golden.dimensions());
    499 
    500   using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
    501   using Executor =
    502     internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;
    503 
    504   Executor::run(Assign(dst, expr), d);
    505 
    506   for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
    507     VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
    508   }
    509 }
    510 
    511 template <typename T, int NumDims, typename Device, bool Vectorizable,
    512     TiledEvaluation Tiling, int Layout>
    513 static void test_execute_reverse_rvalue(Device d)
    514 {
    515   static constexpr int Options = 0 | Layout;
    516 
    517   auto dims = RandomDims<NumDims>(1, numext::pow(1000000.0, 1.0 / NumDims));
    518   Tensor <T, NumDims, Options, Index> src(dims);
    519   src.setRandom();
    520 
    521   // Reverse half of the dimensions.
    522   Eigen::array<bool, NumDims> reverse;
    523   for (int i = 0; i < NumDims; ++i) reverse[i] = internal::random<bool>();
    524 
    525   const auto expr = src.reverse(reverse);
    526 
    527   // We assume that reversing on a default device is tested and correct, so
    528   // we can rely on it to verify correctness of tensor executor and tiling.
    529   Tensor <T, NumDims, Options, Index> golden;
    530   golden = expr;
    531 
    532   // Now do the reversing using configured tensor executor.
    533   Tensor <T, NumDims, Options, Index> dst(golden.dimensions());
    534 
    535   using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
    536   using Executor =
    537     internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;
    538 
    539   Executor::run(Assign(dst, expr), d);
    540 
    541   for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
    542     VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
    543   }
    544 }
    545 
    546 template <typename T, int NumDims, typename Device, bool Vectorizable,
    547           TiledEvaluation Tiling, int Layout>
    548 static void test_async_execute_unary_expr(Device d)
    549 {
    550   static constexpr int Options = 0 | Layout;
    551 
    552   // Pick a large enough tensor size to bypass small tensor block evaluation
    553   // optimization.
    554   auto dims = RandomDims<NumDims>(50 / NumDims, 100 / NumDims);
    555 
    556   Tensor<T, NumDims, Options, Index> src(dims);
    557   Tensor<T, NumDims, Options, Index> dst(dims);
    558 
    559   src.setRandom();
    560   const auto expr = src.square();
    561 
    562   Eigen::Barrier done(1);
    563   auto on_done = [&done]() { done.Notify(); };
    564 
    565   using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
    566   using DoneCallback = decltype(on_done);
    567   using Executor = internal::TensorAsyncExecutor<const Assign, Device, DoneCallback,
    568                                                  Vectorizable, Tiling>;
    569 
    570   Executor::runAsync(Assign(dst, expr), d, on_done);
    571   done.Wait();
    572 
    573   for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
    574     T square = src.coeff(i) * src.coeff(i);
    575     VERIFY_IS_EQUAL(square, dst.coeff(i));
    576   }
    577 }
    578 
    579 template <typename T, int NumDims, typename Device, bool Vectorizable,
    580           TiledEvaluation Tiling, int Layout>
    581 static void test_async_execute_binary_expr(Device d)
    582 {
    583   static constexpr int Options = 0 | Layout;
    584 
    585   // Pick a large enough tensor size to bypass small tensor block evaluation
    586   // optimization.
    587   auto dims = RandomDims<NumDims>(50 / NumDims, 100 / NumDims);
    588 
    589   Tensor<T, NumDims, Options, Index> lhs(dims);
    590   Tensor<T, NumDims, Options, Index> rhs(dims);
    591   Tensor<T, NumDims, Options, Index> dst(dims);
    592 
    593   lhs.setRandom();
    594   rhs.setRandom();
    595 
    596   const auto expr = lhs + rhs;
    597 
    598   Eigen::Barrier done(1);
    599   auto on_done = [&done]() { done.Notify(); };
    600 
    601   using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
    602   using DoneCallback = decltype(on_done);
    603   using Executor = internal::TensorAsyncExecutor<const Assign, Device, DoneCallback,
    604                                                  Vectorizable, Tiling>;
    605 
    606   Executor::runAsync(Assign(dst, expr), d, on_done);
    607   done.Wait();
    608 
    609   for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
    610     T sum = lhs.coeff(i) + rhs.coeff(i);
    611     VERIFY_IS_EQUAL(sum, dst.coeff(i));
    612   }
    613 }
    614 
    // VECTORIZABLE(VAL) yields the `Vectorizable` template argument for a
    // subtest: VAL itself, or - when EIGEN_DONT_VECTORIZE is defined - VAL
    // and-ed with the negation of that macro's value.
    // Both the argument and the whole expansion are parenthesized so that the
    // macro behaves as a single operand whatever expression is passed in or
    // surrounds it.
    #ifdef EIGEN_DONT_VECTORIZE
    #define VECTORIZABLE(VAL) (!(EIGEN_DONT_VECTORIZE) && (VAL))
    #else
    #define VECTORIZABLE(VAL) (VAL)
    #endif
    620 
    // Maps a part number onto the matching CALL_SUBTEST_<PART> macro. The extra
    // level of indirection lets PART be macro-expanded before it is pasted.
    #define CALL_SUBTEST_PART(PART) CALL_SUBTEST_##PART

    // Instantiates NAME for one device with every combination of layout
    // (ColMajor, RowMajor), vectorization (off, on) and tiling (off, on): eight
    // subtests, ColMajor first, and within a layout in the order
    // (false, Off), (false, On), (vectorized, Off), (vectorized, On).
    #define CALL_SUBTEST_DEVICE_COMBINATIONS(PART, NAME, T, NUM_DIMS, DEVICE_TYPE, DEVICE)                               \
      CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DEVICE_TYPE, false, TiledEvaluation::Off, ColMajor>(DEVICE)));              \
      CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DEVICE_TYPE, false, TiledEvaluation::On, ColMajor>(DEVICE)));               \
      CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DEVICE_TYPE, VECTORIZABLE(true), TiledEvaluation::Off, ColMajor>(DEVICE))); \
      CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DEVICE_TYPE, VECTORIZABLE(true), TiledEvaluation::On, ColMajor>(DEVICE)));  \
      CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DEVICE_TYPE, false, TiledEvaluation::Off, RowMajor>(DEVICE)));              \
      CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DEVICE_TYPE, false, TiledEvaluation::On, RowMajor>(DEVICE)));               \
      CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DEVICE_TYPE, VECTORIZABLE(true), TiledEvaluation::Off, RowMajor>(DEVICE))); \
      CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DEVICE_TYPE, VECTORIZABLE(true), TiledEvaluation::On, RowMajor>(DEVICE)))

    // Runs NAME on both devices: the eight DefaultDevice combinations (using the
    // caller's `default_device`) followed by the eight ThreadPoolDevice
    // combinations (using the caller's `tp_device`).
    #define CALL_SUBTEST_COMBINATIONS(PART, NAME, T, NUM_DIMS)                                          \
      CALL_SUBTEST_DEVICE_COMBINATIONS(PART, NAME, T, NUM_DIMS, DefaultDevice, default_device);         \
      CALL_SUBTEST_DEVICE_COMBINATIONS(PART, NAME, T, NUM_DIMS, ThreadPoolDevice, tp_device)

    // NOTE: Currently only ThreadPoolDevice supports async expression evaluation,
    // so the async variant covers the eight ThreadPoolDevice combinations only.
    #define CALL_ASYNC_SUBTEST_COMBINATIONS(PART, NAME, T, NUM_DIMS)                                    \
      CALL_SUBTEST_DEVICE_COMBINATIONS(PART, NAME, T, NUM_DIMS, ThreadPoolDevice, tp_device)
    652 // Test entry point: sets up the devices the combination macros refer to by name, then runs each executor test for float over several NUM_DIMS values via CALL_SUBTEST_COMBINATIONS / CALL_ASYNC_SUBTEST_COMBINATIONS.
    653 EIGEN_DECLARE_TEST(cxx11_tensor_executor) {
    654   Eigen::DefaultDevice default_device;
    655   // Default device is unused in ASYNC tests (the async macro only references tp_device); mark it as used to avoid an unused-variable warning.
    656   EIGEN_UNUSED_VARIABLE(default_device);
    657   // Thread pool whose size is drawn from internal::random<int>(20, 24); the macros reference the device by the exact name tp_device.
    658   const auto num_threads = internal::random<int>(20, 24);
    659   Eigen::ThreadPool tp(num_threads);
    660   Eigen::ThreadPoolDevice tp_device(&tp, num_threads);
    661   // Part 1: test_execute_unary_expr, NUM_DIMS = 3, 4, 5.
    662   CALL_SUBTEST_COMBINATIONS(1, test_execute_unary_expr, float, 3);
    663   CALL_SUBTEST_COMBINATIONS(1, test_execute_unary_expr, float, 4);
    664   CALL_SUBTEST_COMBINATIONS(1, test_execute_unary_expr, float, 5);
    665   // Part 2: test_execute_binary_expr, NUM_DIMS = 3, 4, 5.
    666   CALL_SUBTEST_COMBINATIONS(2, test_execute_binary_expr, float, 3);
    667   CALL_SUBTEST_COMBINATIONS(2, test_execute_binary_expr, float, 4);
    668   CALL_SUBTEST_COMBINATIONS(2, test_execute_binary_expr, float, 5);
    669   // Part 3: test_execute_broadcasting, NUM_DIMS = 3, 4, 5.
    670   CALL_SUBTEST_COMBINATIONS(3, test_execute_broadcasting, float, 3);
    671   CALL_SUBTEST_COMBINATIONS(3, test_execute_broadcasting, float, 4);
    672   CALL_SUBTEST_COMBINATIONS(3, test_execute_broadcasting, float, 5);
    673   // Part 4: test_execute_chipping_rvalue, NUM_DIMS = 3, 4, 5.
    674   CALL_SUBTEST_COMBINATIONS(4, test_execute_chipping_rvalue, float, 3);
    675   CALL_SUBTEST_COMBINATIONS(4, test_execute_chipping_rvalue, float, 4);
    676   CALL_SUBTEST_COMBINATIONS(4, test_execute_chipping_rvalue, float, 5);
    677   // Part 5: test_execute_chipping_lvalue, NUM_DIMS = 3, 4, 5.
    678   CALL_SUBTEST_COMBINATIONS(5, test_execute_chipping_lvalue, float, 3);
    679   CALL_SUBTEST_COMBINATIONS(5, test_execute_chipping_lvalue, float, 4);
    680   CALL_SUBTEST_COMBINATIONS(5, test_execute_chipping_lvalue, float, 5);
    681   // Part 6: test_execute_shuffle_rvalue, NUM_DIMS = 3, 4, 5.
    682   CALL_SUBTEST_COMBINATIONS(6, test_execute_shuffle_rvalue, float, 3);
    683   CALL_SUBTEST_COMBINATIONS(6, test_execute_shuffle_rvalue, float, 4);
    684   CALL_SUBTEST_COMBINATIONS(6, test_execute_shuffle_rvalue, float, 5);
    685   // Part 7: test_execute_shuffle_lvalue, NUM_DIMS = 3, 4, 5.
    686   CALL_SUBTEST_COMBINATIONS(7, test_execute_shuffle_lvalue, float, 3);
    687   CALL_SUBTEST_COMBINATIONS(7, test_execute_shuffle_lvalue, float, 4);
    688   CALL_SUBTEST_COMBINATIONS(7, test_execute_shuffle_lvalue, float, 5);
    689   // Part 9: test_execute_reshape, NUM_DIMS = 2, 3, 4, 5.
    690   CALL_SUBTEST_COMBINATIONS(9, test_execute_reshape, float, 2);
    691   CALL_SUBTEST_COMBINATIONS(9, test_execute_reshape, float, 3);
    692   CALL_SUBTEST_COMBINATIONS(9, test_execute_reshape, float, 4);
    693   CALL_SUBTEST_COMBINATIONS(9, test_execute_reshape, float, 5);
    694   // Part 10: test_execute_slice_rvalue, NUM_DIMS = 2, 3, 4, 5.
    695   CALL_SUBTEST_COMBINATIONS(10, test_execute_slice_rvalue, float, 2);
    696   CALL_SUBTEST_COMBINATIONS(10, test_execute_slice_rvalue, float, 3);
    697   CALL_SUBTEST_COMBINATIONS(10, test_execute_slice_rvalue, float, 4);
    698   CALL_SUBTEST_COMBINATIONS(10, test_execute_slice_rvalue, float, 5);
    699   // Part 11: test_execute_slice_lvalue, NUM_DIMS = 2, 3, 4, 5.
    700   CALL_SUBTEST_COMBINATIONS(11, test_execute_slice_lvalue, float, 2);
    701   CALL_SUBTEST_COMBINATIONS(11, test_execute_slice_lvalue, float, 3);
    702   CALL_SUBTEST_COMBINATIONS(11, test_execute_slice_lvalue, float, 4);
    703   CALL_SUBTEST_COMBINATIONS(11, test_execute_slice_lvalue, float, 5);
    704   // Part 12: test_execute_broadcasting_of_forced_eval, NUM_DIMS = 2, 3, 4, 5.
    705   CALL_SUBTEST_COMBINATIONS(12, test_execute_broadcasting_of_forced_eval, float, 2);
    706   CALL_SUBTEST_COMBINATIONS(12, test_execute_broadcasting_of_forced_eval, float, 3);
    707   CALL_SUBTEST_COMBINATIONS(12, test_execute_broadcasting_of_forced_eval, float, 4);
    708   CALL_SUBTEST_COMBINATIONS(12, test_execute_broadcasting_of_forced_eval, float, 5);
    709   // Part 13: test_execute_generator_op, NUM_DIMS = 2, 3, 4, 5.
    710   CALL_SUBTEST_COMBINATIONS(13, test_execute_generator_op, float, 2);
    711   CALL_SUBTEST_COMBINATIONS(13, test_execute_generator_op, float, 3);
    712   CALL_SUBTEST_COMBINATIONS(13, test_execute_generator_op, float, 4);
    713   CALL_SUBTEST_COMBINATIONS(13, test_execute_generator_op, float, 5);
    714   // Part 14: test_execute_reverse_rvalue, NUM_DIMS = 1, 2, 3, 4, 5.
    715   CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 1);
    716   CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 2);
    717   CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 3);
    718   CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 4);
    719   CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 5);
    720   // Part 15: test_async_execute_unary_expr (ThreadPoolDevice only), NUM_DIMS = 3, 4, 5.
    721   CALL_ASYNC_SUBTEST_COMBINATIONS(15, test_async_execute_unary_expr, float, 3);
    722   CALL_ASYNC_SUBTEST_COMBINATIONS(15, test_async_execute_unary_expr, float, 4);
    723   CALL_ASYNC_SUBTEST_COMBINATIONS(15, test_async_execute_unary_expr, float, 5);
    724   // Part 16: test_async_execute_binary_expr (ThreadPoolDevice only), NUM_DIMS = 3, 4, 5.
    725   CALL_ASYNC_SUBTEST_COMBINATIONS(16, test_async_execute_binary_expr, float, 3);
    726   CALL_ASYNC_SUBTEST_COMBINATIONS(16, test_async_execute_binary_expr, float, 4);
    727   CALL_ASYNC_SUBTEST_COMBINATIONS(16, test_async_execute_binary_expr, float, 5);
    728   // NOTE(review): no call in this function uses part 8, yet the suffix list below includes it - confirm whether that gap is intentional.
    729   // Force CMake to split this test.
    730   // EIGEN_SUFFIXES;1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16
    731 }