cart-elc

Source code for CART-ELC
git clone git://git.laack.co/cart-elc.git
Log | Files | Refs | README | LICENSE

cxx11_tensor_thread_pool.cpp (25491B)


      1 // This file is part of Eigen, a lightweight C++ template library
      2 // for linear algebra.
      3 //
      4 // Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
      5 //
      6 // This Source Code Form is subject to the terms of the Mozilla
      7 // Public License v. 2.0. If a copy of the MPL was not distributed
      8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
      9 
     10 #define EIGEN_USE_THREADS
     11 
     12 
     13 #include "main.h"
     14 #include <iostream>
     15 #include <Eigen/CXX11/Tensor>
     16 
     17 using Eigen::Tensor;
     18 
     19 class TestAllocator : public Allocator {
     20  public:
     21   ~TestAllocator() EIGEN_OVERRIDE {}
     22   EIGEN_DEVICE_FUNC void* allocate(size_t num_bytes) const EIGEN_OVERRIDE {
     23     const_cast<TestAllocator*>(this)->alloc_count_++;
     24     return internal::aligned_malloc(num_bytes);
     25   }
     26   EIGEN_DEVICE_FUNC void deallocate(void* buffer) const EIGEN_OVERRIDE {
     27     const_cast<TestAllocator*>(this)->dealloc_count_++;
     28     internal::aligned_free(buffer);
     29   }
     30 
     31   int alloc_count() const { return alloc_count_; }
     32   int dealloc_count() const { return dealloc_count_; }
     33 
     34  private:
     35   int alloc_count_ = 0;
     36   int dealloc_count_ = 0;
     37 };
     38 
     39 void test_multithread_elementwise()
     40 {
     41   Tensor<float, 3> in1(200, 30, 70);
     42   Tensor<float, 3> in2(200, 30, 70);
     43   Tensor<double, 3> out(200, 30, 70);
     44 
     45   in1.setRandom();
     46   in2.setRandom();
     47 
     48   Eigen::ThreadPool tp(internal::random<int>(3, 11));
     49   Eigen::ThreadPoolDevice thread_pool_device(&tp, internal::random<int>(3, 11));
     50   out.device(thread_pool_device) = (in1 + in2 * 3.14f).cast<double>();
     51 
     52   for (int i = 0; i < 200; ++i) {
     53     for (int j = 0; j < 30; ++j) {
     54       for (int k = 0; k < 70; ++k) {
     55         VERIFY_IS_APPROX(out(i, j, k), static_cast<double>(in1(i, j, k) + in2(i, j, k) * 3.14f));
     56       }
     57     }
     58   }
     59 }
     60 
     61 void test_async_multithread_elementwise()
     62 {
     63   Tensor<float, 3> in1(200, 30, 70);
     64   Tensor<float, 3> in2(200, 30, 70);
     65   Tensor<double, 3> out(200, 30, 70);
     66 
     67   in1.setRandom();
     68   in2.setRandom();
     69 
     70   Eigen::ThreadPool tp(internal::random<int>(3, 11));
     71   Eigen::ThreadPoolDevice thread_pool_device(&tp, internal::random<int>(3, 11));
     72 
     73   Eigen::Barrier b(1);
     74   out.device(thread_pool_device, [&b]() { b.Notify(); }) = (in1 + in2 * 3.14f).cast<double>();
     75   b.Wait();
     76 
     77   for (int i = 0; i < 200; ++i) {
     78     for (int j = 0; j < 30; ++j) {
     79       for (int k = 0; k < 70; ++k) {
     80         VERIFY_IS_APPROX(out(i, j, k), static_cast<double>(in1(i, j, k) + in2(i, j, k) * 3.14f));
     81       }
     82     }
     83   }
     84 }
     85 
     86 void test_multithread_compound_assignment()
     87 {
     88   Tensor<float, 3> in1(2,3,7);
     89   Tensor<float, 3> in2(2,3,7);
     90   Tensor<float, 3> out(2,3,7);
     91 
     92   in1.setRandom();
     93   in2.setRandom();
     94 
     95   Eigen::ThreadPool tp(internal::random<int>(3, 11));
     96   Eigen::ThreadPoolDevice thread_pool_device(&tp, internal::random<int>(3, 11));
     97   out.device(thread_pool_device) = in1;
     98   out.device(thread_pool_device) += in2 * 3.14f;
     99 
    100   for (int i = 0; i < 2; ++i) {
    101     for (int j = 0; j < 3; ++j) {
    102       for (int k = 0; k < 7; ++k) {
    103         VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f);
    104       }
    105     }
    106   }
    107 }
    108 
    109 template<int DataLayout>
    110 void test_multithread_contraction()
    111 {
    112   Tensor<float, 4, DataLayout> t_left(30, 50, 37, 31);
    113   Tensor<float, 5, DataLayout> t_right(37, 31, 70, 2, 10);
    114   Tensor<float, 5, DataLayout> t_result(30, 50, 70, 2, 10);
    115 
    116   t_left.setRandom();
    117   t_right.setRandom();
    118 
    119   // this contraction should be equivalent to a single matrix multiplication
    120   typedef Tensor<float, 1>::DimensionPair DimPair;
    121   Eigen::array<DimPair, 2> dims({{DimPair(2, 0), DimPair(3, 1)}});
    122 
    123   typedef Map<Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf;
    124   MapXf m_left(t_left.data(), 1500, 1147);
    125   MapXf m_right(t_right.data(), 1147, 1400);
    126   Matrix<float, Dynamic, Dynamic, DataLayout> m_result(1500, 1400);
    127 
    128   Eigen::ThreadPool tp(4);
    129   Eigen::ThreadPoolDevice thread_pool_device(&tp, 4);
    130 
    131   // compute results by separate methods
    132   t_result.device(thread_pool_device) = t_left.contract(t_right, dims);
    133   m_result = m_left * m_right;
    134 
    135  for (ptrdiff_t i = 0; i < t_result.size(); i++) {
    136     VERIFY(&t_result.data()[i] != &m_result.data()[i]);
    137     if (fabsf(t_result(i) - m_result(i)) < 1e-4f) {
    138       continue;
    139     }
    140     if (Eigen::internal::isApprox(t_result(i), m_result(i), 1e-4f)) {
    141       continue;
    142     }
    143     std::cout << "mismatch detected at index " << i << ": " << t_result(i)
    144               << " vs " <<  m_result(i) << std::endl;
    145     assert(false);
    146   }
    147 }
    148 
    149 template<int DataLayout>
    150 void test_contraction_corner_cases()
    151 {
    152   Tensor<float, 2, DataLayout> t_left(32, 500);
    153   Tensor<float, 2, DataLayout> t_right(32, 28*28);
    154   Tensor<float, 2, DataLayout> t_result(500, 28*28);
    155 
    156   t_left = (t_left.constant(-0.5f) + t_left.random()) * 2.0f;
    157   t_right = (t_right.constant(-0.6f) + t_right.random()) * 2.0f;
    158   t_result = t_result.constant(NAN);
    159 
    160   // this contraction should be equivalent to a single matrix multiplication
    161   typedef Tensor<float, 1>::DimensionPair DimPair;
    162   Eigen::array<DimPair, 1> dims{{DimPair(0, 0)}};
    163 
    164   typedef Map<Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf;
    165   MapXf m_left(t_left.data(), 32, 500);
    166   MapXf m_right(t_right.data(), 32, 28*28);
    167   Matrix<float, Dynamic, Dynamic, DataLayout> m_result(500, 28*28);
    168 
    169   Eigen::ThreadPool tp(12);
    170   Eigen::ThreadPoolDevice thread_pool_device(&tp, 12);
    171 
    172   // compute results by separate methods
    173   t_result.device(thread_pool_device) = t_left.contract(t_right, dims);
    174   m_result = m_left.transpose() * m_right;
    175 
    176   for (ptrdiff_t i = 0; i < t_result.size(); i++) {
    177     assert(!(numext::isnan)(t_result.data()[i]));
    178     if (fabsf(t_result.data()[i] - m_result.data()[i]) >= 1e-4f) {
    179       std::cout << "mismatch detected at index " << i << " : " << t_result.data()[i] << " vs " <<  m_result.data()[i] << std::endl;
    180       assert(false);
    181     }
    182   }
    183 
    184   t_left.resize(32, 1);
    185   t_left = (t_left.constant(-0.5f) + t_left.random()) * 2.0f;
    186   t_result.resize (1, 28*28);
    187   t_result = t_result.constant(NAN);
    188   t_result.device(thread_pool_device) = t_left.contract(t_right, dims);
    189   new(&m_left) MapXf(t_left.data(), 32, 1);
    190   m_result = m_left.transpose() * m_right;
    191   for (ptrdiff_t i = 0; i < t_result.size(); i++) {
    192     assert(!(numext::isnan)(t_result.data()[i]));
    193     if (fabsf(t_result.data()[i] - m_result.data()[i]) >= 1e-4f) {
    194       std::cout << "mismatch detected: " << t_result.data()[i] << " vs " <<  m_result.data()[i] << std::endl;
    195       assert(false);
    196     }
    197   }
    198 
    199   t_left.resize(32, 500);
    200   t_right.resize(32, 4);
    201   t_left = (t_left.constant(-0.5f) + t_left.random()) * 2.0f;
    202   t_right = (t_right.constant(-0.6f) + t_right.random()) * 2.0f;
    203   t_result.resize (500, 4);
    204   t_result = t_result.constant(NAN);
    205   t_result.device(thread_pool_device) = t_left.contract(t_right, dims);
    206   new(&m_left) MapXf(t_left.data(), 32, 500);
    207   new(&m_right) MapXf(t_right.data(), 32, 4);
    208   m_result = m_left.transpose() * m_right;
    209   for (ptrdiff_t i = 0; i < t_result.size(); i++) {
    210     assert(!(numext::isnan)(t_result.data()[i]));
    211     if (fabsf(t_result.data()[i] - m_result.data()[i]) >= 1e-4f) {
    212       std::cout << "mismatch detected: " << t_result.data()[i] << " vs " <<  m_result.data()[i] << std::endl;
    213       assert(false);
    214     }
    215   }
    216 
    217   t_left.resize(32, 1);
    218   t_right.resize(32, 4);
    219   t_left = (t_left.constant(-0.5f) + t_left.random()) * 2.0f;
    220   t_right = (t_right.constant(-0.6f) + t_right.random()) * 2.0f;
    221   t_result.resize (1, 4);
    222   t_result = t_result.constant(NAN);
    223   t_result.device(thread_pool_device) = t_left.contract(t_right, dims);
    224   new(&m_left) MapXf(t_left.data(), 32, 1);
    225   new(&m_right) MapXf(t_right.data(), 32, 4);
    226   m_result = m_left.transpose() * m_right;
    227   for (ptrdiff_t i = 0; i < t_result.size(); i++) {
    228     assert(!(numext::isnan)(t_result.data()[i]));
    229     if (fabsf(t_result.data()[i] - m_result.data()[i]) >= 1e-4f) {
    230       std::cout << "mismatch detected: " << t_result.data()[i] << " vs " <<  m_result.data()[i] << std::endl;
    231       assert(false);
    232     }
    233   }
    234 }
    235 
    236 template<int DataLayout>
    237 void test_multithread_contraction_agrees_with_singlethread() {
    238   int contract_size = internal::random<int>(1, 5000);
    239 
    240   Tensor<float, 3, DataLayout> left(internal::random<int>(1, 80),
    241                                     contract_size,
    242                                     internal::random<int>(1, 100));
    243 
    244   Tensor<float, 4, DataLayout> right(internal::random<int>(1, 25),
    245                                      internal::random<int>(1, 37),
    246                                      contract_size,
    247                                      internal::random<int>(1, 51));
    248 
    249   left.setRandom();
    250   right.setRandom();
    251 
    252   // add constants to shift values away from 0 for more precision
    253   left += left.constant(1.5f);
    254   right += right.constant(1.5f);
    255 
    256   typedef Tensor<float, 1>::DimensionPair DimPair;
    257   Eigen::array<DimPair, 1> dims({{DimPair(1, 2)}});
    258 
    259   Eigen::ThreadPool tp(internal::random<int>(2, 11));
    260   Eigen::ThreadPoolDevice thread_pool_device(&tp, internal::random<int>(2, 11));
    261 
    262   Tensor<float, 5, DataLayout> st_result;
    263   st_result = left.contract(right, dims);
    264 
    265   Tensor<float, 5, DataLayout> tp_result(st_result.dimensions());
    266   tp_result.device(thread_pool_device) = left.contract(right, dims);
    267 
    268   VERIFY(dimensions_match(st_result.dimensions(), tp_result.dimensions()));
    269   for (ptrdiff_t i = 0; i < st_result.size(); i++) {
    270     // if both of the values are very small, then do nothing (because the test will fail
    271     // due to numerical precision issues when values are small)
    272     if (numext::abs(st_result.data()[i] - tp_result.data()[i]) >= 1e-4f) {
    273       VERIFY_IS_APPROX(st_result.data()[i], tp_result.data()[i]);
    274     }
    275   }
    276 }
    277 
    278 // Apply Sqrt to all output elements.
    279 struct SqrtOutputKernel {
    280   template <typename Index, typename Scalar>
    281   EIGEN_ALWAYS_INLINE void operator()(
    282       const internal::blas_data_mapper<Scalar, Index, ColMajor>& output_mapper,
    283       const TensorContractionParams&, Index, Index, Index num_rows,
    284       Index num_cols) const {
    285     for (int i = 0; i < num_rows; ++i) {
    286       for (int j = 0; j < num_cols; ++j) {
    287         output_mapper(i, j) = std::sqrt(output_mapper(i, j));
    288       }
    289     }
    290   }
    291 };
    292 
    293 template <int DataLayout>
    294 static void test_multithread_contraction_with_output_kernel() {
    295   typedef Tensor<float, 1>::DimensionPair DimPair;
    296 
    297   const int num_threads = internal::random<int>(2, 11);
    298   ThreadPool threads(num_threads);
    299   Eigen::ThreadPoolDevice device(&threads, num_threads);
    300 
    301   Tensor<float, 4, DataLayout> t_left(30, 50, 8, 31);
    302   Tensor<float, 5, DataLayout> t_right(8, 31, 7, 20, 10);
    303   Tensor<float, 5, DataLayout> t_result(30, 50, 7, 20, 10);
    304 
    305   t_left.setRandom();
    306   t_right.setRandom();
    307   // Put trash in mat4 to verify contraction clears output memory.
    308   t_result.setRandom();
    309 
    310   // Add a little offset so that the results won't be close to zero.
    311   t_left += t_left.constant(1.0f);
    312   t_right += t_right.constant(1.0f);
    313 
    314   typedef Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf;
    315   MapXf m_left(t_left.data(), 1500, 248);
    316   MapXf m_right(t_right.data(), 248, 1400);
    317   Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(1500, 1400);
    318 
    319   // this contraction should be equivalent to a single matrix multiplication
    320   Eigen::array<DimPair, 2> dims({{DimPair(2, 0), DimPair(3, 1)}});
    321 
    322   // compute results by separate methods
    323   t_result.device(device) = t_left.contract(t_right, dims, SqrtOutputKernel());
    324 
    325   m_result = m_left * m_right;
    326 
    327   for (Index i = 0; i < t_result.dimensions().TotalSize(); i++) {
    328     VERIFY(&t_result.data()[i] != &m_result.data()[i]);
    329     VERIFY_IS_APPROX(t_result.data()[i], std::sqrt(m_result.data()[i]));
    330   }
    331 }
    332 
    333 template<int DataLayout>
    334 void test_async_multithread_contraction_agrees_with_singlethread()
    335 {
    336   int contract_size = internal::random<int>(100, 500);
    337 
    338   Tensor<float, 3, DataLayout> left(internal::random<int>(10, 40),
    339                                     contract_size,
    340                                     internal::random<int>(10, 40));
    341 
    342   Tensor<float, 4, DataLayout> right(
    343       internal::random<int>(1, 20), internal::random<int>(1, 20), contract_size,
    344       internal::random<int>(1, 20));
    345 
    346   left.setRandom();
    347   right.setRandom();
    348 
    349   // add constants to shift values away from 0 for more precision
    350   left += left.constant(1.5f);
    351   right += right.constant(1.5f);
    352 
    353   typedef Tensor<float, 1>::DimensionPair DimPair;
    354   Eigen::array<DimPair, 1> dims({{DimPair(1, 2)}});
    355 
    356   Eigen::ThreadPool tp(internal::random<int>(2, 11));
    357   Eigen::ThreadPoolDevice thread_pool_device(&tp, internal::random<int>(8, 32));
    358 
    359   Tensor<float, 5, DataLayout> st_result;
    360   st_result = left.contract(right, dims);
    361 
    362   Tensor<float, 5, DataLayout> tp_result(st_result.dimensions());
    363 
    364   Eigen::Barrier barrier(1);
    365   tp_result.device(thread_pool_device, [&barrier]() { barrier.Notify(); }) =
    366       left.contract(right, dims);
    367   barrier.Wait();
    368 
    369   VERIFY(dimensions_match(st_result.dimensions(), tp_result.dimensions()));
    370   for (ptrdiff_t i = 0; i < st_result.size(); i++) {
    371     // if both of the values are very small, then do nothing (because the test
    372     // will fail due to numerical precision issues when values are small)
    373     if (numext::abs(st_result.data()[i] - tp_result.data()[i]) >= 1e-4f) {
    374       VERIFY_IS_APPROX(st_result.data()[i], tp_result.data()[i]);
    375     }
    376   }
    377 }
    378 
    379 // We are triggering 'evalShardedByInnerDim' optimization.
    380 template <int DataLayout>
    381 static void test_sharded_by_inner_dim_contraction()
    382 {
    383   typedef Tensor<float, 1>::DimensionPair DimPair;
    384 
    385   const int num_threads = internal::random<int>(4, 16);
    386   ThreadPool threads(num_threads);
    387   Eigen::ThreadPoolDevice device(&threads, num_threads);
    388 
    389   Tensor<float, 2, DataLayout> t_left(2, 10000);
    390   Tensor<float, 2, DataLayout> t_right(10000, 10);
    391   Tensor<float, 2, DataLayout> t_result(2, 10);
    392 
    393   t_left.setRandom();
    394   t_right.setRandom();
    395   // Put trash in t_result to verify contraction clears output memory.
    396   t_result.setRandom();
    397 
    398   // Add a little offset so that the results won't be close to zero.
    399   t_left += t_left.constant(1.0f);
    400   t_right += t_right.constant(1.0f);
    401 
    402   typedef Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf;
    403   MapXf m_left(t_left.data(), 2, 10000);
    404   MapXf m_right(t_right.data(), 10000, 10);
    405   Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(2, 10);
    406 
    407   // this contraction should be equivalent to a single matrix multiplication
    408   Eigen::array<DimPair, 1> dims({{DimPair(1, 0)}});
    409 
    410   // compute results by separate methods
    411   t_result.device(device) = t_left.contract(t_right, dims);
    412   m_result = m_left * m_right;
    413 
    414   for (Index i = 0; i < t_result.dimensions().TotalSize(); i++) {
    415     VERIFY_IS_APPROX(t_result.data()[i], m_result.data()[i]);
    416   }
    417 }
    418 
    419 // We are triggering 'evalShardedByInnerDim' optimization with output kernel.
    420 template <int DataLayout>
    421 static void test_sharded_by_inner_dim_contraction_with_output_kernel()
    422 {
    423   typedef Tensor<float, 1>::DimensionPair DimPair;
    424 
    425   const int num_threads = internal::random<int>(4, 16);
    426   ThreadPool threads(num_threads);
    427   Eigen::ThreadPoolDevice device(&threads, num_threads);
    428 
    429   Tensor<float, 2, DataLayout> t_left(2, 10000);
    430   Tensor<float, 2, DataLayout> t_right(10000, 10);
    431   Tensor<float, 2, DataLayout> t_result(2, 10);
    432 
    433   t_left.setRandom();
    434   t_right.setRandom();
    435   // Put trash in t_result to verify contraction clears output memory.
    436   t_result.setRandom();
    437 
    438   // Add a little offset so that the results won't be close to zero.
    439   t_left += t_left.constant(1.0f);
    440   t_right += t_right.constant(1.0f);
    441 
    442   typedef Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf;
    443   MapXf m_left(t_left.data(), 2, 10000);
    444   MapXf m_right(t_right.data(), 10000, 10);
    445   Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(2, 10);
    446 
    447   // this contraction should be equivalent to a single matrix multiplication
    448   Eigen::array<DimPair, 1> dims({{DimPair(1, 0)}});
    449 
    450   // compute results by separate methods
    451   t_result.device(device) = t_left.contract(t_right, dims, SqrtOutputKernel());
    452   m_result = m_left * m_right;
    453 
    454   for (Index i = 0; i < t_result.dimensions().TotalSize(); i++) {
    455     VERIFY_IS_APPROX(t_result.data()[i], std::sqrt(m_result.data()[i]));
    456   }
    457 }
    458 
    459 // We are triggering 'evalShardedByInnerDim' optimization.
    460 template <int DataLayout>
    461 static void test_async_sharded_by_inner_dim_contraction()
    462 {
    463   typedef Tensor<float, 1>::DimensionPair DimPair;
    464 
    465   const int num_threads = internal::random<int>(4, 16);
    466   ThreadPool threads(num_threads);
    467   Eigen::ThreadPoolDevice device(&threads, num_threads);
    468 
    469   Tensor<float, 2, DataLayout> t_left(2, 10000);
    470   Tensor<float, 2, DataLayout> t_right(10000, 10);
    471   Tensor<float, 2, DataLayout> t_result(2, 10);
    472 
    473   t_left.setRandom();
    474   t_right.setRandom();
    475   // Put trash in t_result to verify contraction clears output memory.
    476   t_result.setRandom();
    477 
    478   // Add a little offset so that the results won't be close to zero.
    479   t_left += t_left.constant(1.0f);
    480   t_right += t_right.constant(1.0f);
    481 
    482   typedef Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf;
    483   MapXf m_left(t_left.data(), 2, 10000);
    484   MapXf m_right(t_right.data(), 10000, 10);
    485   Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(2, 10);
    486 
    487   // this contraction should be equivalent to a single matrix multiplication
    488   Eigen::array<DimPair, 1> dims({{DimPair(1, 0)}});
    489 
    490   // compute results by separate methods
    491   Eigen::Barrier barrier(1);
    492   t_result.device(device, [&barrier]() { barrier.Notify(); }) =
    493       t_left.contract(t_right, dims);
    494   barrier.Wait();
    495 
    496   m_result = m_left * m_right;
    497 
    498   for (Index i = 0; i < t_result.dimensions().TotalSize(); i++) {
    499     VERIFY_IS_APPROX(t_result.data()[i], m_result.data()[i]);
    500   }
    501 }
    502 
    503 // We are triggering 'evalShardedByInnerDim' optimization with output kernel.
    504 template <int DataLayout>
    505 static void test_async_sharded_by_inner_dim_contraction_with_output_kernel()
    506 {
    507   typedef Tensor<float, 1>::DimensionPair DimPair;
    508 
    509   const int num_threads = internal::random<int>(4, 16);
    510   ThreadPool threads(num_threads);
    511   Eigen::ThreadPoolDevice device(&threads, num_threads);
    512 
    513   Tensor<float, 2, DataLayout> t_left(2, 10000);
    514   Tensor<float, 2, DataLayout> t_right(10000, 10);
    515   Tensor<float, 2, DataLayout> t_result(2, 10);
    516 
    517   t_left.setRandom();
    518   t_right.setRandom();
    519   // Put trash in t_result to verify contraction clears output memory.
    520   t_result.setRandom();
    521 
    522   // Add a little offset so that the results won't be close to zero.
    523   t_left += t_left.constant(1.0f);
    524   t_right += t_right.constant(1.0f);
    525 
    526   typedef Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf;
    527   MapXf m_left(t_left.data(), 2, 10000);
    528   MapXf m_right(t_right.data(), 10000, 10);
    529   Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(2, 10);
    530 
    531   // this contraction should be equivalent to a single matrix multiplication
    532   Eigen::array<DimPair, 1> dims({{DimPair(1, 0)}});
    533 
    534   // compute results by separate methods
    535   Eigen::Barrier barrier(1);
    536   t_result.device(device, [&barrier]() { barrier.Notify(); }) =
    537       t_left.contract(t_right, dims, SqrtOutputKernel());
    538   barrier.Wait();
    539   m_result = m_left * m_right;
    540 
    541   for (Index i = 0; i < t_result.dimensions().TotalSize(); i++) {
    542     VERIFY_IS_APPROX(t_result.data()[i], std::sqrt(m_result.data()[i]));
    543   }
    544 }
    545 
    546 template<int DataLayout>
    547 void test_full_contraction() {
    548   int contract_size1 = internal::random<int>(1, 500);
    549   int contract_size2 = internal::random<int>(1, 500);
    550 
    551   Tensor<float, 2, DataLayout> left(contract_size1,
    552                                     contract_size2);
    553   Tensor<float, 2, DataLayout> right(contract_size1,
    554                                     contract_size2);
    555   left.setRandom();
    556   right.setRandom();
    557 
    558   // add constants to shift values away from 0 for more precision
    559   left += left.constant(1.5f);
    560   right += right.constant(1.5f);
    561 
    562   typedef Tensor<float, 2>::DimensionPair DimPair;
    563   Eigen::array<DimPair, 2> dims({{DimPair(0, 0), DimPair(1, 1)}});
    564 
    565   Eigen::ThreadPool tp(internal::random<int>(2, 11));
    566   Eigen::ThreadPoolDevice thread_pool_device(&tp, internal::random<int>(2, 11));
    567 
    568   Tensor<float, 0, DataLayout> st_result;
    569   st_result = left.contract(right, dims);
    570 
    571   Tensor<float, 0, DataLayout> tp_result;
    572   tp_result.device(thread_pool_device) = left.contract(right, dims);
    573 
    574   VERIFY(dimensions_match(st_result.dimensions(), tp_result.dimensions()));
    575   // if both of the values are very small, then do nothing (because the test will fail
    576   // due to numerical precision issues when values are small)
    577   if (numext::abs(st_result() - tp_result()) >= 1e-4f) {
    578     VERIFY_IS_APPROX(st_result(), tp_result());
    579   }
    580 }
    581 
    582 template<int DataLayout>
    583 void test_multithreaded_reductions() {
    584   const int num_threads = internal::random<int>(3, 11);
    585   ThreadPool thread_pool(num_threads);
    586   Eigen::ThreadPoolDevice thread_pool_device(&thread_pool, num_threads);
    587 
    588   const int num_rows = internal::random<int>(13, 732);
    589   const int num_cols = internal::random<int>(13, 732);
    590   Tensor<float, 2, DataLayout> t1(num_rows, num_cols);
    591   t1.setRandom();
    592 
    593   Tensor<float, 0, DataLayout> full_redux;
    594   full_redux = t1.sum();
    595 
    596   Tensor<float, 0, DataLayout> full_redux_tp;
    597   full_redux_tp.device(thread_pool_device) = t1.sum();
    598 
    599   // Check that the single threaded and the multi threaded reductions return
    600   // the same result.
    601   VERIFY_IS_APPROX(full_redux(), full_redux_tp());
    602 }
    603 
    604 
    605 void test_memcpy() {
    606 
    607   for (int i = 0; i < 5; ++i) {
    608     const int num_threads = internal::random<int>(3, 11);
    609     Eigen::ThreadPool tp(num_threads);
    610     Eigen::ThreadPoolDevice thread_pool_device(&tp, num_threads);
    611 
    612     const int size = internal::random<int>(13, 7632);
    613     Tensor<float, 1> t1(size);
    614     t1.setRandom();
    615     std::vector<float> result(size);
    616     thread_pool_device.memcpy(&result[0], t1.data(), size*sizeof(float));
    617     for (int j = 0; j < size; j++) {
    618       VERIFY_IS_EQUAL(t1(j), result[j]);
    619     }
    620   }
    621 }
    622 
    623 
    624 void test_multithread_random()
    625 {
    626   Eigen::ThreadPool tp(2);
    627   Eigen::ThreadPoolDevice device(&tp, 2);
    628   Tensor<float, 1> t(1 << 20);
    629   t.device(device) = t.random<Eigen::internal::NormalRandomGenerator<float>>();
    630 }
    631 
    632 template<int DataLayout>
    633 void test_multithread_shuffle(Allocator* allocator)
    634 {
    635   Tensor<float, 4, DataLayout> tensor(17,5,7,11);
    636   tensor.setRandom();
    637 
    638   const int num_threads = internal::random<int>(2, 11);
    639   ThreadPool threads(num_threads);
    640   Eigen::ThreadPoolDevice device(&threads, num_threads, allocator);
    641 
    642   Tensor<float, 4, DataLayout> shuffle(7,5,11,17);
    643   array<ptrdiff_t, 4> shuffles = {{2,1,3,0}};
    644   shuffle.device(device) = tensor.shuffle(shuffles);
    645 
    646   for (int i = 0; i < 17; ++i) {
    647     for (int j = 0; j < 5; ++j) {
    648       for (int k = 0; k < 7; ++k) {
    649         for (int l = 0; l < 11; ++l) {
    650           VERIFY_IS_EQUAL(tensor(i,j,k,l), shuffle(k,j,l,i));
    651         }
    652       }
    653     }
    654   }
    655 }
    656 
    657 void test_threadpool_allocate(TestAllocator* allocator)
    658 {
    659   const int num_threads = internal::random<int>(2, 11);
    660   const int num_allocs = internal::random<int>(2, 11);
    661   ThreadPool threads(num_threads);
    662   Eigen::ThreadPoolDevice device(&threads, num_threads, allocator);
    663 
    664   for (int a = 0; a < num_allocs; ++a) {
    665     void* ptr = device.allocate(512);
    666     device.deallocate(ptr);
    667   }
    668   VERIFY(allocator != NULL);
    669   VERIFY_IS_EQUAL(allocator->alloc_count(), num_allocs);
    670   VERIFY_IS_EQUAL(allocator->dealloc_count(), num_allocs);
    671 }
    672 
    673 EIGEN_DECLARE_TEST(cxx11_tensor_thread_pool)
    674 {
    675   CALL_SUBTEST_1(test_multithread_elementwise());
    676   CALL_SUBTEST_1(test_async_multithread_elementwise());
    677   CALL_SUBTEST_1(test_multithread_compound_assignment());
    678 
    679   CALL_SUBTEST_2(test_multithread_contraction<ColMajor>());
    680   CALL_SUBTEST_2(test_multithread_contraction<RowMajor>());
    681 
    682   CALL_SUBTEST_3(test_multithread_contraction_agrees_with_singlethread<ColMajor>());
    683   CALL_SUBTEST_3(test_multithread_contraction_agrees_with_singlethread<RowMajor>());
    684   CALL_SUBTEST_3(test_multithread_contraction_with_output_kernel<ColMajor>());
    685   CALL_SUBTEST_3(test_multithread_contraction_with_output_kernel<RowMajor>());
    686 
    687   CALL_SUBTEST_4(test_async_multithread_contraction_agrees_with_singlethread<ColMajor>());
    688   CALL_SUBTEST_4(test_async_multithread_contraction_agrees_with_singlethread<RowMajor>());
    689 
    690   // Test EvalShardedByInnerDimContext parallelization strategy.
    691   CALL_SUBTEST_5(test_sharded_by_inner_dim_contraction<ColMajor>());
    692   CALL_SUBTEST_5(test_sharded_by_inner_dim_contraction<RowMajor>());
    693   CALL_SUBTEST_5(test_sharded_by_inner_dim_contraction_with_output_kernel<ColMajor>());
    694   CALL_SUBTEST_5(test_sharded_by_inner_dim_contraction_with_output_kernel<RowMajor>());
    695 
    696   CALL_SUBTEST_6(test_async_sharded_by_inner_dim_contraction<ColMajor>());
    697   CALL_SUBTEST_6(test_async_sharded_by_inner_dim_contraction<RowMajor>());
    698   CALL_SUBTEST_6(test_async_sharded_by_inner_dim_contraction_with_output_kernel<ColMajor>());
    699   CALL_SUBTEST_6(test_async_sharded_by_inner_dim_contraction_with_output_kernel<RowMajor>());
    700 
    701   // Exercise various cases that have been problematic in the past.
    702   CALL_SUBTEST_7(test_contraction_corner_cases<ColMajor>());
    703   CALL_SUBTEST_7(test_contraction_corner_cases<RowMajor>());
    704 
    705   CALL_SUBTEST_8(test_full_contraction<ColMajor>());
    706   CALL_SUBTEST_8(test_full_contraction<RowMajor>());
    707 
    708   CALL_SUBTEST_9(test_multithreaded_reductions<ColMajor>());
    709   CALL_SUBTEST_9(test_multithreaded_reductions<RowMajor>());
    710 
    711   CALL_SUBTEST_10(test_memcpy());
    712   CALL_SUBTEST_10(test_multithread_random());
    713 
    714   TestAllocator test_allocator;
    715   CALL_SUBTEST_11(test_multithread_shuffle<ColMajor>(NULL));
    716   CALL_SUBTEST_11(test_multithread_shuffle<RowMajor>(&test_allocator));
    717   CALL_SUBTEST_11(test_threadpool_allocate(&test_allocator));
    718 
    719   // Force CMake to split this test.
    720   // EIGEN_SUFFIXES;1;2;3;4;5;6;7;8;9;10;11
    721 }