cart-elc

Source code for CART-ELC
git clone git://git.laack.co/cart-elc.git
Log | Files | Refs | README | LICENSE

cxx11_tensor_contract_sycl.cpp (47521B)


      1 // This file is part of Eigen, a lightweight C++ template library
      2 // for linear algebra.
      3 //
      4 // Copyright (C) 2016
      5 // Mehdi Goli    Codeplay Software Ltd.
      6 // Ralph Potter  Codeplay Software Ltd.
      7 // Luke Iwanski  Codeplay Software Ltd.
      8 // Contact: <eigen@codeplay.com>
      9 //
     10 // This Source Code Form is subject to the terms of the Mozilla
     11 // Public License v. 2.0. If a copy of the MPL was not distributed
     12 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
     13 
     14 #define EIGEN_TEST_NO_LONGDOUBLE
     15 #define EIGEN_TEST_NO_COMPLEX
     16 
     17 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
     18 #define EIGEN_USE_SYCL
     19 
     20 #include <algorithm>
     21 #include <chrono>
     22 #include <ctime>
     23 #include <iostream>
     24 
     25 #include "main.h"
     26 
     27 #include <unsupported/Eigen/CXX11/Tensor>
     28 
     29 using Eigen::array;
     30 using Eigen::SyclDevice;
     31 using Eigen::Tensor;
     32 using Eigen::TensorMap;
     33 
     34 template <int DataLayout, typename DataType, typename IndexType,
     35           typename Device>
     36 void static test_sycl_contraction(const Device &sycl_device, IndexType m_size,
     37                                   IndexType k_size, IndexType n_size) {
     38   typedef typename Tensor<DataType, 1, DataLayout, IndexType>::DimensionPair
     39       DimPair;
     40   static const DataType error_threshold = DataType(1e-4);
     41   // with these dimensions, the output has 300 * 140 elements, which is
     42   // more than 30 * 1024, which is the number of threads in blocks on
     43   // a 15 SM GK110 GPU
     44   Tensor<DataType, 2, DataLayout, IndexType> t_left(m_size, k_size);
     45   Tensor<DataType, 2, DataLayout, IndexType> t_right(k_size, n_size);
     46   Tensor<DataType, 2, DataLayout, IndexType> t_result(m_size, n_size);
     47   Tensor<DataType, 2, DataLayout, IndexType> t_result_gpu(m_size, n_size);
     48   Eigen::array<DimPair, 1> dims = {{DimPair(1, 0)}};
     49   Eigen::array<IndexType, 2> left_dims = {{m_size, k_size}};
     50   Eigen::array<IndexType, 2> right_dims = {{k_size, n_size}};
     51   Eigen::array<IndexType, 2> result_dims = {{m_size, n_size}};
     52 
     53   t_left.setRandom();
     54   t_right.setRandom();
     55 
     56   std::size_t t_left_bytes = t_left.size() * sizeof(DataType);
     57   std::size_t t_right_bytes = t_right.size() * sizeof(DataType);
     58   std::size_t t_result_bytes = t_result.size() * sizeof(DataType);
     59 
     60   DataType *d_t_left =
     61       static_cast<DataType *>(sycl_device.allocate(t_left_bytes));
     62   DataType *d_t_right =
     63       static_cast<DataType *>(sycl_device.allocate(t_right_bytes));
     64   DataType *d_t_result =
     65       static_cast<DataType *>(sycl_device.allocate(t_result_bytes));
     66 
     67   Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
     68       gpu_t_left(d_t_left, left_dims);
     69   Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
     70       gpu_t_right(d_t_right, right_dims);
     71   Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
     72       gpu_t_result(d_t_result, result_dims);
     73 
     74   sycl_device.memcpyHostToDevice(d_t_left, t_left.data(), t_left_bytes);
     75   sycl_device.memcpyHostToDevice(d_t_right, t_right.data(), t_right_bytes);
     76 
     77   gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims);
     78   sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result,
     79                                  t_result_bytes);
     80 
     81   t_result = t_left.contract(t_right, dims);
     82 
     83   for (IndexType i = 0; i < t_result.size(); i++) {
     84     if (static_cast<DataType>(std::fabs(static_cast<DataType>(
     85             t_result(i) - t_result_gpu(i)))) < error_threshold) {
     86       continue;
     87     }
     88     if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i),
     89                                   error_threshold)) {
     90       continue;
     91     }
     92 
     93     std::cout << "M : " << m_size << ", N : " << n_size << ", K : " << k_size
     94               << ", mismatch detected at IndexType " << i << ": " << t_result(i)
     95               << " vs " << t_result_gpu(i) << std::endl;
     96     VERIFY_IS_APPROX(t_result_gpu(i), t_result(i));
     97   }
     98   sycl_device.deallocate(d_t_left);
     99   sycl_device.deallocate(d_t_right);
    100   sycl_device.deallocate(d_t_result);
    101 }
    102 
    103 template <int DataLayout, typename DataType, typename IndexType,
    104           typename Device>
    105 void test_sycl_contraction_m(const Device &sycl_device) {
    106   for (IndexType k = 32; k < 256; k++) {
    107     test_sycl_contraction<DataLayout, DataType, IndexType>(sycl_device, k, 128,
    108                                                            128);
    109   }
    110 }
    111 
    112 template <int DataLayout, typename DataType, typename IndexType,
    113           typename Device>
    114 void test_sycl_contraction_k(const Device &sycl_device) {
    115   for (IndexType k = 32; k < 256; k++) {
    116     test_sycl_contraction<DataLayout, DataType, IndexType>(sycl_device, 128, k,
    117                                                            128);
    118   }
    119 }
    120 
    121 template <int DataLayout, typename DataType, typename IndexType,
    122           typename Device>
    123 void test_sycl_contraction_n(const Device &sycl_device) {
    124   for (IndexType k = 32; k < 256; k++) {
    125     test_sycl_contraction<DataLayout, DataType, IndexType>(sycl_device, 128,
    126                                                            128, k);
    127   }
    128 }
    129 
    130 template <int DataLayout, typename DataType, typename IndexType,
    131           typename Device>
    132 void test_sycl_contraction_sizes(const Device &sycl_device) {
    133   IndexType m_sizes[] = {31,  39,  63,  64,  65,   127,  129, 255,
    134                          257, 511, 512, 513, 1023, 1024, 1025};
    135 
    136   IndexType n_sizes[] = {31,  39,  63,  64,  65,   127,  129, 255,
    137                          257, 511, 512, 513, 1023, 1024, 1025};
    138 
    139   IndexType k_sizes[] = {31,  39,  63,  64,  65,  95,   96,   127, 129,
    140                          255, 257, 511, 512, 513, 1023, 1024, 1025};
    141 
    142   for (IndexType i = 0; i < 15; i++) {
    143     for (IndexType j = 0; j < 15; j++) {
    144       for (IndexType k = 0; k < 17; k++) {
    145         test_sycl_contraction<DataLayout, DataType, IndexType>(
    146             sycl_device, m_sizes[i], n_sizes[j], k_sizes[k]);
    147       }
    148     }
    149   }
    150 }
    151 
    152 template <int DataLayout, typename DataType, typename IndexType,
    153           typename Device>
    154 void static test_no_out_of_bounds(const Device &sycl_device, IndexType m_size,
    155                                   IndexType k_size, IndexType n_size) {
    156   typedef typename Tensor<DataType, 1, DataLayout, IndexType>::DimensionPair
    157       DimPair;
    158   static const DataType error_threshold = DataType(1e-4);
    159   Tensor<DataType, 2, DataLayout, IndexType> t_left(m_size, k_size);
    160   Tensor<DataType, 2, DataLayout, IndexType> t_right(k_size, n_size);
    161   Tensor<DataType, 2, DataLayout, IndexType> t_result(m_size, n_size);
    162 
    163   Eigen::array<DimPair, 1> dims = {{DimPair(1, 0)}};
    164   Eigen::array<IndexType, 2> left_dims = {{m_size, k_size}};
    165   Eigen::array<IndexType, 2> right_dims = {{k_size, n_size}};
    166   Eigen::array<IndexType, 2> result_dims = {{m_size, n_size}};
    167 
    168   t_left.setRandom();
    169   t_right.setRandom();
    170 
    171   // Allocate buffers twice as big to check for invalid read and write
    172   auto padded_left_size = 2 * t_left.size();
    173   auto padded_right_size = 2 * t_right.size();
    174   auto padded_result_size = 2 * t_result.size();
    175 
    176   std::size_t t_left_bytes = padded_left_size * sizeof(DataType);
    177   std::size_t t_right_bytes = padded_right_size * sizeof(DataType);
    178   std::size_t t_result_bytes = padded_result_size * sizeof(DataType);
    179 
    180   DataType *d_t_left =
    181       static_cast<DataType *>(sycl_device.allocate(t_left_bytes));
    182   DataType *d_t_right =
    183       static_cast<DataType *>(sycl_device.allocate(t_right_bytes));
    184   DataType *d_t_result =
    185       static_cast<DataType *>(sycl_device.allocate(t_result_bytes));
    186 
    187   // TensorMaps are still of the same size than the Tensors
    188   Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
    189       gpu_t_left(d_t_left, left_dims);
    190   Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
    191       gpu_t_right(d_t_right, right_dims);
    192   Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
    193       gpu_t_result(d_t_result, result_dims);
    194 
    195   // Write nan after the actual buffer to propagate nans everywhere in case of
    196   // invalid reads
    197   DataType nan = std::numeric_limits<DataType>::quiet_NaN();
    198   auto host_left_data = new DataType[padded_left_size];
    199   std::copy_n(t_left.data(), t_left.size(), host_left_data);
    200   std::fill_n(host_left_data + t_left.size(), t_left.size(), nan);
    201   auto host_right_data = new DataType[padded_right_size];
    202   std::copy_n(t_right.data(), t_right.size(), host_right_data);
    203   std::fill_n(host_right_data + t_right.size(), t_right.size(), nan);
    204   auto host_result_data = new DataType[padded_result_size];
    205   std::fill_n(host_result_data, padded_result_size, nan);
    206 
    207   sycl_device.memcpyHostToDevice(d_t_left, host_left_data, t_left_bytes);
    208   sycl_device.memcpyHostToDevice(d_t_right, host_right_data, t_right_bytes);
    209   sycl_device.memcpyHostToDevice(d_t_result, host_result_data, t_result_bytes);
    210 
    211   gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims);
    212   sycl_device.memcpyDeviceToHost(host_result_data, d_t_result, t_result_bytes);
    213 
    214   t_result = t_left.contract(t_right, dims);
    215 
    216   for (IndexType i = 0; i < t_result.size(); i++) {
    217     if (static_cast<DataType>(std::fabs(static_cast<DataType>(
    218             t_result(i) - host_result_data[i]))) < error_threshold) {
    219       continue;
    220     }
    221     if (Eigen::internal::isApprox(t_result(i), host_result_data[i],
    222                                   error_threshold)) {
    223       continue;
    224     }
    225     if (std::isnan(host_result_data[i])) {
    226       std::cout << "M : " << m_size << ", N : " << n_size << ", K : " << k_size
    227                 << ", invalid read detected at IndexType " << i << ": "
    228                 << t_result(i) << " vs " << host_result_data[i] << std::endl;
    229     } else {
    230       std::cout << "M : " << m_size << ", N : " << n_size << ", K : " << k_size
    231                 << ", mismatch detected at IndexType " << i << ": "
    232                 << t_result(i) << " vs " << host_result_data[i] << std::endl;
    233     }
    234     VERIFY_IS_APPROX(host_result_data[i], t_result(i));
    235   }
    236   // Make sure that the rest of the result is still nans
    237   for (IndexType i = t_result.size(); i < padded_result_size; i++) {
    238     if (std::isnan(host_result_data[i])) {
    239       continue;
    240     }
    241     std::cout << "M : " << m_size << ", N : " << n_size << ", K : " << k_size
    242               << ", invalid write detected at IndexType " << i << ": "
    243               << host_result_data[i] << std::endl;
    244     VERIFY_IS_APPROX(host_result_data[i], t_result(i));
    245   }
    246   sycl_device.deallocate(d_t_left);
    247   sycl_device.deallocate(d_t_right);
    248   sycl_device.deallocate(d_t_result);
    249 
    250   delete[] host_left_data;
    251   delete[] host_right_data;
    252   delete[] host_result_data;
    253 }
    254 
    255 template <int DataLayout, typename DataType, typename IndexType,
    256           typename Device>
    257 void test_scalar(const Device &sycl_device, IndexType m_size, IndexType k_size,
    258                  IndexType n_size) {
    259   // std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size <<
    260   // ")" << std::endl;
    261   // with these dimensions, the output has 300 * 140 elements, which is
    262   // more than 30 * 1024, which is the number of threads in blocks on
    263   // a 15 SM GK110 GPU
    264   typedef typename Tensor<DataType, 1, DataLayout, IndexType>::DimensionPair
    265       DimPair;
    266   static const DataType error_threshold = DataType(1e-4);
    267   Tensor<DataType, 2, DataLayout, IndexType> t_left(m_size, k_size);
    268   Tensor<DataType, 2, DataLayout, IndexType> t_right(k_size, n_size);
    269   Tensor<DataType, 0, DataLayout, IndexType> t_result;
    270   Tensor<DataType, 0, DataLayout, IndexType> t_result_gpu;
    271   Eigen::array<DimPair, 2> dims = {{DimPair(0, 0), DimPair(1, 1)}};
    272   Eigen::array<IndexType, 2> left_dims = {{m_size, k_size}};
    273   Eigen::array<IndexType, 2> right_dims = {{k_size, n_size}};
    274   t_left.setRandom();
    275   t_right.setRandom();
    276 
    277   std::size_t t_left_bytes = t_left.size() * sizeof(DataType);
    278   std::size_t t_right_bytes = t_right.size() * sizeof(DataType);
    279   std::size_t t_result_bytes = sizeof(DataType);
    280 
    281   DataType *d_t_left =
    282       static_cast<DataType *>(sycl_device.allocate(t_left_bytes));
    283   DataType *d_t_right =
    284       static_cast<DataType *>(sycl_device.allocate(t_right_bytes));
    285   DataType *d_t_result =
    286       static_cast<DataType *>(sycl_device.allocate(t_result_bytes));
    287 
    288   Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
    289       gpu_t_left(d_t_left, left_dims);
    290   Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
    291       gpu_t_right(d_t_right, right_dims);
    292   Eigen::TensorMap<Eigen::Tensor<DataType, 0, DataLayout, IndexType>>
    293       gpu_t_result(d_t_result);
    294 
    295   sycl_device.memcpyHostToDevice(d_t_left, t_left.data(), t_left_bytes);
    296   sycl_device.memcpyHostToDevice(d_t_right, t_right.data(), t_right_bytes);
    297 
    298   gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims);
    299   sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result,
    300                                  t_result_bytes);
    301 
    302   t_result = t_left.contract(t_right, dims);
    303 
    304   if (static_cast<DataType>(std::fabs(static_cast<DataType>(
    305           t_result() - t_result_gpu()))) > error_threshold &&
    306       !Eigen::internal::isApprox(t_result(), t_result_gpu(), error_threshold)) {
    307     std::cout << "K: " << k_size << ", N: " << n_size << ", M: " << m_size
    308               << " : mismatch detected: " << t_result() << " vs "
    309               << t_result_gpu() << std::endl;
    310     VERIFY_IS_APPROX(t_result_gpu(), t_result());
    311   }
    312 
    313   sycl_device.deallocate(d_t_left);
    314   sycl_device.deallocate(d_t_right);
    315   sycl_device.deallocate(d_t_result);
    316 }
    317 
    318 template <int DataLayout, typename DataType, typename IndexType,
    319           typename Device>
    320 void contraction_batch(const Device &sycl_device, IndexType m_size,
    321                        IndexType k_size, IndexType n_size, IndexType m_batch,
    322                        IndexType start, IndexType limit) {
    323   typedef typename Tensor<DataType, 1, DataLayout, IndexType>::DimensionPair
    324       DimPair;
    325   static const DataType error_threshold = DataType(1e-4);
    326   typedef Eigen::array<IndexType, 3> TensorDim;
    327   typedef Eigen::Tensor<DataType, 3, DataLayout, IndexType> TensorType;
    328   TensorDim left_dims = {{m_batch, k_size, m_size}};
    329   TensorDim right_dims = {{m_batch, n_size, k_size}};
    330   TensorDim res_dims = {{m_batch, m_size, n_size}};
    331   Eigen::array<DimPair, 1> contract_pairs = {{DimPair(0, 1)}};
    332 
    333   TensorType t_left(left_dims);
    334   TensorType t_right(right_dims);
    335   TensorType t_result_gpu(res_dims);
    336   TensorType t_result(res_dims);
    337 
    338   t_left.setRandom();
    339   t_right.setRandom();
    340 
    341   std::size_t t_left_bytes = t_left.size() * sizeof(DataType);
    342   std::size_t t_right_bytes = t_right.size() * sizeof(DataType);
    343   std::size_t t_result_bytes = t_result.size() * sizeof(DataType);
    344 
    345   DataType *d_t_left =
    346       static_cast<DataType *>(sycl_device.allocate(t_left_bytes));
    347   DataType *d_t_right =
    348       static_cast<DataType *>(sycl_device.allocate(t_right_bytes));
    349   DataType *d_t_result =
    350       static_cast<DataType *>(sycl_device.allocate(t_result_bytes));
    351 
    352   Eigen::TensorMap<TensorType> gpu_t_left(d_t_left, left_dims);
    353   Eigen::TensorMap<TensorType> gpu_t_right(d_t_right, right_dims);
    354   Eigen::TensorMap<TensorType> gpu_t_result(d_t_result, res_dims);
    355 
    356   sycl_device.memcpyHostToDevice(d_t_left, t_left.data(), t_left_bytes);
    357   sycl_device.memcpyHostToDevice(d_t_right, t_right.data(), t_right_bytes);
    358   for (int i = start; i < limit; ++i) {
    359     auto x = gpu_t_left.template chip<0>(i);
    360     auto y = gpu_t_right.template chip<0>(i);
    361     auto z = gpu_t_result.template chip<0>(i);
    362     z.device(sycl_device) = x.contract(y, contract_pairs);
    363   }
    364   sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result,
    365                                  t_result_bytes);
    366 
    367   for (int i = start; i < limit; ++i) {
    368     auto x = t_left.template chip<0>(i);
    369     auto y = t_right.template chip<0>(i);
    370     auto z = t_result.template chip<0>(i);
    371     z = x.contract(y, contract_pairs);
    372   }
    373 
    374   for (IndexType i = 0; i < t_result.size(); i++) {
    375     if (static_cast<DataType>(std::fabs(static_cast<DataType>(
    376             t_result(i) - t_result_gpu(i)))) < error_threshold) {
    377       continue;
    378     }
    379     if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i),
    380                                   error_threshold)) {
    381       continue;
    382     }
    383     std::cout << "mismatch detected at IndexType " << i << ": " << t_result(i)
    384               << " vs " << t_result_gpu(i) << std::endl;
    385     VERIFY_IS_APPROX(t_result_gpu(i), t_result(i));
    386   }
    387   sycl_device.deallocate(d_t_left);
    388   sycl_device.deallocate(d_t_right);
    389   sycl_device.deallocate(d_t_result);
    390 }
    391 
    392 template <int DataLayout, typename DataType, typename IndexType,
    393           typename Device>
    394 void contraction_rhs_transposed(const Device &sycl_device, IndexType m_size,
    395                                 IndexType k_size, IndexType n_size) {
    396   typedef typename Tensor<DataType, 1, DataLayout, IndexType>::DimensionPair
    397       DimPair;
    398   static const DataType error_threshold = DataType(1e-4);
    399   Eigen::array<IndexType, 2> left_dims = {{m_size, k_size}};
    400   Eigen::array<IndexType, 2> right_dims = {{n_size, k_size}};
    401   Eigen::array<IndexType, 2> res_dims = {{m_size, n_size}};
    402   Eigen::array<DimPair, 1> dims = {{DimPair(1, 1)}};
    403 
    404   Tensor<DataType, 2, DataLayout, IndexType> t_left(left_dims);
    405   Tensor<DataType, 2, DataLayout, IndexType> t_right(right_dims);
    406   Tensor<DataType, 2, DataLayout, IndexType> t_result_gpu(res_dims);
    407   Tensor<DataType, 2, DataLayout, IndexType> t_result(res_dims);
    408 
    409   t_left.setRandom();
    410   t_right.setRandom();
    411 
    412   std::size_t t_left_bytes = t_left.size() * sizeof(DataType);
    413   std::size_t t_right_bytes = t_right.size() * sizeof(DataType);
    414   std::size_t t_result_bytes = t_result.size() * sizeof(DataType);
    415 
    416   DataType *d_t_left =
    417       static_cast<DataType *>(sycl_device.allocate(t_left_bytes));
    418   DataType *d_t_right =
    419       static_cast<DataType *>(sycl_device.allocate(t_right_bytes));
    420   DataType *d_t_result =
    421       static_cast<DataType *>(sycl_device.allocate(t_result_bytes));
    422 
    423   Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
    424       gpu_t_left(d_t_left, left_dims);
    425   Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
    426       gpu_t_right(d_t_right, right_dims);
    427   Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
    428       gpu_t_result(d_t_result, res_dims);
    429 
    430   sycl_device.memcpyHostToDevice(d_t_left, t_left.data(), t_left_bytes);
    431   sycl_device.memcpyHostToDevice(d_t_right, t_right.data(), t_right_bytes);
    432 
    433   gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims);
    434   sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result,
    435                                  t_result_bytes);
    436 
    437   t_result = t_left.contract(t_right, dims);
    438 
    439   for (IndexType j = 0; j < m_size; j++) {
    440     for (IndexType i = 0; i < n_size; i++) {
    441       if (static_cast<DataType>(std::fabs(static_cast<DataType>(
    442               t_result(j, i) - t_result_gpu(j, i)))) < error_threshold) {
    443         continue;
    444       }
    445       if (Eigen::internal::isApprox(t_result(j, i), t_result_gpu(j, i),
    446                                     error_threshold)) {
    447         continue;
    448       }
    449       std::cout << "M : " << m_size << ", N : " << n_size << ", K : " << k_size
    450                 << ", mismatch detected at IndexType m: " << j << " n: " << i
    451                 << " CPU : " << t_result(j, i)
    452                 << " vs SYCL:" << t_result_gpu(j, i) << std::endl;
    453       VERIFY_IS_APPROX(t_result_gpu(j, i), t_result(j, i));
    454     }
    455   }
    456   sycl_device.deallocate(d_t_left);
    457   sycl_device.deallocate(d_t_right);
    458   sycl_device.deallocate(d_t_result);
    459 }
    460 
    461 template <int DataLayout, typename DataType, typename IndexType,
    462           typename Device>
    463 void contraction_lhs_transposed(const Device &sycl_device, IndexType m_size,
    464                                 IndexType k_size, IndexType n_size) {
    465   typedef typename Tensor<DataType, 1, DataLayout, IndexType>::DimensionPair
    466       DimPair;
    467   static const DataType error_threshold = DataType(1e-4);
    468   Eigen::array<IndexType, 2> left_dims = {{k_size, m_size}};
    469   Eigen::array<IndexType, 2> right_dims = {{k_size, n_size}};
    470   Eigen::array<IndexType, 2> res_dims = {{m_size, n_size}};
    471   Eigen::array<DimPair, 1> dims = {{DimPair(0, 0)}};
    472 
    473   Tensor<DataType, 2, DataLayout, IndexType> t_left(left_dims);
    474   Tensor<DataType, 2, DataLayout, IndexType> t_right(right_dims);
    475   Tensor<DataType, 2, DataLayout, IndexType> t_result_gpu(res_dims);
    476   Tensor<DataType, 2, DataLayout, IndexType> t_result(res_dims);
    477 
    478   t_left.setRandom();
    479   t_right.setRandom();
    480 
    481   std::size_t t_left_bytes = t_left.size() * sizeof(DataType);
    482   std::size_t t_right_bytes = t_right.size() * sizeof(DataType);
    483   std::size_t t_result_bytes = t_result.size() * sizeof(DataType);
    484 
    485   DataType *d_t_left =
    486       static_cast<DataType *>(sycl_device.allocate(t_left_bytes));
    487   DataType *d_t_right =
    488       static_cast<DataType *>(sycl_device.allocate(t_right_bytes));
    489   DataType *d_t_result =
    490       static_cast<DataType *>(sycl_device.allocate(t_result_bytes));
    491 
    492   Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
    493       gpu_t_left(d_t_left, left_dims);
    494   Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
    495       gpu_t_right(d_t_right, right_dims);
    496   Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
    497       gpu_t_result(d_t_result, res_dims);
    498 
    499   sycl_device.memcpyHostToDevice(d_t_left, t_left.data(), t_left_bytes);
    500   sycl_device.memcpyHostToDevice(d_t_right, t_right.data(), t_right_bytes);
    501 
    502   gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims);
    503   sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result,
    504                                  t_result_bytes);
    505 
    506   t_result = t_left.contract(t_right, dims);
    507 
    508   for (IndexType i = 0; i < t_result.size(); i++) {
    509     if (static_cast<DataType>(std::fabs(static_cast<DataType>(
    510             t_result(i) - t_result_gpu(i)))) < error_threshold) {
    511       continue;
    512     }
    513     if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i),
    514                                   error_threshold)) {
    515       continue;
    516     }
    517     std::cout << "M : " << m_size << ", N : " << n_size << ", K : " << k_size
    518               << ", mismatch detected at IndexType " << i << ": " << t_result(i)
    519               << " vs " << t_result_gpu(i) << std::endl;
    520     VERIFY_IS_APPROX(t_result_gpu(i), t_result(i));
    521   }
    522   sycl_device.deallocate(d_t_left);
    523   sycl_device.deallocate(d_t_right);
    524   sycl_device.deallocate(d_t_result);
    525 }
    526 
    527 template <int DataLayout, typename DataType, typename IndexType,
    528           typename Device>
    529 void contraction_both_transposed(const Device &sycl_device, IndexType m_size,
    530                                  IndexType k_size, IndexType n_size) {
    531   typedef typename Tensor<DataType, 1, DataLayout, IndexType>::DimensionPair
    532       DimPair;
    533   static const DataType error_threshold = DataType(1e-4);
    534   Eigen::array<IndexType, 2> left_dims = {{k_size, m_size}};
    535   Eigen::array<IndexType, 2> right_dims = {{n_size, k_size}};
    536   Eigen::array<IndexType, 2> res_dims = {{m_size, n_size}};
    537   Eigen::array<DimPair, 1> dims = {{DimPair(0, 1)}};
    538 
    539   Tensor<DataType, 2, DataLayout, IndexType> t_left(left_dims);
    540   Tensor<DataType, 2, DataLayout, IndexType> t_right(right_dims);
    541   Tensor<DataType, 2, DataLayout, IndexType> t_result_gpu(res_dims);
    542   Tensor<DataType, 2, DataLayout, IndexType> t_result(res_dims);
    543 
    544   t_left.setRandom();
    545   t_right.setRandom();
    546 
    547   std::size_t t_left_bytes = t_left.size() * sizeof(DataType);
    548   std::size_t t_right_bytes = t_right.size() * sizeof(DataType);
    549   std::size_t t_result_bytes = t_result.size() * sizeof(DataType);
    550 
    551   DataType *d_t_left =
    552       static_cast<DataType *>(sycl_device.allocate(t_left_bytes));
    553   DataType *d_t_right =
    554       static_cast<DataType *>(sycl_device.allocate(t_right_bytes));
    555   DataType *d_t_result =
    556       static_cast<DataType *>(sycl_device.allocate(t_result_bytes));
    557 
    558   Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
    559       gpu_t_left(d_t_left, left_dims);
    560   Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
    561       gpu_t_right(d_t_right, right_dims);
    562   Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
    563       gpu_t_result(d_t_result, res_dims);
    564 
    565   sycl_device.memcpyHostToDevice(d_t_left, t_left.data(), t_left_bytes);
    566   sycl_device.memcpyHostToDevice(d_t_right, t_right.data(), t_right_bytes);
    567 
    568   gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims);
    569   sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result,
    570                                  t_result_bytes);
    571 
    572   t_result = t_left.contract(t_right, dims);
    573 
    574   for (IndexType i = 0; i < t_result.size(); i++) {
    575     if (static_cast<DataType>(std::fabs(static_cast<DataType>(
    576             t_result(i) - t_result_gpu(i)))) < error_threshold) {
    577       continue;
    578     }
    579     if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i),
    580                                   error_threshold)) {
    581       continue;
    582     }
    583     std::cout << "M : " << m_size << ", N : " << n_size << ", K : " << k_size
    584               << ", mismatch detected at IndexType " << i << ": " << t_result(i)
    585               << " vs " << t_result_gpu(i) << std::endl;
    586 
    587     VERIFY_IS_APPROX(t_result_gpu(i), t_result(i));
    588   }
    589   sycl_device.deallocate(d_t_left);
    590   sycl_device.deallocate(d_t_right);
    591   sycl_device.deallocate(d_t_result);
    592 }
    593 
    594 template <typename Dev>
    595 void inline tensorOutofBound(const Dev &sycl_device) {
    596   typedef float DataType;
    597   typedef int64_t IndexType;
    598   std::chrono::time_point<std::chrono::system_clock> start, end;
    599   start = std::chrono::system_clock::now();
    600   // Test out of bound for Tensor-Tensor
    601   test_no_out_of_bounds<RowMajor, DataType, IndexType>(sycl_device, 10, 1024,
    602                                                        1024);
    603   test_no_out_of_bounds<RowMajor, DataType, IndexType>(sycl_device, 1024, 1024,
    604                                                        4096);
    605   test_no_out_of_bounds<RowMajor, DataType, IndexType>(sycl_device, 4096, 1024,
    606                                                        2048);
    607   test_no_out_of_bounds<ColMajor, DataType, IndexType>(sycl_device, 784, 2048,
    608                                                        1024);
    609   test_no_out_of_bounds<ColMajor, DataType, IndexType>(sycl_device, 2048, 1024,
    610                                                        784);
    611   test_no_out_of_bounds<RowMajor, DataType, IndexType>(sycl_device, 10, 1024,
    612                                                        10);
    613   test_no_out_of_bounds<RowMajor, DataType, IndexType>(sycl_device, 513, 4096,
    614                                                        513);
    615   test_no_out_of_bounds<RowMajor, DataType, IndexType>(sycl_device, 783, 1024,
    616                                                        783);
    617   test_no_out_of_bounds<ColMajor, DataType, IndexType>(sycl_device, 784, 2048,
    618                                                        784);
    619   test_no_out_of_bounds<ColMajor, DataType, IndexType>(sycl_device, 11, 1024,
    620                                                        11);
    621   end = std::chrono::system_clock::now();
    622   std::chrono::duration<double> elapsed_seconds = end - start;
    623   std::time_t end_time = std::chrono::system_clock::to_time_t(end);
    624   std::cout << "tensor out of bound tests finished computation at "
    625             << std::ctime(&end_time)
    626             << "elapsed time: " << elapsed_seconds.count() << "s\n";
    627 }
    628 
    629 template <typename Dev>
    630 void inline tensorTensor(const Dev &sycl_device) {
    631   typedef float DataType;
    632   typedef int64_t IndexType;
    633   std::chrono::time_point<std::chrono::system_clock> start, end;
    634   start = std::chrono::system_clock::now();
    635   // Tensor Tensor Contraction
    636   test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 128, 128,
    637                                                        128);
    638   test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 128, 128,
    639                                                        128);
    640   end = std::chrono::system_clock::now();
    641   std::chrono::duration<double> elapsed_seconds = end - start;
    642   std::time_t end_time = std::chrono::system_clock::to_time_t(end);
    643   std::cout << "tensor tensor tests finished computation at "
    644             << std::ctime(&end_time)
    645             << "elapsed time: " << elapsed_seconds.count() << "s\n";
    646 }
    647 
    648 template <typename Dev>
    649 void inline tensorTensor_m(const Dev &sycl_device) {
    650   typedef float DataType;
    651   typedef int64_t IndexType;
    652   std::chrono::time_point<std::chrono::system_clock> start, end;
    653   start = std::chrono::system_clock::now();
    654   // Tensor Tensor Contraction
    655   test_sycl_contraction_m<ColMajor, DataType, IndexType>(sycl_device);
    656   test_sycl_contraction_m<RowMajor, DataType, IndexType>(sycl_device);
    657 
    658   end = std::chrono::system_clock::now();
    659   std::chrono::duration<double> elapsed_seconds = end - start;
    660   std::time_t end_time = std::chrono::system_clock::to_time_t(end);
    661   std::cout << "tensor tensor tests finished computation at "
    662             << std::ctime(&end_time)
    663             << "elapsed time: " << elapsed_seconds.count() << "s\n";
    664 }
    665 
    666 template <typename Dev>
    667 void inline tensorTensor_n(const Dev &sycl_device) {
    668   typedef float DataType;
    669   typedef int64_t IndexType;
    670   std::chrono::time_point<std::chrono::system_clock> start, end;
    671   start = std::chrono::system_clock::now();
    672   // Tensor Tensor Contraction
    673   test_sycl_contraction_n<ColMajor, DataType, IndexType>(sycl_device);
    674   test_sycl_contraction_n<RowMajor, DataType, IndexType>(sycl_device);
    675 
    676   end = std::chrono::system_clock::now();
    677   std::chrono::duration<double> elapsed_seconds = end - start;
    678   std::time_t end_time = std::chrono::system_clock::to_time_t(end);
    679   std::cout << "tensor tensor tests finished computation at "
    680             << std::ctime(&end_time)
    681             << "elapsed time: " << elapsed_seconds.count() << "s\n";
    682 }
    683 
    684 template <typename Dev>
    685 void inline tensorTensor_k(const Dev &sycl_device) {
    686   typedef float DataType;
    687   typedef int64_t IndexType;
    688   std::chrono::time_point<std::chrono::system_clock> start, end;
    689   start = std::chrono::system_clock::now();
    690   test_sycl_contraction_k<ColMajor, DataType, IndexType>(sycl_device);
    691   test_sycl_contraction_k<RowMajor, DataType, IndexType>(sycl_device);
    692 
    693   end = std::chrono::system_clock::now();
    694   std::chrono::duration<double> elapsed_seconds = end - start;
    695   std::time_t end_time = std::chrono::system_clock::to_time_t(end);
    696   std::cout << "tensor tensor tests finished computation at "
    697             << std::ctime(&end_time)
    698             << "elapsed time: " << elapsed_seconds.count() << "s\n";
    699 }
    700 
    701 template <typename Dev>
    702 void inline tensorTensor_sizes(const Dev &sycl_device) {
    703   typedef float DataType;
    704   typedef int64_t IndexType;
    705   std::chrono::time_point<std::chrono::system_clock> start, end;
    706   start = std::chrono::system_clock::now();
    707   // Tensor Tensor Contraction
    708   test_sycl_contraction_sizes<ColMajor, DataType, IndexType>(sycl_device);
    709   test_sycl_contraction_sizes<RowMajor, DataType, IndexType>(sycl_device);
    710 
    711   end = std::chrono::system_clock::now();
    712   std::chrono::duration<double> elapsed_seconds = end - start;
    713   std::time_t end_time = std::chrono::system_clock::to_time_t(end);
    714   std::cout << "tensor tensor tests finished computation at "
    715             << std::ctime(&end_time)
    716             << "elapsed time: " << elapsed_seconds.count() << "s\n";
    717 }
    718 template <typename Dev>
    719 void inline vectorVector(const Dev &sycl_device) {
    720   typedef float DataType;
    721   typedef int64_t IndexType;
    722   std::chrono::time_point<std::chrono::system_clock> start, end;
    723   start = std::chrono::system_clock::now();
    724   // VECTOR-VECTOR
    725   test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1025, 1,
    726                                                        1025);
    727   test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1025, 1,
    728                                                        1025);
    729   test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1024, 1,
    730                                                        1024);
    731   test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1024, 1,
    732                                                        1024);
    733   test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1023, 1,
    734                                                        1023);
    735   test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1023, 1,
    736                                                        1023);
    737 
    738   end = std::chrono::system_clock::now();
    739   std::chrono::duration<double> elapsed_seconds = end - start;
    740   std::time_t end_time = std::chrono::system_clock::to_time_t(end);
    741   std::cout << "contracted tensor tests finished computation at "
    742             << std::ctime(&end_time)
    743             << "elapsed time: " << elapsed_seconds.count() << "s\n";
    744 }
    745 
    746 template <typename Dev>
    747 void inline vectorTensor(const Dev &sycl_device) {
    748   typedef float DataType;
    749   typedef int64_t IndexType;
    750   std::chrono::time_point<std::chrono::system_clock> start, end;
    751   start = std::chrono::system_clock::now();
    752   // Vector-Tensor
    753   test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1, 1025,
    754                                                        1025);
    755   test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1, 1025,
    756                                                        1025);
    757   test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1, 1024,
    758                                                        1024);
    759   test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1, 1024,
    760                                                        1024);
    761   test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1, 1023,
    762                                                        1023);
    763   test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1, 1023,
    764                                                        1023);
    765 
    766   test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1, 4097,
    767                                                        4097);
    768   test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1, 4097,
    769                                                        4097);
    770   test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1, 4096,
    771                                                        4096);
    772   test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1, 4096,
    773                                                        4096);
    774   test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1, 4095,
    775                                                        4095);
    776   test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1, 4095,
    777                                                        4095);
    778   test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1, 802816,
    779                                                        32);
    780 
    781   end = std::chrono::system_clock::now();
    782   std::chrono::duration<double> elapsed_seconds = end - start;
    783   std::time_t end_time = std::chrono::system_clock::to_time_t(end);
    784   std::cout << "finished computation at " << std::ctime(&end_time)
    785             << "elapsed time: " << elapsed_seconds.count() << "s\n";
    786 }
    787 
    788 template <typename Dev>
    789 void inline tensorVector(const Dev &sycl_device) {
    790   typedef float DataType;
    791   typedef int64_t IndexType;
    792   std::chrono::time_point<std::chrono::system_clock> start, end;
    793   start = std::chrono::system_clock::now();
    794   // Matrix-Vector
    795   test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1025, 1025,
    796                                                        1);
    797   test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1125, 1025,
    798                                                        1);
    799   test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1224, 1024,
    800                                                        1);
    801   test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1024, 1024,
    802                                                        1);
    803   test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1023, 1023,
    804                                                        1);
    805   test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1023, 1023,
    806                                                        1);
    807   test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 4097, 4197,
    808                                                        1);
    809   test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 4097, 4097,
    810                                                        1);
    811   test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 4096, 4096,
    812                                                        1);
    813   test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 4096, 8196,
    814                                                        1);
    815   test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 4095, 4095,
    816                                                        1);
    817   test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 4095, 4095,
    818                                                        1);
    819 // If the GEMV disabled it will creates one kernel to calculate the contraction.
    820 // Therefore the acumuation of float number will overflow the precision
    821 // threshold for float and cause the test to fail. While it the GMV multiple
    822 // kernel will be created and each one run the overflow of accumutation breaks
    823 // among the kernels.
    824 #ifndef EIGEN_SYCL_DISABLE_GEMV
    825   test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 32, 802032,
    826                                                        1);
    827 #endif
    828 
    829   end = std::chrono::system_clock::now();
    830   std::chrono::duration<double> elapsed_seconds = end - start;
    831   std::time_t end_time = std::chrono::system_clock::to_time_t(end);
    832   std::cout << "finished computation at " << std::ctime(&end_time)
    833             << "elapsed time: " << elapsed_seconds.count() << "s\n";
    834 }
    835 
    836 template <typename Dev>
    837 void inline tensorScalar(const Dev &sycl_device) {
    838   typedef float DataType;
    839   typedef int64_t IndexType;
    840   std::chrono::time_point<std::chrono::system_clock> start, end;
    841   start = std::chrono::system_clock::now();
    842   // SCALAR Contraction
    843   test_scalar<ColMajor, DataType, IndexType>(sycl_device, 127, 127, 127);
    844   test_scalar<RowMajor, DataType, IndexType>(sycl_device, 127, 127, 127);
    845   test_scalar<ColMajor, DataType, IndexType>(sycl_device, 128, 128, 128);
    846   test_scalar<RowMajor, DataType, IndexType>(sycl_device, 128, 128, 128);
    847   test_scalar<ColMajor, DataType, IndexType>(sycl_device, 129, 129, 129);
    848   test_scalar<RowMajor, DataType, IndexType>(sycl_device, 129, 129, 129);
    849 
    850   end = std::chrono::system_clock::now();
    851   std::chrono::duration<double> elapsed_seconds = end - start;
    852   std::time_t end_time = std::chrono::system_clock::to_time_t(end);
    853   std::cout << "finished computation at " << std::ctime(&end_time)
    854             << "elapsed time: " << elapsed_seconds.count() << "s\n";
    855 }
    856 
    857 template <typename Dev>
    858 void inline skinnyTensor_row(const Dev &sycl_device) {
    859   typedef float DataType;
    860   typedef int64_t IndexType;
    861   std::chrono::time_point<std::chrono::system_clock> start, end;
    862   start = std::chrono::system_clock::now();
    863   // Tensor Tensor Contraction
    864   test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 16, 4, 16);
    865   test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 257, 131073,
    866                                                        257);
    867   test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 256, 131072,
    868                                                        256);
    869   test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 16, 131073,
    870                                                        16);
    871   test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 17, 131072,
    872                                                        17);
    873   end = std::chrono::system_clock::now();
    874   std::chrono::duration<double> elapsed_seconds = end - start;
    875   std::time_t end_time = std::chrono::system_clock::to_time_t(end);
    876   std::cout << "finished computation at " << std::ctime(&end_time)
    877             << "elapsed time: " << elapsed_seconds.count() << "s\n";
    878 }
    879 
    880 template <typename Dev>
    881 void inline skinnyTensor_col(const Dev &sycl_device) {
    882   typedef float DataType;
    883   typedef int64_t IndexType;
    884   std::chrono::time_point<std::chrono::system_clock> start, end;
    885   start = std::chrono::system_clock::now();
    886   // Tensor Tensor Contraction
    887   test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 16, 4, 16);
    888   test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 257, 131073,
    889                                                        257);
    890   test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 256, 131072,
    891                                                        256);
    892   test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 16, 131073,
    893                                                        16);
    894   test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 17, 131072,
    895                                                        17);
    896   end = std::chrono::system_clock::now();
    897   std::chrono::duration<double> elapsed_seconds = end - start;
    898   std::time_t end_time = std::chrono::system_clock::to_time_t(end);
    899   std::cout << "finished computation at " << std::ctime(&end_time)
    900             << "elapsed time: " << elapsed_seconds.count() << "s\n";
    901 }
    902 
    903 template <typename Dev>
    904 void inline tensor_contraction_batch_per_device(const Dev &sycl_device) {
    905   typedef float DataType;
    906   typedef int64_t IndexType;
    907   std::chrono::time_point<std::chrono::system_clock> start, end;
    908   start = std::chrono::system_clock::now();
    909 
    910   contraction_batch<RowMajor, DataType, IndexType>(sycl_device, 64, 75, 30, 4,
    911                                                    0, 4);
    912   contraction_batch<ColMajor, DataType, IndexType>(sycl_device, 64, 75, 30, 4,
    913                                                    0, 4);
    914   end = std::chrono::system_clock::now();
    915   std::chrono::duration<double> elapsed_seconds = end - start;
    916   std::time_t end_time = std::chrono::system_clock::to_time_t(end);
    917   std::cout << "finished computation at " << std::ctime(&end_time)
    918             << "elapsed time: " << elapsed_seconds.count() << "s\n";
    919 }
    920 
    921 template <typename Dev>
    922 void inline tensor_contraction_lhs_transposed_per_device(
    923     const Dev &sycl_device) {
    924   typedef float DataType;
    925   typedef int64_t IndexType;
    926   std::chrono::time_point<std::chrono::system_clock> start, end;
    927   start = std::chrono::system_clock::now();
    928 
    929   contraction_lhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 8, 4,
    930                                                             8);
    931   contraction_lhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 32, 8,
    932                                                             32);
    933   contraction_lhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 64, 16,
    934                                                             64);
    935   contraction_lhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 784,
    936                                                             2048, 1024);
    937   contraction_lhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 1024,
    938                                                             10, 1024);
    939   contraction_lhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 4096,
    940                                                             1024, 1024);
    941   contraction_lhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 2048,
    942                                                             4096, 1024);
    943   end = std::chrono::system_clock::now();
    944   std::chrono::duration<double> elapsed_seconds = end - start;
    945   std::time_t end_time = std::chrono::system_clock::to_time_t(end);
    946   std::cout << "finished computation at " << std::ctime(&end_time)
    947             << "elapsed time: " << elapsed_seconds.count() << "s\n";
    948 }
    949 
    950 template <typename Dev>
    951 void inline tensor_contraction_rhs_transposed_per_device(
    952     const Dev &sycl_device) {
    953   typedef float DataType;
    954   typedef int64_t IndexType;
    955   std::chrono::time_point<std::chrono::system_clock> start, end;
    956   start = std::chrono::system_clock::now();
    957 
    958   contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 16, 4,
    959                                                             16);
    960   contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 17, 5,
    961                                                             17);
    962   contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 32, 8,
    963                                                             32);
    964   contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 64, 16,
    965                                                             64);
    966   contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 10,
    967                                                             1024, 1024);
    968   contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 1024,
    969                                                             1024, 4096);
    970   contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 4096,
    971                                                             1024, 2048);
    972   contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 2048,
    973                                                             1024, 784);
    974   end = std::chrono::system_clock::now();
    975   std::chrono::duration<double> elapsed_seconds = end - start;
    976   std::time_t end_time = std::chrono::system_clock::to_time_t(end);
    977   std::cout << "finished computation at " << std::ctime(&end_time)
    978             << "elapsed time: " << elapsed_seconds.count() << "s\n";
    979 }
    980 
    981 template <typename Dev>
    982 void inline tensor_contraction_both_transposed_per_device(
    983     const Dev &sycl_device) {
    984   typedef float DataType;
    985   typedef int64_t IndexType;
    986   std::chrono::time_point<std::chrono::system_clock> start, end;
    987   start = std::chrono::system_clock::now();
    988 
    989   contraction_both_transposed<RowMajor, DataType, IndexType>(sycl_device, 17, 5,
    990                                                              17);
    991   contraction_both_transposed<RowMajor, DataType, IndexType>(sycl_device, 32, 8,
    992                                                              32);
    993   contraction_both_transposed<RowMajor, DataType, IndexType>(sycl_device, 64,
    994                                                              16, 64);
    995   end = std::chrono::system_clock::now();
    996   std::chrono::duration<double> elapsed_seconds = end - start;
    997   std::time_t end_time = std::chrono::system_clock::to_time_t(end);
    998   std::cout << "finished computation at " << std::ctime(&end_time)
    999             << "elapsed time: " << elapsed_seconds.count() << "s\n";
   1000 }
   1001 
   1002 EIGEN_DECLARE_TEST(cxx11_tensor_contract_sycl) {
   1003   for (const auto &device : Eigen::get_sycl_supported_devices()) {
   1004     std::cout << "Running on "
   1005               << device.template get_info<cl::sycl::info::device::name>()
   1006               << std::endl;
   1007     QueueInterface queueInterface(device);
   1008     auto sycl_device = Eigen::SyclDevice(&queueInterface);
   1009     CALL_SUBTEST_1(tensorOutofBound(sycl_device));
   1010     CALL_SUBTEST_2(tensorTensor(sycl_device));
   1011     CALL_SUBTEST_2(tensorTensor_m(sycl_device));
   1012     CALL_SUBTEST_2(tensorTensor_n(sycl_device));
   1013     CALL_SUBTEST_2(tensorTensor_k(sycl_device));
   1014     CALL_SUBTEST_2(tensorTensor_sizes(sycl_device));
   1015     CALL_SUBTEST_3(vectorVector(sycl_device));
   1016     CALL_SUBTEST_4(vectorTensor(sycl_device));
   1017     CALL_SUBTEST_5(tensorVector(sycl_device));
   1018     CALL_SUBTEST_6(tensorScalar(sycl_device));
   1019     CALL_SUBTEST_7(skinnyTensor_row(sycl_device));
   1020     CALL_SUBTEST_7(skinnyTensor_col(sycl_device));
   1021     CALL_SUBTEST_8(tensor_contraction_batch_per_device(sycl_device));
   1022     CALL_SUBTEST_9(tensor_contraction_lhs_transposed_per_device(sycl_device));
   1023     CALL_SUBTEST_10(tensor_contraction_rhs_transposed_per_device(sycl_device));
   1024     CALL_SUBTEST_11(tensor_contraction_both_transposed_per_device(sycl_device));
   1025   }
   1026 }