cart-elc

Source code for CART-ELC
git clone git://git.laack.co/cart-elc.git
Log | Files | Refs | README | LICENSE

tensor_benchmarks.h (20459B)


      1 #ifndef THIRD_PARTY_EIGEN3_TENSOR_BENCHMARKS_H_
      2 #define THIRD_PARTY_EIGEN3_TENSOR_BENCHMARKS_H_
      3 
      4 typedef int TensorIndex;
      5 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
      6 
      7 #include "unsupported/Eigen/CXX11/Tensor"
      8 #include "benchmark.h"
      9 
     10 #define BENCHMARK_RANGE(bench, lo, hi) \
     11   BENCHMARK(bench)->Range(lo, hi)
     12 
     13 using Eigen::Tensor;
     14 using Eigen::TensorMap;
     15 
     16 // TODO(bsteiner): also templatize on the input type since we have users
     17 // for int8 as well as floats.
     18 template <typename Device, typename T> class BenchmarkSuite {
     19  public:
     20   BenchmarkSuite(const Device& device, size_t m, size_t k, size_t n)
     21       : m_(m), k_(k), n_(n), device_(device) {
     22     initialize();
     23   }
     24 
     25   BenchmarkSuite(const Device& device, size_t m)
     26       : m_(m), k_(m), n_(m), device_(device) {
     27     initialize();
     28   }
     29 
     30   BenchmarkSuite(const Device& device, size_t m, size_t k)
     31       : m_(1), k_(k), n_(m), device_(device) {
     32     initialize();
     33   }
     34 
     35   ~BenchmarkSuite() {
     36     device_.deallocate(a_);
     37     device_.deallocate(b_);
     38     device_.deallocate(c_);
     39   }
     40 
     41   void memcpy(int num_iters) {
     42     eigen_assert(m_ == k_ && k_ == n_);
     43 #ifdef EIGEN_USE_SYCL // warmup for sycl
     44     for (int iter = 0; iter < 10; ++iter) {
     45       device_.memcpy(c_, a_, m_ * m_ * sizeof(T));
     46     }
     47 #endif
     48     StartBenchmarkTiming();
     49     for (int iter = 0; iter < num_iters; ++iter) {
     50       device_.memcpy(c_, a_, m_ * m_ * sizeof(T));
     51     }
     52     // Record the number of values copied per second
     53     finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters);
     54   }
     55 
     56   void typeCasting(int num_iters) {
     57     eigen_assert(m_ == n_);
     58     Eigen::array<TensorIndex, 2> sizes;
     59     if (sizeof(T) >= sizeof(int)) {
     60       sizes[0] = m_;
     61       sizes[1] = k_;
     62     } else {
     63       sizes[0] = m_ * sizeof(T) / sizeof(int);
     64       sizes[1] = k_ * sizeof(T) / sizeof(int);
     65     }
     66     const TensorMap<Tensor<int, 2, 0, TensorIndex>, Eigen::Aligned> A((int*)a_, sizes);
     67     TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, sizes);
     68 #ifdef EIGEN_USE_SYCL // warmup for sycl
     69     for (int iter = 0; iter < 10; ++iter) {
     70       B.device(device_) = A.template cast<T>();
     71     }
     72 #endif
     73     StartBenchmarkTiming();
     74     for (int iter = 0; iter < num_iters; ++iter) {
     75       B.device(device_) = A.template cast<T>();
     76     }
     77     // Record the number of values copied per second
     78     finalizeBenchmark(static_cast<int64_t>(m_) * k_ * num_iters);
     79   }
     80 
     81   void random(int num_iters) {
     82     eigen_assert(m_ == k_ && k_ == n_);
     83     Eigen::array<TensorIndex, 2> sizes;
     84     sizes[0] = m_;
     85     sizes[1] = m_;
     86     TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes);
     87 #ifdef EIGEN_USE_SYCL // warmup for sycl
     88     for (int iter = 0; iter < 10; ++iter) {
     89       C.device(device_) = C.random();
     90     }
     91 #endif
     92     StartBenchmarkTiming();
     93     for (int iter = 0; iter < num_iters; ++iter) {
     94       C.device(device_) = C.random();
     95     }
     96     // Record the number of random numbers generated per second
     97     finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters);
     98   }
     99 
    100   void slicing(int num_iters) {
    101     eigen_assert(m_ == k_ && k_ == n_);
    102     Eigen::array<TensorIndex, 2> sizes;
    103     sizes[0] = m_;
    104     sizes[1] = m_;
    105     const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizes);
    106     const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizes);
    107     TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes);
    108 
    109     const Eigen::DSizes<TensorIndex, 2> quarter_sizes(m_/2, m_/2);
    110     const Eigen::DSizes<TensorIndex, 2> first_quadrant(0, 0);
    111     const Eigen::DSizes<TensorIndex, 2> second_quadrant(0, m_/2);
    112     const Eigen::DSizes<TensorIndex, 2> third_quadrant(m_/2, 0);
    113     const Eigen::DSizes<TensorIndex, 2> fourth_quadrant(m_/2, m_/2);
    114 #ifdef EIGEN_USE_SYCL // warmup for sycl
    115     for (int iter = 0; iter < 10; ++iter) {
    116       C.slice(first_quadrant, quarter_sizes).device(device_) =
    117           A.slice(first_quadrant, quarter_sizes);
    118       C.slice(second_quadrant, quarter_sizes).device(device_) =
    119           B.slice(second_quadrant, quarter_sizes);
    120       C.slice(third_quadrant, quarter_sizes).device(device_) =
    121           A.slice(third_quadrant, quarter_sizes);
    122       C.slice(fourth_quadrant, quarter_sizes).device(device_) =
    123           B.slice(fourth_quadrant, quarter_sizes);
    124     }
    125 #endif
    126     StartBenchmarkTiming();
    127     for (int iter = 0; iter < num_iters; ++iter) {
    128       C.slice(first_quadrant, quarter_sizes).device(device_) =
    129           A.slice(first_quadrant, quarter_sizes);
    130       C.slice(second_quadrant, quarter_sizes).device(device_) =
    131           B.slice(second_quadrant, quarter_sizes);
    132       C.slice(third_quadrant, quarter_sizes).device(device_) =
    133           A.slice(third_quadrant, quarter_sizes);
    134       C.slice(fourth_quadrant, quarter_sizes).device(device_) =
    135           B.slice(fourth_quadrant, quarter_sizes);
    136     }
    137     // Record the number of values copied from the rhs slice to the lhs slice
    138     // each second
    139     finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters);
    140   }
    141 
    142   void rowChip(int num_iters) {
    143     Eigen::array<TensorIndex, 2> input_size;
    144     input_size[0] = k_;
    145     input_size[1] = n_;
    146     const TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, input_size);
    147     Eigen::array<TensorIndex, 1> output_size;
    148     output_size[0] = n_;
    149     TensorMap<Tensor<T, 1, 0, TensorIndex>, Eigen::Aligned> C(c_, output_size);
    150 #ifdef EIGEN_USE_SYCL // warmup for sycl
    151     for (int iter = 0; iter < 10; ++iter) {
    152       C.device(device_) = B.chip(iter % k_, 0);
    153     }
    154 #endif
    155     StartBenchmarkTiming();
    156     for (int iter = 0; iter < num_iters; ++iter) {
    157       C.device(device_) = B.chip(iter % k_, 0);
    158     }
    159     // Record the number of values copied from the rhs chip to the lhs.
    160     finalizeBenchmark(static_cast<int64_t>(n_) * num_iters);
    161   }
    162 
    163   void colChip(int num_iters) {
    164     Eigen::array<TensorIndex, 2> input_size;
    165     input_size[0] = k_;
    166     input_size[1] = n_;
    167     const TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, input_size);
    168     Eigen::array<TensorIndex, 1> output_size;
    169     output_size[0] = n_;
    170     TensorMap<Tensor<T, 1, 0, TensorIndex>, Eigen::Aligned> C(c_, output_size);
    171 #ifdef EIGEN_USE_SYCL // warmup for sycl
    172     for (int iter = 0; iter < 10; ++iter) {
    173       C.device(device_) = B.chip(iter % n_, 1);
    174     }
    175 #endif
    176     StartBenchmarkTiming();
    177     for (int iter = 0; iter < num_iters; ++iter) {
    178       C.device(device_) = B.chip(iter % n_, 1);
    179     }
    180     // Record the number of values copied from the rhs chip to the lhs.
    181     finalizeBenchmark(static_cast<int64_t>(n_) * num_iters);
    182   }
    183 
    184   void shuffling(int num_iters) {
    185     eigen_assert(m_ == n_);
    186     Eigen::array<TensorIndex, 2> size_a;
    187     size_a[0] = m_;
    188     size_a[1] = k_;
    189     const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, size_a);
    190     Eigen::array<TensorIndex, 2> size_b;
    191     size_b[0] = k_;
    192     size_b[1] = m_;
    193     TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, size_b);
    194 
    195     Eigen::array<int, 2> shuffle;
    196     shuffle[0] = 1;
    197     shuffle[1] = 0;
    198 #ifdef EIGEN_USE_SYCL // warmup for sycl
    199     for (int iter = 0; iter < 10; ++iter) {
    200       B.device(device_) = A.shuffle(shuffle);
    201     }
    202 #endif
    203     StartBenchmarkTiming();
    204     for (int iter = 0; iter < num_iters; ++iter) {
    205       B.device(device_) = A.shuffle(shuffle);
    206     }
    207     // Record the number of values shuffled from A and copied to B each second
    208     finalizeBenchmark(static_cast<int64_t>(m_) * k_ * num_iters);
    209   }
    210 
    211  void padding(int num_iters) {
    212     eigen_assert(m_ == k_);
    213     Eigen::array<TensorIndex, 2> size_a;
    214     size_a[0] = m_;
    215     size_a[1] = k_-3;
    216     const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, size_a);
    217     Eigen::array<TensorIndex, 2> size_b;
    218     size_b[0] = k_;
    219     size_b[1] = m_;
    220     TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, size_b);
    221 
    222 #if defined(EIGEN_HAS_INDEX_LIST)
    223     Eigen::IndexPairList<Eigen::type2indexpair<0, 0>,
    224                          Eigen::type2indexpair<2, 1> > paddings;
    225 #else
    226     Eigen::array<Eigen::IndexPair<TensorIndex>, 2> paddings;
    227     paddings[0] = Eigen::IndexPair<TensorIndex>(0, 0);
    228     paddings[1] = Eigen::IndexPair<TensorIndex>(2, 1);
    229 #endif
    230 #ifdef EIGEN_USE_SYCL // warmup for sycl
    231     for (int iter = 0; iter < 10; ++iter) {
    232       B.device(device_) = A.pad(paddings);
    233     }
    234 #endif
    235     StartBenchmarkTiming();
    236     for (int iter = 0; iter < num_iters; ++iter) {
    237       B.device(device_) = A.pad(paddings);
    238     }
    239     // Record the number of values copied from the padded tensor A each second
    240     finalizeBenchmark(static_cast<int64_t>(m_) * k_ * num_iters);
    241   }
    242 
    243  void striding(int num_iters) {
    244     eigen_assert(m_ == k_);
    245     Eigen::array<TensorIndex, 2> size_a;
    246     size_a[0] = m_;
    247     size_a[1] = k_;
    248     const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, size_a);
    249     Eigen::array<TensorIndex, 2> size_b;
    250     size_b[0] = m_;
    251     size_b[1] = k_/2;
    252     TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, size_b);
    253 
    254 #ifndef EIGEN_HAS_INDEX_LIST
    255     Eigen::array<TensorIndex, 2> strides;
    256     strides[0] = 1;
    257     strides[1] = 2;
    258 #else
    259     // Take advantage of cxx11 to give the compiler information it can use to
    260     // optimize the code.
    261     Eigen::IndexList<Eigen::type2index<1>, Eigen::type2index<2> > strides;
    262 #endif
    263 
    264 #ifdef EIGEN_USE_SYCL // warmup for sycl
    265     for (int iter = 0; iter < 10; ++iter) {
    266       B.device(device_) = A.stride(strides);
    267     }
    268 #endif
    269     StartBenchmarkTiming();
    270     for (int iter = 0; iter < num_iters; ++iter) {
    271       B.device(device_) = A.stride(strides);
    272     }
    273     // Record the number of values copied from the padded tensor A each second
    274     finalizeBenchmark(static_cast<int64_t>(m_) * k_ * num_iters);
    275   }
    276 
    277 
    278   void broadcasting(int num_iters) {
    279     Eigen::array<TensorIndex, 2> size_a;
    280     size_a[0] = m_;
    281     size_a[1] = 1;
    282     const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, size_a);
    283     Eigen::array<TensorIndex, 2> size_c;
    284     size_c[0] = m_;
    285     size_c[1] = n_;
    286     TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, size_c);
    287 
    288 #ifndef EIGEN_HAS_INDEX_LIST
    289     Eigen::array<int, 2> broadcast;
    290     broadcast[0] = 1;
    291     broadcast[1] = n_;
    292 #else
    293     // Take advantage of cxx11 to give the compiler information it can use to
    294     // optimize the code.
    295     Eigen::IndexList<Eigen::type2index<1>, int> broadcast;
    296     broadcast.set(1, n_);
    297 #endif
    298 
    299 #ifdef EIGEN_USE_SYCL // warmup for sycl
    300     for (int iter = 0; iter < 10; ++iter) {
    301       C.device(device_) = A.broadcast(broadcast);
    302     }
    303 #endif
    304     StartBenchmarkTiming();
    305     for (int iter = 0; iter < num_iters; ++iter) {
    306       C.device(device_) = A.broadcast(broadcast);
    307     }
    308     // Record the number of values broadcasted from A and copied to C each second
    309     finalizeBenchmark(static_cast<int64_t>(m_) * n_ * num_iters);
    310   }
    311 
    312   void coeffWiseOp(int num_iters) {
    313     eigen_assert(m_ == k_ && k_ == n_);
    314     Eigen::array<TensorIndex, 2> sizes;
    315     sizes[0] = m_;
    316     sizes[1] = m_;
    317     const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizes);
    318     const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizes);
    319     TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes);
    320 #ifdef EIGEN_USE_SYCL // warmup for sycl
    321     for (int iter = 0; iter < 10; ++iter) {
    322       C.device(device_) = A * A.constant(static_cast<T>(3.14)) + B * B.constant(static_cast<T>(2.7));
    323     }
    324 #endif
    325     StartBenchmarkTiming();
    326     for (int iter = 0; iter < num_iters; ++iter) {
    327       C.device(device_) = A * A.constant(static_cast<T>(3.14)) + B * B.constant(static_cast<T>(2.7));
    328     }
    329     // Record the number of FLOP executed per second (2 multiplications and
    330     // 1 addition per value)
    331     finalizeBenchmark(static_cast<int64_t>(3) * m_ * m_ * num_iters);
    332   }
    333 
    334   void algebraicFunc(int num_iters) {
    335     eigen_assert(m_ == k_ && k_ == n_);
    336     Eigen::array<TensorIndex, 2> sizes;
    337     sizes[0] = m_;
    338     sizes[1] = m_;
    339     const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizes);
    340     const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizes);
    341     TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes);
    342 
    343 #ifdef EIGEN_USE_SYCL // warmup for sycl
    344 for (int iter = 0; iter < 10; ++iter) {
    345       C.device(device_) = A.rsqrt() + B.sqrt() * B.square();
    346 }
    347 #endif
    348     StartBenchmarkTiming();
    349     for (int iter = 0; iter < num_iters; ++iter) {
    350       C.device(device_) = A.rsqrt() + B.sqrt() * B.square();
    351     }
    352     // Record the number of FLOP executed per second (assuming one operation
    353     // per value)
    354     finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters);
    355   }
    356 
    357   void transcendentalFunc(int num_iters) {
    358     eigen_assert(m_ == k_ && k_ == n_);
    359     Eigen::array<TensorIndex, 2> sizes;
    360     sizes[0] = m_;
    361     sizes[1] = m_;
    362     const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizes);
    363     const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizes);
    364     TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes);
    365 #ifdef EIGEN_USE_SYCL // warmup for sycl
    366     for (int iter = 0; iter < 10; ++iter) {
    367       C.device(device_) = A.exp() + B.log();
    368     }
    369 #endif
    370     StartBenchmarkTiming();
    371     for (int iter = 0; iter < num_iters; ++iter) {
    372       C.device(device_) = A.exp() + B.log();
    373     }
    374     // Record the number of FLOP executed per second (assuming one operation
    375     // per value)
    376     finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters);
    377   }
    378 
    379  // Row reduction
    380   void rowReduction(int num_iters) {
    381     Eigen::array<TensorIndex, 2> input_size;
    382     input_size[0] = k_;
    383     input_size[1] = n_;
    384     const TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, input_size);
    385     Eigen::array<TensorIndex, 1> output_size;
    386     output_size[0] = n_;
    387     TensorMap<Tensor<T, 1, 0, TensorIndex>, Eigen::Aligned> C(c_, output_size);
    388 
    389 #ifndef EIGEN_HAS_INDEX_LIST
    390     Eigen::array<TensorIndex, 1> sum_along_dim;
    391     sum_along_dim[0] = 0;
    392 #else
    393     // Take advantage of cxx11 to give the compiler information it can use to
    394     // optimize the code.
    395     Eigen::IndexList<Eigen::type2index<0>> sum_along_dim;
    396 #endif
    397 #ifdef EIGEN_USE_SYCL // warmup for sycl
    398   for (int iter = 0; iter < 10; ++iter) {
    399     C.device(device_) = B.sum(sum_along_dim);
    400   }
    401 #endif
    402     StartBenchmarkTiming();
    403     for (int iter = 0; iter < num_iters; ++iter) {
    404       C.device(device_) = B.sum(sum_along_dim);
    405     }
    406     // Record the number of FLOP executed per second (assuming one operation
    407     // per value)
    408     finalizeBenchmark(static_cast<int64_t>(k_) * n_ * num_iters);
    409   }
    410 
    411   // Column reduction
    412   void colReduction(int num_iters) {
    413     Eigen::array<TensorIndex, 2> input_size;
    414     input_size[0] = k_;
    415     input_size[1] = n_;
    416     const TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(
    417         b_, input_size);
    418     Eigen::array<TensorIndex, 1> output_size;
    419     output_size[0] = k_;
    420     TensorMap<Tensor<T, 1, 0, TensorIndex>, Eigen::Aligned> A(
    421         a_, output_size);
    422 
    423 #ifndef EIGEN_HAS_INDEX_LIST
    424     Eigen::array<TensorIndex, 1> sum_along_dim;
    425     sum_along_dim[0] = 1;
    426 #else
    427     // Take advantage of cxx11 to give the compiler information it can use to
    428     // optimize the code.
    429     Eigen::IndexList<Eigen::type2index<1>> sum_along_dim;
    430 #endif
    431 #ifdef EIGEN_USE_SYCL // warmup for sycl
    432   for (int iter = 0; iter < 10; ++iter) {
    433     A.device(device_) = B.sum(sum_along_dim);
    434   }
    435 #endif
    436     StartBenchmarkTiming();
    437     for (int iter = 0; iter < num_iters; ++iter) {
    438       A.device(device_) = B.sum(sum_along_dim);
    439     }
    440     // Record the number of FLOP executed per second (assuming one operation
    441     // per value)
    442     finalizeBenchmark(static_cast<int64_t>(k_) * n_ * num_iters);
    443   }
    444 
    445   // Full reduction
    446   void fullReduction(int num_iters) {
    447     Eigen::array<TensorIndex, 2> input_size;
    448     input_size[0] = k_;
    449     input_size[1] = n_;
    450     const TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(
    451         b_, input_size);
    452     Eigen::array<TensorIndex, 0> output_size;
    453     TensorMap<Tensor<T, 0, 0, TensorIndex>, Eigen::Aligned> C(
    454         c_, output_size);
    455 #ifdef EIGEN_USE_SYCL // warmup for sycl
    456     for (int iter = 0; iter < 10; ++iter) {
    457       C.device(device_) = B.sum();
    458     }
    459 #endif
    460     StartBenchmarkTiming();
    461     for (int iter = 0; iter < num_iters; ++iter) {
    462       C.device(device_) = B.sum();
    463     }
    464     // Record the number of FLOP executed per second (assuming one operation
    465     // per value)
    466     finalizeBenchmark(static_cast<int64_t>(k_) * n_ * num_iters);
    467   }
    468 
    469   
    470 
    471   // do a contraction which is equivalent to a matrix multiplication
    472   void contraction(int num_iters) {
    473       contraction<static_cast<int>(Eigen::ColMajor)>(num_iters, false, false);
    474   }
    475 
    476     void contractionRowMajor(int num_iters) {
    477       contraction<static_cast<int>(Eigen::RowMajor)>(num_iters, false, false);
    478   }
    479     
    480   void contractionRowMajorAT(int num_iters) {
    481       contraction<static_cast<int>(Eigen::RowMajor)>(num_iters, true, false);
    482   }
    483 
    484   void contractionRowMajorBT(int num_iters) {
    485       contraction<static_cast<int>(Eigen::RowMajor)>(num_iters, false, true);
    486   }
    487 
    488   void contractionRowMajorABT(int num_iters) {
    489       contraction<static_cast<int>(Eigen::RowMajor)>(num_iters, true, true);
    490   }
    491 
    492   void convolution(int num_iters, int kernel_x, int kernel_y) {
    493     Eigen::array<TensorIndex, 2> input_sizes;
    494     input_sizes[0] = m_;
    495     input_sizes[1] = n_;
    496     TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, input_sizes);
    497     Eigen::array<TensorIndex, 2> kernel_sizes;
    498     kernel_sizes[0] = kernel_x;
    499     kernel_sizes[1] = kernel_y;
    500     TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, kernel_sizes);
    501     Eigen::array<TensorIndex, 2> result_sizes;
    502     result_sizes[0] = m_ - kernel_x + 1;
    503     result_sizes[1] = n_ - kernel_y + 1;
    504     TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, result_sizes);
    505     Eigen::array<TensorIndex, 2> dims;
    506     dims[0] = 0;
    507     dims[1] = 1;
    508 #ifdef EIGEN_USE_SYCL // warmup for sycl
    509     for (int iter = 0; iter < 10; ++iter) {
    510       C.device(device_) = A.convolve(B, dims);
    511      }
    512 #endif
    513     StartBenchmarkTiming();
    514     for (int iter = 0; iter < num_iters; ++iter) {
    515       C.device(device_) = A.convolve(B, dims);
    516     }
    517     // Record the number of FLOPs executed per second (kernel_size
    518     // multiplications and additions for each value in the resulting tensor)
    519     finalizeBenchmark(static_cast<int64_t>(2) *
    520         (m_ - kernel_x + 1) * (n_ - kernel_y + 1) * kernel_x * kernel_y * num_iters);
    521   }
    522 
    523  private:
    524  // do a contraction which is equivalent to a matrix multiplication
    525   template<int Layout>
    526   void contraction(int num_iters, bool trans_a, bool trans_b) {
    527     Eigen::array<TensorIndex, 2> sizeA;
    528     sizeA[0] = (trans_a ? k_: m_);
    529     sizeA[1] = (trans_a ? m_:  k_);
    530     Eigen::array<TensorIndex, 2> sizeB;
    531     sizeB[0] = (trans_b ? n_: k_);
    532     sizeB[1] = (trans_b ? k_: n_);
    533     Eigen::array<TensorIndex, 2> sizeC;
    534     sizeC[0] = m_;
    535     sizeC[1] = n_;
    536 
    537     const TensorMap<Tensor<T, 2, Layout>, Eigen::Aligned> A(a_, sizeA);
    538     const TensorMap<Tensor<T, 2, Layout>, Eigen::Aligned> B(b_, sizeB);
    539     TensorMap<Tensor<T, 2, Layout>, Eigen::Aligned> C(c_, sizeC);
    540 
    541     typedef typename Tensor<T, 2, Layout>::DimensionPair DimPair;
    542     Eigen::array<DimPair, 1> dims;
    543     TensorIndex a_contract_dim = (trans_a ? 0 : 1);
    544     TensorIndex b_contract_dim = (trans_b ? 1 : 0);
    545     dims[0] = DimPair(a_contract_dim, b_contract_dim);
    546 #ifdef EIGEN_USE_SYCL // warmup for sycl
    547     for (int iter = 0; iter < 10; ++iter) {
    548       C.device(device_) = A.contract(B, dims);
    549      }
    550 #endif
    551     StartBenchmarkTiming();
    552     for (int iter = 0; iter < num_iters; ++iter) {
    553       C.device(device_) = A.contract(B, dims);
    554     }
    555     // Record the number of FLOP executed per second (size_ multiplications and
    556     // additions for each value in the resulting tensor)
    557     finalizeBenchmark(static_cast<int64_t>(2) * m_ * n_ * k_ * num_iters);
    558   }
    559 
    560   void initialize() {
    561     a_ = (T *) device_.allocate(m_ * k_ * sizeof(T));
    562     b_ = (T *) device_.allocate(k_ * n_ * sizeof(T));
    563     c_ = (T *) device_.allocate(m_ * n_ * sizeof(T));
    564 
    565     // Initialize the content of the memory pools to prevent asan from
    566     // complaining.
    567     device_.memset(a_, 12, m_ * k_ * sizeof(T));
    568     device_.memset(b_, 23, k_ * n_ * sizeof(T));
    569     device_.memset(c_, 31, m_ * n_ * sizeof(T));
    570 
    571   }
    572 
    573   inline void finalizeBenchmark(int64_t num_items) {
    574 #if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
    575     if (Eigen::internal::is_same<Device, Eigen::GpuDevice>::value) {
    576       device_.synchronize();
    577     }
    578 #elif defined(EIGEN_USE_SYCL)
    579     if (Eigen::internal::is_same<Device, Eigen::SyclDevice>::value) {
    580       device_.synchronize();
    581     }
    582 
    583 #endif
    584     StopBenchmarkTiming();
    585     SetBenchmarkFlopsProcessed(num_items);
    586   }
    587 
    588 
    589   TensorIndex m_;
    590   TensorIndex k_;
    591   TensorIndex n_;
    592   T* a_;
    593   T* b_;
    594   T* c_;
    595   Device device_;
    596 };
    597 #endif  // THIRD_PARTY_EIGEN3_TENSOR_BENCHMARKS_H_