cart-elc

Source code for CART-ELC
git clone git://git.laack.co/cart-elc.git
Log | Files | Refs | README | LICENSE

tensor_contract_sycl_bench.cc (11331B)


      1 // This file is part of Eigen, a lightweight C++ template library
      2 // for linear algebra.
      3 //
      4 // Copyright (C) 2016
      5 // Mehdi Goli    Codeplay Software Ltd.
      6 // Ralph Potter  Codeplay Software Ltd.
      7 // Luke Iwanski  Codeplay Software Ltd.
      8 // Contact: <eigen@codeplay.com>
      9 //
     10 // This Source Code Form is subject to the terms of the Mozilla
     11 // Public License v. 2.0. If a copy of the MPL was not distributed
     12 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
     13 #ifndef EIGEN_BENCH_CONTRACT_SYCL
     14 #define EIGEN_BENCH_CONTRACT_SYCL
     15 #define EIGEN_TEST_NO_LONGDOUBLE
     16 #define EIGEN_TEST_NO_COMPLEX
     17 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
     18 #include <SYCL/sycl.hpp>
     19 #include <fstream>
     20 #include <iostream>
     21 #include <chrono>
     22 #include <ctime>
     23 
     24 #include <unsupported/Eigen/CXX11/Tensor>
     25 
     26 using Eigen::array;
     27 using Eigen::SyclDevice;
     28 using Eigen::Tensor;
     29 using Eigen::TensorMap;
     30 std::ofstream out("Result.txt");
     31 
     32 std::chrono::time_point<std::chrono::system_clock> get_time(){
     33   std::chrono::time_point<std::chrono::system_clock> start, end;
     34   return std::chrono::system_clock::now();
     35 }
     36 
     37 template<typename Start, typename End, typename TensorIndex>
     38 void finalizeBenchmark(Start start, End end, TensorIndex m_, TensorIndex k_, TensorIndex n_ , TensorIndex num_iters, std::string name){
     39 
     40   std::chrono::duration<double> elapsed_seconds = end-start;
     41   std::cout <<"Kernel Name : " << name << ", M : " << m_ << ",  N : " << n_ << ", K : " << k_ << " GFLOP/s : " <<
     42   static_cast<float>((static_cast<int64_t>(2) * m_ * n_ * k_ * num_iters)/ elapsed_seconds.count()) * 1e-9 << "\n";
     43     out <<"Kernel Name : " << name << ", M : " << m_ << ",  N : " << n_ << ", K : " << k_ << " GFLOP/s : " <<
     44     static_cast<float>((static_cast<int64_t>(2) * m_ * n_ * k_ * num_iters)/ elapsed_seconds.count()) * 1e-9 << "\n";
     45 }
     46 
     47 // do a contraction which is equivalent to a matrix multiplication
     48 template<typename T, typename Device, typename TensorIndex>
     49 void contraction(const Device& device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_) {
     50   T* a_;
     51   T* b_;
     52   T* c_;
     53   a_ = (T *) device_.allocate(m_ * k_ * sizeof(T));
     54   b_ = (T *) device_.allocate(k_ * n_ * sizeof(T));
     55   c_ = (T *) device_.allocate(m_ * n_ * sizeof(T));
     56 
     57   // Initialize the content of the memory pools to prevent asan from
     58   // complaining.
     59   device_.memset(a_, 12, m_ * k_ * sizeof(T));
     60   device_.memset(b_, 23, k_ * n_ * sizeof(T));
     61   device_.memset(c_, 31, m_ * n_ * sizeof(T));
     62 
     63   Eigen::array<TensorIndex, 2> sizeA;
     64   sizeA[0] = m_;
     65   sizeA[1] = k_;
     66   Eigen::array<TensorIndex, 2> sizeB;
     67   sizeB[0] = k_;
     68   sizeB[1] = n_;
     69   Eigen::array<TensorIndex, 2> sizeC;
     70   sizeC[0] = m_;
     71   sizeC[1] = n_;
     72 
     73   const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizeA);
     74   const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizeB);
     75   TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizeC);
     76 
     77   typedef typename Tensor<T, 2>::DimensionPair DimPair;
     78   Eigen::array<DimPair, 1> dims;
     79   dims[0] = DimPair(1, 0);
     80 #ifdef EIGEN_USE_SYCL // warmup for sycl
     81   for (int iter = 0; iter < 10; ++iter) {
     82     C.device(device_) = A.contract(B, dims);
     83    }
     84 #endif
     85   auto start = get_time();
     86   for (int iter = 0; iter < num_iters; ++iter) {
     87     C.device(device_) = A.contract(B, dims);
     88   }
     89  auto end = get_time();
     90   // Record the number of FLOPs executed per second (size_ multiplications and
     91   // additions for each value in the resulting tensor)
     92   finalizeBenchmark(start, end, m_, k_, n_, num_iters, "contraction");
     93   device_.deallocate(a_);
     94   device_.deallocate(b_);
     95   device_.deallocate(c_);
     96   device_.synchronize();
     97 }
     98 
     99 
    100 
    101 // do a contraction which is equivalent to a matrix multiplication
    102 template<typename T, typename Device, typename TensorIndex>
    103 void contractionRowMajor(const Device& device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_) {
    104   T* a_;
    105   T* b_;
    106   T* c_;
    107   a_ = (T *) device_.allocate(m_ * k_ * sizeof(T));
    108   b_ = (T *) device_.allocate(k_ * n_ * sizeof(T));
    109   c_ = (T *) device_.allocate(m_ * n_ * sizeof(T));
    110 
    111   // Initialize the content of the memory pools to prevent asan from
    112   // complaining.
    113   device_.memset(a_, 12, m_ * k_ * sizeof(T));
    114   device_.memset(b_, 23, k_ * n_ * sizeof(T));
    115   device_.memset(c_, 31, m_ * n_ * sizeof(T));
    116 
    117   Eigen::array<TensorIndex, 2> sizeA;
    118   sizeA[0] = m_;
    119   sizeA[1] = k_;
    120   Eigen::array<TensorIndex, 2> sizeB;
    121   sizeB[0] = k_;
    122   sizeB[1] = n_;
    123   Eigen::array<TensorIndex, 2> sizeC;
    124   sizeC[0] = m_;
    125   sizeC[1] = n_;
    126 
    127   const TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> A(a_, sizeA);
    128   const TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> B(b_, sizeB);
    129   TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> C(c_, sizeC);
    130 
    131   typedef typename Tensor<T, 2>::DimensionPair DimPair;
    132   Eigen::array<DimPair, 1> dims;
    133   dims[0] = DimPair(1, 0);
    134 #ifdef EIGEN_USE_SYCL // warmup for sycl
    135   for (int iter = 0; iter < 10; ++iter) {
    136     C.device(device_) = A.contract(B, dims);
    137    }
    138 #endif
    139   auto start = get_time();
    140   for (int iter = 0; iter < num_iters; ++iter) {
    141     C.device(device_) = A.contract(B, dims);
    142   }
    143   auto end = get_time();
    144   // Record the number of FLOPs executed per second (size_ multiplications and
    145   // additions for each value in the resulting tensor)
    146   finalizeBenchmark(start, end, m_, k_, n_, num_iters, "contractionRowMajor");
    147   device_.deallocate(a_);
    148   device_.deallocate(b_);
    149   device_.deallocate(c_);
    150   device_.synchronize();
    151 }
    152 
    153 
    154 template<typename T, typename Device, typename TensorIndex>
    155 void contractionAT(const Device& device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_) {
    156   T* a_;
    157   T* b_;
    158   T* c_;
    159   a_ = (T *) device_.allocate(m_ * k_ * sizeof(T));
    160   b_ = (T *) device_.allocate(k_ * n_ * sizeof(T));
    161   c_ = (T *) device_.allocate(m_ * n_ * sizeof(T));
    162 
    163   // Initialize the content of the memory pools to prevent asan from
    164   // complaining.
    165   device_.memset(a_, 12, m_ * k_ * sizeof(T));
    166   device_.memset(b_, 23, k_ * n_ * sizeof(T));
    167   device_.memset(c_, 31, m_ * n_ * sizeof(T));
    168   Eigen::array<TensorIndex, 2> sizeA;
    169   sizeA[0] = k_;
    170   sizeA[1] = m_;
    171   Eigen::array<TensorIndex, 2> sizeB;
    172   sizeB[0] = k_;
    173   sizeB[1] = n_;
    174   Eigen::array<TensorIndex, 2> sizeC;
    175   sizeC[0] = m_;
    176   sizeC[1] = n_;
    177 
    178   const TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> A(a_, sizeA);
    179   const TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> B(b_, sizeB);
    180   TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> C(c_, sizeC);
    181 
    182   typedef typename Tensor<T, 2>::DimensionPair DimPair;
    183   Eigen::array<DimPair, 1> dims;
    184   dims[0] = DimPair(0, 0);
    185 #ifdef EIGEN_USE_SYCL // warmup for sycl
    186   for (int iter = 0; iter < 10; ++iter) {
    187     C.device(device_) = A.contract(B, dims);
    188    }
    189 #endif
    190   auto start = get_time();
    191   for (int iter = 0; iter < num_iters; ++iter) {
    192     C.device(device_) = A.contract(B, dims);
    193   }
    194   auto end = get_time();
    195   // Record the number of FLOPs executed per second (size_ multiplications and
    196   // additions for each value in the resulting tensor)
    197   finalizeBenchmark(start, end, m_, k_, n_, num_iters, "contractionAT");
    198   device_.deallocate(a_);
    199   device_.deallocate(b_);
    200   device_.deallocate(c_);
    201   device_.synchronize();
    202 
    203 }
    204 
    205 template<typename T, typename Device, typename TensorIndex>
    206 void contractionBT(const Device& device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_) {
    207   T* a_;
    208   T* b_;
    209   T* c_;
    210   a_ = (T *) device_.allocate(m_ * k_ * sizeof(T));
    211   b_ = (T *) device_.allocate(k_ * n_ * sizeof(T));
    212   c_ = (T *) device_.allocate(m_ * n_ * sizeof(T));
    213 
    214   // Initialize the content of the memory pools to prevent asan from
    215   // complaining.
    216   device_.memset(a_, 12, m_ * k_ * sizeof(T));
    217   device_.memset(b_, 23, k_ * n_ * sizeof(T));
    218   device_.memset(c_, 31, m_ * n_ * sizeof(T));
    219 
    220   Eigen::array<TensorIndex, 2> sizeA;
    221   sizeA[0] = m_;
    222   sizeA[1] = k_;
    223   Eigen::array<TensorIndex, 2> sizeB;
    224   sizeB[0] = n_;
    225   sizeB[1] = k_;
    226   Eigen::array<TensorIndex, 2> sizeC;
    227   sizeC[0] = m_;
    228   sizeC[1] = n_;
    229 
    230   const TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> A(a_, sizeA);
    231   const TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> B(b_, sizeB);
    232   TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> C(c_, sizeC);
    233 
    234   typedef typename Tensor<T, 2>::DimensionPair DimPair;
    235   Eigen::array<DimPair, 1> dims;
    236   dims[0] = DimPair(1, 1);
    237 #ifdef EIGEN_USE_SYCL // warmup for sycl
    238   for (int iter = 0; iter < 10; ++iter) {
    239     C.device(device_) = A.contract(B, dims);
    240    }
    241 #endif
    242   auto start = get_time();
    243   for (int iter = 0; iter < num_iters; ++iter) {
    244     C.device(device_) = A.contract(B, dims);
    245   }
    246   auto end = get_time();
    247   // Record the number of FLOPs executed per second (size_ multiplications and
    248   // additions for each value in the resulting tensor)
    249   finalizeBenchmark(start, end, m_, k_, n_, num_iters, "contractionBT");
    250   device_.deallocate(a_);
    251   device_.deallocate(b_);
    252   device_.deallocate(c_);
    253   device_.synchronize();
    254 
    255 }
    256 
    257 template<typename T, typename Device, typename TensorIndex>
    258 void contractionABT(const Device& device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_) {
    259   T* a_;
    260   T* b_;
    261   T* c_;
    262   a_ = (T *) device_.allocate(m_ * k_ * sizeof(T));
    263   b_ = (T *) device_.allocate(k_ * n_ * sizeof(T));
    264   c_ = (T *) device_.allocate(m_ * n_ * sizeof(T));
    265 
    266   // Initialize the content of the memory pools to prevent asan from
    267   // complaining.
    268   device_.memset(a_, 12, m_ * k_ * sizeof(T));
    269   device_.memset(b_, 23, k_ * n_ * sizeof(T));
    270   device_.memset(c_, 31, m_ * n_ * sizeof(T));
    271 
    272   Eigen::array<TensorIndex, 2> sizeA;
    273   sizeA[0] = k_;
    274   sizeA[1] = m_;
    275   Eigen::array<TensorIndex, 2> sizeB;
    276   sizeB[0] = n_;
    277   sizeB[1] = k_;
    278   Eigen::array<TensorIndex, 2> sizeC;
    279   sizeC[0] = m_;
    280   sizeC[1] = n_;
    281 
    282   const TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> A(a_, sizeA);
    283   const TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> B(b_, sizeB);
    284   TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> C(c_, sizeC);
    285 
    286   typedef typename Tensor<T, 2>::DimensionPair DimPair;
    287   Eigen::array<DimPair, 1> dims;
    288   dims[0] = DimPair(0, 1);
    289 #ifdef EIGEN_USE_SYCL // warmup for sycl
    290   for (int iter = 0; iter < 10; ++iter) {
    291     C.device(device_) = A.contract(B, dims);
    292    }
    293 #endif
    294   auto start = get_time();
    295   for (int iter = 0; iter < num_iters; ++iter) {
    296     C.device(device_) = A.contract(B, dims);
    297   }
    298   auto end = get_time();
    299   // Record the number of FLOPs executed per second (size_ multiplications and
    300   // additions for each value in the resulting tensor)
    301   finalizeBenchmark(start, end, m_, k_, n_, num_iters, "contractionABT");
    302   device_.deallocate(a_);
    303   device_.deallocate(b_);
    304   device_.deallocate(c_);
    305   device_.synchronize();
    306 }
    307 
    308 int main() {
    309   cl::sycl::gpu_selector selector;
    310   Eigen::QueueInterface queue(selector);
    311   Eigen::SyclDevice device(&queue);
    312   int64_t num_iters =20;
    313   for(int64_t m = 32; m <= 4096; m *= 2)
    314     for(int64_t k = 32; k <= 4096; k *= 2)
    315       for(int64_t n = 32; n <= 4096; n*= 2){
    316         (contraction<float>(device, num_iters, m, k, n));
    317         (contractionRowMajor<float>(device, num_iters, m, k, n));
    318         (contractionAT<float>(device, num_iters, m, k, n));
    319         (contractionBT<float>(device, num_iters, m, k, n));
    320         (contractionABT<float>(device, num_iters, m, k, n));
    321       }
    322   return 0;
    323   }
    324 
    325 #endif // EIGEN_BENCH_CONTRACT_SYCL