cart-elc

Source code for CART-ELC
git clone git://git.laack.co/cart-elc.git
Log | Files | Refs | README | LICENSE

cxx11_tensor_convolution_sycl.cpp (20033B)


      1 // This file is part of Eigen, a lightweight C++ template library
      2 // for linear algebra.
      3 //
      4 // Copyright (C) 2016
      5 // Mehdi Goli    Codeplay Software Ltd.
      6 // Ralph Potter  Codeplay Software Ltd.
      7 // Luke Iwanski  Codeplay Software Ltd.
      8 // Contact: <eigen@codeplay.com>
      9 //
     10 // This Source Code Form is subject to the terms of the Mozilla
     11 // Public License v. 2.0. If a copy of the MPL was not distributed
     12 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
     13 
     14 #define EIGEN_TEST_NO_LONGDOUBLE
     15 #define EIGEN_TEST_NO_COMPLEX
     16 
     17 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
     18 #define EIGEN_USE_SYCL
     19 
     20 #include <iostream>
     21 #include <chrono>
     22 #include <ctime>
     23 
     24 #include "main.h"
     25 #include <unsupported/Eigen/CXX11/Tensor>
     26 #include <iomanip>
     27 
     28 using Eigen::array;
     29 using Eigen::SyclDevice;
     30 using Eigen::Tensor;
     31 using Eigen::TensorMap;
// Precision argument passed to Eigen::internal::isApprox when the
// large-expression tests compare device results with host results
// coefficient by coefficient.
static const float error_threshold =1e-4f;
     33 
     34 
     35 template <typename DataType, int DataLayout, typename IndexType>
     36 static void test_larg_expr1D(const Eigen::SyclDevice& sycl_device)
     37 {
     38   IndexType indim0 =53;
     39   IndexType indim1= 55;
     40   IndexType indim2= 51;
     41   IndexType outdim0=50;
     42   IndexType outdim1=55;
     43   IndexType outdim2=51;
     44   Eigen::array<IndexType, 3> input_dims = {{indim0, indim1, indim2}};
     45   Eigen::array<IndexType, 1> kernel_dims = {{4}};
     46   Eigen::array<IndexType, 3> result_dims = {{outdim0, outdim1, outdim2}};
     47 
     48   Tensor<DataType, 3, DataLayout, IndexType> input(input_dims);
     49   Tensor<DataType, 1, DataLayout,IndexType> kernel(kernel_dims);
     50   Tensor<DataType, 3, DataLayout,IndexType> result(result_dims);
     51   Tensor<DataType, 3, DataLayout,IndexType> result_host(result_dims);
     52 
     53   Eigen::array<IndexType, 1> dims3{{0}};
     54 
     55   input.setRandom();
     56   kernel.setRandom();
     57   result.setZero();
     58   result_host.setZero();
     59 
     60   std::size_t input_bytes = input.size()  * sizeof(DataType);
     61   std::size_t kernel_bytes = kernel.size() * sizeof(DataType);
     62   std::size_t result_bytes = result.size() * sizeof(DataType);
     63 
     64   DataType * d_input  = static_cast<DataType*>(sycl_device.allocate(input_bytes));
     65   DataType * d_kernel  = static_cast<DataType*>(sycl_device.allocate(kernel_bytes));
     66   DataType * d_result =  static_cast<DataType*>(sycl_device.allocate(result_bytes));
     67 
     68   Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > gpu_input(d_input, input_dims);
     69   Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout, IndexType> > gpu_kernel(d_kernel, kernel_dims);
     70   Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > gpu_result(d_result, result_dims);
     71   sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes);
     72   sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes);
     73 
     74   gpu_result.device(sycl_device)=gpu_input.convolve(gpu_kernel, dims3);
     75   sycl_device.memcpyDeviceToHost(result.data(), d_result, result_bytes);
     76 
     77   result_host=input.convolve(kernel, dims3);
     78 
     79 for(IndexType i=0; i< outdim0; i++ ){
     80   for(IndexType j=0; j< outdim1; j++ ){
     81     for(IndexType k=0; k< outdim2; k++ ){
     82       if (!(Eigen::internal::isApprox(result(i,j,k), result_host(i,j,k), error_threshold))) {
     83         std::cout <<std::setprecision(16)<< "mismatch detected at index  ( "<< i  << " , "  << j  << ", " << k << " ) " << " \t " << result(i,j,k) << " vs "<<  result_host(i,j,k) << std::endl;
     84         assert(false);
     85       }
     86     }
     87   }
     88 }
     89   sycl_device.deallocate(d_input);
     90   sycl_device.deallocate(d_kernel);
     91   sycl_device.deallocate(d_result);
     92 
     93 }
     94 
     95 
     96 template <typename DataType, int DataLayout, typename IndexType>
     97 static void test_larg_expr2D(const Eigen::SyclDevice& sycl_device)
     98 {
     99   IndexType indim0 =53;
    100   IndexType indim1= 55;
    101   IndexType indim2= 51;
    102   IndexType outdim0=50;
    103   IndexType outdim1=51;
    104   IndexType outdim2=51;
    105   Eigen::array<IndexType, 3> input_dims = {{indim0, indim1, indim2}};
    106   Eigen::array<IndexType, 2> kernel_dims = {{4,5}};
    107   Eigen::array<IndexType, 3> result_dims = {{outdim0, outdim1, outdim2}};
    108 
    109   Tensor<DataType, 3, DataLayout, IndexType> input(input_dims);
    110   Tensor<DataType, 2, DataLayout,IndexType> kernel(kernel_dims);
    111   Tensor<DataType, 3, DataLayout,IndexType> result(result_dims);
    112   Tensor<DataType, 3, DataLayout,IndexType> result_host(result_dims);
    113 
    114   Eigen::array<IndexType, 2> dims3{{0,1}};
    115 
    116   input.setRandom();
    117   kernel.setRandom();
    118   result.setZero();
    119   result_host.setZero();
    120 
    121   std::size_t input_bytes = input.size()  * sizeof(DataType);
    122   std::size_t kernel_bytes = kernel.size() * sizeof(DataType);
    123   std::size_t result_bytes = result.size() * sizeof(DataType);
    124 
    125   DataType * d_input  = static_cast<DataType*>(sycl_device.allocate(input_bytes));
    126   DataType * d_kernel  = static_cast<DataType*>(sycl_device.allocate(kernel_bytes));
    127   DataType * d_result =  static_cast<DataType*>(sycl_device.allocate(result_bytes));
    128 
    129   Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > gpu_input(d_input, input_dims);
    130   Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > gpu_kernel(d_kernel, kernel_dims);
    131   Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > gpu_result(d_result, result_dims);
    132   sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes);
    133   sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes);
    134 
    135   gpu_result.device(sycl_device)=gpu_input.convolve(gpu_kernel, dims3);
    136   sycl_device.memcpyDeviceToHost(result.data(), d_result, result_bytes);
    137 
    138   result_host=input.convolve(kernel, dims3);
    139 
    140 for(IndexType i=0; i< outdim0; i++ ){
    141   for(IndexType j=0; j< outdim1; j++ ){
    142     for(IndexType k=0; k< outdim2; k++ ){
    143       if (!(Eigen::internal::isApprox(result(i,j,k), result_host(i,j,k), error_threshold))) {
    144         std::cout <<std::setprecision(16)<< "mismatch detected at index  ( "<< i  << " , "  << j  << ", " << k << " ) " << " \t " << result(i,j,k) << " vs "<<  result_host(i,j,k) << std::endl;
    145         assert(false);
    146       }
    147     }
    148   }
    149 }
    150   sycl_device.deallocate(d_input);
    151   sycl_device.deallocate(d_kernel);
    152   sycl_device.deallocate(d_result);
    153 
    154 }
    155 
    156 
    157 template <typename DataType, int DataLayout, typename IndexType>
    158 static void test_larg_expr3D(const Eigen::SyclDevice& sycl_device)
    159 {
    160   IndexType indim0 =53;
    161   IndexType indim1= 55;
    162   IndexType indim2= 51;
    163   IndexType outdim0=50;
    164   IndexType outdim1=51;
    165   IndexType outdim2=49;
    166   Eigen::array<IndexType, 3> input_dims = {{indim0, indim1, indim2}};
    167   Eigen::array<IndexType, 3> kernel_dims = {{4,5,3}};
    168   Eigen::array<IndexType, 3> result_dims = {{outdim0, outdim1, outdim2}};
    169 
    170   Tensor<DataType, 3, DataLayout, IndexType> input(input_dims);
    171   Tensor<DataType, 3, DataLayout,IndexType> kernel(kernel_dims);
    172   Tensor<DataType, 3, DataLayout,IndexType> result(result_dims);
    173   Tensor<DataType, 3, DataLayout,IndexType> result_host(result_dims);
    174 
    175   Eigen::array<IndexType, 3> dims3{{0,1,2}};
    176 
    177   input.setRandom();
    178   kernel.setRandom();
    179   result.setZero();
    180   result_host.setZero();
    181 
    182   std::size_t input_bytes = input.size()  * sizeof(DataType);
    183   std::size_t kernel_bytes = kernel.size() * sizeof(DataType);
    184   std::size_t result_bytes = result.size() * sizeof(DataType);
    185 
    186   DataType * d_input  = static_cast<DataType*>(sycl_device.allocate(input_bytes));
    187   DataType * d_kernel  = static_cast<DataType*>(sycl_device.allocate(kernel_bytes));
    188   DataType * d_result =  static_cast<DataType*>(sycl_device.allocate(result_bytes));
    189 
    190   Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > gpu_input(d_input, input_dims);
    191   Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > gpu_kernel(d_kernel, kernel_dims);
    192   Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > gpu_result(d_result, result_dims);
    193   sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes);
    194   sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes);
    195 
    196   gpu_result.device(sycl_device)=gpu_input.convolve(gpu_kernel, dims3);
    197   sycl_device.memcpyDeviceToHost(result.data(), d_result, result_bytes);
    198 
    199   result_host=input.convolve(kernel, dims3);
    200 
    201 for(IndexType i=0; i< outdim0; i++ ){
    202   for(IndexType j=0; j< outdim1; j++ ){
    203     for(IndexType k=0; k< outdim2; k++ ){
    204       if (!(Eigen::internal::isApprox(result(i,j,k), result_host(i,j,k), error_threshold))) {
    205         std::cout <<std::setprecision(16)<< "mismatch detected at index  ( "<< i  << " , "  << j  << ", " << k << " ) " << " \t " << result(i,j,k) << " vs "<<  result_host(i,j,k) << std::endl;
    206         assert(false);
    207       }
    208     }
    209   }
    210 }
    211   sycl_device.deallocate(d_input);
    212   sycl_device.deallocate(d_kernel);
    213   sycl_device.deallocate(d_result);
    214 
    215 }
    216 
    217 
    218 template <typename DataType, int DataLayout, typename IndexType>
    219 static void test_evals(const Eigen::SyclDevice& sycl_device)
    220 {
    221   Eigen::array<IndexType, 2> input_dims = {{3, 3}};
    222   Eigen::array<IndexType, 1> kernel_dims = {{2}};
    223   Eigen::array<IndexType, 2> result_dims = {{2, 3}};
    224 
    225   Tensor<DataType, 2, DataLayout, IndexType> input(input_dims);
    226   Tensor<DataType, 1, DataLayout,IndexType> kernel(kernel_dims);
    227   Tensor<DataType, 2, DataLayout,IndexType> result(result_dims);
    228 
    229   Eigen::array<IndexType, 1> dims3{{0}};
    230 
    231   input.setRandom();
    232   kernel.setRandom();
    233   result.setZero();
    234 
    235   std::size_t input_bytes = input.size()  * sizeof(DataType);
    236   std::size_t kernel_bytes = kernel.size() * sizeof(DataType);
    237   std::size_t result_bytes = result.size() * sizeof(DataType);
    238 
    239   DataType * d_input  = static_cast<DataType*>(sycl_device.allocate(input_bytes));
    240   DataType * d_kernel  = static_cast<DataType*>(sycl_device.allocate(kernel_bytes));
    241   DataType * d_result =  static_cast<DataType*>(sycl_device.allocate(result_bytes));
    242 
    243   Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > gpu_input(d_input, input_dims);
    244   Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout, IndexType> > gpu_kernel(d_kernel, kernel_dims);
    245   Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > gpu_result(d_result, result_dims);
    246   sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes);
    247   sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes);
    248 
    249   gpu_result.device(sycl_device)=gpu_input.convolve(gpu_kernel, dims3);
    250   sycl_device.memcpyDeviceToHost(result.data(), d_result, result_bytes);
    251 
    252   VERIFY_IS_APPROX(result(0,0), input(0,0)*kernel(0) + input(1,0)*kernel(1));  // index 0
    253   VERIFY_IS_APPROX(result(0,1), input(0,1)*kernel(0) + input(1,1)*kernel(1));  // index 2
    254   VERIFY_IS_APPROX(result(0,2), input(0,2)*kernel(0) + input(1,2)*kernel(1));  // index 4
    255   VERIFY_IS_APPROX(result(1,0), input(1,0)*kernel(0) + input(2,0)*kernel(1));  // index 1
    256   VERIFY_IS_APPROX(result(1,1), input(1,1)*kernel(0) + input(2,1)*kernel(1));  // index 3
    257   VERIFY_IS_APPROX(result(1,2), input(1,2)*kernel(0) + input(2,2)*kernel(1));  // index 5
    258 
    259   sycl_device.deallocate(d_input);
    260   sycl_device.deallocate(d_kernel);
    261   sycl_device.deallocate(d_result);
    262 }
    263 
    264 template <typename DataType, int DataLayout, typename IndexType>
    265 static void test_expr(const Eigen::SyclDevice& sycl_device)
    266 {
    267   Eigen::array<IndexType, 2> input_dims = {{3, 3}};
    268   Eigen::array<IndexType, 2> kernel_dims = {{2, 2}};
    269   Eigen::array<IndexType, 2> result_dims = {{2, 2}};
    270 
    271   Tensor<DataType, 2, DataLayout, IndexType> input(input_dims);
    272   Tensor<DataType, 2, DataLayout, IndexType> kernel(kernel_dims);
    273   Tensor<DataType, 2, DataLayout, IndexType> result(result_dims);
    274 
    275   input.setRandom();
    276   kernel.setRandom();
    277   Eigen::array<IndexType, 2> dims;
    278   dims[0] = 0;
    279   dims[1] = 1;
    280 
    281   std::size_t input_bytes = input.size()  * sizeof(DataType);
    282   std::size_t kernel_bytes = kernel.size() * sizeof(DataType);
    283   std::size_t result_bytes = result.size() * sizeof(DataType);
    284 
    285   DataType * d_input  = static_cast<DataType*>(sycl_device.allocate(input_bytes));
    286   DataType * d_kernel  = static_cast<DataType*>(sycl_device.allocate(kernel_bytes));
    287   DataType * d_result =  static_cast<DataType*>(sycl_device.allocate(result_bytes));
    288 
    289   Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout,IndexType> > gpu_input(d_input, input_dims);
    290   Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout,IndexType> > gpu_kernel(d_kernel, kernel_dims);
    291   Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout,IndexType> > gpu_result(d_result, result_dims);
    292   sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes);
    293   sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes);
    294 
    295   gpu_result.device(sycl_device)=gpu_input.convolve(gpu_kernel, dims);
    296   sycl_device.memcpyDeviceToHost(result.data(), d_result, result_bytes);
    297 
    298   VERIFY_IS_APPROX(result(0,0), input(0,0)*kernel(0,0) + input(0,1)*kernel(0,1) +
    299                                 input(1,0)*kernel(1,0) + input(1,1)*kernel(1,1));
    300   VERIFY_IS_APPROX(result(0,1), input(0,1)*kernel(0,0) + input(0,2)*kernel(0,1) +
    301                                 input(1,1)*kernel(1,0) + input(1,2)*kernel(1,1));
    302   VERIFY_IS_APPROX(result(1,0), input(1,0)*kernel(0,0) + input(1,1)*kernel(0,1) +
    303                                 input(2,0)*kernel(1,0) + input(2,1)*kernel(1,1));
    304   VERIFY_IS_APPROX(result(1,1), input(1,1)*kernel(0,0) + input(1,2)*kernel(0,1) +
    305                                 input(2,1)*kernel(1,0) + input(2,2)*kernel(1,1));
    306 
    307   sycl_device.deallocate(d_input);
    308   sycl_device.deallocate(d_kernel);
    309   sycl_device.deallocate(d_result);
    310 }
    311 
    312 
    313 template <typename DataType, int DataLayout, typename IndexType>
    314 static void test_modes(const Eigen::SyclDevice& sycl_device){
    315 
    316 Eigen::array<IndexType, 1> input_dims = {{3}};
    317 Eigen::array<IndexType, 1> kernel_dims = {{3}};
    318 
    319 Tensor<DataType, 1, DataLayout, IndexType> input(input_dims);
    320 Tensor<DataType, 1, DataLayout, IndexType> kernel(kernel_dims);
    321 
    322 input.setRandom();
    323 kernel.setRandom();
    324 Eigen::array<IndexType, 1> dims;
    325 dims[0] = 0;
    326 
    327   input(0) = 1.0f;
    328   input(1) = 2.0f;
    329   input(2) = 3.0f;
    330   kernel(0) = 0.5f;
    331   kernel(1) = 1.0f;
    332   kernel(2) = 0.0f;
    333 
    334   Eigen::array<std::pair<IndexType, IndexType>, 1> padding;
    335 
    336   // Emulate VALID mode (as defined in
    337   // http://docs.scipy.org/doc/numpy/reference/generated/numpy.convolve.html).
    338   padding[0] = std::make_pair(0, 0);
    339   Tensor<DataType, 1, DataLayout, IndexType> valid(1);
    340 
    341   std::size_t input_bytes = input.size()  * sizeof(DataType);
    342   std::size_t kernel_bytes = kernel.size() * sizeof(DataType);
    343   std::size_t valid_bytes = valid.size() * sizeof(DataType);
    344 
    345   DataType * d_input  = static_cast<DataType*>(sycl_device.allocate(input_bytes));
    346   DataType * d_kernel  = static_cast<DataType*>(sycl_device.allocate(kernel_bytes));
    347   DataType * d_valid =  static_cast<DataType*>(sycl_device.allocate(valid_bytes));
    348 
    349   Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_input(d_input, input_dims);
    350   Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_kernel(d_kernel, kernel_dims);
    351   Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_valid(d_valid, valid.dimensions());
    352   sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes);
    353   sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes);
    354 
    355   gpu_valid.device(sycl_device)=gpu_input.pad(padding).convolve(gpu_kernel, dims);
    356   sycl_device.memcpyDeviceToHost(valid.data(), d_valid, valid_bytes);
    357 
    358   VERIFY_IS_EQUAL(valid.dimension(0), 1);
    359   VERIFY_IS_APPROX(valid(0), 2.5f);
    360 
    361   // Emulate SAME mode (as defined in
    362   // http://docs.scipy.org/doc/numpy/reference/generated/numpy.convolve.html).
    363   padding[0] = std::make_pair(1, 1);
    364   Tensor<DataType, 1, DataLayout, IndexType> same(3);
    365   std::size_t same_bytes = same.size() * sizeof(DataType);
    366   DataType * d_same =  static_cast<DataType*>(sycl_device.allocate(same_bytes));
    367   Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_same(d_same, same.dimensions());
    368   gpu_same.device(sycl_device)=gpu_input.pad(padding).convolve(gpu_kernel, dims);
    369   sycl_device.memcpyDeviceToHost(same.data(), d_same, same_bytes);
    370 
    371   VERIFY_IS_EQUAL(same.dimension(0), 3);
    372   VERIFY_IS_APPROX(same(0), 1.0f);
    373   VERIFY_IS_APPROX(same(1), 2.5f);
    374   VERIFY_IS_APPROX(same(2), 4.0f);
    375 
    376   // Emulate FULL mode (as defined in
    377   // http://docs.scipy.org/doc/numpy/reference/generated/numpy.convolve.html).
    378   padding[0] = std::make_pair(2, 2);
    379 
    380   Tensor<DataType, 1, DataLayout, IndexType> full(5);
    381   std::size_t full_bytes = full.size() * sizeof(DataType);
    382   DataType * d_full =  static_cast<DataType*>(sycl_device.allocate(full_bytes));
    383   Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_full(d_full, full.dimensions());
    384   gpu_full.device(sycl_device)=gpu_input.pad(padding).convolve(gpu_kernel, dims);
    385   sycl_device.memcpyDeviceToHost(full.data(), d_full, full_bytes);
    386 
    387   VERIFY_IS_EQUAL(full.dimension(0), 5);
    388   VERIFY_IS_APPROX(full(0), 0.0f);
    389   VERIFY_IS_APPROX(full(1), 1.0f);
    390   VERIFY_IS_APPROX(full(2), 2.5f);
    391   VERIFY_IS_APPROX(full(3), 4.0f);
    392   VERIFY_IS_APPROX(full(4), 1.5f);
    393 
    394   sycl_device.deallocate(d_input);
    395   sycl_device.deallocate(d_kernel);
    396   sycl_device.deallocate(d_valid);
    397   sycl_device.deallocate(d_same);
    398   sycl_device.deallocate(d_full);
    399 
    400 }
    401 
    402 template <typename DataType, int DataLayout, typename IndexType>
    403 static void test_strides(const Eigen::SyclDevice& sycl_device){
    404 
    405   Eigen::array<IndexType, 1> input_dims = {{13}};
    406   Eigen::array<IndexType, 1> kernel_dims = {{3}};
    407 
    408   Tensor<DataType, 1, DataLayout, IndexType> input(input_dims);
    409   Tensor<DataType, 1, DataLayout, IndexType> kernel(kernel_dims);
    410   Tensor<DataType, 1, DataLayout, IndexType> result(2);
    411 
    412   input.setRandom();
    413   kernel.setRandom();
    414   Eigen::array<IndexType, 1> dims;
    415   dims[0] = 0;
    416 
    417   Eigen::array<IndexType, 1> stride_of_3;
    418   stride_of_3[0] = 3;
    419   Eigen::array<IndexType, 1> stride_of_2;
    420   stride_of_2[0] = 2;
    421 
    422   std::size_t input_bytes = input.size()  * sizeof(DataType);
    423   std::size_t kernel_bytes = kernel.size() * sizeof(DataType);
    424   std::size_t result_bytes = result.size() * sizeof(DataType);
    425 
    426   DataType * d_input  = static_cast<DataType*>(sycl_device.allocate(input_bytes));
    427   DataType * d_kernel  = static_cast<DataType*>(sycl_device.allocate(kernel_bytes));
    428   DataType * d_result =  static_cast<DataType*>(sycl_device.allocate(result_bytes));
    429 
    430   Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_input(d_input, input_dims);
    431   Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_kernel(d_kernel, kernel_dims);
    432   Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_result(d_result, result.dimensions());
    433   sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes);
    434   sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes);
    435 
    436   gpu_result.device(sycl_device)=gpu_input.stride(stride_of_3).convolve(gpu_kernel, dims).stride(stride_of_2);
    437   sycl_device.memcpyDeviceToHost(result.data(), d_result, result_bytes);
    438 
    439   VERIFY_IS_EQUAL(result.dimension(0), 2);
    440   VERIFY_IS_APPROX(result(0), (input(0)*kernel(0) + input(3)*kernel(1) +
    441                                input(6)*kernel(2)));
    442   VERIFY_IS_APPROX(result(1), (input(6)*kernel(0) + input(9)*kernel(1) +
    443                                input(12)*kernel(2)));
    444 }
    445 
    446 template <typename Dev_selector> void tensorConvolutionPerDevice(Dev_selector& s){
    447   QueueInterface queueInterface(s);
    448   auto sycl_device=Eigen::SyclDevice(&queueInterface);
    449   test_larg_expr1D<float, RowMajor, int64_t>(sycl_device);
    450   test_larg_expr1D<float, ColMajor, int64_t>(sycl_device);
    451   test_larg_expr2D<float, RowMajor, int64_t>(sycl_device);
    452   test_larg_expr2D<float, ColMajor, int64_t>(sycl_device);
    453   test_larg_expr3D<float, RowMajor, int64_t>(sycl_device);
    454   test_larg_expr3D<float, ColMajor, int64_t>(sycl_device);
    455   test_evals<float, ColMajor, int64_t>(sycl_device);
    456   test_evals<float, RowMajor, int64_t>(sycl_device);
    457   test_expr<float, ColMajor, int64_t>(sycl_device);
    458   test_expr<float, RowMajor, int64_t>(sycl_device);
    459   test_modes<float, ColMajor, int64_t>(sycl_device);
    460   test_modes<float, RowMajor, int64_t>(sycl_device);
    461   test_strides<float, ColMajor, int64_t>(sycl_device);
    462   test_strides<float, RowMajor, int64_t>(sycl_device);
    463 }
    464 
    465 EIGEN_DECLARE_TEST(cxx11_tensor_convolution_sycl) {
    466   for (const auto& device :Eigen::get_sycl_supported_devices()) {
    467     CALL_SUBTEST(tensorConvolutionPerDevice(device));
    468   }
    469 }