cart-elc

Source code for CART-ELC
git clone git://git.laack.co/cart-elc.git
Log | Files | Refs | README | LICENSE

cxx11_tensor_reduction_sycl.cpp (42176B)


      1 // This file is part of Eigen, a lightweight C++ template library
      2 // for linear algebra.
      3 //
      4 // Copyright (C) 2015
      5 // Mehdi Goli    Codeplay Software Ltd.
      6 // Ralph Potter  Codeplay Software Ltd.
      7 // Luke Iwanski  Codeplay Software Ltd.
      8 // Contact: <eigen@codeplay.com>
      9 //
     10 // This Source Code Form is subject to the terms of the Mozilla
     11 // Public License v. 2.0. If a copy of the MPL was not distributed
     12 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
     13 
     14 #define EIGEN_TEST_NO_LONGDOUBLE
     15 #define EIGEN_TEST_NO_COMPLEX
     16 
     17 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
     18 #define EIGEN_USE_SYCL
     19 #define EIGEN_HAS_CONSTEXPR 1
     20 
     21 #include "main.h"
     22 
     23 #include <unsupported/Eigen/CXX11/Tensor>
     24 
     25 template <typename DataType, int DataLayout, typename IndexType>
     26 static void test_full_reductions_sum_sycl(
     27     const Eigen::SyclDevice& sycl_device) {
     28   const IndexType num_rows = 753;
     29   const IndexType num_cols = 537;
     30   array<IndexType, 2> tensorRange = {{num_rows, num_cols}};
     31 
     32   array<IndexType, 2> outRange = {{1, 1}};
     33 
     34   Tensor<DataType, 2, DataLayout, IndexType> in(tensorRange);
     35   Tensor<DataType, 2, DataLayout, IndexType> full_redux(outRange);
     36   Tensor<DataType, 2, DataLayout, IndexType> full_redux_gpu(outRange);
     37 
     38   in.setRandom();
     39   auto dim = DSizes<IndexType, 2>(1, 1);
     40   full_redux = in.sum().reshape(dim);
     41 
     42   DataType* gpu_in_data = static_cast<DataType*>(
     43       sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
     44   DataType* gpu_out_data = (DataType*)sycl_device.allocate(
     45       sizeof(DataType) * (full_redux_gpu.dimensions().TotalSize()));
     46 
     47   TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_gpu(gpu_in_data,
     48                                                                tensorRange);
     49   TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> out_gpu(gpu_out_data,
     50                                                                 outRange);
     51   sycl_device.memcpyHostToDevice(
     52       gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
     53   out_gpu.device(sycl_device) = in_gpu.sum().reshape(dim);
     54   sycl_device.memcpyDeviceToHost(
     55       full_redux_gpu.data(), gpu_out_data,
     56       (full_redux_gpu.dimensions().TotalSize()) * sizeof(DataType));
     57   // Check that the CPU and GPU reductions return the same result.
     58   std::cout << "SYCL FULL :" << full_redux_gpu(0, 0)
     59             << ", CPU FULL: " << full_redux(0, 0) << "\n";
     60   VERIFY_IS_APPROX(full_redux_gpu(0, 0), full_redux(0, 0));
     61   sycl_device.deallocate(gpu_in_data);
     62   sycl_device.deallocate(gpu_out_data);
     63 }
     64 
     65 template <typename DataType, int DataLayout, typename IndexType>
     66 static void test_full_reductions_sum_with_offset_sycl(
     67     const Eigen::SyclDevice& sycl_device) {
     68   using data_tensor = Tensor<DataType, 2, DataLayout, IndexType>;
     69   using scalar_tensor = Tensor<DataType, 0, DataLayout, IndexType>;
     70   const IndexType num_rows = 64;
     71   const IndexType num_cols = 64;
     72   array<IndexType, 2> tensor_range = {{num_rows, num_cols}};
     73   const IndexType n_elems = internal::array_prod(tensor_range);
     74 
     75   data_tensor in(tensor_range);
     76   scalar_tensor full_redux;
     77   scalar_tensor full_redux_gpu;
     78 
     79   in.setRandom();
     80   array<IndexType, 2> tensor_offset_range(tensor_range);
     81   tensor_offset_range[0] -= 1;
     82 
     83   const IndexType offset = 64;
     84   TensorMap<data_tensor> in_offset(in.data() + offset, tensor_offset_range);
     85   full_redux = in_offset.sum();
     86 
     87   DataType* gpu_in_data =
     88       static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType)));
     89   DataType* gpu_out_data =
     90       static_cast<DataType*>(sycl_device.allocate(sizeof(DataType)));
     91 
     92   TensorMap<data_tensor> in_gpu(gpu_in_data + offset, tensor_offset_range);
     93   TensorMap<scalar_tensor> out_gpu(gpu_out_data);
     94   sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),
     95                                  n_elems * sizeof(DataType));
     96   out_gpu.device(sycl_device) = in_gpu.sum();
     97   sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data,
     98                                  sizeof(DataType));
     99 
    100   // Check that the CPU and GPU reductions return the same result.
    101   VERIFY_IS_APPROX(full_redux_gpu(), full_redux());
    102 
    103   sycl_device.deallocate(gpu_in_data);
    104   sycl_device.deallocate(gpu_out_data);
    105 }
    106 
    107 template <typename DataType, int DataLayout, typename IndexType>
    108 static void test_full_reductions_max_sycl(
    109     const Eigen::SyclDevice& sycl_device) {
    110   const IndexType num_rows = 4096;
    111   const IndexType num_cols = 4096;
    112   array<IndexType, 2> tensorRange = {{num_rows, num_cols}};
    113 
    114   Tensor<DataType, 2, DataLayout, IndexType> in(tensorRange);
    115   Tensor<DataType, 0, DataLayout, IndexType> full_redux;
    116   Tensor<DataType, 0, DataLayout, IndexType> full_redux_gpu;
    117 
    118   in.setRandom();
    119 
    120   full_redux = in.maximum();
    121 
    122   DataType* gpu_in_data = static_cast<DataType*>(
    123       sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
    124   DataType* gpu_out_data = (DataType*)sycl_device.allocate(sizeof(DataType));
    125 
    126   TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_gpu(gpu_in_data,
    127                                                                tensorRange);
    128   TensorMap<Tensor<DataType, 0, DataLayout, IndexType>> out_gpu(gpu_out_data);
    129   sycl_device.memcpyHostToDevice(
    130       gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
    131   out_gpu.device(sycl_device) = in_gpu.maximum();
    132   sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data,
    133                                  sizeof(DataType));
    134   VERIFY_IS_APPROX(full_redux_gpu(), full_redux());
    135   sycl_device.deallocate(gpu_in_data);
    136   sycl_device.deallocate(gpu_out_data);
    137 }
    138 
    139 template <typename DataType, int DataLayout, typename IndexType>
    140 static void test_full_reductions_max_with_offset_sycl(
    141     const Eigen::SyclDevice& sycl_device) {
    142   using data_tensor = Tensor<DataType, 2, DataLayout, IndexType>;
    143   using scalar_tensor = Tensor<DataType, 0, DataLayout, IndexType>;
    144   const IndexType num_rows = 64;
    145   const IndexType num_cols = 64;
    146   array<IndexType, 2> tensor_range = {{num_rows, num_cols}};
    147   const IndexType n_elems = internal::array_prod(tensor_range);
    148 
    149   data_tensor in(tensor_range);
    150   scalar_tensor full_redux;
    151   scalar_tensor full_redux_gpu;
    152 
    153   in.setRandom();
    154   array<IndexType, 2> tensor_offset_range(tensor_range);
    155   tensor_offset_range[0] -= 1;
    156   // Set the initial value to be the max.
    157   // As we don't include this in the reduction the result should not be 2.
    158   in(0) = static_cast<DataType>(2);
    159 
    160   const IndexType offset = 64;
    161   TensorMap<data_tensor> in_offset(in.data() + offset, tensor_offset_range);
    162   full_redux = in_offset.maximum();
    163   VERIFY_IS_NOT_EQUAL(full_redux(), in(0));
    164 
    165   DataType* gpu_in_data =
    166       static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType)));
    167   DataType* gpu_out_data =
    168       static_cast<DataType*>(sycl_device.allocate(sizeof(DataType)));
    169 
    170   TensorMap<data_tensor> in_gpu(gpu_in_data + offset, tensor_offset_range);
    171   TensorMap<scalar_tensor> out_gpu(gpu_out_data);
    172   sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),
    173                                  n_elems * sizeof(DataType));
    174   out_gpu.device(sycl_device) = in_gpu.maximum();
    175   sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data,
    176                                  sizeof(DataType));
    177 
    178   // Check that the CPU and GPU reductions return the same result.
    179   VERIFY_IS_APPROX(full_redux_gpu(), full_redux());
    180 
    181   sycl_device.deallocate(gpu_in_data);
    182   sycl_device.deallocate(gpu_out_data);
    183 }
    184 
    185 template <typename DataType, int DataLayout, typename IndexType>
    186 static void test_full_reductions_mean_sycl(
    187     const Eigen::SyclDevice& sycl_device) {
    188   const IndexType num_rows = 4096;
    189   const IndexType num_cols = 4096;
    190   array<IndexType, 2> tensorRange = {{num_rows, num_cols}};
    191   array<IndexType, 1> argRange = {{num_cols}};
    192   Eigen::array<IndexType, 1> red_axis;
    193   red_axis[0] = 0;
    194   //  red_axis[1]=1;
    195   Tensor<DataType, 2, DataLayout, IndexType> in(tensorRange);
    196   Tensor<DataType, 2, DataLayout, IndexType> in_arg1(tensorRange);
    197   Tensor<DataType, 2, DataLayout, IndexType> in_arg2(tensorRange);
    198   Tensor<bool, 1, DataLayout, IndexType> out_arg_cpu(argRange);
    199   Tensor<bool, 1, DataLayout, IndexType> out_arg_gpu(argRange);
    200   Tensor<bool, 1, DataLayout, IndexType> out_arg_gpu_helper(argRange);
    201   Tensor<DataType, 0, DataLayout, IndexType> full_redux;
    202   Tensor<DataType, 0, DataLayout, IndexType> full_redux_gpu;
    203 
    204   in.setRandom();
    205   in_arg1.setRandom();
    206   in_arg2.setRandom();
    207 
    208   DataType* gpu_in_data = static_cast<DataType*>(
    209       sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
    210   DataType* gpu_in_arg1_data = static_cast<DataType*>(sycl_device.allocate(
    211       in_arg1.dimensions().TotalSize() * sizeof(DataType)));
    212   DataType* gpu_in_arg2_data = static_cast<DataType*>(sycl_device.allocate(
    213       in_arg2.dimensions().TotalSize() * sizeof(DataType)));
    214   bool* gpu_out_arg__gpu_helper_data = static_cast<bool*>(sycl_device.allocate(
    215       out_arg_gpu.dimensions().TotalSize() * sizeof(DataType)));
    216   bool* gpu_out_arg_data = static_cast<bool*>(sycl_device.allocate(
    217       out_arg_gpu.dimensions().TotalSize() * sizeof(DataType)));
    218 
    219   DataType* gpu_out_data = (DataType*)sycl_device.allocate(sizeof(DataType));
    220 
    221   TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_gpu(gpu_in_data,
    222                                                                tensorRange);
    223   TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_Arg1_gpu(
    224       gpu_in_arg1_data, tensorRange);
    225   TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_Arg2_gpu(
    226       gpu_in_arg2_data, tensorRange);
    227   TensorMap<Tensor<bool, 1, DataLayout, IndexType>> out_Argout_gpu(
    228       gpu_out_arg_data, argRange);
    229   TensorMap<Tensor<bool, 1, DataLayout, IndexType>> out_Argout_gpu_helper(
    230       gpu_out_arg__gpu_helper_data, argRange);
    231   TensorMap<Tensor<DataType, 0, DataLayout, IndexType>> out_gpu(gpu_out_data);
    232 
    233   // CPU VERSION
    234   out_arg_cpu =
    235       (in_arg1.argmax(1) == in_arg2.argmax(1))
    236           .select(out_arg_cpu.constant(true), out_arg_cpu.constant(false));
    237   full_redux = (out_arg_cpu.template cast<float>())
    238                    .reduce(red_axis, Eigen::internal::MeanReducer<DataType>());
    239 
    240   // GPU VERSION
    241   sycl_device.memcpyHostToDevice(
    242       gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
    243   sycl_device.memcpyHostToDevice(
    244       gpu_in_arg1_data, in_arg1.data(),
    245       (in_arg1.dimensions().TotalSize()) * sizeof(DataType));
    246   sycl_device.memcpyHostToDevice(
    247       gpu_in_arg2_data, in_arg2.data(),
    248       (in_arg2.dimensions().TotalSize()) * sizeof(DataType));
    249   out_Argout_gpu_helper.device(sycl_device) =
    250       (in_Arg1_gpu.argmax(1) == in_Arg2_gpu.argmax(1));
    251   out_Argout_gpu.device(sycl_device) =
    252       (out_Argout_gpu_helper)
    253           .select(out_Argout_gpu.constant(true),
    254                   out_Argout_gpu.constant(false));
    255   out_gpu.device(sycl_device) =
    256       (out_Argout_gpu.template cast<float>())
    257           .reduce(red_axis, Eigen::internal::MeanReducer<DataType>());
    258   sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data,
    259                                  sizeof(DataType));
    260   // Check that the CPU and GPU reductions return the same result.
    261   std::cout << "SYCL : " << full_redux_gpu() << " , CPU : " << full_redux()
    262             << '\n';
    263   VERIFY_IS_EQUAL(full_redux_gpu(), full_redux());
    264   sycl_device.deallocate(gpu_in_data);
    265   sycl_device.deallocate(gpu_in_arg1_data);
    266   sycl_device.deallocate(gpu_in_arg2_data);
    267   sycl_device.deallocate(gpu_out_arg__gpu_helper_data);
    268   sycl_device.deallocate(gpu_out_arg_data);
    269   sycl_device.deallocate(gpu_out_data);
    270 }
    271 
    272 template <typename DataType, int DataLayout, typename IndexType>
    273 static void test_full_reductions_mean_with_offset_sycl(
    274     const Eigen::SyclDevice& sycl_device) {
    275   using data_tensor = Tensor<DataType, 2, DataLayout, IndexType>;
    276   using scalar_tensor = Tensor<DataType, 0, DataLayout, IndexType>;
    277   const IndexType num_rows = 64;
    278   const IndexType num_cols = 64;
    279   array<IndexType, 2> tensor_range = {{num_rows, num_cols}};
    280   const IndexType n_elems = internal::array_prod(tensor_range);
    281 
    282   data_tensor in(tensor_range);
    283   scalar_tensor full_redux;
    284   scalar_tensor full_redux_gpu;
    285 
    286   in.setRandom();
    287   array<IndexType, 2> tensor_offset_range(tensor_range);
    288   tensor_offset_range[0] -= 1;
    289 
    290   const IndexType offset = 64;
    291   TensorMap<data_tensor> in_offset(in.data() + offset, tensor_offset_range);
    292   full_redux = in_offset.mean();
    293   VERIFY_IS_NOT_EQUAL(full_redux(), in(0));
    294 
    295   DataType* gpu_in_data =
    296       static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType)));
    297   DataType* gpu_out_data =
    298       static_cast<DataType*>(sycl_device.allocate(sizeof(DataType)));
    299 
    300   TensorMap<data_tensor> in_gpu(gpu_in_data + offset, tensor_offset_range);
    301   TensorMap<scalar_tensor> out_gpu(gpu_out_data);
    302   sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),
    303                                  n_elems * sizeof(DataType));
    304   out_gpu.device(sycl_device) = in_gpu.mean();
    305   sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data,
    306                                  sizeof(DataType));
    307 
    308   // Check that the CPU and GPU reductions return the same result.
    309   VERIFY_IS_APPROX(full_redux_gpu(), full_redux());
    310 
    311   sycl_device.deallocate(gpu_in_data);
    312   sycl_device.deallocate(gpu_out_data);
    313 }
    314 
    315 template <typename DataType, int DataLayout, typename IndexType>
    316 static void test_full_reductions_mean_with_odd_offset_sycl(
    317     const Eigen::SyclDevice& sycl_device) {
    318   // This is a particular case which illustrates a possible problem when the
    319   // number of local threads in a workgroup is even, but is not a power of two.
    320   using data_tensor = Tensor<DataType, 1, DataLayout, IndexType>;
    321   using scalar_tensor = Tensor<DataType, 0, DataLayout, IndexType>;
    322   // 2177 = (17 * 128) + 1 gives rise to 18 local threads.
    323   // 8708 = 4 * 2177 = 4 * (17 * 128) + 4 uses 18 vectorised local threads.
    324   const IndexType n_elems = 8707;
    325   array<IndexType, 1> tensor_range = {{n_elems}};
    326 
    327   data_tensor in(tensor_range);
    328   DataType full_redux;
    329   DataType full_redux_gpu;
    330   TensorMap<scalar_tensor> red_cpu(&full_redux);
    331   TensorMap<scalar_tensor> red_gpu(&full_redux_gpu);
    332 
    333   const DataType const_val = static_cast<DataType>(0.6391);
    334   in = in.constant(const_val);
    335 
    336   Eigen::IndexList<Eigen::type2index<0>> red_axis;
    337   red_cpu = in.reduce(red_axis, Eigen::internal::MeanReducer<DataType>());
    338   VERIFY_IS_APPROX(const_val, red_cpu());
    339 
    340   DataType* gpu_in_data =
    341       static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType)));
    342   DataType* gpu_out_data =
    343       static_cast<DataType*>(sycl_device.allocate(sizeof(DataType)));
    344 
    345   TensorMap<data_tensor> in_gpu(gpu_in_data, tensor_range);
    346   TensorMap<scalar_tensor> out_gpu(gpu_out_data);
    347   sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),
    348                                  n_elems * sizeof(DataType));
    349   out_gpu.device(sycl_device) =
    350       in_gpu.reduce(red_axis, Eigen::internal::MeanReducer<DataType>());
    351   sycl_device.memcpyDeviceToHost(red_gpu.data(), gpu_out_data,
    352                                  sizeof(DataType));
    353 
    354   // Check that the CPU and GPU reductions return the same result.
    355   VERIFY_IS_APPROX(full_redux_gpu, full_redux);
    356 
    357   sycl_device.deallocate(gpu_in_data);
    358   sycl_device.deallocate(gpu_out_data);
    359 }
    360 
    361 template <typename DataType, int DataLayout, typename IndexType>
    362 static void test_full_reductions_min_sycl(
    363     const Eigen::SyclDevice& sycl_device) {
    364   const IndexType num_rows = 876;
    365   const IndexType num_cols = 953;
    366   array<IndexType, 2> tensorRange = {{num_rows, num_cols}};
    367 
    368   Tensor<DataType, 2, DataLayout, IndexType> in(tensorRange);
    369   Tensor<DataType, 0, DataLayout, IndexType> full_redux;
    370   Tensor<DataType, 0, DataLayout, IndexType> full_redux_gpu;
    371 
    372   in.setRandom();
    373 
    374   full_redux = in.minimum();
    375 
    376   DataType* gpu_in_data = static_cast<DataType*>(
    377       sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
    378   DataType* gpu_out_data = (DataType*)sycl_device.allocate(sizeof(DataType));
    379 
    380   TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_gpu(gpu_in_data,
    381                                                                tensorRange);
    382   TensorMap<Tensor<DataType, 0, DataLayout, IndexType>> out_gpu(gpu_out_data);
    383 
    384   sycl_device.memcpyHostToDevice(
    385       gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
    386   out_gpu.device(sycl_device) = in_gpu.minimum();
    387   sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data,
    388                                  sizeof(DataType));
    389   // Check that the CPU and GPU reductions return the same result.
    390   VERIFY_IS_APPROX(full_redux_gpu(), full_redux());
    391   sycl_device.deallocate(gpu_in_data);
    392   sycl_device.deallocate(gpu_out_data);
    393 }
    394 
    395 template <typename DataType, int DataLayout, typename IndexType>
    396 static void test_full_reductions_min_with_offset_sycl(
    397     const Eigen::SyclDevice& sycl_device) {
    398   using data_tensor = Tensor<DataType, 2, DataLayout, IndexType>;
    399   using scalar_tensor = Tensor<DataType, 0, DataLayout, IndexType>;
    400   const IndexType num_rows = 64;
    401   const IndexType num_cols = 64;
    402   array<IndexType, 2> tensor_range = {{num_rows, num_cols}};
    403   const IndexType n_elems = internal::array_prod(tensor_range);
    404 
    405   data_tensor in(tensor_range);
    406   scalar_tensor full_redux;
    407   scalar_tensor full_redux_gpu;
    408 
    409   in.setRandom();
    410   array<IndexType, 2> tensor_offset_range(tensor_range);
    411   tensor_offset_range[0] -= 1;
    412   // Set the initial value to be the min.
    413   // As we don't include this in the reduction the result should not be -2.
    414   in(0) = static_cast<DataType>(-2);
    415 
    416   const IndexType offset = 64;
    417   TensorMap<data_tensor> in_offset(in.data() + offset, tensor_offset_range);
    418   full_redux = in_offset.minimum();
    419   VERIFY_IS_NOT_EQUAL(full_redux(), in(0));
    420 
    421   DataType* gpu_in_data =
    422       static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType)));
    423   DataType* gpu_out_data =
    424       static_cast<DataType*>(sycl_device.allocate(sizeof(DataType)));
    425 
    426   TensorMap<data_tensor> in_gpu(gpu_in_data + offset, tensor_offset_range);
    427   TensorMap<scalar_tensor> out_gpu(gpu_out_data);
    428   sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),
    429                                  n_elems * sizeof(DataType));
    430   out_gpu.device(sycl_device) = in_gpu.minimum();
    431   sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data,
    432                                  sizeof(DataType));
    433 
    434   // Check that the CPU and GPU reductions return the same result.
    435   VERIFY_IS_APPROX(full_redux_gpu(), full_redux());
    436 
    437   sycl_device.deallocate(gpu_in_data);
    438   sycl_device.deallocate(gpu_out_data);
    439 }
    440 template <typename DataType, int DataLayout, typename IndexType>
    441 static void test_first_dim_reductions_max_sycl(
    442     const Eigen::SyclDevice& sycl_device) {
    443   IndexType dim_x = 145;
    444   IndexType dim_y = 1;
    445   IndexType dim_z = 67;
    446 
    447   array<IndexType, 3> tensorRange = {{dim_x, dim_y, dim_z}};
    448   Eigen::array<IndexType, 1> red_axis;
    449   red_axis[0] = 0;
    450   array<IndexType, 2> reduced_tensorRange = {{dim_y, dim_z}};
    451 
    452   Tensor<DataType, 3, DataLayout, IndexType> in(tensorRange);
    453   Tensor<DataType, 2, DataLayout, IndexType> redux(reduced_tensorRange);
    454   Tensor<DataType, 2, DataLayout, IndexType> redux_gpu(reduced_tensorRange);
    455 
    456   in.setRandom();
    457 
    458   redux = in.maximum(red_axis);
    459 
    460   DataType* gpu_in_data = static_cast<DataType*>(
    461       sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
    462   DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(
    463       redux_gpu.dimensions().TotalSize() * sizeof(DataType)));
    464 
    465   TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> in_gpu(gpu_in_data,
    466                                                                tensorRange);
    467   TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> out_gpu(
    468       gpu_out_data, reduced_tensorRange);
    469 
    470   sycl_device.memcpyHostToDevice(
    471       gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
    472   out_gpu.device(sycl_device) = in_gpu.maximum(red_axis);
    473   sycl_device.memcpyDeviceToHost(
    474       redux_gpu.data(), gpu_out_data,
    475       redux_gpu.dimensions().TotalSize() * sizeof(DataType));
    476 
    477   // Check that the CPU and GPU reductions return the same result.
    478   for (IndexType j = 0; j < reduced_tensorRange[0]; j++)
    479     for (IndexType k = 0; k < reduced_tensorRange[1]; k++)
    480       VERIFY_IS_APPROX(redux_gpu(j, k), redux(j, k));
    481 
    482   sycl_device.deallocate(gpu_in_data);
    483   sycl_device.deallocate(gpu_out_data);
    484 }
    485 
    486 template <typename DataType, int DataLayout, typename IndexType>
    487 static void test_first_dim_reductions_max_with_offset_sycl(
    488     const Eigen::SyclDevice& sycl_device) {
    489   using data_tensor = Tensor<DataType, 2, DataLayout, IndexType>;
    490   using reduced_tensor = Tensor<DataType, 1, DataLayout, IndexType>;
    491 
    492   const IndexType num_rows = 64;
    493   const IndexType num_cols = 64;
    494   array<IndexType, 2> tensor_range = {{num_rows, num_cols}};
    495   array<IndexType, 1> reduced_range = {{num_cols}};
    496   const IndexType n_elems = internal::array_prod(tensor_range);
    497   const IndexType n_reduced = num_cols;
    498 
    499   data_tensor in(tensor_range);
    500   reduced_tensor redux;
    501   reduced_tensor redux_gpu(reduced_range);
    502 
    503   in.setRandom();
    504   array<IndexType, 2> tensor_offset_range(tensor_range);
    505   tensor_offset_range[0] -= 1;
    506   // Set maximum value outside of the considered range.
    507   for (IndexType i = 0; i < n_reduced; i++) {
    508     in(i) = static_cast<DataType>(2);
    509   }
    510 
    511   Eigen::array<IndexType, 1> red_axis;
    512   red_axis[0] = 0;
    513 
    514   const IndexType offset = 64;
    515   TensorMap<data_tensor> in_offset(in.data() + offset, tensor_offset_range);
    516   redux = in_offset.maximum(red_axis);
    517   for (IndexType i = 0; i < n_reduced; i++) {
    518     VERIFY_IS_NOT_EQUAL(redux(i), in(i));
    519   }
    520 
    521   DataType* gpu_in_data =
    522       static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType)));
    523   DataType* gpu_out_data = static_cast<DataType*>(
    524       sycl_device.allocate(n_reduced * sizeof(DataType)));
    525 
    526   TensorMap<data_tensor> in_gpu(gpu_in_data + offset, tensor_offset_range);
    527   TensorMap<reduced_tensor> out_gpu(gpu_out_data, reduced_range);
    528   sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),
    529                                  n_elems * sizeof(DataType));
    530   out_gpu.device(sycl_device) = in_gpu.maximum(red_axis);
    531   sycl_device.memcpyDeviceToHost(redux_gpu.data(), gpu_out_data,
    532                                  n_reduced * sizeof(DataType));
    533 
    534   // Check that the CPU and GPU reductions return the same result.
    535   for (IndexType i = 0; i < n_reduced; i++) {
    536     VERIFY_IS_APPROX(redux_gpu(i), redux(i));
    537   }
    538 
    539   sycl_device.deallocate(gpu_in_data);
    540   sycl_device.deallocate(gpu_out_data);
    541 }
    542 
    543 template <typename DataType, int DataLayout, typename IndexType>
    544 static void test_last_dim_reductions_max_with_offset_sycl(
    545     const Eigen::SyclDevice& sycl_device) {
    546   using data_tensor = Tensor<DataType, 2, DataLayout, IndexType>;
    547   using reduced_tensor = Tensor<DataType, 1, DataLayout, IndexType>;
    548 
    549   const IndexType num_rows = 64;
    550   const IndexType num_cols = 64;
    551   array<IndexType, 2> tensor_range = {{num_rows, num_cols}};
    552   array<IndexType, 1> full_reduced_range = {{num_rows}};
    553   array<IndexType, 1> reduced_range = {{num_rows - 1}};
    554   const IndexType n_elems = internal::array_prod(tensor_range);
    555   const IndexType n_reduced = reduced_range[0];
    556 
    557   data_tensor in(tensor_range);
    558   reduced_tensor redux(full_reduced_range);
    559   reduced_tensor redux_gpu(reduced_range);
    560 
    561   in.setRandom();
    562   redux.setZero();
    563   array<IndexType, 2> tensor_offset_range(tensor_range);
    564   tensor_offset_range[0] -= 1;
    565   // Set maximum value outside of the considered range.
    566   for (IndexType i = 0; i < n_reduced; i++) {
    567     in(i) = static_cast<DataType>(2);
    568   }
    569 
    570   Eigen::array<IndexType, 1> red_axis;
    571   red_axis[0] = 1;
    572 
    573   const IndexType offset = 64;
    574   // Introduce an offset in both the input and the output.
    575   TensorMap<data_tensor> in_offset(in.data() + offset, tensor_offset_range);
    576   TensorMap<reduced_tensor> red_offset(redux.data() + 1, reduced_range);
    577   red_offset = in_offset.maximum(red_axis);
    578 
    579   // Check that the first value hasn't been changed and that the reduced values
    580   // are not equal to the previously set maximum in the input outside the range.
    581   VERIFY_IS_EQUAL(redux(0), static_cast<DataType>(0));
    582   for (IndexType i = 0; i < n_reduced; i++) {
    583     VERIFY_IS_NOT_EQUAL(red_offset(i), in(i));
    584   }
    585 
    586   DataType* gpu_in_data =
    587       static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType)));
    588   DataType* gpu_out_data = static_cast<DataType*>(
    589       sycl_device.allocate((n_reduced + 1) * sizeof(DataType)));
    590 
    591   TensorMap<data_tensor> in_gpu(gpu_in_data + offset, tensor_offset_range);
    592   TensorMap<reduced_tensor> out_gpu(gpu_out_data + 1, reduced_range);
    593   sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),
    594                                  n_elems * sizeof(DataType));
    595   out_gpu.device(sycl_device) = in_gpu.maximum(red_axis);
    596   sycl_device.memcpyDeviceToHost(redux_gpu.data(), out_gpu.data(),
    597                                  n_reduced * sizeof(DataType));
    598 
    599   // Check that the CPU and GPU reductions return the same result.
    600   for (IndexType i = 0; i < n_reduced; i++) {
    601     VERIFY_IS_APPROX(redux_gpu(i), red_offset(i));
    602   }
    603 
    604   sycl_device.deallocate(gpu_in_data);
    605   sycl_device.deallocate(gpu_out_data);
    606 }
    607 
    608 template <typename DataType, int DataLayout, typename IndexType>
    609 static void test_first_dim_reductions_sum_sycl(
    610     const Eigen::SyclDevice& sycl_device, IndexType dim_x, IndexType dim_y) {
    611   array<IndexType, 2> tensorRange = {{dim_x, dim_y}};
    612   Eigen::array<IndexType, 1> red_axis;
    613   red_axis[0] = 0;
    614   array<IndexType, 1> reduced_tensorRange = {{dim_y}};
    615 
    616   Tensor<DataType, 2, DataLayout, IndexType> in(tensorRange);
    617   Tensor<DataType, 1, DataLayout, IndexType> redux(reduced_tensorRange);
    618   Tensor<DataType, 1, DataLayout, IndexType> redux_gpu(reduced_tensorRange);
    619 
    620   in.setRandom();
    621   redux = in.sum(red_axis);
    622 
    623   DataType* gpu_in_data = static_cast<DataType*>(
    624       sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
    625   DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(
    626       redux_gpu.dimensions().TotalSize() * sizeof(DataType)));
    627 
    628   TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_gpu(gpu_in_data,
    629                                                                tensorRange);
    630   TensorMap<Tensor<DataType, 1, DataLayout, IndexType>> out_gpu(
    631       gpu_out_data, reduced_tensorRange);
    632 
    633   sycl_device.memcpyHostToDevice(
    634       gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
    635   out_gpu.device(sycl_device) = in_gpu.sum(red_axis);
    636   sycl_device.memcpyDeviceToHost(
    637       redux_gpu.data(), gpu_out_data,
    638       redux_gpu.dimensions().TotalSize() * sizeof(DataType));
    639 
    640   // Check that the CPU and GPU reductions return the same result.
    641   for (IndexType i = 0; i < redux.size(); i++) {
    642     VERIFY_IS_APPROX(redux_gpu.data()[i], redux.data()[i]);
    643   }
    644   sycl_device.deallocate(gpu_in_data);
    645   sycl_device.deallocate(gpu_out_data);
    646 }
    647 
    648 template <typename DataType, int DataLayout, typename IndexType>
    649 static void test_first_dim_reductions_mean_sycl(
    650     const Eigen::SyclDevice& sycl_device) {
    651   IndexType dim_x = 145;
    652   IndexType dim_y = 1;
    653   IndexType dim_z = 67;
    654 
    655   array<IndexType, 3> tensorRange = {{dim_x, dim_y, dim_z}};
    656   Eigen::array<IndexType, 1> red_axis;
    657   red_axis[0] = 0;
    658   array<IndexType, 2> reduced_tensorRange = {{dim_y, dim_z}};
    659 
    660   Tensor<DataType, 3, DataLayout, IndexType> in(tensorRange);
    661   Tensor<DataType, 2, DataLayout, IndexType> redux(reduced_tensorRange);
    662   Tensor<DataType, 2, DataLayout, IndexType> redux_gpu(reduced_tensorRange);
    663 
    664   in.setRandom();
    665 
    666   redux = in.mean(red_axis);
    667 
    668   DataType* gpu_in_data = static_cast<DataType*>(
    669       sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
    670   DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(
    671       redux_gpu.dimensions().TotalSize() * sizeof(DataType)));
    672 
    673   TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> in_gpu(gpu_in_data,
    674                                                                tensorRange);
    675   TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> out_gpu(
    676       gpu_out_data, reduced_tensorRange);
    677 
    678   sycl_device.memcpyHostToDevice(
    679       gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
    680   out_gpu.device(sycl_device) = in_gpu.mean(red_axis);
    681   sycl_device.memcpyDeviceToHost(
    682       redux_gpu.data(), gpu_out_data,
    683       redux_gpu.dimensions().TotalSize() * sizeof(DataType));
    684 
    685   // Check that the CPU and GPU reductions return the same result.
    686   for (IndexType j = 0; j < reduced_tensorRange[0]; j++)
    687     for (IndexType k = 0; k < reduced_tensorRange[1]; k++)
    688       VERIFY_IS_APPROX(redux_gpu(j, k), redux(j, k));
    689 
    690   sycl_device.deallocate(gpu_in_data);
    691   sycl_device.deallocate(gpu_out_data);
    692 }
    693 
    694 template <typename DataType, int DataLayout, typename IndexType>
    695 static void test_last_dim_reductions_mean_sycl(
    696     const Eigen::SyclDevice& sycl_device) {
    697   IndexType dim_x = 64;
    698   IndexType dim_y = 1;
    699   IndexType dim_z = 32;
    700 
    701   array<IndexType, 3> tensorRange = {{dim_x, dim_y, dim_z}};
    702   Eigen::array<IndexType, 1> red_axis;
    703   red_axis[0] = 2;
    704   array<IndexType, 2> reduced_tensorRange = {{dim_x, dim_y}};
    705 
    706   Tensor<DataType, 3, DataLayout, IndexType> in(tensorRange);
    707   Tensor<DataType, 2, DataLayout, IndexType> redux(reduced_tensorRange);
    708   Tensor<DataType, 2, DataLayout, IndexType> redux_gpu(reduced_tensorRange);
    709 
    710   in.setRandom();
    711 
    712   redux = in.mean(red_axis);
    713 
    714   DataType* gpu_in_data = static_cast<DataType*>(
    715       sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
    716   DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(
    717       redux_gpu.dimensions().TotalSize() * sizeof(DataType)));
    718 
    719   TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> in_gpu(gpu_in_data,
    720                                                                tensorRange);
    721   TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> out_gpu(
    722       gpu_out_data, reduced_tensorRange);
    723 
    724   sycl_device.memcpyHostToDevice(
    725       gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
    726   out_gpu.device(sycl_device) = in_gpu.mean(red_axis);
    727   sycl_device.memcpyDeviceToHost(
    728       redux_gpu.data(), gpu_out_data,
    729       redux_gpu.dimensions().TotalSize() * sizeof(DataType));
    730   // Check that the CPU and GPU reductions return the same result.
    731   for (IndexType j = 0; j < reduced_tensorRange[0]; j++)
    732     for (IndexType k = 0; k < reduced_tensorRange[1]; k++)
    733       VERIFY_IS_APPROX(redux_gpu(j, k), redux(j, k));
    734 
    735   sycl_device.deallocate(gpu_in_data);
    736   sycl_device.deallocate(gpu_out_data);
    737 }
    738 
    739 template <typename DataType, int DataLayout, typename IndexType>
    740 static void test_last_dim_reductions_sum_sycl(
    741     const Eigen::SyclDevice& sycl_device) {
    742   IndexType dim_x = 64;
    743   IndexType dim_y = 1;
    744   IndexType dim_z = 32;
    745 
    746   array<IndexType, 3> tensorRange = {{dim_x, dim_y, dim_z}};
    747   Eigen::array<IndexType, 1> red_axis;
    748   red_axis[0] = 2;
    749   array<IndexType, 2> reduced_tensorRange = {{dim_x, dim_y}};
    750 
    751   Tensor<DataType, 3, DataLayout, IndexType> in(tensorRange);
    752   Tensor<DataType, 2, DataLayout, IndexType> redux(reduced_tensorRange);
    753   Tensor<DataType, 2, DataLayout, IndexType> redux_gpu(reduced_tensorRange);
    754 
    755   in.setRandom();
    756 
    757   redux = in.sum(red_axis);
    758 
    759   DataType* gpu_in_data = static_cast<DataType*>(
    760       sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
    761   DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(
    762       redux_gpu.dimensions().TotalSize() * sizeof(DataType)));
    763 
    764   TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> in_gpu(gpu_in_data,
    765                                                                tensorRange);
    766   TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> out_gpu(
    767       gpu_out_data, reduced_tensorRange);
    768 
    769   sycl_device.memcpyHostToDevice(
    770       gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
    771   out_gpu.device(sycl_device) = in_gpu.sum(red_axis);
    772   sycl_device.memcpyDeviceToHost(
    773       redux_gpu.data(), gpu_out_data,
    774       redux_gpu.dimensions().TotalSize() * sizeof(DataType));
    775   // Check that the CPU and GPU reductions return the same result.
    776   for (IndexType j = 0; j < reduced_tensorRange[0]; j++)
    777     for (IndexType k = 0; k < reduced_tensorRange[1]; k++)
    778       VERIFY_IS_APPROX(redux_gpu(j, k), redux(j, k));
    779 
    780   sycl_device.deallocate(gpu_in_data);
    781   sycl_device.deallocate(gpu_out_data);
    782 }
    783 
    784 template <typename DataType, int DataLayout, typename IndexType>
    785 static void test_last_reductions_sum_sycl(
    786     const Eigen::SyclDevice& sycl_device) {
    787   auto tensorRange = Sizes<64, 32>(64, 32);
    788   // auto red_axis =  Sizes<0,1>(0,1);
    789   Eigen::IndexList<Eigen::type2index<1>> red_axis;
    790   auto reduced_tensorRange = Sizes<64>(64);
    791   TensorFixedSize<DataType, Sizes<64, 32>, DataLayout> in_fix;
    792   TensorFixedSize<DataType, Sizes<64>, DataLayout> redux_fix;
    793   TensorFixedSize<DataType, Sizes<64>, DataLayout> redux_gpu_fix;
    794 
    795   in_fix.setRandom();
    796 
    797   redux_fix = in_fix.sum(red_axis);
    798 
    799   DataType* gpu_in_data = static_cast<DataType*>(
    800       sycl_device.allocate(in_fix.dimensions().TotalSize() * sizeof(DataType)));
    801   DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(
    802       redux_gpu_fix.dimensions().TotalSize() * sizeof(DataType)));
    803 
    804   TensorMap<TensorFixedSize<DataType, Sizes<64, 32>, DataLayout>> in_gpu_fix(
    805       gpu_in_data, tensorRange);
    806   TensorMap<TensorFixedSize<DataType, Sizes<64>, DataLayout>> out_gpu_fix(
    807       gpu_out_data, reduced_tensorRange);
    808 
    809   sycl_device.memcpyHostToDevice(
    810       gpu_in_data, in_fix.data(),
    811       (in_fix.dimensions().TotalSize()) * sizeof(DataType));
    812   out_gpu_fix.device(sycl_device) = in_gpu_fix.sum(red_axis);
    813   sycl_device.memcpyDeviceToHost(
    814       redux_gpu_fix.data(), gpu_out_data,
    815       redux_gpu_fix.dimensions().TotalSize() * sizeof(DataType));
    816   // Check that the CPU and GPU reductions return the same result.
    817   for (IndexType j = 0; j < reduced_tensorRange[0]; j++) {
    818     VERIFY_IS_APPROX(redux_gpu_fix(j), redux_fix(j));
    819   }
    820 
    821   sycl_device.deallocate(gpu_in_data);
    822   sycl_device.deallocate(gpu_out_data);
    823 }
    824 
    825 template <typename DataType, int DataLayout, typename IndexType>
    826 static void test_last_reductions_mean_sycl(
    827     const Eigen::SyclDevice& sycl_device) {
    828   auto tensorRange = Sizes<64, 32>(64, 32);
    829   Eigen::IndexList<Eigen::type2index<1>> red_axis;
    830   auto reduced_tensorRange = Sizes<64>(64);
    831   TensorFixedSize<DataType, Sizes<64, 32>, DataLayout> in_fix;
    832   TensorFixedSize<DataType, Sizes<64>, DataLayout> redux_fix;
    833   TensorFixedSize<DataType, Sizes<64>, DataLayout> redux_gpu_fix;
    834 
    835   in_fix.setRandom();
    836   redux_fix = in_fix.mean(red_axis);
    837 
    838   DataType* gpu_in_data = static_cast<DataType*>(
    839       sycl_device.allocate(in_fix.dimensions().TotalSize() * sizeof(DataType)));
    840   DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(
    841       redux_gpu_fix.dimensions().TotalSize() * sizeof(DataType)));
    842 
    843   TensorMap<TensorFixedSize<DataType, Sizes<64, 32>, DataLayout>> in_gpu_fix(
    844       gpu_in_data, tensorRange);
    845   TensorMap<TensorFixedSize<DataType, Sizes<64>, DataLayout>> out_gpu_fix(
    846       gpu_out_data, reduced_tensorRange);
    847 
    848   sycl_device.memcpyHostToDevice(
    849       gpu_in_data, in_fix.data(),
    850       (in_fix.dimensions().TotalSize()) * sizeof(DataType));
    851   out_gpu_fix.device(sycl_device) = in_gpu_fix.mean(red_axis);
    852   sycl_device.memcpyDeviceToHost(
    853       redux_gpu_fix.data(), gpu_out_data,
    854       redux_gpu_fix.dimensions().TotalSize() * sizeof(DataType));
    855   sycl_device.synchronize();
    856   // Check that the CPU and GPU reductions return the same result.
    857   for (IndexType j = 0; j < reduced_tensorRange[0]; j++) {
    858     VERIFY_IS_APPROX(redux_gpu_fix(j), redux_fix(j));
    859   }
    860 
    861   sycl_device.deallocate(gpu_in_data);
    862   sycl_device.deallocate(gpu_out_data);
    863 }
    864 
    865 // SYCL supports a generic case of reduction where the accumulator is a
    866 // different type than the input data This is an example on how to get if a
    867 // Tensor contains nan and/or inf in one reduction
    868 template <typename InT, typename OutT>
    869 struct CustomReducer {
    870   static const bool PacketAccess = false;
    871   static const bool IsStateful = false;
    872 
    873   static constexpr OutT InfBit = 1;
    874   static constexpr OutT NanBit = 2;
    875 
    876   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const InT x,
    877                                                     OutT* accum) const {
    878     if (Eigen::numext::isinf(x))
    879       *accum |= InfBit;
    880     else if (Eigen::numext::isnan(x))
    881       *accum |= NanBit;
    882   }
    883 
    884   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const OutT x,
    885                                                     OutT* accum) const {
    886     *accum |= x;
    887   }
    888 
    889   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE OutT initialize() const {
    890     return OutT(0);
    891   }
    892 
    893   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE OutT finalize(const OutT accum) const {
    894     return accum;
    895   }
    896 };
    897 
    898 template <typename DataType, typename AccumType, int DataLayout,
    899           typename IndexType>
    900 static void test_full_reductions_custom_sycl(
    901     const Eigen::SyclDevice& sycl_device) {
    902   constexpr IndexType InSize = 64;
    903   auto tensorRange = Sizes<InSize>(InSize);
    904   Eigen::IndexList<Eigen::type2index<0>> dims;
    905   auto reduced_tensorRange = Sizes<>();
    906   TensorFixedSize<DataType, Sizes<InSize>, DataLayout> in_fix;
    907   TensorFixedSize<AccumType, Sizes<>, DataLayout> redux_gpu_fix;
    908 
    909   CustomReducer<DataType, AccumType> reducer;
    910 
    911   in_fix.setRandom();
    912 
    913   size_t in_size_bytes = in_fix.dimensions().TotalSize() * sizeof(DataType);
    914   DataType* gpu_in_data =
    915       static_cast<DataType*>(sycl_device.allocate(in_size_bytes));
    916   AccumType* gpu_out_data =
    917       static_cast<AccumType*>(sycl_device.allocate(sizeof(AccumType)));
    918 
    919   TensorMap<TensorFixedSize<DataType, Sizes<InSize>, DataLayout>> in_gpu_fix(
    920       gpu_in_data, tensorRange);
    921   TensorMap<TensorFixedSize<AccumType, Sizes<>, DataLayout>> out_gpu_fix(
    922       gpu_out_data, reduced_tensorRange);
    923 
    924   sycl_device.memcpyHostToDevice(gpu_in_data, in_fix.data(), in_size_bytes);
    925   out_gpu_fix.device(sycl_device) = in_gpu_fix.reduce(dims, reducer);
    926   sycl_device.memcpyDeviceToHost(redux_gpu_fix.data(), gpu_out_data,
    927                                  sizeof(AccumType));
    928   VERIFY_IS_EQUAL(redux_gpu_fix(0), AccumType(0));
    929 
    930   sycl_device.deallocate(gpu_in_data);
    931   sycl_device.deallocate(gpu_out_data);
    932 }
    933 
    934 template <typename DataType, typename Dev>
    935 void sycl_reduction_test_full_per_device(const Dev& sycl_device) {
    936   test_full_reductions_sum_sycl<DataType, RowMajor, int64_t>(sycl_device);
    937   test_full_reductions_sum_sycl<DataType, ColMajor, int64_t>(sycl_device);
    938   test_full_reductions_min_sycl<DataType, ColMajor, int64_t>(sycl_device);
    939   test_full_reductions_min_sycl<DataType, RowMajor, int64_t>(sycl_device);
    940   test_full_reductions_max_sycl<DataType, ColMajor, int64_t>(sycl_device);
    941   test_full_reductions_max_sycl<DataType, RowMajor, int64_t>(sycl_device);
    942 
    943   test_full_reductions_mean_sycl<DataType, ColMajor, int64_t>(sycl_device);
    944   test_full_reductions_mean_sycl<DataType, RowMajor, int64_t>(sycl_device);
    945   test_full_reductions_custom_sycl<DataType, int, RowMajor, int64_t>(
    946       sycl_device);
    947   test_full_reductions_custom_sycl<DataType, int, ColMajor, int64_t>(
    948       sycl_device);
    949   sycl_device.synchronize();
    950 }
    951 
    952 template <typename DataType, typename Dev>
    953 void sycl_reduction_full_offset_per_device(const Dev& sycl_device) {
    954   test_full_reductions_sum_with_offset_sycl<DataType, RowMajor, int64_t>(
    955       sycl_device);
    956   test_full_reductions_sum_with_offset_sycl<DataType, ColMajor, int64_t>(
    957       sycl_device);
    958   test_full_reductions_min_with_offset_sycl<DataType, RowMajor, int64_t>(
    959       sycl_device);
    960   test_full_reductions_min_with_offset_sycl<DataType, ColMajor, int64_t>(
    961       sycl_device);
    962   test_full_reductions_max_with_offset_sycl<DataType, ColMajor, int64_t>(
    963       sycl_device);
    964   test_full_reductions_max_with_offset_sycl<DataType, RowMajor, int64_t>(
    965       sycl_device);
    966   test_full_reductions_mean_with_offset_sycl<DataType, RowMajor, int64_t>(
    967       sycl_device);
    968   test_full_reductions_mean_with_offset_sycl<DataType, ColMajor, int64_t>(
    969       sycl_device);
    970   test_full_reductions_mean_with_odd_offset_sycl<DataType, RowMajor, int64_t>(
    971       sycl_device);
    972   sycl_device.synchronize();
    973 }
    974 
    975 template <typename DataType, typename Dev>
    976 void sycl_reduction_test_first_dim_per_device(const Dev& sycl_device) {
    977   test_first_dim_reductions_sum_sycl<DataType, ColMajor, int64_t>(sycl_device,
    978                                                                   4197, 4097);
    979   test_first_dim_reductions_sum_sycl<DataType, RowMajor, int64_t>(sycl_device,
    980                                                                   4197, 4097);
    981   test_first_dim_reductions_sum_sycl<DataType, RowMajor, int64_t>(sycl_device,
    982                                                                   129, 8);
    983   test_first_dim_reductions_max_sycl<DataType, RowMajor, int64_t>(sycl_device);
    984   test_first_dim_reductions_max_with_offset_sycl<DataType, RowMajor, int64_t>(
    985       sycl_device);
    986   sycl_device.synchronize();
    987 }
    988 
    989 template <typename DataType, typename Dev>
    990 void sycl_reduction_test_last_dim_per_device(const Dev& sycl_device) {
    991   test_last_dim_reductions_sum_sycl<DataType, RowMajor, int64_t>(sycl_device);
    992   test_last_dim_reductions_max_with_offset_sycl<DataType, RowMajor, int64_t>(
    993       sycl_device);
    994   test_last_reductions_sum_sycl<DataType, ColMajor, int64_t>(sycl_device);
    995   test_last_reductions_sum_sycl<DataType, RowMajor, int64_t>(sycl_device);
    996   test_last_reductions_mean_sycl<DataType, ColMajor, int64_t>(sycl_device);
    997   test_last_reductions_mean_sycl<DataType, RowMajor, int64_t>(sycl_device);
    998   sycl_device.synchronize();
    999 }
   1000 
   1001 EIGEN_DECLARE_TEST(cxx11_tensor_reduction_sycl) {
   1002   for (const auto& device : Eigen::get_sycl_supported_devices()) {
   1003     std::cout << "Running on "
   1004               << device.template get_info<cl::sycl::info::device::name>()
   1005               << std::endl;
   1006     QueueInterface queueInterface(device);
   1007     auto sycl_device = Eigen::SyclDevice(&queueInterface);
   1008     CALL_SUBTEST_1(sycl_reduction_test_full_per_device<float>(sycl_device));
   1009     CALL_SUBTEST_2(sycl_reduction_full_offset_per_device<float>(sycl_device));
   1010     CALL_SUBTEST_3(
   1011         sycl_reduction_test_first_dim_per_device<float>(sycl_device));
   1012     CALL_SUBTEST_4(sycl_reduction_test_last_dim_per_device<float>(sycl_device));
   1013   }
   1014 }