cart-elc

Source code for CART-ELC
git clone git://git.laack.co/cart-elc.git
Log | Files | Refs | README | LICENSE

cxx11_tensor_builtins_sycl.cpp (15767B)


      1 // This file is part of Eigen, a lightweight C++ template library
      2 // for linear algebra.
      3 //
      4 // Copyright (C) 2016
      5 // Mehdi Goli    Codeplay Software Ltd.
      6 // Ralph Potter  Codeplay Software Ltd.
      7 // Luke Iwanski  Codeplay Software Ltd.
      8 // Contact: <eigen@codeplay.com>
      9 //
     10 // This Source Code Form is subject to the terms of the Mozilla
     11 // Public License v. 2.0. If a copy of the MPL was not distributed
     12 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
     13 
     14 #define EIGEN_TEST_NO_LONGDOUBLE
     15 #define EIGEN_TEST_NO_COMPLEX
     16 
     17 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
     18 #define EIGEN_USE_SYCL
     19 
     20 #include "main.h"
     21 #include <unsupported/Eigen/CXX11/Tensor>
     22 
     23 using Eigen::array;
     24 using Eigen::SyclDevice;
     25 using Eigen::Tensor;
     26 using Eigen::TensorMap;
     27 
     28 // Functions used to compare the TensorMap implementation on the device with
     29 // the equivalent on the host
     30 namespace cl {
     31 namespace sycl {
     32 template <typename T> T abs(T x) { return cl::sycl::fabs(x); }
     33 template <typename T> T square(T x) { return x * x; }
     34 template <typename T> T cube(T x) { return x * x * x; }
     35 template <typename T> T inverse(T x) { return T(1) / x; }
     36 template <typename T> T cwiseMax(T x, T y) { return cl::sycl::max(x, y); }
     37 template <typename T> T cwiseMin(T x, T y) { return cl::sycl::min(x, y); }
     38 }
     39 }
     40 
     41 struct EqualAssignement {
     42   template <typename Lhs, typename Rhs>
     43   void operator()(Lhs& lhs, const Rhs& rhs) { lhs = rhs; }
     44 };
     45 
     46 struct PlusEqualAssignement {
     47   template <typename Lhs, typename Rhs>
     48   void operator()(Lhs& lhs, const Rhs& rhs) { lhs += rhs; }
     49 };
     50 
     51 template <typename DataType, int DataLayout,
     52           typename Assignement, typename Operator>
     53 void test_unary_builtins_for_scalar(const Eigen::SyclDevice& sycl_device,
     54                                     const array<int64_t, 3>& tensor_range) {
     55   Operator op;
     56   Assignement asgn;
     57   {
     58     /* Assignement(out, Operator(in)) */
     59     Tensor<DataType, 3, DataLayout, int64_t> in(tensor_range);
     60     Tensor<DataType, 3, DataLayout, int64_t> out(tensor_range);
     61     in = in.random() + DataType(0.01);
     62     out = out.random() + DataType(0.01);
     63     Tensor<DataType, 3, DataLayout, int64_t> reference(out);
     64     DataType *gpu_data = static_cast<DataType *>(
     65         sycl_device.allocate(in.size() * sizeof(DataType)));
     66     DataType *gpu_data_out = static_cast<DataType *>(
     67         sycl_device.allocate(out.size() * sizeof(DataType)));
     68     TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu(gpu_data, tensor_range);
     69     TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu_out(gpu_data_out, tensor_range);
     70     sycl_device.memcpyHostToDevice(gpu_data, in.data(),
     71                                    (in.size()) * sizeof(DataType));
     72     sycl_device.memcpyHostToDevice(gpu_data_out, out.data(),
     73                                    (out.size()) * sizeof(DataType));
     74     auto device_expr = gpu_out.device(sycl_device);
     75     asgn(device_expr, op(gpu));
     76     sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out,
     77                                    (out.size()) * sizeof(DataType));
     78     for (int64_t i = 0; i < out.size(); ++i) {
     79       DataType ver = reference(i);
     80       asgn(ver, op(in(i)));
     81       VERIFY_IS_APPROX(out(i), ver);
     82     }
     83     sycl_device.deallocate(gpu_data);
     84     sycl_device.deallocate(gpu_data_out);
     85   }
     86   {
     87     /* Assignement(out, Operator(out)) */
     88     Tensor<DataType, 3, DataLayout, int64_t> out(tensor_range);
     89     out = out.random() + DataType(0.01);
     90     Tensor<DataType, 3, DataLayout, int64_t> reference(out);
     91     DataType *gpu_data_out = static_cast<DataType *>(
     92         sycl_device.allocate(out.size() * sizeof(DataType)));
     93     TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu_out(gpu_data_out, tensor_range);
     94     sycl_device.memcpyHostToDevice(gpu_data_out, out.data(),
     95                                    (out.size()) * sizeof(DataType));
     96     auto device_expr = gpu_out.device(sycl_device);
     97     asgn(device_expr, op(gpu_out));
     98     sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out,
     99                                    (out.size()) * sizeof(DataType));
    100     for (int64_t i = 0; i < out.size(); ++i) {
    101       DataType ver = reference(i);
    102       asgn(ver, op(reference(i)));
    103       VERIFY_IS_APPROX(out(i), ver);
    104     }
    105     sycl_device.deallocate(gpu_data_out);
    106   }
    107 }
    108 
    109 #define DECLARE_UNARY_STRUCT(FUNC)                                 \
    110   struct op_##FUNC {                                               \
    111     template <typename T>                                          \
    112     auto operator()(const T& x) -> decltype(cl::sycl::FUNC(x)) {   \
    113       return cl::sycl::FUNC(x);                                    \
    114     }                                                              \
    115     template <typename T>                                          \
    116     auto operator()(const TensorMap<T>& x) -> decltype(x.FUNC()) { \
    117       return x.FUNC();                                             \
    118     }                                                              \
    119   };
    120 
    121 DECLARE_UNARY_STRUCT(abs)
    122 DECLARE_UNARY_STRUCT(sqrt)
    123 DECLARE_UNARY_STRUCT(rsqrt)
    124 DECLARE_UNARY_STRUCT(square)
    125 DECLARE_UNARY_STRUCT(cube)
    126 DECLARE_UNARY_STRUCT(inverse)
    127 DECLARE_UNARY_STRUCT(tanh)
    128 DECLARE_UNARY_STRUCT(exp)
    129 DECLARE_UNARY_STRUCT(expm1)
    130 DECLARE_UNARY_STRUCT(log)
    131 DECLARE_UNARY_STRUCT(ceil)
    132 DECLARE_UNARY_STRUCT(floor)
    133 DECLARE_UNARY_STRUCT(round)
    134 DECLARE_UNARY_STRUCT(log1p)
    135 DECLARE_UNARY_STRUCT(sign)
    136 DECLARE_UNARY_STRUCT(isnan)
    137 DECLARE_UNARY_STRUCT(isfinite)
    138 DECLARE_UNARY_STRUCT(isinf)
    139 
    140 template <typename DataType, int DataLayout, typename Assignement>
    141 void test_unary_builtins_for_assignement(const Eigen::SyclDevice& sycl_device,
    142                                          const array<int64_t, 3>& tensor_range) {
    143 #define RUN_UNARY_TEST(FUNC) \
    144   test_unary_builtins_for_scalar<DataType, DataLayout, Assignement, \
    145                                  op_##FUNC>(sycl_device, tensor_range)
    146   RUN_UNARY_TEST(abs);
    147   RUN_UNARY_TEST(sqrt);
    148   RUN_UNARY_TEST(rsqrt);
    149   RUN_UNARY_TEST(square);
    150   RUN_UNARY_TEST(cube);
    151   RUN_UNARY_TEST(inverse);
    152   RUN_UNARY_TEST(tanh);
    153   RUN_UNARY_TEST(exp);
    154   RUN_UNARY_TEST(expm1);
    155   RUN_UNARY_TEST(log);
    156   RUN_UNARY_TEST(ceil);
    157   RUN_UNARY_TEST(floor);
    158   RUN_UNARY_TEST(round);
    159   RUN_UNARY_TEST(log1p);
    160   RUN_UNARY_TEST(sign);
    161 }
    162 
    163 template <typename DataType, int DataLayout, typename Operator>
    164 void test_unary_builtins_return_bool(const Eigen::SyclDevice& sycl_device,
    165                                      const array<int64_t, 3>& tensor_range) {
    166   /* out = op(in) */
    167   Operator op;
    168   Tensor<DataType, 3, DataLayout, int64_t> in(tensor_range);
    169   Tensor<bool, 3, DataLayout, int64_t> out(tensor_range);
    170   in = in.random() + DataType(0.01);
    171   DataType *gpu_data = static_cast<DataType *>(
    172       sycl_device.allocate(in.size() * sizeof(DataType)));
    173   bool *gpu_data_out =
    174       static_cast<bool *>(sycl_device.allocate(out.size() * sizeof(bool)));
    175   TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu(gpu_data, tensor_range);
    176   TensorMap<Tensor<bool, 3, DataLayout, int64_t>> gpu_out(gpu_data_out, tensor_range);
    177   sycl_device.memcpyHostToDevice(gpu_data, in.data(),
    178                                  (in.size()) * sizeof(DataType));
    179   gpu_out.device(sycl_device) = op(gpu);
    180   sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out,
    181                                  (out.size()) * sizeof(bool));
    182   for (int64_t i = 0; i < out.size(); ++i) {
    183     VERIFY_IS_EQUAL(out(i), op(in(i)));
    184   }
    185   sycl_device.deallocate(gpu_data);
    186   sycl_device.deallocate(gpu_data_out);
    187 }
    188 
    189 template <typename DataType, int DataLayout>
    190 void test_unary_builtins(const Eigen::SyclDevice& sycl_device,
    191                          const array<int64_t, 3>& tensor_range) {
    192   test_unary_builtins_for_assignement<DataType, DataLayout,
    193                                       PlusEqualAssignement>(sycl_device, tensor_range);
    194   test_unary_builtins_for_assignement<DataType, DataLayout,
    195                                       EqualAssignement>(sycl_device, tensor_range);
    196   test_unary_builtins_return_bool<DataType, DataLayout,
    197                                   op_isnan>(sycl_device, tensor_range);
    198   test_unary_builtins_return_bool<DataType, DataLayout,
    199                                   op_isfinite>(sycl_device, tensor_range);
    200   test_unary_builtins_return_bool<DataType, DataLayout,
    201                                   op_isinf>(sycl_device, tensor_range);
    202 }
    203 
    204 template <typename DataType>
    205 static void test_builtin_unary_sycl(const Eigen::SyclDevice &sycl_device) {
    206   int64_t sizeDim1 = 10;
    207   int64_t sizeDim2 = 10;
    208   int64_t sizeDim3 = 10;
    209   array<int64_t, 3> tensor_range = {{sizeDim1, sizeDim2, sizeDim3}};
    210 
    211   test_unary_builtins<DataType, RowMajor>(sycl_device, tensor_range);
    212   test_unary_builtins<DataType, ColMajor>(sycl_device, tensor_range);
    213 }
    214 
    215 template <typename DataType, int DataLayout, typename Operator>
    216 void test_binary_builtins_func(const Eigen::SyclDevice& sycl_device,
    217                                const array<int64_t, 3>& tensor_range) {
    218   /* out = op(in_1, in_2) */
    219   Operator op;
    220   Tensor<DataType, 3, DataLayout, int64_t> in_1(tensor_range);
    221   Tensor<DataType, 3, DataLayout, int64_t> in_2(tensor_range);
    222   Tensor<DataType, 3, DataLayout, int64_t> out(tensor_range);
    223   in_1 = in_1.random() + DataType(0.01);
    224   in_2 = in_2.random() + DataType(0.01);
    225   Tensor<DataType, 3, DataLayout, int64_t> reference(out);
    226   DataType *gpu_data_1 = static_cast<DataType *>(
    227       sycl_device.allocate(in_1.size() * sizeof(DataType)));
    228   DataType *gpu_data_2 = static_cast<DataType *>(
    229       sycl_device.allocate(in_2.size() * sizeof(DataType)));
    230   DataType *gpu_data_out = static_cast<DataType *>(
    231       sycl_device.allocate(out.size() * sizeof(DataType)));
    232   TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu_1(gpu_data_1, tensor_range);
    233   TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu_2(gpu_data_2, tensor_range);
    234   TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu_out(gpu_data_out, tensor_range);
    235   sycl_device.memcpyHostToDevice(gpu_data_1, in_1.data(),
    236                                  (in_1.size()) * sizeof(DataType));
    237   sycl_device.memcpyHostToDevice(gpu_data_2, in_2.data(),
    238                                  (in_2.size()) * sizeof(DataType));
    239   gpu_out.device(sycl_device) = op(gpu_1, gpu_2);
    240   sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out,
    241                                  (out.size()) * sizeof(DataType));
    242   for (int64_t i = 0; i < out.size(); ++i) {
    243     VERIFY_IS_APPROX(out(i), op(in_1(i), in_2(i)));
    244   }
    245   sycl_device.deallocate(gpu_data_1);
    246   sycl_device.deallocate(gpu_data_2);
    247   sycl_device.deallocate(gpu_data_out);
    248 }
    249 
    250 template <typename DataType, int DataLayout, typename Operator>
    251 void test_binary_builtins_fixed_arg2(const Eigen::SyclDevice& sycl_device,
    252                                      const array<int64_t, 3>& tensor_range) {
    253   /* out = op(in_1, 2) */
    254   Operator op;
    255   const DataType arg2(2);
    256   Tensor<DataType, 3, DataLayout, int64_t> in_1(tensor_range);
    257   Tensor<DataType, 3, DataLayout, int64_t> out(tensor_range);
    258   in_1 = in_1.random();
    259   Tensor<DataType, 3, DataLayout, int64_t> reference(out);
    260   DataType *gpu_data_1 = static_cast<DataType *>(
    261       sycl_device.allocate(in_1.size() * sizeof(DataType)));
    262   DataType *gpu_data_out = static_cast<DataType *>(
    263       sycl_device.allocate(out.size() * sizeof(DataType)));
    264   TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu_1(gpu_data_1, tensor_range);
    265   TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu_out(gpu_data_out, tensor_range);
    266   sycl_device.memcpyHostToDevice(gpu_data_1, in_1.data(),
    267                                  (in_1.size()) * sizeof(DataType));
    268   gpu_out.device(sycl_device) = op(gpu_1, arg2);
    269   sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out,
    270                                  (out.size()) * sizeof(DataType));
    271   for (int64_t i = 0; i < out.size(); ++i) {
    272     VERIFY_IS_APPROX(out(i), op(in_1(i), arg2));
    273   }
    274   sycl_device.deallocate(gpu_data_1);
    275   sycl_device.deallocate(gpu_data_out);
    276 }
    277 
    278 #define DECLARE_BINARY_STRUCT(FUNC)                                                          \
    279   struct op_##FUNC {                                                                         \
    280     template <typename T1, typename T2>                                                      \
    281     auto operator()(const T1& x, const T2& y) -> decltype(cl::sycl::FUNC(x, y)) {            \
    282       return cl::sycl::FUNC(x, y);                                                           \
    283     }                                                                                        \
    284     template <typename T1, typename T2>                                                      \
    285     auto operator()(const TensorMap<T1>& x, const TensorMap<T2>& y) -> decltype(x.FUNC(y)) { \
    286       return x.FUNC(y);                                                                      \
    287     }                                                                                        \
    288   };
    289 
    290 DECLARE_BINARY_STRUCT(cwiseMax)
    291 DECLARE_BINARY_STRUCT(cwiseMin)
    292 
    293 #define DECLARE_BINARY_STRUCT_OP(NAME, OPERATOR)                          \
    294   struct op_##NAME {                                                      \
    295     template <typename T1, typename T2>                                   \
    296     auto operator()(const T1& x, const T2& y) -> decltype(x OPERATOR y) { \
    297       return x OPERATOR y;                                                \
    298     }                                                                     \
    299   };
    300 
    301 DECLARE_BINARY_STRUCT_OP(plus, +)
    302 DECLARE_BINARY_STRUCT_OP(minus, -)
    303 DECLARE_BINARY_STRUCT_OP(times, *)
    304 DECLARE_BINARY_STRUCT_OP(divide, /)
    305 DECLARE_BINARY_STRUCT_OP(modulo, %)
    306 
    307 template <typename DataType, int DataLayout>
    308 void test_binary_builtins(const Eigen::SyclDevice& sycl_device,
    309                           const array<int64_t, 3>& tensor_range) {
    310   test_binary_builtins_func<DataType, DataLayout,
    311                             op_cwiseMax>(sycl_device, tensor_range);
    312   test_binary_builtins_func<DataType, DataLayout,
    313                             op_cwiseMin>(sycl_device, tensor_range);
    314   test_binary_builtins_func<DataType, DataLayout,
    315                             op_plus>(sycl_device, tensor_range);
    316   test_binary_builtins_func<DataType, DataLayout,
    317                             op_minus>(sycl_device, tensor_range);
    318   test_binary_builtins_func<DataType, DataLayout,
    319                             op_times>(sycl_device, tensor_range);
    320   test_binary_builtins_func<DataType, DataLayout,
    321                             op_divide>(sycl_device, tensor_range);
    322 }
    323 
    324 template <typename DataType>
    325 static void test_floating_builtin_binary_sycl(const Eigen::SyclDevice &sycl_device) {
    326   int64_t sizeDim1 = 10;
    327   int64_t sizeDim2 = 10;
    328   int64_t sizeDim3 = 10;
    329   array<int64_t, 3> tensor_range = {{sizeDim1, sizeDim2, sizeDim3}};
    330   test_binary_builtins<DataType, RowMajor>(sycl_device, tensor_range);
    331   test_binary_builtins<DataType, ColMajor>(sycl_device, tensor_range);
    332 }
    333 
    334 template <typename DataType>
    335 static void test_integer_builtin_binary_sycl(const Eigen::SyclDevice &sycl_device) {
    336   int64_t sizeDim1 = 10;
    337   int64_t sizeDim2 = 10;
    338   int64_t sizeDim3 = 10;
    339   array<int64_t, 3> tensor_range = {{sizeDim1, sizeDim2, sizeDim3}};
    340   test_binary_builtins_fixed_arg2<DataType, RowMajor,
    341                                   op_modulo>(sycl_device, tensor_range);
    342   test_binary_builtins_fixed_arg2<DataType, ColMajor,
    343                                   op_modulo>(sycl_device, tensor_range);
    344 }
    345 
    346 EIGEN_DECLARE_TEST(cxx11_tensor_builtins_sycl) {
    347   for (const auto& device :Eigen::get_sycl_supported_devices()) {
    348     QueueInterface queueInterface(device);
    349     Eigen::SyclDevice sycl_device(&queueInterface);
    350     CALL_SUBTEST_1(test_builtin_unary_sycl<float>(sycl_device));
    351     CALL_SUBTEST_2(test_floating_builtin_binary_sycl<float>(sycl_device));
    352     CALL_SUBTEST_3(test_integer_builtin_binary_sycl<int>(sycl_device));
    353   }
    354 }