cxx11_tensor_builtins_sycl.cpp (15767B)
1 // This file is part of Eigen, a lightweight C++ template library 2 // for linear algebra. 3 // 4 // Copyright (C) 2016 5 // Mehdi Goli Codeplay Software Ltd. 6 // Ralph Potter Codeplay Software Ltd. 7 // Luke Iwanski Codeplay Software Ltd. 8 // Contact: <eigen@codeplay.com> 9 // 10 // This Source Code Form is subject to the terms of the Mozilla 11 // Public License v. 2.0. If a copy of the MPL was not distributed 12 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 13 14 #define EIGEN_TEST_NO_LONGDOUBLE 15 #define EIGEN_TEST_NO_COMPLEX 16 17 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t 18 #define EIGEN_USE_SYCL 19 20 #include "main.h" 21 #include <unsupported/Eigen/CXX11/Tensor> 22 23 using Eigen::array; 24 using Eigen::SyclDevice; 25 using Eigen::Tensor; 26 using Eigen::TensorMap; 27 28 // Functions used to compare the TensorMap implementation on the device with 29 // the equivalent on the host 30 namespace cl { 31 namespace sycl { 32 template <typename T> T abs(T x) { return cl::sycl::fabs(x); } 33 template <typename T> T square(T x) { return x * x; } 34 template <typename T> T cube(T x) { return x * x * x; } 35 template <typename T> T inverse(T x) { return T(1) / x; } 36 template <typename T> T cwiseMax(T x, T y) { return cl::sycl::max(x, y); } 37 template <typename T> T cwiseMin(T x, T y) { return cl::sycl::min(x, y); } 38 } 39 } 40 41 struct EqualAssignement { 42 template <typename Lhs, typename Rhs> 43 void operator()(Lhs& lhs, const Rhs& rhs) { lhs = rhs; } 44 }; 45 46 struct PlusEqualAssignement { 47 template <typename Lhs, typename Rhs> 48 void operator()(Lhs& lhs, const Rhs& rhs) { lhs += rhs; } 49 }; 50 51 template <typename DataType, int DataLayout, 52 typename Assignement, typename Operator> 53 void test_unary_builtins_for_scalar(const Eigen::SyclDevice& sycl_device, 54 const array<int64_t, 3>& tensor_range) { 55 Operator op; 56 Assignement asgn; 57 { 58 /* Assignement(out, Operator(in)) */ 59 Tensor<DataType, 3, DataLayout, int64_t> in(tensor_range); 60 Tensor<DataType, 3, DataLayout, int64_t> out(tensor_range); 61 in = in.random() + DataType(0.01); 62 out = out.random() + DataType(0.01); 63 Tensor<DataType, 3, DataLayout, int64_t> reference(out); 64 DataType *gpu_data = static_cast<DataType *>( 65 sycl_device.allocate(in.size() * sizeof(DataType))); 66 DataType *gpu_data_out = static_cast<DataType *>( 67 sycl_device.allocate(out.size() * sizeof(DataType))); 68 TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu(gpu_data, tensor_range); 69 TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu_out(gpu_data_out, tensor_range); 70 sycl_device.memcpyHostToDevice(gpu_data, in.data(), 71 (in.size()) * sizeof(DataType)); 72 sycl_device.memcpyHostToDevice(gpu_data_out, out.data(), 73 (out.size()) * sizeof(DataType)); 74 auto device_expr = gpu_out.device(sycl_device); 75 asgn(device_expr, op(gpu)); 76 sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, 77 (out.size()) * sizeof(DataType)); 78 for (int64_t i = 0; i < out.size(); ++i) { 79 DataType ver = reference(i); 80 asgn(ver, op(in(i))); 81 VERIFY_IS_APPROX(out(i), ver); 82 } 83 sycl_device.deallocate(gpu_data); 84 sycl_device.deallocate(gpu_data_out); 85 } 86 { 87 /* Assignement(out, Operator(out)) */ 88 Tensor<DataType, 3, DataLayout, int64_t> out(tensor_range); 89 out = out.random() + DataType(0.01); 90 Tensor<DataType, 3, DataLayout, int64_t> reference(out); 91 DataType *gpu_data_out = static_cast<DataType *>( 92 sycl_device.allocate(out.size() * sizeof(DataType))); 93 TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu_out(gpu_data_out, tensor_range); 94 sycl_device.memcpyHostToDevice(gpu_data_out, out.data(), 95 (out.size()) * sizeof(DataType)); 96 auto device_expr = gpu_out.device(sycl_device); 97 asgn(device_expr, op(gpu_out)); 98 sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, 99 (out.size()) * sizeof(DataType)); 100 for (int64_t i = 0; i < out.size(); ++i) { 101 DataType ver = reference(i); 102 asgn(ver, op(reference(i))); 103 VERIFY_IS_APPROX(out(i), ver); 104 } 105 sycl_device.deallocate(gpu_data_out); 106 } 107 } 108 109 #define DECLARE_UNARY_STRUCT(FUNC) \ 110 struct op_##FUNC { \ 111 template <typename T> \ 112 auto operator()(const T& x) -> decltype(cl::sycl::FUNC(x)) { \ 113 return cl::sycl::FUNC(x); \ 114 } \ 115 template <typename T> \ 116 auto operator()(const TensorMap<T>& x) -> decltype(x.FUNC()) { \ 117 return x.FUNC(); \ 118 } \ 119 }; 120 121 DECLARE_UNARY_STRUCT(abs) 122 DECLARE_UNARY_STRUCT(sqrt) 123 DECLARE_UNARY_STRUCT(rsqrt) 124 DECLARE_UNARY_STRUCT(square) 125 DECLARE_UNARY_STRUCT(cube) 126 DECLARE_UNARY_STRUCT(inverse) 127 DECLARE_UNARY_STRUCT(tanh) 128 DECLARE_UNARY_STRUCT(exp) 129 DECLARE_UNARY_STRUCT(expm1) 130 DECLARE_UNARY_STRUCT(log) 131 DECLARE_UNARY_STRUCT(ceil) 132 DECLARE_UNARY_STRUCT(floor) 133 DECLARE_UNARY_STRUCT(round) 134 DECLARE_UNARY_STRUCT(log1p) 135 DECLARE_UNARY_STRUCT(sign) 136 DECLARE_UNARY_STRUCT(isnan) 137 DECLARE_UNARY_STRUCT(isfinite) 138 DECLARE_UNARY_STRUCT(isinf) 139 140 template <typename DataType, int DataLayout, typename Assignement> 141 void test_unary_builtins_for_assignement(const Eigen::SyclDevice& sycl_device, 142 const array<int64_t, 3>& tensor_range) { 143 #define RUN_UNARY_TEST(FUNC) \ 144 test_unary_builtins_for_scalar<DataType, DataLayout, Assignement, \ 145 op_##FUNC>(sycl_device, tensor_range) 146 RUN_UNARY_TEST(abs); 147 RUN_UNARY_TEST(sqrt); 148 RUN_UNARY_TEST(rsqrt); 149 RUN_UNARY_TEST(square); 150 RUN_UNARY_TEST(cube); 151 RUN_UNARY_TEST(inverse); 152 RUN_UNARY_TEST(tanh); 153 RUN_UNARY_TEST(exp); 154 RUN_UNARY_TEST(expm1); 155 RUN_UNARY_TEST(log); 156 RUN_UNARY_TEST(ceil); 157 RUN_UNARY_TEST(floor); 158 RUN_UNARY_TEST(round); 159 RUN_UNARY_TEST(log1p); 160 RUN_UNARY_TEST(sign); 161 } 162 163 template <typename DataType, int DataLayout, typename Operator> 164 void test_unary_builtins_return_bool(const Eigen::SyclDevice& sycl_device, 165 const array<int64_t, 3>& tensor_range) { 166 /* out = op(in) */ 167 Operator op; 168 Tensor<DataType, 3, DataLayout, int64_t> in(tensor_range); 169 Tensor<bool, 3, DataLayout, int64_t> out(tensor_range); 170 in = in.random() + DataType(0.01); 171 DataType *gpu_data = static_cast<DataType *>( 172 sycl_device.allocate(in.size() * sizeof(DataType))); 173 bool *gpu_data_out = 174 static_cast<bool *>(sycl_device.allocate(out.size() * sizeof(bool))); 175 TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu(gpu_data, tensor_range); 176 TensorMap<Tensor<bool, 3, DataLayout, int64_t>> gpu_out(gpu_data_out, tensor_range); 177 sycl_device.memcpyHostToDevice(gpu_data, in.data(), 178 (in.size()) * sizeof(DataType)); 179 gpu_out.device(sycl_device) = op(gpu); 180 sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, 181 (out.size()) * sizeof(bool)); 182 for (int64_t i = 0; i < out.size(); ++i) { 183 VERIFY_IS_EQUAL(out(i), op(in(i))); 184 } 185 sycl_device.deallocate(gpu_data); 186 sycl_device.deallocate(gpu_data_out); 187 } 188 189 template <typename DataType, int DataLayout> 190 void test_unary_builtins(const Eigen::SyclDevice& sycl_device, 191 const array<int64_t, 3>& tensor_range) { 192 test_unary_builtins_for_assignement<DataType, DataLayout, 193 PlusEqualAssignement>(sycl_device, tensor_range); 194 test_unary_builtins_for_assignement<DataType, DataLayout, 195 EqualAssignement>(sycl_device, tensor_range); 196 test_unary_builtins_return_bool<DataType, DataLayout, 197 op_isnan>(sycl_device, tensor_range); 198 test_unary_builtins_return_bool<DataType, DataLayout, 199 op_isfinite>(sycl_device, tensor_range); 200 test_unary_builtins_return_bool<DataType, DataLayout, 201 op_isinf>(sycl_device, tensor_range); 202 } 203 204 template <typename DataType> 205 static void test_builtin_unary_sycl(const Eigen::SyclDevice &sycl_device) { 206 int64_t sizeDim1 = 10; 207 int64_t sizeDim2 = 10; 208 int64_t sizeDim3 = 10; 209 array<int64_t, 3> tensor_range = {{sizeDim1, sizeDim2, sizeDim3}}; 210 211 test_unary_builtins<DataType, RowMajor>(sycl_device, tensor_range); 212 test_unary_builtins<DataType, ColMajor>(sycl_device, tensor_range); 213 } 214 215 template <typename DataType, int DataLayout, typename Operator> 216 void test_binary_builtins_func(const Eigen::SyclDevice& sycl_device, 217 const array<int64_t, 3>& tensor_range) { 218 /* out = op(in_1, in_2) */ 219 Operator op; 220 Tensor<DataType, 3, DataLayout, int64_t> in_1(tensor_range); 221 Tensor<DataType, 3, DataLayout, int64_t> in_2(tensor_range); 222 Tensor<DataType, 3, DataLayout, int64_t> out(tensor_range); 223 in_1 = in_1.random() + DataType(0.01); 224 in_2 = in_2.random() + DataType(0.01); 225 Tensor<DataType, 3, DataLayout, int64_t> reference(out); 226 DataType *gpu_data_1 = static_cast<DataType *>( 227 sycl_device.allocate(in_1.size() * sizeof(DataType))); 228 DataType *gpu_data_2 = static_cast<DataType *>( 229 sycl_device.allocate(in_2.size() * sizeof(DataType))); 230 DataType *gpu_data_out = static_cast<DataType *>( 231 sycl_device.allocate(out.size() * sizeof(DataType))); 232 TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu_1(gpu_data_1, tensor_range); 233 TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu_2(gpu_data_2, tensor_range); 234 TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu_out(gpu_data_out, tensor_range); 235 sycl_device.memcpyHostToDevice(gpu_data_1, in_1.data(), 236 (in_1.size()) * sizeof(DataType)); 237 sycl_device.memcpyHostToDevice(gpu_data_2, in_2.data(), 238 (in_2.size()) * sizeof(DataType)); 239 gpu_out.device(sycl_device) = op(gpu_1, gpu_2); 240 sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, 241 (out.size()) * sizeof(DataType)); 242 for (int64_t i = 0; i < out.size(); ++i) { 243 VERIFY_IS_APPROX(out(i), op(in_1(i), in_2(i))); 244 } 245 sycl_device.deallocate(gpu_data_1); 246 sycl_device.deallocate(gpu_data_2); 247 sycl_device.deallocate(gpu_data_out); 248 } 249 250 template <typename DataType, int DataLayout, typename Operator> 251 void test_binary_builtins_fixed_arg2(const Eigen::SyclDevice& sycl_device, 252 const array<int64_t, 3>& tensor_range) { 253 /* out = op(in_1, 2) */ 254 Operator op; 255 const DataType arg2(2); 256 Tensor<DataType, 3, DataLayout, int64_t> in_1(tensor_range); 257 Tensor<DataType, 3, DataLayout, int64_t> out(tensor_range); 258 in_1 = in_1.random(); 259 Tensor<DataType, 3, DataLayout, int64_t> reference(out); 260 DataType *gpu_data_1 = static_cast<DataType *>( 261 sycl_device.allocate(in_1.size() * sizeof(DataType))); 262 DataType *gpu_data_out = static_cast<DataType *>( 263 sycl_device.allocate(out.size() * sizeof(DataType))); 264 TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu_1(gpu_data_1, tensor_range); 265 TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu_out(gpu_data_out, tensor_range); 266 sycl_device.memcpyHostToDevice(gpu_data_1, in_1.data(), 267 (in_1.size()) * sizeof(DataType)); 268 gpu_out.device(sycl_device) = op(gpu_1, arg2); 269 sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, 270 (out.size()) * sizeof(DataType)); 271 for (int64_t i = 0; i < out.size(); ++i) { 272 VERIFY_IS_APPROX(out(i), op(in_1(i), arg2)); 273 } 274 sycl_device.deallocate(gpu_data_1); 275 sycl_device.deallocate(gpu_data_out); 276 } 277 278 #define DECLARE_BINARY_STRUCT(FUNC) \ 279 struct op_##FUNC { \ 280 template <typename T1, typename T2> \ 281 auto operator()(const T1& x, const T2& y) -> decltype(cl::sycl::FUNC(x, y)) { \ 282 return cl::sycl::FUNC(x, y); \ 283 } \ 284 template <typename T1, typename T2> \ 285 auto operator()(const TensorMap<T1>& x, const TensorMap<T2>& y) -> decltype(x.FUNC(y)) { \ 286 return x.FUNC(y); \ 287 } \ 288 }; 289 290 DECLARE_BINARY_STRUCT(cwiseMax) 291 DECLARE_BINARY_STRUCT(cwiseMin) 292 293 #define DECLARE_BINARY_STRUCT_OP(NAME, OPERATOR) \ 294 struct op_##NAME { \ 295 template <typename T1, typename T2> \ 296 auto operator()(const T1& x, const T2& y) -> decltype(x OPERATOR y) { \ 297 return x OPERATOR y; \ 298 } \ 299 }; 300 301 DECLARE_BINARY_STRUCT_OP(plus, +) 302 DECLARE_BINARY_STRUCT_OP(minus, -) 303 DECLARE_BINARY_STRUCT_OP(times, *) 304 DECLARE_BINARY_STRUCT_OP(divide, /) 305 DECLARE_BINARY_STRUCT_OP(modulo, %) 306 307 template <typename DataType, int DataLayout> 308 void test_binary_builtins(const Eigen::SyclDevice& sycl_device, 309 const array<int64_t, 3>& tensor_range) { 310 test_binary_builtins_func<DataType, DataLayout, 311 op_cwiseMax>(sycl_device, tensor_range); 312 test_binary_builtins_func<DataType, DataLayout, 313 op_cwiseMin>(sycl_device, tensor_range); 314 test_binary_builtins_func<DataType, DataLayout, 315 op_plus>(sycl_device, tensor_range); 316 test_binary_builtins_func<DataType, DataLayout, 317 op_minus>(sycl_device, tensor_range); 318 test_binary_builtins_func<DataType, DataLayout, 319 op_times>(sycl_device, tensor_range); 320 test_binary_builtins_func<DataType, DataLayout, 321 op_divide>(sycl_device, tensor_range); 322 } 323 324 template <typename DataType> 325 static void test_floating_builtin_binary_sycl(const Eigen::SyclDevice &sycl_device) { 326 int64_t sizeDim1 = 10; 327 int64_t sizeDim2 = 10; 328 int64_t sizeDim3 = 10; 329 array<int64_t, 3> tensor_range = {{sizeDim1, sizeDim2, sizeDim3}}; 330 test_binary_builtins<DataType, RowMajor>(sycl_device, tensor_range); 331 test_binary_builtins<DataType, ColMajor>(sycl_device, tensor_range); 332 } 333 334 template <typename DataType> 335 static void test_integer_builtin_binary_sycl(const Eigen::SyclDevice &sycl_device) { 336 int64_t sizeDim1 = 10; 337 int64_t sizeDim2 = 10; 338 int64_t sizeDim3 = 10; 339 array<int64_t, 3> tensor_range = {{sizeDim1, sizeDim2, sizeDim3}}; 340 test_binary_builtins_fixed_arg2<DataType, RowMajor, 341 op_modulo>(sycl_device, tensor_range); 342 test_binary_builtins_fixed_arg2<DataType, ColMajor, 343 op_modulo>(sycl_device, tensor_range); 344 } 345 346 EIGEN_DECLARE_TEST(cxx11_tensor_builtins_sycl) { 347 for (const auto& device :Eigen::get_sycl_supported_devices()) { 348 QueueInterface queueInterface(device); 349 Eigen::SyclDevice sycl_device(&queueInterface); 350 CALL_SUBTEST_1(test_builtin_unary_sycl<float>(sycl_device)); 351 CALL_SUBTEST_2(test_floating_builtin_binary_sycl<float>(sycl_device)); 352 CALL_SUBTEST_3(test_integer_builtin_binary_sycl<int>(sycl_device)); 353 } 354 }