cxx11_tensor_reduction_sycl.cpp (42176B)
1 // This file is part of Eigen, a lightweight C++ template library 2 // for linear algebra. 3 // 4 // Copyright (C) 2015 5 // Mehdi Goli Codeplay Software Ltd. 6 // Ralph Potter Codeplay Software Ltd. 7 // Luke Iwanski Codeplay Software Ltd. 8 // Contact: <eigen@codeplay.com> 9 // 10 // This Source Code Form is subject to the terms of the Mozilla 11 // Public License v. 2.0. If a copy of the MPL was not distributed 12 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 13 14 #define EIGEN_TEST_NO_LONGDOUBLE 15 #define EIGEN_TEST_NO_COMPLEX 16 17 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t 18 #define EIGEN_USE_SYCL 19 #define EIGEN_HAS_CONSTEXPR 1 20 21 #include "main.h" 22 23 #include <unsupported/Eigen/CXX11/Tensor> 24 25 template <typename DataType, int DataLayout, typename IndexType> 26 static void test_full_reductions_sum_sycl( 27 const Eigen::SyclDevice& sycl_device) { 28 const IndexType num_rows = 753; 29 const IndexType num_cols = 537; 30 array<IndexType, 2> tensorRange = {{num_rows, num_cols}}; 31 32 array<IndexType, 2> outRange = {{1, 1}}; 33 34 Tensor<DataType, 2, DataLayout, IndexType> in(tensorRange); 35 Tensor<DataType, 2, DataLayout, IndexType> full_redux(outRange); 36 Tensor<DataType, 2, DataLayout, IndexType> full_redux_gpu(outRange); 37 38 in.setRandom(); 39 auto dim = DSizes<IndexType, 2>(1, 1); 40 full_redux = in.sum().reshape(dim); 41 42 DataType* gpu_in_data = static_cast<DataType*>( 43 sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType))); 44 DataType* gpu_out_data = (DataType*)sycl_device.allocate( 45 sizeof(DataType) * (full_redux_gpu.dimensions().TotalSize())); 46 47 TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_gpu(gpu_in_data, 48 tensorRange); 49 TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> out_gpu(gpu_out_data, 50 outRange); 51 sycl_device.memcpyHostToDevice( 52 gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType)); 53 out_gpu.device(sycl_device) = in_gpu.sum().reshape(dim); 54 sycl_device.memcpyDeviceToHost( 55 full_redux_gpu.data(), gpu_out_data, 56 (full_redux_gpu.dimensions().TotalSize()) * sizeof(DataType)); 57 // Check that the CPU and GPU reductions return the same result. 58 std::cout << "SYCL FULL :" << full_redux_gpu(0, 0) 59 << ", CPU FULL: " << full_redux(0, 0) << "\n"; 60 VERIFY_IS_APPROX(full_redux_gpu(0, 0), full_redux(0, 0)); 61 sycl_device.deallocate(gpu_in_data); 62 sycl_device.deallocate(gpu_out_data); 63 } 64 65 template <typename DataType, int DataLayout, typename IndexType> 66 static void test_full_reductions_sum_with_offset_sycl( 67 const Eigen::SyclDevice& sycl_device) { 68 using data_tensor = Tensor<DataType, 2, DataLayout, IndexType>; 69 using scalar_tensor = Tensor<DataType, 0, DataLayout, IndexType>; 70 const IndexType num_rows = 64; 71 const IndexType num_cols = 64; 72 array<IndexType, 2> tensor_range = {{num_rows, num_cols}}; 73 const IndexType n_elems = internal::array_prod(tensor_range); 74 75 data_tensor in(tensor_range); 76 scalar_tensor full_redux; 77 scalar_tensor full_redux_gpu; 78 79 in.setRandom(); 80 array<IndexType, 2> tensor_offset_range(tensor_range); 81 tensor_offset_range[0] -= 1; 82 83 const IndexType offset = 64; 84 TensorMap<data_tensor> in_offset(in.data() + offset, tensor_offset_range); 85 full_redux = in_offset.sum(); 86 87 DataType* gpu_in_data = 88 static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType))); 89 DataType* gpu_out_data = 90 static_cast<DataType*>(sycl_device.allocate(sizeof(DataType))); 91 92 TensorMap<data_tensor> in_gpu(gpu_in_data + offset, tensor_offset_range); 93 TensorMap<scalar_tensor> out_gpu(gpu_out_data); 94 sycl_device.memcpyHostToDevice(gpu_in_data, in.data(), 95 n_elems * sizeof(DataType)); 96 out_gpu.device(sycl_device) = in_gpu.sum(); 97 sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data, 98 sizeof(DataType)); 99 100 // Check that the CPU and GPU reductions return the same result. 101 VERIFY_IS_APPROX(full_redux_gpu(), full_redux()); 102 103 sycl_device.deallocate(gpu_in_data); 104 sycl_device.deallocate(gpu_out_data); 105 } 106 107 template <typename DataType, int DataLayout, typename IndexType> 108 static void test_full_reductions_max_sycl( 109 const Eigen::SyclDevice& sycl_device) { 110 const IndexType num_rows = 4096; 111 const IndexType num_cols = 4096; 112 array<IndexType, 2> tensorRange = {{num_rows, num_cols}}; 113 114 Tensor<DataType, 2, DataLayout, IndexType> in(tensorRange); 115 Tensor<DataType, 0, DataLayout, IndexType> full_redux; 116 Tensor<DataType, 0, DataLayout, IndexType> full_redux_gpu; 117 118 in.setRandom(); 119 120 full_redux = in.maximum(); 121 122 DataType* gpu_in_data = static_cast<DataType*>( 123 sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType))); 124 DataType* gpu_out_data = (DataType*)sycl_device.allocate(sizeof(DataType)); 125 126 TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_gpu(gpu_in_data, 127 tensorRange); 128 TensorMap<Tensor<DataType, 0, DataLayout, IndexType>> out_gpu(gpu_out_data); 129 sycl_device.memcpyHostToDevice( 130 gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType)); 131 out_gpu.device(sycl_device) = in_gpu.maximum(); 132 sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data, 133 sizeof(DataType)); 134 VERIFY_IS_APPROX(full_redux_gpu(), full_redux()); 135 sycl_device.deallocate(gpu_in_data); 136 sycl_device.deallocate(gpu_out_data); 137 } 138 139 template <typename DataType, int DataLayout, typename IndexType> 140 static void test_full_reductions_max_with_offset_sycl( 141 const Eigen::SyclDevice& sycl_device) { 142 using data_tensor = Tensor<DataType, 2, DataLayout, IndexType>; 143 using scalar_tensor = Tensor<DataType, 0, DataLayout, IndexType>; 144 const IndexType num_rows = 64; 145 const IndexType num_cols = 64; 146 array<IndexType, 2> tensor_range = {{num_rows, num_cols}}; 147 const IndexType n_elems = internal::array_prod(tensor_range); 148 149 data_tensor in(tensor_range); 150 scalar_tensor full_redux; 151 scalar_tensor full_redux_gpu; 152 153 in.setRandom(); 154 array<IndexType, 2> tensor_offset_range(tensor_range); 155 tensor_offset_range[0] -= 1; 156 // Set the initial value to be the max. 157 // As we don't include this in the reduction the result should not be 2. 158 in(0) = static_cast<DataType>(2); 159 160 const IndexType offset = 64; 161 TensorMap<data_tensor> in_offset(in.data() + offset, tensor_offset_range); 162 full_redux = in_offset.maximum(); 163 VERIFY_IS_NOT_EQUAL(full_redux(), in(0)); 164 165 DataType* gpu_in_data = 166 static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType))); 167 DataType* gpu_out_data = 168 static_cast<DataType*>(sycl_device.allocate(sizeof(DataType))); 169 170 TensorMap<data_tensor> in_gpu(gpu_in_data + offset, tensor_offset_range); 171 TensorMap<scalar_tensor> out_gpu(gpu_out_data); 172 sycl_device.memcpyHostToDevice(gpu_in_data, in.data(), 173 n_elems * sizeof(DataType)); 174 out_gpu.device(sycl_device) = in_gpu.maximum(); 175 sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data, 176 sizeof(DataType)); 177 178 // Check that the CPU and GPU reductions return the same result. 179 VERIFY_IS_APPROX(full_redux_gpu(), full_redux()); 180 181 sycl_device.deallocate(gpu_in_data); 182 sycl_device.deallocate(gpu_out_data); 183 } 184 185 template <typename DataType, int DataLayout, typename IndexType> 186 static void test_full_reductions_mean_sycl( 187 const Eigen::SyclDevice& sycl_device) { 188 const IndexType num_rows = 4096; 189 const IndexType num_cols = 4096; 190 array<IndexType, 2> tensorRange = {{num_rows, num_cols}}; 191 array<IndexType, 1> argRange = {{num_cols}}; 192 Eigen::array<IndexType, 1> red_axis; 193 red_axis[0] = 0; 194 // red_axis[1]=1; 195 Tensor<DataType, 2, DataLayout, IndexType> in(tensorRange); 196 Tensor<DataType, 2, DataLayout, IndexType> in_arg1(tensorRange); 197 Tensor<DataType, 2, DataLayout, IndexType> in_arg2(tensorRange); 198 Tensor<bool, 1, DataLayout, IndexType> out_arg_cpu(argRange); 199 Tensor<bool, 1, DataLayout, IndexType> out_arg_gpu(argRange); 200 Tensor<bool, 1, DataLayout, IndexType> out_arg_gpu_helper(argRange); 201 Tensor<DataType, 0, DataLayout, IndexType> full_redux; 202 Tensor<DataType, 0, DataLayout, IndexType> full_redux_gpu; 203 204 in.setRandom(); 205 in_arg1.setRandom(); 206 in_arg2.setRandom(); 207 208 DataType* gpu_in_data = static_cast<DataType*>( 209 sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType))); 210 DataType* gpu_in_arg1_data = static_cast<DataType*>(sycl_device.allocate( 211 in_arg1.dimensions().TotalSize() * sizeof(DataType))); 212 DataType* gpu_in_arg2_data = static_cast<DataType*>(sycl_device.allocate( 213 in_arg2.dimensions().TotalSize() * sizeof(DataType))); 214 bool* gpu_out_arg__gpu_helper_data = static_cast<bool*>(sycl_device.allocate( 215 out_arg_gpu.dimensions().TotalSize() * sizeof(DataType))); 216 bool* gpu_out_arg_data = static_cast<bool*>(sycl_device.allocate( 217 out_arg_gpu.dimensions().TotalSize() * sizeof(DataType))); 218 219 DataType* gpu_out_data = (DataType*)sycl_device.allocate(sizeof(DataType)); 220 221 TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_gpu(gpu_in_data, 222 tensorRange); 223 TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_Arg1_gpu( 224 gpu_in_arg1_data, tensorRange); 225 TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_Arg2_gpu( 226 gpu_in_arg2_data, tensorRange); 227 TensorMap<Tensor<bool, 1, DataLayout, IndexType>> out_Argout_gpu( 228 gpu_out_arg_data, argRange); 229 TensorMap<Tensor<bool, 1, DataLayout, IndexType>> out_Argout_gpu_helper( 230 gpu_out_arg__gpu_helper_data, argRange); 231 TensorMap<Tensor<DataType, 0, DataLayout, IndexType>> out_gpu(gpu_out_data); 232 233 // CPU VERSION 234 out_arg_cpu = 235 (in_arg1.argmax(1) == in_arg2.argmax(1)) 236 .select(out_arg_cpu.constant(true), out_arg_cpu.constant(false)); 237 full_redux = (out_arg_cpu.template cast<float>()) 238 .reduce(red_axis, Eigen::internal::MeanReducer<DataType>()); 239 240 // GPU VERSION 241 sycl_device.memcpyHostToDevice( 242 gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType)); 243 sycl_device.memcpyHostToDevice( 244 gpu_in_arg1_data, in_arg1.data(), 245 (in_arg1.dimensions().TotalSize()) * sizeof(DataType)); 246 sycl_device.memcpyHostToDevice( 247 gpu_in_arg2_data, in_arg2.data(), 248 (in_arg2.dimensions().TotalSize()) * sizeof(DataType)); 249 out_Argout_gpu_helper.device(sycl_device) = 250 (in_Arg1_gpu.argmax(1) == in_Arg2_gpu.argmax(1)); 251 out_Argout_gpu.device(sycl_device) = 252 (out_Argout_gpu_helper) 253 .select(out_Argout_gpu.constant(true), 254 out_Argout_gpu.constant(false)); 255 out_gpu.device(sycl_device) = 256 (out_Argout_gpu.template cast<float>()) 257 .reduce(red_axis, Eigen::internal::MeanReducer<DataType>()); 258 sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data, 259 sizeof(DataType)); 260 // Check that the CPU and GPU reductions return the same result. 261 std::cout << "SYCL : " << full_redux_gpu() << " , CPU : " << full_redux() 262 << '\n'; 263 VERIFY_IS_EQUAL(full_redux_gpu(), full_redux()); 264 sycl_device.deallocate(gpu_in_data); 265 sycl_device.deallocate(gpu_in_arg1_data); 266 sycl_device.deallocate(gpu_in_arg2_data); 267 sycl_device.deallocate(gpu_out_arg__gpu_helper_data); 268 sycl_device.deallocate(gpu_out_arg_data); 269 sycl_device.deallocate(gpu_out_data); 270 } 271 272 template <typename DataType, int DataLayout, typename IndexType> 273 static void test_full_reductions_mean_with_offset_sycl( 274 const Eigen::SyclDevice& sycl_device) { 275 using data_tensor = Tensor<DataType, 2, DataLayout, IndexType>; 276 using scalar_tensor = Tensor<DataType, 0, DataLayout, IndexType>; 277 const IndexType num_rows = 64; 278 const IndexType num_cols = 64; 279 array<IndexType, 2> tensor_range = {{num_rows, num_cols}}; 280 const IndexType n_elems = internal::array_prod(tensor_range); 281 282 data_tensor in(tensor_range); 283 scalar_tensor full_redux; 284 scalar_tensor full_redux_gpu; 285 286 in.setRandom(); 287 array<IndexType, 2> tensor_offset_range(tensor_range); 288 tensor_offset_range[0] -= 1; 289 290 const IndexType offset = 64; 291 TensorMap<data_tensor> in_offset(in.data() + offset, tensor_offset_range); 292 full_redux = in_offset.mean(); 293 VERIFY_IS_NOT_EQUAL(full_redux(), in(0)); 294 295 DataType* gpu_in_data = 296 static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType))); 297 DataType* gpu_out_data = 298 static_cast<DataType*>(sycl_device.allocate(sizeof(DataType))); 299 300 TensorMap<data_tensor> in_gpu(gpu_in_data + offset, tensor_offset_range); 301 TensorMap<scalar_tensor> out_gpu(gpu_out_data); 302 sycl_device.memcpyHostToDevice(gpu_in_data, in.data(), 303 n_elems * sizeof(DataType)); 304 out_gpu.device(sycl_device) = in_gpu.mean(); 305 sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data, 306 sizeof(DataType)); 307 308 // Check that the CPU and GPU reductions return the same result. 309 VERIFY_IS_APPROX(full_redux_gpu(), full_redux()); 310 311 sycl_device.deallocate(gpu_in_data); 312 sycl_device.deallocate(gpu_out_data); 313 } 314 315 template <typename DataType, int DataLayout, typename IndexType> 316 static void test_full_reductions_mean_with_odd_offset_sycl( 317 const Eigen::SyclDevice& sycl_device) { 318 // This is a particular case which illustrates a possible problem when the 319 // number of local threads in a workgroup is even, but is not a power of two. 320 using data_tensor = Tensor<DataType, 1, DataLayout, IndexType>; 321 using scalar_tensor = Tensor<DataType, 0, DataLayout, IndexType>; 322 // 2177 = (17 * 128) + 1 gives rise to 18 local threads. 323 // 8708 = 4 * 2177 = 4 * (17 * 128) + 4 uses 18 vectorised local threads. 324 const IndexType n_elems = 8707; 325 array<IndexType, 1> tensor_range = {{n_elems}}; 326 327 data_tensor in(tensor_range); 328 DataType full_redux; 329 DataType full_redux_gpu; 330 TensorMap<scalar_tensor> red_cpu(&full_redux); 331 TensorMap<scalar_tensor> red_gpu(&full_redux_gpu); 332 333 const DataType const_val = static_cast<DataType>(0.6391); 334 in = in.constant(const_val); 335 336 Eigen::IndexList<Eigen::type2index<0>> red_axis; 337 red_cpu = in.reduce(red_axis, Eigen::internal::MeanReducer<DataType>()); 338 VERIFY_IS_APPROX(const_val, red_cpu()); 339 340 DataType* gpu_in_data = 341 static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType))); 342 DataType* gpu_out_data = 343 static_cast<DataType*>(sycl_device.allocate(sizeof(DataType))); 344 345 TensorMap<data_tensor> in_gpu(gpu_in_data, tensor_range); 346 TensorMap<scalar_tensor> out_gpu(gpu_out_data); 347 sycl_device.memcpyHostToDevice(gpu_in_data, in.data(), 348 n_elems * sizeof(DataType)); 349 out_gpu.device(sycl_device) = 350 in_gpu.reduce(red_axis, Eigen::internal::MeanReducer<DataType>()); 351 sycl_device.memcpyDeviceToHost(red_gpu.data(), gpu_out_data, 352 sizeof(DataType)); 353 354 // Check that the CPU and GPU reductions return the same result. 355 VERIFY_IS_APPROX(full_redux_gpu, full_redux); 356 357 sycl_device.deallocate(gpu_in_data); 358 sycl_device.deallocate(gpu_out_data); 359 } 360 361 template <typename DataType, int DataLayout, typename IndexType> 362 static void test_full_reductions_min_sycl( 363 const Eigen::SyclDevice& sycl_device) { 364 const IndexType num_rows = 876; 365 const IndexType num_cols = 953; 366 array<IndexType, 2> tensorRange = {{num_rows, num_cols}}; 367 368 Tensor<DataType, 2, DataLayout, IndexType> in(tensorRange); 369 Tensor<DataType, 0, DataLayout, IndexType> full_redux; 370 Tensor<DataType, 0, DataLayout, IndexType> full_redux_gpu; 371 372 in.setRandom(); 373 374 full_redux = in.minimum(); 375 376 DataType* gpu_in_data = static_cast<DataType*>( 377 sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType))); 378 DataType* gpu_out_data = (DataType*)sycl_device.allocate(sizeof(DataType)); 379 380 TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_gpu(gpu_in_data, 381 tensorRange); 382 TensorMap<Tensor<DataType, 0, DataLayout, IndexType>> out_gpu(gpu_out_data); 383 384 sycl_device.memcpyHostToDevice( 385 gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType)); 386 out_gpu.device(sycl_device) = in_gpu.minimum(); 387 sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data, 388 sizeof(DataType)); 389 // Check that the CPU and GPU reductions return the same result. 390 VERIFY_IS_APPROX(full_redux_gpu(), full_redux()); 391 sycl_device.deallocate(gpu_in_data); 392 sycl_device.deallocate(gpu_out_data); 393 } 394 395 template <typename DataType, int DataLayout, typename IndexType> 396 static void test_full_reductions_min_with_offset_sycl( 397 const Eigen::SyclDevice& sycl_device) { 398 using data_tensor = Tensor<DataType, 2, DataLayout, IndexType>; 399 using scalar_tensor = Tensor<DataType, 0, DataLayout, IndexType>; 400 const IndexType num_rows = 64; 401 const IndexType num_cols = 64; 402 array<IndexType, 2> tensor_range = {{num_rows, num_cols}}; 403 const IndexType n_elems = internal::array_prod(tensor_range); 404 405 data_tensor in(tensor_range); 406 scalar_tensor full_redux; 407 scalar_tensor full_redux_gpu; 408 409 in.setRandom(); 410 array<IndexType, 2> tensor_offset_range(tensor_range); 411 tensor_offset_range[0] -= 1; 412 // Set the initial value to be the min. 413 // As we don't include this in the reduction the result should not be -2. 414 in(0) = static_cast<DataType>(-2); 415 416 const IndexType offset = 64; 417 TensorMap<data_tensor> in_offset(in.data() + offset, tensor_offset_range); 418 full_redux = in_offset.minimum(); 419 VERIFY_IS_NOT_EQUAL(full_redux(), in(0)); 420 421 DataType* gpu_in_data = 422 static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType))); 423 DataType* gpu_out_data = 424 static_cast<DataType*>(sycl_device.allocate(sizeof(DataType))); 425 426 TensorMap<data_tensor> in_gpu(gpu_in_data + offset, tensor_offset_range); 427 TensorMap<scalar_tensor> out_gpu(gpu_out_data); 428 sycl_device.memcpyHostToDevice(gpu_in_data, in.data(), 429 n_elems * sizeof(DataType)); 430 out_gpu.device(sycl_device) = in_gpu.minimum(); 431 sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data, 432 sizeof(DataType)); 433 434 // Check that the CPU and GPU reductions return the same result. 435 VERIFY_IS_APPROX(full_redux_gpu(), full_redux()); 436 437 sycl_device.deallocate(gpu_in_data); 438 sycl_device.deallocate(gpu_out_data); 439 } 440 template <typename DataType, int DataLayout, typename IndexType> 441 static void test_first_dim_reductions_max_sycl( 442 const Eigen::SyclDevice& sycl_device) { 443 IndexType dim_x = 145; 444 IndexType dim_y = 1; 445 IndexType dim_z = 67; 446 447 array<IndexType, 3> tensorRange = {{dim_x, dim_y, dim_z}}; 448 Eigen::array<IndexType, 1> red_axis; 449 red_axis[0] = 0; 450 array<IndexType, 2> reduced_tensorRange = {{dim_y, dim_z}}; 451 452 Tensor<DataType, 3, DataLayout, IndexType> in(tensorRange); 453 Tensor<DataType, 2, DataLayout, IndexType> redux(reduced_tensorRange); 454 Tensor<DataType, 2, DataLayout, IndexType> redux_gpu(reduced_tensorRange); 455 456 in.setRandom(); 457 458 redux = in.maximum(red_axis); 459 460 DataType* gpu_in_data = static_cast<DataType*>( 461 sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType))); 462 DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate( 463 redux_gpu.dimensions().TotalSize() * sizeof(DataType))); 464 465 TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> in_gpu(gpu_in_data, 466 tensorRange); 467 TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> out_gpu( 468 gpu_out_data, reduced_tensorRange); 469 470 sycl_device.memcpyHostToDevice( 471 gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType)); 472 out_gpu.device(sycl_device) = in_gpu.maximum(red_axis); 473 sycl_device.memcpyDeviceToHost( 474 redux_gpu.data(), gpu_out_data, 475 redux_gpu.dimensions().TotalSize() * sizeof(DataType)); 476 477 // Check that the CPU and GPU reductions return the same result. 478 for (IndexType j = 0; j < reduced_tensorRange[0]; j++) 479 for (IndexType k = 0; k < reduced_tensorRange[1]; k++) 480 VERIFY_IS_APPROX(redux_gpu(j, k), redux(j, k)); 481 482 sycl_device.deallocate(gpu_in_data); 483 sycl_device.deallocate(gpu_out_data); 484 } 485 486 template <typename DataType, int DataLayout, typename IndexType> 487 static void test_first_dim_reductions_max_with_offset_sycl( 488 const Eigen::SyclDevice& sycl_device) { 489 using data_tensor = Tensor<DataType, 2, DataLayout, IndexType>; 490 using reduced_tensor = Tensor<DataType, 1, DataLayout, IndexType>; 491 492 const IndexType num_rows = 64; 493 const IndexType num_cols = 64; 494 array<IndexType, 2> tensor_range = {{num_rows, num_cols}}; 495 array<IndexType, 1> reduced_range = {{num_cols}}; 496 const IndexType n_elems = internal::array_prod(tensor_range); 497 const IndexType n_reduced = num_cols; 498 499 data_tensor in(tensor_range); 500 reduced_tensor redux; 501 reduced_tensor redux_gpu(reduced_range); 502 503 in.setRandom(); 504 array<IndexType, 2> tensor_offset_range(tensor_range); 505 tensor_offset_range[0] -= 1; 506 // Set maximum value outside of the considered range. 507 for (IndexType i = 0; i < n_reduced; i++) { 508 in(i) = static_cast<DataType>(2); 509 } 510 511 Eigen::array<IndexType, 1> red_axis; 512 red_axis[0] = 0; 513 514 const IndexType offset = 64; 515 TensorMap<data_tensor> in_offset(in.data() + offset, tensor_offset_range); 516 redux = in_offset.maximum(red_axis); 517 for (IndexType i = 0; i < n_reduced; i++) { 518 VERIFY_IS_NOT_EQUAL(redux(i), in(i)); 519 } 520 521 DataType* gpu_in_data = 522 static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType))); 523 DataType* gpu_out_data = static_cast<DataType*>( 524 sycl_device.allocate(n_reduced * sizeof(DataType))); 525 526 TensorMap<data_tensor> in_gpu(gpu_in_data + offset, tensor_offset_range); 527 TensorMap<reduced_tensor> out_gpu(gpu_out_data, reduced_range); 528 sycl_device.memcpyHostToDevice(gpu_in_data, in.data(), 529 n_elems * sizeof(DataType)); 530 out_gpu.device(sycl_device) = in_gpu.maximum(red_axis); 531 sycl_device.memcpyDeviceToHost(redux_gpu.data(), gpu_out_data, 532 n_reduced * sizeof(DataType)); 533 534 // Check that the CPU and GPU reductions return the same result. 535 for (IndexType i = 0; i < n_reduced; i++) { 536 VERIFY_IS_APPROX(redux_gpu(i), redux(i)); 537 } 538 539 sycl_device.deallocate(gpu_in_data); 540 sycl_device.deallocate(gpu_out_data); 541 } 542 543 template <typename DataType, int DataLayout, typename IndexType> 544 static void test_last_dim_reductions_max_with_offset_sycl( 545 const Eigen::SyclDevice& sycl_device) { 546 using data_tensor = Tensor<DataType, 2, DataLayout, IndexType>; 547 using reduced_tensor = Tensor<DataType, 1, DataLayout, IndexType>; 548 549 const IndexType num_rows = 64; 550 const IndexType num_cols = 64; 551 array<IndexType, 2> tensor_range = {{num_rows, num_cols}}; 552 array<IndexType, 1> full_reduced_range = {{num_rows}}; 553 array<IndexType, 1> reduced_range = {{num_rows - 1}}; 554 const IndexType n_elems = internal::array_prod(tensor_range); 555 const IndexType n_reduced = reduced_range[0]; 556 557 data_tensor in(tensor_range); 558 reduced_tensor redux(full_reduced_range); 559 reduced_tensor redux_gpu(reduced_range); 560 561 in.setRandom(); 562 redux.setZero(); 563 array<IndexType, 2> tensor_offset_range(tensor_range); 564 tensor_offset_range[0] -= 1; 565 // Set maximum value outside of the considered range. 566 for (IndexType i = 0; i < n_reduced; i++) { 567 in(i) = static_cast<DataType>(2); 568 } 569 570 Eigen::array<IndexType, 1> red_axis; 571 red_axis[0] = 1; 572 573 const IndexType offset = 64; 574 // Introduce an offset in both the input and the output. 575 TensorMap<data_tensor> in_offset(in.data() + offset, tensor_offset_range); 576 TensorMap<reduced_tensor> red_offset(redux.data() + 1, reduced_range); 577 red_offset = in_offset.maximum(red_axis); 578 579 // Check that the first value hasn't been changed and that the reduced values 580 // are not equal to the previously set maximum in the input outside the range. 581 VERIFY_IS_EQUAL(redux(0), static_cast<DataType>(0)); 582 for (IndexType i = 0; i < n_reduced; i++) { 583 VERIFY_IS_NOT_EQUAL(red_offset(i), in(i)); 584 } 585 586 DataType* gpu_in_data = 587 static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType))); 588 DataType* gpu_out_data = static_cast<DataType*>( 589 sycl_device.allocate((n_reduced + 1) * sizeof(DataType))); 590 591 TensorMap<data_tensor> in_gpu(gpu_in_data + offset, tensor_offset_range); 592 TensorMap<reduced_tensor> out_gpu(gpu_out_data + 1, reduced_range); 593 sycl_device.memcpyHostToDevice(gpu_in_data, in.data(), 594 n_elems * sizeof(DataType)); 595 out_gpu.device(sycl_device) = in_gpu.maximum(red_axis); 596 sycl_device.memcpyDeviceToHost(redux_gpu.data(), out_gpu.data(), 597 n_reduced * sizeof(DataType)); 598 599 // Check that the CPU and GPU reductions return the same result. 600 for (IndexType i = 0; i < n_reduced; i++) { 601 VERIFY_IS_APPROX(redux_gpu(i), red_offset(i)); 602 } 603 604 sycl_device.deallocate(gpu_in_data); 605 sycl_device.deallocate(gpu_out_data); 606 } 607 608 template <typename DataType, int DataLayout, typename IndexType> 609 static void test_first_dim_reductions_sum_sycl( 610 const Eigen::SyclDevice& sycl_device, IndexType dim_x, IndexType dim_y) { 611 array<IndexType, 2> tensorRange = {{dim_x, dim_y}}; 612 Eigen::array<IndexType, 1> red_axis; 613 red_axis[0] = 0; 614 array<IndexType, 1> reduced_tensorRange = {{dim_y}}; 615 616 Tensor<DataType, 2, DataLayout, IndexType> in(tensorRange); 617 Tensor<DataType, 1, DataLayout, IndexType> redux(reduced_tensorRange); 618 Tensor<DataType, 1, DataLayout, IndexType> redux_gpu(reduced_tensorRange); 619 620 in.setRandom(); 621 redux = in.sum(red_axis); 622 623 DataType* gpu_in_data = static_cast<DataType*>( 624 sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType))); 625 DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate( 626 redux_gpu.dimensions().TotalSize() * sizeof(DataType))); 627 628 TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_gpu(gpu_in_data, 629 tensorRange); 630 TensorMap<Tensor<DataType, 1, DataLayout, IndexType>> out_gpu( 631 gpu_out_data, reduced_tensorRange); 632 633 sycl_device.memcpyHostToDevice( 634 gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType)); 635 out_gpu.device(sycl_device) = in_gpu.sum(red_axis); 636 sycl_device.memcpyDeviceToHost( 637 redux_gpu.data(), gpu_out_data, 638 redux_gpu.dimensions().TotalSize() * sizeof(DataType)); 639 640 // Check that the CPU and GPU reductions return the same result. 641 for (IndexType i = 0; i < redux.size(); i++) { 642 VERIFY_IS_APPROX(redux_gpu.data()[i], redux.data()[i]); 643 } 644 sycl_device.deallocate(gpu_in_data); 645 sycl_device.deallocate(gpu_out_data); 646 } 647 648 template <typename DataType, int DataLayout, typename IndexType> 649 static void test_first_dim_reductions_mean_sycl( 650 const Eigen::SyclDevice& sycl_device) { 651 IndexType dim_x = 145; 652 IndexType dim_y = 1; 653 IndexType dim_z = 67; 654 655 array<IndexType, 3> tensorRange = {{dim_x, dim_y, dim_z}}; 656 Eigen::array<IndexType, 1> red_axis; 657 red_axis[0] = 0; 658 array<IndexType, 2> reduced_tensorRange = {{dim_y, dim_z}}; 659 660 Tensor<DataType, 3, DataLayout, IndexType> in(tensorRange); 661 Tensor<DataType, 2, DataLayout, IndexType> redux(reduced_tensorRange); 662 Tensor<DataType, 2, DataLayout, IndexType> redux_gpu(reduced_tensorRange); 663 664 in.setRandom(); 665 666 redux = in.mean(red_axis); 667 668 DataType* gpu_in_data = static_cast<DataType*>( 669 sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType))); 670 DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate( 671 redux_gpu.dimensions().TotalSize() * sizeof(DataType))); 672 673 TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> in_gpu(gpu_in_data, 674 tensorRange); 675 TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> out_gpu( 676 gpu_out_data, reduced_tensorRange); 677 678 sycl_device.memcpyHostToDevice( 679 gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType)); 680 out_gpu.device(sycl_device) = in_gpu.mean(red_axis); 681 sycl_device.memcpyDeviceToHost( 682 redux_gpu.data(), gpu_out_data, 683 redux_gpu.dimensions().TotalSize() * sizeof(DataType)); 684 685 // Check that the CPU and GPU reductions return the same result. 686 for (IndexType j = 0; j < reduced_tensorRange[0]; j++) 687 for (IndexType k = 0; k < reduced_tensorRange[1]; k++) 688 VERIFY_IS_APPROX(redux_gpu(j, k), redux(j, k)); 689 690 sycl_device.deallocate(gpu_in_data); 691 sycl_device.deallocate(gpu_out_data); 692 } 693 694 template <typename DataType, int DataLayout, typename IndexType> 695 static void test_last_dim_reductions_mean_sycl( 696 const Eigen::SyclDevice& sycl_device) { 697 IndexType dim_x = 64; 698 IndexType dim_y = 1; 699 IndexType dim_z = 32; 700 701 array<IndexType, 3> tensorRange = {{dim_x, dim_y, dim_z}}; 702 Eigen::array<IndexType, 1> red_axis; 703 red_axis[0] = 2; 704 array<IndexType, 2> reduced_tensorRange = {{dim_x, dim_y}}; 705 706 Tensor<DataType, 3, DataLayout, IndexType> in(tensorRange); 707 Tensor<DataType, 2, DataLayout, IndexType> redux(reduced_tensorRange); 708 Tensor<DataType, 2, DataLayout, IndexType> redux_gpu(reduced_tensorRange); 709 710 in.setRandom(); 711 712 redux = in.mean(red_axis); 713 714 DataType* gpu_in_data = static_cast<DataType*>( 715 sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType))); 716 DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate( 717 redux_gpu.dimensions().TotalSize() * sizeof(DataType))); 718 719 TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> in_gpu(gpu_in_data, 720 tensorRange); 721 TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> out_gpu( 722 gpu_out_data, reduced_tensorRange); 723 724 sycl_device.memcpyHostToDevice( 725 gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType)); 726 out_gpu.device(sycl_device) = in_gpu.mean(red_axis); 727 sycl_device.memcpyDeviceToHost( 728 redux_gpu.data(), gpu_out_data, 729 redux_gpu.dimensions().TotalSize() * sizeof(DataType)); 730 // Check that the CPU and GPU reductions return the same result. 731 for (IndexType j = 0; j < reduced_tensorRange[0]; j++) 732 for (IndexType k = 0; k < reduced_tensorRange[1]; k++) 733 VERIFY_IS_APPROX(redux_gpu(j, k), redux(j, k)); 734 735 sycl_device.deallocate(gpu_in_data); 736 sycl_device.deallocate(gpu_out_data); 737 } 738 739 template <typename DataType, int DataLayout, typename IndexType> 740 static void test_last_dim_reductions_sum_sycl( 741 const Eigen::SyclDevice& sycl_device) { 742 IndexType dim_x = 64; 743 IndexType dim_y = 1; 744 IndexType dim_z = 32; 745 746 array<IndexType, 3> tensorRange = {{dim_x, dim_y, dim_z}}; 747 Eigen::array<IndexType, 1> red_axis; 748 red_axis[0] = 2; 749 array<IndexType, 2> reduced_tensorRange = {{dim_x, dim_y}}; 750 751 Tensor<DataType, 3, DataLayout, IndexType> in(tensorRange); 752 Tensor<DataType, 2, DataLayout, IndexType> redux(reduced_tensorRange); 753 Tensor<DataType, 2, DataLayout, IndexType> redux_gpu(reduced_tensorRange); 754 755 in.setRandom(); 756 757 redux = in.sum(red_axis); 758 759 DataType* gpu_in_data = static_cast<DataType*>( 760 sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType))); 761 DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate( 762 redux_gpu.dimensions().TotalSize() * sizeof(DataType))); 763 764 TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> in_gpu(gpu_in_data, 765 tensorRange); 766 TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> out_gpu( 767 gpu_out_data, reduced_tensorRange); 768 769 sycl_device.memcpyHostToDevice( 770 gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType)); 771 out_gpu.device(sycl_device) = in_gpu.sum(red_axis); 772 sycl_device.memcpyDeviceToHost( 773 redux_gpu.data(), gpu_out_data, 774 redux_gpu.dimensions().TotalSize() * sizeof(DataType)); 775 // Check that the CPU and GPU reductions return the same result. 776 for (IndexType j = 0; j < reduced_tensorRange[0]; j++) 777 for (IndexType k = 0; k < reduced_tensorRange[1]; k++) 778 VERIFY_IS_APPROX(redux_gpu(j, k), redux(j, k)); 779 780 sycl_device.deallocate(gpu_in_data); 781 sycl_device.deallocate(gpu_out_data); 782 } 783 784 template <typename DataType, int DataLayout, typename IndexType> 785 static void test_last_reductions_sum_sycl( 786 const Eigen::SyclDevice& sycl_device) { 787 auto tensorRange = Sizes<64, 32>(64, 32); 788 // auto red_axis = Sizes<0,1>(0,1); 789 Eigen::IndexList<Eigen::type2index<1>> red_axis; 790 auto reduced_tensorRange = Sizes<64>(64); 791 TensorFixedSize<DataType, Sizes<64, 32>, DataLayout> in_fix; 792 TensorFixedSize<DataType, Sizes<64>, DataLayout> redux_fix; 793 TensorFixedSize<DataType, Sizes<64>, DataLayout> redux_gpu_fix; 794 795 in_fix.setRandom(); 796 797 redux_fix = in_fix.sum(red_axis); 798 799 DataType* gpu_in_data = static_cast<DataType*>( 800 sycl_device.allocate(in_fix.dimensions().TotalSize() * sizeof(DataType))); 801 DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate( 802 redux_gpu_fix.dimensions().TotalSize() * sizeof(DataType))); 803 804 TensorMap<TensorFixedSize<DataType, Sizes<64, 32>, DataLayout>> in_gpu_fix( 805 gpu_in_data, tensorRange); 806 TensorMap<TensorFixedSize<DataType, Sizes<64>, DataLayout>> out_gpu_fix( 807 gpu_out_data, reduced_tensorRange); 808 809 sycl_device.memcpyHostToDevice( 810 gpu_in_data, in_fix.data(), 811 (in_fix.dimensions().TotalSize()) * sizeof(DataType)); 812 out_gpu_fix.device(sycl_device) = in_gpu_fix.sum(red_axis); 813 sycl_device.memcpyDeviceToHost( 814 redux_gpu_fix.data(), gpu_out_data, 815 redux_gpu_fix.dimensions().TotalSize() * sizeof(DataType)); 816 // Check that the CPU and GPU reductions return the same result. 817 for (IndexType j = 0; j < reduced_tensorRange[0]; j++) { 818 VERIFY_IS_APPROX(redux_gpu_fix(j), redux_fix(j)); 819 } 820 821 sycl_device.deallocate(gpu_in_data); 822 sycl_device.deallocate(gpu_out_data); 823 } 824 825 template <typename DataType, int DataLayout, typename IndexType> 826 static void test_last_reductions_mean_sycl( 827 const Eigen::SyclDevice& sycl_device) { 828 auto tensorRange = Sizes<64, 32>(64, 32); 829 Eigen::IndexList<Eigen::type2index<1>> red_axis; 830 auto reduced_tensorRange = Sizes<64>(64); 831 TensorFixedSize<DataType, Sizes<64, 32>, DataLayout> in_fix; 832 TensorFixedSize<DataType, Sizes<64>, DataLayout> redux_fix; 833 TensorFixedSize<DataType, Sizes<64>, DataLayout> redux_gpu_fix; 834 835 in_fix.setRandom(); 836 redux_fix = in_fix.mean(red_axis); 837 838 DataType* gpu_in_data = static_cast<DataType*>( 839 sycl_device.allocate(in_fix.dimensions().TotalSize() * sizeof(DataType))); 840 DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate( 841 redux_gpu_fix.dimensions().TotalSize() * sizeof(DataType))); 842 843 TensorMap<TensorFixedSize<DataType, Sizes<64, 32>, DataLayout>> in_gpu_fix( 844 gpu_in_data, tensorRange); 845 TensorMap<TensorFixedSize<DataType, Sizes<64>, DataLayout>> out_gpu_fix( 846 gpu_out_data, reduced_tensorRange); 847 848 sycl_device.memcpyHostToDevice( 849 gpu_in_data, in_fix.data(), 850 (in_fix.dimensions().TotalSize()) * sizeof(DataType)); 851 out_gpu_fix.device(sycl_device) = in_gpu_fix.mean(red_axis); 852 sycl_device.memcpyDeviceToHost( 853 redux_gpu_fix.data(), gpu_out_data, 854 redux_gpu_fix.dimensions().TotalSize() * sizeof(DataType)); 855 sycl_device.synchronize(); 856 // Check that the CPU and GPU reductions return the same result. 857 for (IndexType j = 0; j < reduced_tensorRange[0]; j++) { 858 VERIFY_IS_APPROX(redux_gpu_fix(j), redux_fix(j)); 859 } 860 861 sycl_device.deallocate(gpu_in_data); 862 sycl_device.deallocate(gpu_out_data); 863 } 864 865 // SYCL supports a generic case of reduction where the accumulator is a 866 // different type than the input data This is an example on how to get if a 867 // Tensor contains nan and/or inf in one reduction 868 template <typename InT, typename OutT> 869 struct CustomReducer { 870 static const bool PacketAccess = false; 871 static const bool IsStateful = false; 872 873 static constexpr OutT InfBit = 1; 874 static constexpr OutT NanBit = 2; 875 876 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const InT x, 877 OutT* accum) const { 878 if (Eigen::numext::isinf(x)) 879 *accum |= InfBit; 880 else if (Eigen::numext::isnan(x)) 881 *accum |= NanBit; 882 } 883 884 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const OutT x, 885 OutT* accum) const { 886 *accum |= x; 887 } 888 889 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE OutT initialize() const { 890 return OutT(0); 891 } 892 893 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE OutT finalize(const OutT accum) const { 894 return accum; 895 } 896 }; 897 898 template <typename DataType, typename AccumType, int DataLayout, 899 typename IndexType> 900 static void test_full_reductions_custom_sycl( 901 const Eigen::SyclDevice& sycl_device) { 902 constexpr IndexType InSize = 64; 903 auto tensorRange = Sizes<InSize>(InSize); 904 Eigen::IndexList<Eigen::type2index<0>> dims; 905 auto reduced_tensorRange = Sizes<>(); 906 TensorFixedSize<DataType, Sizes<InSize>, DataLayout> in_fix; 907 TensorFixedSize<AccumType, Sizes<>, DataLayout> redux_gpu_fix; 908 909 CustomReducer<DataType, AccumType> reducer; 910 911 in_fix.setRandom(); 912 913 size_t in_size_bytes = in_fix.dimensions().TotalSize() * sizeof(DataType); 914 DataType* gpu_in_data = 915 static_cast<DataType*>(sycl_device.allocate(in_size_bytes)); 916 AccumType* gpu_out_data = 917 static_cast<AccumType*>(sycl_device.allocate(sizeof(AccumType))); 918 919 TensorMap<TensorFixedSize<DataType, Sizes<InSize>, DataLayout>> in_gpu_fix( 920 gpu_in_data, tensorRange); 921 TensorMap<TensorFixedSize<AccumType, Sizes<>, DataLayout>> out_gpu_fix( 922 gpu_out_data, reduced_tensorRange); 923 924 sycl_device.memcpyHostToDevice(gpu_in_data, in_fix.data(), in_size_bytes); 925 out_gpu_fix.device(sycl_device) = in_gpu_fix.reduce(dims, reducer); 926 sycl_device.memcpyDeviceToHost(redux_gpu_fix.data(), gpu_out_data, 927 sizeof(AccumType)); 928 VERIFY_IS_EQUAL(redux_gpu_fix(0), AccumType(0)); 929 930 sycl_device.deallocate(gpu_in_data); 931 sycl_device.deallocate(gpu_out_data); 932 } 933 934 template <typename DataType, typename Dev> 935 void sycl_reduction_test_full_per_device(const Dev& sycl_device) { 936 test_full_reductions_sum_sycl<DataType, RowMajor, int64_t>(sycl_device); 937 test_full_reductions_sum_sycl<DataType, ColMajor, int64_t>(sycl_device); 938 test_full_reductions_min_sycl<DataType, ColMajor, int64_t>(sycl_device); 939 test_full_reductions_min_sycl<DataType, RowMajor, int64_t>(sycl_device); 940 test_full_reductions_max_sycl<DataType, ColMajor, int64_t>(sycl_device); 941 test_full_reductions_max_sycl<DataType, RowMajor, int64_t>(sycl_device); 942 943 test_full_reductions_mean_sycl<DataType, ColMajor, int64_t>(sycl_device); 944 test_full_reductions_mean_sycl<DataType, RowMajor, int64_t>(sycl_device); 945 test_full_reductions_custom_sycl<DataType, int, RowMajor, int64_t>( 946 sycl_device); 947 test_full_reductions_custom_sycl<DataType, int, ColMajor, int64_t>( 948 sycl_device); 949 sycl_device.synchronize(); 950 } 951 952 template <typename DataType, typename Dev> 953 void sycl_reduction_full_offset_per_device(const Dev& sycl_device) { 954 test_full_reductions_sum_with_offset_sycl<DataType, RowMajor, int64_t>( 955 sycl_device); 956 test_full_reductions_sum_with_offset_sycl<DataType, ColMajor, int64_t>( 957 sycl_device); 958 test_full_reductions_min_with_offset_sycl<DataType, RowMajor, int64_t>( 959 sycl_device); 960 test_full_reductions_min_with_offset_sycl<DataType, ColMajor, int64_t>( 961 sycl_device); 962 test_full_reductions_max_with_offset_sycl<DataType, ColMajor, int64_t>( 963 sycl_device); 964 test_full_reductions_max_with_offset_sycl<DataType, RowMajor, int64_t>( 965 sycl_device); 966 test_full_reductions_mean_with_offset_sycl<DataType, RowMajor, int64_t>( 967 sycl_device); 968 test_full_reductions_mean_with_offset_sycl<DataType, ColMajor, int64_t>( 969 sycl_device); 970 test_full_reductions_mean_with_odd_offset_sycl<DataType, RowMajor, int64_t>( 971 sycl_device); 972 sycl_device.synchronize(); 973 } 974 975 template <typename DataType, typename Dev> 976 void sycl_reduction_test_first_dim_per_device(const Dev& sycl_device) { 977 test_first_dim_reductions_sum_sycl<DataType, ColMajor, int64_t>(sycl_device, 978 4197, 4097); 979 test_first_dim_reductions_sum_sycl<DataType, RowMajor, int64_t>(sycl_device, 980 4197, 4097); 981 test_first_dim_reductions_sum_sycl<DataType, RowMajor, int64_t>(sycl_device, 982 129, 8); 983 test_first_dim_reductions_max_sycl<DataType, RowMajor, int64_t>(sycl_device); 984 test_first_dim_reductions_max_with_offset_sycl<DataType, RowMajor, int64_t>( 985 sycl_device); 986 sycl_device.synchronize(); 987 } 988 989 template <typename DataType, typename Dev> 990 void sycl_reduction_test_last_dim_per_device(const Dev& sycl_device) { 991 test_last_dim_reductions_sum_sycl<DataType, RowMajor, int64_t>(sycl_device); 992 test_last_dim_reductions_max_with_offset_sycl<DataType, RowMajor, int64_t>( 993 sycl_device); 994 test_last_reductions_sum_sycl<DataType, ColMajor, int64_t>(sycl_device); 995 test_last_reductions_sum_sycl<DataType, RowMajor, int64_t>(sycl_device); 996 test_last_reductions_mean_sycl<DataType, ColMajor, int64_t>(sycl_device); 997 test_last_reductions_mean_sycl<DataType, RowMajor, int64_t>(sycl_device); 998 sycl_device.synchronize(); 999 } 1000 1001 EIGEN_DECLARE_TEST(cxx11_tensor_reduction_sycl) { 1002 for (const auto& device : Eigen::get_sycl_supported_devices()) { 1003 std::cout << "Running on " 1004 << device.template get_info<cl::sycl::info::device::name>() 1005 << std::endl; 1006 QueueInterface queueInterface(device); 1007 auto sycl_device = Eigen::SyclDevice(&queueInterface); 1008 CALL_SUBTEST_1(sycl_reduction_test_full_per_device<float>(sycl_device)); 1009 CALL_SUBTEST_2(sycl_reduction_full_offset_per_device<float>(sycl_device)); 1010 CALL_SUBTEST_3( 1011 sycl_reduction_test_first_dim_per_device<float>(sycl_device)); 1012 CALL_SUBTEST_4(sycl_reduction_test_last_dim_per_device<float>(sycl_device)); 1013 } 1014 }