cxx11_tensor_executor.cpp (30675B)
1 // This file is part of Eigen, a lightweight C++ template library 2 // for linear algebra. 3 // 4 // Copyright (C) 2018 Eugene Zhulenev <ezhulenev@google.com> 5 // 6 // This Source Code Form is subject to the terms of the Mozilla 7 // Public License v. 2.0. If a copy of the MPL was not distributed 8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 9 10 #define EIGEN_USE_THREADS 11 12 #include "main.h" 13 14 #include <Eigen/CXX11/Tensor> 15 16 using Eigen::Tensor; 17 using Eigen::RowMajor; 18 using Eigen::ColMajor; 19 using Eigen::internal::TiledEvaluation; 20 21 // A set of tests to verify that different TensorExecutor strategies yields the 22 // same results for all the ops, supporting tiled evaluation. 23 24 // Default assignment that does no use block evaluation or vectorization. 25 // We assume that default coefficient evaluation is well tested and correct. 26 template <typename Dst, typename Expr> 27 static void DefaultAssign(Dst& dst, Expr expr) { 28 using Assign = Eigen::TensorAssignOp<Dst, const Expr>; 29 using Executor = 30 Eigen::internal::TensorExecutor<const Assign, DefaultDevice, 31 /*Vectorizable=*/false, 32 /*Tiling=*/TiledEvaluation::Off>; 33 34 Executor::run(Assign(dst, expr), DefaultDevice()); 35 } 36 37 // Assignment with specified device and tiling strategy. 38 template <bool Vectorizable, TiledEvaluation Tiling, typename Device, 39 typename Dst, typename Expr> 40 static void DeviceAssign(Device& d, Dst& dst, Expr expr) { 41 using Assign = Eigen::TensorAssignOp<Dst, const Expr>; 42 using Executor = Eigen::internal::TensorExecutor<const Assign, Device, 43 Vectorizable, Tiling>; 44 45 Executor::run(Assign(dst, expr), d); 46 } 47 48 template <int NumDims> 49 static array<Index, NumDims> RandomDims(int min_dim = 1, int max_dim = 20) { 50 array<Index, NumDims> dims; 51 for (int i = 0; i < NumDims; ++i) { 52 dims[i] = internal::random<int>(min_dim, max_dim); 53 } 54 return dims; 55 } 56 57 template <typename T, int NumDims, typename Device, bool Vectorizable, 58 TiledEvaluation Tiling, int Layout> 59 static void test_execute_unary_expr(Device d) 60 { 61 static constexpr int Options = 0 | Layout; 62 63 // Pick a large enough tensor size to bypass small tensor block evaluation 64 // optimization. 65 auto dims = RandomDims<NumDims>(50 / NumDims, 100 / NumDims); 66 67 Tensor<T, NumDims, Options, Index> src(dims); 68 Tensor<T, NumDims, Options, Index> dst(dims); 69 70 src.setRandom(); 71 const auto expr = src.square(); 72 73 using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>; 74 using Executor = 75 internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>; 76 77 Executor::run(Assign(dst, expr), d); 78 79 for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { 80 T square = src.coeff(i) * src.coeff(i); 81 VERIFY_IS_EQUAL(square, dst.coeff(i)); 82 } 83 } 84 85 template <typename T, int NumDims, typename Device, bool Vectorizable, 86 TiledEvaluation Tiling, int Layout> 87 static void test_execute_binary_expr(Device d) 88 { 89 static constexpr int Options = 0 | Layout; 90 91 // Pick a large enough tensor size to bypass small tensor block evaluation 92 // optimization. 93 auto dims = RandomDims<NumDims>(50 / NumDims, 100 / NumDims); 94 95 Tensor<T, NumDims, Options, Index> lhs(dims); 96 Tensor<T, NumDims, Options, Index> rhs(dims); 97 Tensor<T, NumDims, Options, Index> dst(dims); 98 99 lhs.setRandom(); 100 rhs.setRandom(); 101 102 const auto expr = lhs + rhs; 103 104 using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>; 105 using Executor = 106 internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>; 107 108 Executor::run(Assign(dst, expr), d); 109 110 for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { 111 T sum = lhs.coeff(i) + rhs.coeff(i); 112 VERIFY_IS_EQUAL(sum, dst.coeff(i)); 113 } 114 } 115 116 template <typename T, int NumDims, typename Device, bool Vectorizable, 117 TiledEvaluation Tiling, int Layout> 118 static void test_execute_broadcasting(Device d) 119 { 120 static constexpr int Options = 0 | Layout; 121 122 auto dims = RandomDims<NumDims>(1, 10); 123 Tensor<T, NumDims, Options, Index> src(dims); 124 src.setRandom(); 125 126 const auto broadcasts = RandomDims<NumDims>(1, 7); 127 const auto expr = src.broadcast(broadcasts); 128 129 // We assume that broadcasting on a default device is tested and correct, so 130 // we can rely on it to verify correctness of tensor executor and tiling. 131 Tensor<T, NumDims, Options, Index> golden; 132 golden = expr; 133 134 // Now do the broadcasting using configured tensor executor. 135 Tensor<T, NumDims, Options, Index> dst(golden.dimensions()); 136 137 using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>; 138 using Executor = 139 internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>; 140 141 Executor::run(Assign(dst, expr), d); 142 143 for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { 144 VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i)); 145 } 146 } 147 148 template <typename T, int NumDims, typename Device, bool Vectorizable, 149 TiledEvaluation Tiling, int Layout> 150 static void test_execute_chipping_rvalue(Device d) 151 { 152 auto dims = RandomDims<NumDims>(1, 10); 153 Tensor<T, NumDims, Layout, Index> src(dims); 154 src.setRandom(); 155 156 #define TEST_CHIPPING(CHIP_DIM) \ 157 if (NumDims > (CHIP_DIM)) { \ 158 const auto offset = internal::random<Index>(0, dims[(CHIP_DIM)] - 1); \ 159 const auto expr = src.template chip<(CHIP_DIM)>(offset); \ 160 \ 161 Tensor<T, NumDims - 1, Layout, Index> golden; \ 162 golden = expr; \ 163 \ 164 Tensor<T, NumDims - 1, Layout, Index> dst(golden.dimensions()); \ 165 \ 166 using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>; \ 167 using Executor = internal::TensorExecutor<const Assign, Device, \ 168 Vectorizable, Tiling>; \ 169 \ 170 Executor::run(Assign(dst, expr), d); \ 171 \ 172 for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { \ 173 VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i)); \ 174 } \ 175 } 176 177 TEST_CHIPPING(0) 178 TEST_CHIPPING(1) 179 TEST_CHIPPING(2) 180 TEST_CHIPPING(3) 181 TEST_CHIPPING(4) 182 TEST_CHIPPING(5) 183 184 #undef TEST_CHIPPING 185 } 186 187 template <typename T, int NumDims, typename Device, bool Vectorizable, 188 TiledEvaluation Tiling, int Layout> 189 static void test_execute_chipping_lvalue(Device d) 190 { 191 auto dims = RandomDims<NumDims>(1, 10); 192 193 #define TEST_CHIPPING(CHIP_DIM) \ 194 if (NumDims > (CHIP_DIM)) { \ 195 /* Generate random data that we'll assign to the chipped tensor dim. */ \ 196 array<Index, NumDims - 1> src_dims; \ 197 for (int i = 0; i < NumDims - 1; ++i) { \ 198 int dim = i < (CHIP_DIM) ? i : i + 1; \ 199 src_dims[i] = dims[dim]; \ 200 } \ 201 \ 202 Tensor<T, NumDims - 1, Layout, Index> src(src_dims); \ 203 src.setRandom(); \ 204 \ 205 const auto offset = internal::random<Index>(0, dims[(CHIP_DIM)] - 1); \ 206 \ 207 Tensor<T, NumDims, Layout, Index> random(dims); \ 208 random.setZero(); \ 209 \ 210 Tensor<T, NumDims, Layout, Index> golden(dims); \ 211 golden = random; \ 212 golden.template chip<(CHIP_DIM)>(offset) = src; \ 213 \ 214 Tensor<T, NumDims, Layout, Index> dst(dims); \ 215 dst = random; \ 216 auto expr = dst.template chip<(CHIP_DIM)>(offset); \ 217 \ 218 using Assign = TensorAssignOp<decltype(expr), const decltype(src)>; \ 219 using Executor = internal::TensorExecutor<const Assign, Device, \ 220 Vectorizable, Tiling>; \ 221 \ 222 Executor::run(Assign(expr, src), d); \ 223 \ 224 for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { \ 225 VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i)); \ 226 } \ 227 } 228 229 TEST_CHIPPING(0) 230 TEST_CHIPPING(1) 231 TEST_CHIPPING(2) 232 TEST_CHIPPING(3) 233 TEST_CHIPPING(4) 234 TEST_CHIPPING(5) 235 236 #undef TEST_CHIPPING 237 } 238 239 template <typename T, int NumDims, typename Device, bool Vectorizable, 240 TiledEvaluation Tiling, int Layout> 241 static void test_execute_shuffle_rvalue(Device d) 242 { 243 static constexpr int Options = 0 | Layout; 244 245 auto dims = RandomDims<NumDims>(1, 10); 246 Tensor<T, NumDims, Options, Index> src(dims); 247 src.setRandom(); 248 249 DSizes<Index, NumDims> shuffle; 250 for (int i = 0; i < NumDims; ++i) shuffle[i] = i; 251 252 // Test all possible shuffle permutations. 253 do { 254 DSizes<Index, NumDims> shuffled_dims; 255 for (int i = 0; i < NumDims; ++i) { 256 shuffled_dims[i] = dims[shuffle[i]]; 257 } 258 259 const auto expr = src.shuffle(shuffle); 260 261 // We assume that shuffling on a default device is tested and correct, so 262 // we can rely on it to verify correctness of tensor executor and tiling. 263 Tensor<T, NumDims, Options, Index> golden(shuffled_dims); 264 DefaultAssign(golden, expr); 265 266 // Now do the shuffling using configured tensor executor. 267 Tensor<T, NumDims, Options, Index> dst(shuffled_dims); 268 DeviceAssign<Vectorizable, Tiling>(d, dst, expr); 269 270 for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { 271 VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i)); 272 } 273 274 } while (std::next_permutation(&shuffle[0], &shuffle[0] + NumDims)); 275 } 276 277 template <typename T, int NumDims, typename Device, bool Vectorizable, 278 TiledEvaluation Tiling, int Layout> 279 static void test_execute_shuffle_lvalue(Device d) 280 { 281 static constexpr int Options = 0 | Layout; 282 283 auto dims = RandomDims<NumDims>(5, 10); 284 Tensor<T, NumDims, Options, Index> src(dims); 285 src.setRandom(); 286 287 DSizes<Index, NumDims> shuffle; 288 for (int i = 0; i < NumDims; ++i) shuffle[i] = i; 289 290 // Test all possible shuffle permutations. 291 do { 292 DSizes<Index, NumDims> shuffled_dims; 293 for (int i = 0; i < NumDims; ++i) shuffled_dims[shuffle[i]] = dims[i]; 294 295 // We assume that shuffling on a default device is tested and correct, so 296 // we can rely on it to verify correctness of tensor executor and tiling. 297 Tensor<T, NumDims, Options, Index> golden(shuffled_dims); 298 auto golden_shuffle = golden.shuffle(shuffle); 299 DefaultAssign(golden_shuffle, src); 300 301 // Now do the shuffling using configured tensor executor. 302 Tensor<T, NumDims, Options, Index> dst(shuffled_dims); 303 auto dst_shuffle = dst.shuffle(shuffle); 304 DeviceAssign<Vectorizable, Tiling>(d, dst_shuffle, src); 305 306 for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { 307 VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i)); 308 } 309 310 } while (std::next_permutation(&shuffle[0], &shuffle[0] + NumDims)); 311 } 312 313 template <typename T, int NumDims, typename Device, bool Vectorizable, 314 TiledEvaluation Tiling, int Layout> 315 static void test_execute_reshape(Device d) 316 { 317 static_assert(NumDims >= 2, "NumDims must be greater or equal than 2"); 318 319 static constexpr int ReshapedDims = NumDims - 1; 320 static constexpr int Options = 0 | Layout; 321 322 auto dims = RandomDims<NumDims>(5, 10); 323 Tensor<T, NumDims, Options, Index> src(dims); 324 src.setRandom(); 325 326 // Multiple 0th dimension and then shuffle. 327 std::vector<Index> shuffle; 328 for (int i = 0; i < ReshapedDims; ++i) shuffle.push_back(i); 329 std::shuffle(shuffle.begin(), shuffle.end(), std::mt19937()); 330 331 DSizes<Index, ReshapedDims> reshaped_dims; 332 reshaped_dims[shuffle[0]] = dims[0] * dims[1]; 333 for (int i = 1; i < ReshapedDims; ++i) reshaped_dims[shuffle[i]] = dims[i + 1]; 334 335 Tensor<T, ReshapedDims, Options, Index> golden = src.reshape(reshaped_dims); 336 337 // Now reshape using configured tensor executor. 338 Tensor<T, ReshapedDims, Options, Index> dst(golden.dimensions()); 339 340 auto expr = src.reshape(reshaped_dims); 341 342 using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>; 343 using Executor = 344 internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>; 345 346 Executor::run(Assign(dst, expr), d); 347 348 for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { 349 VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i)); 350 } 351 } 352 353 template <typename T, int NumDims, typename Device, bool Vectorizable, 354 TiledEvaluation Tiling, int Layout> 355 static void test_execute_slice_rvalue(Device d) 356 { 357 static_assert(NumDims >= 2, "NumDims must be greater or equal than 2"); 358 static constexpr int Options = 0 | Layout; 359 360 auto dims = RandomDims<NumDims>(5, 10); 361 Tensor<T, NumDims, Options, Index> src(dims); 362 src.setRandom(); 363 364 // Pick a random slice of src tensor. 365 auto slice_start = DSizes<Index, NumDims>(RandomDims<NumDims>()); 366 auto slice_size = DSizes<Index, NumDims>(RandomDims<NumDims>()); 367 368 // Make sure that slice start + size do not overflow tensor dims. 369 for (int i = 0; i < NumDims; ++i) { 370 slice_start[i] = numext::mini(dims[i] - 1, slice_start[i]); 371 slice_size[i] = numext::mini(slice_size[i], dims[i] - slice_start[i]); 372 } 373 374 Tensor<T, NumDims, Options, Index> golden = 375 src.slice(slice_start, slice_size); 376 377 // Now reshape using configured tensor executor. 378 Tensor<T, NumDims, Options, Index> dst(golden.dimensions()); 379 380 auto expr = src.slice(slice_start, slice_size); 381 382 using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>; 383 using Executor = 384 internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>; 385 386 Executor::run(Assign(dst, expr), d); 387 388 for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { 389 VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i)); 390 } 391 } 392 393 template <typename T, int NumDims, typename Device, bool Vectorizable, 394 TiledEvaluation Tiling, int Layout> 395 static void test_execute_slice_lvalue(Device d) 396 { 397 static_assert(NumDims >= 2, "NumDims must be greater or equal than 2"); 398 static constexpr int Options = 0 | Layout; 399 400 auto dims = RandomDims<NumDims>(5, 10); 401 Tensor<T, NumDims, Options, Index> src(dims); 402 src.setRandom(); 403 404 // Pick a random slice of src tensor. 405 auto slice_start = DSizes<Index, NumDims>(RandomDims<NumDims>(1, 10)); 406 auto slice_size = DSizes<Index, NumDims>(RandomDims<NumDims>(1, 10)); 407 408 // Make sure that slice start + size do not overflow tensor dims. 409 for (int i = 0; i < NumDims; ++i) { 410 slice_start[i] = numext::mini(dims[i] - 1, slice_start[i]); 411 slice_size[i] = numext::mini(slice_size[i], dims[i] - slice_start[i]); 412 } 413 414 Tensor<T, NumDims, Options, Index> slice(slice_size); 415 slice.setRandom(); 416 417 // Assign a slice using default executor. 418 Tensor<T, NumDims, Options, Index> golden = src; 419 golden.slice(slice_start, slice_size) = slice; 420 421 // And using configured execution strategy. 422 Tensor<T, NumDims, Options, Index> dst = src; 423 auto expr = dst.slice(slice_start, slice_size); 424 425 using Assign = TensorAssignOp<decltype(expr), const decltype(slice)>; 426 using Executor = 427 internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>; 428 429 Executor::run(Assign(expr, slice), d); 430 431 for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { 432 VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i)); 433 } 434 } 435 436 template <typename T, int NumDims, typename Device, bool Vectorizable, 437 TiledEvaluation Tiling, int Layout> 438 static void test_execute_broadcasting_of_forced_eval(Device d) 439 { 440 static constexpr int Options = 0 | Layout; 441 442 auto dims = RandomDims<NumDims>(1, 10); 443 Tensor<T, NumDims, Options, Index> src(dims); 444 src.setRandom(); 445 446 const auto broadcasts = RandomDims<NumDims>(1, 7); 447 const auto expr = src.square().eval().broadcast(broadcasts); 448 449 // We assume that broadcasting on a default device is tested and correct, so 450 // we can rely on it to verify correctness of tensor executor and tiling. 451 Tensor<T, NumDims, Options, Index> golden; 452 golden = expr; 453 454 // Now do the broadcasting using configured tensor executor. 455 Tensor<T, NumDims, Options, Index> dst(golden.dimensions()); 456 457 using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>; 458 using Executor = 459 internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>; 460 461 Executor::run(Assign(dst, expr), d); 462 463 for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { 464 VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i)); 465 } 466 } 467 468 template<typename T, int NumDims> 469 struct DummyGenerator { 470 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE 471 T operator()(const array <Index, NumDims>& dims) const { 472 T result = static_cast<T>(0); 473 for (int i = 0; i < NumDims; ++i) { 474 result += static_cast<T>((i + 1) * dims[i]); 475 } 476 return result; 477 } 478 }; 479 480 template <typename T, int NumDims, typename Device, bool Vectorizable, 481 TiledEvaluation Tiling, int Layout> 482 static void test_execute_generator_op(Device d) 483 { 484 static constexpr int Options = 0 | Layout; 485 486 auto dims = RandomDims<NumDims>(20, 30); 487 Tensor<T, NumDims, Options, Index> src(dims); 488 src.setRandom(); 489 490 const auto expr = src.generate(DummyGenerator<T, NumDims>()); 491 492 // We assume that generator on a default device is tested and correct, so 493 // we can rely on it to verify correctness of tensor executor and tiling. 494 Tensor<T, NumDims, Options, Index> golden; 495 golden = expr; 496 497 // Now do the broadcasting using configured tensor executor. 498 Tensor<T, NumDims, Options, Index> dst(golden.dimensions()); 499 500 using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>; 501 using Executor = 502 internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>; 503 504 Executor::run(Assign(dst, expr), d); 505 506 for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { 507 VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i)); 508 } 509 } 510 511 template <typename T, int NumDims, typename Device, bool Vectorizable, 512 TiledEvaluation Tiling, int Layout> 513 static void test_execute_reverse_rvalue(Device d) 514 { 515 static constexpr int Options = 0 | Layout; 516 517 auto dims = RandomDims<NumDims>(1, numext::pow(1000000.0, 1.0 / NumDims)); 518 Tensor <T, NumDims, Options, Index> src(dims); 519 src.setRandom(); 520 521 // Reverse half of the dimensions. 522 Eigen::array<bool, NumDims> reverse; 523 for (int i = 0; i < NumDims; ++i) reverse[i] = internal::random<bool>(); 524 525 const auto expr = src.reverse(reverse); 526 527 // We assume that reversing on a default device is tested and correct, so 528 // we can rely on it to verify correctness of tensor executor and tiling. 529 Tensor <T, NumDims, Options, Index> golden; 530 golden = expr; 531 532 // Now do the reversing using configured tensor executor. 533 Tensor <T, NumDims, Options, Index> dst(golden.dimensions()); 534 535 using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>; 536 using Executor = 537 internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>; 538 539 Executor::run(Assign(dst, expr), d); 540 541 for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { 542 VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i)); 543 } 544 } 545 546 template <typename T, int NumDims, typename Device, bool Vectorizable, 547 TiledEvaluation Tiling, int Layout> 548 static void test_async_execute_unary_expr(Device d) 549 { 550 static constexpr int Options = 0 | Layout; 551 552 // Pick a large enough tensor size to bypass small tensor block evaluation 553 // optimization. 554 auto dims = RandomDims<NumDims>(50 / NumDims, 100 / NumDims); 555 556 Tensor<T, NumDims, Options, Index> src(dims); 557 Tensor<T, NumDims, Options, Index> dst(dims); 558 559 src.setRandom(); 560 const auto expr = src.square(); 561 562 Eigen::Barrier done(1); 563 auto on_done = [&done]() { done.Notify(); }; 564 565 using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>; 566 using DoneCallback = decltype(on_done); 567 using Executor = internal::TensorAsyncExecutor<const Assign, Device, DoneCallback, 568 Vectorizable, Tiling>; 569 570 Executor::runAsync(Assign(dst, expr), d, on_done); 571 done.Wait(); 572 573 for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { 574 T square = src.coeff(i) * src.coeff(i); 575 VERIFY_IS_EQUAL(square, dst.coeff(i)); 576 } 577 } 578 579 template <typename T, int NumDims, typename Device, bool Vectorizable, 580 TiledEvaluation Tiling, int Layout> 581 static void test_async_execute_binary_expr(Device d) 582 { 583 static constexpr int Options = 0 | Layout; 584 585 // Pick a large enough tensor size to bypass small tensor block evaluation 586 // optimization. 587 auto dims = RandomDims<NumDims>(50 / NumDims, 100 / NumDims); 588 589 Tensor<T, NumDims, Options, Index> lhs(dims); 590 Tensor<T, NumDims, Options, Index> rhs(dims); 591 Tensor<T, NumDims, Options, Index> dst(dims); 592 593 lhs.setRandom(); 594 rhs.setRandom(); 595 596 const auto expr = lhs + rhs; 597 598 Eigen::Barrier done(1); 599 auto on_done = [&done]() { done.Notify(); }; 600 601 using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>; 602 using DoneCallback = decltype(on_done); 603 using Executor = internal::TensorAsyncExecutor<const Assign, Device, DoneCallback, 604 Vectorizable, Tiling>; 605 606 Executor::runAsync(Assign(dst, expr), d, on_done); 607 done.Wait(); 608 609 for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { 610 T sum = lhs.coeff(i) + rhs.coeff(i); 611 VERIFY_IS_EQUAL(sum, dst.coeff(i)); 612 } 613 } 614 615 #ifdef EIGEN_DONT_VECTORIZE 616 #define VECTORIZABLE(VAL) !EIGEN_DONT_VECTORIZE && VAL 617 #else 618 #define VECTORIZABLE(VAL) VAL 619 #endif 620 621 #define CALL_SUBTEST_PART(PART) \ 622 CALL_SUBTEST_##PART 623 624 #define CALL_SUBTEST_COMBINATIONS(PART, NAME, T, NUM_DIMS) \ 625 CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, TiledEvaluation::Off, ColMajor>(default_device))); \ 626 CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, TiledEvaluation::On, ColMajor>(default_device))); \ 627 CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), TiledEvaluation::Off, ColMajor>(default_device))); \ 628 CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), TiledEvaluation::On, ColMajor>(default_device))); \ 629 CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, TiledEvaluation::Off, RowMajor>(default_device))); \ 630 CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, TiledEvaluation::On, RowMajor>(default_device))); \ 631 CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), TiledEvaluation::Off, RowMajor>(default_device))); \ 632 CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), TiledEvaluation::On, RowMajor>(default_device))); \ 633 CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::Off, ColMajor>(tp_device))); \ 634 CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::On, ColMajor>(tp_device))); \ 635 CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::Off, ColMajor>(tp_device))); \ 636 CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::On, ColMajor>(tp_device))); \ 637 CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::Off, RowMajor>(tp_device))); \ 638 CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::On, RowMajor>(tp_device))); \ 639 CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::Off, RowMajor>(tp_device))); \ 640 CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::On, RowMajor>(tp_device))) 641 642 // NOTE: Currently only ThreadPoolDevice supports async expression evaluation. 643 #define CALL_ASYNC_SUBTEST_COMBINATIONS(PART, NAME, T, NUM_DIMS) \ 644 CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::Off, ColMajor>(tp_device))); \ 645 CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::On, ColMajor>(tp_device))); \ 646 CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::Off, ColMajor>(tp_device))); \ 647 CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::On, ColMajor>(tp_device))); \ 648 CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::Off, RowMajor>(tp_device))); \ 649 CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::On, RowMajor>(tp_device))); \ 650 CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::Off, RowMajor>(tp_device))); \ 651 CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::On, RowMajor>(tp_device))) 652 653 EIGEN_DECLARE_TEST(cxx11_tensor_executor) { 654 Eigen::DefaultDevice default_device; 655 // Default device is unused in ASYNC tests. 656 EIGEN_UNUSED_VARIABLE(default_device); 657 658 const auto num_threads = internal::random<int>(20, 24); 659 Eigen::ThreadPool tp(num_threads); 660 Eigen::ThreadPoolDevice tp_device(&tp, num_threads); 661 662 CALL_SUBTEST_COMBINATIONS(1, test_execute_unary_expr, float, 3); 663 CALL_SUBTEST_COMBINATIONS(1, test_execute_unary_expr, float, 4); 664 CALL_SUBTEST_COMBINATIONS(1, test_execute_unary_expr, float, 5); 665 666 CALL_SUBTEST_COMBINATIONS(2, test_execute_binary_expr, float, 3); 667 CALL_SUBTEST_COMBINATIONS(2, test_execute_binary_expr, float, 4); 668 CALL_SUBTEST_COMBINATIONS(2, test_execute_binary_expr, float, 5); 669 670 CALL_SUBTEST_COMBINATIONS(3, test_execute_broadcasting, float, 3); 671 CALL_SUBTEST_COMBINATIONS(3, test_execute_broadcasting, float, 4); 672 CALL_SUBTEST_COMBINATIONS(3, test_execute_broadcasting, float, 5); 673 674 CALL_SUBTEST_COMBINATIONS(4, test_execute_chipping_rvalue, float, 3); 675 CALL_SUBTEST_COMBINATIONS(4, test_execute_chipping_rvalue, float, 4); 676 CALL_SUBTEST_COMBINATIONS(4, test_execute_chipping_rvalue, float, 5); 677 678 CALL_SUBTEST_COMBINATIONS(5, test_execute_chipping_lvalue, float, 3); 679 CALL_SUBTEST_COMBINATIONS(5, test_execute_chipping_lvalue, float, 4); 680 CALL_SUBTEST_COMBINATIONS(5, test_execute_chipping_lvalue, float, 5); 681 682 CALL_SUBTEST_COMBINATIONS(6, test_execute_shuffle_rvalue, float, 3); 683 CALL_SUBTEST_COMBINATIONS(6, test_execute_shuffle_rvalue, float, 4); 684 CALL_SUBTEST_COMBINATIONS(6, test_execute_shuffle_rvalue, float, 5); 685 686 CALL_SUBTEST_COMBINATIONS(7, test_execute_shuffle_lvalue, float, 3); 687 CALL_SUBTEST_COMBINATIONS(7, test_execute_shuffle_lvalue, float, 4); 688 CALL_SUBTEST_COMBINATIONS(7, test_execute_shuffle_lvalue, float, 5); 689 690 CALL_SUBTEST_COMBINATIONS(9, test_execute_reshape, float, 2); 691 CALL_SUBTEST_COMBINATIONS(9, test_execute_reshape, float, 3); 692 CALL_SUBTEST_COMBINATIONS(9, test_execute_reshape, float, 4); 693 CALL_SUBTEST_COMBINATIONS(9, test_execute_reshape, float, 5); 694 695 CALL_SUBTEST_COMBINATIONS(10, test_execute_slice_rvalue, float, 2); 696 CALL_SUBTEST_COMBINATIONS(10, test_execute_slice_rvalue, float, 3); 697 CALL_SUBTEST_COMBINATIONS(10, test_execute_slice_rvalue, float, 4); 698 CALL_SUBTEST_COMBINATIONS(10, test_execute_slice_rvalue, float, 5); 699 700 CALL_SUBTEST_COMBINATIONS(11, test_execute_slice_lvalue, float, 2); 701 CALL_SUBTEST_COMBINATIONS(11, test_execute_slice_lvalue, float, 3); 702 CALL_SUBTEST_COMBINATIONS(11, test_execute_slice_lvalue, float, 4); 703 CALL_SUBTEST_COMBINATIONS(11, test_execute_slice_lvalue, float, 5); 704 705 CALL_SUBTEST_COMBINATIONS(12, test_execute_broadcasting_of_forced_eval, float, 2); 706 CALL_SUBTEST_COMBINATIONS(12, test_execute_broadcasting_of_forced_eval, float, 3); 707 CALL_SUBTEST_COMBINATIONS(12, test_execute_broadcasting_of_forced_eval, float, 4); 708 CALL_SUBTEST_COMBINATIONS(12, test_execute_broadcasting_of_forced_eval, float, 5); 709 710 CALL_SUBTEST_COMBINATIONS(13, test_execute_generator_op, float, 2); 711 CALL_SUBTEST_COMBINATIONS(13, test_execute_generator_op, float, 3); 712 CALL_SUBTEST_COMBINATIONS(13, test_execute_generator_op, float, 4); 713 CALL_SUBTEST_COMBINATIONS(13, test_execute_generator_op, float, 5); 714 715 CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 1); 716 CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 2); 717 CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 3); 718 CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 4); 719 CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 5); 720 721 CALL_ASYNC_SUBTEST_COMBINATIONS(15, test_async_execute_unary_expr, float, 3); 722 CALL_ASYNC_SUBTEST_COMBINATIONS(15, test_async_execute_unary_expr, float, 4); 723 CALL_ASYNC_SUBTEST_COMBINATIONS(15, test_async_execute_unary_expr, float, 5); 724 725 CALL_ASYNC_SUBTEST_COMBINATIONS(16, test_async_execute_binary_expr, float, 3); 726 CALL_ASYNC_SUBTEST_COMBINATIONS(16, test_async_execute_binary_expr, float, 4); 727 CALL_ASYNC_SUBTEST_COMBINATIONS(16, test_async_execute_binary_expr, float, 5); 728 729 // Force CMake to split this test. 730 // EIGEN_SUFFIXES;1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16 731 }