cxx11_tensor_thread_pool.cpp (25491B)
1 // This file is part of Eigen, a lightweight C++ template library 2 // for linear algebra. 3 // 4 // Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> 5 // 6 // This Source Code Form is subject to the terms of the Mozilla 7 // Public License v. 2.0. If a copy of the MPL was not distributed 8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 9 10 #define EIGEN_USE_THREADS 11 12 13 #include "main.h" 14 #include <iostream> 15 #include <Eigen/CXX11/Tensor> 16 17 using Eigen::Tensor; 18 19 class TestAllocator : public Allocator { 20 public: 21 ~TestAllocator() EIGEN_OVERRIDE {} 22 EIGEN_DEVICE_FUNC void* allocate(size_t num_bytes) const EIGEN_OVERRIDE { 23 const_cast<TestAllocator*>(this)->alloc_count_++; 24 return internal::aligned_malloc(num_bytes); 25 } 26 EIGEN_DEVICE_FUNC void deallocate(void* buffer) const EIGEN_OVERRIDE { 27 const_cast<TestAllocator*>(this)->dealloc_count_++; 28 internal::aligned_free(buffer); 29 } 30 31 int alloc_count() const { return alloc_count_; } 32 int dealloc_count() const { return dealloc_count_; } 33 34 private: 35 int alloc_count_ = 0; 36 int dealloc_count_ = 0; 37 }; 38 39 void test_multithread_elementwise() 40 { 41 Tensor<float, 3> in1(200, 30, 70); 42 Tensor<float, 3> in2(200, 30, 70); 43 Tensor<double, 3> out(200, 30, 70); 44 45 in1.setRandom(); 46 in2.setRandom(); 47 48 Eigen::ThreadPool tp(internal::random<int>(3, 11)); 49 Eigen::ThreadPoolDevice thread_pool_device(&tp, internal::random<int>(3, 11)); 50 out.device(thread_pool_device) = (in1 + in2 * 3.14f).cast<double>(); 51 52 for (int i = 0; i < 200; ++i) { 53 for (int j = 0; j < 30; ++j) { 54 for (int k = 0; k < 70; ++k) { 55 VERIFY_IS_APPROX(out(i, j, k), static_cast<double>(in1(i, j, k) + in2(i, j, k) * 3.14f)); 56 } 57 } 58 } 59 } 60 61 void test_async_multithread_elementwise() 62 { 63 Tensor<float, 3> in1(200, 30, 70); 64 Tensor<float, 3> in2(200, 30, 70); 65 Tensor<double, 3> out(200, 30, 70); 66 67 in1.setRandom(); 68 in2.setRandom(); 69 70 Eigen::ThreadPool tp(internal::random<int>(3, 11)); 71 Eigen::ThreadPoolDevice thread_pool_device(&tp, internal::random<int>(3, 11)); 72 73 Eigen::Barrier b(1); 74 out.device(thread_pool_device, [&b]() { b.Notify(); }) = (in1 + in2 * 3.14f).cast<double>(); 75 b.Wait(); 76 77 for (int i = 0; i < 200; ++i) { 78 for (int j = 0; j < 30; ++j) { 79 for (int k = 0; k < 70; ++k) { 80 VERIFY_IS_APPROX(out(i, j, k), static_cast<double>(in1(i, j, k) + in2(i, j, k) * 3.14f)); 81 } 82 } 83 } 84 } 85 86 void test_multithread_compound_assignment() 87 { 88 Tensor<float, 3> in1(2,3,7); 89 Tensor<float, 3> in2(2,3,7); 90 Tensor<float, 3> out(2,3,7); 91 92 in1.setRandom(); 93 in2.setRandom(); 94 95 Eigen::ThreadPool tp(internal::random<int>(3, 11)); 96 Eigen::ThreadPoolDevice thread_pool_device(&tp, internal::random<int>(3, 11)); 97 out.device(thread_pool_device) = in1; 98 out.device(thread_pool_device) += in2 * 3.14f; 99 100 for (int i = 0; i < 2; ++i) { 101 for (int j = 0; j < 3; ++j) { 102 for (int k = 0; k < 7; ++k) { 103 VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f); 104 } 105 } 106 } 107 } 108 109 template<int DataLayout> 110 void test_multithread_contraction() 111 { 112 Tensor<float, 4, DataLayout> t_left(30, 50, 37, 31); 113 Tensor<float, 5, DataLayout> t_right(37, 31, 70, 2, 10); 114 Tensor<float, 5, DataLayout> t_result(30, 50, 70, 2, 10); 115 116 t_left.setRandom(); 117 t_right.setRandom(); 118 119 // this contraction should be equivalent to a single matrix multiplication 120 typedef Tensor<float, 1>::DimensionPair DimPair; 121 Eigen::array<DimPair, 2> dims({{DimPair(2, 0), DimPair(3, 1)}}); 122 123 typedef Map<Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf; 124 MapXf m_left(t_left.data(), 1500, 1147); 125 MapXf m_right(t_right.data(), 1147, 1400); 126 Matrix<float, Dynamic, Dynamic, DataLayout> m_result(1500, 1400); 127 128 Eigen::ThreadPool tp(4); 129 Eigen::ThreadPoolDevice thread_pool_device(&tp, 4); 130 131 // compute results by separate methods 132 t_result.device(thread_pool_device) = t_left.contract(t_right, dims); 133 m_result = m_left * m_right; 134 135 for (ptrdiff_t i = 0; i < t_result.size(); i++) { 136 VERIFY(&t_result.data()[i] != &m_result.data()[i]); 137 if (fabsf(t_result(i) - m_result(i)) < 1e-4f) { 138 continue; 139 } 140 if (Eigen::internal::isApprox(t_result(i), m_result(i), 1e-4f)) { 141 continue; 142 } 143 std::cout << "mismatch detected at index " << i << ": " << t_result(i) 144 << " vs " << m_result(i) << std::endl; 145 assert(false); 146 } 147 } 148 149 template<int DataLayout> 150 void test_contraction_corner_cases() 151 { 152 Tensor<float, 2, DataLayout> t_left(32, 500); 153 Tensor<float, 2, DataLayout> t_right(32, 28*28); 154 Tensor<float, 2, DataLayout> t_result(500, 28*28); 155 156 t_left = (t_left.constant(-0.5f) + t_left.random()) * 2.0f; 157 t_right = (t_right.constant(-0.6f) + t_right.random()) * 2.0f; 158 t_result = t_result.constant(NAN); 159 160 // this contraction should be equivalent to a single matrix multiplication 161 typedef Tensor<float, 1>::DimensionPair DimPair; 162 Eigen::array<DimPair, 1> dims{{DimPair(0, 0)}}; 163 164 typedef Map<Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf; 165 MapXf m_left(t_left.data(), 32, 500); 166 MapXf m_right(t_right.data(), 32, 28*28); 167 Matrix<float, Dynamic, Dynamic, DataLayout> m_result(500, 28*28); 168 169 Eigen::ThreadPool tp(12); 170 Eigen::ThreadPoolDevice thread_pool_device(&tp, 12); 171 172 // compute results by separate methods 173 t_result.device(thread_pool_device) = t_left.contract(t_right, dims); 174 m_result = m_left.transpose() * m_right; 175 176 for (ptrdiff_t i = 0; i < t_result.size(); i++) { 177 assert(!(numext::isnan)(t_result.data()[i])); 178 if (fabsf(t_result.data()[i] - m_result.data()[i]) >= 1e-4f) { 179 std::cout << "mismatch detected at index " << i << " : " << t_result.data()[i] << " vs " << m_result.data()[i] << std::endl; 180 assert(false); 181 } 182 } 183 184 t_left.resize(32, 1); 185 t_left = (t_left.constant(-0.5f) + t_left.random()) * 2.0f; 186 t_result.resize (1, 28*28); 187 t_result = t_result.constant(NAN); 188 t_result.device(thread_pool_device) = t_left.contract(t_right, dims); 189 new(&m_left) MapXf(t_left.data(), 32, 1); 190 m_result = m_left.transpose() * m_right; 191 for (ptrdiff_t i = 0; i < t_result.size(); i++) { 192 assert(!(numext::isnan)(t_result.data()[i])); 193 if (fabsf(t_result.data()[i] - m_result.data()[i]) >= 1e-4f) { 194 std::cout << "mismatch detected: " << t_result.data()[i] << " vs " << m_result.data()[i] << std::endl; 195 assert(false); 196 } 197 } 198 199 t_left.resize(32, 500); 200 t_right.resize(32, 4); 201 t_left = (t_left.constant(-0.5f) + t_left.random()) * 2.0f; 202 t_right = (t_right.constant(-0.6f) + t_right.random()) * 2.0f; 203 t_result.resize (500, 4); 204 t_result = t_result.constant(NAN); 205 t_result.device(thread_pool_device) = t_left.contract(t_right, dims); 206 new(&m_left) MapXf(t_left.data(), 32, 500); 207 new(&m_right) MapXf(t_right.data(), 32, 4); 208 m_result = m_left.transpose() * m_right; 209 for (ptrdiff_t i = 0; i < t_result.size(); i++) { 210 assert(!(numext::isnan)(t_result.data()[i])); 211 if (fabsf(t_result.data()[i] - m_result.data()[i]) >= 1e-4f) { 212 std::cout << "mismatch detected: " << t_result.data()[i] << " vs " << m_result.data()[i] << std::endl; 213 assert(false); 214 } 215 } 216 217 t_left.resize(32, 1); 218 t_right.resize(32, 4); 219 t_left = (t_left.constant(-0.5f) + t_left.random()) * 2.0f; 220 t_right = (t_right.constant(-0.6f) + t_right.random()) * 2.0f; 221 t_result.resize (1, 4); 222 t_result = t_result.constant(NAN); 223 t_result.device(thread_pool_device) = t_left.contract(t_right, dims); 224 new(&m_left) MapXf(t_left.data(), 32, 1); 225 new(&m_right) MapXf(t_right.data(), 32, 4); 226 m_result = m_left.transpose() * m_right; 227 for (ptrdiff_t i = 0; i < t_result.size(); i++) { 228 assert(!(numext::isnan)(t_result.data()[i])); 229 if (fabsf(t_result.data()[i] - m_result.data()[i]) >= 1e-4f) { 230 std::cout << "mismatch detected: " << t_result.data()[i] << " vs " << m_result.data()[i] << std::endl; 231 assert(false); 232 } 233 } 234 } 235 236 template<int DataLayout> 237 void test_multithread_contraction_agrees_with_singlethread() { 238 int contract_size = internal::random<int>(1, 5000); 239 240 Tensor<float, 3, DataLayout> left(internal::random<int>(1, 80), 241 contract_size, 242 internal::random<int>(1, 100)); 243 244 Tensor<float, 4, DataLayout> right(internal::random<int>(1, 25), 245 internal::random<int>(1, 37), 246 contract_size, 247 internal::random<int>(1, 51)); 248 249 left.setRandom(); 250 right.setRandom(); 251 252 // add constants to shift values away from 0 for more precision 253 left += left.constant(1.5f); 254 right += right.constant(1.5f); 255 256 typedef Tensor<float, 1>::DimensionPair DimPair; 257 Eigen::array<DimPair, 1> dims({{DimPair(1, 2)}}); 258 259 Eigen::ThreadPool tp(internal::random<int>(2, 11)); 260 Eigen::ThreadPoolDevice thread_pool_device(&tp, internal::random<int>(2, 11)); 261 262 Tensor<float, 5, DataLayout> st_result; 263 st_result = left.contract(right, dims); 264 265 Tensor<float, 5, DataLayout> tp_result(st_result.dimensions()); 266 tp_result.device(thread_pool_device) = left.contract(right, dims); 267 268 VERIFY(dimensions_match(st_result.dimensions(), tp_result.dimensions())); 269 for (ptrdiff_t i = 0; i < st_result.size(); i++) { 270 // if both of the values are very small, then do nothing (because the test will fail 271 // due to numerical precision issues when values are small) 272 if (numext::abs(st_result.data()[i] - tp_result.data()[i]) >= 1e-4f) { 273 VERIFY_IS_APPROX(st_result.data()[i], tp_result.data()[i]); 274 } 275 } 276 } 277 278 // Apply Sqrt to all output elements. 279 struct SqrtOutputKernel { 280 template <typename Index, typename Scalar> 281 EIGEN_ALWAYS_INLINE void operator()( 282 const internal::blas_data_mapper<Scalar, Index, ColMajor>& output_mapper, 283 const TensorContractionParams&, Index, Index, Index num_rows, 284 Index num_cols) const { 285 for (int i = 0; i < num_rows; ++i) { 286 for (int j = 0; j < num_cols; ++j) { 287 output_mapper(i, j) = std::sqrt(output_mapper(i, j)); 288 } 289 } 290 } 291 }; 292 293 template <int DataLayout> 294 static void test_multithread_contraction_with_output_kernel() { 295 typedef Tensor<float, 1>::DimensionPair DimPair; 296 297 const int num_threads = internal::random<int>(2, 11); 298 ThreadPool threads(num_threads); 299 Eigen::ThreadPoolDevice device(&threads, num_threads); 300 301 Tensor<float, 4, DataLayout> t_left(30, 50, 8, 31); 302 Tensor<float, 5, DataLayout> t_right(8, 31, 7, 20, 10); 303 Tensor<float, 5, DataLayout> t_result(30, 50, 7, 20, 10); 304 305 t_left.setRandom(); 306 t_right.setRandom(); 307 // Put trash in mat4 to verify contraction clears output memory. 308 t_result.setRandom(); 309 310 // Add a little offset so that the results won't be close to zero. 311 t_left += t_left.constant(1.0f); 312 t_right += t_right.constant(1.0f); 313 314 typedef Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf; 315 MapXf m_left(t_left.data(), 1500, 248); 316 MapXf m_right(t_right.data(), 248, 1400); 317 Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(1500, 1400); 318 319 // this contraction should be equivalent to a single matrix multiplication 320 Eigen::array<DimPair, 2> dims({{DimPair(2, 0), DimPair(3, 1)}}); 321 322 // compute results by separate methods 323 t_result.device(device) = t_left.contract(t_right, dims, SqrtOutputKernel()); 324 325 m_result = m_left * m_right; 326 327 for (Index i = 0; i < t_result.dimensions().TotalSize(); i++) { 328 VERIFY(&t_result.data()[i] != &m_result.data()[i]); 329 VERIFY_IS_APPROX(t_result.data()[i], std::sqrt(m_result.data()[i])); 330 } 331 } 332 333 template<int DataLayout> 334 void test_async_multithread_contraction_agrees_with_singlethread() 335 { 336 int contract_size = internal::random<int>(100, 500); 337 338 Tensor<float, 3, DataLayout> left(internal::random<int>(10, 40), 339 contract_size, 340 internal::random<int>(10, 40)); 341 342 Tensor<float, 4, DataLayout> right( 343 internal::random<int>(1, 20), internal::random<int>(1, 20), contract_size, 344 internal::random<int>(1, 20)); 345 346 left.setRandom(); 347 right.setRandom(); 348 349 // add constants to shift values away from 0 for more precision 350 left += left.constant(1.5f); 351 right += right.constant(1.5f); 352 353 typedef Tensor<float, 1>::DimensionPair DimPair; 354 Eigen::array<DimPair, 1> dims({{DimPair(1, 2)}}); 355 356 Eigen::ThreadPool tp(internal::random<int>(2, 11)); 357 Eigen::ThreadPoolDevice thread_pool_device(&tp, internal::random<int>(8, 32)); 358 359 Tensor<float, 5, DataLayout> st_result; 360 st_result = left.contract(right, dims); 361 362 Tensor<float, 5, DataLayout> tp_result(st_result.dimensions()); 363 364 Eigen::Barrier barrier(1); 365 tp_result.device(thread_pool_device, [&barrier]() { barrier.Notify(); }) = 366 left.contract(right, dims); 367 barrier.Wait(); 368 369 VERIFY(dimensions_match(st_result.dimensions(), tp_result.dimensions())); 370 for (ptrdiff_t i = 0; i < st_result.size(); i++) { 371 // if both of the values are very small, then do nothing (because the test 372 // will fail due to numerical precision issues when values are small) 373 if (numext::abs(st_result.data()[i] - tp_result.data()[i]) >= 1e-4f) { 374 VERIFY_IS_APPROX(st_result.data()[i], tp_result.data()[i]); 375 } 376 } 377 } 378 379 // We are triggering 'evalShardedByInnerDim' optimization. 380 template <int DataLayout> 381 static void test_sharded_by_inner_dim_contraction() 382 { 383 typedef Tensor<float, 1>::DimensionPair DimPair; 384 385 const int num_threads = internal::random<int>(4, 16); 386 ThreadPool threads(num_threads); 387 Eigen::ThreadPoolDevice device(&threads, num_threads); 388 389 Tensor<float, 2, DataLayout> t_left(2, 10000); 390 Tensor<float, 2, DataLayout> t_right(10000, 10); 391 Tensor<float, 2, DataLayout> t_result(2, 10); 392 393 t_left.setRandom(); 394 t_right.setRandom(); 395 // Put trash in t_result to verify contraction clears output memory. 396 t_result.setRandom(); 397 398 // Add a little offset so that the results won't be close to zero. 399 t_left += t_left.constant(1.0f); 400 t_right += t_right.constant(1.0f); 401 402 typedef Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf; 403 MapXf m_left(t_left.data(), 2, 10000); 404 MapXf m_right(t_right.data(), 10000, 10); 405 Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(2, 10); 406 407 // this contraction should be equivalent to a single matrix multiplication 408 Eigen::array<DimPair, 1> dims({{DimPair(1, 0)}}); 409 410 // compute results by separate methods 411 t_result.device(device) = t_left.contract(t_right, dims); 412 m_result = m_left * m_right; 413 414 for (Index i = 0; i < t_result.dimensions().TotalSize(); i++) { 415 VERIFY_IS_APPROX(t_result.data()[i], m_result.data()[i]); 416 } 417 } 418 419 // We are triggering 'evalShardedByInnerDim' optimization with output kernel. 420 template <int DataLayout> 421 static void test_sharded_by_inner_dim_contraction_with_output_kernel() 422 { 423 typedef Tensor<float, 1>::DimensionPair DimPair; 424 425 const int num_threads = internal::random<int>(4, 16); 426 ThreadPool threads(num_threads); 427 Eigen::ThreadPoolDevice device(&threads, num_threads); 428 429 Tensor<float, 2, DataLayout> t_left(2, 10000); 430 Tensor<float, 2, DataLayout> t_right(10000, 10); 431 Tensor<float, 2, DataLayout> t_result(2, 10); 432 433 t_left.setRandom(); 434 t_right.setRandom(); 435 // Put trash in t_result to verify contraction clears output memory. 436 t_result.setRandom(); 437 438 // Add a little offset so that the results won't be close to zero. 439 t_left += t_left.constant(1.0f); 440 t_right += t_right.constant(1.0f); 441 442 typedef Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf; 443 MapXf m_left(t_left.data(), 2, 10000); 444 MapXf m_right(t_right.data(), 10000, 10); 445 Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(2, 10); 446 447 // this contraction should be equivalent to a single matrix multiplication 448 Eigen::array<DimPair, 1> dims({{DimPair(1, 0)}}); 449 450 // compute results by separate methods 451 t_result.device(device) = t_left.contract(t_right, dims, SqrtOutputKernel()); 452 m_result = m_left * m_right; 453 454 for (Index i = 0; i < t_result.dimensions().TotalSize(); i++) { 455 VERIFY_IS_APPROX(t_result.data()[i], std::sqrt(m_result.data()[i])); 456 } 457 } 458 459 // We are triggering 'evalShardedByInnerDim' optimization. 460 template <int DataLayout> 461 static void test_async_sharded_by_inner_dim_contraction() 462 { 463 typedef Tensor<float, 1>::DimensionPair DimPair; 464 465 const int num_threads = internal::random<int>(4, 16); 466 ThreadPool threads(num_threads); 467 Eigen::ThreadPoolDevice device(&threads, num_threads); 468 469 Tensor<float, 2, DataLayout> t_left(2, 10000); 470 Tensor<float, 2, DataLayout> t_right(10000, 10); 471 Tensor<float, 2, DataLayout> t_result(2, 10); 472 473 t_left.setRandom(); 474 t_right.setRandom(); 475 // Put trash in t_result to verify contraction clears output memory. 476 t_result.setRandom(); 477 478 // Add a little offset so that the results won't be close to zero. 479 t_left += t_left.constant(1.0f); 480 t_right += t_right.constant(1.0f); 481 482 typedef Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf; 483 MapXf m_left(t_left.data(), 2, 10000); 484 MapXf m_right(t_right.data(), 10000, 10); 485 Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(2, 10); 486 487 // this contraction should be equivalent to a single matrix multiplication 488 Eigen::array<DimPair, 1> dims({{DimPair(1, 0)}}); 489 490 // compute results by separate methods 491 Eigen::Barrier barrier(1); 492 t_result.device(device, [&barrier]() { barrier.Notify(); }) = 493 t_left.contract(t_right, dims); 494 barrier.Wait(); 495 496 m_result = m_left * m_right; 497 498 for (Index i = 0; i < t_result.dimensions().TotalSize(); i++) { 499 VERIFY_IS_APPROX(t_result.data()[i], m_result.data()[i]); 500 } 501 } 502 503 // We are triggering 'evalShardedByInnerDim' optimization with output kernel. 504 template <int DataLayout> 505 static void test_async_sharded_by_inner_dim_contraction_with_output_kernel() 506 { 507 typedef Tensor<float, 1>::DimensionPair DimPair; 508 509 const int num_threads = internal::random<int>(4, 16); 510 ThreadPool threads(num_threads); 511 Eigen::ThreadPoolDevice device(&threads, num_threads); 512 513 Tensor<float, 2, DataLayout> t_left(2, 10000); 514 Tensor<float, 2, DataLayout> t_right(10000, 10); 515 Tensor<float, 2, DataLayout> t_result(2, 10); 516 517 t_left.setRandom(); 518 t_right.setRandom(); 519 // Put trash in t_result to verify contraction clears output memory. 520 t_result.setRandom(); 521 522 // Add a little offset so that the results won't be close to zero. 523 t_left += t_left.constant(1.0f); 524 t_right += t_right.constant(1.0f); 525 526 typedef Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf; 527 MapXf m_left(t_left.data(), 2, 10000); 528 MapXf m_right(t_right.data(), 10000, 10); 529 Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(2, 10); 530 531 // this contraction should be equivalent to a single matrix multiplication 532 Eigen::array<DimPair, 1> dims({{DimPair(1, 0)}}); 533 534 // compute results by separate methods 535 Eigen::Barrier barrier(1); 536 t_result.device(device, [&barrier]() { barrier.Notify(); }) = 537 t_left.contract(t_right, dims, SqrtOutputKernel()); 538 barrier.Wait(); 539 m_result = m_left * m_right; 540 541 for (Index i = 0; i < t_result.dimensions().TotalSize(); i++) { 542 VERIFY_IS_APPROX(t_result.data()[i], std::sqrt(m_result.data()[i])); 543 } 544 } 545 546 template<int DataLayout> 547 void test_full_contraction() { 548 int contract_size1 = internal::random<int>(1, 500); 549 int contract_size2 = internal::random<int>(1, 500); 550 551 Tensor<float, 2, DataLayout> left(contract_size1, 552 contract_size2); 553 Tensor<float, 2, DataLayout> right(contract_size1, 554 contract_size2); 555 left.setRandom(); 556 right.setRandom(); 557 558 // add constants to shift values away from 0 for more precision 559 left += left.constant(1.5f); 560 right += right.constant(1.5f); 561 562 typedef Tensor<float, 2>::DimensionPair DimPair; 563 Eigen::array<DimPair, 2> dims({{DimPair(0, 0), DimPair(1, 1)}}); 564 565 Eigen::ThreadPool tp(internal::random<int>(2, 11)); 566 Eigen::ThreadPoolDevice thread_pool_device(&tp, internal::random<int>(2, 11)); 567 568 Tensor<float, 0, DataLayout> st_result; 569 st_result = left.contract(right, dims); 570 571 Tensor<float, 0, DataLayout> tp_result; 572 tp_result.device(thread_pool_device) = left.contract(right, dims); 573 574 VERIFY(dimensions_match(st_result.dimensions(), tp_result.dimensions())); 575 // if both of the values are very small, then do nothing (because the test will fail 576 // due to numerical precision issues when values are small) 577 if (numext::abs(st_result() - tp_result()) >= 1e-4f) { 578 VERIFY_IS_APPROX(st_result(), tp_result()); 579 } 580 } 581 582 template<int DataLayout> 583 void test_multithreaded_reductions() { 584 const int num_threads = internal::random<int>(3, 11); 585 ThreadPool thread_pool(num_threads); 586 Eigen::ThreadPoolDevice thread_pool_device(&thread_pool, num_threads); 587 588 const int num_rows = internal::random<int>(13, 732); 589 const int num_cols = internal::random<int>(13, 732); 590 Tensor<float, 2, DataLayout> t1(num_rows, num_cols); 591 t1.setRandom(); 592 593 Tensor<float, 0, DataLayout> full_redux; 594 full_redux = t1.sum(); 595 596 Tensor<float, 0, DataLayout> full_redux_tp; 597 full_redux_tp.device(thread_pool_device) = t1.sum(); 598 599 // Check that the single threaded and the multi threaded reductions return 600 // the same result. 601 VERIFY_IS_APPROX(full_redux(), full_redux_tp()); 602 } 603 604 605 void test_memcpy() { 606 607 for (int i = 0; i < 5; ++i) { 608 const int num_threads = internal::random<int>(3, 11); 609 Eigen::ThreadPool tp(num_threads); 610 Eigen::ThreadPoolDevice thread_pool_device(&tp, num_threads); 611 612 const int size = internal::random<int>(13, 7632); 613 Tensor<float, 1> t1(size); 614 t1.setRandom(); 615 std::vector<float> result(size); 616 thread_pool_device.memcpy(&result[0], t1.data(), size*sizeof(float)); 617 for (int j = 0; j < size; j++) { 618 VERIFY_IS_EQUAL(t1(j), result[j]); 619 } 620 } 621 } 622 623 624 void test_multithread_random() 625 { 626 Eigen::ThreadPool tp(2); 627 Eigen::ThreadPoolDevice device(&tp, 2); 628 Tensor<float, 1> t(1 << 20); 629 t.device(device) = t.random<Eigen::internal::NormalRandomGenerator<float>>(); 630 } 631 632 template<int DataLayout> 633 void test_multithread_shuffle(Allocator* allocator) 634 { 635 Tensor<float, 4, DataLayout> tensor(17,5,7,11); 636 tensor.setRandom(); 637 638 const int num_threads = internal::random<int>(2, 11); 639 ThreadPool threads(num_threads); 640 Eigen::ThreadPoolDevice device(&threads, num_threads, allocator); 641 642 Tensor<float, 4, DataLayout> shuffle(7,5,11,17); 643 array<ptrdiff_t, 4> shuffles = {{2,1,3,0}}; 644 shuffle.device(device) = tensor.shuffle(shuffles); 645 646 for (int i = 0; i < 17; ++i) { 647 for (int j = 0; j < 5; ++j) { 648 for (int k = 0; k < 7; ++k) { 649 for (int l = 0; l < 11; ++l) { 650 VERIFY_IS_EQUAL(tensor(i,j,k,l), shuffle(k,j,l,i)); 651 } 652 } 653 } 654 } 655 } 656 657 void test_threadpool_allocate(TestAllocator* allocator) 658 { 659 const int num_threads = internal::random<int>(2, 11); 660 const int num_allocs = internal::random<int>(2, 11); 661 ThreadPool threads(num_threads); 662 Eigen::ThreadPoolDevice device(&threads, num_threads, allocator); 663 664 for (int a = 0; a < num_allocs; ++a) { 665 void* ptr = device.allocate(512); 666 device.deallocate(ptr); 667 } 668 VERIFY(allocator != NULL); 669 VERIFY_IS_EQUAL(allocator->alloc_count(), num_allocs); 670 VERIFY_IS_EQUAL(allocator->dealloc_count(), num_allocs); 671 } 672 673 EIGEN_DECLARE_TEST(cxx11_tensor_thread_pool) 674 { 675 CALL_SUBTEST_1(test_multithread_elementwise()); 676 CALL_SUBTEST_1(test_async_multithread_elementwise()); 677 CALL_SUBTEST_1(test_multithread_compound_assignment()); 678 679 CALL_SUBTEST_2(test_multithread_contraction<ColMajor>()); 680 CALL_SUBTEST_2(test_multithread_contraction<RowMajor>()); 681 682 CALL_SUBTEST_3(test_multithread_contraction_agrees_with_singlethread<ColMajor>()); 683 CALL_SUBTEST_3(test_multithread_contraction_agrees_with_singlethread<RowMajor>()); 684 CALL_SUBTEST_3(test_multithread_contraction_with_output_kernel<ColMajor>()); 685 CALL_SUBTEST_3(test_multithread_contraction_with_output_kernel<RowMajor>()); 686 687 CALL_SUBTEST_4(test_async_multithread_contraction_agrees_with_singlethread<ColMajor>()); 688 CALL_SUBTEST_4(test_async_multithread_contraction_agrees_with_singlethread<RowMajor>()); 689 690 // Test EvalShardedByInnerDimContext parallelization strategy. 691 CALL_SUBTEST_5(test_sharded_by_inner_dim_contraction<ColMajor>()); 692 CALL_SUBTEST_5(test_sharded_by_inner_dim_contraction<RowMajor>()); 693 CALL_SUBTEST_5(test_sharded_by_inner_dim_contraction_with_output_kernel<ColMajor>()); 694 CALL_SUBTEST_5(test_sharded_by_inner_dim_contraction_with_output_kernel<RowMajor>()); 695 696 CALL_SUBTEST_6(test_async_sharded_by_inner_dim_contraction<ColMajor>()); 697 CALL_SUBTEST_6(test_async_sharded_by_inner_dim_contraction<RowMajor>()); 698 CALL_SUBTEST_6(test_async_sharded_by_inner_dim_contraction_with_output_kernel<ColMajor>()); 699 CALL_SUBTEST_6(test_async_sharded_by_inner_dim_contraction_with_output_kernel<RowMajor>()); 700 701 // Exercise various cases that have been problematic in the past. 702 CALL_SUBTEST_7(test_contraction_corner_cases<ColMajor>()); 703 CALL_SUBTEST_7(test_contraction_corner_cases<RowMajor>()); 704 705 CALL_SUBTEST_8(test_full_contraction<ColMajor>()); 706 CALL_SUBTEST_8(test_full_contraction<RowMajor>()); 707 708 CALL_SUBTEST_9(test_multithreaded_reductions<ColMajor>()); 709 CALL_SUBTEST_9(test_multithreaded_reductions<RowMajor>()); 710 711 CALL_SUBTEST_10(test_memcpy()); 712 CALL_SUBTEST_10(test_multithread_random()); 713 714 TestAllocator test_allocator; 715 CALL_SUBTEST_11(test_multithread_shuffle<ColMajor>(NULL)); 716 CALL_SUBTEST_11(test_multithread_shuffle<RowMajor>(&test_allocator)); 717 CALL_SUBTEST_11(test_threadpool_allocate(&test_allocator)); 718 719 // Force CMake to split this test. 720 // EIGEN_SUFFIXES;1;2;3;4;5;6;7;8;9;10;11 721 }