TensorBlock.h (60851B)
1 // This file is part of Eigen, a lightweight C++ template library 2 // for linear algebra. 3 // 4 // This Source Code Form is subject to the terms of the Mozilla 5 // Public License v. 2.0. If a copy of the MPL was not distributed 6 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 8 #ifndef EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H 9 #define EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H 10 11 namespace Eigen { 12 namespace internal { 13 14 // -------------------------------------------------------------------------- // 15 // Forward declarations for templates defined below. 16 template <typename Scalar, typename IndexType, int NumDims, int Layout> 17 class TensorBlockIO; 18 19 // -------------------------------------------------------------------------- // 20 // Helper function to compute strides for densely stored buffer of given 21 // dimensions. 22 23 // TODO(ezhulenev): We compute strides 1000 times in different evaluators, use 24 // this function instead everywhere. 25 template <int Layout, typename IndexType, int NumDims> 26 EIGEN_ALWAYS_INLINE DSizes<IndexType, NumDims> strides( 27 const DSizes<IndexType, NumDims>& dimensions) { 28 DSizes<IndexType, NumDims> strides; 29 if (NumDims == 0) return strides; 30 31 // TODO(ezhulenev): Use templates to unroll this loop (similar to 32 // h_array_reduce in CXX11meta.h)? Benchmark it. 33 if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { 34 strides[0] = 1; 35 for (int i = 1; i < NumDims; ++i) { 36 strides[i] = strides[i - 1] * dimensions[i - 1]; 37 } 38 } else { 39 strides[NumDims - 1] = 1; 40 for (int i = NumDims - 2; i >= 0; --i) { 41 strides[i] = strides[i + 1] * dimensions[i + 1]; 42 } 43 } 44 45 return strides; 46 } 47 48 template <int Layout, typename IndexType, size_t NumDims> 49 EIGEN_ALWAYS_INLINE DSizes<IndexType, NumDims> strides( 50 const Eigen::array<IndexType, NumDims>& dimensions) { 51 return strides<Layout>(DSizes<IndexType, NumDims>(dimensions)); 52 } 53 54 template <int Layout, std::ptrdiff_t... Indices> 55 EIGEN_STRONG_INLINE DSizes<std::ptrdiff_t, sizeof...(Indices)> strides( 56 const Sizes<Indices...>& sizes) { 57 return strides<Layout>(DSizes<std::ptrdiff_t, sizeof...(Indices)>(sizes)); 58 } 59 60 // -------------------------------------------------------------------------- // 61 62 // Tensor block shape type defines what are the shape preference for the blocks 63 // extracted from the larger tensor. 64 // 65 // Example: blocks of 100 elements from the large 100x100 tensor: 66 // - tensor: 100x100 67 // - target_block_size: 100 68 // 69 // TensorBlockShapeType: 70 // - kUniformAllDims: 100 blocks of size 10x10 71 // - kSkewedInnerDims: 100 blocks of size 100x1 (or 1x100 depending on a column 72 // or row major layout) 73 enum class TensorBlockShapeType { kUniformAllDims, kSkewedInnerDims }; 74 75 struct TensorBlockResourceRequirements { 76 TensorBlockShapeType shape_type; // target block shape 77 size_t size; // target block size 78 TensorOpCost cost_per_coeff; // cost of computing a single block element 79 80 #ifdef EIGEN_HIPCC 81 // For HIPCC, we need to explicitly declare as a "device fun", the constructor 82 // which is implicitly invoked in the "merge" / "any" routines. else HIPCC 83 // errors out complaining about the lack of a matching constructor 84 EIGEN_DEVICE_FUNC 85 TensorBlockResourceRequirements(TensorBlockShapeType shape_type_, size_t size_, 86 TensorOpCost cost_) 87 : shape_type(shape_type_), size(size_), cost_per_coeff(cost_) 88 {} 89 #endif 90 91 template <typename Scalar> 92 EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements withShapeAndSize( 93 TensorBlockShapeType shape_type, size_t size_in_bytes, 94 TensorOpCost cost) { 95 const size_t size = numext::maxi(size_t(1), size_in_bytes / sizeof(Scalar)); 96 return {shape_type, size, cost}; 97 } 98 99 template <typename Scalar> 100 EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements withShapeAndSize( 101 TensorBlockShapeType shape_type, size_t size_in_bytes) { 102 // This default cost per coefficient is valid for most materialized tensor 103 // block evaluation implementations, because they typically just read 104 // coefficients from the underlying tensor storage, and write to the tensor 105 // block buffer (scratch or destination memory, reads and writes have linear 106 // access pattern). We ignore the fixed cost of block evaluation, because in 107 // practice it should negligible. 108 // 109 // Lazy block evaluation adds the cost of calling a functor for each 110 // coefficient. 111 // 112 // All non-trivial block evaluation implementations must provide their own 113 // cost approximation (e.g. shuffling inner dimension has a much higher cost 114 // because it reads memory randomly, although the total number of moved 115 // bytes is the same). 116 return withShapeAndSize<Scalar>(shape_type, size_in_bytes, 117 {/*bytes_loaded=*/sizeof(Scalar), 118 /*bytes_stored=*/sizeof(Scalar), 119 /*compute_cycles=*/0}); 120 } 121 122 template <typename Scalar> 123 EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements skewed( 124 size_t size_in_bytes) { 125 return withShapeAndSize<Scalar>(TensorBlockShapeType::kSkewedInnerDims, 126 size_in_bytes); 127 } 128 129 template <typename Scalar> 130 EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements uniform( 131 size_t size_in_bytes) { 132 return withShapeAndSize<Scalar>(TensorBlockShapeType::kUniformAllDims, 133 size_in_bytes); 134 } 135 136 EIGEN_DEVICE_FUNC 137 static EIGEN_STRONG_INLINE TensorBlockResourceRequirements 138 merge(const TensorBlockResourceRequirements& lhs, 139 const TensorBlockResourceRequirements& rhs) { 140 return {merge(lhs.shape_type, rhs.shape_type), // shape_type 141 merge(lhs.size, rhs.size), // size 142 merge(lhs.cost_per_coeff, rhs.cost_per_coeff)}; // cost_per_coeff 143 } 144 145 EIGEN_DEVICE_FUNC TensorBlockResourceRequirements& addCostPerCoeff( 146 TensorOpCost cost) { 147 cost_per_coeff += cost; 148 return *this; 149 } 150 151 // This is a resource requirement that should be returned from expressions 152 // that do not have any block evaluation preference (e.g. default tensor 153 // expression with raw buffer access). 154 EIGEN_DEVICE_FUNC 155 static EIGEN_STRONG_INLINE TensorBlockResourceRequirements any() { 156 return {TensorBlockShapeType::kUniformAllDims, 1, {0, 0, 0}}; 157 } 158 159 private: 160 using Requirements = TensorBlockResourceRequirements; 161 162 EIGEN_DEVICE_FUNC 163 static EIGEN_STRONG_INLINE size_t merge(size_t lhs_size, size_t rhs_size) { 164 return numext::maxi(lhs_size, rhs_size); 165 } 166 167 EIGEN_DEVICE_FUNC 168 static EIGEN_STRONG_INLINE TensorBlockShapeType 169 merge(TensorBlockShapeType lhs, TensorBlockShapeType rhs) { 170 return (lhs == TensorBlockShapeType::kSkewedInnerDims || 171 rhs == TensorBlockShapeType::kSkewedInnerDims) 172 ? TensorBlockShapeType::kSkewedInnerDims 173 : TensorBlockShapeType::kUniformAllDims; 174 } 175 176 EIGEN_DEVICE_FUNC 177 static EIGEN_STRONG_INLINE TensorOpCost merge(TensorOpCost lhs_cost, 178 TensorOpCost rhs_cost) { 179 return lhs_cost + rhs_cost; 180 } 181 }; 182 183 // -------------------------------------------------------------------------- // 184 // TensorBlockDescriptor specifies a block offset within a tensor and the block 185 // sizes along each of the tensor dimensions. 186 187 template <int NumDims, typename IndexType = Eigen::Index> 188 class TensorBlockDescriptor { 189 public: 190 typedef DSizes<IndexType, NumDims> Dimensions; 191 192 // If we evaluate a Tensor assignment, and expression on the left, already has 193 // a memory buffer, then we might do performance optimization, and evaluate 194 // the root expression directly into the final output memory. Some time it's 195 // possible to reuse it for materializing subexpressions inside an expression 196 // tree, to to avoid dynamic memory allocation. 197 // 198 // The pointer type of the underlying storage is erased, because passing 199 // Scalar type through all the expression evaluation layers is way too many 200 // templates. In practice destination buffer type should always match the 201 // evaluated expression scalar type. 202 class DestinationBuffer { 203 public: 204 enum DestinationBufferKind : int { 205 // The above explicit specification of "int" as the enum basetype is 206 // needed to get around a HIPCC link error ("the field type is not 207 // amp-compatible") 208 // which is issued for class members with the enum type. 209 // TODO(rocm): 210 // remove the "int" basetype once HIPCC has been fixed to not error out 211 // in the above scenario. 212 213 // Destination buffer is not defined (`m_data` == nullptr). 214 kEmpty, 215 216 // Tensor block defined by an owning tensor block descriptor can fit 217 // contiguously into the destination buffer. In this case it's safe to 218 // materialize tensor block in the destination buffer, wrap it in a 219 // TensorMap, and use to build Eigen expression on top of it. 220 kContiguous, 221 222 // Destination buffer strides do not match strides of the contiguously 223 // stored block, and it's impossible to define a TensorMap over this 224 // buffer. However if we are evaluating a root of an expression tree, we 225 // still can materialize an output into this destination, because we can 226 // guarantee that no one will ever access it through block API. 227 // 228 // In theory it is possible to build valid TensorStriding<TensorMap> 229 // expression on top of this destination buffer, however it has 230 // inefficient coeff/packet access, and defeats the purpose of fast block 231 // evaluation API. 232 kStrided 233 }; 234 235 template <typename Scalar> 236 Scalar* data() const { 237 eigen_assert(m_data_type_size == sizeof(Scalar)); 238 return static_cast<Scalar*>(m_data); 239 } 240 241 const Dimensions& strides() const { return m_strides; } 242 const DestinationBufferKind& kind() const { return m_kind; } 243 244 private: 245 friend class TensorBlockDescriptor; 246 247 DestinationBuffer() : m_data(NULL), m_data_type_size(0), m_kind(kEmpty) {} 248 249 template <typename Scalar> 250 DestinationBuffer(Scalar* data, const Dimensions& strides, 251 DestinationBufferKind kind) 252 : m_data(static_cast<void*>(data)), 253 m_data_type_size(sizeof(Scalar)), 254 m_strides(strides), 255 m_kind(kind) {} 256 257 template <int Layout, typename Scalar> 258 static DestinationBuffer make(const TensorBlockDescriptor& desc, 259 Scalar* data, const Dimensions& strides) { 260 return DestinationBuffer(data, strides, kind<Layout>(desc, strides)); 261 } 262 263 template <int Layout> 264 static DestinationBufferKind kind(const TensorBlockDescriptor& desc, 265 const Dimensions& strides) { 266 const Dimensions& desc_dims = desc.dimensions(); 267 const Dimensions& desc_strides = internal::strides<Layout>(desc_dims); 268 for (int i = 0; i < NumDims; ++i) { 269 if (desc_dims[i] == 1) continue; 270 if (desc_strides[i] != strides[i]) return kStrided; 271 } 272 return kContiguous; 273 } 274 275 // Storage pointer is type erased, to reduce template bloat, but we still 276 // keep the size of the underlying element type for error checking. 277 void* m_data; 278 size_t m_data_type_size; 279 280 // Destination buffer dimensions always match the dimensions of a tensor 281 // block descriptor it belongs to, however strides might be different. 282 Dimensions m_strides; 283 284 DestinationBufferKind m_kind; 285 }; 286 287 TensorBlockDescriptor(const IndexType offset, const Dimensions& dimensions, 288 const DestinationBuffer& destination) 289 : m_offset(offset), 290 m_dimensions(dimensions), 291 m_destination(destination) {} 292 293 TensorBlockDescriptor(const IndexType offset, const Dimensions& dimensions) 294 : m_offset(offset), 295 m_dimensions(dimensions), 296 m_destination(DestinationBuffer()) {} 297 298 IndexType offset() const { return m_offset; } 299 const Dimensions& dimensions() const { return m_dimensions; } 300 IndexType dimension(int index) const { return m_dimensions[index]; } 301 IndexType size() const { return array_prod<IndexType>(m_dimensions); } 302 303 const DestinationBuffer& destination() const { return m_destination; } 304 305 template <int Layout, typename Scalar> 306 void AddDestinationBuffer(Scalar* dst_base, const Dimensions& dst_strides) { 307 eigen_assert(dst_base != NULL); 308 m_destination = 309 DestinationBuffer::template make<Layout>(*this, dst_base, dst_strides); 310 } 311 312 template <int Layout, typename Scalar, typename DstStridesIndexType> 313 void AddDestinationBuffer( 314 Scalar* dst_base, 315 const DSizes<DstStridesIndexType, NumDims>& dst_strides) { 316 // DSizes constructor will do index type promotion if it's safe. 317 AddDestinationBuffer<Layout>(dst_base, Dimensions(dst_strides)); 318 } 319 320 TensorBlockDescriptor& DropDestinationBuffer() { 321 m_destination.m_data = NULL; 322 m_destination.m_kind = DestinationBuffer::kEmpty; 323 return *this; 324 } 325 326 bool HasDestinationBuffer() const { 327 return m_destination.kind() != DestinationBuffer::kEmpty; 328 } 329 330 // Returns a copy of `*this` with updated offset. 331 TensorBlockDescriptor WithOffset(IndexType offset) const { 332 return TensorBlockDescriptor(offset, m_dimensions, m_destination); 333 } 334 335 private: 336 // Offset and dimensions are immutable after construction. Block descriptor 337 // can only be mutated by adding or dropping destination. 338 const IndexType m_offset; 339 const Dimensions m_dimensions; 340 DestinationBuffer m_destination; 341 }; 342 343 // -------------------------------------------------------------------------- // 344 // TensorBlockMapper is responsible for iterating over the blocks of a tensor. 345 346 template <int NumDims, int Layout, typename IndexType = Eigen::Index> 347 class TensorBlockMapper { 348 typedef TensorBlockDescriptor<NumDims, IndexType> BlockDescriptor; 349 350 public: 351 typedef DSizes<IndexType, NumDims> Dimensions; 352 353 TensorBlockMapper() = default; 354 TensorBlockMapper(const DSizes<IndexType, NumDims>& dimensions, 355 const TensorBlockResourceRequirements& requirements) 356 : m_tensor_dimensions(dimensions), m_requirements(requirements) { 357 // Compute block dimensions and the total number of blocks. 358 InitializeBlockDimensions(); 359 } 360 361 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType blockCount() const { 362 return m_total_block_count; 363 } 364 365 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType blockTotalSize() const { 366 return m_block_dimensions.TotalSize(); 367 } 368 369 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const DSizes<IndexType, NumDims>& 370 blockDimensions() const { 371 return m_block_dimensions; 372 } 373 374 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE BlockDescriptor 375 blockDescriptor(IndexType block_index) const { 376 static const bool isColMajor = Layout == static_cast<int>(ColMajor); 377 378 IndexType offset = 0; 379 DSizes<IndexType, NumDims> dimensions; 380 381 if (NumDims == 0) return BlockDescriptor(offset, dimensions); 382 383 // Iterate outer -> inner dimensions. 384 for (int i = NumDims - 1; i >= 0; --i) { 385 const int dim = isColMajor ? i : NumDims - i - 1; 386 387 const IndexType idx = block_index / m_block_strides[dim]; 388 block_index -= idx * m_block_strides[dim]; 389 390 const IndexType coord = idx * m_block_dimensions[dim]; 391 dimensions[dim] = numext::mini(m_tensor_dimensions[dim] - coord, 392 m_block_dimensions[dim]); 393 offset += coord * m_tensor_strides[dim]; 394 } 395 396 return {offset, dimensions}; 397 } 398 399 private: 400 void InitializeBlockDimensions() { 401 // Requested block shape and size. 402 const TensorBlockShapeType shape_type = m_requirements.shape_type; 403 IndexType target_block_size = 404 numext::maxi<IndexType>(1, static_cast<IndexType>(m_requirements.size)); 405 406 IndexType tensor_size = m_tensor_dimensions.TotalSize(); 407 408 // Corner case: one of the dimensions is zero. Logic below is too complex 409 // to handle this case on a general basis, just use unit block size. 410 // Note: we must not yield blocks with zero dimensions (recipe for 411 // overflows/underflows, divisions by zero and NaNs later). 412 if (tensor_size == 0) { 413 for (int i = 0; i < NumDims; ++i) { 414 m_block_dimensions[i] = 1; 415 } 416 m_total_block_count = 0; 417 return; 418 } 419 420 // If tensor fits into a target block size, evaluate it as a single block. 421 if (tensor_size <= target_block_size) { 422 m_block_dimensions = m_tensor_dimensions; 423 m_total_block_count = 1; 424 // The only valid block index is `0`, and in this case we do not need 425 // to compute real strides for tensor or blocks (see blockDescriptor). 426 for (int i = 0; i < NumDims; ++i) { 427 m_tensor_strides[i] = 0; 428 m_block_strides[i] = 1; 429 } 430 return; 431 } 432 433 static const bool isColMajor = Layout == static_cast<int>(ColMajor); 434 435 // Block shape skewed towards inner dimension. 436 if (shape_type == TensorBlockShapeType::kSkewedInnerDims) { 437 IndexType coeff_to_allocate = target_block_size; 438 439 for (int i = 0; i < NumDims; ++i) { 440 const int dim = isColMajor ? i : NumDims - i - 1; 441 m_block_dimensions[dim] = 442 numext::mini(coeff_to_allocate, m_tensor_dimensions[dim]); 443 coeff_to_allocate = divup( 444 coeff_to_allocate, 445 numext::maxi(static_cast<IndexType>(1), m_block_dimensions[dim])); 446 } 447 eigen_assert(coeff_to_allocate == 1); 448 449 } else if (shape_type == TensorBlockShapeType::kUniformAllDims) { 450 // Tensor will not fit within 'target_block_size' budget: calculate tensor 451 // block dimension sizes based on "square" dimension size target. 452 const IndexType dim_size_target = convert_index<IndexType>( 453 std::pow(static_cast<float>(target_block_size), 454 1.0f / static_cast<float>(m_block_dimensions.rank()))); 455 456 for (int i = 0; i < NumDims; ++i) { 457 // TODO(andydavis) Adjust the inner most 'block_dim_size' to make it 458 // a multiple of the packet size. Note that reducing 459 // 'block_dim_size' in this manner can increase the number of 460 // blocks, and so will amplify any per-block overhead. 461 m_block_dimensions[i] = 462 numext::mini(dim_size_target, m_tensor_dimensions[i]); 463 } 464 465 // Add any un-allocated coefficients to inner dimension(s). 466 IndexType total_size = m_block_dimensions.TotalSize(); 467 for (int i = 0; i < NumDims; ++i) { 468 const int dim = isColMajor ? i : NumDims - i - 1; 469 470 if (m_block_dimensions[dim] < m_tensor_dimensions[dim]) { 471 const IndexType total_size_other_dims = 472 total_size / m_block_dimensions[dim]; 473 const IndexType alloc_avail = 474 divup<IndexType>(target_block_size, total_size_other_dims); 475 if (alloc_avail == m_block_dimensions[dim]) { 476 // Insufficient excess coefficients to allocate. 477 break; 478 } 479 m_block_dimensions[dim] = 480 numext::mini(m_tensor_dimensions[dim], alloc_avail); 481 total_size = total_size_other_dims * m_block_dimensions[dim]; 482 } 483 } 484 485 } else { 486 eigen_assert(false); // unknown block shape 487 } 488 489 eigen_assert(m_block_dimensions.TotalSize() >= 490 numext::mini<IndexType>(target_block_size, 491 m_tensor_dimensions.TotalSize())); 492 493 // Calculate block counts by dimension and total block count. 494 DSizes<IndexType, NumDims> block_count; 495 for (int i = 0; i < NumDims; ++i) { 496 block_count[i] = divup(m_tensor_dimensions[i], m_block_dimensions[i]); 497 } 498 m_total_block_count = array_prod(block_count); 499 500 // Calculate block strides (used for enumerating blocks). 501 m_tensor_strides = strides<Layout>(m_tensor_dimensions); 502 m_block_strides = strides<Layout>(block_count); 503 } 504 505 DSizes<IndexType, NumDims> m_tensor_dimensions; 506 TensorBlockResourceRequirements m_requirements; 507 508 DSizes<IndexType, NumDims> m_block_dimensions; 509 IndexType m_total_block_count; 510 511 DSizes<IndexType, NumDims> m_tensor_strides; 512 DSizes<IndexType, NumDims> m_block_strides; 513 }; 514 515 // -------------------------------------------------------------------------- // 516 // TensorBlockScratchAllocator is responsible for allocating temporary buffers 517 // for block evaluation (output or input block materialization). Given that 518 // Eigen expression traversal order is deterministic, all temporary allocations 519 // are happening in the same order, and usually have exactly the same size. 520 // Scratch allocator keeps a trace of all dynamic allocations, and after the 521 // first block evaluation is completed, we should be able to reuse all the 522 // temporary buffers for the next block evaluation. 523 524 template <typename Device> 525 class TensorBlockScratchAllocator { 526 public: 527 explicit TensorBlockScratchAllocator(const Device& device) 528 : m_device(device), m_allocation_index(0) {} 529 530 ~TensorBlockScratchAllocator() { 531 for (size_t i = 0; i < m_allocations.size(); ++i) { 532 m_device.deallocate(m_allocations[i].ptr); 533 } 534 } 535 536 void* allocate(size_t size) { 537 // TODO(ezhulenev): Remove when replaced with inlined vector. 538 if (m_allocations.capacity() == 0) m_allocations.reserve(8); 539 540 // Check if we already have an existing allocation att current index. 541 const int num_allocations = static_cast<int>(m_allocations.size()); 542 const bool has_allocation = m_allocation_index < num_allocations; 543 544 // Allocation index can't be larger than the number of allocations. 545 eigen_assert(m_allocation_index <= num_allocations); 546 547 // If we have existing allocation, and its size is larger or equal to 548 // requested size, we do nothing. 549 550 // If current allocation can't fit requested size, we deallocate it, and 551 // replace with a larger allocation. 552 if (has_allocation && m_allocations[m_allocation_index].size < size) { 553 m_device.deallocate(m_allocations[m_allocation_index].ptr); 554 m_allocations[m_allocation_index].ptr = m_device.allocate(size); 555 m_allocations[m_allocation_index].size = size; 556 } 557 558 // Make a new allocation if we don't have and existing one. 559 if (!has_allocation) { 560 Allocation allocation; 561 allocation.ptr = m_device.allocate(size); 562 allocation.size = size; 563 m_allocations.push_back(allocation); 564 } 565 566 eigen_assert(m_allocations[m_allocation_index].ptr != NULL); 567 eigen_assert(m_allocations[m_allocation_index].size >= size); 568 569 return m_allocations[m_allocation_index++].ptr; 570 } 571 572 void reset() { m_allocation_index = 0; } 573 574 private: 575 struct Allocation { 576 void* ptr; 577 size_t size; 578 }; 579 580 const Device& m_device; 581 int m_allocation_index; 582 // TODO(ezhulenev): This should be an inlined vector. 583 std::vector<Allocation> m_allocations; 584 }; 585 586 // -------------------------------------------------------------------------- // 587 // TensorBlockKind represents all possible block kinds, that can be produced by 588 // TensorEvaluator::evalBlock function. 589 enum TensorBlockKind { 590 // Tensor block that is a lazy expression that must be assigned to a 591 // destination using TensorBlockAssign. 592 kExpr, 593 594 // Tensor block that is a view into a memory buffer owned by an underlying 595 // Tensor expression (e.g. it can be a view into a Tensor buffer). 596 kView, 597 598 // Tensor block that was materialized in a scratch memory buffer, allocated 599 // with TensorBlockScratchAllocator. This block must be copied to a 600 // destination, similar to a block of `kExpr` type. 601 kMaterializedInScratch, 602 603 // Tensor block that was materialized directly into the final output memory 604 // buffer. For example if the left side of an assignment is a Tensor, we can 605 // directly materialize the block in the destination memory. 606 // 607 // If strides in the output buffer do not match tensor block strides, the 608 // Tensor expression will be invalid, and should not be used by 609 // TensorBlockAssign or for constructing another block expression. 610 kMaterializedInOutput 611 }; 612 613 // -------------------------------------------------------------------------- // 614 // TensorBlockNotImplemented should be used to defined TensorBlock typedef in 615 // TensorEvaluators that do not support block evaluation. 616 617 class TensorBlockNotImplemented { 618 public: 619 typedef void XprType; 620 }; 621 622 // -------------------------------------------------------------------------- // 623 // XprScalar extracts Scalar type from the Eigen expressions (if expression type 624 // is not void). It's required to be able to define lazy block expression for 625 // argument types, that do not support block evaluation. 626 627 template <typename XprType> 628 struct XprScalar { 629 typedef typename XprType::Scalar type; 630 }; 631 template <> 632 struct XprScalar<void> { 633 typedef void type; 634 }; 635 636 // -------------------------------------------------------------------------- // 637 // TensorMaterializedBlock is a fully evaluated block of the original tensor, 638 // and XprType is just a TensorMap over the data. This block type is typically 639 // used to materialize blocks of tensor expressions, that can't be efficiently 640 // represented as lazy Tensor expressions with fast coeff/packet operations, 641 // e.g. we materialize all broadcasts into evaluated blocks. 642 // 643 // TensorMaterializedBlock does not own its memory buffer, it's either a memory 644 // buffer that backs the original expression (e.g. block is just a view into a 645 // Tensor), or a memory buffer allocated with scratch allocator, and in this 646 // case the scratch allocator will deallocate it at the end of block based 647 // expression execution. 648 // 649 // If the block was evaluated directly into the output buffer, and strides in 650 // the output buffer do not match block strides, the TensorMap expression will 651 // be invalid, and should never be used in block assignment or any other tensor 652 // expression. 653 654 template <typename Scalar, int NumDims, int Layout, 655 typename IndexType = Eigen::Index> 656 class TensorMaterializedBlock { 657 public: 658 typedef DSizes<IndexType, NumDims> Dimensions; 659 typedef TensorMap<const Tensor<Scalar, NumDims, Layout> > XprType; 660 661 TensorMaterializedBlock(TensorBlockKind kind, const Scalar* data, 662 const Dimensions& dimensions, bool valid_expr = true) 663 : m_kind(kind), 664 m_data(data), 665 m_dimensions(dimensions), 666 m_expr(m_data, m_dimensions), 667 m_valid_expr(valid_expr) { 668 eigen_assert(m_kind == internal::TensorBlockKind::kView || 669 m_kind == internal::TensorBlockKind::kMaterializedInScratch || 670 m_kind == internal::TensorBlockKind::kMaterializedInOutput); 671 } 672 673 TensorBlockKind kind() const { return m_kind; } 674 // NOTE(ezhulenev): Returning XprType by value like in other block types 675 // causes asan failures. The theory is that XprType::Nested doesn't work 676 // properly for TensorMap. 677 const XprType& expr() const { 678 eigen_assert(m_valid_expr); 679 return m_expr; 680 } 681 const Scalar* data() const { return m_data; } 682 void cleanup() {} 683 684 typedef internal::TensorBlockDescriptor<NumDims, IndexType> TensorBlockDesc; 685 686 // TensorMaterializedBlock can be backed by different types of storage: 687 // 688 // (1) Contiguous block of memory allocated with scratch allocator. 689 // (2) Contiguous block of memory reused from tensor block descriptor 690 // destination buffer. 691 // (3) Strided block of memory reused from tensor block descriptor 692 // destination buffer. 693 // 694 class Storage { 695 public: 696 Scalar* data() const { return m_data; } 697 const Dimensions& dimensions() const { return m_dimensions; } 698 const Dimensions& strides() const { return m_strides; } 699 700 TensorMaterializedBlock AsTensorMaterializedBlock() const { 701 return TensorMaterializedBlock( 702 m_materialized_in_output 703 ? internal::TensorBlockKind::kMaterializedInOutput 704 : internal::TensorBlockKind::kMaterializedInScratch, 705 m_data, m_dimensions, !m_strided_storage); 706 } 707 708 private: 709 friend class TensorMaterializedBlock; 710 711 Storage(Scalar* data, const Dimensions& dimensions, 712 const Dimensions& strides, bool materialized_in_output, 713 bool strided_storage) 714 : m_data(data), 715 m_dimensions(dimensions), 716 m_strides(strides), 717 m_materialized_in_output(materialized_in_output), 718 m_strided_storage(strided_storage) {} 719 720 Scalar* m_data; 721 Dimensions m_dimensions; 722 Dimensions m_strides; 723 bool m_materialized_in_output; 724 bool m_strided_storage; 725 }; 726 727 // Creates a storage for materialized block either from the block descriptor 728 // destination buffer, or allocates a new buffer with scratch allocator. 729 template <typename TensorBlockScratch> 730 EIGEN_STRONG_INLINE static Storage prepareStorage( 731 TensorBlockDesc& desc, TensorBlockScratch& scratch, 732 bool allow_strided_storage = false) { 733 // Try to reuse destination as an output block buffer. 734 typedef typename TensorBlockDesc::DestinationBuffer DestinationBuffer; 735 736 if (desc.destination().kind() == DestinationBuffer::kContiguous) { 737 Scalar* buffer = desc.destination().template data<Scalar>(); 738 desc.DropDestinationBuffer(); 739 return Storage(buffer, desc.dimensions(), 740 internal::strides<Layout>(desc.dimensions()), 741 /*materialized_in_output=*/true, 742 /*strided_storage=*/false); 743 744 } else if (desc.destination().kind() == DestinationBuffer::kStrided && 745 allow_strided_storage) { 746 Scalar* buffer = desc.destination().template data<Scalar>(); 747 desc.DropDestinationBuffer(); 748 return Storage(buffer, desc.dimensions(), desc.destination().strides(), 749 /*materialized_in_output=*/true, /*strided_storage=*/true); 750 751 } else { 752 void* mem = scratch.allocate(desc.size() * sizeof(Scalar)); 753 return Storage(static_cast<Scalar*>(mem), desc.dimensions(), 754 internal::strides<Layout>(desc.dimensions()), 755 /*materialized_in_output=*/false, 756 /*strided_storage=*/false); 757 } 758 } 759 760 // Creates a materialized block for the given descriptor from a memory buffer. 761 template <typename DataDimensions, typename TensorBlockScratch> 762 EIGEN_STRONG_INLINE static TensorMaterializedBlock materialize( 763 const Scalar* data, const DataDimensions& data_dims, 764 TensorBlockDesc& desc, TensorBlockScratch& scratch) { 765 eigen_assert(array_size<DataDimensions>::value == desc.dimensions().size()); 766 767 // If a tensor block dimensions covers a contiguous block of the underlying 768 // memory, we can skip block buffer memory allocation, and construct a block 769 // from existing `data` memory buffer. 770 // 771 // Example: (RowMajor layout) 772 // data_dims: [11, 12, 13, 14] 773 // desc.dimensions(): [1, 1, 3, 14] 774 // 775 // In this case we can construct a TensorBlock starting at 776 // `data + desc.offset()`, with a `desc.dimensions()` block sizes. 777 static const bool is_col_major = Layout == ColMajor; 778 779 // Find out how many inner dimensions have a matching size. 780 int num_matching_inner_dims = 0; 781 for (int i = 0; i < NumDims; ++i) { 782 int dim = is_col_major ? i : NumDims - i - 1; 783 if (data_dims[dim] != desc.dimensions()[dim]) break; 784 ++num_matching_inner_dims; 785 } 786 787 // All the outer dimensions must be of size `1`, except a single dimension 788 // before the matching inner dimension (`3` in the example above). 789 bool can_use_direct_access = true; 790 for (int i = num_matching_inner_dims + 1; i < NumDims; ++i) { 791 int dim = is_col_major ? i : NumDims - i - 1; 792 if (desc.dimension(dim) != 1) { 793 can_use_direct_access = false; 794 break; 795 } 796 } 797 798 if (can_use_direct_access) { 799 const Scalar* block_start = data + desc.offset(); 800 return TensorMaterializedBlock(internal::TensorBlockKind::kView, 801 block_start, desc.dimensions()); 802 803 } else { 804 // Reuse destination buffer or allocate new buffer with scratch allocator. 805 const Storage storage = prepareStorage(desc, scratch); 806 807 typedef internal::TensorBlockIO<Scalar, IndexType, NumDims, Layout> 808 TensorBlockIO; 809 typedef typename TensorBlockIO::Dst TensorBlockIODst; 810 typedef typename TensorBlockIO::Src TensorBlockIOSrc; 811 812 TensorBlockIOSrc src(internal::strides<Layout>(Dimensions(data_dims)), 813 data, desc.offset()); 814 TensorBlockIODst dst(storage.dimensions(), storage.strides(), 815 storage.data()); 816 817 TensorBlockIO::Copy(dst, src); 818 return storage.AsTensorMaterializedBlock(); 819 } 820 } 821 822 private: 823 TensorBlockKind m_kind; 824 const Scalar* m_data; 825 Dimensions m_dimensions; 826 XprType m_expr; 827 bool m_valid_expr; 828 }; 829 830 // -------------------------------------------------------------------------- // 831 // TensorCwiseUnaryBlock is a lazy tensor expression block that applies UnaryOp 832 // functor to the blocks produced by the underlying Tensor expression. 833 834 template <typename UnaryOp, typename ArgTensorBlock> 835 class TensorCwiseUnaryBlock { 836 static const bool NoArgBlockAccess = 837 internal::is_void<typename ArgTensorBlock::XprType>::value; 838 839 public: 840 typedef typename conditional< 841 NoArgBlockAccess, void, 842 TensorCwiseUnaryOp<UnaryOp, const typename ArgTensorBlock::XprType> >:: 843 type XprType; 844 845 typedef typename XprScalar<XprType>::type Scalar; 846 847 TensorCwiseUnaryBlock(const ArgTensorBlock& arg_block, const UnaryOp& functor) 848 : m_arg_block(arg_block), m_functor(functor) {} 849 850 TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; } 851 852 XprType expr() const { return XprType(m_arg_block.expr(), m_functor); } 853 const Scalar* data() const { return NULL; } 854 void cleanup() { m_arg_block.cleanup(); } 855 856 private: 857 ArgTensorBlock m_arg_block; 858 UnaryOp m_functor; 859 }; 860 861 // -------------------------------------------------------------------------- // 862 // TensorCwiseUnaryBlock is a lazy tensor expression block that applies BinaryOp 863 // functor to the blocks produced by the underlying Tensor expression. 864 865 template <typename BinaryOp, typename LhsTensorBlock, typename RhsTensorBlock> 866 class TensorCwiseBinaryBlock { 867 static const bool NoArgBlockAccess = 868 internal::is_void<typename LhsTensorBlock::XprType>::value || 869 internal::is_void<typename RhsTensorBlock::XprType>::value; 870 871 public: 872 typedef typename conditional< 873 NoArgBlockAccess, void, 874 TensorCwiseBinaryOp<BinaryOp, const typename LhsTensorBlock::XprType, 875 const typename RhsTensorBlock::XprType> >::type 876 XprType; 877 878 typedef typename XprScalar<XprType>::type Scalar; 879 880 TensorCwiseBinaryBlock(const LhsTensorBlock& left_block, 881 const RhsTensorBlock& right_block, 882 const BinaryOp& functor) 883 : m_left_block(left_block), 884 m_right_block(right_block), 885 m_functor(functor) {} 886 887 TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; } 888 889 XprType expr() const { 890 return XprType(m_left_block.expr(), m_right_block.expr(), m_functor); 891 } 892 893 const Scalar* data() const { return NULL; } 894 895 void cleanup() { 896 m_left_block.cleanup(); 897 m_right_block.cleanup(); 898 } 899 900 private: 901 LhsTensorBlock m_left_block; 902 RhsTensorBlock m_right_block; 903 BinaryOp m_functor; 904 }; 905 906 // -------------------------------------------------------------------------- // 907 // TensorUnaryExprBlock is a lazy tensor expression block that can construct 908 // an arbitrary tensor expression from a block of the underlying type (this is a 909 // generalization of the TensorCwiseUnaryBlock for arbitrary expressions). 910 911 template <typename BlockFactory, typename ArgTensorBlock> 912 class TensorUnaryExprBlock { 913 typedef typename ArgTensorBlock::XprType ArgXprType; 914 static const bool NoArgBlockAccess = internal::is_void<ArgXprType>::value; 915 916 public: 917 typedef typename conditional< 918 NoArgBlockAccess, void, 919 typename BlockFactory::template XprType<ArgXprType>::type>::type XprType; 920 921 typedef typename XprScalar<XprType>::type Scalar; 922 923 TensorUnaryExprBlock(const ArgTensorBlock& arg_block, 924 const BlockFactory& factory) 925 : m_arg_block(arg_block), m_factory(factory) {} 926 927 TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; } 928 XprType expr() const { return m_factory.expr(m_arg_block.expr()); } 929 const Scalar* data() const { return NULL; } 930 void cleanup() { m_arg_block.cleanup(); } 931 932 private: 933 ArgTensorBlock m_arg_block; 934 BlockFactory m_factory; 935 }; 936 937 // -------------------------------------------------------------------------- // 938 // TensorTernaryExprBlock is a lazy tensor expression block that can construct 939 // an arbitrary tensor expression from three blocks of the underlying type. 940 941 template <typename BlockFactory, typename Arg1TensorBlock, 942 typename Arg2TensorBlock, typename Arg3TensorBlock> 943 class TensorTernaryExprBlock { 944 typedef typename Arg1TensorBlock::XprType Arg1XprType; 945 typedef typename Arg2TensorBlock::XprType Arg2XprType; 946 typedef typename Arg3TensorBlock::XprType Arg3XprType; 947 948 static const bool NoArgBlockAccess = internal::is_void<Arg1XprType>::value || 949 internal::is_void<Arg2XprType>::value || 950 internal::is_void<Arg3XprType>::value; 951 952 public: 953 typedef typename conditional< 954 NoArgBlockAccess, void, 955 typename BlockFactory::template XprType<Arg1XprType, Arg2XprType, 956 Arg3XprType>::type>::type XprType; 957 958 typedef typename XprScalar<XprType>::type Scalar; 959 960 TensorTernaryExprBlock(const Arg1TensorBlock& arg1_block, 961 const Arg2TensorBlock& arg2_block, 962 const Arg3TensorBlock& arg3_block, 963 const BlockFactory& factory) 964 : m_arg1_block(arg1_block), 965 m_arg2_block(arg2_block), 966 m_arg3_block(arg3_block), 967 m_factory(factory) {} 968 969 TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; } 970 XprType expr() const { 971 return m_factory.expr(m_arg1_block.expr(), m_arg2_block.expr(), 972 m_arg3_block.expr()); 973 } 974 const Scalar* data() const { return NULL; } 975 void cleanup() { 976 m_arg1_block.cleanup(); 977 m_arg2_block.cleanup(); 978 m_arg3_block.cleanup(); 979 } 980 981 private: 982 Arg1TensorBlock m_arg1_block; 983 Arg2TensorBlock m_arg2_block; 984 Arg3TensorBlock m_arg3_block; 985 BlockFactory m_factory; 986 }; 987 988 // -------------------------------------------------------------------------- // 989 // StridedLinearBufferCopy provides a method to copy data between two linear 990 // buffers with different strides, with optimized paths for scatter/gather. 991 992 template <typename Scalar, typename IndexType> 993 class StridedLinearBufferCopy { 994 typedef typename packet_traits<Scalar>::type Packet; 995 enum { 996 Vectorizable = packet_traits<Scalar>::Vectorizable, 997 PacketSize = packet_traits<Scalar>::size 998 }; 999 1000 public: 1001 // Specifying linear copy kind statically gives ~30% speedup for small sizes. 1002 enum class Kind { 1003 Linear = 0, // src_stride == 1 && dst_stride == 1 1004 Scatter = 1, // src_stride == 1 && dst_stride != 1 1005 FillLinear = 2, // src_stride == 0 && dst_stride == 1 1006 FillScatter = 3, // src_stride == 0 && dst_stride != 1 1007 Gather = 4, // dst_stride == 1 1008 Random = 5 // everything else 1009 }; 1010 1011 struct Dst { 1012 Dst(IndexType o, IndexType s, Scalar* d) : offset(o), stride(s), data(d) {} 1013 1014 IndexType offset; 1015 IndexType stride; 1016 Scalar* data; 1017 }; 1018 1019 struct Src { 1020 Src(IndexType o, IndexType s, const Scalar* d) 1021 : offset(o), stride(s), data(d) {} 1022 1023 IndexType offset; 1024 IndexType stride; 1025 const Scalar* data; 1026 }; 1027 1028 template <typename StridedLinearBufferCopy::Kind kind> 1029 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(const Dst& dst, 1030 const Src& src, 1031 const size_t count) { 1032 Run<kind>(count, dst.offset, dst.stride, dst.data, src.offset, src.stride, 1033 src.data); 1034 } 1035 1036 private: 1037 template <typename StridedLinearBufferCopy::Kind kind> 1038 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( 1039 const IndexType count, const IndexType dst_offset, 1040 const IndexType dst_stride, Scalar* EIGEN_RESTRICT dst_data, 1041 const IndexType src_offset, const IndexType src_stride, 1042 const Scalar* EIGEN_RESTRICT src_data) { 1043 const Scalar* src = &src_data[src_offset]; 1044 Scalar* dst = &dst_data[dst_offset]; 1045 1046 if (!Vectorizable) { 1047 for (Index i = 0; i < count; ++i) { 1048 dst[i * dst_stride] = src[i * src_stride]; 1049 } 1050 return; 1051 } 1052 1053 const IndexType vectorized_size = count - PacketSize; 1054 IndexType i = 0; 1055 1056 if (kind == StridedLinearBufferCopy::Kind::Linear) { 1057 // ******************************************************************** // 1058 // Linear copy from `src` to `dst`. 1059 const IndexType unrolled_size = count - 4 * PacketSize; 1060 eigen_assert(src_stride == 1 && dst_stride == 1); 1061 for (; i <= unrolled_size; i += 4 * PacketSize) { 1062 for (int j = 0; j < 4; ++j) { 1063 Packet p = ploadu<Packet>(src + i + j * PacketSize); 1064 pstoreu<Scalar, Packet>(dst + i + j * PacketSize, p); 1065 } 1066 } 1067 for (; i <= vectorized_size; i += PacketSize) { 1068 Packet p = ploadu<Packet>(src + i); 1069 pstoreu<Scalar, Packet>(dst + i, p); 1070 } 1071 for (; i < count; ++i) { 1072 dst[i] = src[i]; 1073 } 1074 // ******************************************************************** // 1075 } else if (kind == StridedLinearBufferCopy::Kind::Scatter) { 1076 // Scatter from `src` to `dst`. 1077 eigen_assert(src_stride == 1 && dst_stride != 1); 1078 for (; i <= vectorized_size; i += PacketSize) { 1079 Packet p = ploadu<Packet>(src + i); 1080 pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride); 1081 } 1082 for (; i < count; ++i) { 1083 dst[i * dst_stride] = src[i]; 1084 } 1085 // ******************************************************************** // 1086 } else if (kind == StridedLinearBufferCopy::Kind::FillLinear) { 1087 // Fill `dst` with value at `*src`. 1088 eigen_assert(src_stride == 0 && dst_stride == 1); 1089 const IndexType unrolled_size = count - 4 * PacketSize; 1090 Packet p = pload1<Packet>(src); 1091 for (; i <= unrolled_size; i += 4 * PacketSize) { 1092 for (int j = 0; j < 4; ++j) { 1093 pstoreu<Scalar, Packet>(dst + i + j * PacketSize, p); 1094 } 1095 } 1096 for (; i <= vectorized_size; i += PacketSize) { 1097 pstoreu<Scalar, Packet>(dst + i, p); 1098 } 1099 for (; i < count; ++i) { 1100 dst[i] = *src; 1101 } 1102 // ******************************************************************** // 1103 } else if (kind == StridedLinearBufferCopy::Kind::FillScatter) { 1104 // Scatter `*src` into `dst`. 1105 eigen_assert(src_stride == 0 && dst_stride != 1); 1106 Packet p = pload1<Packet>(src); 1107 for (; i <= vectorized_size; i += PacketSize) { 1108 pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride); 1109 } 1110 for (; i < count; ++i) { 1111 dst[i * dst_stride] = *src; 1112 } 1113 // ******************************************************************** // 1114 } else if (kind == StridedLinearBufferCopy::Kind::Gather) { 1115 // Gather from `src` into `dst`. 1116 eigen_assert(dst_stride == 1); 1117 for (; i <= vectorized_size; i += PacketSize) { 1118 Packet p = pgather<Scalar, Packet>(src + i * src_stride, src_stride); 1119 pstoreu<Scalar, Packet>(dst + i, p); 1120 } 1121 for (; i < count; ++i) { 1122 dst[i] = src[i * src_stride]; 1123 } 1124 // ******************************************************************** // 1125 } else if (kind == StridedLinearBufferCopy::Kind::Random) { 1126 // Random. 1127 for (; i < count; ++i) { 1128 dst[i * dst_stride] = src[i * src_stride]; 1129 } 1130 } else { 1131 eigen_assert(false); 1132 } 1133 } 1134 }; 1135 1136 // -------------------------------------------------------------------------- // 1137 // TensorBlockIO copies data from `src` tensor block, to the `dst` tensor block. 1138 // It's possible to specify src->dst dimension mapping for the copy operation. 1139 // Dimensions of `dst` specify how many elements have to be copied, for the 1140 // `src` we need to know only stride to navigate through source memory buffer. 1141 1142 template <typename Scalar, typename IndexType, int NumDims, int Layout> 1143 class TensorBlockIO { 1144 static const bool IsColMajor = (Layout == ColMajor); 1145 1146 typedef StridedLinearBufferCopy<Scalar, IndexType> LinCopy; 1147 1148 public: 1149 typedef DSizes<IndexType, NumDims> Dimensions; 1150 typedef DSizes<int, NumDims> DimensionsMap; 1151 1152 struct Dst { 1153 Dst(const Dimensions& dst_dims, const Dimensions& dst_strides, Scalar* dst, 1154 IndexType dst_offset = 0) 1155 : dims(dst_dims), strides(dst_strides), data(dst), offset(dst_offset) {} 1156 1157 Dimensions dims; 1158 Dimensions strides; 1159 Scalar* data; 1160 IndexType offset; 1161 }; 1162 1163 struct Src { 1164 Src(const Dimensions& src_strides, const Scalar* src, 1165 IndexType src_offset = 0) 1166 : strides(src_strides), data(src), offset(src_offset) {} 1167 1168 Dimensions strides; 1169 const Scalar* data; 1170 IndexType offset; 1171 }; 1172 1173 // Copies data to `dst` from `src`, using provided dimensions mapping: 1174 // 1175 // src_dimension_index = dst_to_src_dim_map[dst_dimension_index] 1176 // 1177 // Returns the number of copied elements. 1178 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType Copy( 1179 const Dst& dst, const Src& src, const DimensionsMap& dst_to_src_dim_map) { 1180 // Copy single scalar value from `src` to `dst`. 1181 if (NumDims == 0) { 1182 *(dst.data + dst.offset) = *(src.data + src.offset); 1183 return 1; 1184 } 1185 1186 // Both `dst` and `src` must have contiguous innermost dimension. We also 1187 // accept the special case with stride '0', because it's used as a trick to 1188 // implement broadcasting. 1189 { 1190 int inner_dim = IsColMajor ? 0 : NumDims - 1; 1191 EIGEN_UNUSED_VARIABLE(inner_dim); 1192 eigen_assert(dst.strides[inner_dim] == 1 || dst.strides[inner_dim] == 0); 1193 eigen_assert(src.strides[inner_dim] == 1 || src.strides[inner_dim] == 0); 1194 } 1195 1196 // Give a shorter name to `dst_to_src_dim_map`. 1197 const DimensionsMap& dim_map = dst_to_src_dim_map; 1198 1199 // Do not squeeze reordered inner dimensions. 1200 int num_squeezable_dims = NumSqueezableInnerDims(dim_map); 1201 1202 // NOTE: We find the innermost dimension (contiguous in memory) in the dst 1203 // block, and we write data linearly into that dimension, reading it from 1204 // the src. If dimensions are reordered, we might end up reading data from 1205 // the src with `stride != 1`. 1206 // 1207 // NOTE: Random-Read/Linear-Write can be up to ~2X faster than 1208 // Linear-Read/Random-Write: https://stackoverflow.com/a/54935680 1209 1210 // Find the innermost dimension in the dst whose size is not 1. This is the 1211 // effective inner dim. 1212 int num_size_one_inner_dims = 0; 1213 for (int i = 0; i < num_squeezable_dims; ++i) { 1214 const int dst_dim = IsColMajor ? i : NumDims - i - 1; 1215 if (dst.dims[dst_dim] != 1) break; 1216 num_size_one_inner_dims++; 1217 } 1218 1219 // If all dimensions are of size 1, just copy a scalar from `src` to `dst`. 1220 if (num_size_one_inner_dims == NumDims) { 1221 *(dst.data + dst.offset) = *(src.data + src.offset); 1222 return 1; 1223 } 1224 1225 // Outermost dimension in the dst with `stride == 1` (contiguous in memory). 1226 const int dst_stride1_dim = IsColMajor 1227 ? num_size_one_inner_dims 1228 : NumDims - num_size_one_inner_dims - 1; 1229 1230 // Dimension in the src that corresponds to the dst innermost dimension. 1231 const int src_dim_for_dst_stride1_dim = 1232 NumDims == 0 ? 1 : dim_map[dst_stride1_dim]; 1233 1234 // Size of the innermost dimension (length of contiguous blocks of memory). 1235 IndexType dst_inner_dim_size = NumDims == 0 ? 1 : dst.dims[dst_stride1_dim]; 1236 1237 // Squeeze multiple inner dims into one if they are contiguous in `dst` and 1238 // `src` memory, so we can do less linear copy calls. 1239 for (int i = num_size_one_inner_dims + 1; i < num_squeezable_dims; ++i) { 1240 const int dst_dim = IsColMajor ? i : NumDims - i - 1; 1241 const IndexType dst_stride = dst.strides[dst_dim]; 1242 const IndexType src_stride = src.strides[dim_map[dst_dim]]; 1243 if (dst_inner_dim_size == dst_stride && dst_stride == src_stride) { 1244 dst_inner_dim_size *= dst.dims[dst_dim]; 1245 ++num_size_one_inner_dims; 1246 } else { 1247 break; 1248 } 1249 } 1250 1251 // Setup strides to read data from `src` and write to `dst`. 1252 IndexType input_offset = src.offset; 1253 IndexType output_offset = dst.offset; 1254 IndexType input_stride = 1255 NumDims == 0 ? 1 : src.strides[src_dim_for_dst_stride1_dim]; 1256 IndexType output_stride = NumDims == 0 ? 1 : dst.strides[dst_stride1_dim]; 1257 1258 const int at_least_1_dim = NumDims <= 1 ? 1 : NumDims - 1; 1259 array<BlockIteratorState, at_least_1_dim> it; 1260 1261 // Initialize block iterator state. Squeeze away any dimension of size 1. 1262 int idx = 0; // currently initialized iterator state index 1263 for (int i = num_size_one_inner_dims; i < NumDims - 1; ++i) { 1264 const int dst_dim = IsColMajor ? i + 1 : NumDims - i - 2; 1265 if (dst.dims[dst_dim] == 1) continue; 1266 1267 it[idx].size = dst.dims[dst_dim]; 1268 it[idx].input_stride = src.strides[dim_map[dst_dim]]; 1269 it[idx].output_stride = dst.strides[dst_dim]; 1270 1271 it[idx].input_span = it[idx].input_stride * (it[idx].size - 1); 1272 it[idx].output_span = it[idx].output_stride * (it[idx].size - 1); 1273 1274 idx++; 1275 } 1276 1277 // Iterate copying data from src to dst. 1278 const IndexType block_total_size = NumDims == 0 ? 1 : dst.dims.TotalSize(); 1279 1280 #define COPY_INNER_DIM(KIND) \ 1281 IndexType num_copied = 0; \ 1282 for (num_copied = 0; num_copied < block_total_size; \ 1283 num_copied += dst_inner_dim_size) { \ 1284 LinCopy::template Run<KIND>( \ 1285 typename LinCopy::Dst(output_offset, output_stride, dst.data), \ 1286 typename LinCopy::Src(input_offset, input_stride, src.data), \ 1287 dst_inner_dim_size); \ 1288 \ 1289 for (int j = 0; j < idx; ++j) { \ 1290 if (++it[j].count < it[j].size) { \ 1291 input_offset += it[j].input_stride; \ 1292 output_offset += it[j].output_stride; \ 1293 break; \ 1294 } \ 1295 it[j].count = 0; \ 1296 input_offset -= it[j].input_span; \ 1297 output_offset -= it[j].output_span; \ 1298 } \ 1299 } \ 1300 return num_copied; 1301 1302 if (input_stride == 1 && output_stride == 1) { 1303 COPY_INNER_DIM(LinCopy::Kind::Linear); 1304 } else if (input_stride == 1 && output_stride != 1) { 1305 COPY_INNER_DIM(LinCopy::Kind::Scatter); 1306 } else if (input_stride == 0 && output_stride == 1) { 1307 COPY_INNER_DIM(LinCopy::Kind::FillLinear); 1308 } else if (input_stride == 0 && output_stride != 1) { 1309 COPY_INNER_DIM(LinCopy::Kind::FillScatter); 1310 } else if (output_stride == 1) { 1311 COPY_INNER_DIM(LinCopy::Kind::Gather); 1312 } else { 1313 COPY_INNER_DIM(LinCopy::Kind::Random); 1314 } 1315 1316 #undef COPY_INNER_DIM 1317 } 1318 1319 // Copy from `src` to `dst` with an identity src->dst dimension map. Returns 1320 // the number of copied elements. 1321 static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexType Copy(const Dst& dst, 1322 const Src& src) { 1323 DimensionsMap dst_to_src_map; 1324 for (int i = 0; i < NumDims; ++i) dst_to_src_map[i] = i; 1325 return Copy(dst, src, dst_to_src_map); 1326 } 1327 1328 private: 1329 struct BlockIteratorState { 1330 BlockIteratorState() 1331 : size(0), 1332 count(0), 1333 input_stride(0), 1334 output_stride(0), 1335 input_span(0), 1336 output_span(0) {} 1337 1338 IndexType size; 1339 IndexType count; 1340 IndexType input_stride; 1341 IndexType output_stride; 1342 IndexType input_span; 1343 IndexType output_span; 1344 }; 1345 1346 // Compute how many inner dimensions it's allowed to squeeze when doing IO 1347 // between two tensor blocks. It's safe to squeeze inner dimensions, only 1348 // if they are not reordered. 1349 static int NumSqueezableInnerDims(const DimensionsMap& dim_map) { 1350 int num_squeezable_dims = 0; 1351 for (int i = 0; i < NumDims; ++i) { 1352 const int dim = IsColMajor ? i : NumDims - i - 1; 1353 if (dim_map[dim] != dim) break; 1354 num_squeezable_dims++; 1355 } 1356 return num_squeezable_dims; 1357 } 1358 }; 1359 1360 // -------------------------------------------------------------------------- // 1361 // TensorBlockAssignment assigns a block expression of type `TensorBlockExpr` to 1362 // a Tensor block defined by `desc`, backed by a memory buffer at `target`. 1363 // 1364 // Currently there is no way to write from a Tensor expression to a block of 1365 // memory, if dimensions are reordered. If you need to do that, you should 1366 // materialize a Tensor block expression into a memory buffer, and then use 1367 // TensorBlockIO to copy data between two memory buffers with a custom 1368 // `target->src` dimension map (see definition above). 1369 // 1370 // Also currently the innermost dimension of `target` must have a stride '1' 1371 // (contiguous in memory). This restriction could be lifted with a `pscatter`, 1372 // but in practice it's never needed, and there is a similar TensorBlockIO 1373 // workaround for that. 1374 // 1375 // TODO(ezhulenev): TensorBlockAssignment is a special case of TensorBlockIO 1376 // where `src` is a tensor expression. Explore if it is possible to rewrite IO 1377 // to use expressions instead of pointers, and after that TensorBlockAssignment 1378 // will become an alias to IO. 1379 template <typename Scalar, int NumDims, typename TensorBlockExpr, 1380 typename IndexType = Eigen::Index> 1381 class TensorBlockAssignment { 1382 // We will use coeff/packet path to evaluate block expressions. 1383 typedef TensorEvaluator<const TensorBlockExpr, DefaultDevice> 1384 TensorBlockEvaluator; 1385 1386 typedef DSizes<IndexType, NumDims> Dimensions; 1387 1388 enum { 1389 Vectorizable = packet_traits<Scalar>::Vectorizable, 1390 PacketSize = packet_traits<Scalar>::size 1391 }; 1392 1393 template <bool Vectorizable, typename Evaluator> 1394 struct InnerDimAssign { 1395 EIGEN_ALWAYS_INLINE static void Run(Scalar* target, IndexType count, 1396 const Evaluator& eval, 1397 IndexType eval_offset) { 1398 for (IndexType i = 0; i < count; ++i) { 1399 target[i] = eval.coeff(eval_offset + i); 1400 } 1401 } 1402 }; 1403 1404 template <typename Evaluator> 1405 struct InnerDimAssign<true, Evaluator> { 1406 EIGEN_ALWAYS_INLINE static void Run(Scalar* target, IndexType count, 1407 const Evaluator& eval, 1408 IndexType eval_offset) { 1409 typedef typename packet_traits<Scalar>::type Packet; 1410 1411 const IndexType unrolled_size = count - 4 * PacketSize; 1412 const IndexType vectorized_size = count - PacketSize; 1413 IndexType i = 0; 1414 1415 for (; i <= unrolled_size; i += 4 * PacketSize) { 1416 for (int j = 0; j < 4; ++j) { 1417 const IndexType idx = eval_offset + i + j * PacketSize; 1418 Packet p = eval.template packet<Unaligned>(idx); 1419 pstoreu<Scalar>(target + i + j * PacketSize, p); 1420 } 1421 } 1422 1423 for (; i <= vectorized_size; i += PacketSize) { 1424 Packet p = eval.template packet<Unaligned>(eval_offset + i); 1425 pstoreu<Scalar>(target + i, p); 1426 } 1427 1428 for (; i < count; ++i) { 1429 target[i] = eval.coeff(eval_offset + i); 1430 } 1431 } 1432 }; 1433 1434 public: 1435 struct Target { 1436 Target(const Dimensions& target_dims, const Dimensions& target_strides, 1437 Scalar* target_data, IndexType target_offset = 0) 1438 : dims(target_dims), 1439 strides(target_strides), 1440 data(target_data), 1441 offset(target_offset) {} 1442 1443 Dimensions dims; 1444 Dimensions strides; 1445 Scalar* data; 1446 IndexType offset; 1447 }; 1448 1449 static Target target(const Dimensions& target_dims, 1450 const Dimensions& target_strides, Scalar* target_data, 1451 IndexType target_offset = 0) { 1452 return Target(target_dims, target_strides, target_data, target_offset); 1453 } 1454 1455 template <typename TargetDimsIndexType, typename TargetStridesIndexType> 1456 static Target target( 1457 const DSizes<TargetDimsIndexType, NumDims>& target_dims, 1458 const DSizes<TargetStridesIndexType, NumDims>& target_strides, 1459 Scalar* target_data, IndexType target_offset = 0) { 1460 // DSizes constructor will do index type promotion if it's safe. 1461 return Target(Dimensions(target_dims), Dimensions(target_strides), 1462 target_data, target_offset); 1463 } 1464 1465 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( 1466 const Target& target, const TensorBlockExpr& expr) { 1467 // Prepare evaluator for block expression. 1468 DefaultDevice default_device; 1469 TensorBlockEvaluator eval(expr, default_device); 1470 1471 // Tensor block expression dimension should match destination dimensions. 1472 eigen_assert(dimensions_match(target.dims, eval.dimensions())); 1473 1474 static const int Layout = TensorBlockEvaluator::Layout; 1475 static const bool is_col_major = Layout == ColMajor; 1476 1477 // Initialize output inner dimension size based on a layout. 1478 const IndexType output_size = NumDims == 0 ? 1 : target.dims.TotalSize(); 1479 const int inner_dim_idx = is_col_major ? 0 : NumDims - 1; 1480 IndexType output_inner_dim_size = target.dims[inner_dim_idx]; 1481 1482 // Target inner dimension stride must be '1'. 1483 eigen_assert(target.strides[inner_dim_idx] == 1); 1484 1485 // Squeeze multiple inner dims into one if they are contiguous in `target`. 1486 IndexType num_squeezed_dims = 0; 1487 for (Index i = 1; i < NumDims; ++i) { 1488 const Index dim = is_col_major ? i : NumDims - i - 1; 1489 const IndexType target_stride = target.strides[dim]; 1490 1491 if (output_inner_dim_size == target_stride) { 1492 output_inner_dim_size *= target.dims[dim]; 1493 num_squeezed_dims++; 1494 } else { 1495 break; 1496 } 1497 } 1498 1499 // Initialize output block iterator state. Dimension in this array are 1500 // always in inner_most -> outer_most order (col major layout). 1501 array<BlockIteratorState, NumDims> it; 1502 1503 int idx = 0; // currently initialized iterator state index 1504 for (Index i = num_squeezed_dims; i < NumDims - 1; ++i) { 1505 const Index dim = is_col_major ? i + 1 : NumDims - i - 2; 1506 1507 it[idx].count = 0; 1508 it[idx].size = target.dims[dim]; 1509 it[idx].output_stride = target.strides[dim]; 1510 it[idx].output_span = it[idx].output_stride * (it[idx].size - 1); 1511 idx++; 1512 } 1513 1514 // We read block expression from the beginning, and start writing data to 1515 // `target` at given offset. 1516 IndexType input_offset = 0; 1517 IndexType output_offset = target.offset; 1518 1519 // Iterate copying data from `eval` to `target`. 1520 for (IndexType i = 0; i < output_size; i += output_inner_dim_size) { 1521 // Assign to `target` at current offset. 1522 InnerDimAssign<Vectorizable && TensorBlockEvaluator::PacketAccess, 1523 TensorBlockEvaluator>::Run(target.data + output_offset, 1524 output_inner_dim_size, eval, 1525 input_offset); 1526 1527 // Move input offset forward by the number of assigned coefficients. 1528 input_offset += output_inner_dim_size; 1529 1530 // Update index. 1531 for (int j = 0; j < idx; ++j) { 1532 if (++it[j].count < it[j].size) { 1533 output_offset += it[j].output_stride; 1534 break; 1535 } 1536 it[j].count = 0; 1537 output_offset -= it[j].output_span; 1538 } 1539 } 1540 } 1541 1542 private: 1543 struct BlockIteratorState { 1544 BlockIteratorState() 1545 : count(0), size(0), output_stride(0), output_span(0) {} 1546 1547 IndexType count; 1548 IndexType size; 1549 IndexType output_stride; 1550 IndexType output_span; 1551 }; 1552 }; 1553 1554 // -------------------------------------------------------------------------- // 1555 1556 } // namespace internal 1557 } // namespace Eigen 1558 1559 #endif // EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H