cart-elc

Source code for CART-ELC
git clone git://git.laack.co/cart-elc.git
Log | Files | Refs | README | LICENSE

TensorBlock.h (60851B)


      1 // This file is part of Eigen, a lightweight C++ template library
      2 // for linear algebra.
      3 //
      4 // This Source Code Form is subject to the terms of the Mozilla
      5 // Public License v. 2.0. If a copy of the MPL was not distributed
      6 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
      7 
      8 #ifndef EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H
      9 #define EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H
     10 
     11 namespace Eigen {
     12 namespace internal {
     13 
     14 // -------------------------------------------------------------------------- //
     15 // Forward declarations for templates defined below.
     16 template <typename Scalar, typename IndexType, int NumDims, int Layout>
     17 class TensorBlockIO;
     18 
     19 // -------------------------------------------------------------------------- //
     20 // Helper function to compute strides for densely stored buffer of given
     21 // dimensions.
     22 
     23 // TODO(ezhulenev): We compute strides 1000 times in different evaluators, use
     24 // this function instead everywhere.
     25 template <int Layout, typename IndexType, int NumDims>
     26 EIGEN_ALWAYS_INLINE DSizes<IndexType, NumDims> strides(
     27     const DSizes<IndexType, NumDims>& dimensions) {
     28   DSizes<IndexType, NumDims> strides;
     29   if (NumDims == 0) return strides;
     30 
     31   // TODO(ezhulenev): Use templates to unroll this loop (similar to
     32   // h_array_reduce in CXX11meta.h)? Benchmark it.
     33   if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
     34     strides[0] = 1;
     35     for (int i = 1; i < NumDims; ++i) {
     36       strides[i] = strides[i - 1] * dimensions[i - 1];
     37     }
     38   } else {
     39     strides[NumDims - 1] = 1;
     40     for (int i = NumDims - 2; i >= 0; --i) {
     41       strides[i] = strides[i + 1] * dimensions[i + 1];
     42     }
     43   }
     44 
     45   return strides;
     46 }
     47 
     48 template <int Layout, typename IndexType, size_t NumDims>
     49 EIGEN_ALWAYS_INLINE DSizes<IndexType, NumDims> strides(
     50     const Eigen::array<IndexType, NumDims>& dimensions) {
     51   return strides<Layout>(DSizes<IndexType, NumDims>(dimensions));
     52 }
     53 
     54 template <int Layout, std::ptrdiff_t... Indices>
     55 EIGEN_STRONG_INLINE DSizes<std::ptrdiff_t, sizeof...(Indices)> strides(
     56     const Sizes<Indices...>& sizes) {
     57   return strides<Layout>(DSizes<std::ptrdiff_t, sizeof...(Indices)>(sizes));
     58 }
     59 
     60 // -------------------------------------------------------------------------- //
     61 
     62 // Tensor block shape type defines what are the shape preference for the blocks
     63 // extracted from the larger tensor.
     64 //
     65 // Example: blocks of 100 elements from the large 100x100 tensor:
     66 // - tensor: 100x100
     67 // - target_block_size: 100
     68 //
     69 // TensorBlockShapeType:
     70 //  - kUniformAllDims: 100 blocks of size 10x10
     71 //  - kSkewedInnerDims: 100 blocks of size 100x1 (or 1x100 depending on a column
     72 //                      or row major layout)
     73 enum class TensorBlockShapeType { kUniformAllDims, kSkewedInnerDims };
     74 
     75 struct TensorBlockResourceRequirements {
     76   TensorBlockShapeType shape_type;  // target block shape
     77   size_t size;                      // target block size
     78   TensorOpCost cost_per_coeff;      // cost of computing a single block element
     79 
     80 #ifdef EIGEN_HIPCC
     81   // For HIPCC, we need to explicitly declare as a "device fun", the constructor
     82   // which is implicitly invoked in the "merge" / "any" routines. else HIPCC
     83   // errors out complaining about the lack of a matching constructor
     84   EIGEN_DEVICE_FUNC
     85   TensorBlockResourceRequirements(TensorBlockShapeType shape_type_, size_t size_,
     86 				  TensorOpCost cost_)
     87     : shape_type(shape_type_), size(size_), cost_per_coeff(cost_)
     88   {}
     89 #endif
     90 
     91   template <typename Scalar>
     92   EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements withShapeAndSize(
     93       TensorBlockShapeType shape_type, size_t size_in_bytes,
     94       TensorOpCost cost) {
     95     const size_t size = numext::maxi(size_t(1), size_in_bytes / sizeof(Scalar));
     96     return {shape_type, size, cost};
     97   }
     98 
     99   template <typename Scalar>
    100   EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements withShapeAndSize(
    101       TensorBlockShapeType shape_type, size_t size_in_bytes) {
    102     // This default cost per coefficient is valid for most materialized tensor
    103     // block evaluation implementations, because they typically just read
    104     // coefficients from the underlying tensor storage, and write to the tensor
    105     // block buffer (scratch or destination memory, reads and writes have linear
    106     // access pattern). We ignore the fixed cost of block evaluation, because in
    107     // practice it should negligible.
    108     //
    109     // Lazy block evaluation adds the cost of calling a functor for each
    110     // coefficient.
    111     //
    112     // All non-trivial block evaluation implementations must provide their own
    113     // cost approximation (e.g. shuffling inner dimension has a much higher cost
    114     // because it reads memory randomly, although the total number of moved
    115     // bytes is the same).
    116     return withShapeAndSize<Scalar>(shape_type, size_in_bytes,
    117                                     {/*bytes_loaded=*/sizeof(Scalar),
    118                                      /*bytes_stored=*/sizeof(Scalar),
    119                                      /*compute_cycles=*/0});
    120   }
    121 
    122   template <typename Scalar>
    123   EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements skewed(
    124       size_t size_in_bytes) {
    125     return withShapeAndSize<Scalar>(TensorBlockShapeType::kSkewedInnerDims,
    126                                     size_in_bytes);
    127   }
    128 
    129   template <typename Scalar>
    130   EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements uniform(
    131       size_t size_in_bytes) {
    132     return withShapeAndSize<Scalar>(TensorBlockShapeType::kUniformAllDims,
    133                                     size_in_bytes);
    134   }
    135 
    136   EIGEN_DEVICE_FUNC
    137   static EIGEN_STRONG_INLINE TensorBlockResourceRequirements
    138   merge(const TensorBlockResourceRequirements& lhs,
    139         const TensorBlockResourceRequirements& rhs) {
    140     return {merge(lhs.shape_type, rhs.shape_type),           // shape_type
    141             merge(lhs.size, rhs.size),                       // size
    142             merge(lhs.cost_per_coeff, rhs.cost_per_coeff)};  // cost_per_coeff
    143   }
    144 
    145   EIGEN_DEVICE_FUNC TensorBlockResourceRequirements& addCostPerCoeff(
    146       TensorOpCost cost) {
    147     cost_per_coeff += cost;
    148     return *this;
    149   }
    150 
    151   // This is a resource requirement that should be returned from expressions
    152   // that do not have any block evaluation preference (e.g. default tensor
    153   // expression with raw buffer access).
    154   EIGEN_DEVICE_FUNC
    155   static EIGEN_STRONG_INLINE TensorBlockResourceRequirements any() {
    156     return {TensorBlockShapeType::kUniformAllDims, 1, {0, 0, 0}};
    157   }
    158 
    159  private:
    160   using Requirements = TensorBlockResourceRequirements;
    161 
    162   EIGEN_DEVICE_FUNC
    163   static EIGEN_STRONG_INLINE size_t merge(size_t lhs_size, size_t rhs_size) {
    164     return numext::maxi(lhs_size, rhs_size);
    165   }
    166 
    167   EIGEN_DEVICE_FUNC
    168   static EIGEN_STRONG_INLINE TensorBlockShapeType
    169   merge(TensorBlockShapeType lhs, TensorBlockShapeType rhs) {
    170     return (lhs == TensorBlockShapeType::kSkewedInnerDims ||
    171             rhs == TensorBlockShapeType::kSkewedInnerDims)
    172                ? TensorBlockShapeType::kSkewedInnerDims
    173                : TensorBlockShapeType::kUniformAllDims;
    174   }
    175 
    176   EIGEN_DEVICE_FUNC
    177   static EIGEN_STRONG_INLINE TensorOpCost merge(TensorOpCost lhs_cost,
    178                                                 TensorOpCost rhs_cost) {
    179     return lhs_cost + rhs_cost;
    180   }
    181 };
    182 
    183 // -------------------------------------------------------------------------- //
    184 // TensorBlockDescriptor specifies a block offset within a tensor and the block
    185 // sizes along each of the tensor dimensions.
    186 
    187 template <int NumDims, typename IndexType = Eigen::Index>
    188 class TensorBlockDescriptor {
    189  public:
    190   typedef DSizes<IndexType, NumDims> Dimensions;
    191 
    192   // If we evaluate a Tensor assignment, and expression on the left, already has
    193   // a memory buffer, then we might do performance optimization, and evaluate
    194   // the root expression directly into the final output memory. Some time it's
    195   // possible to reuse it for materializing subexpressions inside an expression
    196   // tree, to to avoid dynamic memory allocation.
    197   //
    198   // The pointer type of the underlying storage is erased, because passing
    199   // Scalar type through all the expression evaluation layers is way too many
    200   // templates. In practice destination buffer type should always match the
    201   // evaluated expression scalar type.
    202   class DestinationBuffer {
    203    public:
    204     enum DestinationBufferKind : int {
    205       // The above explicit specification of "int" as the enum basetype is
    206       // needed to get around a HIPCC link error ("the field type is not
    207       // amp-compatible")
    208       // which is issued for class members with the enum type.
    209       // TODO(rocm):
    210       // remove the "int" basetype once HIPCC has been fixed to not error out
    211       // in the above scenario.
    212 
    213       // Destination buffer is not defined (`m_data` == nullptr).
    214       kEmpty,
    215 
    216       // Tensor block defined by an owning tensor block descriptor can fit
    217       // contiguously into the destination buffer. In this case it's safe to
    218       // materialize tensor block in the destination buffer, wrap it in a
    219       // TensorMap, and use to build Eigen expression on top of it.
    220       kContiguous,
    221 
    222       // Destination buffer strides do not match strides of the contiguously
    223       // stored block, and it's impossible to define a TensorMap over this
    224       // buffer. However if we are evaluating a root of an expression tree, we
    225       // still can materialize an output into this destination, because we can
    226       // guarantee that no one will ever access it through block API.
    227       //
    228       // In theory it is possible to build valid TensorStriding<TensorMap>
    229       // expression on top of this destination buffer, however it has
    230       // inefficient coeff/packet access, and defeats the purpose of fast block
    231       // evaluation API.
    232       kStrided
    233     };
    234 
    235     template <typename Scalar>
    236     Scalar* data() const {
    237       eigen_assert(m_data_type_size == sizeof(Scalar));
    238       return static_cast<Scalar*>(m_data);
    239     }
    240 
    241     const Dimensions& strides() const { return m_strides; }
    242     const DestinationBufferKind& kind() const { return m_kind; }
    243 
    244    private:
    245     friend class TensorBlockDescriptor;
    246 
    247     DestinationBuffer() : m_data(NULL), m_data_type_size(0), m_kind(kEmpty) {}
    248 
    249     template <typename Scalar>
    250     DestinationBuffer(Scalar* data, const Dimensions& strides,
    251                       DestinationBufferKind kind)
    252         : m_data(static_cast<void*>(data)),
    253           m_data_type_size(sizeof(Scalar)),
    254           m_strides(strides),
    255           m_kind(kind) {}
    256 
    257     template <int Layout, typename Scalar>
    258     static DestinationBuffer make(const TensorBlockDescriptor& desc,
    259                                   Scalar* data, const Dimensions& strides) {
    260       return DestinationBuffer(data, strides, kind<Layout>(desc, strides));
    261     }
    262 
    263     template <int Layout>
    264     static DestinationBufferKind kind(const TensorBlockDescriptor& desc,
    265                                       const Dimensions& strides) {
    266       const Dimensions& desc_dims = desc.dimensions();
    267       const Dimensions& desc_strides = internal::strides<Layout>(desc_dims);
    268       for (int i = 0; i < NumDims; ++i) {
    269         if (desc_dims[i] == 1) continue;
    270         if (desc_strides[i] != strides[i]) return kStrided;
    271       }
    272       return kContiguous;
    273     }
    274 
    275     // Storage pointer is type erased, to reduce template bloat, but we still
    276     // keep the size of the underlying element type for error checking.
    277     void* m_data;
    278     size_t m_data_type_size;
    279 
    280     // Destination buffer dimensions always match the dimensions of a tensor
    281     // block descriptor it belongs to, however strides might be different.
    282     Dimensions m_strides;
    283 
    284     DestinationBufferKind m_kind;
    285   };
    286 
    287   TensorBlockDescriptor(const IndexType offset, const Dimensions& dimensions,
    288                         const DestinationBuffer& destination)
    289       : m_offset(offset),
    290         m_dimensions(dimensions),
    291         m_destination(destination) {}
    292 
    293   TensorBlockDescriptor(const IndexType offset, const Dimensions& dimensions)
    294       : m_offset(offset),
    295         m_dimensions(dimensions),
    296         m_destination(DestinationBuffer()) {}
    297 
    298   IndexType offset() const { return m_offset; }
    299   const Dimensions& dimensions() const { return m_dimensions; }
    300   IndexType dimension(int index) const { return m_dimensions[index]; }
    301   IndexType size() const { return array_prod<IndexType>(m_dimensions); }
    302 
    303   const DestinationBuffer& destination() const { return m_destination; }
    304 
    305   template <int Layout, typename Scalar>
    306   void AddDestinationBuffer(Scalar* dst_base, const Dimensions& dst_strides) {
    307     eigen_assert(dst_base != NULL);
    308     m_destination =
    309         DestinationBuffer::template make<Layout>(*this, dst_base, dst_strides);
    310   }
    311 
    312   template <int Layout, typename Scalar, typename DstStridesIndexType>
    313   void AddDestinationBuffer(
    314       Scalar* dst_base,
    315       const DSizes<DstStridesIndexType, NumDims>& dst_strides) {
    316     // DSizes constructor will do index type promotion if it's safe.
    317     AddDestinationBuffer<Layout>(dst_base, Dimensions(dst_strides));
    318   }
    319 
    320   TensorBlockDescriptor& DropDestinationBuffer() {
    321     m_destination.m_data = NULL;
    322     m_destination.m_kind = DestinationBuffer::kEmpty;
    323     return *this;
    324   }
    325 
    326   bool HasDestinationBuffer() const {
    327     return m_destination.kind() != DestinationBuffer::kEmpty;
    328   }
    329 
    330   // Returns a copy of `*this` with updated offset.
    331   TensorBlockDescriptor WithOffset(IndexType offset) const {
    332     return TensorBlockDescriptor(offset, m_dimensions, m_destination);
    333   }
    334 
    335  private:
    336   // Offset and dimensions are immutable after construction. Block descriptor
    337   // can only be mutated by adding or dropping destination.
    338   const IndexType m_offset;
    339   const Dimensions m_dimensions;
    340   DestinationBuffer m_destination;
    341 };
    342 
    343 // -------------------------------------------------------------------------- //
    344 // TensorBlockMapper is responsible for iterating over the blocks of a tensor.
    345 
    346 template <int NumDims, int Layout, typename IndexType = Eigen::Index>
    347 class TensorBlockMapper {
    348   typedef TensorBlockDescriptor<NumDims, IndexType> BlockDescriptor;
    349 
    350  public:
    351   typedef DSizes<IndexType, NumDims> Dimensions;
    352 
    353   TensorBlockMapper() = default;
    354   TensorBlockMapper(const DSizes<IndexType, NumDims>& dimensions,
    355                     const TensorBlockResourceRequirements& requirements)
    356       : m_tensor_dimensions(dimensions), m_requirements(requirements) {
    357     // Compute block dimensions and the total number of blocks.
    358     InitializeBlockDimensions();
    359   }
    360 
    361   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType blockCount() const {
    362     return m_total_block_count;
    363   }
    364 
    365   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType blockTotalSize() const {
    366     return m_block_dimensions.TotalSize();
    367   }
    368 
    369   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const DSizes<IndexType, NumDims>&
    370   blockDimensions() const {
    371     return m_block_dimensions;
    372   }
    373 
    374   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE BlockDescriptor
    375   blockDescriptor(IndexType block_index) const {
    376     static const bool isColMajor = Layout == static_cast<int>(ColMajor);
    377 
    378     IndexType offset = 0;
    379     DSizes<IndexType, NumDims> dimensions;
    380 
    381     if (NumDims == 0) return BlockDescriptor(offset, dimensions);
    382 
    383     // Iterate outer -> inner dimensions.
    384     for (int i = NumDims - 1; i >= 0; --i) {
    385       const int dim = isColMajor ? i : NumDims - i - 1;
    386 
    387       const IndexType idx = block_index / m_block_strides[dim];
    388       block_index -= idx * m_block_strides[dim];
    389 
    390       const IndexType coord = idx * m_block_dimensions[dim];
    391       dimensions[dim] = numext::mini(m_tensor_dimensions[dim] - coord,
    392                                      m_block_dimensions[dim]);
    393       offset += coord * m_tensor_strides[dim];
    394     }
    395 
    396     return {offset, dimensions};
    397   }
    398 
    399  private:
    400   void InitializeBlockDimensions() {
    401     // Requested block shape and size.
    402     const TensorBlockShapeType shape_type = m_requirements.shape_type;
    403     IndexType target_block_size =
    404         numext::maxi<IndexType>(1, static_cast<IndexType>(m_requirements.size));
    405 
    406     IndexType tensor_size = m_tensor_dimensions.TotalSize();
    407 
    408     // Corner case: one of the dimensions is zero. Logic below is too complex
    409     // to handle this case on a general basis, just use unit block size.
    410     // Note: we must not yield blocks with zero dimensions (recipe for
    411     // overflows/underflows, divisions by zero and NaNs later).
    412     if (tensor_size == 0) {
    413       for (int i = 0; i < NumDims; ++i) {
    414         m_block_dimensions[i] = 1;
    415       }
    416       m_total_block_count = 0;
    417       return;
    418     }
    419 
    420     // If tensor fits into a target block size, evaluate it as a single block.
    421     if (tensor_size <= target_block_size) {
    422       m_block_dimensions = m_tensor_dimensions;
    423       m_total_block_count = 1;
    424       // The only valid block index is `0`, and in this case we do not need
    425       // to compute real strides for tensor or blocks (see blockDescriptor).
    426       for (int i = 0; i < NumDims; ++i) {
    427         m_tensor_strides[i] = 0;
    428         m_block_strides[i] = 1;
    429       }
    430       return;
    431     }
    432 
    433     static const bool isColMajor = Layout == static_cast<int>(ColMajor);
    434 
    435     // Block shape skewed towards inner dimension.
    436     if (shape_type == TensorBlockShapeType::kSkewedInnerDims) {
    437       IndexType coeff_to_allocate = target_block_size;
    438 
    439       for (int i = 0; i < NumDims; ++i) {
    440         const int dim = isColMajor ? i : NumDims - i - 1;
    441         m_block_dimensions[dim] =
    442             numext::mini(coeff_to_allocate, m_tensor_dimensions[dim]);
    443         coeff_to_allocate = divup(
    444             coeff_to_allocate,
    445             numext::maxi(static_cast<IndexType>(1), m_block_dimensions[dim]));
    446       }
    447       eigen_assert(coeff_to_allocate == 1);
    448 
    449     } else if (shape_type == TensorBlockShapeType::kUniformAllDims) {
    450       // Tensor will not fit within 'target_block_size' budget: calculate tensor
    451       // block dimension sizes based on "square" dimension size target.
    452       const IndexType dim_size_target = convert_index<IndexType>(
    453           std::pow(static_cast<float>(target_block_size),
    454                    1.0f / static_cast<float>(m_block_dimensions.rank())));
    455 
    456       for (int i = 0; i < NumDims; ++i) {
    457         // TODO(andydavis) Adjust the inner most 'block_dim_size' to make it
    458         // a multiple of the packet size. Note that reducing
    459         // 'block_dim_size' in this manner can increase the number of
    460         // blocks, and so will amplify any per-block overhead.
    461         m_block_dimensions[i] =
    462             numext::mini(dim_size_target, m_tensor_dimensions[i]);
    463       }
    464 
    465       // Add any un-allocated coefficients to inner dimension(s).
    466       IndexType total_size = m_block_dimensions.TotalSize();
    467       for (int i = 0; i < NumDims; ++i) {
    468         const int dim = isColMajor ? i : NumDims - i - 1;
    469 
    470         if (m_block_dimensions[dim] < m_tensor_dimensions[dim]) {
    471           const IndexType total_size_other_dims =
    472               total_size / m_block_dimensions[dim];
    473           const IndexType alloc_avail =
    474               divup<IndexType>(target_block_size, total_size_other_dims);
    475           if (alloc_avail == m_block_dimensions[dim]) {
    476             // Insufficient excess coefficients to allocate.
    477             break;
    478           }
    479           m_block_dimensions[dim] =
    480               numext::mini(m_tensor_dimensions[dim], alloc_avail);
    481           total_size = total_size_other_dims * m_block_dimensions[dim];
    482         }
    483       }
    484 
    485     } else {
    486       eigen_assert(false);  // unknown block shape
    487     }
    488 
    489     eigen_assert(m_block_dimensions.TotalSize() >=
    490                  numext::mini<IndexType>(target_block_size,
    491                                          m_tensor_dimensions.TotalSize()));
    492 
    493     // Calculate block counts by dimension and total block count.
    494     DSizes<IndexType, NumDims> block_count;
    495     for (int i = 0; i < NumDims; ++i) {
    496       block_count[i] = divup(m_tensor_dimensions[i], m_block_dimensions[i]);
    497     }
    498     m_total_block_count = array_prod(block_count);
    499 
    500     // Calculate block strides (used for enumerating blocks).
    501     m_tensor_strides = strides<Layout>(m_tensor_dimensions);
    502     m_block_strides = strides<Layout>(block_count);
    503   }
    504 
    505   DSizes<IndexType, NumDims> m_tensor_dimensions;
    506   TensorBlockResourceRequirements m_requirements;
    507 
    508   DSizes<IndexType, NumDims> m_block_dimensions;
    509   IndexType m_total_block_count;
    510 
    511   DSizes<IndexType, NumDims> m_tensor_strides;
    512   DSizes<IndexType, NumDims> m_block_strides;
    513 };
    514 
    515 // -------------------------------------------------------------------------- //
    516 // TensorBlockScratchAllocator is responsible for allocating temporary buffers
    517 // for block evaluation (output or input block materialization). Given that
    518 // Eigen expression traversal order is deterministic, all temporary allocations
    519 // are happening in the same order, and usually have exactly the same size.
    520 // Scratch allocator keeps a trace of all dynamic allocations, and after the
    521 // first block evaluation is completed, we should be able to reuse all the
    522 // temporary buffers for the next block evaluation.
    523 
    524 template <typename Device>
    525 class TensorBlockScratchAllocator {
    526  public:
    527   explicit TensorBlockScratchAllocator(const Device& device)
    528       : m_device(device), m_allocation_index(0) {}
    529 
    530   ~TensorBlockScratchAllocator() {
    531     for (size_t i = 0; i < m_allocations.size(); ++i) {
    532       m_device.deallocate(m_allocations[i].ptr);
    533     }
    534   }
    535 
    536   void* allocate(size_t size) {
    537     // TODO(ezhulenev): Remove when replaced with inlined vector.
    538     if (m_allocations.capacity() == 0) m_allocations.reserve(8);
    539 
    540     // Check if we already have an existing allocation att current index.
    541     const int num_allocations = static_cast<int>(m_allocations.size());
    542     const bool has_allocation = m_allocation_index < num_allocations;
    543 
    544     // Allocation index can't be larger than the number of allocations.
    545     eigen_assert(m_allocation_index <= num_allocations);
    546 
    547     // If we have existing allocation, and its size is larger or equal to
    548     // requested size, we do nothing.
    549 
    550     // If current allocation can't fit requested size, we deallocate it, and
    551     // replace with a larger allocation.
    552     if (has_allocation && m_allocations[m_allocation_index].size < size) {
    553       m_device.deallocate(m_allocations[m_allocation_index].ptr);
    554       m_allocations[m_allocation_index].ptr = m_device.allocate(size);
    555       m_allocations[m_allocation_index].size = size;
    556     }
    557 
    558     // Make a new allocation if we don't have and existing one.
    559     if (!has_allocation) {
    560       Allocation allocation;
    561       allocation.ptr = m_device.allocate(size);
    562       allocation.size = size;
    563       m_allocations.push_back(allocation);
    564     }
    565 
    566     eigen_assert(m_allocations[m_allocation_index].ptr != NULL);
    567     eigen_assert(m_allocations[m_allocation_index].size >= size);
    568 
    569     return m_allocations[m_allocation_index++].ptr;
    570   }
    571 
    572   void reset() { m_allocation_index = 0; }
    573 
    574  private:
    575   struct Allocation {
    576     void* ptr;
    577     size_t size;
    578   };
    579 
    580   const Device& m_device;
    581   int m_allocation_index;
    582   // TODO(ezhulenev): This should be an inlined vector.
    583   std::vector<Allocation> m_allocations;
    584 };
    585 
    586 // -------------------------------------------------------------------------- //
    587 // TensorBlockKind represents all possible block kinds, that can be produced by
    588 // TensorEvaluator::evalBlock function.
    589 enum TensorBlockKind {
    590   // Tensor block that is a lazy expression that must be assigned to a
    591   // destination using TensorBlockAssign.
    592   kExpr,
    593 
    594   // Tensor block that is a view into a memory buffer owned by an underlying
    595   // Tensor expression (e.g. it can be a view into a Tensor buffer).
    596   kView,
    597 
    598   // Tensor block that was materialized in a scratch memory buffer, allocated
    599   // with TensorBlockScratchAllocator. This block must be copied to a
    600   // destination, similar to a block of `kExpr` type.
    601   kMaterializedInScratch,
    602 
    603   // Tensor block that was materialized directly into the final output memory
    604   // buffer. For example if the left side of an assignment is a Tensor, we can
    605   // directly materialize the block in the destination memory.
    606   //
    607   // If strides in the output buffer do not match tensor block strides, the
    608   // Tensor expression will be invalid, and should not be used by
    609   // TensorBlockAssign or for constructing another block expression.
    610   kMaterializedInOutput
    611 };
    612 
    613 // -------------------------------------------------------------------------- //
    614 // TensorBlockNotImplemented should be used to defined TensorBlock typedef in
    615 // TensorEvaluators that do not support block evaluation.
    616 
    617 class TensorBlockNotImplemented {
    618  public:
    619   typedef void XprType;
    620 };
    621 
    622 // -------------------------------------------------------------------------- //
    623 // XprScalar extracts Scalar type from the Eigen expressions (if expression type
    624 // is not void). It's required to be able to define lazy block expression for
    625 // argument types, that do not support block evaluation.
    626 
    627 template <typename XprType>
    628 struct XprScalar {
    629   typedef typename XprType::Scalar type;
    630 };
    631 template <>
    632 struct XprScalar<void> {
    633   typedef void type;
    634 };
    635 
    636 // -------------------------------------------------------------------------- //
    637 // TensorMaterializedBlock is a fully evaluated block of the original tensor,
    638 // and XprType is just a TensorMap over the data. This block type is typically
    639 // used to materialize blocks of tensor expressions, that can't be efficiently
    640 // represented as lazy Tensor expressions with fast coeff/packet operations,
    641 // e.g. we materialize all broadcasts into evaluated blocks.
    642 //
    643 // TensorMaterializedBlock does not own its memory buffer, it's either a memory
    644 // buffer that backs the original expression (e.g. block is just a view into a
    645 // Tensor), or a memory buffer allocated with scratch allocator, and in this
    646 // case the scratch allocator will deallocate it at the end of block based
    647 // expression execution.
    648 //
    649 // If the block was evaluated directly into the output buffer, and strides in
    650 // the output buffer do not match block strides, the TensorMap expression will
    651 // be invalid, and should never be used in block assignment or any other tensor
    652 // expression.
    653 
    654 template <typename Scalar, int NumDims, int Layout,
    655           typename IndexType = Eigen::Index>
    656 class TensorMaterializedBlock {
    657  public:
    658   typedef DSizes<IndexType, NumDims> Dimensions;
    659   typedef TensorMap<const Tensor<Scalar, NumDims, Layout> > XprType;
    660 
    661   TensorMaterializedBlock(TensorBlockKind kind, const Scalar* data,
    662                           const Dimensions& dimensions, bool valid_expr = true)
    663       : m_kind(kind),
    664         m_data(data),
    665         m_dimensions(dimensions),
    666         m_expr(m_data, m_dimensions),
    667         m_valid_expr(valid_expr) {
    668     eigen_assert(m_kind == internal::TensorBlockKind::kView ||
    669                  m_kind == internal::TensorBlockKind::kMaterializedInScratch ||
    670                  m_kind == internal::TensorBlockKind::kMaterializedInOutput);
    671   }
    672 
    673   TensorBlockKind kind() const { return m_kind; }
    674   // NOTE(ezhulenev): Returning XprType by value like in other block types
    675   // causes asan failures. The theory is that XprType::Nested doesn't work
    676   // properly for TensorMap.
    677   const XprType& expr() const {
    678     eigen_assert(m_valid_expr);
    679     return m_expr;
    680   }
    681   const Scalar* data() const { return m_data; }
    682   void cleanup() {}
    683 
    684   typedef internal::TensorBlockDescriptor<NumDims, IndexType> TensorBlockDesc;
    685 
    686   // TensorMaterializedBlock can be backed by different types of storage:
    687   //
    688   //   (1) Contiguous block of memory allocated with scratch allocator.
    689   //   (2) Contiguous block of memory reused from tensor block descriptor
    690   //       destination buffer.
    691   //   (3) Strided block of memory reused from tensor block descriptor
    692   //       destination buffer.
    693   //
    694   class Storage {
    695    public:
    696     Scalar* data() const { return m_data; }
    697     const Dimensions& dimensions() const { return m_dimensions; }
    698     const Dimensions& strides() const { return m_strides; }
    699 
    700     TensorMaterializedBlock AsTensorMaterializedBlock() const {
    701       return TensorMaterializedBlock(
    702           m_materialized_in_output
    703               ? internal::TensorBlockKind::kMaterializedInOutput
    704               : internal::TensorBlockKind::kMaterializedInScratch,
    705           m_data, m_dimensions, !m_strided_storage);
    706     }
    707 
    708    private:
    709     friend class TensorMaterializedBlock;
    710 
    711     Storage(Scalar* data, const Dimensions& dimensions,
    712             const Dimensions& strides, bool materialized_in_output,
    713             bool strided_storage)
    714         : m_data(data),
    715           m_dimensions(dimensions),
    716           m_strides(strides),
    717           m_materialized_in_output(materialized_in_output),
    718           m_strided_storage(strided_storage) {}
    719 
    720     Scalar* m_data;
    721     Dimensions m_dimensions;
    722     Dimensions m_strides;
    723     bool m_materialized_in_output;
    724     bool m_strided_storage;
    725   };
    726 
    727   // Creates a storage for materialized block either from the block descriptor
    728   // destination buffer, or allocates a new buffer with scratch allocator.
    729   template <typename TensorBlockScratch>
    730   EIGEN_STRONG_INLINE static Storage prepareStorage(
    731       TensorBlockDesc& desc, TensorBlockScratch& scratch,
    732       bool allow_strided_storage = false) {
    733     // Try to reuse destination as an output block buffer.
    734     typedef typename TensorBlockDesc::DestinationBuffer DestinationBuffer;
    735 
    736     if (desc.destination().kind() == DestinationBuffer::kContiguous) {
    737       Scalar* buffer = desc.destination().template data<Scalar>();
    738       desc.DropDestinationBuffer();
    739       return Storage(buffer, desc.dimensions(),
    740                      internal::strides<Layout>(desc.dimensions()),
    741                      /*materialized_in_output=*/true,
    742                      /*strided_storage=*/false);
    743 
    744     } else if (desc.destination().kind() == DestinationBuffer::kStrided &&
    745                allow_strided_storage) {
    746       Scalar* buffer = desc.destination().template data<Scalar>();
    747       desc.DropDestinationBuffer();
    748       return Storage(buffer, desc.dimensions(), desc.destination().strides(),
    749                      /*materialized_in_output=*/true, /*strided_storage=*/true);
    750 
    751     } else {
    752       void* mem = scratch.allocate(desc.size() * sizeof(Scalar));
    753       return Storage(static_cast<Scalar*>(mem), desc.dimensions(),
    754                      internal::strides<Layout>(desc.dimensions()),
    755                      /*materialized_in_output=*/false,
    756                      /*strided_storage=*/false);
    757     }
    758   }
    759 
    760   // Creates a materialized block for the given descriptor from a memory buffer.
    761   template <typename DataDimensions, typename TensorBlockScratch>
    762   EIGEN_STRONG_INLINE static TensorMaterializedBlock materialize(
    763       const Scalar* data, const DataDimensions& data_dims,
    764       TensorBlockDesc& desc, TensorBlockScratch& scratch) {
    765     eigen_assert(array_size<DataDimensions>::value == desc.dimensions().size());
    766 
    767     // If a tensor block dimensions covers a contiguous block of the underlying
    768     // memory, we can skip block buffer memory allocation, and construct a block
    769     // from existing `data` memory buffer.
    770     //
    771     // Example: (RowMajor layout)
    772     //   data_dims:          [11, 12, 13, 14]
    773     //   desc.dimensions():  [1,   1,  3, 14]
    774     //
    775     // In this case we can construct a TensorBlock starting at
    776     // `data + desc.offset()`, with a `desc.dimensions()` block sizes.
    777     static const bool is_col_major = Layout == ColMajor;
    778 
    779     // Find out how many inner dimensions have a matching size.
    780     int num_matching_inner_dims = 0;
    781     for (int i = 0; i < NumDims; ++i) {
    782       int dim = is_col_major ? i : NumDims - i - 1;
    783       if (data_dims[dim] != desc.dimensions()[dim]) break;
    784       ++num_matching_inner_dims;
    785     }
    786 
    787     // All the outer dimensions must be of size `1`, except a single dimension
    788     // before the matching inner dimension (`3` in the example above).
    789     bool can_use_direct_access = true;
    790     for (int i = num_matching_inner_dims + 1; i < NumDims; ++i) {
    791       int dim = is_col_major ? i : NumDims - i - 1;
    792       if (desc.dimension(dim) != 1) {
    793         can_use_direct_access = false;
    794         break;
    795       }
    796     }
    797 
    798     if (can_use_direct_access) {
    799       const Scalar* block_start = data + desc.offset();
    800       return TensorMaterializedBlock(internal::TensorBlockKind::kView,
    801                                      block_start, desc.dimensions());
    802 
    803     } else {
    804       // Reuse destination buffer or allocate new buffer with scratch allocator.
    805       const Storage storage = prepareStorage(desc, scratch);
    806 
    807       typedef internal::TensorBlockIO<Scalar, IndexType, NumDims, Layout>
    808           TensorBlockIO;
    809       typedef typename TensorBlockIO::Dst TensorBlockIODst;
    810       typedef typename TensorBlockIO::Src TensorBlockIOSrc;
    811 
    812       TensorBlockIOSrc src(internal::strides<Layout>(Dimensions(data_dims)),
    813                            data, desc.offset());
    814       TensorBlockIODst dst(storage.dimensions(), storage.strides(),
    815                            storage.data());
    816 
    817       TensorBlockIO::Copy(dst, src);
    818       return storage.AsTensorMaterializedBlock();
    819     }
    820   }
    821 
    822  private:
    823   TensorBlockKind m_kind;
    824   const Scalar* m_data;
    825   Dimensions m_dimensions;
    826   XprType m_expr;
    827   bool m_valid_expr;
    828 };
    829 
    830 // -------------------------------------------------------------------------- //
    831 // TensorCwiseUnaryBlock is a lazy tensor expression block that applies UnaryOp
    832 // functor to the blocks produced by the underlying Tensor expression.
    833 
    834 template <typename UnaryOp, typename ArgTensorBlock>
    835 class TensorCwiseUnaryBlock {
    836   static const bool NoArgBlockAccess =
    837       internal::is_void<typename ArgTensorBlock::XprType>::value;
    838 
    839  public:
    840   typedef typename conditional<
    841       NoArgBlockAccess, void,
    842       TensorCwiseUnaryOp<UnaryOp, const typename ArgTensorBlock::XprType> >::
    843       type XprType;
    844 
    845   typedef typename XprScalar<XprType>::type Scalar;
    846 
    847   TensorCwiseUnaryBlock(const ArgTensorBlock& arg_block, const UnaryOp& functor)
    848       : m_arg_block(arg_block), m_functor(functor) {}
    849 
    850   TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
    851 
    852   XprType expr() const { return XprType(m_arg_block.expr(), m_functor); }
    853   const Scalar* data() const { return NULL; }
    854   void cleanup() { m_arg_block.cleanup(); }
    855 
    856  private:
    857   ArgTensorBlock m_arg_block;
    858   UnaryOp m_functor;
    859 };
    860 
    861 // -------------------------------------------------------------------------- //
    862 // TensorCwiseUnaryBlock is a lazy tensor expression block that applies BinaryOp
    863 // functor to the blocks produced by the underlying Tensor expression.
    864 
    865 template <typename BinaryOp, typename LhsTensorBlock, typename RhsTensorBlock>
    866 class TensorCwiseBinaryBlock {
    867   static const bool NoArgBlockAccess =
    868       internal::is_void<typename LhsTensorBlock::XprType>::value ||
    869       internal::is_void<typename RhsTensorBlock::XprType>::value;
    870 
    871  public:
    872   typedef typename conditional<
    873       NoArgBlockAccess, void,
    874       TensorCwiseBinaryOp<BinaryOp, const typename LhsTensorBlock::XprType,
    875                           const typename RhsTensorBlock::XprType> >::type
    876       XprType;
    877 
    878   typedef typename XprScalar<XprType>::type Scalar;
    879 
    880   TensorCwiseBinaryBlock(const LhsTensorBlock& left_block,
    881                          const RhsTensorBlock& right_block,
    882                          const BinaryOp& functor)
    883       : m_left_block(left_block),
    884         m_right_block(right_block),
    885         m_functor(functor) {}
    886 
    887   TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
    888 
    889   XprType expr() const {
    890     return XprType(m_left_block.expr(), m_right_block.expr(), m_functor);
    891   }
    892 
    893   const Scalar* data() const { return NULL; }
    894 
    895   void cleanup() {
    896     m_left_block.cleanup();
    897     m_right_block.cleanup();
    898   }
    899 
    900  private:
    901   LhsTensorBlock m_left_block;
    902   RhsTensorBlock m_right_block;
    903   BinaryOp m_functor;
    904 };
    905 
    906 // -------------------------------------------------------------------------- //
    907 // TensorUnaryExprBlock is a lazy tensor expression block that can construct
    908 // an arbitrary tensor expression from a block of the underlying type (this is a
    909 // generalization of the TensorCwiseUnaryBlock for arbitrary expressions).
    910 
    911 template <typename BlockFactory, typename ArgTensorBlock>
    912 class TensorUnaryExprBlock {
    913   typedef typename ArgTensorBlock::XprType ArgXprType;
    914   static const bool NoArgBlockAccess = internal::is_void<ArgXprType>::value;
    915 
    916  public:
    917   typedef typename conditional<
    918       NoArgBlockAccess, void,
    919       typename BlockFactory::template XprType<ArgXprType>::type>::type XprType;
    920 
    921   typedef typename XprScalar<XprType>::type Scalar;
    922 
    923   TensorUnaryExprBlock(const ArgTensorBlock& arg_block,
    924                        const BlockFactory& factory)
    925       : m_arg_block(arg_block), m_factory(factory) {}
    926 
    927   TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
    928   XprType expr() const { return m_factory.expr(m_arg_block.expr()); }
    929   const Scalar* data() const { return NULL; }
    930   void cleanup() { m_arg_block.cleanup(); }
    931 
    932  private:
    933   ArgTensorBlock m_arg_block;
    934   BlockFactory m_factory;
    935 };
    936 
    937 // -------------------------------------------------------------------------- //
    938 // TensorTernaryExprBlock is a lazy tensor expression block that can construct
    939 // an arbitrary tensor expression from three blocks of the underlying type.
    940 
    941 template <typename BlockFactory, typename Arg1TensorBlock,
    942           typename Arg2TensorBlock, typename Arg3TensorBlock>
    943 class TensorTernaryExprBlock {
    944   typedef typename Arg1TensorBlock::XprType Arg1XprType;
    945   typedef typename Arg2TensorBlock::XprType Arg2XprType;
    946   typedef typename Arg3TensorBlock::XprType Arg3XprType;
    947 
    948   static const bool NoArgBlockAccess = internal::is_void<Arg1XprType>::value ||
    949                                        internal::is_void<Arg2XprType>::value ||
    950                                        internal::is_void<Arg3XprType>::value;
    951 
    952  public:
    953   typedef typename conditional<
    954       NoArgBlockAccess, void,
    955       typename BlockFactory::template XprType<Arg1XprType, Arg2XprType,
    956                                               Arg3XprType>::type>::type XprType;
    957 
    958   typedef typename XprScalar<XprType>::type Scalar;
    959 
    960   TensorTernaryExprBlock(const Arg1TensorBlock& arg1_block,
    961                          const Arg2TensorBlock& arg2_block,
    962                          const Arg3TensorBlock& arg3_block,
    963                          const BlockFactory& factory)
    964       : m_arg1_block(arg1_block),
    965         m_arg2_block(arg2_block),
    966         m_arg3_block(arg3_block),
    967         m_factory(factory) {}
    968 
    969   TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
    970   XprType expr() const {
    971     return m_factory.expr(m_arg1_block.expr(), m_arg2_block.expr(),
    972                           m_arg3_block.expr());
    973   }
    974   const Scalar* data() const { return NULL; }
    975   void cleanup() {
    976     m_arg1_block.cleanup();
    977     m_arg2_block.cleanup();
    978     m_arg3_block.cleanup();
    979   }
    980 
    981  private:
    982   Arg1TensorBlock m_arg1_block;
    983   Arg2TensorBlock m_arg2_block;
    984   Arg3TensorBlock m_arg3_block;
    985   BlockFactory m_factory;
    986 };
    987 
    988 // -------------------------------------------------------------------------- //
    989 // StridedLinearBufferCopy provides a method to copy data between two linear
    990 // buffers with different strides, with optimized paths for scatter/gather.
    991 
    992 template <typename Scalar, typename IndexType>
    993 class StridedLinearBufferCopy {
    994   typedef typename packet_traits<Scalar>::type Packet;
    995   enum {
    996     Vectorizable = packet_traits<Scalar>::Vectorizable,
    997     PacketSize = packet_traits<Scalar>::size
    998   };
    999 
   1000  public:
   1001   // Specifying linear copy kind statically gives ~30% speedup for small sizes.
   1002   enum class Kind {
   1003     Linear = 0,       // src_stride == 1 && dst_stride == 1
   1004     Scatter = 1,      // src_stride == 1 && dst_stride != 1
   1005     FillLinear = 2,   // src_stride == 0 && dst_stride == 1
   1006     FillScatter = 3,  // src_stride == 0 && dst_stride != 1
   1007     Gather = 4,       // dst_stride == 1
   1008     Random = 5        // everything else
   1009   };
   1010 
   1011   struct Dst {
   1012     Dst(IndexType o, IndexType s, Scalar* d) : offset(o), stride(s), data(d) {}
   1013 
   1014     IndexType offset;
   1015     IndexType stride;
   1016     Scalar* data;
   1017   };
   1018 
   1019   struct Src {
   1020     Src(IndexType o, IndexType s, const Scalar* d)
   1021         : offset(o), stride(s), data(d) {}
   1022 
   1023     IndexType offset;
   1024     IndexType stride;
   1025     const Scalar* data;
   1026   };
   1027 
   1028   template <typename StridedLinearBufferCopy::Kind kind>
   1029   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(const Dst& dst,
   1030                                                         const Src& src,
   1031                                                         const size_t count) {
   1032     Run<kind>(count, dst.offset, dst.stride, dst.data, src.offset, src.stride,
   1033               src.data);
   1034   }
   1035 
   1036  private:
   1037   template <typename StridedLinearBufferCopy::Kind kind>
   1038   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
   1039       const IndexType count, const IndexType dst_offset,
   1040       const IndexType dst_stride, Scalar* EIGEN_RESTRICT dst_data,
   1041       const IndexType src_offset, const IndexType src_stride,
   1042       const Scalar* EIGEN_RESTRICT src_data) {
   1043     const Scalar* src = &src_data[src_offset];
   1044     Scalar* dst = &dst_data[dst_offset];
   1045 
   1046     if (!Vectorizable) {
   1047       for (Index i = 0; i < count; ++i) {
   1048         dst[i * dst_stride] = src[i * src_stride];
   1049       }
   1050       return;
   1051     }
   1052 
   1053     const IndexType vectorized_size = count - PacketSize;
   1054     IndexType i = 0;
   1055 
   1056     if (kind == StridedLinearBufferCopy::Kind::Linear) {
   1057       // ******************************************************************** //
   1058       // Linear copy from `src` to `dst`.
   1059       const IndexType unrolled_size = count - 4 * PacketSize;
   1060       eigen_assert(src_stride == 1 && dst_stride == 1);
   1061       for (; i <= unrolled_size; i += 4 * PacketSize) {
   1062         for (int j = 0; j < 4; ++j) {
   1063           Packet p = ploadu<Packet>(src + i + j * PacketSize);
   1064           pstoreu<Scalar, Packet>(dst + i + j * PacketSize, p);
   1065         }
   1066       }
   1067       for (; i <= vectorized_size; i += PacketSize) {
   1068         Packet p = ploadu<Packet>(src + i);
   1069         pstoreu<Scalar, Packet>(dst + i, p);
   1070       }
   1071       for (; i < count; ++i) {
   1072         dst[i] = src[i];
   1073       }
   1074       // ******************************************************************** //
   1075     } else if (kind == StridedLinearBufferCopy::Kind::Scatter) {
   1076       // Scatter from `src` to `dst`.
   1077       eigen_assert(src_stride == 1 && dst_stride != 1);
   1078       for (; i <= vectorized_size; i += PacketSize) {
   1079         Packet p = ploadu<Packet>(src + i);
   1080         pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride);
   1081       }
   1082       for (; i < count; ++i) {
   1083         dst[i * dst_stride] = src[i];
   1084       }
   1085       // ******************************************************************** //
   1086     } else if (kind == StridedLinearBufferCopy::Kind::FillLinear) {
   1087       // Fill `dst` with value at `*src`.
   1088       eigen_assert(src_stride == 0 && dst_stride == 1);
   1089       const IndexType unrolled_size = count - 4 * PacketSize;
   1090       Packet p = pload1<Packet>(src);
   1091       for (; i <= unrolled_size; i += 4 * PacketSize) {
   1092         for (int j = 0; j < 4; ++j) {
   1093           pstoreu<Scalar, Packet>(dst + i + j * PacketSize, p);
   1094         }
   1095       }
   1096       for (; i <= vectorized_size; i += PacketSize) {
   1097         pstoreu<Scalar, Packet>(dst + i, p);
   1098       }
   1099       for (; i < count; ++i) {
   1100         dst[i] = *src;
   1101       }
   1102       // ******************************************************************** //
   1103     } else if (kind == StridedLinearBufferCopy::Kind::FillScatter) {
   1104       // Scatter `*src` into `dst`.
   1105       eigen_assert(src_stride == 0 && dst_stride != 1);
   1106       Packet p = pload1<Packet>(src);
   1107       for (; i <= vectorized_size; i += PacketSize) {
   1108         pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride);
   1109       }
   1110       for (; i < count; ++i) {
   1111         dst[i * dst_stride] = *src;
   1112       }
   1113       // ******************************************************************** //
   1114     } else if (kind == StridedLinearBufferCopy::Kind::Gather) {
   1115       // Gather from `src` into `dst`.
   1116       eigen_assert(dst_stride == 1);
   1117       for (; i <= vectorized_size; i += PacketSize) {
   1118         Packet p = pgather<Scalar, Packet>(src + i * src_stride, src_stride);
   1119         pstoreu<Scalar, Packet>(dst + i, p);
   1120       }
   1121       for (; i < count; ++i) {
   1122         dst[i] = src[i * src_stride];
   1123       }
   1124       // ******************************************************************** //
   1125     } else if (kind == StridedLinearBufferCopy::Kind::Random) {
   1126       // Random.
   1127       for (; i < count; ++i) {
   1128         dst[i * dst_stride] = src[i * src_stride];
   1129       }
   1130     } else {
   1131       eigen_assert(false);
   1132     }
   1133   }
   1134 };
   1135 
   1136 // -------------------------------------------------------------------------- //
   1137 // TensorBlockIO copies data from `src` tensor block, to the `dst` tensor block.
   1138 // It's possible to specify src->dst dimension mapping for the copy operation.
   1139 // Dimensions of `dst` specify how many elements have to be copied, for the
   1140 // `src` we need to know only stride to navigate through source memory buffer.
   1141 
   1142 template <typename Scalar, typename IndexType, int NumDims, int Layout>
   1143 class TensorBlockIO {
   1144   static const bool IsColMajor = (Layout == ColMajor);
   1145 
   1146   typedef StridedLinearBufferCopy<Scalar, IndexType> LinCopy;
   1147 
   1148  public:
   1149   typedef DSizes<IndexType, NumDims> Dimensions;
   1150   typedef DSizes<int, NumDims> DimensionsMap;
   1151 
   1152   struct Dst {
   1153     Dst(const Dimensions& dst_dims, const Dimensions& dst_strides, Scalar* dst,
   1154         IndexType dst_offset = 0)
   1155         : dims(dst_dims), strides(dst_strides), data(dst), offset(dst_offset) {}
   1156 
   1157     Dimensions dims;
   1158     Dimensions strides;
   1159     Scalar* data;
   1160     IndexType offset;
   1161   };
   1162 
   1163   struct Src {
   1164     Src(const Dimensions& src_strides, const Scalar* src,
   1165         IndexType src_offset = 0)
   1166         : strides(src_strides), data(src), offset(src_offset) {}
   1167 
   1168     Dimensions strides;
   1169     const Scalar* data;
   1170     IndexType offset;
   1171   };
   1172 
   1173   // Copies data to `dst` from `src`, using provided dimensions mapping:
   1174   //
   1175   //   src_dimension_index = dst_to_src_dim_map[dst_dimension_index]
   1176   //
   1177   // Returns the number of copied elements.
   1178   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType Copy(
   1179       const Dst& dst, const Src& src, const DimensionsMap& dst_to_src_dim_map) {
   1180     // Copy single scalar value from `src` to `dst`.
   1181     if (NumDims == 0) {
   1182       *(dst.data + dst.offset) = *(src.data + src.offset);
   1183       return 1;
   1184     }
   1185 
   1186     // Both `dst` and `src` must have contiguous innermost dimension. We also
   1187     // accept the special case with stride '0', because it's used as a trick to
   1188     // implement broadcasting.
   1189     {
   1190       int inner_dim = IsColMajor ? 0 : NumDims - 1;
   1191       EIGEN_UNUSED_VARIABLE(inner_dim);
   1192       eigen_assert(dst.strides[inner_dim] == 1 || dst.strides[inner_dim] == 0);
   1193       eigen_assert(src.strides[inner_dim] == 1 || src.strides[inner_dim] == 0);
   1194     }
   1195 
   1196     // Give a shorter name to `dst_to_src_dim_map`.
   1197     const DimensionsMap& dim_map = dst_to_src_dim_map;
   1198 
   1199     // Do not squeeze reordered inner dimensions.
   1200     int num_squeezable_dims = NumSqueezableInnerDims(dim_map);
   1201 
   1202     // NOTE: We find the innermost dimension (contiguous in memory) in the dst
   1203     // block, and we write data linearly into that dimension, reading it from
   1204     // the src. If dimensions are reordered, we might end up reading data from
   1205     // the src with `stride != 1`.
   1206     //
   1207     // NOTE: Random-Read/Linear-Write can be up to ~2X faster than
   1208     // Linear-Read/Random-Write: https://stackoverflow.com/a/54935680
   1209 
   1210     // Find the innermost dimension in the dst whose size is not 1. This is the
   1211     // effective inner dim.
   1212     int num_size_one_inner_dims = 0;
   1213     for (int i = 0; i < num_squeezable_dims; ++i) {
   1214       const int dst_dim = IsColMajor ? i : NumDims - i - 1;
   1215       if (dst.dims[dst_dim] != 1) break;
   1216       num_size_one_inner_dims++;
   1217     }
   1218 
   1219     // If all dimensions are of size 1, just copy a scalar from `src` to `dst`.
   1220     if (num_size_one_inner_dims == NumDims) {
   1221       *(dst.data + dst.offset) = *(src.data + src.offset);
   1222       return 1;
   1223     }
   1224 
   1225     // Outermost dimension in the dst with `stride == 1` (contiguous in memory).
   1226     const int dst_stride1_dim = IsColMajor
   1227                                     ? num_size_one_inner_dims
   1228                                     : NumDims - num_size_one_inner_dims - 1;
   1229 
   1230     // Dimension in the src that corresponds to the dst innermost dimension.
   1231     const int src_dim_for_dst_stride1_dim =
   1232         NumDims == 0 ? 1 : dim_map[dst_stride1_dim];
   1233 
   1234     // Size of the innermost dimension (length of contiguous blocks of memory).
   1235     IndexType dst_inner_dim_size = NumDims == 0 ? 1 : dst.dims[dst_stride1_dim];
   1236 
   1237     // Squeeze multiple inner dims into one if they are contiguous in `dst` and
   1238     // `src` memory, so we can do less linear copy calls.
   1239     for (int i = num_size_one_inner_dims + 1; i < num_squeezable_dims; ++i) {
   1240       const int dst_dim = IsColMajor ? i : NumDims - i - 1;
   1241       const IndexType dst_stride = dst.strides[dst_dim];
   1242       const IndexType src_stride = src.strides[dim_map[dst_dim]];
   1243       if (dst_inner_dim_size == dst_stride && dst_stride == src_stride) {
   1244         dst_inner_dim_size *= dst.dims[dst_dim];
   1245         ++num_size_one_inner_dims;
   1246       } else {
   1247         break;
   1248       }
   1249     }
   1250 
   1251     // Setup strides to read data from `src` and write to `dst`.
   1252     IndexType input_offset = src.offset;
   1253     IndexType output_offset = dst.offset;
   1254     IndexType input_stride =
   1255         NumDims == 0 ? 1 : src.strides[src_dim_for_dst_stride1_dim];
   1256     IndexType output_stride = NumDims == 0 ? 1 : dst.strides[dst_stride1_dim];
   1257 
   1258     const int at_least_1_dim = NumDims <= 1 ? 1 : NumDims - 1;
   1259     array<BlockIteratorState, at_least_1_dim> it;
   1260 
   1261     // Initialize block iterator state. Squeeze away any dimension of size 1.
   1262     int idx = 0;  // currently initialized iterator state index
   1263     for (int i = num_size_one_inner_dims; i < NumDims - 1; ++i) {
   1264       const int dst_dim = IsColMajor ? i + 1 : NumDims - i - 2;
   1265       if (dst.dims[dst_dim] == 1) continue;
   1266 
   1267       it[idx].size = dst.dims[dst_dim];
   1268       it[idx].input_stride = src.strides[dim_map[dst_dim]];
   1269       it[idx].output_stride = dst.strides[dst_dim];
   1270 
   1271       it[idx].input_span = it[idx].input_stride * (it[idx].size - 1);
   1272       it[idx].output_span = it[idx].output_stride * (it[idx].size - 1);
   1273 
   1274       idx++;
   1275     }
   1276 
   1277     // Iterate copying data from src to dst.
   1278     const IndexType block_total_size = NumDims == 0 ? 1 : dst.dims.TotalSize();
   1279 
   1280 #define COPY_INNER_DIM(KIND)                                           \
   1281   IndexType num_copied = 0;                                            \
   1282   for (num_copied = 0; num_copied < block_total_size;                  \
   1283        num_copied += dst_inner_dim_size) {                             \
   1284     LinCopy::template Run<KIND>(                                       \
   1285         typename LinCopy::Dst(output_offset, output_stride, dst.data), \
   1286         typename LinCopy::Src(input_offset, input_stride, src.data),   \
   1287         dst_inner_dim_size);                                           \
   1288                                                                        \
   1289     for (int j = 0; j < idx; ++j) {                                    \
   1290       if (++it[j].count < it[j].size) {                                \
   1291         input_offset += it[j].input_stride;                            \
   1292         output_offset += it[j].output_stride;                          \
   1293         break;                                                         \
   1294       }                                                                \
   1295       it[j].count = 0;                                                 \
   1296       input_offset -= it[j].input_span;                                \
   1297       output_offset -= it[j].output_span;                              \
   1298     }                                                                  \
   1299   }                                                                    \
   1300   return num_copied;
   1301 
   1302     if (input_stride == 1 && output_stride == 1) {
   1303       COPY_INNER_DIM(LinCopy::Kind::Linear);
   1304     } else if (input_stride == 1 && output_stride != 1) {
   1305       COPY_INNER_DIM(LinCopy::Kind::Scatter);
   1306     } else if (input_stride == 0 && output_stride == 1) {
   1307       COPY_INNER_DIM(LinCopy::Kind::FillLinear);
   1308     } else if (input_stride == 0 && output_stride != 1) {
   1309       COPY_INNER_DIM(LinCopy::Kind::FillScatter);
   1310     } else if (output_stride == 1) {
   1311       COPY_INNER_DIM(LinCopy::Kind::Gather);
   1312     } else {
   1313       COPY_INNER_DIM(LinCopy::Kind::Random);
   1314     }
   1315 
   1316 #undef COPY_INNER_DIM
   1317   }
   1318 
   1319   // Copy from `src` to `dst` with an identity src->dst dimension map. Returns
   1320   // the number of copied elements.
   1321   static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexType Copy(const Dst& dst,
   1322                                                               const Src& src) {
   1323     DimensionsMap dst_to_src_map;
   1324     for (int i = 0; i < NumDims; ++i) dst_to_src_map[i] = i;
   1325     return Copy(dst, src, dst_to_src_map);
   1326   }
   1327 
   1328  private:
   1329   struct BlockIteratorState {
   1330     BlockIteratorState()
   1331         : size(0),
   1332           count(0),
   1333           input_stride(0),
   1334           output_stride(0),
   1335           input_span(0),
   1336           output_span(0) {}
   1337 
   1338     IndexType size;
   1339     IndexType count;
   1340     IndexType input_stride;
   1341     IndexType output_stride;
   1342     IndexType input_span;
   1343     IndexType output_span;
   1344   };
   1345 
   1346   // Compute how many inner dimensions it's allowed to squeeze when doing IO
   1347   // between two tensor blocks. It's safe to squeeze inner dimensions, only
   1348   // if they are not reordered.
   1349   static int NumSqueezableInnerDims(const DimensionsMap& dim_map) {
   1350     int num_squeezable_dims = 0;
   1351     for (int i = 0; i < NumDims; ++i) {
   1352       const int dim = IsColMajor ? i : NumDims - i - 1;
   1353       if (dim_map[dim] != dim) break;
   1354       num_squeezable_dims++;
   1355     }
   1356     return num_squeezable_dims;
   1357   }
   1358 };
   1359 
   1360 // -------------------------------------------------------------------------- //
   1361 // TensorBlockAssignment assigns a block expression of type `TensorBlockExpr` to
   1362 // a Tensor block defined by `desc`, backed by a memory buffer at `target`.
   1363 //
   1364 // Currently there is no way to write from a Tensor expression to a block of
   1365 // memory, if dimensions are reordered. If you need to do that, you should
   1366 // materialize a Tensor block expression into a memory buffer, and then use
   1367 // TensorBlockIO to copy data between two memory buffers with a custom
   1368 // `target->src` dimension map (see definition above).
   1369 //
   1370 // Also currently the innermost dimension of `target` must have a stride '1'
   1371 // (contiguous in memory). This restriction could be lifted with a `pscatter`,
   1372 // but in practice it's never needed, and there is a similar TensorBlockIO
   1373 // workaround for that.
   1374 //
   1375 // TODO(ezhulenev): TensorBlockAssignment is a special case of TensorBlockIO
   1376 // where `src` is a tensor expression. Explore if it is possible to rewrite IO
   1377 // to use expressions instead of pointers, and after that TensorBlockAssignment
   1378 // will become an alias to IO.
   1379 template <typename Scalar, int NumDims, typename TensorBlockExpr,
   1380           typename IndexType = Eigen::Index>
   1381 class TensorBlockAssignment {
   1382   // We will use coeff/packet path to evaluate block expressions.
   1383   typedef TensorEvaluator<const TensorBlockExpr, DefaultDevice>
   1384       TensorBlockEvaluator;
   1385 
   1386   typedef DSizes<IndexType, NumDims> Dimensions;
   1387 
   1388   enum {
   1389     Vectorizable = packet_traits<Scalar>::Vectorizable,
   1390     PacketSize = packet_traits<Scalar>::size
   1391   };
   1392 
   1393   template <bool Vectorizable, typename Evaluator>
   1394   struct InnerDimAssign {
   1395     EIGEN_ALWAYS_INLINE static void Run(Scalar* target, IndexType count,
   1396                                         const Evaluator& eval,
   1397                                         IndexType eval_offset) {
   1398       for (IndexType i = 0; i < count; ++i) {
   1399         target[i] = eval.coeff(eval_offset + i);
   1400       }
   1401     }
   1402   };
   1403 
   1404   template <typename Evaluator>
   1405   struct InnerDimAssign<true, Evaluator> {
   1406     EIGEN_ALWAYS_INLINE static void Run(Scalar* target, IndexType count,
   1407                                         const Evaluator& eval,
   1408                                         IndexType eval_offset) {
   1409       typedef typename packet_traits<Scalar>::type Packet;
   1410 
   1411       const IndexType unrolled_size = count - 4 * PacketSize;
   1412       const IndexType vectorized_size = count - PacketSize;
   1413       IndexType i = 0;
   1414 
   1415       for (; i <= unrolled_size; i += 4 * PacketSize) {
   1416         for (int j = 0; j < 4; ++j) {
   1417           const IndexType idx = eval_offset + i + j * PacketSize;
   1418           Packet p = eval.template packet<Unaligned>(idx);
   1419           pstoreu<Scalar>(target + i + j * PacketSize, p);
   1420         }
   1421       }
   1422 
   1423       for (; i <= vectorized_size; i += PacketSize) {
   1424         Packet p = eval.template packet<Unaligned>(eval_offset + i);
   1425         pstoreu<Scalar>(target + i, p);
   1426       }
   1427 
   1428       for (; i < count; ++i) {
   1429         target[i] = eval.coeff(eval_offset + i);
   1430       }
   1431     }
   1432   };
   1433 
   1434  public:
   1435   struct Target {
   1436     Target(const Dimensions& target_dims, const Dimensions& target_strides,
   1437            Scalar* target_data, IndexType target_offset = 0)
   1438         : dims(target_dims),
   1439           strides(target_strides),
   1440           data(target_data),
   1441           offset(target_offset) {}
   1442 
   1443     Dimensions dims;
   1444     Dimensions strides;
   1445     Scalar* data;
   1446     IndexType offset;
   1447   };
   1448 
   1449   static Target target(const Dimensions& target_dims,
   1450                        const Dimensions& target_strides, Scalar* target_data,
   1451                        IndexType target_offset = 0) {
   1452     return Target(target_dims, target_strides, target_data, target_offset);
   1453   }
   1454 
   1455   template <typename TargetDimsIndexType, typename TargetStridesIndexType>
   1456   static Target target(
   1457       const DSizes<TargetDimsIndexType, NumDims>& target_dims,
   1458       const DSizes<TargetStridesIndexType, NumDims>& target_strides,
   1459       Scalar* target_data, IndexType target_offset = 0) {
   1460     // DSizes constructor will do index type promotion if it's safe.
   1461     return Target(Dimensions(target_dims), Dimensions(target_strides),
   1462                   target_data, target_offset);
   1463   }
   1464 
   1465   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
   1466       const Target& target, const TensorBlockExpr& expr) {
   1467     // Prepare evaluator for block expression.
   1468     DefaultDevice default_device;
   1469     TensorBlockEvaluator eval(expr, default_device);
   1470 
   1471     // Tensor block expression dimension should match destination dimensions.
   1472     eigen_assert(dimensions_match(target.dims, eval.dimensions()));
   1473 
   1474     static const int Layout = TensorBlockEvaluator::Layout;
   1475     static const bool is_col_major = Layout == ColMajor;
   1476 
   1477     // Initialize output inner dimension size based on a layout.
   1478     const IndexType output_size = NumDims == 0 ? 1 : target.dims.TotalSize();
   1479     const int inner_dim_idx = is_col_major ? 0 : NumDims - 1;
   1480     IndexType output_inner_dim_size = target.dims[inner_dim_idx];
   1481 
   1482     // Target inner dimension stride must be '1'.
   1483     eigen_assert(target.strides[inner_dim_idx] == 1);
   1484 
   1485     // Squeeze multiple inner dims into one if they are contiguous in `target`.
   1486     IndexType num_squeezed_dims = 0;
   1487     for (Index i = 1; i < NumDims; ++i) {
   1488       const Index dim = is_col_major ? i : NumDims - i - 1;
   1489       const IndexType target_stride = target.strides[dim];
   1490 
   1491       if (output_inner_dim_size == target_stride) {
   1492         output_inner_dim_size *= target.dims[dim];
   1493         num_squeezed_dims++;
   1494       } else {
   1495         break;
   1496       }
   1497     }
   1498 
   1499     // Initialize output block iterator state. Dimension in this array are
   1500     // always in inner_most -> outer_most order (col major layout).
   1501     array<BlockIteratorState, NumDims> it;
   1502 
   1503     int idx = 0;  // currently initialized iterator state index
   1504     for (Index i = num_squeezed_dims; i < NumDims - 1; ++i) {
   1505       const Index dim = is_col_major ? i + 1 : NumDims - i - 2;
   1506 
   1507       it[idx].count = 0;
   1508       it[idx].size = target.dims[dim];
   1509       it[idx].output_stride = target.strides[dim];
   1510       it[idx].output_span = it[idx].output_stride * (it[idx].size - 1);
   1511       idx++;
   1512     }
   1513 
   1514     // We read block expression from the beginning, and start writing data to
   1515     // `target` at given offset.
   1516     IndexType input_offset = 0;
   1517     IndexType output_offset = target.offset;
   1518 
   1519     // Iterate copying data from `eval` to `target`.
   1520     for (IndexType i = 0; i < output_size; i += output_inner_dim_size) {
   1521       // Assign to `target` at current offset.
   1522       InnerDimAssign<Vectorizable && TensorBlockEvaluator::PacketAccess,
   1523                      TensorBlockEvaluator>::Run(target.data + output_offset,
   1524                                                 output_inner_dim_size, eval,
   1525                                                 input_offset);
   1526 
   1527       // Move input offset forward by the number of assigned coefficients.
   1528       input_offset += output_inner_dim_size;
   1529 
   1530       // Update index.
   1531       for (int j = 0; j < idx; ++j) {
   1532         if (++it[j].count < it[j].size) {
   1533           output_offset += it[j].output_stride;
   1534           break;
   1535         }
   1536         it[j].count = 0;
   1537         output_offset -= it[j].output_span;
   1538       }
   1539     }
   1540   }
   1541 
   1542  private:
   1543   struct BlockIteratorState {
   1544     BlockIteratorState()
   1545         : count(0), size(0), output_stride(0), output_span(0) {}
   1546 
   1547     IndexType count;
   1548     IndexType size;
   1549     IndexType output_stride;
   1550     IndexType output_span;
   1551   };
   1552 };
   1553 
   1554 // -------------------------------------------------------------------------- //
   1555 
   1556 }  // namespace internal
   1557 }  // namespace Eigen
   1558 
   1559 #endif  // EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H