cart-elc

Source code for CART-ELC
git clone git://git.laack.co/cart-elc.git
Log | Files | Refs | README | LICENSE

TensorScan.h (20091B)


      1 // This file is part of Eigen, a lightweight C++ template library
      2 // for linear algebra.
      3 //
      4 // Copyright (C) 2016 Igor Babuschkin <igor@babuschk.in>
      5 //
      6 // This Source Code Form is subject to the terms of the Mozilla
      7 // Public License v. 2.0. If a copy of the MPL was not distributed
      8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
      9 
     10 #ifndef EIGEN_CXX11_TENSOR_TENSOR_SCAN_H
     11 #define EIGEN_CXX11_TENSOR_TENSOR_SCAN_H
     12 
     13 namespace Eigen {
     14 
     15 namespace internal {
     16 
     17 template <typename Op, typename XprType>
     18 struct traits<TensorScanOp<Op, XprType> >
     19     : public traits<XprType> {
     20   typedef typename XprType::Scalar Scalar;
     21   typedef traits<XprType> XprTraits;
     22   typedef typename XprTraits::StorageKind StorageKind;
     23   typedef typename XprType::Nested Nested;
     24   typedef typename remove_reference<Nested>::type _Nested;
     25   static const int NumDimensions = XprTraits::NumDimensions;
     26   static const int Layout = XprTraits::Layout;
     27   typedef typename XprTraits::PointerType PointerType;
     28 };
     29 
     30 template<typename Op, typename XprType>
     31 struct eval<TensorScanOp<Op, XprType>, Eigen::Dense>
     32 {
     33   typedef const TensorScanOp<Op, XprType>& type;
     34 };
     35 
     36 template<typename Op, typename XprType>
     37 struct nested<TensorScanOp<Op, XprType>, 1,
     38             typename eval<TensorScanOp<Op, XprType> >::type>
     39 {
     40   typedef TensorScanOp<Op, XprType> type;
     41 };
     42 } // end namespace internal
     43 
     44 /** \class TensorScan
     45   * \ingroup CXX11_Tensor_Module
     46   *
     47   * \brief Tensor scan class.
     48   */
     49 template <typename Op, typename XprType>
     50 class TensorScanOp
     51     : public TensorBase<TensorScanOp<Op, XprType>, ReadOnlyAccessors> {
     52 public:
     53   typedef typename Eigen::internal::traits<TensorScanOp>::Scalar Scalar;
     54   typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
     55   typedef typename XprType::CoeffReturnType CoeffReturnType;
     56   typedef typename Eigen::internal::nested<TensorScanOp>::type Nested;
     57   typedef typename Eigen::internal::traits<TensorScanOp>::StorageKind StorageKind;
     58   typedef typename Eigen::internal::traits<TensorScanOp>::Index Index;
     59 
     60   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorScanOp(
     61       const XprType& expr, const Index& axis, bool exclusive = false, const Op& op = Op())
     62       : m_expr(expr), m_axis(axis), m_accumulator(op), m_exclusive(exclusive) {}
     63 
     64   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     65   const Index axis() const { return m_axis; }
     66   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     67   const XprType& expression() const { return m_expr; }
     68   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     69   const Op accumulator() const { return m_accumulator; }
     70   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     71   bool exclusive() const { return m_exclusive; }
     72 
     73 protected:
     74   typename XprType::Nested m_expr;
     75   const Index m_axis;
     76   const Op m_accumulator;
     77   const bool m_exclusive;
     78 };
     79 
     80 
     81 namespace internal {
     82 
     83 template <typename Self>
     84 EIGEN_STRONG_INLINE void ReduceScalar(Self& self, Index offset,
     85                                       typename Self::CoeffReturnType* data) {
     86   // Compute the scan along the axis, starting at the given offset
     87   typename Self::CoeffReturnType accum = self.accumulator().initialize();
     88   if (self.stride() == 1) {
     89     if (self.exclusive()) {
     90       for (Index curr = offset; curr < offset + self.size(); ++curr) {
     91         data[curr] = self.accumulator().finalize(accum);
     92         self.accumulator().reduce(self.inner().coeff(curr), &accum);
     93       }
     94     } else {
     95       for (Index curr = offset; curr < offset + self.size(); ++curr) {
     96         self.accumulator().reduce(self.inner().coeff(curr), &accum);
     97         data[curr] = self.accumulator().finalize(accum);
     98       }
     99     }
    100   } else {
    101     if (self.exclusive()) {
    102       for (Index idx3 = 0; idx3 < self.size(); idx3++) {
    103         Index curr = offset + idx3 * self.stride();
    104         data[curr] = self.accumulator().finalize(accum);
    105         self.accumulator().reduce(self.inner().coeff(curr), &accum);
    106       }
    107     } else {
    108       for (Index idx3 = 0; idx3 < self.size(); idx3++) {
    109         Index curr = offset + idx3 * self.stride();
    110         self.accumulator().reduce(self.inner().coeff(curr), &accum);
    111         data[curr] = self.accumulator().finalize(accum);
    112       }
    113     }
    114   }
    115 }
    116 
    117 template <typename Self>
    118 EIGEN_STRONG_INLINE void ReducePacket(Self& self, Index offset,
    119                                       typename Self::CoeffReturnType* data) {
    120   using Scalar = typename Self::CoeffReturnType;
    121   using Packet = typename Self::PacketReturnType;
    122   // Compute the scan along the axis, starting at the calculated offset
    123   Packet accum = self.accumulator().template initializePacket<Packet>();
    124   if (self.stride() == 1) {
    125     if (self.exclusive()) {
    126       for (Index curr = offset; curr < offset + self.size(); ++curr) {
    127         internal::pstoreu<Scalar, Packet>(data + curr, self.accumulator().finalizePacket(accum));
    128         self.accumulator().reducePacket(self.inner().template packet<Unaligned>(curr), &accum);
    129       }
    130     } else {
    131       for (Index curr = offset; curr < offset + self.size(); ++curr) {
    132         self.accumulator().reducePacket(self.inner().template packet<Unaligned>(curr), &accum);
    133         internal::pstoreu<Scalar, Packet>(data + curr, self.accumulator().finalizePacket(accum));
    134       }
    135     }
    136   } else {
    137     if (self.exclusive()) {
    138       for (Index idx3 = 0; idx3 < self.size(); idx3++) {
    139         const Index curr = offset + idx3 * self.stride();
    140         internal::pstoreu<Scalar, Packet>(data + curr, self.accumulator().finalizePacket(accum));
    141         self.accumulator().reducePacket(self.inner().template packet<Unaligned>(curr), &accum);
    142       }
    143     } else {
    144       for (Index idx3 = 0; idx3 < self.size(); idx3++) {
    145         const Index curr = offset + idx3 * self.stride();
    146         self.accumulator().reducePacket(self.inner().template packet<Unaligned>(curr), &accum);
    147         internal::pstoreu<Scalar, Packet>(data + curr, self.accumulator().finalizePacket(accum));
    148       }
    149     }
    150   }
    151 }
    152 
    153 template <typename Self, bool Vectorize, bool Parallel>
    154 struct ReduceBlock {
    155   EIGEN_STRONG_INLINE void operator()(Self& self, Index idx1,
    156                                       typename Self::CoeffReturnType* data) {
    157     for (Index idx2 = 0; idx2 < self.stride(); idx2++) {
    158       // Calculate the starting offset for the scan
    159       Index offset = idx1 + idx2;
    160       ReduceScalar(self, offset, data);
    161     }
    162   }
    163 };
    164 
    165 // Specialization for vectorized reduction.
    166 template <typename Self>
    167 struct ReduceBlock<Self, /*Vectorize=*/true, /*Parallel=*/false> {
    168   EIGEN_STRONG_INLINE void operator()(Self& self, Index idx1,
    169                                       typename Self::CoeffReturnType* data) {
    170     using Packet = typename Self::PacketReturnType;
    171     const int PacketSize = internal::unpacket_traits<Packet>::size;
    172     Index idx2 = 0;
    173     for (; idx2 + PacketSize <= self.stride(); idx2 += PacketSize) {
    174       // Calculate the starting offset for the packet scan
    175       Index offset = idx1 + idx2;
    176       ReducePacket(self, offset, data);
    177     }
    178     for (; idx2 < self.stride(); idx2++) {
    179       // Calculate the starting offset for the scan
    180       Index offset = idx1 + idx2;
    181       ReduceScalar(self, offset, data);
    182     }
    183   }
    184 };
    185 
    186 // Single-threaded CPU implementation of scan
    187 template <typename Self, typename Reducer, typename Device,
    188           bool Vectorize =
    189               (TensorEvaluator<typename Self::ChildTypeNoConst, Device>::PacketAccess &&
    190                internal::reducer_traits<Reducer, Device>::PacketAccess)>
    191 struct ScanLauncher {
    192   void operator()(Self& self, typename Self::CoeffReturnType* data) {
    193     Index total_size = internal::array_prod(self.dimensions());
    194 
    195     // We fix the index along the scan axis to 0 and perform a
    196     // scan per remaining entry. The iteration is split into two nested
    197     // loops to avoid an integer division by keeping track of each idx1 and
    198     // idx2.
    199     for (Index idx1 = 0; idx1 < total_size; idx1 += self.stride() * self.size()) {
    200       ReduceBlock<Self, Vectorize, /*Parallel=*/false> block_reducer;
    201       block_reducer(self, idx1, data);
    202     }
    203   }
    204 };
    205 
    206 #ifdef EIGEN_USE_THREADS
    207 
    208 // Adjust block_size to avoid false sharing of cachelines among
    209 // threads. Currently set to twice the cache line size on Intel and ARM
    210 // processors.
    211 EIGEN_STRONG_INLINE Index AdjustBlockSize(Index item_size, Index block_size) {
    212   EIGEN_CONSTEXPR Index kBlockAlignment = 128;
    213   const Index items_per_cacheline =
    214       numext::maxi<Index>(1, kBlockAlignment / item_size);
    215   return items_per_cacheline * divup(block_size, items_per_cacheline);
    216 }
    217 
    218 template <typename Self>
    219 struct ReduceBlock<Self, /*Vectorize=*/true, /*Parallel=*/true> {
    220   EIGEN_STRONG_INLINE void operator()(Self& self, Index idx1,
    221                                       typename Self::CoeffReturnType* data) {
    222     using Scalar = typename Self::CoeffReturnType;
    223     using Packet = typename Self::PacketReturnType;
    224     const int PacketSize = internal::unpacket_traits<Packet>::size;
    225     Index num_scalars = self.stride();
    226     Index num_packets = 0;
    227     if (self.stride() >= PacketSize) {
    228       num_packets = self.stride() / PacketSize;
    229       self.device().parallelFor(
    230           num_packets,
    231         TensorOpCost(PacketSize * self.size(), PacketSize * self.size(),
    232                      16 * PacketSize * self.size(), true, PacketSize),
    233         // Make the shard size large enough that two neighboring threads
    234         // won't write to the same cacheline of `data`.
    235         [=](Index blk_size) {
    236           return AdjustBlockSize(PacketSize * sizeof(Scalar), blk_size);
    237         },
    238         [&](Index first, Index last) {
    239           for (Index packet = first; packet < last; ++packet) {
    240             const Index idx2 = packet * PacketSize;
    241             ReducePacket(self, idx1 + idx2, data);
    242           }
    243         });
    244       num_scalars -= num_packets * PacketSize;
    245     }
    246     self.device().parallelFor(
    247         num_scalars, TensorOpCost(self.size(), self.size(), 16 * self.size()),
    248         // Make the shard size large enough that two neighboring threads
    249         // won't write to the same cacheline of `data`.
    250         [=](Index blk_size) {
    251           return AdjustBlockSize(sizeof(Scalar), blk_size);
    252         },
    253         [&](Index first, Index last) {
    254           for (Index scalar = first; scalar < last; ++scalar) {
    255             const Index idx2 = num_packets * PacketSize + scalar;
    256             ReduceScalar(self, idx1 + idx2, data);
    257           }
    258         });
    259   }
    260 };
    261 
    262 template <typename Self>
    263 struct ReduceBlock<Self, /*Vectorize=*/false, /*Parallel=*/true> {
    264   EIGEN_STRONG_INLINE void operator()(Self& self, Index idx1,
    265                                       typename Self::CoeffReturnType* data) {
    266     using Scalar = typename Self::CoeffReturnType;
    267     self.device().parallelFor(
    268         self.stride(), TensorOpCost(self.size(), self.size(), 16 * self.size()),
    269         // Make the shard size large enough that two neighboring threads
    270         // won't write to the same cacheline of `data`.
    271         [=](Index blk_size) {
    272           return AdjustBlockSize(sizeof(Scalar), blk_size);
    273         },
    274         [&](Index first, Index last) {
    275           for (Index idx2 = first; idx2 < last; ++idx2) {
    276             ReduceScalar(self, idx1 + idx2, data);
    277           }
    278         });
    279   }
    280 };
    281 
    282 // Specialization for multi-threaded execution.
    283 template <typename Self, typename Reducer, bool Vectorize>
    284 struct ScanLauncher<Self, Reducer, ThreadPoolDevice, Vectorize> {
    285   void operator()(Self& self, typename Self::CoeffReturnType* data) {
    286     using Scalar = typename Self::CoeffReturnType;
    287     using Packet = typename Self::PacketReturnType;
    288     const int PacketSize = internal::unpacket_traits<Packet>::size;
    289     const Index total_size = internal::array_prod(self.dimensions());
    290     const Index inner_block_size = self.stride() * self.size();
    291     bool parallelize_by_outer_blocks = (total_size >= (self.stride() * inner_block_size));
    292 
    293     if ((parallelize_by_outer_blocks && total_size <= 4096) ||
    294         (!parallelize_by_outer_blocks && self.stride() < PacketSize)) {
    295       ScanLauncher<Self, Reducer, DefaultDevice, Vectorize> launcher;
    296       launcher(self, data);
    297       return;
    298     }
    299 
    300     if (parallelize_by_outer_blocks) {
    301       // Parallelize over outer blocks.
    302       const Index num_outer_blocks = total_size / inner_block_size;
    303       self.device().parallelFor(
    304           num_outer_blocks,
    305           TensorOpCost(inner_block_size, inner_block_size,
    306                        16 * PacketSize * inner_block_size, Vectorize,
    307                        PacketSize),
    308           [=](Index blk_size) {
    309             return AdjustBlockSize(inner_block_size * sizeof(Scalar), blk_size);
    310           },
    311           [&](Index first, Index last) {
    312             for (Index idx1 = first; idx1 < last; ++idx1) {
    313               ReduceBlock<Self, Vectorize, /*Parallelize=*/false> block_reducer;
    314               block_reducer(self, idx1 * inner_block_size, data);
    315             }
    316           });
    317     } else {
    318       // Parallelize over inner packets/scalars dimensions when the reduction
    319       // axis is not an inner dimension.
    320       ReduceBlock<Self, Vectorize, /*Parallelize=*/true> block_reducer;
    321       for (Index idx1 = 0; idx1 < total_size;
    322            idx1 += self.stride() * self.size()) {
    323         block_reducer(self, idx1, data);
    324       }
    325     }
    326   }
    327 };
    328 #endif  // EIGEN_USE_THREADS
    329 
    330 #if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))
    331 
    332 // GPU implementation of scan
    333 // TODO(ibab) This placeholder implementation performs multiple scans in
    334 // parallel, but it would be better to use a parallel scan algorithm and
    335 // optimize memory access.
    336 template <typename Self, typename Reducer>
    337 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ScanKernel(Self self, Index total_size, typename Self::CoeffReturnType* data) {
    338   // Compute offset as in the CPU version
    339   Index val = threadIdx.x + blockIdx.x * blockDim.x;
    340   Index offset = (val / self.stride()) * self.stride() * self.size() + val % self.stride();
    341 
    342   if (offset + (self.size() - 1) * self.stride() < total_size) {
    343     // Compute the scan along the axis, starting at the calculated offset
    344     typename Self::CoeffReturnType accum = self.accumulator().initialize();
    345     for (Index idx = 0; idx < self.size(); idx++) {
    346       Index curr = offset + idx * self.stride();
    347       if (self.exclusive()) {
    348         data[curr] = self.accumulator().finalize(accum);
    349         self.accumulator().reduce(self.inner().coeff(curr), &accum);
    350       } else {
    351         self.accumulator().reduce(self.inner().coeff(curr), &accum);
    352         data[curr] = self.accumulator().finalize(accum);
    353       }
    354     }
    355   }
    356   __syncthreads();
    357 
    358 }
    359 
    360 template <typename Self, typename Reducer, bool Vectorize>
    361 struct ScanLauncher<Self, Reducer, GpuDevice, Vectorize> {
    362   void operator()(const Self& self, typename Self::CoeffReturnType* data) {
    363      Index total_size = internal::array_prod(self.dimensions());
    364      Index num_blocks = (total_size / self.size() + 63) / 64;
    365      Index block_size = 64;
    366 
    367      LAUNCH_GPU_KERNEL((ScanKernel<Self, Reducer>), num_blocks, block_size, 0, self.device(), self, total_size, data);
    368   }
    369 };
    370 #endif  // EIGEN_USE_GPU && (EIGEN_GPUCC)
    371 
    372 }  // namespace internal
    373 
    374 // Eval as rvalue
    375 template <typename Op, typename ArgType, typename Device>
    376 struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> {
    377 
    378   typedef TensorScanOp<Op, ArgType> XprType;
    379   typedef typename XprType::Index Index;
    380   typedef const ArgType ChildTypeNoConst;
    381   typedef const ArgType ChildType;
    382   static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
    383   typedef DSizes<Index, NumDims> Dimensions;
    384   typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
    385   typedef typename XprType::CoeffReturnType CoeffReturnType;
    386   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
    387   typedef TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> Self;
    388   typedef StorageMemory<Scalar, Device> Storage;
    389   typedef typename Storage::Type EvaluatorPointerType;
    390 
    391   enum {
    392     IsAligned = false,
    393     PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
    394     BlockAccess = false,
    395     PreferBlockAccess = false,
    396     Layout = TensorEvaluator<ArgType, Device>::Layout,
    397     CoordAccess = false,
    398     RawAccess = true
    399   };
    400 
    401   //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
    402   typedef internal::TensorBlockNotImplemented TensorBlock;
    403   //===--------------------------------------------------------------------===//
    404 
    405   EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
    406       : m_impl(op.expression(), device),
    407         m_device(device),
    408         m_exclusive(op.exclusive()),
    409         m_accumulator(op.accumulator()),
    410         m_size(m_impl.dimensions()[op.axis()]),
    411         m_stride(1), m_consume_dim(op.axis()),
    412         m_output(NULL) {
    413 
    414     // Accumulating a scalar isn't supported.
    415     EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
    416     eigen_assert(op.axis() >= 0 && op.axis() < NumDims);
    417 
    418     // Compute stride of scan axis
    419     const Dimensions& dims = m_impl.dimensions();
    420     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
    421       for (int i = 0; i < op.axis(); ++i) {
    422         m_stride = m_stride * dims[i];
    423       }
    424     } else {
    425       // dims can only be indexed through unsigned integers,
    426       // so let's use an unsigned type to let the compiler knows.
    427       // This prevents stupid warnings: ""'*((void*)(& evaluator)+64)[18446744073709551615]' may be used uninitialized in this function"
    428       unsigned int axis = internal::convert_index<unsigned int>(op.axis());
    429       for (unsigned int i = NumDims - 1; i > axis; --i) {
    430         m_stride = m_stride * dims[i];
    431       }
    432     }
    433   }
    434 
    435   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const {
    436     return m_impl.dimensions();
    437   }
    438 
    439   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& stride() const {
    440     return m_stride;
    441   }
    442 
    443   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& consume_dim() const {
    444     return m_consume_dim;
    445   }
    446 
    447   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& size() const {
    448     return m_size;
    449   }
    450 
    451   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Op& accumulator() const {
    452     return m_accumulator;
    453   }
    454 
    455   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool exclusive() const {
    456     return m_exclusive;
    457   }
    458 
    459   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& inner() const {
    460     return m_impl;
    461   }
    462 
    463   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device& device() const {
    464     return m_device;
    465   }
    466 
    467   EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
    468     m_impl.evalSubExprsIfNeeded(NULL);
    469     internal::ScanLauncher<Self, Op, Device> launcher;
    470     if (data) {
    471       launcher(*this, data);
    472       return false;
    473     }
    474 
    475     const Index total_size = internal::array_prod(dimensions());
    476     m_output = static_cast<EvaluatorPointerType>(m_device.get((Scalar*) m_device.allocate_temp(total_size * sizeof(Scalar))));
    477     launcher(*this, m_output);
    478     return true;
    479   }
    480 
    481   template<int LoadMode>
    482   EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const {
    483     return internal::ploadt<PacketReturnType, LoadMode>(m_output + index);
    484   }
    485 
    486   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const
    487   {
    488     return m_output;
    489   }
    490 
    491   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
    492   {
    493     return m_output[index];
    494   }
    495 
    496   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool) const {
    497     return TensorOpCost(sizeof(CoeffReturnType), 0, 0);
    498   }
    499 
    500   EIGEN_STRONG_INLINE void cleanup() {
    501     if (m_output) {
    502       m_device.deallocate_temp(m_output);
    503       m_output = NULL;
    504     }
    505     m_impl.cleanup();
    506   }
    507 
    508 #ifdef EIGEN_USE_SYCL
    509  // binding placeholder accessors to a command group handler for SYCL
    510   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
    511     m_impl.bind(cgh);
    512     m_output.bind(cgh);
    513   }
    514 #endif
    515 protected:
    516   TensorEvaluator<ArgType, Device> m_impl;
    517   const Device EIGEN_DEVICE_REF m_device;
    518   const bool m_exclusive;
    519   Op m_accumulator;
    520   const Index m_size;
    521   Index m_stride;
    522   Index m_consume_dim;
    523   EvaluatorPointerType m_output;
    524 };
    525 
    526 }  // end namespace Eigen
    527 
    528 #endif  // EIGEN_CXX11_TENSOR_TENSOR_SCAN_H