cart-elc

Source code for CART-ELC
git clone git://git.laack.co/cart-elc.git
Log | Files | Refs | README | LICENSE

TensorDeviceSycl.h (40367B)


      1 // This file is part of Eigen, a lightweight C++ template library
      2 // for linear algebra.
      3 //
      4 // Mehdi Goli    Codeplay Software Ltd.
      5 // Ralph Potter  Codeplay Software Ltd.
      6 // Luke Iwanski  Codeplay Software Ltd.
      7 // Contact: <eigen@codeplay.com>
      8 // Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
      9 
     10 //
     11 // This Source Code Form is subject to the terms of the Mozilla
     12 // Public License v. 2.0. If a copy of the MPL was not distributed
     13 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
     14 
     15 #if defined(EIGEN_USE_SYCL) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H)
     16 #define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H
     17 #include <unordered_set>
     18 
     19 namespace Eigen {
     20 
     21 namespace TensorSycl {
     22 namespace internal {
     23 
     24 /// Cache all the device information needed
     25 struct SyclDeviceInfo {
     26   SyclDeviceInfo(cl::sycl::queue queue)
     27       : local_mem_type(
     28             queue.get_device()
     29                 .template get_info<cl::sycl::info::device::local_mem_type>()),
     30         max_work_item_sizes(
     31             queue.get_device()
     32                 .template get_info<
     33                     cl::sycl::info::device::max_work_item_sizes>()),
     34         max_mem_alloc_size(
     35             queue.get_device()
     36                 .template get_info<
     37                     cl::sycl::info::device::max_mem_alloc_size>()),
     38         max_compute_units(queue.get_device()
     39                               .template get_info<
     40                                   cl::sycl::info::device::max_compute_units>()),
     41         max_work_group_size(
     42             queue.get_device()
     43                 .template get_info<
     44                     cl::sycl::info::device::max_work_group_size>()),
     45         local_mem_size(
     46             queue.get_device()
     47                 .template get_info<cl::sycl::info::device::local_mem_size>()),
     48         platform_name(queue.get_device()
     49                           .get_platform()
     50                           .template get_info<cl::sycl::info::platform::name>()),
     51         device_name(queue.get_device()
     52                         .template get_info<cl::sycl::info::device::name>()),
     53         device_vendor(
     54             queue.get_device()
     55                 .template get_info<cl::sycl::info::device::vendor>()) {}
     56 
     57   cl::sycl::info::local_mem_type local_mem_type;
     58   cl::sycl::id<3> max_work_item_sizes;
     59   unsigned long max_mem_alloc_size;
     60   unsigned long max_compute_units;
     61   unsigned long max_work_group_size;
     62   size_t local_mem_size;
     63   std::string platform_name;
     64   std::string device_name;
     65   std::string device_vendor;
     66 };
     67 
     68 }  // end namespace internal
     69 }  // end namespace TensorSycl
     70 
     71 typedef TensorSycl::internal::buffer_data_type_t buffer_scalar_t;
     72 // All devices (even AMD CPU with intel OpenCL runtime) that support OpenCL and
     73 // can consume SPIR or SPIRV can use the Eigen SYCL backend and consequently
     74 // TensorFlow via the Eigen SYCL Backend.
     75 EIGEN_STRONG_INLINE auto get_sycl_supported_devices()
     76     -> decltype(cl::sycl::device::get_devices()) {
     77 #ifdef EIGEN_SYCL_USE_DEFAULT_SELECTOR
     78   return {cl::sycl::device(cl::sycl::default_selector())};
     79 #else
     80   std::vector<cl::sycl::device> supported_devices;
     81   auto platform_list = cl::sycl::platform::get_platforms();
     82   for (const auto &platform : platform_list) {
     83     auto device_list = platform.get_devices();
     84     auto platform_name =
     85         platform.template get_info<cl::sycl::info::platform::name>();
     86     std::transform(platform_name.begin(), platform_name.end(),
     87                    platform_name.begin(), ::tolower);
     88     for (const auto &device : device_list) {
     89       auto vendor = device.template get_info<cl::sycl::info::device::vendor>();
     90       std::transform(vendor.begin(), vendor.end(), vendor.begin(), ::tolower);
     91       bool unsupported_condition =
     92           (device.is_cpu() && platform_name.find("amd") != std::string::npos &&
     93            vendor.find("apu") == std::string::npos) ||
     94           (platform_name.find("experimental") != std::string::npos) ||
     95           device.is_host();
     96       if (!unsupported_condition) {
     97         supported_devices.push_back(device);
     98       }
     99     }
    100   }
    101   return supported_devices;
    102 #endif
    103 }
    104 
    105 class QueueInterface {
    106  public:
    107   /// Creating device by using cl::sycl::selector or cl::sycl::device.
    108   template <typename DeviceOrSelector>
    109   explicit QueueInterface(
    110       const DeviceOrSelector &dev_or_sel, cl::sycl::async_handler handler,
    111       unsigned num_threads = std::thread::hardware_concurrency())
    112       : m_queue(dev_or_sel, handler),
    113 #ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
    114         m_prog(m_queue.get_context(), get_sycl_supported_devices()),
    115 #endif
    116         m_thread_pool(num_threads),
    117         m_device_info(m_queue) {
    118 #ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
    119     m_prog.build_with_kernel_type<DeviceOrSelector>();
    120     auto f = [&](cl::sycl::handler &cgh) {
    121       cgh.single_task<DeviceOrSelector>(m_prog.get_kernel<DeviceOrSelector>(),
    122                                         [=]() {})
    123     };
    124     EIGEN_SYCL_TRY_CATCH(m_queue.submit(f));
    125 #endif
    126   }
    127 
    128   template <typename DeviceOrSelector>
    129   explicit QueueInterface(
    130       const DeviceOrSelector &dev_or_sel,
    131       unsigned num_threads = std::thread::hardware_concurrency())
    132       : QueueInterface(dev_or_sel,
    133                        [this](cl::sycl::exception_list l) {
    134                          this->exception_caught_ = this->sycl_async_handler(l);
    135                        },
    136                        num_threads) {}
    137 
    138 #ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
    139   EIGEN_STRONG_INLINE cl::sycl::program &program() const { return m_prog; }
    140 #endif
    141 
    142   /// Attach an existing buffer to the pointer map, Eigen will not reuse it
    143   EIGEN_STRONG_INLINE void *attach_buffer(
    144       cl::sycl::buffer<buffer_scalar_t, 1> &buf) const {
    145     std::lock_guard<std::mutex> lock(pmapper_mutex_);
    146     return static_cast<void *>(pMapper.add_pointer(buf));
    147   }
    148 
    149   /// Detach previously attached buffer
    150   EIGEN_STRONG_INLINE void detach_buffer(void *p) const {
    151     std::lock_guard<std::mutex> lock(pmapper_mutex_);
    152     TensorSycl::internal::SYCLfree<false>(p, pMapper);
    153   }
    154 
    155   /// Allocating device pointer. This pointer is actually an 8 bytes host
    156   /// pointer used as key to access the sycl device buffer. The reason is that
    157   /// we cannot use device buffer as a pointer as a m_data in Eigen leafNode
    158   /// expressions. So we create a key pointer to be used in Eigen expression
    159   /// construction. When we convert the Eigen construction into the sycl
    160   /// construction we use this pointer as a key in our buffer_map and we make
    161   /// sure that we dedicate only one buffer only for this pointer. The device
    162   /// pointer would be deleted by calling deallocate function.
    163   EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const {
    164 #if EIGEN_MAX_ALIGN_BYTES > 0
    165     size_t align = num_bytes % EIGEN_MAX_ALIGN_BYTES;
    166     if (align > 0) {
    167       num_bytes += EIGEN_MAX_ALIGN_BYTES - align;
    168     }
    169 #endif
    170     std::lock_guard<std::mutex> lock(pmapper_mutex_);
    171     return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper);
    172   }
    173 
    174   EIGEN_STRONG_INLINE void *allocate_temp(size_t num_bytes) const {
    175 #if EIGEN_MAX_ALIGN_BYTES > 0
    176     size_t align = num_bytes % EIGEN_MAX_ALIGN_BYTES;
    177     if (align > 0) {
    178       num_bytes += EIGEN_MAX_ALIGN_BYTES - align;
    179     }
    180 #endif
    181     std::lock_guard<std::mutex> lock(pmapper_mutex_);
    182 #ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
    183     if (scratch_buffers.empty()) {
    184       return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper);
    185       ;
    186     } else {
    187       for (auto it = scratch_buffers.begin(); it != scratch_buffers.end();) {
    188         auto buff = pMapper.get_buffer(*it);
    189         if (buff.get_size() >= num_bytes) {
    190           auto ptr = *it;
    191           scratch_buffers.erase(it);
    192           return ptr;
    193         } else {
    194           ++it;
    195         }
    196       }
    197       return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper);
    198     }
    199 #else
    200     return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper);
    201 #endif
    202   }
    203   template <typename data_t>
    204   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<
    205       cl::sycl::access::mode::read_write, data_t>
    206   get(data_t *data) const {
    207     return get_range_accessor<cl::sycl::access::mode::read_write, data_t>(data);
    208   }
    209   template <typename data_t>
    210   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE data_t *get(
    211       TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write,
    212                                         data_t>
    213           data) const {
    214     return static_cast<data_t *>(data.get_virtual_pointer());
    215   }
    216 
    217   EIGEN_STRONG_INLINE void deallocate_temp(void *p) const {
    218     std::lock_guard<std::mutex> lock(pmapper_mutex_);
    219 #ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
    220     scratch_buffers.insert(p);
    221 #else
    222     TensorSycl::internal::SYCLfree(p, pMapper);
    223 #endif
    224   }
    225   template <cl::sycl::access::mode AcMd, typename T>
    226   EIGEN_STRONG_INLINE void deallocate_temp(
    227       const TensorSycl::internal::RangeAccess<AcMd, T> &p) const {
    228     deallocate_temp(p.get_virtual_pointer());
    229   }
    230 
    231   /// This is used to deallocate the device pointer. p is used as a key inside
    232   /// the map to find the device buffer and delete it.
    233   EIGEN_STRONG_INLINE void deallocate(void *p) const {
    234     std::lock_guard<std::mutex> lock(pmapper_mutex_);
    235     TensorSycl::internal::SYCLfree(p, pMapper);
    236   }
    237 
    238   EIGEN_STRONG_INLINE void deallocate_all() const {
    239     std::lock_guard<std::mutex> lock(pmapper_mutex_);
    240     TensorSycl::internal::SYCLfreeAll(pMapper);
    241 #ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
    242     scratch_buffers.clear();
    243 #endif
    244   }
    245 
    246   /// The memcpyHostToDevice is used to copy the data from host to device
    247   /// The destination pointer could be deleted before the copy happend which is
    248   /// why a callback function is needed. By default if none is provided, the
    249   /// function is blocking.
    250   EIGEN_STRONG_INLINE void memcpyHostToDevice(
    251       void *dst, const void *src, size_t n,
    252       std::function<void()> callback) const {
    253     static const auto write_mode = cl::sycl::access::mode::discard_write;
    254     static const auto global_access = cl::sycl::access::target::global_buffer;
    255     typedef cl::sycl::accessor<buffer_scalar_t, 1, write_mode, global_access>
    256         write_accessor;
    257     if (n == 0) {
    258       if (callback) callback();
    259       return;
    260     }
    261     n /= sizeof(buffer_scalar_t);
    262     auto f = [&](cl::sycl::handler &cgh) {
    263       write_accessor dst_acc = get_range_accessor<write_mode>(cgh, dst, n);
    264       buffer_scalar_t const *ptr = static_cast<buffer_scalar_t const *>(src);
    265       auto non_deleter = [](buffer_scalar_t const *) {};
    266       std::shared_ptr<const buffer_scalar_t> s_ptr(ptr, non_deleter);
    267       cgh.copy(s_ptr, dst_acc);
    268     };
    269     cl::sycl::event e;
    270     EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
    271     synchronize_and_callback(e, callback);
    272   }
    273 
    274   /// The memcpyDeviceToHost is used to copy the data from device to host.
    275   /// The source pointer could be deleted before the copy happend which is
    276   /// why a callback function is needed. By default if none is provided, the
    277   /// function is blocking.
    278   EIGEN_STRONG_INLINE void memcpyDeviceToHost(
    279       void *dst, const void *src, size_t n,
    280       std::function<void()> callback) const {
    281     static const auto read_mode = cl::sycl::access::mode::read;
    282     static const auto global_access = cl::sycl::access::target::global_buffer;
    283     typedef cl::sycl::accessor<buffer_scalar_t, 1, read_mode, global_access>
    284         read_accessor;
    285     if (n == 0) {
    286       if (callback) callback();
    287       return;
    288     }
    289     n /= sizeof(buffer_scalar_t);
    290     auto f = [&](cl::sycl::handler &cgh) {
    291       read_accessor src_acc = get_range_accessor<read_mode>(cgh, src, n);
    292       buffer_scalar_t *ptr = static_cast<buffer_scalar_t *>(dst);
    293       auto non_deleter = [](buffer_scalar_t *) {};
    294       std::shared_ptr<buffer_scalar_t> s_ptr(ptr, non_deleter);
    295       cgh.copy(src_acc, s_ptr);
    296     };
    297     cl::sycl::event e;
    298     EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
    299     synchronize_and_callback(e, callback);
    300   }
    301 
    302   /// The memcpy function.
    303   /// No callback is required here as both arguments are on the device
    304   /// and SYCL can handle the dependency.
    305   EIGEN_STRONG_INLINE void memcpy(void *dst, const void *src, size_t n) const {
    306     static const auto read_mode = cl::sycl::access::mode::read;
    307     static const auto write_mode = cl::sycl::access::mode::discard_write;
    308     if (n == 0) {
    309       return;
    310     }
    311     n /= sizeof(buffer_scalar_t);
    312     auto f = [&](cl::sycl::handler &cgh) {
    313       auto src_acc = get_range_accessor<read_mode>(cgh, src, n);
    314       auto dst_acc = get_range_accessor<write_mode>(cgh, dst, n);
    315       cgh.copy(src_acc, dst_acc);
    316     };
    317     cl::sycl::event e;
    318     EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
    319     async_synchronize(e);
    320   }
    321 
    322   /// the memset function.
    323   /// No callback is required here as both arguments are on the device
    324   /// and SYCL can handle the dependency.
    325   EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const {
    326     static const auto write_mode = cl::sycl::access::mode::discard_write;
    327     if (n == 0) {
    328       return;
    329     }
    330     n /= sizeof(buffer_scalar_t);
    331     auto f = [&](cl::sycl::handler &cgh) {
    332       auto dst_acc = get_range_accessor<write_mode>(cgh, data, n);
    333       // The cast to uint8_t is here to match the behaviour of the standard
    334       // memset. The cast to buffer_scalar_t is needed to match the type of the
    335       // accessor (in case buffer_scalar_t is not uint8_t)
    336       cgh.fill(dst_acc, static_cast<buffer_scalar_t>(static_cast<uint8_t>(c)));
    337     };
    338     cl::sycl::event e;
    339     EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
    340     async_synchronize(e);
    341   }
    342 
    343   /// Get a range accessor to the virtual pointer's device memory. This range
    344   /// accessor will allow access to the memory from the pointer to the end of
    345   /// the buffer.
    346   ///
    347   /// NOTE: Inside a kernel the range accessor will always be indexed from the
    348   /// start of the buffer, so the offset in the accessor is only used by
    349   /// methods like handler::copy and will not be available inside a kernel.
    350   template <cl::sycl::access::mode AcMd, typename T>
    351   EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<AcMd, T>
    352   get_range_accessor(const void *ptr) const {
    353     static const auto global_access = cl::sycl::access::target::global_buffer;
    354     static const auto is_place_holder = cl::sycl::access::placeholder::true_t;
    355     typedef TensorSycl::internal::RangeAccess<AcMd, T> ret_type;
    356     typedef const TensorSycl::internal::buffer_data_type_t *internal_ptr_t;
    357 
    358     std::lock_guard<std::mutex> lock(pmapper_mutex_);
    359 
    360     auto original_buffer = pMapper.get_buffer(ptr);
    361     const ptrdiff_t offset = pMapper.get_offset(ptr);
    362     const ptrdiff_t typed_offset = offset / sizeof(T);
    363     eigen_assert(typed_offset >= 0);
    364     const auto typed_size = original_buffer.get_size() / sizeof(T);
    365     auto buffer = original_buffer.template reinterpret<
    366         typename Eigen::internal::remove_const<T>::type>(
    367         cl::sycl::range<1>(typed_size));
    368     const ptrdiff_t size = buffer.get_count() - typed_offset;
    369     eigen_assert(size >= 0);
    370     typedef cl::sycl::accessor<typename Eigen::internal::remove_const<T>::type,
    371                                1, AcMd, global_access, is_place_holder>
    372         placeholder_accessor_t;
    373     const auto start_ptr = static_cast<internal_ptr_t>(ptr) - offset;
    374     return ret_type(placeholder_accessor_t(buffer, cl::sycl::range<1>(size),
    375                                            cl::sycl::id<1>(typed_offset)),
    376                     static_cast<size_t>(typed_offset),
    377                     reinterpret_cast<std::intptr_t>(start_ptr));
    378   }
    379 
    380   /// Get a range accessor to the virtual pointer's device memory with a
    381   /// specified size.
    382   template <cl::sycl::access::mode AcMd, typename Index>
    383   EIGEN_STRONG_INLINE cl::sycl::accessor<
    384       buffer_scalar_t, 1, AcMd, cl::sycl::access::target::global_buffer>
    385   get_range_accessor(cl::sycl::handler &cgh, const void *ptr,
    386                      const Index n_bytes) const {
    387     static const auto global_access = cl::sycl::access::target::global_buffer;
    388     eigen_assert(n_bytes >= 0);
    389     std::lock_guard<std::mutex> lock(pmapper_mutex_);
    390     auto buffer = pMapper.get_buffer(ptr);
    391     const ptrdiff_t offset = pMapper.get_offset(ptr);
    392     eigen_assert(offset >= 0);
    393     eigen_assert(offset + n_bytes <= buffer.get_size());
    394     return buffer.template get_access<AcMd, global_access>(
    395         cgh, cl::sycl::range<1>(n_bytes), cl::sycl::id<1>(offset));
    396   }
    397 
    398   /// Creation of sycl accessor for a buffer. This function first tries to find
    399   /// the buffer in the buffer_map. If found it gets the accessor from it, if
    400   /// not, the function then adds an entry by creating a sycl buffer for that
    401   /// particular pointer.
    402   template <cl::sycl::access::mode AcMd>
    403   EIGEN_STRONG_INLINE cl::sycl::accessor<
    404       buffer_scalar_t, 1, AcMd, cl::sycl::access::target::global_buffer>
    405   get_sycl_accessor(cl::sycl::handler &cgh, const void *ptr) const {
    406     std::lock_guard<std::mutex> lock(pmapper_mutex_);
    407     return pMapper.get_buffer(ptr)
    408         .template get_access<AcMd, cl::sycl::access::target::global_buffer>(
    409             cgh);
    410   }
    411 
    412   EIGEN_STRONG_INLINE cl::sycl::buffer<buffer_scalar_t, 1> get_sycl_buffer(
    413       const void *ptr) const {
    414     std::lock_guard<std::mutex> lock(pmapper_mutex_);
    415     return pMapper.get_buffer(ptr);
    416   }
    417 
    418   EIGEN_STRONG_INLINE ptrdiff_t get_offset(const void *ptr) const {
    419     std::lock_guard<std::mutex> lock(pmapper_mutex_);
    420     return pMapper.get_offset(ptr);
    421   }
    422 
    423   template <typename OutScalar, typename sycl_kernel, typename Lhs,
    424             typename Rhs, typename OutPtr, typename Range, typename Index,
    425             typename... T>
    426   EIGEN_ALWAYS_INLINE void binary_kernel_launcher(const Lhs &lhs,
    427                                                   const Rhs &rhs, OutPtr outptr,
    428                                                   Range thread_range,
    429                                                   Index scratchSize,
    430                                                   T... var) const {
    431     auto kernel_functor = [=](cl::sycl::handler &cgh) {
    432       // binding the placeholder accessors to a commandgroup handler
    433       lhs.bind(cgh);
    434       rhs.bind(cgh);
    435       outptr.bind(cgh);
    436       typedef cl::sycl::accessor<OutScalar, 1,
    437                                  cl::sycl::access::mode::read_write,
    438                                  cl::sycl::access::target::local>
    439           LocalAccessor;
    440 
    441       LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
    442       cgh.parallel_for(
    443 #ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
    444           program().template get_kernel<sycl_kernel>(),
    445 #endif
    446           thread_range, sycl_kernel(scratch, lhs, rhs, outptr, var...));
    447     };
    448     cl::sycl::event e;
    449     EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor));
    450     async_synchronize(e);
    451   }
    452 
    453   template <typename OutScalar, typename sycl_kernel, typename InPtr,
    454             typename OutPtr, typename Range, typename Index, typename... T>
    455   EIGEN_ALWAYS_INLINE void unary_kernel_launcher(const InPtr &inptr,
    456                                                  OutPtr &outptr,
    457                                                  Range thread_range,
    458                                                  Index scratchSize,
    459                                                  T... var) const {
    460     auto kernel_functor = [=](cl::sycl::handler &cgh) {
    461       // binding the placeholder accessors to a commandgroup handler
    462       inptr.bind(cgh);
    463       outptr.bind(cgh);
    464       typedef cl::sycl::accessor<OutScalar, 1,
    465                                  cl::sycl::access::mode::read_write,
    466                                  cl::sycl::access::target::local>
    467           LocalAccessor;
    468 
    469       LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
    470       cgh.parallel_for(
    471 #ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
    472           program().template get_kernel<sycl_kernel>(),
    473 #endif
    474           thread_range, sycl_kernel(scratch, inptr, outptr, var...));
    475     };
    476     cl::sycl::event e;
    477     EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor));
    478     async_synchronize(e);
    479   }
    480 
    481     template <typename OutScalar, typename sycl_kernel, typename InPtr,
    482            typename Range, typename Index, typename... T>
    483   EIGEN_ALWAYS_INLINE void nullary_kernel_launcher(const InPtr &inptr,
    484                                                  Range thread_range,
    485                                                  Index scratchSize,
    486                                                  T... var) const {
    487     auto kernel_functor = [=](cl::sycl::handler &cgh) {
    488       // binding the placeholder accessors to a commandgroup handler
    489       inptr.bind(cgh);
    490       typedef cl::sycl::accessor<OutScalar, 1,
    491                                  cl::sycl::access::mode::read_write,
    492                                  cl::sycl::access::target::local>
    493           LocalAccessor;
    494 
    495       LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
    496       cgh.parallel_for(
    497 #ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
    498           program().template get_kernel<sycl_kernel>(),
    499 #endif
    500           thread_range, sycl_kernel(scratch, inptr, var...));
    501     };
    502     cl::sycl::event e;
    503     EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor));
    504     async_synchronize(e);
    505   }
    506 
    507 
    508   EIGEN_STRONG_INLINE void synchronize() const {
    509 #ifdef EIGEN_EXCEPTIONS
    510     m_queue.wait_and_throw();
    511 #else
    512     m_queue.wait();
    513 #endif
    514   }
    515 
    516 
    517   EIGEN_STRONG_INLINE void async_synchronize(cl::sycl::event e) const {
    518     set_latest_event(e);
    519 #ifndef EIGEN_SYCL_ASYNC_EXECUTION
    520     synchronize();
    521 #endif
    522   }
    523 
    524   template <typename Index>
    525   EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize,
    526                                               Index &rng, Index &GRange) const {
    527     tileSize = static_cast<Index>(getNearestPowerOfTwoWorkGroupSize());
    528     tileSize = std::min(static_cast<Index>(EIGEN_SYCL_LOCAL_THREAD_DIM0 *
    529                                            EIGEN_SYCL_LOCAL_THREAD_DIM1),
    530                         static_cast<Index>(tileSize));
    531     rng = n;
    532     if (rng == 0) rng = static_cast<Index>(1);
    533     GRange = rng;
    534     if (tileSize > GRange)
    535       tileSize = GRange;
    536     else if (GRange > tileSize) {
    537       Index xMode = static_cast<Index>(GRange % tileSize);
    538       if (xMode != 0) GRange += static_cast<Index>(tileSize - xMode);
    539     }
    540   }
    541 
    542   /// This is used to prepare the number of threads and also the number of
    543   /// threads per block for sycl kernels
    544   template <typename Index>
    545   EIGEN_STRONG_INLINE void parallel_for_setup(
    546       const std::array<Index, 2> &input_dim, cl::sycl::range<2> &global_range,
    547       cl::sycl::range<2> &local_range) const {
    548     std::array<Index, 2> input_range = input_dim;
    549     Index max_workgroup_Size =
    550         static_cast<Index>(getNearestPowerOfTwoWorkGroupSize());
    551     max_workgroup_Size =
    552         std::min(static_cast<Index>(EIGEN_SYCL_LOCAL_THREAD_DIM0 *
    553                                     EIGEN_SYCL_LOCAL_THREAD_DIM1),
    554                  static_cast<Index>(max_workgroup_Size));
    555     Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
    556     local_range[1] =
    557         static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 2)));
    558     input_range[1] = input_dim[1];
    559     if (input_range[1] == 0) input_range[1] = static_cast<Index>(1);
    560     global_range[1] = input_range[1];
    561     if (local_range[1] > global_range[1])
    562       local_range[1] = global_range[1];
    563     else if (global_range[1] > local_range[1]) {
    564       Index xMode = static_cast<Index>(global_range[1] % local_range[1]);
    565       if (xMode != 0)
    566         global_range[1] += static_cast<Index>(local_range[1] - xMode);
    567     }
    568     local_range[0] = static_cast<Index>(max_workgroup_Size / local_range[1]);
    569     input_range[0] = input_dim[0];
    570     if (input_range[0] == 0) input_range[0] = static_cast<Index>(1);
    571     global_range[0] = input_range[0];
    572     if (local_range[0] > global_range[0])
    573       local_range[0] = global_range[0];
    574     else if (global_range[0] > local_range[0]) {
    575       Index xMode = static_cast<Index>(global_range[0] % local_range[0]);
    576       if (xMode != 0)
    577         global_range[0] += static_cast<Index>(local_range[0] - xMode);
    578     }
    579   }
    580 
    581   /// This is used to prepare the number of threads and also the number of
    582   /// threads per block for sycl kernels
    583   template <typename Index>
    584   EIGEN_STRONG_INLINE void parallel_for_setup(
    585       const std::array<Index, 3> &input_dim, cl::sycl::range<3> &global_range,
    586       cl::sycl::range<3> &local_range) const {
    587     std::array<Index, 3> input_range = input_dim;
    588     Index max_workgroup_Size =
    589         static_cast<Index>(getNearestPowerOfTwoWorkGroupSize());
    590     max_workgroup_Size =
    591         std::min(static_cast<Index>(EIGEN_SYCL_LOCAL_THREAD_DIM0 *
    592                                     EIGEN_SYCL_LOCAL_THREAD_DIM1),
    593                  static_cast<Index>(max_workgroup_Size));
    594     Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
    595     local_range[2] =
    596         static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 3)));
    597     input_range[2] = input_dim[2];
    598     if (input_range[2] == 0) input_range[1] = static_cast<Index>(1);
    599     global_range[2] = input_range[2];
    600     if (local_range[2] > global_range[2])
    601       local_range[2] = global_range[2];
    602     else if (global_range[2] > local_range[2]) {
    603       Index xMode = static_cast<Index>(global_range[2] % local_range[2]);
    604       if (xMode != 0)
    605         global_range[2] += static_cast<Index>(local_range[2] - xMode);
    606     }
    607     pow_of_2 = static_cast<Index>(
    608         std::log2(static_cast<Index>(max_workgroup_Size / local_range[2])));
    609     local_range[1] =
    610         static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 2)));
    611     input_range[1] = input_dim[1];
    612     if (input_range[1] == 0) input_range[1] = static_cast<Index>(1);
    613     global_range[1] = input_range[1];
    614     if (local_range[1] > global_range[1])
    615       local_range[1] = global_range[1];
    616     else if (global_range[1] > local_range[1]) {
    617       Index xMode = static_cast<Index>(global_range[1] % local_range[1]);
    618       if (xMode != 0)
    619         global_range[1] += static_cast<Index>(local_range[1] - xMode);
    620     }
    621     local_range[0] = static_cast<Index>(max_workgroup_Size /
    622                                         (local_range[1] * local_range[2]));
    623     input_range[0] = input_dim[0];
    624     if (input_range[0] == 0) input_range[0] = static_cast<Index>(1);
    625     global_range[0] = input_range[0];
    626     if (local_range[0] > global_range[0])
    627       local_range[0] = global_range[0];
    628     else if (global_range[0] > local_range[0]) {
    629       Index xMode = static_cast<Index>(global_range[0] % local_range[0]);
    630       if (xMode != 0)
    631         global_range[0] += static_cast<Index>(local_range[0] - xMode);
    632     }
    633   }
    634 
    635   EIGEN_STRONG_INLINE bool has_local_memory() const {
    636 #if !defined(EIGEN_SYCL_LOCAL_MEM) && defined(EIGEN_SYCL_NO_LOCAL_MEM)
    637     return false;
    638 #elif defined(EIGEN_SYCL_LOCAL_MEM) && !defined(EIGEN_SYCL_NO_LOCAL_MEM)
    639     return true;
    640 #else
    641     return m_device_info.local_mem_type ==
    642            cl::sycl::info::local_mem_type::local;
    643 #endif
    644   }
    645 
    646   EIGEN_STRONG_INLINE unsigned long max_buffer_size() const {
    647     return m_device_info.max_mem_alloc_size;
    648   }
    649 
    650   EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const {
    651     return m_device_info.max_compute_units;
    652   }
    653 
    654   EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const {
    655     return m_device_info.max_work_group_size;
    656   }
    657 
    658   EIGEN_STRONG_INLINE cl::sycl::id<3> maxWorkItemSizes() const {
    659     return m_device_info.max_work_item_sizes;
    660   }
    661 
    662   /// No need for sycl it should act the same as CPU version
    663   EIGEN_STRONG_INLINE int majorDeviceVersion() const { return 1; }
    664 
    665   EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const {
    666     // OpenCL doesnot have such concept
    667     return 2;
    668   }
    669 
    670   EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const {
    671     return m_device_info.local_mem_size;
    672   }
    673 
    674   // This function returns the nearest power of 2 Work-group size which is <=
    675   // maximum device workgroup size.
    676   EIGEN_STRONG_INLINE size_t getNearestPowerOfTwoWorkGroupSize() const {
    677     return getPowerOfTwo(m_device_info.max_work_group_size, false);
    678   }
    679 
    680   EIGEN_STRONG_INLINE std::string getPlatformName() const {
    681     return m_device_info.platform_name;
    682   }
    683 
    684   EIGEN_STRONG_INLINE std::string getDeviceName() const {
    685     return m_device_info.device_name;
    686   }
    687 
    688   EIGEN_STRONG_INLINE std::string getDeviceVendor() const {
    689     return m_device_info.device_vendor;
    690   }
    691 
    692   // This function returns the nearest power of 2
    693   // if roundup is true returns result>=wgsize
    694   // else it return result <= wgsize
    695   EIGEN_STRONG_INLINE size_t getPowerOfTwo(size_t wGSize, bool roundUp) const {
    696     if (roundUp) --wGSize;
    697     wGSize |= (wGSize >> 1);
    698     wGSize |= (wGSize >> 2);
    699     wGSize |= (wGSize >> 4);
    700     wGSize |= (wGSize >> 8);
    701     wGSize |= (wGSize >> 16);
    702 #if EIGEN_ARCH_x86_64 || EIGEN_ARCH_ARM64 || EIGEN_OS_WIN64
    703     wGSize |= (wGSize >> 32);
    704 #endif
    705     return ((!roundUp) ? (wGSize - (wGSize >> 1)) : ++wGSize);
    706   }
    707 
    708   EIGEN_STRONG_INLINE cl::sycl::queue &sycl_queue() const { return m_queue; }
    709 
    710   // This function checks if the runtime recorded an error for the
    711   // underlying stream device.
    712   EIGEN_STRONG_INLINE bool ok() const {
    713     if (!exception_caught_) {
    714       synchronize();
    715     }
    716     return !exception_caught_;
    717   }
    718 
    719   EIGEN_STRONG_INLINE cl::sycl::event get_latest_event() const {
    720 #ifdef EIGEN_SYCL_STORE_LATEST_EVENT
    721     std::lock_guard<std::mutex> lock(event_mutex_);
    722     return latest_events_[std::this_thread::get_id()];
    723 #else
    724     eigen_assert(false);
    725     return cl::sycl::event();
    726 #endif
    727   }
    728 
    729   // destructor
    730   ~QueueInterface() {
    731     pMapper.clear();
    732 #ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
    733     scratch_buffers.clear();
    734 #endif
    735   }
    736 
    737  protected:
    738   EIGEN_STRONG_INLINE void set_latest_event(cl::sycl::event e) const {
    739 #ifdef EIGEN_SYCL_STORE_LATEST_EVENT
    740     std::lock_guard<std::mutex> lock(event_mutex_);
    741     latest_events_[std::this_thread::get_id()] = e;
    742 #else
    743     EIGEN_UNUSED_VARIABLE(e);
    744 #endif
    745   }
    746 
    747   void synchronize_and_callback(cl::sycl::event e,
    748                                 const std::function<void()> &callback) const {
    749     set_latest_event(e);
    750     if (callback) {
    751       auto callback_ = [=]() {
    752 #ifdef EIGEN_EXCEPTIONS
    753         cl::sycl::event(e).wait_and_throw();
    754 #else
    755         cl::sycl::event(e).wait();
    756 #endif
    757         callback();
    758       };
    759       m_thread_pool.Schedule(std::move(callback_));
    760     } else {
    761 #ifdef EIGEN_EXCEPTIONS
    762       m_queue.wait_and_throw();
    763 #else
    764       m_queue.wait();
    765 #endif
    766     }
    767   }
    768 
    769   bool sycl_async_handler(cl::sycl::exception_list exceptions) const {
    770     bool exception_caught = false;
    771     for (const auto &e : exceptions) {
    772       if (e) {
    773         exception_caught = true;
    774         EIGEN_THROW_X(e);
    775       }
    776     }
    777     return exception_caught;
    778   }
    779 
    780   /// class members:
    781   bool exception_caught_ = false;
    782 
    783   mutable std::mutex pmapper_mutex_;
    784 
    785 #ifdef EIGEN_SYCL_STORE_LATEST_EVENT
    786   mutable std::mutex event_mutex_;
    787   mutable std::unordered_map<std::thread::id, cl::sycl::event> latest_events_;
    788 #endif
    789 
    790   /// std::map is the container used to make sure that we create only one buffer
    791   /// per pointer. The lifespan of the buffer now depends on the lifespan of
    792   /// SyclDevice. If a non-read-only pointer is needed to be accessed on the
    793   /// host we should manually deallocate it.
    794   mutable TensorSycl::internal::PointerMapper pMapper;
    795 #ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
    796   mutable std::unordered_set<void *> scratch_buffers;
    797 #endif
    798   /// sycl queue
    799   mutable cl::sycl::queue m_queue;
    800 #ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
    801   mutable cl::sycl::program m_prog;
    802 #endif
    803 
    804   /// The thread pool is used to wait on events and call callbacks
    805   /// asynchronously
    806   mutable Eigen::ThreadPool m_thread_pool;
    807 
    808   const TensorSycl::internal::SyclDeviceInfo m_device_info;
    809 };
    810 
    811 struct SyclDeviceBase {
    812   /// QueueInterface is not owned. it is the caller's responsibility to destroy
    813   /// it
    814   const QueueInterface *m_queue_stream;
    815   explicit SyclDeviceBase(const QueueInterface *queue_stream)
    816       : m_queue_stream(queue_stream) {}
    817   EIGEN_STRONG_INLINE const QueueInterface *queue_stream() const {
    818     return m_queue_stream;
    819   }
    820 };
    821 
    822 // Here is a sycl device struct which accept the sycl queue interface
    823 // as an input
    824 struct SyclDevice : public SyclDeviceBase {
    825   explicit SyclDevice(const QueueInterface *queue_stream)
    826       : SyclDeviceBase(queue_stream) {}
    827 
    828   // this is the accessor used to construct the evaluator
    829   template <cl::sycl::access::mode AcMd, typename T>
    830   EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<AcMd, T>
    831   get_range_accessor(const void *ptr) const {
    832     return queue_stream()->template get_range_accessor<AcMd, T>(ptr);
    833   }
    834 
    835   // get sycl accessor
    836   template <cl::sycl::access::mode AcMd>
    837   EIGEN_STRONG_INLINE cl::sycl::accessor<
    838       buffer_scalar_t, 1, AcMd, cl::sycl::access::target::global_buffer>
    839   get_sycl_accessor(cl::sycl::handler &cgh, const void *ptr) const {
    840     return queue_stream()->template get_sycl_accessor<AcMd>(cgh, ptr);
    841   }
    842 
    843   /// Accessing the created sycl device buffer for the device pointer
    844   EIGEN_STRONG_INLINE cl::sycl::buffer<buffer_scalar_t, 1> get_sycl_buffer(
    845       const void *ptr) const {
    846     return queue_stream()->get_sycl_buffer(ptr);
    847   }
    848 
    849   /// This is used to prepare the number of threads and also the number of
    850   /// threads per block for sycl kernels
    851   template <typename Index>
    852   EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize,
    853                                               Index &rng, Index &GRange) const {
    854     queue_stream()->parallel_for_setup(n, tileSize, rng, GRange);
    855   }
    856 
    857   /// This is used to prepare the number of threads and also the number of
    858   /// threads per block for sycl kernels
    859   template <typename Index>
    860   EIGEN_STRONG_INLINE void parallel_for_setup(
    861       const std::array<Index, 2> &input_dim, cl::sycl::range<2> &global_range,
    862       cl::sycl::range<2> &local_range) const {
    863     queue_stream()->parallel_for_setup(input_dim, global_range, local_range);
    864   }
    865 
    866   /// This is used to prepare the number of threads and also the number of
    867   /// threads per block for sycl kernels
    868   template <typename Index>
    869   EIGEN_STRONG_INLINE void parallel_for_setup(
    870       const std::array<Index, 3> &input_dim, cl::sycl::range<3> &global_range,
    871       cl::sycl::range<3> &local_range) const {
    872     queue_stream()->parallel_for_setup(input_dim, global_range, local_range);
    873   }
    874 
    875   /// allocate device memory
    876   EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const {
    877     return queue_stream()->allocate(num_bytes);
    878   }
    879 
    880   EIGEN_STRONG_INLINE void *allocate_temp(size_t num_bytes) const {
    881     return queue_stream()->allocate_temp(num_bytes);
    882   }
    883 
    884   /// deallocate device memory
    885   EIGEN_STRONG_INLINE void deallocate(void *p) const {
    886     queue_stream()->deallocate(p);
    887   }
    888 
    889   EIGEN_STRONG_INLINE void deallocate_temp(void *buffer) const {
    890     queue_stream()->deallocate_temp(buffer);
    891   }
    892   template <cl::sycl::access::mode AcMd, typename T>
    893   EIGEN_STRONG_INLINE void deallocate_temp(
    894       const TensorSycl::internal::RangeAccess<AcMd, T> &buffer) const {
    895     queue_stream()->deallocate_temp(buffer);
    896   }
    897   EIGEN_STRONG_INLINE void deallocate_all() const {
    898     queue_stream()->deallocate_all();
    899   }
    900 
    901   template <typename data_t>
    902   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<
    903       cl::sycl::access::mode::read_write, data_t>
    904   get(data_t *data) const {
    905     return queue_stream()->get(data);
    906   }
    907   template <typename data_t>
    908   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE data_t *get(
    909       TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write,
    910                                         data_t>
    911           data) const {
    912     return queue_stream()->get(data);
    913   }
    914 
    915   /// attach existing buffer
    916   EIGEN_STRONG_INLINE void *attach_buffer(
    917       cl::sycl::buffer<buffer_scalar_t, 1> &buf) const {
    918     return queue_stream()->attach_buffer(buf);
    919   }
    920   /// detach buffer
    921   EIGEN_STRONG_INLINE void detach_buffer(void *p) const {
    922     queue_stream()->detach_buffer(p);
    923   }
    924   EIGEN_STRONG_INLINE ptrdiff_t get_offset(const void *ptr) const {
    925     return queue_stream()->get_offset(ptr);
    926   }
    927 
    928   // some runtime conditions that can be applied here
    929   EIGEN_STRONG_INLINE bool isDeviceSuitable() const { return true; }
    930 
    931   /// memcpyHostToDevice
    932   template <typename Index>
    933   EIGEN_STRONG_INLINE void memcpyHostToDevice(
    934       Index *dst, const Index *src, size_t n,
    935       std::function<void()> callback = {}) const {
    936     queue_stream()->memcpyHostToDevice(dst, src, n, callback);
    937   }
    938   /// memcpyDeviceToHost
    939   template <typename Index>
    940   EIGEN_STRONG_INLINE void memcpyDeviceToHost(
    941       void *dst, const Index *src, size_t n,
    942       std::function<void()> callback = {}) const {
    943     queue_stream()->memcpyDeviceToHost(dst, src, n, callback);
    944   }
    945   /// the memcpy function
    946   template <typename Index>
    947   EIGEN_STRONG_INLINE void memcpy(void *dst, const Index *src, size_t n) const {
    948     queue_stream()->memcpy(dst, src, n);
    949   }
    950   /// the memset function
    951   EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const {
    952     queue_stream()->memset(data, c, n);
    953   }
    954   /// returning the sycl queue
    955   EIGEN_STRONG_INLINE cl::sycl::queue &sycl_queue() const {
    956     return queue_stream()->sycl_queue();
    957   }
    958 #ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
    959   EIGEN_STRONG_INLINE cl::sycl::program &program() const {
    960     return queue_stream()->program();
    961   }
    962 #endif
    963 
    964   EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { return 48 * 1024; }
    965 
    966   EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
    967     // We won't try to take advantage of the l2 cache for the time being, and
    968     // there is no l3 cache on sycl devices.
    969     return firstLevelCacheSize();
    970   }
    971   EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const {
    972     return queue_stream()->getNumSyclMultiProcessors();
    973   }
    974   EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const {
    975     return queue_stream()->maxSyclThreadsPerBlock();
    976   }
    977   EIGEN_STRONG_INLINE cl::sycl::id<3> maxWorkItemSizes() const {
    978     return queue_stream()->maxWorkItemSizes();
    979   }
    980   EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const {
    981     // OpenCL doesnot have such concept
    982     return queue_stream()->maxSyclThreadsPerMultiProcessor();
    983   }
    984   EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const {
    985     return queue_stream()->sharedMemPerBlock();
    986   }
    987   EIGEN_STRONG_INLINE size_t getNearestPowerOfTwoWorkGroupSize() const {
    988     return queue_stream()->getNearestPowerOfTwoWorkGroupSize();
    989   }
    990 
    991   EIGEN_STRONG_INLINE size_t getPowerOfTwo(size_t val, bool roundUp) const {
    992     return queue_stream()->getPowerOfTwo(val, roundUp);
    993   }
    994   /// No need for sycl it should act the same as CPU version
    995   EIGEN_STRONG_INLINE int majorDeviceVersion() const {
    996     return queue_stream()->majorDeviceVersion();
    997   }
    998 
    999   EIGEN_STRONG_INLINE void synchronize() const {
   1000     queue_stream()->synchronize();
   1001   }
   1002   EIGEN_STRONG_INLINE void async_synchronize(
   1003       cl::sycl::event e = cl::sycl::event()) const {
   1004     queue_stream()->async_synchronize(e);
   1005   }
   1006   EIGEN_STRONG_INLINE cl::sycl::event get_latest_event() const {
   1007     return queue_stream()->get_latest_event();
   1008   }
   1009 
   1010   // This function checks if the runtime recorded an error for the
   1011   // underlying stream device.
   1012   EIGEN_STRONG_INLINE bool ok() const { return queue_stream()->ok(); }
   1013 
   1014   EIGEN_STRONG_INLINE bool has_local_memory() const {
   1015     return queue_stream()->has_local_memory();
   1016   }
   1017   EIGEN_STRONG_INLINE long max_buffer_size() const {
   1018     return queue_stream()->max_buffer_size();
   1019   }
   1020   EIGEN_STRONG_INLINE std::string getPlatformName() const {
   1021     return queue_stream()->getPlatformName();
   1022   }
   1023   EIGEN_STRONG_INLINE std::string getDeviceName() const {
   1024     return queue_stream()->getDeviceName();
   1025   }
   1026   EIGEN_STRONG_INLINE std::string getDeviceVendor() const {
   1027     return queue_stream()->getDeviceVendor();
   1028   }
   1029   template <typename OutScalar, typename KernelType, typename... T>
   1030   EIGEN_ALWAYS_INLINE void binary_kernel_launcher(T... var) const {
   1031     queue_stream()->template binary_kernel_launcher<OutScalar, KernelType>(
   1032         var...);
   1033   }
   1034   template <typename OutScalar, typename KernelType, typename... T>
   1035   EIGEN_ALWAYS_INLINE void unary_kernel_launcher(T... var) const {
   1036     queue_stream()->template unary_kernel_launcher<OutScalar, KernelType>(
   1037         var...);
   1038   }
   1039 
   1040   template <typename OutScalar, typename KernelType, typename... T>
   1041   EIGEN_ALWAYS_INLINE void nullary_kernel_launcher(T... var) const {
   1042     queue_stream()->template nullary_kernel_launcher<OutScalar, KernelType>(
   1043         var...);
   1044   }
   1045 };
   1046 }  // end namespace Eigen
   1047 
   1048 #endif  // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H