cxx11_tensor_scan_sycl.cpp (6376B)
1 // This file is part of Eigen, a lightweight C++ template library 2 // for linear algebra. 3 // 4 // Copyright (C) 2016 5 // Mehdi Goli Codeplay Software Ltd. 6 // Ralph Potter Codeplay Software Ltd. 7 // Luke Iwanski Codeplay Software Ltd. 8 // Contact: <eigen@codeplay.com> 9 // 10 // This Source Code Form is subject to the terms of the Mozilla 11 // Public License v. 2.0. If a copy of the MPL was not distributed 12 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 13 14 #define EIGEN_TEST_NO_LONGDOUBLE 15 #define EIGEN_TEST_NO_COMPLEX 16 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t 17 #define EIGEN_USE_SYCL 18 19 #include "main.h" 20 #include <unsupported/Eigen/CXX11/Tensor> 21 22 using Eigen::Tensor; 23 typedef Tensor<float, 1>::DimensionPair DimPair; 24 25 template <typename DataType, int DataLayout, typename IndexType> 26 void test_sycl_cumsum(const Eigen::SyclDevice& sycl_device, IndexType m_size, 27 IndexType k_size, IndexType n_size, int consume_dim, 28 bool exclusive) { 29 static const DataType error_threshold = 1e-4f; 30 std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size 31 << " consume_dim : " << consume_dim << ")" << std::endl; 32 Tensor<DataType, 3, DataLayout, IndexType> t_input(m_size, k_size, n_size); 33 Tensor<DataType, 3, DataLayout, IndexType> t_result(m_size, k_size, n_size); 34 Tensor<DataType, 3, DataLayout, IndexType> t_result_gpu(m_size, k_size, 35 n_size); 36 37 t_input.setRandom(); 38 std::size_t t_input_bytes = t_input.size() * sizeof(DataType); 39 std::size_t t_result_bytes = t_result.size() * sizeof(DataType); 40 41 DataType* gpu_data_in = 42 static_cast<DataType*>(sycl_device.allocate(t_input_bytes)); 43 DataType* gpu_data_out = 44 static_cast<DataType*>(sycl_device.allocate(t_result_bytes)); 45 46 array<IndexType, 3> tensorRange = {{m_size, k_size, n_size}}; 47 TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu_t_input( 48 gpu_data_in, tensorRange); 49 TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu_t_result( 50 gpu_data_out, tensorRange); 51 sycl_device.memcpyHostToDevice(gpu_data_in, t_input.data(), t_input_bytes); 52 sycl_device.memcpyHostToDevice(gpu_data_out, t_input.data(), t_input_bytes); 53 54 gpu_t_result.device(sycl_device) = gpu_t_input.cumsum(consume_dim, exclusive); 55 56 t_result = t_input.cumsum(consume_dim, exclusive); 57 58 sycl_device.memcpyDeviceToHost(t_result_gpu.data(), gpu_data_out, 59 t_result_bytes); 60 sycl_device.synchronize(); 61 62 for (IndexType i = 0; i < t_result.size(); i++) { 63 if (static_cast<DataType>(std::fabs(static_cast<DataType>( 64 t_result(i) - t_result_gpu(i)))) < error_threshold) { 65 continue; 66 } 67 if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), 68 error_threshold)) { 69 continue; 70 } 71 std::cout << "mismatch detected at index " << i << " CPU : " << t_result(i) 72 << " vs SYCL : " << t_result_gpu(i) << std::endl; 73 assert(false); 74 } 75 sycl_device.deallocate(gpu_data_in); 76 sycl_device.deallocate(gpu_data_out); 77 } 78 79 template <typename DataType, typename Dev> 80 void sycl_scan_test_exclusive_dim0_per_device(const Dev& sycl_device) { 81 test_sycl_cumsum<DataType, ColMajor, int64_t>(sycl_device, 2049, 1023, 127, 0, 82 true); 83 test_sycl_cumsum<DataType, RowMajor, int64_t>(sycl_device, 2049, 1023, 127, 0, 84 true); 85 } 86 template <typename DataType, typename Dev> 87 void sycl_scan_test_exclusive_dim1_per_device(const Dev& sycl_device) { 88 test_sycl_cumsum<DataType, ColMajor, int64_t>(sycl_device, 1023, 2049, 127, 1, 89 true); 90 test_sycl_cumsum<DataType, RowMajor, int64_t>(sycl_device, 1023, 2049, 127, 1, 91 true); 92 } 93 template <typename DataType, typename Dev> 94 void sycl_scan_test_exclusive_dim2_per_device(const Dev& sycl_device) { 95 test_sycl_cumsum<DataType, ColMajor, int64_t>(sycl_device, 1023, 127, 2049, 2, 96 true); 97 test_sycl_cumsum<DataType, RowMajor, int64_t>(sycl_device, 1023, 127, 2049, 2, 98 true); 99 } 100 template <typename DataType, typename Dev> 101 void sycl_scan_test_inclusive_dim0_per_device(const Dev& sycl_device) { 102 test_sycl_cumsum<DataType, ColMajor, int64_t>(sycl_device, 2049, 1023, 127, 0, 103 false); 104 test_sycl_cumsum<DataType, RowMajor, int64_t>(sycl_device, 2049, 1023, 127, 0, 105 false); 106 } 107 template <typename DataType, typename Dev> 108 void sycl_scan_test_inclusive_dim1_per_device(const Dev& sycl_device) { 109 test_sycl_cumsum<DataType, ColMajor, int64_t>(sycl_device, 1023, 2049, 127, 1, 110 false); 111 test_sycl_cumsum<DataType, RowMajor, int64_t>(sycl_device, 1023, 2049, 127, 1, 112 false); 113 } 114 template <typename DataType, typename Dev> 115 void sycl_scan_test_inclusive_dim2_per_device(const Dev& sycl_device) { 116 test_sycl_cumsum<DataType, ColMajor, int64_t>(sycl_device, 1023, 127, 2049, 2, 117 false); 118 test_sycl_cumsum<DataType, RowMajor, int64_t>(sycl_device, 1023, 127, 2049, 2, 119 false); 120 } 121 EIGEN_DECLARE_TEST(cxx11_tensor_scan_sycl) { 122 for (const auto& device : Eigen::get_sycl_supported_devices()) { 123 std::cout << "Running on " 124 << device.template get_info<cl::sycl::info::device::name>() 125 << std::endl; 126 QueueInterface queueInterface(device); 127 auto sycl_device = Eigen::SyclDevice(&queueInterface); 128 CALL_SUBTEST_1( 129 sycl_scan_test_exclusive_dim0_per_device<float>(sycl_device)); 130 CALL_SUBTEST_2( 131 sycl_scan_test_exclusive_dim1_per_device<float>(sycl_device)); 132 CALL_SUBTEST_3( 133 sycl_scan_test_exclusive_dim2_per_device<float>(sycl_device)); 134 CALL_SUBTEST_4( 135 sycl_scan_test_inclusive_dim0_per_device<float>(sycl_device)); 136 CALL_SUBTEST_5( 137 sycl_scan_test_inclusive_dim1_per_device<float>(sycl_device)); 138 CALL_SUBTEST_6( 139 sycl_scan_test_inclusive_dim2_per_device<float>(sycl_device)); 140 } 141 }