cxx11_tensor_sycl.cpp (14828B)
1 // This file is part of Eigen, a lightweight C++ template library 2 // for linear algebra. 3 // 4 // Copyright (C) 2016 5 // Mehdi Goli Codeplay Software Ltd. 6 // Ralph Potter Codeplay Software Ltd. 7 // Luke Iwanski Codeplay Software Ltd. 8 // Contact: <eigen@codeplay.com> 9 // Benoit Steiner <benoit.steiner.goog@gmail.com> 10 // 11 // This Source Code Form is subject to the terms of the Mozilla 12 // Public License v. 2.0. If a copy of the MPL was not distributed 13 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 14 15 16 #define EIGEN_TEST_NO_LONGDOUBLE 17 #define EIGEN_TEST_NO_COMPLEX 18 19 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t 20 #define EIGEN_USE_SYCL 21 22 #include "main.h" 23 #include <unsupported/Eigen/CXX11/Tensor> 24 25 using Eigen::array; 26 using Eigen::SyclDevice; 27 using Eigen::Tensor; 28 using Eigen::TensorMap; 29 30 template <typename DataType, int DataLayout, typename IndexType> 31 void test_sycl_mem_transfers(const Eigen::SyclDevice &sycl_device) { 32 IndexType sizeDim1 = 5; 33 IndexType sizeDim2 = 5; 34 IndexType sizeDim3 = 1; 35 array<IndexType, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}}; 36 Tensor<DataType, 3, DataLayout, IndexType> in1(tensorRange); 37 Tensor<DataType, 3, DataLayout, IndexType> out1(tensorRange); 38 Tensor<DataType, 3, DataLayout, IndexType> out2(tensorRange); 39 Tensor<DataType, 3, DataLayout, IndexType> out3(tensorRange); 40 41 in1 = in1.random(); 42 43 DataType* gpu_data1 = static_cast<DataType*>(sycl_device.allocate(in1.size()*sizeof(DataType))); 44 DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(out1.size()*sizeof(DataType))); 45 46 TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu1(gpu_data1, tensorRange); 47 TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu2(gpu_data2, tensorRange); 48 49 sycl_device.memcpyHostToDevice(gpu_data1, in1.data(),(in1.size())*sizeof(DataType)); 50 sycl_device.memcpyHostToDevice(gpu_data2, in1.data(),(in1.size())*sizeof(DataType)); 51 gpu1.device(sycl_device) = gpu1 * 3.14f; 52 gpu2.device(sycl_device) = gpu2 * 2.7f; 53 sycl_device.memcpyDeviceToHost(out1.data(), gpu_data1,(out1.size())*sizeof(DataType)); 54 sycl_device.memcpyDeviceToHost(out2.data(), gpu_data1,(out2.size())*sizeof(DataType)); 55 sycl_device.memcpyDeviceToHost(out3.data(), gpu_data2,(out3.size())*sizeof(DataType)); 56 sycl_device.synchronize(); 57 58 for (IndexType i = 0; i < in1.size(); ++i) { 59 // std::cout << "SYCL DATA : " << out1(i) << " vs CPU DATA : " << in1(i) * 3.14f << "\n"; 60 VERIFY_IS_APPROX(out1(i), in1(i) * 3.14f); 61 VERIFY_IS_APPROX(out2(i), in1(i) * 3.14f); 62 VERIFY_IS_APPROX(out3(i), in1(i) * 2.7f); 63 } 64 65 sycl_device.deallocate(gpu_data1); 66 sycl_device.deallocate(gpu_data2); 67 } 68 69 template <typename DataType, int DataLayout, typename IndexType> 70 void test_sycl_mem_sync(const Eigen::SyclDevice &sycl_device) { 71 IndexType size = 20; 72 array<IndexType, 1> tensorRange = {{size}}; 73 Tensor<DataType, 1, DataLayout, IndexType> in1(tensorRange); 74 Tensor<DataType, 1, DataLayout, IndexType> in2(tensorRange); 75 Tensor<DataType, 1, DataLayout, IndexType> out(tensorRange); 76 77 in1 = in1.random(); 78 in2 = in1; 79 80 DataType* gpu_data = static_cast<DataType*>(sycl_device.allocate(in1.size()*sizeof(DataType))); 81 82 TensorMap<Tensor<DataType, 1, DataLayout, IndexType>> gpu1(gpu_data, tensorRange); 83 sycl_device.memcpyHostToDevice(gpu_data, in1.data(),(in1.size())*sizeof(DataType)); 84 sycl_device.synchronize(); 85 in1.setZero(); 86 87 sycl_device.memcpyDeviceToHost(out.data(), gpu_data, out.size()*sizeof(DataType)); 88 sycl_device.synchronize(); 89 90 for (IndexType i = 0; i < in1.size(); ++i) { 91 VERIFY_IS_APPROX(out(i), in2(i)); 92 } 93 94 sycl_device.deallocate(gpu_data); 95 } 96 97 template <typename DataType, int DataLayout, typename IndexType> 98 void test_sycl_mem_sync_offsets(const Eigen::SyclDevice &sycl_device) { 99 using tensor_type = Tensor<DataType, 1, DataLayout, IndexType>; 100 IndexType full_size = 32; 101 IndexType half_size = full_size / 2; 102 array<IndexType, 1> tensorRange = {{full_size}}; 103 tensor_type in1(tensorRange); 104 tensor_type out(tensorRange); 105 106 DataType* gpu_data = static_cast<DataType*>(sycl_device.allocate(full_size * sizeof(DataType))); 107 TensorMap<tensor_type> gpu1(gpu_data, tensorRange); 108 109 in1 = in1.random(); 110 // Copy all data to device, then permute on copy back to host 111 sycl_device.memcpyHostToDevice(gpu_data, in1.data(), full_size * sizeof(DataType)); 112 sycl_device.memcpyDeviceToHost(out.data(), gpu_data + half_size, half_size * sizeof(DataType)); 113 sycl_device.memcpyDeviceToHost(out.data() + half_size, gpu_data, half_size * sizeof(DataType)); 114 115 for (IndexType i = 0; i < half_size; ++i) { 116 VERIFY_IS_APPROX(out(i), in1(i + half_size)); 117 VERIFY_IS_APPROX(out(i + half_size), in1(i)); 118 } 119 120 in1 = in1.random(); 121 out.setZero(); 122 // Permute copies to device, then copy all back to host 123 sycl_device.memcpyHostToDevice(gpu_data + half_size, in1.data(), half_size * sizeof(DataType)); 124 sycl_device.memcpyHostToDevice(gpu_data, in1.data() + half_size, half_size * sizeof(DataType)); 125 sycl_device.memcpyDeviceToHost(out.data(), gpu_data, full_size * sizeof(DataType)); 126 127 for (IndexType i = 0; i < half_size; ++i) { 128 VERIFY_IS_APPROX(out(i), in1(i + half_size)); 129 VERIFY_IS_APPROX(out(i + half_size), in1(i)); 130 } 131 132 in1 = in1.random(); 133 out.setZero(); 134 DataType* gpu_data_out = static_cast<DataType*>(sycl_device.allocate(full_size * sizeof(DataType))); 135 TensorMap<tensor_type> gpu2(gpu_data_out, tensorRange); 136 // Copy all to device, permute copies on device, then copy all back to host 137 sycl_device.memcpyHostToDevice(gpu_data, in1.data(), full_size * sizeof(DataType)); 138 sycl_device.memcpy(gpu_data_out + half_size, gpu_data, half_size * sizeof(DataType)); 139 sycl_device.memcpy(gpu_data_out, gpu_data + half_size, half_size * sizeof(DataType)); 140 sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, full_size * sizeof(DataType)); 141 142 for (IndexType i = 0; i < half_size; ++i) { 143 VERIFY_IS_APPROX(out(i), in1(i + half_size)); 144 VERIFY_IS_APPROX(out(i + half_size), in1(i)); 145 } 146 147 sycl_device.deallocate(gpu_data_out); 148 sycl_device.deallocate(gpu_data); 149 } 150 151 template <typename DataType, int DataLayout, typename IndexType> 152 void test_sycl_memset_offsets(const Eigen::SyclDevice &sycl_device) { 153 using tensor_type = Tensor<DataType, 1, DataLayout, IndexType>; 154 IndexType full_size = 32; 155 IndexType half_size = full_size / 2; 156 array<IndexType, 1> tensorRange = {{full_size}}; 157 tensor_type cpu_out(tensorRange); 158 tensor_type out(tensorRange); 159 160 cpu_out.setZero(); 161 162 std::memset(cpu_out.data(), 0, half_size * sizeof(DataType)); 163 std::memset(cpu_out.data() + half_size, 1, half_size * sizeof(DataType)); 164 165 DataType* gpu_data = static_cast<DataType*>(sycl_device.allocate(full_size * sizeof(DataType))); 166 TensorMap<tensor_type> gpu1(gpu_data, tensorRange); 167 168 sycl_device.memset(gpu_data, 0, half_size * sizeof(DataType)); 169 sycl_device.memset(gpu_data + half_size, 1, half_size * sizeof(DataType)); 170 sycl_device.memcpyDeviceToHost(out.data(), gpu_data, full_size * sizeof(DataType)); 171 172 for (IndexType i = 0; i < full_size; ++i) { 173 VERIFY_IS_APPROX(out(i), cpu_out(i)); 174 } 175 176 sycl_device.deallocate(gpu_data); 177 } 178 179 template <typename DataType, int DataLayout, typename IndexType> 180 void test_sycl_computations(const Eigen::SyclDevice &sycl_device) { 181 182 IndexType sizeDim1 = 100; 183 IndexType sizeDim2 = 10; 184 IndexType sizeDim3 = 20; 185 array<IndexType, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}}; 186 Tensor<DataType, 3,DataLayout, IndexType> in1(tensorRange); 187 Tensor<DataType, 3,DataLayout, IndexType> in2(tensorRange); 188 Tensor<DataType, 3,DataLayout, IndexType> in3(tensorRange); 189 Tensor<DataType, 3,DataLayout, IndexType> out(tensorRange); 190 191 in2 = in2.random(); 192 in3 = in3.random(); 193 194 DataType * gpu_in1_data = static_cast<DataType*>(sycl_device.allocate(in1.size()*sizeof(DataType))); 195 DataType * gpu_in2_data = static_cast<DataType*>(sycl_device.allocate(in2.size()*sizeof(DataType))); 196 DataType * gpu_in3_data = static_cast<DataType*>(sycl_device.allocate(in3.size()*sizeof(DataType))); 197 DataType * gpu_out_data = static_cast<DataType*>(sycl_device.allocate(out.size()*sizeof(DataType))); 198 199 TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu_in1(gpu_in1_data, tensorRange); 200 TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu_in2(gpu_in2_data, tensorRange); 201 TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu_in3(gpu_in3_data, tensorRange); 202 TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu_out(gpu_out_data, tensorRange); 203 204 /// a=1.2f 205 gpu_in1.device(sycl_device) = gpu_in1.constant(1.2f); 206 sycl_device.memcpyDeviceToHost(in1.data(), gpu_in1_data ,(in1.size())*sizeof(DataType)); 207 sycl_device.synchronize(); 208 209 for (IndexType i = 0; i < sizeDim1; ++i) { 210 for (IndexType j = 0; j < sizeDim2; ++j) { 211 for (IndexType k = 0; k < sizeDim3; ++k) { 212 VERIFY_IS_APPROX(in1(i,j,k), 1.2f); 213 } 214 } 215 } 216 printf("a=1.2f Test passed\n"); 217 218 /// a=b*1.2f 219 gpu_out.device(sycl_device) = gpu_in1 * 1.2f; 220 sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data ,(out.size())*sizeof(DataType)); 221 sycl_device.synchronize(); 222 223 for (IndexType i = 0; i < sizeDim1; ++i) { 224 for (IndexType j = 0; j < sizeDim2; ++j) { 225 for (IndexType k = 0; k < sizeDim3; ++k) { 226 VERIFY_IS_APPROX(out(i,j,k), 227 in1(i,j,k) * 1.2f); 228 } 229 } 230 } 231 printf("a=b*1.2f Test Passed\n"); 232 233 /// c=a*b 234 sycl_device.memcpyHostToDevice(gpu_in2_data, in2.data(),(in2.size())*sizeof(DataType)); 235 gpu_out.device(sycl_device) = gpu_in1 * gpu_in2; 236 sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.size())*sizeof(DataType)); 237 sycl_device.synchronize(); 238 239 for (IndexType i = 0; i < sizeDim1; ++i) { 240 for (IndexType j = 0; j < sizeDim2; ++j) { 241 for (IndexType k = 0; k < sizeDim3; ++k) { 242 VERIFY_IS_APPROX(out(i,j,k), 243 in1(i,j,k) * 244 in2(i,j,k)); 245 } 246 } 247 } 248 printf("c=a*b Test Passed\n"); 249 250 /// c=a+b 251 gpu_out.device(sycl_device) = gpu_in1 + gpu_in2; 252 sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.size())*sizeof(DataType)); 253 sycl_device.synchronize(); 254 for (IndexType i = 0; i < sizeDim1; ++i) { 255 for (IndexType j = 0; j < sizeDim2; ++j) { 256 for (IndexType k = 0; k < sizeDim3; ++k) { 257 VERIFY_IS_APPROX(out(i,j,k), 258 in1(i,j,k) + 259 in2(i,j,k)); 260 } 261 } 262 } 263 printf("c=a+b Test Passed\n"); 264 265 /// c=a*a 266 gpu_out.device(sycl_device) = gpu_in1 * gpu_in1; 267 sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.size())*sizeof(DataType)); 268 sycl_device.synchronize(); 269 for (IndexType i = 0; i < sizeDim1; ++i) { 270 for (IndexType j = 0; j < sizeDim2; ++j) { 271 for (IndexType k = 0; k < sizeDim3; ++k) { 272 VERIFY_IS_APPROX(out(i,j,k), 273 in1(i,j,k) * 274 in1(i,j,k)); 275 } 276 } 277 } 278 printf("c= a*a Test Passed\n"); 279 280 //a*3.14f + b*2.7f 281 gpu_out.device(sycl_device) = gpu_in1 * gpu_in1.constant(3.14f) + gpu_in2 * gpu_in2.constant(2.7f); 282 sycl_device.memcpyDeviceToHost(out.data(),gpu_out_data,(out.size())*sizeof(DataType)); 283 sycl_device.synchronize(); 284 for (IndexType i = 0; i < sizeDim1; ++i) { 285 for (IndexType j = 0; j < sizeDim2; ++j) { 286 for (IndexType k = 0; k < sizeDim3; ++k) { 287 VERIFY_IS_APPROX(out(i,j,k), 288 in1(i,j,k) * 3.14f 289 + in2(i,j,k) * 2.7f); 290 } 291 } 292 } 293 printf("a*3.14f + b*2.7f Test Passed\n"); 294 295 ///d= (a>0.5? b:c) 296 sycl_device.memcpyHostToDevice(gpu_in3_data, in3.data(),(in3.size())*sizeof(DataType)); 297 gpu_out.device(sycl_device) =(gpu_in1 > gpu_in1.constant(0.5f)).select(gpu_in2, gpu_in3); 298 sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.size())*sizeof(DataType)); 299 sycl_device.synchronize(); 300 for (IndexType i = 0; i < sizeDim1; ++i) { 301 for (IndexType j = 0; j < sizeDim2; ++j) { 302 for (IndexType k = 0; k < sizeDim3; ++k) { 303 VERIFY_IS_APPROX(out(i, j, k), (in1(i, j, k) > 0.5f) 304 ? in2(i, j, k) 305 : in3(i, j, k)); 306 } 307 } 308 } 309 printf("d= (a>0.5? b:c) Test Passed\n"); 310 sycl_device.deallocate(gpu_in1_data); 311 sycl_device.deallocate(gpu_in2_data); 312 sycl_device.deallocate(gpu_in3_data); 313 sycl_device.deallocate(gpu_out_data); 314 } 315 template<typename Scalar1, typename Scalar2, int DataLayout, typename IndexType> 316 static void test_sycl_cast(const Eigen::SyclDevice& sycl_device){ 317 IndexType size = 20; 318 array<IndexType, 1> tensorRange = {{size}}; 319 Tensor<Scalar1, 1, DataLayout, IndexType> in(tensorRange); 320 Tensor<Scalar2, 1, DataLayout, IndexType> out(tensorRange); 321 Tensor<Scalar2, 1, DataLayout, IndexType> out_host(tensorRange); 322 323 in = in.random(); 324 325 Scalar1* gpu_in_data = static_cast<Scalar1*>(sycl_device.allocate(in.size()*sizeof(Scalar1))); 326 Scalar2 * gpu_out_data = static_cast<Scalar2*>(sycl_device.allocate(out.size()*sizeof(Scalar2))); 327 328 TensorMap<Tensor<Scalar1, 1, DataLayout, IndexType>> gpu_in(gpu_in_data, tensorRange); 329 TensorMap<Tensor<Scalar2, 1, DataLayout, IndexType>> gpu_out(gpu_out_data, tensorRange); 330 sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),(in.size())*sizeof(Scalar1)); 331 gpu_out.device(sycl_device) = gpu_in. template cast<Scalar2>(); 332 sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data, out.size()*sizeof(Scalar2)); 333 out_host = in. template cast<Scalar2>(); 334 for(IndexType i=0; i< size; i++) 335 { 336 VERIFY_IS_APPROX(out(i), out_host(i)); 337 } 338 printf("cast Test Passed\n"); 339 sycl_device.deallocate(gpu_in_data); 340 sycl_device.deallocate(gpu_out_data); 341 } 342 template<typename DataType, typename dev_Selector> void sycl_computing_test_per_device(dev_Selector s){ 343 QueueInterface queueInterface(s); 344 auto sycl_device = Eigen::SyclDevice(&queueInterface); 345 test_sycl_mem_transfers<DataType, RowMajor, int64_t>(sycl_device); 346 test_sycl_computations<DataType, RowMajor, int64_t>(sycl_device); 347 test_sycl_mem_sync<DataType, RowMajor, int64_t>(sycl_device); 348 test_sycl_mem_sync_offsets<DataType, RowMajor, int64_t>(sycl_device); 349 test_sycl_memset_offsets<DataType, RowMajor, int64_t>(sycl_device); 350 test_sycl_mem_transfers<DataType, ColMajor, int64_t>(sycl_device); 351 test_sycl_computations<DataType, ColMajor, int64_t>(sycl_device); 352 test_sycl_mem_sync<DataType, ColMajor, int64_t>(sycl_device); 353 test_sycl_cast<DataType, int, RowMajor, int64_t>(sycl_device); 354 test_sycl_cast<DataType, int, ColMajor, int64_t>(sycl_device); 355 } 356 357 EIGEN_DECLARE_TEST(cxx11_tensor_sycl) { 358 for (const auto& device :Eigen::get_sycl_supported_devices()) { 359 CALL_SUBTEST(sycl_computing_test_per_device<float>(device)); 360 } 361 }