cxx11_tensor_contract_sycl.cpp (47521B)
1 // This file is part of Eigen, a lightweight C++ template library 2 // for linear algebra. 3 // 4 // Copyright (C) 2016 5 // Mehdi Goli Codeplay Software Ltd. 6 // Ralph Potter Codeplay Software Ltd. 7 // Luke Iwanski Codeplay Software Ltd. 8 // Contact: <eigen@codeplay.com> 9 // 10 // This Source Code Form is subject to the terms of the Mozilla 11 // Public License v. 2.0. If a copy of the MPL was not distributed 12 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 13 14 #define EIGEN_TEST_NO_LONGDOUBLE 15 #define EIGEN_TEST_NO_COMPLEX 16 17 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t 18 #define EIGEN_USE_SYCL 19 20 #include <algorithm> 21 #include <chrono> 22 #include <ctime> 23 #include <iostream> 24 25 #include "main.h" 26 27 #include <unsupported/Eigen/CXX11/Tensor> 28 29 using Eigen::array; 30 using Eigen::SyclDevice; 31 using Eigen::Tensor; 32 using Eigen::TensorMap; 33 34 template <int DataLayout, typename DataType, typename IndexType, 35 typename Device> 36 void static test_sycl_contraction(const Device &sycl_device, IndexType m_size, 37 IndexType k_size, IndexType n_size) { 38 typedef typename Tensor<DataType, 1, DataLayout, IndexType>::DimensionPair 39 DimPair; 40 static const DataType error_threshold = DataType(1e-4); 41 // with these dimensions, the output has 300 * 140 elements, which is 42 // more than 30 * 1024, which is the number of threads in blocks on 43 // a 15 SM GK110 GPU 44 Tensor<DataType, 2, DataLayout, IndexType> t_left(m_size, k_size); 45 Tensor<DataType, 2, DataLayout, IndexType> t_right(k_size, n_size); 46 Tensor<DataType, 2, DataLayout, IndexType> t_result(m_size, n_size); 47 Tensor<DataType, 2, DataLayout, IndexType> t_result_gpu(m_size, n_size); 48 Eigen::array<DimPair, 1> dims = {{DimPair(1, 0)}}; 49 Eigen::array<IndexType, 2> left_dims = {{m_size, k_size}}; 50 Eigen::array<IndexType, 2> right_dims = {{k_size, n_size}}; 51 Eigen::array<IndexType, 2> result_dims = {{m_size, n_size}}; 52 53 t_left.setRandom(); 54 t_right.setRandom(); 55 56 std::size_t t_left_bytes = t_left.size() * sizeof(DataType); 57 std::size_t t_right_bytes = t_right.size() * sizeof(DataType); 58 std::size_t t_result_bytes = t_result.size() * sizeof(DataType); 59 60 DataType *d_t_left = 61 static_cast<DataType *>(sycl_device.allocate(t_left_bytes)); 62 DataType *d_t_right = 63 static_cast<DataType *>(sycl_device.allocate(t_right_bytes)); 64 DataType *d_t_result = 65 static_cast<DataType *>(sycl_device.allocate(t_result_bytes)); 66 67 Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>> 68 gpu_t_left(d_t_left, left_dims); 69 Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>> 70 gpu_t_right(d_t_right, right_dims); 71 Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>> 72 gpu_t_result(d_t_result, result_dims); 73 74 sycl_device.memcpyHostToDevice(d_t_left, t_left.data(), t_left_bytes); 75 sycl_device.memcpyHostToDevice(d_t_right, t_right.data(), t_right_bytes); 76 77 gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims); 78 sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result, 79 t_result_bytes); 80 81 t_result = t_left.contract(t_right, dims); 82 83 for (IndexType i = 0; i < t_result.size(); i++) { 84 if (static_cast<DataType>(std::fabs(static_cast<DataType>( 85 t_result(i) - t_result_gpu(i)))) < error_threshold) { 86 continue; 87 } 88 if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), 89 error_threshold)) { 90 continue; 91 } 92 93 std::cout << "M : " << m_size << ", N : " << n_size << ", K : " << k_size 94 << ", mismatch detected at IndexType " << i << ": " << t_result(i) 95 << " vs " << t_result_gpu(i) << std::endl; 96 VERIFY_IS_APPROX(t_result_gpu(i), t_result(i)); 97 } 98 sycl_device.deallocate(d_t_left); 99 sycl_device.deallocate(d_t_right); 100 sycl_device.deallocate(d_t_result); 101 } 102 103 template <int DataLayout, typename DataType, typename IndexType, 104 typename Device> 105 void test_sycl_contraction_m(const Device &sycl_device) { 106 for (IndexType k = 32; k < 256; k++) { 107 test_sycl_contraction<DataLayout, DataType, IndexType>(sycl_device, k, 128, 108 128); 109 } 110 } 111 112 template <int DataLayout, typename DataType, typename IndexType, 113 typename Device> 114 void test_sycl_contraction_k(const Device &sycl_device) { 115 for (IndexType k = 32; k < 256; k++) { 116 test_sycl_contraction<DataLayout, DataType, IndexType>(sycl_device, 128, k, 117 128); 118 } 119 } 120 121 template <int DataLayout, typename DataType, typename IndexType, 122 typename Device> 123 void test_sycl_contraction_n(const Device &sycl_device) { 124 for (IndexType k = 32; k < 256; k++) { 125 test_sycl_contraction<DataLayout, DataType, IndexType>(sycl_device, 128, 126 128, k); 127 } 128 } 129 130 template <int DataLayout, typename DataType, typename IndexType, 131 typename Device> 132 void test_sycl_contraction_sizes(const Device &sycl_device) { 133 IndexType m_sizes[] = {31, 39, 63, 64, 65, 127, 129, 255, 134 257, 511, 512, 513, 1023, 1024, 1025}; 135 136 IndexType n_sizes[] = {31, 39, 63, 64, 65, 127, 129, 255, 137 257, 511, 512, 513, 1023, 1024, 1025}; 138 139 IndexType k_sizes[] = {31, 39, 63, 64, 65, 95, 96, 127, 129, 140 255, 257, 511, 512, 513, 1023, 1024, 1025}; 141 142 for (IndexType i = 0; i < 15; i++) { 143 for (IndexType j = 0; j < 15; j++) { 144 for (IndexType k = 0; k < 17; k++) { 145 test_sycl_contraction<DataLayout, DataType, IndexType>( 146 sycl_device, m_sizes[i], n_sizes[j], k_sizes[k]); 147 } 148 } 149 } 150 } 151 152 template <int DataLayout, typename DataType, typename IndexType, 153 typename Device> 154 void static test_no_out_of_bounds(const Device &sycl_device, IndexType m_size, 155 IndexType k_size, IndexType n_size) { 156 typedef typename Tensor<DataType, 1, DataLayout, IndexType>::DimensionPair 157 DimPair; 158 static const DataType error_threshold = DataType(1e-4); 159 Tensor<DataType, 2, DataLayout, IndexType> t_left(m_size, k_size); 160 Tensor<DataType, 2, DataLayout, IndexType> t_right(k_size, n_size); 161 Tensor<DataType, 2, DataLayout, IndexType> t_result(m_size, n_size); 162 163 Eigen::array<DimPair, 1> dims = {{DimPair(1, 0)}}; 164 Eigen::array<IndexType, 2> left_dims = {{m_size, k_size}}; 165 Eigen::array<IndexType, 2> right_dims = {{k_size, n_size}}; 166 Eigen::array<IndexType, 2> result_dims = {{m_size, n_size}}; 167 168 t_left.setRandom(); 169 t_right.setRandom(); 170 171 // Allocate buffers twice as big to check for invalid read and write 172 auto padded_left_size = 2 * t_left.size(); 173 auto padded_right_size = 2 * t_right.size(); 174 auto padded_result_size = 2 * t_result.size(); 175 176 std::size_t t_left_bytes = padded_left_size * sizeof(DataType); 177 std::size_t t_right_bytes = padded_right_size * sizeof(DataType); 178 std::size_t t_result_bytes = padded_result_size * sizeof(DataType); 179 180 DataType *d_t_left = 181 static_cast<DataType *>(sycl_device.allocate(t_left_bytes)); 182 DataType *d_t_right = 183 static_cast<DataType *>(sycl_device.allocate(t_right_bytes)); 184 DataType *d_t_result = 185 static_cast<DataType *>(sycl_device.allocate(t_result_bytes)); 186 187 // TensorMaps are still of the same size than the Tensors 188 Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>> 189 gpu_t_left(d_t_left, left_dims); 190 Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>> 191 gpu_t_right(d_t_right, right_dims); 192 Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>> 193 gpu_t_result(d_t_result, result_dims); 194 195 // Write nan after the actual buffer to propagate nans everywhere in case of 196 // invalid reads 197 DataType nan = std::numeric_limits<DataType>::quiet_NaN(); 198 auto host_left_data = new DataType[padded_left_size]; 199 std::copy_n(t_left.data(), t_left.size(), host_left_data); 200 std::fill_n(host_left_data + t_left.size(), t_left.size(), nan); 201 auto host_right_data = new DataType[padded_right_size]; 202 std::copy_n(t_right.data(), t_right.size(), host_right_data); 203 std::fill_n(host_right_data + t_right.size(), t_right.size(), nan); 204 auto host_result_data = new DataType[padded_result_size]; 205 std::fill_n(host_result_data, padded_result_size, nan); 206 207 sycl_device.memcpyHostToDevice(d_t_left, host_left_data, t_left_bytes); 208 sycl_device.memcpyHostToDevice(d_t_right, host_right_data, t_right_bytes); 209 sycl_device.memcpyHostToDevice(d_t_result, host_result_data, t_result_bytes); 210 211 gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims); 212 sycl_device.memcpyDeviceToHost(host_result_data, d_t_result, t_result_bytes); 213 214 t_result = t_left.contract(t_right, dims); 215 216 for (IndexType i = 0; i < t_result.size(); i++) { 217 if (static_cast<DataType>(std::fabs(static_cast<DataType>( 218 t_result(i) - host_result_data[i]))) < error_threshold) { 219 continue; 220 } 221 if (Eigen::internal::isApprox(t_result(i), host_result_data[i], 222 error_threshold)) { 223 continue; 224 } 225 if (std::isnan(host_result_data[i])) { 226 std::cout << "M : " << m_size << ", N : " << n_size << ", K : " << k_size 227 << ", invalid read detected at IndexType " << i << ": " 228 << t_result(i) << " vs " << host_result_data[i] << std::endl; 229 } else { 230 std::cout << "M : " << m_size << ", N : " << n_size << ", K : " << k_size 231 << ", mismatch detected at IndexType " << i << ": " 232 << t_result(i) << " vs " << host_result_data[i] << std::endl; 233 } 234 VERIFY_IS_APPROX(host_result_data[i], t_result(i)); 235 } 236 // Make sure that the rest of the result is still nans 237 for (IndexType i = t_result.size(); i < padded_result_size; i++) { 238 if (std::isnan(host_result_data[i])) { 239 continue; 240 } 241 std::cout << "M : " << m_size << ", N : " << n_size << ", K : " << k_size 242 << ", invalid write detected at IndexType " << i << ": " 243 << host_result_data[i] << std::endl; 244 VERIFY_IS_APPROX(host_result_data[i], t_result(i)); 245 } 246 sycl_device.deallocate(d_t_left); 247 sycl_device.deallocate(d_t_right); 248 sycl_device.deallocate(d_t_result); 249 250 delete[] host_left_data; 251 delete[] host_right_data; 252 delete[] host_result_data; 253 } 254 255 template <int DataLayout, typename DataType, typename IndexType, 256 typename Device> 257 void test_scalar(const Device &sycl_device, IndexType m_size, IndexType k_size, 258 IndexType n_size) { 259 // std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << 260 // ")" << std::endl; 261 // with these dimensions, the output has 300 * 140 elements, which is 262 // more than 30 * 1024, which is the number of threads in blocks on 263 // a 15 SM GK110 GPU 264 typedef typename Tensor<DataType, 1, DataLayout, IndexType>::DimensionPair 265 DimPair; 266 static const DataType error_threshold = DataType(1e-4); 267 Tensor<DataType, 2, DataLayout, IndexType> t_left(m_size, k_size); 268 Tensor<DataType, 2, DataLayout, IndexType> t_right(k_size, n_size); 269 Tensor<DataType, 0, DataLayout, IndexType> t_result; 270 Tensor<DataType, 0, DataLayout, IndexType> t_result_gpu; 271 Eigen::array<DimPair, 2> dims = {{DimPair(0, 0), DimPair(1, 1)}}; 272 Eigen::array<IndexType, 2> left_dims = {{m_size, k_size}}; 273 Eigen::array<IndexType, 2> right_dims = {{k_size, n_size}}; 274 t_left.setRandom(); 275 t_right.setRandom(); 276 277 std::size_t t_left_bytes = t_left.size() * sizeof(DataType); 278 std::size_t t_right_bytes = t_right.size() * sizeof(DataType); 279 std::size_t t_result_bytes = sizeof(DataType); 280 281 DataType *d_t_left = 282 static_cast<DataType *>(sycl_device.allocate(t_left_bytes)); 283 DataType *d_t_right = 284 static_cast<DataType *>(sycl_device.allocate(t_right_bytes)); 285 DataType *d_t_result = 286 static_cast<DataType *>(sycl_device.allocate(t_result_bytes)); 287 288 Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>> 289 gpu_t_left(d_t_left, left_dims); 290 Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>> 291 gpu_t_right(d_t_right, right_dims); 292 Eigen::TensorMap<Eigen::Tensor<DataType, 0, DataLayout, IndexType>> 293 gpu_t_result(d_t_result); 294 295 sycl_device.memcpyHostToDevice(d_t_left, t_left.data(), t_left_bytes); 296 sycl_device.memcpyHostToDevice(d_t_right, t_right.data(), t_right_bytes); 297 298 gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims); 299 sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result, 300 t_result_bytes); 301 302 t_result = t_left.contract(t_right, dims); 303 304 if (static_cast<DataType>(std::fabs(static_cast<DataType>( 305 t_result() - t_result_gpu()))) > error_threshold && 306 !Eigen::internal::isApprox(t_result(), t_result_gpu(), error_threshold)) { 307 std::cout << "K: " << k_size << ", N: " << n_size << ", M: " << m_size 308 << " : mismatch detected: " << t_result() << " vs " 309 << t_result_gpu() << std::endl; 310 VERIFY_IS_APPROX(t_result_gpu(), t_result()); 311 } 312 313 sycl_device.deallocate(d_t_left); 314 sycl_device.deallocate(d_t_right); 315 sycl_device.deallocate(d_t_result); 316 } 317 318 template <int DataLayout, typename DataType, typename IndexType, 319 typename Device> 320 void contraction_batch(const Device &sycl_device, IndexType m_size, 321 IndexType k_size, IndexType n_size, IndexType m_batch, 322 IndexType start, IndexType limit) { 323 typedef typename Tensor<DataType, 1, DataLayout, IndexType>::DimensionPair 324 DimPair; 325 static const DataType error_threshold = DataType(1e-4); 326 typedef Eigen::array<IndexType, 3> TensorDim; 327 typedef Eigen::Tensor<DataType, 3, DataLayout, IndexType> TensorType; 328 TensorDim left_dims = {{m_batch, k_size, m_size}}; 329 TensorDim right_dims = {{m_batch, n_size, k_size}}; 330 TensorDim res_dims = {{m_batch, m_size, n_size}}; 331 Eigen::array<DimPair, 1> contract_pairs = {{DimPair(0, 1)}}; 332 333 TensorType t_left(left_dims); 334 TensorType t_right(right_dims); 335 TensorType t_result_gpu(res_dims); 336 TensorType t_result(res_dims); 337 338 t_left.setRandom(); 339 t_right.setRandom(); 340 341 std::size_t t_left_bytes = t_left.size() * sizeof(DataType); 342 std::size_t t_right_bytes = t_right.size() * sizeof(DataType); 343 std::size_t t_result_bytes = t_result.size() * sizeof(DataType); 344 345 DataType *d_t_left = 346 static_cast<DataType *>(sycl_device.allocate(t_left_bytes)); 347 DataType *d_t_right = 348 static_cast<DataType *>(sycl_device.allocate(t_right_bytes)); 349 DataType *d_t_result = 350 static_cast<DataType *>(sycl_device.allocate(t_result_bytes)); 351 352 Eigen::TensorMap<TensorType> gpu_t_left(d_t_left, left_dims); 353 Eigen::TensorMap<TensorType> gpu_t_right(d_t_right, right_dims); 354 Eigen::TensorMap<TensorType> gpu_t_result(d_t_result, res_dims); 355 356 sycl_device.memcpyHostToDevice(d_t_left, t_left.data(), t_left_bytes); 357 sycl_device.memcpyHostToDevice(d_t_right, t_right.data(), t_right_bytes); 358 for (int i = start; i < limit; ++i) { 359 auto x = gpu_t_left.template chip<0>(i); 360 auto y = gpu_t_right.template chip<0>(i); 361 auto z = gpu_t_result.template chip<0>(i); 362 z.device(sycl_device) = x.contract(y, contract_pairs); 363 } 364 sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result, 365 t_result_bytes); 366 367 for (int i = start; i < limit; ++i) { 368 auto x = t_left.template chip<0>(i); 369 auto y = t_right.template chip<0>(i); 370 auto z = t_result.template chip<0>(i); 371 z = x.contract(y, contract_pairs); 372 } 373 374 for (IndexType i = 0; i < t_result.size(); i++) { 375 if (static_cast<DataType>(std::fabs(static_cast<DataType>( 376 t_result(i) - t_result_gpu(i)))) < error_threshold) { 377 continue; 378 } 379 if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), 380 error_threshold)) { 381 continue; 382 } 383 std::cout << "mismatch detected at IndexType " << i << ": " << t_result(i) 384 << " vs " << t_result_gpu(i) << std::endl; 385 VERIFY_IS_APPROX(t_result_gpu(i), t_result(i)); 386 } 387 sycl_device.deallocate(d_t_left); 388 sycl_device.deallocate(d_t_right); 389 sycl_device.deallocate(d_t_result); 390 } 391 392 template <int DataLayout, typename DataType, typename IndexType, 393 typename Device> 394 void contraction_rhs_transposed(const Device &sycl_device, IndexType m_size, 395 IndexType k_size, IndexType n_size) { 396 typedef typename Tensor<DataType, 1, DataLayout, IndexType>::DimensionPair 397 DimPair; 398 static const DataType error_threshold = DataType(1e-4); 399 Eigen::array<IndexType, 2> left_dims = {{m_size, k_size}}; 400 Eigen::array<IndexType, 2> right_dims = {{n_size, k_size}}; 401 Eigen::array<IndexType, 2> res_dims = {{m_size, n_size}}; 402 Eigen::array<DimPair, 1> dims = {{DimPair(1, 1)}}; 403 404 Tensor<DataType, 2, DataLayout, IndexType> t_left(left_dims); 405 Tensor<DataType, 2, DataLayout, IndexType> t_right(right_dims); 406 Tensor<DataType, 2, DataLayout, IndexType> t_result_gpu(res_dims); 407 Tensor<DataType, 2, DataLayout, IndexType> t_result(res_dims); 408 409 t_left.setRandom(); 410 t_right.setRandom(); 411 412 std::size_t t_left_bytes = t_left.size() * sizeof(DataType); 413 std::size_t t_right_bytes = t_right.size() * sizeof(DataType); 414 std::size_t t_result_bytes = t_result.size() * sizeof(DataType); 415 416 DataType *d_t_left = 417 static_cast<DataType *>(sycl_device.allocate(t_left_bytes)); 418 DataType *d_t_right = 419 static_cast<DataType *>(sycl_device.allocate(t_right_bytes)); 420 DataType *d_t_result = 421 static_cast<DataType *>(sycl_device.allocate(t_result_bytes)); 422 423 Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>> 424 gpu_t_left(d_t_left, left_dims); 425 Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>> 426 gpu_t_right(d_t_right, right_dims); 427 Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>> 428 gpu_t_result(d_t_result, res_dims); 429 430 sycl_device.memcpyHostToDevice(d_t_left, t_left.data(), t_left_bytes); 431 sycl_device.memcpyHostToDevice(d_t_right, t_right.data(), t_right_bytes); 432 433 gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims); 434 sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result, 435 t_result_bytes); 436 437 t_result = t_left.contract(t_right, dims); 438 439 for (IndexType j = 0; j < m_size; j++) { 440 for (IndexType i = 0; i < n_size; i++) { 441 if (static_cast<DataType>(std::fabs(static_cast<DataType>( 442 t_result(j, i) - t_result_gpu(j, i)))) < error_threshold) { 443 continue; 444 } 445 if (Eigen::internal::isApprox(t_result(j, i), t_result_gpu(j, i), 446 error_threshold)) { 447 continue; 448 } 449 std::cout << "M : " << m_size << ", N : " << n_size << ", K : " << k_size 450 << ", mismatch detected at IndexType m: " << j << " n: " << i 451 << " CPU : " << t_result(j, i) 452 << " vs SYCL:" << t_result_gpu(j, i) << std::endl; 453 VERIFY_IS_APPROX(t_result_gpu(j, i), t_result(j, i)); 454 } 455 } 456 sycl_device.deallocate(d_t_left); 457 sycl_device.deallocate(d_t_right); 458 sycl_device.deallocate(d_t_result); 459 } 460 461 template <int DataLayout, typename DataType, typename IndexType, 462 typename Device> 463 void contraction_lhs_transposed(const Device &sycl_device, IndexType m_size, 464 IndexType k_size, IndexType n_size) { 465 typedef typename Tensor<DataType, 1, DataLayout, IndexType>::DimensionPair 466 DimPair; 467 static const DataType error_threshold = DataType(1e-4); 468 Eigen::array<IndexType, 2> left_dims = {{k_size, m_size}}; 469 Eigen::array<IndexType, 2> right_dims = {{k_size, n_size}}; 470 Eigen::array<IndexType, 2> res_dims = {{m_size, n_size}}; 471 Eigen::array<DimPair, 1> dims = {{DimPair(0, 0)}}; 472 473 Tensor<DataType, 2, DataLayout, IndexType> t_left(left_dims); 474 Tensor<DataType, 2, DataLayout, IndexType> t_right(right_dims); 475 Tensor<DataType, 2, DataLayout, IndexType> t_result_gpu(res_dims); 476 Tensor<DataType, 2, DataLayout, IndexType> t_result(res_dims); 477 478 t_left.setRandom(); 479 t_right.setRandom(); 480 481 std::size_t t_left_bytes = t_left.size() * sizeof(DataType); 482 std::size_t t_right_bytes = t_right.size() * sizeof(DataType); 483 std::size_t t_result_bytes = t_result.size() * sizeof(DataType); 484 485 DataType *d_t_left = 486 static_cast<DataType *>(sycl_device.allocate(t_left_bytes)); 487 DataType *d_t_right = 488 static_cast<DataType *>(sycl_device.allocate(t_right_bytes)); 489 DataType *d_t_result = 490 static_cast<DataType *>(sycl_device.allocate(t_result_bytes)); 491 492 Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>> 493 gpu_t_left(d_t_left, left_dims); 494 Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>> 495 gpu_t_right(d_t_right, right_dims); 496 Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>> 497 gpu_t_result(d_t_result, res_dims); 498 499 sycl_device.memcpyHostToDevice(d_t_left, t_left.data(), t_left_bytes); 500 sycl_device.memcpyHostToDevice(d_t_right, t_right.data(), t_right_bytes); 501 502 gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims); 503 sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result, 504 t_result_bytes); 505 506 t_result = t_left.contract(t_right, dims); 507 508 for (IndexType i = 0; i < t_result.size(); i++) { 509 if (static_cast<DataType>(std::fabs(static_cast<DataType>( 510 t_result(i) - t_result_gpu(i)))) < error_threshold) { 511 continue; 512 } 513 if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), 514 error_threshold)) { 515 continue; 516 } 517 std::cout << "M : " << m_size << ", N : " << n_size << ", K : " << k_size 518 << ", mismatch detected at IndexType " << i << ": " << t_result(i) 519 << " vs " << t_result_gpu(i) << std::endl; 520 VERIFY_IS_APPROX(t_result_gpu(i), t_result(i)); 521 } 522 sycl_device.deallocate(d_t_left); 523 sycl_device.deallocate(d_t_right); 524 sycl_device.deallocate(d_t_result); 525 } 526 527 template <int DataLayout, typename DataType, typename IndexType, 528 typename Device> 529 void contraction_both_transposed(const Device &sycl_device, IndexType m_size, 530 IndexType k_size, IndexType n_size) { 531 typedef typename Tensor<DataType, 1, DataLayout, IndexType>::DimensionPair 532 DimPair; 533 static const DataType error_threshold = DataType(1e-4); 534 Eigen::array<IndexType, 2> left_dims = {{k_size, m_size}}; 535 Eigen::array<IndexType, 2> right_dims = {{n_size, k_size}}; 536 Eigen::array<IndexType, 2> res_dims = {{m_size, n_size}}; 537 Eigen::array<DimPair, 1> dims = {{DimPair(0, 1)}}; 538 539 Tensor<DataType, 2, DataLayout, IndexType> t_left(left_dims); 540 Tensor<DataType, 2, DataLayout, IndexType> t_right(right_dims); 541 Tensor<DataType, 2, DataLayout, IndexType> t_result_gpu(res_dims); 542 Tensor<DataType, 2, DataLayout, IndexType> t_result(res_dims); 543 544 t_left.setRandom(); 545 t_right.setRandom(); 546 547 std::size_t t_left_bytes = t_left.size() * sizeof(DataType); 548 std::size_t t_right_bytes = t_right.size() * sizeof(DataType); 549 std::size_t t_result_bytes = t_result.size() * sizeof(DataType); 550 551 DataType *d_t_left = 552 static_cast<DataType *>(sycl_device.allocate(t_left_bytes)); 553 DataType *d_t_right = 554 static_cast<DataType *>(sycl_device.allocate(t_right_bytes)); 555 DataType *d_t_result = 556 static_cast<DataType *>(sycl_device.allocate(t_result_bytes)); 557 558 Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>> 559 gpu_t_left(d_t_left, left_dims); 560 Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>> 561 gpu_t_right(d_t_right, right_dims); 562 Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>> 563 gpu_t_result(d_t_result, res_dims); 564 565 sycl_device.memcpyHostToDevice(d_t_left, t_left.data(), t_left_bytes); 566 sycl_device.memcpyHostToDevice(d_t_right, t_right.data(), t_right_bytes); 567 568 gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims); 569 sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result, 570 t_result_bytes); 571 572 t_result = t_left.contract(t_right, dims); 573 574 for (IndexType i = 0; i < t_result.size(); i++) { 575 if (static_cast<DataType>(std::fabs(static_cast<DataType>( 576 t_result(i) - t_result_gpu(i)))) < error_threshold) { 577 continue; 578 } 579 if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), 580 error_threshold)) { 581 continue; 582 } 583 std::cout << "M : " << m_size << ", N : " << n_size << ", K : " << k_size 584 << ", mismatch detected at IndexType " << i << ": " << t_result(i) 585 << " vs " << t_result_gpu(i) << std::endl; 586 587 VERIFY_IS_APPROX(t_result_gpu(i), t_result(i)); 588 } 589 sycl_device.deallocate(d_t_left); 590 sycl_device.deallocate(d_t_right); 591 sycl_device.deallocate(d_t_result); 592 } 593 594 template <typename Dev> 595 void inline tensorOutofBound(const Dev &sycl_device) { 596 typedef float DataType; 597 typedef int64_t IndexType; 598 std::chrono::time_point<std::chrono::system_clock> start, end; 599 start = std::chrono::system_clock::now(); 600 // Test out of bound for Tensor-Tensor 601 test_no_out_of_bounds<RowMajor, DataType, IndexType>(sycl_device, 10, 1024, 602 1024); 603 test_no_out_of_bounds<RowMajor, DataType, IndexType>(sycl_device, 1024, 1024, 604 4096); 605 test_no_out_of_bounds<RowMajor, DataType, IndexType>(sycl_device, 4096, 1024, 606 2048); 607 test_no_out_of_bounds<ColMajor, DataType, IndexType>(sycl_device, 784, 2048, 608 1024); 609 test_no_out_of_bounds<ColMajor, DataType, IndexType>(sycl_device, 2048, 1024, 610 784); 611 test_no_out_of_bounds<RowMajor, DataType, IndexType>(sycl_device, 10, 1024, 612 10); 613 test_no_out_of_bounds<RowMajor, DataType, IndexType>(sycl_device, 513, 4096, 614 513); 615 test_no_out_of_bounds<RowMajor, DataType, IndexType>(sycl_device, 783, 1024, 616 783); 617 test_no_out_of_bounds<ColMajor, DataType, IndexType>(sycl_device, 784, 2048, 618 784); 619 test_no_out_of_bounds<ColMajor, DataType, IndexType>(sycl_device, 11, 1024, 620 11); 621 end = std::chrono::system_clock::now(); 622 std::chrono::duration<double> elapsed_seconds = end - start; 623 std::time_t end_time = std::chrono::system_clock::to_time_t(end); 624 std::cout << "tensor out of bound tests finished computation at " 625 << std::ctime(&end_time) 626 << "elapsed time: " << elapsed_seconds.count() << "s\n"; 627 } 628 629 template <typename Dev> 630 void inline tensorTensor(const Dev &sycl_device) { 631 typedef float DataType; 632 typedef int64_t IndexType; 633 std::chrono::time_point<std::chrono::system_clock> start, end; 634 start = std::chrono::system_clock::now(); 635 // Tensor Tensor Contraction 636 test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 128, 128, 637 128); 638 test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 128, 128, 639 128); 640 end = std::chrono::system_clock::now(); 641 std::chrono::duration<double> elapsed_seconds = end - start; 642 std::time_t end_time = std::chrono::system_clock::to_time_t(end); 643 std::cout << "tensor tensor tests finished computation at " 644 << std::ctime(&end_time) 645 << "elapsed time: " << elapsed_seconds.count() << "s\n"; 646 } 647 648 template <typename Dev> 649 void inline tensorTensor_m(const Dev &sycl_device) { 650 typedef float DataType; 651 typedef int64_t IndexType; 652 std::chrono::time_point<std::chrono::system_clock> start, end; 653 start = std::chrono::system_clock::now(); 654 // Tensor Tensor Contraction 655 test_sycl_contraction_m<ColMajor, DataType, IndexType>(sycl_device); 656 test_sycl_contraction_m<RowMajor, DataType, IndexType>(sycl_device); 657 658 end = std::chrono::system_clock::now(); 659 std::chrono::duration<double> elapsed_seconds = end - start; 660 std::time_t end_time = std::chrono::system_clock::to_time_t(end); 661 std::cout << "tensor tensor tests finished computation at " 662 << std::ctime(&end_time) 663 << "elapsed time: " << elapsed_seconds.count() << "s\n"; 664 } 665 666 template <typename Dev> 667 void inline tensorTensor_n(const Dev &sycl_device) { 668 typedef float DataType; 669 typedef int64_t IndexType; 670 std::chrono::time_point<std::chrono::system_clock> start, end; 671 start = std::chrono::system_clock::now(); 672 // Tensor Tensor Contraction 673 test_sycl_contraction_n<ColMajor, DataType, IndexType>(sycl_device); 674 test_sycl_contraction_n<RowMajor, DataType, IndexType>(sycl_device); 675 676 end = std::chrono::system_clock::now(); 677 std::chrono::duration<double> elapsed_seconds = end - start; 678 std::time_t end_time = std::chrono::system_clock::to_time_t(end); 679 std::cout << "tensor tensor tests finished computation at " 680 << std::ctime(&end_time) 681 << "elapsed time: " << elapsed_seconds.count() << "s\n"; 682 } 683 684 template <typename Dev> 685 void inline tensorTensor_k(const Dev &sycl_device) { 686 typedef float DataType; 687 typedef int64_t IndexType; 688 std::chrono::time_point<std::chrono::system_clock> start, end; 689 start = std::chrono::system_clock::now(); 690 test_sycl_contraction_k<ColMajor, DataType, IndexType>(sycl_device); 691 test_sycl_contraction_k<RowMajor, DataType, IndexType>(sycl_device); 692 693 end = std::chrono::system_clock::now(); 694 std::chrono::duration<double> elapsed_seconds = end - start; 695 std::time_t end_time = std::chrono::system_clock::to_time_t(end); 696 std::cout << "tensor tensor tests finished computation at " 697 << std::ctime(&end_time) 698 << "elapsed time: " << elapsed_seconds.count() << "s\n"; 699 } 700 701 template <typename Dev> 702 void inline tensorTensor_sizes(const Dev &sycl_device) { 703 typedef float DataType; 704 typedef int64_t IndexType; 705 std::chrono::time_point<std::chrono::system_clock> start, end; 706 start = std::chrono::system_clock::now(); 707 // Tensor Tensor Contraction 708 test_sycl_contraction_sizes<ColMajor, DataType, IndexType>(sycl_device); 709 test_sycl_contraction_sizes<RowMajor, DataType, IndexType>(sycl_device); 710 711 end = std::chrono::system_clock::now(); 712 std::chrono::duration<double> elapsed_seconds = end - start; 713 std::time_t end_time = std::chrono::system_clock::to_time_t(end); 714 std::cout << "tensor tensor tests finished computation at " 715 << std::ctime(&end_time) 716 << "elapsed time: " << elapsed_seconds.count() << "s\n"; 717 } 718 template <typename Dev> 719 void inline vectorVector(const Dev &sycl_device) { 720 typedef float DataType; 721 typedef int64_t IndexType; 722 std::chrono::time_point<std::chrono::system_clock> start, end; 723 start = std::chrono::system_clock::now(); 724 // VECTOR-VECTOR 725 test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1025, 1, 726 1025); 727 test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1025, 1, 728 1025); 729 test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1024, 1, 730 1024); 731 test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1024, 1, 732 1024); 733 test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1023, 1, 734 1023); 735 test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1023, 1, 736 1023); 737 738 end = std::chrono::system_clock::now(); 739 std::chrono::duration<double> elapsed_seconds = end - start; 740 std::time_t end_time = std::chrono::system_clock::to_time_t(end); 741 std::cout << "contracted tensor tests finished computation at " 742 << std::ctime(&end_time) 743 << "elapsed time: " << elapsed_seconds.count() << "s\n"; 744 } 745 746 template <typename Dev> 747 void inline vectorTensor(const Dev &sycl_device) { 748 typedef float DataType; 749 typedef int64_t IndexType; 750 std::chrono::time_point<std::chrono::system_clock> start, end; 751 start = std::chrono::system_clock::now(); 752 // Vector-Tensor 753 test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1, 1025, 754 1025); 755 test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1, 1025, 756 1025); 757 test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1, 1024, 758 1024); 759 test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1, 1024, 760 1024); 761 test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1, 1023, 762 1023); 763 test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1, 1023, 764 1023); 765 766 test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1, 4097, 767 4097); 768 test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1, 4097, 769 4097); 770 test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1, 4096, 771 4096); 772 test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1, 4096, 773 4096); 774 test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1, 4095, 775 4095); 776 test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1, 4095, 777 4095); 778 test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1, 802816, 779 32); 780 781 end = std::chrono::system_clock::now(); 782 std::chrono::duration<double> elapsed_seconds = end - start; 783 std::time_t end_time = std::chrono::system_clock::to_time_t(end); 784 std::cout << "finished computation at " << std::ctime(&end_time) 785 << "elapsed time: " << elapsed_seconds.count() << "s\n"; 786 } 787 788 template <typename Dev> 789 void inline tensorVector(const Dev &sycl_device) { 790 typedef float DataType; 791 typedef int64_t IndexType; 792 std::chrono::time_point<std::chrono::system_clock> start, end; 793 start = std::chrono::system_clock::now(); 794 // Matrix-Vector 795 test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1025, 1025, 796 1); 797 test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1125, 1025, 798 1); 799 test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1224, 1024, 800 1); 801 test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1024, 1024, 802 1); 803 test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1023, 1023, 804 1); 805 test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1023, 1023, 806 1); 807 test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 4097, 4197, 808 1); 809 test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 4097, 4097, 810 1); 811 test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 4096, 4096, 812 1); 813 test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 4096, 8196, 814 1); 815 test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 4095, 4095, 816 1); 817 test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 4095, 4095, 818 1); 819 // If the GEMV disabled it will creates one kernel to calculate the contraction. 820 // Therefore the acumuation of float number will overflow the precision 821 // threshold for float and cause the test to fail. While it the GMV multiple 822 // kernel will be created and each one run the overflow of accumutation breaks 823 // among the kernels. 824 #ifndef EIGEN_SYCL_DISABLE_GEMV 825 test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 32, 802032, 826 1); 827 #endif 828 829 end = std::chrono::system_clock::now(); 830 std::chrono::duration<double> elapsed_seconds = end - start; 831 std::time_t end_time = std::chrono::system_clock::to_time_t(end); 832 std::cout << "finished computation at " << std::ctime(&end_time) 833 << "elapsed time: " << elapsed_seconds.count() << "s\n"; 834 } 835 836 template <typename Dev> 837 void inline tensorScalar(const Dev &sycl_device) { 838 typedef float DataType; 839 typedef int64_t IndexType; 840 std::chrono::time_point<std::chrono::system_clock> start, end; 841 start = std::chrono::system_clock::now(); 842 // SCALAR Contraction 843 test_scalar<ColMajor, DataType, IndexType>(sycl_device, 127, 127, 127); 844 test_scalar<RowMajor, DataType, IndexType>(sycl_device, 127, 127, 127); 845 test_scalar<ColMajor, DataType, IndexType>(sycl_device, 128, 128, 128); 846 test_scalar<RowMajor, DataType, IndexType>(sycl_device, 128, 128, 128); 847 test_scalar<ColMajor, DataType, IndexType>(sycl_device, 129, 129, 129); 848 test_scalar<RowMajor, DataType, IndexType>(sycl_device, 129, 129, 129); 849 850 end = std::chrono::system_clock::now(); 851 std::chrono::duration<double> elapsed_seconds = end - start; 852 std::time_t end_time = std::chrono::system_clock::to_time_t(end); 853 std::cout << "finished computation at " << std::ctime(&end_time) 854 << "elapsed time: " << elapsed_seconds.count() << "s\n"; 855 } 856 857 template <typename Dev> 858 void inline skinnyTensor_row(const Dev &sycl_device) { 859 typedef float DataType; 860 typedef int64_t IndexType; 861 std::chrono::time_point<std::chrono::system_clock> start, end; 862 start = std::chrono::system_clock::now(); 863 // Tensor Tensor Contraction 864 test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 16, 4, 16); 865 test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 257, 131073, 866 257); 867 test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 256, 131072, 868 256); 869 test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 16, 131073, 870 16); 871 test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 17, 131072, 872 17); 873 end = std::chrono::system_clock::now(); 874 std::chrono::duration<double> elapsed_seconds = end - start; 875 std::time_t end_time = std::chrono::system_clock::to_time_t(end); 876 std::cout << "finished computation at " << std::ctime(&end_time) 877 << "elapsed time: " << elapsed_seconds.count() << "s\n"; 878 } 879 880 template <typename Dev> 881 void inline skinnyTensor_col(const Dev &sycl_device) { 882 typedef float DataType; 883 typedef int64_t IndexType; 884 std::chrono::time_point<std::chrono::system_clock> start, end; 885 start = std::chrono::system_clock::now(); 886 // Tensor Tensor Contraction 887 test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 16, 4, 16); 888 test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 257, 131073, 889 257); 890 test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 256, 131072, 891 256); 892 test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 16, 131073, 893 16); 894 test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 17, 131072, 895 17); 896 end = std::chrono::system_clock::now(); 897 std::chrono::duration<double> elapsed_seconds = end - start; 898 std::time_t end_time = std::chrono::system_clock::to_time_t(end); 899 std::cout << "finished computation at " << std::ctime(&end_time) 900 << "elapsed time: " << elapsed_seconds.count() << "s\n"; 901 } 902 903 template <typename Dev> 904 void inline tensor_contraction_batch_per_device(const Dev &sycl_device) { 905 typedef float DataType; 906 typedef int64_t IndexType; 907 std::chrono::time_point<std::chrono::system_clock> start, end; 908 start = std::chrono::system_clock::now(); 909 910 contraction_batch<RowMajor, DataType, IndexType>(sycl_device, 64, 75, 30, 4, 911 0, 4); 912 contraction_batch<ColMajor, DataType, IndexType>(sycl_device, 64, 75, 30, 4, 913 0, 4); 914 end = std::chrono::system_clock::now(); 915 std::chrono::duration<double> elapsed_seconds = end - start; 916 std::time_t end_time = std::chrono::system_clock::to_time_t(end); 917 std::cout << "finished computation at " << std::ctime(&end_time) 918 << "elapsed time: " << elapsed_seconds.count() << "s\n"; 919 } 920 921 template <typename Dev> 922 void inline tensor_contraction_lhs_transposed_per_device( 923 const Dev &sycl_device) { 924 typedef float DataType; 925 typedef int64_t IndexType; 926 std::chrono::time_point<std::chrono::system_clock> start, end; 927 start = std::chrono::system_clock::now(); 928 929 contraction_lhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 8, 4, 930 8); 931 contraction_lhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 32, 8, 932 32); 933 contraction_lhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 64, 16, 934 64); 935 contraction_lhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 784, 936 2048, 1024); 937 contraction_lhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 1024, 938 10, 1024); 939 contraction_lhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 4096, 940 1024, 1024); 941 contraction_lhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 2048, 942 4096, 1024); 943 end = std::chrono::system_clock::now(); 944 std::chrono::duration<double> elapsed_seconds = end - start; 945 std::time_t end_time = std::chrono::system_clock::to_time_t(end); 946 std::cout << "finished computation at " << std::ctime(&end_time) 947 << "elapsed time: " << elapsed_seconds.count() << "s\n"; 948 } 949 950 template <typename Dev> 951 void inline tensor_contraction_rhs_transposed_per_device( 952 const Dev &sycl_device) { 953 typedef float DataType; 954 typedef int64_t IndexType; 955 std::chrono::time_point<std::chrono::system_clock> start, end; 956 start = std::chrono::system_clock::now(); 957 958 contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 16, 4, 959 16); 960 contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 17, 5, 961 17); 962 contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 32, 8, 963 32); 964 contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 64, 16, 965 64); 966 contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 10, 967 1024, 1024); 968 contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 1024, 969 1024, 4096); 970 contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 4096, 971 1024, 2048); 972 contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 2048, 973 1024, 784); 974 end = std::chrono::system_clock::now(); 975 std::chrono::duration<double> elapsed_seconds = end - start; 976 std::time_t end_time = std::chrono::system_clock::to_time_t(end); 977 std::cout << "finished computation at " << std::ctime(&end_time) 978 << "elapsed time: " << elapsed_seconds.count() << "s\n"; 979 } 980 981 template <typename Dev> 982 void inline tensor_contraction_both_transposed_per_device( 983 const Dev &sycl_device) { 984 typedef float DataType; 985 typedef int64_t IndexType; 986 std::chrono::time_point<std::chrono::system_clock> start, end; 987 start = std::chrono::system_clock::now(); 988 989 contraction_both_transposed<RowMajor, DataType, IndexType>(sycl_device, 17, 5, 990 17); 991 contraction_both_transposed<RowMajor, DataType, IndexType>(sycl_device, 32, 8, 992 32); 993 contraction_both_transposed<RowMajor, DataType, IndexType>(sycl_device, 64, 994 16, 64); 995 end = std::chrono::system_clock::now(); 996 std::chrono::duration<double> elapsed_seconds = end - start; 997 std::time_t end_time = std::chrono::system_clock::to_time_t(end); 998 std::cout << "finished computation at " << std::ctime(&end_time) 999 << "elapsed time: " << elapsed_seconds.count() << "s\n"; 1000 } 1001 1002 EIGEN_DECLARE_TEST(cxx11_tensor_contract_sycl) { 1003 for (const auto &device : Eigen::get_sycl_supported_devices()) { 1004 std::cout << "Running on " 1005 << device.template get_info<cl::sycl::info::device::name>() 1006 << std::endl; 1007 QueueInterface queueInterface(device); 1008 auto sycl_device = Eigen::SyclDevice(&queueInterface); 1009 CALL_SUBTEST_1(tensorOutofBound(sycl_device)); 1010 CALL_SUBTEST_2(tensorTensor(sycl_device)); 1011 CALL_SUBTEST_2(tensorTensor_m(sycl_device)); 1012 CALL_SUBTEST_2(tensorTensor_n(sycl_device)); 1013 CALL_SUBTEST_2(tensorTensor_k(sycl_device)); 1014 CALL_SUBTEST_2(tensorTensor_sizes(sycl_device)); 1015 CALL_SUBTEST_3(vectorVector(sycl_device)); 1016 CALL_SUBTEST_4(vectorTensor(sycl_device)); 1017 CALL_SUBTEST_5(tensorVector(sycl_device)); 1018 CALL_SUBTEST_6(tensorScalar(sycl_device)); 1019 CALL_SUBTEST_7(skinnyTensor_row(sycl_device)); 1020 CALL_SUBTEST_7(skinnyTensor_col(sycl_device)); 1021 CALL_SUBTEST_8(tensor_contraction_batch_per_device(sycl_device)); 1022 CALL_SUBTEST_9(tensor_contraction_lhs_transposed_per_device(sycl_device)); 1023 CALL_SUBTEST_10(tensor_contraction_rhs_transposed_per_device(sycl_device)); 1024 CALL_SUBTEST_11(tensor_contraction_both_transposed_per_device(sycl_device)); 1025 } 1026 }