tensor_contract_sycl_bench.cc (11331B)
1 // This file is part of Eigen, a lightweight C++ template library 2 // for linear algebra. 3 // 4 // Copyright (C) 2016 5 // Mehdi Goli Codeplay Software Ltd. 6 // Ralph Potter Codeplay Software Ltd. 7 // Luke Iwanski Codeplay Software Ltd. 8 // Contact: <eigen@codeplay.com> 9 // 10 // This Source Code Form is subject to the terms of the Mozilla 11 // Public License v. 2.0. If a copy of the MPL was not distributed 12 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 13 #ifndef EIGEN_BENCH_CONTRACT_SYCL 14 #define EIGEN_BENCH_CONTRACT_SYCL 15 #define EIGEN_TEST_NO_LONGDOUBLE 16 #define EIGEN_TEST_NO_COMPLEX 17 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t 18 #include <SYCL/sycl.hpp> 19 #include <fstream> 20 #include <iostream> 21 #include <chrono> 22 #include <ctime> 23 24 #include <unsupported/Eigen/CXX11/Tensor> 25 26 using Eigen::array; 27 using Eigen::SyclDevice; 28 using Eigen::Tensor; 29 using Eigen::TensorMap; 30 std::ofstream out("Result.txt"); 31 32 std::chrono::time_point<std::chrono::system_clock> get_time(){ 33 std::chrono::time_point<std::chrono::system_clock> start, end; 34 return std::chrono::system_clock::now(); 35 } 36 37 template<typename Start, typename End, typename TensorIndex> 38 void finalizeBenchmark(Start start, End end, TensorIndex m_, TensorIndex k_, TensorIndex n_ , TensorIndex num_iters, std::string name){ 39 40 std::chrono::duration<double> elapsed_seconds = end-start; 41 std::cout <<"Kernel Name : " << name << ", M : " << m_ << ", N : " << n_ << ", K : " << k_ << " GFLOP/s : " << 42 static_cast<float>((static_cast<int64_t>(2) * m_ * n_ * k_ * num_iters)/ elapsed_seconds.count()) * 1e-9 << "\n"; 43 out <<"Kernel Name : " << name << ", M : " << m_ << ", N : " << n_ << ", K : " << k_ << " GFLOP/s : " << 44 static_cast<float>((static_cast<int64_t>(2) * m_ * n_ * k_ * num_iters)/ elapsed_seconds.count()) * 1e-9 << "\n"; 45 } 46 47 // do a contraction which is equivalent to a matrix multiplication 48 template<typename T, typename Device, typename TensorIndex> 49 void contraction(const Device& device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_) { 50 T* a_; 51 T* b_; 52 T* c_; 53 a_ = (T *) device_.allocate(m_ * k_ * sizeof(T)); 54 b_ = (T *) device_.allocate(k_ * n_ * sizeof(T)); 55 c_ = (T *) device_.allocate(m_ * n_ * sizeof(T)); 56 57 // Initialize the content of the memory pools to prevent asan from 58 // complaining. 59 device_.memset(a_, 12, m_ * k_ * sizeof(T)); 60 device_.memset(b_, 23, k_ * n_ * sizeof(T)); 61 device_.memset(c_, 31, m_ * n_ * sizeof(T)); 62 63 Eigen::array<TensorIndex, 2> sizeA; 64 sizeA[0] = m_; 65 sizeA[1] = k_; 66 Eigen::array<TensorIndex, 2> sizeB; 67 sizeB[0] = k_; 68 sizeB[1] = n_; 69 Eigen::array<TensorIndex, 2> sizeC; 70 sizeC[0] = m_; 71 sizeC[1] = n_; 72 73 const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizeA); 74 const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizeB); 75 TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizeC); 76 77 typedef typename Tensor<T, 2>::DimensionPair DimPair; 78 Eigen::array<DimPair, 1> dims; 79 dims[0] = DimPair(1, 0); 80 #ifdef EIGEN_USE_SYCL // warmup for sycl 81 for (int iter = 0; iter < 10; ++iter) { 82 C.device(device_) = A.contract(B, dims); 83 } 84 #endif 85 auto start = get_time(); 86 for (int iter = 0; iter < num_iters; ++iter) { 87 C.device(device_) = A.contract(B, dims); 88 } 89 auto end = get_time(); 90 // Record the number of FLOPs executed per second (size_ multiplications and 91 // additions for each value in the resulting tensor) 92 finalizeBenchmark(start, end, m_, k_, n_, num_iters, "contraction"); 93 device_.deallocate(a_); 94 device_.deallocate(b_); 95 device_.deallocate(c_); 96 device_.synchronize(); 97 } 98 99 100 101 // do a contraction which is equivalent to a matrix multiplication 102 template<typename T, typename Device, typename TensorIndex> 103 void contractionRowMajor(const Device& device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_) { 104 T* a_; 105 T* b_; 106 T* c_; 107 a_ = (T *) device_.allocate(m_ * k_ * sizeof(T)); 108 b_ = (T *) device_.allocate(k_ * n_ * sizeof(T)); 109 c_ = (T *) device_.allocate(m_ * n_ * sizeof(T)); 110 111 // Initialize the content of the memory pools to prevent asan from 112 // complaining. 113 device_.memset(a_, 12, m_ * k_ * sizeof(T)); 114 device_.memset(b_, 23, k_ * n_ * sizeof(T)); 115 device_.memset(c_, 31, m_ * n_ * sizeof(T)); 116 117 Eigen::array<TensorIndex, 2> sizeA; 118 sizeA[0] = m_; 119 sizeA[1] = k_; 120 Eigen::array<TensorIndex, 2> sizeB; 121 sizeB[0] = k_; 122 sizeB[1] = n_; 123 Eigen::array<TensorIndex, 2> sizeC; 124 sizeC[0] = m_; 125 sizeC[1] = n_; 126 127 const TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> A(a_, sizeA); 128 const TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> B(b_, sizeB); 129 TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> C(c_, sizeC); 130 131 typedef typename Tensor<T, 2>::DimensionPair DimPair; 132 Eigen::array<DimPair, 1> dims; 133 dims[0] = DimPair(1, 0); 134 #ifdef EIGEN_USE_SYCL // warmup for sycl 135 for (int iter = 0; iter < 10; ++iter) { 136 C.device(device_) = A.contract(B, dims); 137 } 138 #endif 139 auto start = get_time(); 140 for (int iter = 0; iter < num_iters; ++iter) { 141 C.device(device_) = A.contract(B, dims); 142 } 143 auto end = get_time(); 144 // Record the number of FLOPs executed per second (size_ multiplications and 145 // additions for each value in the resulting tensor) 146 finalizeBenchmark(start, end, m_, k_, n_, num_iters, "contractionRowMajor"); 147 device_.deallocate(a_); 148 device_.deallocate(b_); 149 device_.deallocate(c_); 150 device_.synchronize(); 151 } 152 153 154 template<typename T, typename Device, typename TensorIndex> 155 void contractionAT(const Device& device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_) { 156 T* a_; 157 T* b_; 158 T* c_; 159 a_ = (T *) device_.allocate(m_ * k_ * sizeof(T)); 160 b_ = (T *) device_.allocate(k_ * n_ * sizeof(T)); 161 c_ = (T *) device_.allocate(m_ * n_ * sizeof(T)); 162 163 // Initialize the content of the memory pools to prevent asan from 164 // complaining. 165 device_.memset(a_, 12, m_ * k_ * sizeof(T)); 166 device_.memset(b_, 23, k_ * n_ * sizeof(T)); 167 device_.memset(c_, 31, m_ * n_ * sizeof(T)); 168 Eigen::array<TensorIndex, 2> sizeA; 169 sizeA[0] = k_; 170 sizeA[1] = m_; 171 Eigen::array<TensorIndex, 2> sizeB; 172 sizeB[0] = k_; 173 sizeB[1] = n_; 174 Eigen::array<TensorIndex, 2> sizeC; 175 sizeC[0] = m_; 176 sizeC[1] = n_; 177 178 const TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> A(a_, sizeA); 179 const TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> B(b_, sizeB); 180 TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> C(c_, sizeC); 181 182 typedef typename Tensor<T, 2>::DimensionPair DimPair; 183 Eigen::array<DimPair, 1> dims; 184 dims[0] = DimPair(0, 0); 185 #ifdef EIGEN_USE_SYCL // warmup for sycl 186 for (int iter = 0; iter < 10; ++iter) { 187 C.device(device_) = A.contract(B, dims); 188 } 189 #endif 190 auto start = get_time(); 191 for (int iter = 0; iter < num_iters; ++iter) { 192 C.device(device_) = A.contract(B, dims); 193 } 194 auto end = get_time(); 195 // Record the number of FLOPs executed per second (size_ multiplications and 196 // additions for each value in the resulting tensor) 197 finalizeBenchmark(start, end, m_, k_, n_, num_iters, "contractionAT"); 198 device_.deallocate(a_); 199 device_.deallocate(b_); 200 device_.deallocate(c_); 201 device_.synchronize(); 202 203 } 204 205 template<typename T, typename Device, typename TensorIndex> 206 void contractionBT(const Device& device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_) { 207 T* a_; 208 T* b_; 209 T* c_; 210 a_ = (T *) device_.allocate(m_ * k_ * sizeof(T)); 211 b_ = (T *) device_.allocate(k_ * n_ * sizeof(T)); 212 c_ = (T *) device_.allocate(m_ * n_ * sizeof(T)); 213 214 // Initialize the content of the memory pools to prevent asan from 215 // complaining. 216 device_.memset(a_, 12, m_ * k_ * sizeof(T)); 217 device_.memset(b_, 23, k_ * n_ * sizeof(T)); 218 device_.memset(c_, 31, m_ * n_ * sizeof(T)); 219 220 Eigen::array<TensorIndex, 2> sizeA; 221 sizeA[0] = m_; 222 sizeA[1] = k_; 223 Eigen::array<TensorIndex, 2> sizeB; 224 sizeB[0] = n_; 225 sizeB[1] = k_; 226 Eigen::array<TensorIndex, 2> sizeC; 227 sizeC[0] = m_; 228 sizeC[1] = n_; 229 230 const TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> A(a_, sizeA); 231 const TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> B(b_, sizeB); 232 TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> C(c_, sizeC); 233 234 typedef typename Tensor<T, 2>::DimensionPair DimPair; 235 Eigen::array<DimPair, 1> dims; 236 dims[0] = DimPair(1, 1); 237 #ifdef EIGEN_USE_SYCL // warmup for sycl 238 for (int iter = 0; iter < 10; ++iter) { 239 C.device(device_) = A.contract(B, dims); 240 } 241 #endif 242 auto start = get_time(); 243 for (int iter = 0; iter < num_iters; ++iter) { 244 C.device(device_) = A.contract(B, dims); 245 } 246 auto end = get_time(); 247 // Record the number of FLOPs executed per second (size_ multiplications and 248 // additions for each value in the resulting tensor) 249 finalizeBenchmark(start, end, m_, k_, n_, num_iters, "contractionBT"); 250 device_.deallocate(a_); 251 device_.deallocate(b_); 252 device_.deallocate(c_); 253 device_.synchronize(); 254 255 } 256 257 template<typename T, typename Device, typename TensorIndex> 258 void contractionABT(const Device& device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_) { 259 T* a_; 260 T* b_; 261 T* c_; 262 a_ = (T *) device_.allocate(m_ * k_ * sizeof(T)); 263 b_ = (T *) device_.allocate(k_ * n_ * sizeof(T)); 264 c_ = (T *) device_.allocate(m_ * n_ * sizeof(T)); 265 266 // Initialize the content of the memory pools to prevent asan from 267 // complaining. 268 device_.memset(a_, 12, m_ * k_ * sizeof(T)); 269 device_.memset(b_, 23, k_ * n_ * sizeof(T)); 270 device_.memset(c_, 31, m_ * n_ * sizeof(T)); 271 272 Eigen::array<TensorIndex, 2> sizeA; 273 sizeA[0] = k_; 274 sizeA[1] = m_; 275 Eigen::array<TensorIndex, 2> sizeB; 276 sizeB[0] = n_; 277 sizeB[1] = k_; 278 Eigen::array<TensorIndex, 2> sizeC; 279 sizeC[0] = m_; 280 sizeC[1] = n_; 281 282 const TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> A(a_, sizeA); 283 const TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> B(b_, sizeB); 284 TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> C(c_, sizeC); 285 286 typedef typename Tensor<T, 2>::DimensionPair DimPair; 287 Eigen::array<DimPair, 1> dims; 288 dims[0] = DimPair(0, 1); 289 #ifdef EIGEN_USE_SYCL // warmup for sycl 290 for (int iter = 0; iter < 10; ++iter) { 291 C.device(device_) = A.contract(B, dims); 292 } 293 #endif 294 auto start = get_time(); 295 for (int iter = 0; iter < num_iters; ++iter) { 296 C.device(device_) = A.contract(B, dims); 297 } 298 auto end = get_time(); 299 // Record the number of FLOPs executed per second (size_ multiplications and 300 // additions for each value in the resulting tensor) 301 finalizeBenchmark(start, end, m_, k_, n_, num_iters, "contractionABT"); 302 device_.deallocate(a_); 303 device_.deallocate(b_); 304 device_.deallocate(c_); 305 device_.synchronize(); 306 } 307 308 int main() { 309 cl::sycl::gpu_selector selector; 310 Eigen::QueueInterface queue(selector); 311 Eigen::SyclDevice device(&queue); 312 int64_t num_iters =20; 313 for(int64_t m = 32; m <= 4096; m *= 2) 314 for(int64_t k = 32; k <= 4096; k *= 2) 315 for(int64_t n = 32; n <= 4096; n*= 2){ 316 (contraction<float>(device, num_iters, m, k, n)); 317 (contractionRowMajor<float>(device, num_iters, m, k, n)); 318 (contractionAT<float>(device, num_iters, m, k, n)); 319 (contractionBT<float>(device, num_iters, m, k, n)); 320 (contractionABT<float>(device, num_iters, m, k, n)); 321 } 322 return 0; 323 } 324 325 #endif // EIGEN_BENCH_CONTRACT_SYCL