gpu_basic.cu (16082B)
1 // This file is part of Eigen, a lightweight C++ template library 2 // for linear algebra. 3 // 4 // Copyright (C) 2015-2016 Gael Guennebaud <gael.guennebaud@inria.fr> 5 // 6 // This Source Code Form is subject to the terms of the Mozilla 7 // Public License v. 2.0. If a copy of the MPL was not distributed 8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 9 10 // workaround issue between gcc >= 4.7 and cuda 5.5 11 #if (defined __GNUC__) && (__GNUC__>4 || __GNUC_MINOR__>=7) 12 #undef _GLIBCXX_ATOMIC_BUILTINS 13 #undef _GLIBCXX_USE_INT128 14 #endif 15 16 #define EIGEN_TEST_NO_LONGDOUBLE 17 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int 18 19 #include "main.h" 20 #include "gpu_common.h" 21 22 // Check that dense modules can be properly parsed by nvcc 23 #include <Eigen/Dense> 24 25 // struct Foo{ 26 // EIGEN_DEVICE_FUNC 27 // void operator()(int i, const float* mats, float* vecs) const { 28 // using namespace Eigen; 29 // // Matrix3f M(data); 30 // // Vector3f x(data+9); 31 // // Map<Vector3f>(data+9) = M.inverse() * x; 32 // Matrix3f M(mats+i/16); 33 // Vector3f x(vecs+i*3); 34 // // using std::min; 35 // // using std::sqrt; 36 // Map<Vector3f>(vecs+i*3) << x.minCoeff(), 1, 2;// / x.dot(x);//(M.inverse() * x) / x.x(); 37 // //x = x*2 + x.y() * x + x * x.maxCoeff() - x / x.sum(); 38 // } 39 // }; 40 41 template<typename T> 42 struct coeff_wise { 43 EIGEN_DEVICE_FUNC 44 void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const 45 { 46 using namespace Eigen; 47 T x1(in+i); 48 T x2(in+i+1); 49 T x3(in+i+2); 50 Map<T> res(out+i*T::MaxSizeAtCompileTime); 51 52 res.array() += (in[0] * x1 + x2).array() * x3.array(); 53 } 54 }; 55 56 template<typename T> 57 struct complex_sqrt { 58 EIGEN_DEVICE_FUNC 59 void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const 60 { 61 using namespace Eigen; 62 typedef typename T::Scalar ComplexType; 63 typedef typename T::Scalar::value_type ValueType; 64 const int num_special_inputs = 18; 65 66 if (i == 0) { 67 const ValueType nan = std::numeric_limits<ValueType>::quiet_NaN(); 68 typedef Eigen::Vector<ComplexType, num_special_inputs> SpecialInputs; 69 SpecialInputs special_in; 70 special_in.setZero(); 71 int idx = 0; 72 special_in[idx++] = ComplexType(0, 0); 73 special_in[idx++] = ComplexType(-0, 0); 74 special_in[idx++] = ComplexType(0, -0); 75 special_in[idx++] = ComplexType(-0, -0); 76 // GCC's fallback sqrt implementation fails for inf inputs. 77 // It is called when _GLIBCXX_USE_C99_COMPLEX is false or if 78 // clang includes the GCC header (which temporarily disables 79 // _GLIBCXX_USE_C99_COMPLEX) 80 #if !defined(_GLIBCXX_COMPLEX) || \ 81 (_GLIBCXX_USE_C99_COMPLEX && !defined(__CLANG_CUDA_WRAPPERS_COMPLEX)) 82 const ValueType inf = std::numeric_limits<ValueType>::infinity(); 83 special_in[idx++] = ComplexType(1.0, inf); 84 special_in[idx++] = ComplexType(nan, inf); 85 special_in[idx++] = ComplexType(1.0, -inf); 86 special_in[idx++] = ComplexType(nan, -inf); 87 special_in[idx++] = ComplexType(-inf, 1.0); 88 special_in[idx++] = ComplexType(inf, 1.0); 89 special_in[idx++] = ComplexType(-inf, -1.0); 90 special_in[idx++] = ComplexType(inf, -1.0); 91 special_in[idx++] = ComplexType(-inf, nan); 92 special_in[idx++] = ComplexType(inf, nan); 93 #endif 94 special_in[idx++] = ComplexType(1.0, nan); 95 special_in[idx++] = ComplexType(nan, 1.0); 96 special_in[idx++] = ComplexType(nan, -1.0); 97 special_in[idx++] = ComplexType(nan, nan); 98 99 Map<SpecialInputs> special_out(out); 100 special_out = special_in.cwiseSqrt(); 101 } 102 103 T x1(in + i); 104 Map<T> res(out + num_special_inputs + i*T::MaxSizeAtCompileTime); 105 res = x1.cwiseSqrt(); 106 } 107 }; 108 109 template<typename T> 110 struct complex_operators { 111 EIGEN_DEVICE_FUNC 112 void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const 113 { 114 using namespace Eigen; 115 typedef typename T::Scalar ComplexType; 116 typedef typename T::Scalar::value_type ValueType; 117 const int num_scalar_operators = 24; 118 const int num_vector_operators = 23; // no unary + operator. 119 int out_idx = i * (num_scalar_operators + num_vector_operators * T::MaxSizeAtCompileTime); 120 121 // Scalar operators. 122 const ComplexType a = in[i]; 123 const ComplexType b = in[i + 1]; 124 125 out[out_idx++] = +a; 126 out[out_idx++] = -a; 127 128 out[out_idx++] = a + b; 129 out[out_idx++] = a + numext::real(b); 130 out[out_idx++] = numext::real(a) + b; 131 out[out_idx++] = a - b; 132 out[out_idx++] = a - numext::real(b); 133 out[out_idx++] = numext::real(a) - b; 134 out[out_idx++] = a * b; 135 out[out_idx++] = a * numext::real(b); 136 out[out_idx++] = numext::real(a) * b; 137 out[out_idx++] = a / b; 138 out[out_idx++] = a / numext::real(b); 139 out[out_idx++] = numext::real(a) / b; 140 141 out[out_idx] = a; out[out_idx++] += b; 142 out[out_idx] = a; out[out_idx++] -= b; 143 out[out_idx] = a; out[out_idx++] *= b; 144 out[out_idx] = a; out[out_idx++] /= b; 145 146 const ComplexType true_value = ComplexType(ValueType(1), ValueType(0)); 147 const ComplexType false_value = ComplexType(ValueType(0), ValueType(0)); 148 out[out_idx++] = (a == b ? true_value : false_value); 149 out[out_idx++] = (a == numext::real(b) ? true_value : false_value); 150 out[out_idx++] = (numext::real(a) == b ? true_value : false_value); 151 out[out_idx++] = (a != b ? true_value : false_value); 152 out[out_idx++] = (a != numext::real(b) ? true_value : false_value); 153 out[out_idx++] = (numext::real(a) != b ? true_value : false_value); 154 155 // Vector versions. 156 T x1(in + i); 157 T x2(in + i + 1); 158 const int res_size = T::MaxSizeAtCompileTime * num_scalar_operators; 159 const int size = T::MaxSizeAtCompileTime; 160 int block_idx = 0; 161 162 Map<VectorX<ComplexType>> res(out + out_idx, res_size); 163 res.segment(block_idx, size) = -x1; 164 block_idx += size; 165 166 res.segment(block_idx, size) = x1 + x2; 167 block_idx += size; 168 res.segment(block_idx, size) = x1 + x2.real(); 169 block_idx += size; 170 res.segment(block_idx, size) = x1.real() + x2; 171 block_idx += size; 172 res.segment(block_idx, size) = x1 - x2; 173 block_idx += size; 174 res.segment(block_idx, size) = x1 - x2.real(); 175 block_idx += size; 176 res.segment(block_idx, size) = x1.real() - x2; 177 block_idx += size; 178 res.segment(block_idx, size) = x1.array() * x2.array(); 179 block_idx += size; 180 res.segment(block_idx, size) = x1.array() * x2.real().array(); 181 block_idx += size; 182 res.segment(block_idx, size) = x1.real().array() * x2.array(); 183 block_idx += size; 184 res.segment(block_idx, size) = x1.array() / x2.array(); 185 block_idx += size; 186 res.segment(block_idx, size) = x1.array() / x2.real().array(); 187 block_idx += size; 188 res.segment(block_idx, size) = x1.real().array() / x2.array(); 189 block_idx += size; 190 191 res.segment(block_idx, size) = x1; res.segment(block_idx, size) += x2; 192 block_idx += size; 193 res.segment(block_idx, size) = x1; res.segment(block_idx, size) -= x2; 194 block_idx += size; 195 res.segment(block_idx, size) = x1; res.segment(block_idx, size).array() *= x2.array(); 196 block_idx += size; 197 res.segment(block_idx, size) = x1; res.segment(block_idx, size).array() /= x2.array(); 198 block_idx += size; 199 200 const T true_vector = T::Constant(true_value); 201 const T false_vector = T::Constant(false_value); 202 res.segment(block_idx, size) = (x1 == x2 ? true_vector : false_vector); 203 block_idx += size; 204 // Mixing types in equality comparison does not work. 205 // res.segment(block_idx, size) = (x1 == x2.real() ? true_vector : false_vector); 206 // block_idx += size; 207 // res.segment(block_idx, size) = (x1.real() == x2 ? true_vector : false_vector); 208 // block_idx += size; 209 res.segment(block_idx, size) = (x1 != x2 ? true_vector : false_vector); 210 block_idx += size; 211 // res.segment(block_idx, size) = (x1 != x2.real() ? true_vector : false_vector); 212 // block_idx += size; 213 // res.segment(block_idx, size) = (x1.real() != x2 ? true_vector : false_vector); 214 // block_idx += size; 215 } 216 }; 217 218 template<typename T> 219 struct replicate { 220 EIGEN_DEVICE_FUNC 221 void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const 222 { 223 using namespace Eigen; 224 T x1(in+i); 225 int step = x1.size() * 4; 226 int stride = 3 * step; 227 228 typedef Map<Array<typename T::Scalar,Dynamic,Dynamic> > MapType; 229 MapType(out+i*stride+0*step, x1.rows()*2, x1.cols()*2) = x1.replicate(2,2); 230 MapType(out+i*stride+1*step, x1.rows()*3, x1.cols()) = in[i] * x1.colwise().replicate(3); 231 MapType(out+i*stride+2*step, x1.rows(), x1.cols()*3) = in[i] * x1.rowwise().replicate(3); 232 } 233 }; 234 235 template<typename T> 236 struct alloc_new_delete { 237 EIGEN_DEVICE_FUNC 238 void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const 239 { 240 int offset = 2*i*T::MaxSizeAtCompileTime; 241 T* x = new T(in + offset); 242 Eigen::Map<T> u(out + offset); 243 u = *x; 244 delete x; 245 246 offset += T::MaxSizeAtCompileTime; 247 T* y = new T[1]; 248 y[0] = T(in + offset); 249 Eigen::Map<T> v(out + offset); 250 v = y[0]; 251 delete[] y; 252 } 253 }; 254 255 template<typename T> 256 struct redux { 257 EIGEN_DEVICE_FUNC 258 void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const 259 { 260 using namespace Eigen; 261 int N = 10; 262 T x1(in+i); 263 out[i*N+0] = x1.minCoeff(); 264 out[i*N+1] = x1.maxCoeff(); 265 out[i*N+2] = x1.sum(); 266 out[i*N+3] = x1.prod(); 267 out[i*N+4] = x1.matrix().squaredNorm(); 268 out[i*N+5] = x1.matrix().norm(); 269 out[i*N+6] = x1.colwise().sum().maxCoeff(); 270 out[i*N+7] = x1.rowwise().maxCoeff().sum(); 271 out[i*N+8] = x1.matrix().colwise().squaredNorm().sum(); 272 } 273 }; 274 275 template<typename T1, typename T2> 276 struct prod_test { 277 EIGEN_DEVICE_FUNC 278 void operator()(int i, const typename T1::Scalar* in, typename T1::Scalar* out) const 279 { 280 using namespace Eigen; 281 typedef Matrix<typename T1::Scalar, T1::RowsAtCompileTime, T2::ColsAtCompileTime> T3; 282 T1 x1(in+i); 283 T2 x2(in+i+1); 284 Map<T3> res(out+i*T3::MaxSizeAtCompileTime); 285 res += in[i] * x1 * x2; 286 } 287 }; 288 289 template<typename T1, typename T2> 290 struct diagonal { 291 EIGEN_DEVICE_FUNC 292 void operator()(int i, const typename T1::Scalar* in, typename T1::Scalar* out) const 293 { 294 using namespace Eigen; 295 T1 x1(in+i); 296 Map<T2> res(out+i*T2::MaxSizeAtCompileTime); 297 res += x1.diagonal(); 298 } 299 }; 300 301 template<typename T> 302 struct eigenvalues_direct { 303 EIGEN_DEVICE_FUNC 304 void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const 305 { 306 using namespace Eigen; 307 typedef Matrix<typename T::Scalar, T::RowsAtCompileTime, 1> Vec; 308 T M(in+i); 309 Map<Vec> res(out+i*Vec::MaxSizeAtCompileTime); 310 T A = M*M.adjoint(); 311 SelfAdjointEigenSolver<T> eig; 312 eig.computeDirect(A); 313 res = eig.eigenvalues(); 314 } 315 }; 316 317 template<typename T> 318 struct eigenvalues { 319 EIGEN_DEVICE_FUNC 320 void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const 321 { 322 using namespace Eigen; 323 typedef Matrix<typename T::Scalar, T::RowsAtCompileTime, 1> Vec; 324 T M(in+i); 325 Map<Vec> res(out+i*Vec::MaxSizeAtCompileTime); 326 T A = M*M.adjoint(); 327 SelfAdjointEigenSolver<T> eig; 328 eig.compute(A); 329 res = eig.eigenvalues(); 330 } 331 }; 332 333 template<typename T> 334 struct matrix_inverse { 335 EIGEN_DEVICE_FUNC 336 void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const 337 { 338 using namespace Eigen; 339 T M(in+i); 340 Map<T> res(out+i*T::MaxSizeAtCompileTime); 341 res = M.inverse(); 342 } 343 }; 344 345 template<typename T> 346 struct numeric_limits_test { 347 EIGEN_DEVICE_FUNC 348 void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const 349 { 350 EIGEN_UNUSED_VARIABLE(in) 351 int out_idx = i * 5; 352 out[out_idx++] = numext::numeric_limits<float>::epsilon(); 353 out[out_idx++] = (numext::numeric_limits<float>::max)(); 354 out[out_idx++] = (numext::numeric_limits<float>::min)(); 355 out[out_idx++] = numext::numeric_limits<float>::infinity(); 356 out[out_idx++] = numext::numeric_limits<float>::quiet_NaN(); 357 } 358 }; 359 360 template<typename Type1, typename Type2> 361 bool verifyIsApproxWithInfsNans(const Type1& a, const Type2& b, typename Type1::Scalar* = 0) // Enabled for Eigen's type only 362 { 363 if (a.rows() != b.rows()) { 364 return false; 365 } 366 if (a.cols() != b.cols()) { 367 return false; 368 } 369 for (Index r = 0; r < a.rows(); ++r) { 370 for (Index c = 0; c < a.cols(); ++c) { 371 if (a(r, c) != b(r, c) 372 && !((numext::isnan)(a(r, c)) && (numext::isnan)(b(r, c))) 373 && !test_isApprox(a(r, c), b(r, c))) { 374 return false; 375 } 376 } 377 } 378 return true; 379 } 380 381 template<typename Kernel, typename Input, typename Output> 382 void test_with_infs_nans(const Kernel& ker, int n, const Input& in, Output& out) 383 { 384 Output out_ref, out_gpu; 385 #if !defined(EIGEN_GPU_COMPILE_PHASE) 386 out_ref = out_gpu = out; 387 #else 388 EIGEN_UNUSED_VARIABLE(in); 389 EIGEN_UNUSED_VARIABLE(out); 390 #endif 391 run_on_cpu (ker, n, in, out_ref); 392 run_on_gpu(ker, n, in, out_gpu); 393 #if !defined(EIGEN_GPU_COMPILE_PHASE) 394 verifyIsApproxWithInfsNans(out_ref, out_gpu); 395 #endif 396 } 397 398 EIGEN_DECLARE_TEST(gpu_basic) 399 { 400 ei_test_init_gpu(); 401 402 int nthreads = 100; 403 Eigen::VectorXf in, out; 404 Eigen::VectorXcf cfin, cfout; 405 406 #if !defined(EIGEN_GPU_COMPILE_PHASE) 407 int data_size = nthreads * 512; 408 in.setRandom(data_size); 409 out.setConstant(data_size, -1); 410 cfin.setRandom(data_size); 411 cfout.setConstant(data_size, -1); 412 #endif 413 414 CALL_SUBTEST( run_and_compare_to_gpu(coeff_wise<Vector3f>(), nthreads, in, out) ); 415 CALL_SUBTEST( run_and_compare_to_gpu(coeff_wise<Array44f>(), nthreads, in, out) ); 416 417 #if !defined(EIGEN_USE_HIP) 418 // FIXME 419 // These subtests result in a compile failure on the HIP platform 420 // 421 // eigen-upstream/Eigen/src/Core/Replicate.h:61:65: error: 422 // base class 'internal::dense_xpr_base<Replicate<Array<float, 4, 1, 0, 4, 1>, -1, -1> >::type' 423 // (aka 'ArrayBase<Eigen::Replicate<Eigen::Array<float, 4, 1, 0, 4, 1>, -1, -1> >') has protected default constructor 424 CALL_SUBTEST( run_and_compare_to_gpu(replicate<Array4f>(), nthreads, in, out) ); 425 CALL_SUBTEST( run_and_compare_to_gpu(replicate<Array33f>(), nthreads, in, out) ); 426 427 // HIP does not support new/delete on device. 428 CALL_SUBTEST( run_and_compare_to_gpu(alloc_new_delete<Vector3f>(), nthreads, in, out) ); 429 #endif 430 431 CALL_SUBTEST( run_and_compare_to_gpu(redux<Array4f>(), nthreads, in, out) ); 432 CALL_SUBTEST( run_and_compare_to_gpu(redux<Matrix3f>(), nthreads, in, out) ); 433 434 CALL_SUBTEST( run_and_compare_to_gpu(prod_test<Matrix3f,Matrix3f>(), nthreads, in, out) ); 435 CALL_SUBTEST( run_and_compare_to_gpu(prod_test<Matrix4f,Vector4f>(), nthreads, in, out) ); 436 437 CALL_SUBTEST( run_and_compare_to_gpu(diagonal<Matrix3f,Vector3f>(), nthreads, in, out) ); 438 CALL_SUBTEST( run_and_compare_to_gpu(diagonal<Matrix4f,Vector4f>(), nthreads, in, out) ); 439 440 CALL_SUBTEST( run_and_compare_to_gpu(matrix_inverse<Matrix2f>(), nthreads, in, out) ); 441 CALL_SUBTEST( run_and_compare_to_gpu(matrix_inverse<Matrix3f>(), nthreads, in, out) ); 442 CALL_SUBTEST( run_and_compare_to_gpu(matrix_inverse<Matrix4f>(), nthreads, in, out) ); 443 444 CALL_SUBTEST( run_and_compare_to_gpu(eigenvalues_direct<Matrix3f>(), nthreads, in, out) ); 445 CALL_SUBTEST( run_and_compare_to_gpu(eigenvalues_direct<Matrix2f>(), nthreads, in, out) ); 446 447 // Test std::complex. 448 CALL_SUBTEST( run_and_compare_to_gpu(complex_operators<Vector3cf>(), nthreads, cfin, cfout) ); 449 CALL_SUBTEST( test_with_infs_nans(complex_sqrt<Vector3cf>(), nthreads, cfin, cfout) ); 450 451 // numeric_limits 452 CALL_SUBTEST( test_with_infs_nans(numeric_limits_test<Vector3f>(), 1, in, out) ); 453 454 #if defined(__NVCC__) 455 // FIXME 456 // These subtests compiles only with nvcc and fail with HIPCC and clang-cuda 457 CALL_SUBTEST( run_and_compare_to_gpu(eigenvalues<Matrix4f>(), nthreads, in, out) ); 458 typedef Matrix<float,6,6> Matrix6f; 459 CALL_SUBTEST( run_and_compare_to_gpu(eigenvalues<Matrix6f>(), nthreads, in, out) ); 460 #endif 461 }