cart-elc

Source code for CART-ELC
git clone git://git.laack.co/cart-elc.git
Log | Files | Refs | README | LICENSE

gpu_basic.cu (16082B)


      1 // This file is part of Eigen, a lightweight C++ template library
      2 // for linear algebra.
      3 //
      4 // Copyright (C) 2015-2016 Gael Guennebaud <gael.guennebaud@inria.fr>
      5 //
      6 // This Source Code Form is subject to the terms of the Mozilla
      7 // Public License v. 2.0. If a copy of the MPL was not distributed
      8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
      9 
     10 // workaround issue between gcc >= 4.7 and cuda 5.5
     11 #if (defined __GNUC__) && (__GNUC__>4 || __GNUC_MINOR__>=7)
     12   #undef _GLIBCXX_ATOMIC_BUILTINS
     13   #undef _GLIBCXX_USE_INT128
     14 #endif
     15 
     16 #define EIGEN_TEST_NO_LONGDOUBLE
     17 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
     18 
     19 #include "main.h"
     20 #include "gpu_common.h"
     21 
     22 // Check that dense modules can be properly parsed by nvcc
     23 #include <Eigen/Dense>
     24 
     25 // struct Foo{
     26 //   EIGEN_DEVICE_FUNC
     27 //   void operator()(int i, const float* mats, float* vecs) const {
     28 //     using namespace Eigen;
     29 //   //   Matrix3f M(data);
     30 //   //   Vector3f x(data+9);
     31 //   //   Map<Vector3f>(data+9) = M.inverse() * x;
     32 //     Matrix3f M(mats+i/16);
     33 //     Vector3f x(vecs+i*3);
     34 //   //   using std::min;
     35 //   //   using std::sqrt;
     36 //     Map<Vector3f>(vecs+i*3) << x.minCoeff(), 1, 2;// / x.dot(x);//(M.inverse() *  x) / x.x();
     37 //     //x = x*2 + x.y() * x + x * x.maxCoeff() - x / x.sum();
     38 //   }
     39 // };
     40 
     41 template<typename T>
     42 struct coeff_wise {
     43   EIGEN_DEVICE_FUNC
     44   void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const
     45   {
     46     using namespace Eigen;
     47     T x1(in+i);
     48     T x2(in+i+1);
     49     T x3(in+i+2);
     50     Map<T> res(out+i*T::MaxSizeAtCompileTime);
     51     
     52     res.array() += (in[0] * x1 + x2).array() * x3.array();
     53   }
     54 };
     55 
     56 template<typename T>
     57 struct complex_sqrt {
     58   EIGEN_DEVICE_FUNC
     59   void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const
     60   {
     61     using namespace Eigen;
     62     typedef typename T::Scalar ComplexType;
     63     typedef typename T::Scalar::value_type ValueType;
     64     const int num_special_inputs = 18;
     65     
     66     if (i == 0) {
     67       const ValueType nan = std::numeric_limits<ValueType>::quiet_NaN();
     68       typedef Eigen::Vector<ComplexType, num_special_inputs> SpecialInputs;
     69       SpecialInputs special_in;
     70       special_in.setZero();
     71       int idx = 0;
     72       special_in[idx++] = ComplexType(0, 0);
     73       special_in[idx++] = ComplexType(-0, 0);
     74       special_in[idx++] = ComplexType(0, -0);
     75       special_in[idx++] = ComplexType(-0, -0);
     76       // GCC's fallback sqrt implementation fails for inf inputs.
     77       // It is called when _GLIBCXX_USE_C99_COMPLEX is false or if
     78       // clang includes the GCC header (which temporarily disables
     79       // _GLIBCXX_USE_C99_COMPLEX)
     80       #if !defined(_GLIBCXX_COMPLEX) || \
     81         (_GLIBCXX_USE_C99_COMPLEX && !defined(__CLANG_CUDA_WRAPPERS_COMPLEX))
     82       const ValueType inf = std::numeric_limits<ValueType>::infinity();
     83       special_in[idx++] = ComplexType(1.0, inf);
     84       special_in[idx++] = ComplexType(nan, inf);
     85       special_in[idx++] = ComplexType(1.0, -inf);
     86       special_in[idx++] = ComplexType(nan, -inf);
     87       special_in[idx++] = ComplexType(-inf, 1.0);
     88       special_in[idx++] = ComplexType(inf, 1.0);
     89       special_in[idx++] = ComplexType(-inf, -1.0);
     90       special_in[idx++] = ComplexType(inf, -1.0);
     91       special_in[idx++] = ComplexType(-inf, nan);
     92       special_in[idx++] = ComplexType(inf, nan);
     93       #endif
     94       special_in[idx++] = ComplexType(1.0, nan);
     95       special_in[idx++] = ComplexType(nan, 1.0);
     96       special_in[idx++] = ComplexType(nan, -1.0);
     97       special_in[idx++] = ComplexType(nan, nan);
     98       
     99       Map<SpecialInputs> special_out(out);
    100       special_out = special_in.cwiseSqrt();
    101     }
    102     
    103     T x1(in + i);
    104     Map<T> res(out + num_special_inputs + i*T::MaxSizeAtCompileTime);
    105     res = x1.cwiseSqrt();
    106   }
    107 };
    108 
    109 template<typename T>
    110 struct complex_operators {
    111   EIGEN_DEVICE_FUNC
    112   void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const
    113   {
    114     using namespace Eigen;
    115     typedef typename T::Scalar ComplexType;
    116     typedef typename T::Scalar::value_type ValueType;
    117     const int num_scalar_operators = 24;
    118     const int num_vector_operators = 23;  // no unary + operator.
    119     int out_idx = i * (num_scalar_operators + num_vector_operators * T::MaxSizeAtCompileTime);
    120     
    121     // Scalar operators.
    122     const ComplexType a = in[i];
    123     const ComplexType b = in[i + 1];
    124     
    125     out[out_idx++] = +a;
    126     out[out_idx++] = -a;
    127     
    128     out[out_idx++] = a + b;
    129     out[out_idx++] = a + numext::real(b);
    130     out[out_idx++] = numext::real(a) + b;
    131     out[out_idx++] = a - b;
    132     out[out_idx++] = a - numext::real(b);
    133     out[out_idx++] = numext::real(a) - b;
    134     out[out_idx++] = a * b;
    135     out[out_idx++] = a * numext::real(b);
    136     out[out_idx++] = numext::real(a) * b;
    137     out[out_idx++] = a / b;
    138     out[out_idx++] = a / numext::real(b);
    139     out[out_idx++] = numext::real(a) / b;
    140     
    141     out[out_idx] = a; out[out_idx++] += b;
    142     out[out_idx] = a; out[out_idx++] -= b;
    143     out[out_idx] = a; out[out_idx++] *= b;
    144     out[out_idx] = a; out[out_idx++] /= b;
    145     
    146     const ComplexType true_value = ComplexType(ValueType(1), ValueType(0));
    147     const ComplexType false_value = ComplexType(ValueType(0), ValueType(0));
    148     out[out_idx++] = (a == b ? true_value : false_value);
    149     out[out_idx++] = (a == numext::real(b) ? true_value : false_value);
    150     out[out_idx++] = (numext::real(a) == b ? true_value : false_value);
    151     out[out_idx++] = (a != b ? true_value : false_value);
    152     out[out_idx++] = (a != numext::real(b) ? true_value : false_value);
    153     out[out_idx++] = (numext::real(a) != b ? true_value : false_value);
    154     
    155     // Vector versions.
    156     T x1(in + i);
    157     T x2(in + i + 1);
    158     const int res_size = T::MaxSizeAtCompileTime * num_scalar_operators;
    159     const int size = T::MaxSizeAtCompileTime;
    160     int block_idx = 0;
    161     
    162     Map<VectorX<ComplexType>> res(out + out_idx, res_size);
    163     res.segment(block_idx, size) = -x1;
    164     block_idx += size;
    165     
    166     res.segment(block_idx, size) = x1 + x2;
    167     block_idx += size;
    168     res.segment(block_idx, size) = x1 + x2.real();
    169     block_idx += size;
    170     res.segment(block_idx, size) = x1.real() + x2;
    171     block_idx += size;
    172     res.segment(block_idx, size) = x1 - x2;
    173     block_idx += size;
    174     res.segment(block_idx, size) = x1 - x2.real();
    175     block_idx += size;
    176     res.segment(block_idx, size) = x1.real() - x2;
    177     block_idx += size;
    178     res.segment(block_idx, size) = x1.array() * x2.array();
    179     block_idx += size;
    180     res.segment(block_idx, size) = x1.array() * x2.real().array();
    181     block_idx += size;
    182     res.segment(block_idx, size) = x1.real().array() * x2.array();
    183     block_idx += size;
    184     res.segment(block_idx, size) = x1.array() / x2.array();
    185     block_idx += size;
    186     res.segment(block_idx, size) = x1.array() / x2.real().array();
    187     block_idx += size;
    188     res.segment(block_idx, size) = x1.real().array() / x2.array();
    189     block_idx += size;
    190     
    191     res.segment(block_idx, size) = x1; res.segment(block_idx, size) += x2;
    192     block_idx += size;
    193     res.segment(block_idx, size) = x1; res.segment(block_idx, size) -= x2;
    194     block_idx += size;
    195     res.segment(block_idx, size) = x1; res.segment(block_idx, size).array() *= x2.array();
    196     block_idx += size;
    197     res.segment(block_idx, size) = x1; res.segment(block_idx, size).array() /= x2.array();
    198     block_idx += size;
    199 
    200     const T true_vector = T::Constant(true_value);
    201     const T false_vector = T::Constant(false_value);
    202     res.segment(block_idx, size) = (x1 == x2 ? true_vector : false_vector);
    203     block_idx += size;
    204     // Mixing types in equality comparison does not work.
    205     // res.segment(block_idx, size) = (x1 == x2.real() ? true_vector : false_vector);
    206     // block_idx += size;
    207     // res.segment(block_idx, size) = (x1.real() == x2 ? true_vector : false_vector);
    208     // block_idx += size;
    209     res.segment(block_idx, size) = (x1 != x2 ? true_vector : false_vector);
    210     block_idx += size;
    211     // res.segment(block_idx, size) = (x1 != x2.real() ? true_vector : false_vector);
    212     // block_idx += size;
    213     // res.segment(block_idx, size) = (x1.real() != x2 ? true_vector : false_vector);
    214     // block_idx += size;
    215   }
    216 };
    217 
    218 template<typename T>
    219 struct replicate {
    220   EIGEN_DEVICE_FUNC
    221   void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const
    222   {
    223     using namespace Eigen;
    224     T x1(in+i);
    225     int step   = x1.size() * 4;
    226     int stride = 3 * step;
    227     
    228     typedef Map<Array<typename T::Scalar,Dynamic,Dynamic> > MapType;
    229     MapType(out+i*stride+0*step, x1.rows()*2, x1.cols()*2) = x1.replicate(2,2);
    230     MapType(out+i*stride+1*step, x1.rows()*3, x1.cols()) = in[i] * x1.colwise().replicate(3);
    231     MapType(out+i*stride+2*step, x1.rows(), x1.cols()*3) = in[i] * x1.rowwise().replicate(3);
    232   }
    233 };
    234 
    235 template<typename T>
    236 struct alloc_new_delete {
    237   EIGEN_DEVICE_FUNC
    238   void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const
    239   {
    240     int offset = 2*i*T::MaxSizeAtCompileTime;
    241     T* x = new T(in + offset);
    242     Eigen::Map<T> u(out + offset);
    243     u = *x;
    244     delete x;
    245     
    246     offset += T::MaxSizeAtCompileTime;
    247     T* y = new T[1];
    248     y[0] = T(in + offset);
    249     Eigen::Map<T> v(out + offset);
    250     v = y[0];    
    251     delete[] y;
    252   }
    253 };
    254 
    255 template<typename T>
    256 struct redux {
    257   EIGEN_DEVICE_FUNC
    258   void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const
    259   {
    260     using namespace Eigen;
    261     int N = 10;
    262     T x1(in+i);
    263     out[i*N+0] = x1.minCoeff();
    264     out[i*N+1] = x1.maxCoeff();
    265     out[i*N+2] = x1.sum();
    266     out[i*N+3] = x1.prod();
    267     out[i*N+4] = x1.matrix().squaredNorm();
    268     out[i*N+5] = x1.matrix().norm();
    269     out[i*N+6] = x1.colwise().sum().maxCoeff();
    270     out[i*N+7] = x1.rowwise().maxCoeff().sum();
    271     out[i*N+8] = x1.matrix().colwise().squaredNorm().sum();
    272   }
    273 };
    274 
    275 template<typename T1, typename T2>
    276 struct prod_test {
    277   EIGEN_DEVICE_FUNC
    278   void operator()(int i, const typename T1::Scalar* in, typename T1::Scalar* out) const
    279   {
    280     using namespace Eigen;
    281     typedef Matrix<typename T1::Scalar, T1::RowsAtCompileTime, T2::ColsAtCompileTime> T3;
    282     T1 x1(in+i);
    283     T2 x2(in+i+1);
    284     Map<T3> res(out+i*T3::MaxSizeAtCompileTime);
    285     res += in[i] * x1 * x2;
    286   }
    287 };
    288 
    289 template<typename T1, typename T2>
    290 struct diagonal {
    291   EIGEN_DEVICE_FUNC
    292   void operator()(int i, const typename T1::Scalar* in, typename T1::Scalar* out) const
    293   {
    294     using namespace Eigen;
    295     T1 x1(in+i);
    296     Map<T2> res(out+i*T2::MaxSizeAtCompileTime);
    297     res += x1.diagonal();
    298   }
    299 };
    300 
    301 template<typename T>
    302 struct eigenvalues_direct {
    303   EIGEN_DEVICE_FUNC
    304   void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const
    305   {
    306     using namespace Eigen;
    307     typedef Matrix<typename T::Scalar, T::RowsAtCompileTime, 1> Vec;
    308     T M(in+i);
    309     Map<Vec> res(out+i*Vec::MaxSizeAtCompileTime);
    310     T A = M*M.adjoint();
    311     SelfAdjointEigenSolver<T> eig;
    312     eig.computeDirect(A);
    313     res = eig.eigenvalues();
    314   }
    315 };
    316 
    317 template<typename T>
    318 struct eigenvalues {
    319   EIGEN_DEVICE_FUNC
    320   void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const
    321   {
    322     using namespace Eigen;
    323     typedef Matrix<typename T::Scalar, T::RowsAtCompileTime, 1> Vec;
    324     T M(in+i);
    325     Map<Vec> res(out+i*Vec::MaxSizeAtCompileTime);
    326     T A = M*M.adjoint();
    327     SelfAdjointEigenSolver<T> eig;
    328     eig.compute(A);
    329     res = eig.eigenvalues();
    330   }
    331 };
    332 
    333 template<typename T>
    334 struct matrix_inverse {
    335   EIGEN_DEVICE_FUNC
    336   void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const
    337   {
    338     using namespace Eigen;
    339     T M(in+i);
    340     Map<T> res(out+i*T::MaxSizeAtCompileTime);
    341     res = M.inverse();
    342   }
    343 };
    344 
    345 template<typename T>
    346 struct numeric_limits_test {
    347   EIGEN_DEVICE_FUNC
    348   void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const
    349   {
    350     EIGEN_UNUSED_VARIABLE(in)
    351     int out_idx = i * 5;
    352     out[out_idx++] = numext::numeric_limits<float>::epsilon();
    353     out[out_idx++] = (numext::numeric_limits<float>::max)();
    354     out[out_idx++] = (numext::numeric_limits<float>::min)();
    355     out[out_idx++] = numext::numeric_limits<float>::infinity();
    356     out[out_idx++] = numext::numeric_limits<float>::quiet_NaN();
    357   }
    358 };
    359 
    360 template<typename Type1, typename Type2>
    361 bool verifyIsApproxWithInfsNans(const Type1& a, const Type2& b, typename Type1::Scalar* = 0) // Enabled for Eigen's type only
    362 {
    363   if (a.rows() != b.rows()) {
    364     return false;
    365   }
    366   if (a.cols() != b.cols()) {
    367     return false;
    368   }
    369   for (Index r = 0; r < a.rows(); ++r) {
    370     for (Index c = 0; c < a.cols(); ++c) {
    371       if (a(r, c) != b(r, c)
    372           && !((numext::isnan)(a(r, c)) && (numext::isnan)(b(r, c))) 
    373           && !test_isApprox(a(r, c), b(r, c))) {
    374         return false;
    375       }
    376     }
    377   }
    378   return true;
    379 }
    380 
    381 template<typename Kernel, typename Input, typename Output>
    382 void test_with_infs_nans(const Kernel& ker, int n, const Input& in, Output& out)
    383 {
    384   Output out_ref, out_gpu;
    385   #if !defined(EIGEN_GPU_COMPILE_PHASE)
    386   out_ref = out_gpu = out;
    387   #else
    388   EIGEN_UNUSED_VARIABLE(in);
    389   EIGEN_UNUSED_VARIABLE(out);
    390   #endif
    391   run_on_cpu (ker, n, in,  out_ref);
    392   run_on_gpu(ker, n, in, out_gpu);
    393   #if !defined(EIGEN_GPU_COMPILE_PHASE)
    394   verifyIsApproxWithInfsNans(out_ref, out_gpu);
    395   #endif
    396 }
    397 
    398 EIGEN_DECLARE_TEST(gpu_basic)
    399 {
    400   ei_test_init_gpu();
    401   
    402   int nthreads = 100;
    403   Eigen::VectorXf in, out;
    404   Eigen::VectorXcf cfin, cfout;
    405   
    406   #if !defined(EIGEN_GPU_COMPILE_PHASE)
    407   int data_size = nthreads * 512;
    408   in.setRandom(data_size);
    409   out.setConstant(data_size, -1);
    410   cfin.setRandom(data_size);
    411   cfout.setConstant(data_size, -1);
    412   #endif
    413   
    414   CALL_SUBTEST( run_and_compare_to_gpu(coeff_wise<Vector3f>(), nthreads, in, out) );
    415   CALL_SUBTEST( run_and_compare_to_gpu(coeff_wise<Array44f>(), nthreads, in, out) );
    416 
    417 #if !defined(EIGEN_USE_HIP)
    418   // FIXME
    419   // These subtests result in a compile failure on the HIP platform
    420   //
    421   //  eigen-upstream/Eigen/src/Core/Replicate.h:61:65: error:
    422   //           base class 'internal::dense_xpr_base<Replicate<Array<float, 4, 1, 0, 4, 1>, -1, -1> >::type'
    423   //           (aka 'ArrayBase<Eigen::Replicate<Eigen::Array<float, 4, 1, 0, 4, 1>, -1, -1> >') has protected default constructor
    424   CALL_SUBTEST( run_and_compare_to_gpu(replicate<Array4f>(), nthreads, in, out) );
    425   CALL_SUBTEST( run_and_compare_to_gpu(replicate<Array33f>(), nthreads, in, out) );
    426 
    427   // HIP does not support new/delete on device.
    428   CALL_SUBTEST( run_and_compare_to_gpu(alloc_new_delete<Vector3f>(), nthreads, in, out) );
    429 #endif
    430   
    431   CALL_SUBTEST( run_and_compare_to_gpu(redux<Array4f>(), nthreads, in, out) );
    432   CALL_SUBTEST( run_and_compare_to_gpu(redux<Matrix3f>(), nthreads, in, out) );
    433   
    434   CALL_SUBTEST( run_and_compare_to_gpu(prod_test<Matrix3f,Matrix3f>(), nthreads, in, out) );
    435   CALL_SUBTEST( run_and_compare_to_gpu(prod_test<Matrix4f,Vector4f>(), nthreads, in, out) );
    436   
    437   CALL_SUBTEST( run_and_compare_to_gpu(diagonal<Matrix3f,Vector3f>(), nthreads, in, out) );
    438   CALL_SUBTEST( run_and_compare_to_gpu(diagonal<Matrix4f,Vector4f>(), nthreads, in, out) );
    439 
    440   CALL_SUBTEST( run_and_compare_to_gpu(matrix_inverse<Matrix2f>(), nthreads, in, out) );
    441   CALL_SUBTEST( run_and_compare_to_gpu(matrix_inverse<Matrix3f>(), nthreads, in, out) );
    442   CALL_SUBTEST( run_and_compare_to_gpu(matrix_inverse<Matrix4f>(), nthreads, in, out) );
    443   
    444   CALL_SUBTEST( run_and_compare_to_gpu(eigenvalues_direct<Matrix3f>(), nthreads, in, out) );
    445   CALL_SUBTEST( run_and_compare_to_gpu(eigenvalues_direct<Matrix2f>(), nthreads, in, out) );
    446 
    447   // Test std::complex.
    448   CALL_SUBTEST( run_and_compare_to_gpu(complex_operators<Vector3cf>(), nthreads, cfin, cfout) );
    449   CALL_SUBTEST( test_with_infs_nans(complex_sqrt<Vector3cf>(), nthreads, cfin, cfout) );
    450 
    451   // numeric_limits
    452   CALL_SUBTEST( test_with_infs_nans(numeric_limits_test<Vector3f>(), 1, in, out) );
    453 
    454 #if defined(__NVCC__)
    455   // FIXME
    456   // These subtests compiles only with nvcc and fail with HIPCC and clang-cuda
    457   CALL_SUBTEST( run_and_compare_to_gpu(eigenvalues<Matrix4f>(), nthreads, in, out) );
    458   typedef Matrix<float,6,6> Matrix6f;
    459   CALL_SUBTEST( run_and_compare_to_gpu(eigenvalues<Matrix6f>(), nthreads, in, out) );
    460 #endif
    461 }