cart-elc

Source code for CART-ELC
git clone git://git.laack.co/cart-elc.git
Log | Files | Refs | README | LICENSE

InverseSize4.h (13693B)


      1 // This file is part of Eigen, a lightweight C++ template library
      2 // for linear algebra.
      3 //
      4 // Copyright (C) 2001 Intel Corporation
      5 // Copyright (C) 2010 Gael Guennebaud <gael.guennebaud@inria.fr>
      6 // Copyright (C) 2009 Benoit Jacob <jacob.benoit.1@gmail.com>
      7 //
      8 // This Source Code Form is subject to the terms of the Mozilla
      9 // Public License v. 2.0. If a copy of the MPL was not distributed
     10 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
     11 //
     12 // The algorithm below is a reimplementation of former \src\LU\Inverse_SSE.h using PacketMath.
     13 // inv(M) = M#/|M|, where inv(M), M# and |M| denote the inverse of M,
     14 // adjugate of M and determinant of M respectively. M# is computed block-wise
     15 // using specific formulae. For proof, see:
     16 // https://lxjk.github.io/2017/09/03/Fast-4x4-Matrix-Inverse-with-SSE-SIMD-Explained.html
     17 // Variable names are adopted from \src\LU\Inverse_SSE.h.
     18 //
     19 // The SSE code for the 4x4 float and double matrix inverse in former (deprecated) \src\LU\Inverse_SSE.h
     20 // comes from the following Intel's library:
     21 // http://software.intel.com/en-us/articles/optimized-matrix-library-for-use-with-the-intel-pentiumr-4-processors-sse2-instructions/
     22 //
     23 // Here is the respective copyright and license statement:
     24 //
     25 //   Copyright (c) 2001 Intel Corporation.
     26 //
     27 // Permition is granted to use, copy, distribute and prepare derivative works
     28 // of this library for any purpose and without fee, provided, that the above
     29 // copyright notice and this statement appear in all copies.
     30 // Intel makes no representations about the suitability of this software for
     31 // any purpose, and specifically disclaims all warranties.
     32 // See LEGAL.TXT for all the legal information.
     33 //
     34 // TODO: Unify implementations of different data types (i.e. float and double).
     35 #ifndef EIGEN_INVERSE_SIZE_4_H
     36 #define EIGEN_INVERSE_SIZE_4_H
     37 
     38 namespace Eigen
     39 {
     40 namespace internal
     41 {
     42 template <typename MatrixType, typename ResultType>
     43 struct compute_inverse_size4<Architecture::Target, float, MatrixType, ResultType>
     44 {
     45   enum
     46   {
     47     MatrixAlignment = traits<MatrixType>::Alignment,
     48     ResultAlignment = traits<ResultType>::Alignment,
     49     StorageOrdersMatch = (MatrixType::Flags & RowMajorBit) == (ResultType::Flags & RowMajorBit)
     50   };
     51   typedef typename conditional<(MatrixType::Flags & LinearAccessBit), MatrixType const &, typename MatrixType::PlainObject>::type ActualMatrixType;
     52 
     53   static void run(const MatrixType &mat, ResultType &result)
     54   {
     55     ActualMatrixType matrix(mat);
     56 
     57     const float* data = matrix.data();
     58     const Index stride = matrix.innerStride();
     59     Packet4f _L1 = ploadt<Packet4f,MatrixAlignment>(data);
     60     Packet4f _L2 = ploadt<Packet4f,MatrixAlignment>(data + stride*4);
     61     Packet4f _L3 = ploadt<Packet4f,MatrixAlignment>(data + stride*8);
     62     Packet4f _L4 = ploadt<Packet4f,MatrixAlignment>(data + stride*12);
     63 
     64     // Four 2x2 sub-matrices of the input matrix
     65     // input = [[A, B],
     66     //          [C, D]]
     67     Packet4f A, B, C, D;
     68 
     69     if (!StorageOrdersMatch)
     70     {
     71       A = vec4f_unpacklo(_L1, _L2);
     72       B = vec4f_unpacklo(_L3, _L4);
     73       C = vec4f_unpackhi(_L1, _L2);
     74       D = vec4f_unpackhi(_L3, _L4);
     75     }
     76     else
     77     {
     78       A = vec4f_movelh(_L1, _L2);
     79       B = vec4f_movehl(_L2, _L1);
     80       C = vec4f_movelh(_L3, _L4);
     81       D = vec4f_movehl(_L4, _L3);
     82     }
     83 
     84     Packet4f AB, DC;
     85 
     86     // AB = A# * B, where A# denotes the adjugate of A, and * denotes matrix product.
     87     AB = pmul(vec4f_swizzle2(A, A, 3, 3, 0, 0), B);
     88     AB = psub(AB, pmul(vec4f_swizzle2(A, A, 1, 1, 2, 2), vec4f_swizzle2(B, B, 2, 3, 0, 1)));
     89 
     90     // DC = D#*C
     91     DC = pmul(vec4f_swizzle2(D, D, 3, 3, 0, 0), C);
     92     DC = psub(DC, pmul(vec4f_swizzle2(D, D, 1, 1, 2, 2), vec4f_swizzle2(C, C, 2, 3, 0, 1)));
     93 
     94     // determinants of the sub-matrices
     95     Packet4f dA, dB, dC, dD;
     96 
     97     dA = pmul(vec4f_swizzle2(A, A, 3, 3, 1, 1), A);
     98     dA = psub(dA, vec4f_movehl(dA, dA));
     99 
    100     dB = pmul(vec4f_swizzle2(B, B, 3, 3, 1, 1), B);
    101     dB = psub(dB, vec4f_movehl(dB, dB));
    102 
    103     dC = pmul(vec4f_swizzle2(C, C, 3, 3, 1, 1), C);
    104     dC = psub(dC, vec4f_movehl(dC, dC));
    105 
    106     dD = pmul(vec4f_swizzle2(D, D, 3, 3, 1, 1), D);
    107     dD = psub(dD, vec4f_movehl(dD, dD));
    108 
    109     Packet4f d, d1, d2;
    110 
    111     d = pmul(vec4f_swizzle2(DC, DC, 0, 2, 1, 3), AB);
    112     d = padd(d, vec4f_movehl(d, d));
    113     d = padd(d, vec4f_swizzle2(d, d, 1, 0, 0, 0));
    114     d1 = pmul(dA, dD);
    115     d2 = pmul(dB, dC);
    116 
    117     // determinant of the input matrix, det = |A||D| + |B||C| - trace(A#*B*D#*C)
    118     Packet4f det = vec4f_duplane(psub(padd(d1, d2), d), 0);
    119 
    120     // reciprocal of the determinant of the input matrix, rd = 1/det
    121     Packet4f rd = pdiv(pset1<Packet4f>(1.0f), det);
    122 
    123     // Four sub-matrices of the inverse
    124     Packet4f iA, iB, iC, iD;
    125 
    126     // iD = D*|A| - C*A#*B
    127     iD = pmul(vec4f_swizzle2(C, C, 0, 0, 2, 2), vec4f_movelh(AB, AB));
    128     iD = padd(iD, pmul(vec4f_swizzle2(C, C, 1, 1, 3, 3), vec4f_movehl(AB, AB)));
    129     iD = psub(pmul(D, vec4f_duplane(dA, 0)), iD);
    130 
    131     // iA = A*|D| - B*D#*C
    132     iA = pmul(vec4f_swizzle2(B, B, 0, 0, 2, 2), vec4f_movelh(DC, DC));
    133     iA = padd(iA, pmul(vec4f_swizzle2(B, B, 1, 1, 3, 3), vec4f_movehl(DC, DC)));
    134     iA = psub(pmul(A, vec4f_duplane(dD, 0)), iA);
    135 
    136     // iB = C*|B| - D * (A#B)# = C*|B| - D*B#*A
    137     iB = pmul(D, vec4f_swizzle2(AB, AB, 3, 0, 3, 0));
    138     iB = psub(iB, pmul(vec4f_swizzle2(D, D, 1, 0, 3, 2), vec4f_swizzle2(AB, AB, 2, 1, 2, 1)));
    139     iB = psub(pmul(C, vec4f_duplane(dB, 0)), iB);
    140 
    141     // iC = B*|C| - A * (D#C)# = B*|C| - A*C#*D
    142     iC = pmul(A, vec4f_swizzle2(DC, DC, 3, 0, 3, 0));
    143     iC = psub(iC, pmul(vec4f_swizzle2(A, A, 1, 0, 3, 2), vec4f_swizzle2(DC, DC, 2, 1, 2, 1)));
    144     iC = psub(pmul(B, vec4f_duplane(dC, 0)), iC);
    145 
    146     const float sign_mask[4] = {0.0f, numext::bit_cast<float>(0x80000000u), numext::bit_cast<float>(0x80000000u), 0.0f};
    147     const Packet4f p4f_sign_PNNP = ploadu<Packet4f>(sign_mask);
    148     rd = pxor(rd, p4f_sign_PNNP);
    149     iA = pmul(iA, rd);
    150     iB = pmul(iB, rd);
    151     iC = pmul(iC, rd);
    152     iD = pmul(iD, rd);
    153 
    154     Index res_stride = result.outerStride();
    155     float *res = result.data();
    156 
    157     pstoret<float, Packet4f, ResultAlignment>(res + 0, vec4f_swizzle2(iA, iB, 3, 1, 3, 1));
    158     pstoret<float, Packet4f, ResultAlignment>(res + res_stride, vec4f_swizzle2(iA, iB, 2, 0, 2, 0));
    159     pstoret<float, Packet4f, ResultAlignment>(res + 2 * res_stride, vec4f_swizzle2(iC, iD, 3, 1, 3, 1));
    160     pstoret<float, Packet4f, ResultAlignment>(res + 3 * res_stride, vec4f_swizzle2(iC, iD, 2, 0, 2, 0));
    161   }
    162 };
    163 
    164 #if !(defined EIGEN_VECTORIZE_NEON && !(EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG))
    165 // same algorithm as above, except that each operand is split into
    166 // halves for two registers to hold.
    167 template <typename MatrixType, typename ResultType>
    168 struct compute_inverse_size4<Architecture::Target, double, MatrixType, ResultType>
    169 {
    170   enum
    171   {
    172     MatrixAlignment = traits<MatrixType>::Alignment,
    173     ResultAlignment = traits<ResultType>::Alignment,
    174     StorageOrdersMatch = (MatrixType::Flags & RowMajorBit) == (ResultType::Flags & RowMajorBit)
    175   };
    176   typedef typename conditional<(MatrixType::Flags & LinearAccessBit),
    177                                MatrixType const &,
    178                                typename MatrixType::PlainObject>::type
    179       ActualMatrixType;
    180 
    181   static void run(const MatrixType &mat, ResultType &result)
    182   {
    183     ActualMatrixType matrix(mat);
    184 
    185     // Four 2x2 sub-matrices of the input matrix, each is further divided into upper and lower
    186     // row e.g. A1, upper row of A, A2, lower row of A
    187     // input = [[A, B],  =  [[[A1, [B1,
    188     //          [C, D]]        A2], B2]],
    189     //                       [[C1, [D1,
    190     //                         C2], D2]]]
    191 
    192     Packet2d A1, A2, B1, B2, C1, C2, D1, D2;
    193 
    194     const double* data = matrix.data();
    195     const Index stride = matrix.innerStride();
    196     if (StorageOrdersMatch)
    197     {
    198       A1 = ploadt<Packet2d,MatrixAlignment>(data + stride*0);
    199       B1 = ploadt<Packet2d,MatrixAlignment>(data + stride*2);
    200       A2 = ploadt<Packet2d,MatrixAlignment>(data + stride*4);
    201       B2 = ploadt<Packet2d,MatrixAlignment>(data + stride*6);
    202       C1 = ploadt<Packet2d,MatrixAlignment>(data + stride*8);
    203       D1 = ploadt<Packet2d,MatrixAlignment>(data + stride*10);
    204       C2 = ploadt<Packet2d,MatrixAlignment>(data + stride*12);
    205       D2 = ploadt<Packet2d,MatrixAlignment>(data + stride*14);
    206     }
    207     else
    208     {
    209       Packet2d temp;
    210       A1 = ploadt<Packet2d,MatrixAlignment>(data + stride*0);
    211       C1 = ploadt<Packet2d,MatrixAlignment>(data + stride*2);
    212       A2 = ploadt<Packet2d,MatrixAlignment>(data + stride*4);
    213       C2 = ploadt<Packet2d,MatrixAlignment>(data + stride*6);
    214       temp = A1;
    215       A1 = vec2d_unpacklo(A1, A2);
    216       A2 = vec2d_unpackhi(temp, A2);
    217 
    218       temp = C1;
    219       C1 = vec2d_unpacklo(C1, C2);
    220       C2 = vec2d_unpackhi(temp, C2);
    221 
    222       B1 = ploadt<Packet2d,MatrixAlignment>(data + stride*8);
    223       D1 = ploadt<Packet2d,MatrixAlignment>(data + stride*10);
    224       B2 = ploadt<Packet2d,MatrixAlignment>(data + stride*12);
    225       D2 = ploadt<Packet2d,MatrixAlignment>(data + stride*14);
    226 
    227       temp = B1;
    228       B1 = vec2d_unpacklo(B1, B2);
    229       B2 = vec2d_unpackhi(temp, B2);
    230 
    231       temp = D1;
    232       D1 = vec2d_unpacklo(D1, D2);
    233       D2 = vec2d_unpackhi(temp, D2);
    234     }
    235 
    236     // determinants of the sub-matrices
    237     Packet2d dA, dB, dC, dD;
    238 
    239     dA = vec2d_swizzle2(A2, A2, 1);
    240     dA = pmul(A1, dA);
    241     dA = psub(dA, vec2d_duplane(dA, 1));
    242 
    243     dB = vec2d_swizzle2(B2, B2, 1);
    244     dB = pmul(B1, dB);
    245     dB = psub(dB, vec2d_duplane(dB, 1));
    246 
    247     dC = vec2d_swizzle2(C2, C2, 1);
    248     dC = pmul(C1, dC);
    249     dC = psub(dC, vec2d_duplane(dC, 1));
    250 
    251     dD = vec2d_swizzle2(D2, D2, 1);
    252     dD = pmul(D1, dD);
    253     dD = psub(dD, vec2d_duplane(dD, 1));
    254 
    255     Packet2d DC1, DC2, AB1, AB2;
    256 
    257     // AB = A# * B, where A# denotes the adjugate of A, and * denotes matrix product.
    258     AB1 = pmul(B1, vec2d_duplane(A2, 1));
    259     AB2 = pmul(B2, vec2d_duplane(A1, 0));
    260     AB1 = psub(AB1, pmul(B2, vec2d_duplane(A1, 1)));
    261     AB2 = psub(AB2, pmul(B1, vec2d_duplane(A2, 0)));
    262 
    263     // DC = D#*C
    264     DC1 = pmul(C1, vec2d_duplane(D2, 1));
    265     DC2 = pmul(C2, vec2d_duplane(D1, 0));
    266     DC1 = psub(DC1, pmul(C2, vec2d_duplane(D1, 1)));
    267     DC2 = psub(DC2, pmul(C1, vec2d_duplane(D2, 0)));
    268 
    269     Packet2d d1, d2;
    270 
    271     // determinant of the input matrix, det = |A||D| + |B||C| - trace(A#*B*D#*C)
    272     Packet2d det;
    273 
    274     // reciprocal of the determinant of the input matrix, rd = 1/det
    275     Packet2d rd;
    276 
    277     d1 = pmul(AB1, vec2d_swizzle2(DC1, DC2, 0));
    278     d2 = pmul(AB2, vec2d_swizzle2(DC1, DC2, 3));
    279     rd = padd(d1, d2);
    280     rd = padd(rd, vec2d_duplane(rd, 1));
    281 
    282     d1 = pmul(dA, dD);
    283     d2 = pmul(dB, dC);
    284 
    285     det = padd(d1, d2);
    286     det = psub(det, rd);
    287     det = vec2d_duplane(det, 0);
    288     rd = pdiv(pset1<Packet2d>(1.0), det);
    289 
    290     // rows of four sub-matrices of the inverse
    291     Packet2d iA1, iA2, iB1, iB2, iC1, iC2, iD1, iD2;
    292 
    293     // iD = D*|A| - C*A#*B
    294     iD1 = pmul(AB1, vec2d_duplane(C1, 0));
    295     iD2 = pmul(AB1, vec2d_duplane(C2, 0));
    296     iD1 = padd(iD1, pmul(AB2, vec2d_duplane(C1, 1)));
    297     iD2 = padd(iD2, pmul(AB2, vec2d_duplane(C2, 1)));
    298     dA = vec2d_duplane(dA, 0);
    299     iD1 = psub(pmul(D1, dA), iD1);
    300     iD2 = psub(pmul(D2, dA), iD2);
    301 
    302     // iA = A*|D| - B*D#*C
    303     iA1 = pmul(DC1, vec2d_duplane(B1, 0));
    304     iA2 = pmul(DC1, vec2d_duplane(B2, 0));
    305     iA1 = padd(iA1, pmul(DC2, vec2d_duplane(B1, 1)));
    306     iA2 = padd(iA2, pmul(DC2, vec2d_duplane(B2, 1)));
    307     dD = vec2d_duplane(dD, 0);
    308     iA1 = psub(pmul(A1, dD), iA1);
    309     iA2 = psub(pmul(A2, dD), iA2);
    310 
    311     // iB = C*|B| - D * (A#B)# = C*|B| - D*B#*A
    312     iB1 = pmul(D1, vec2d_swizzle2(AB2, AB1, 1));
    313     iB2 = pmul(D2, vec2d_swizzle2(AB2, AB1, 1));
    314     iB1 = psub(iB1, pmul(vec2d_swizzle2(D1, D1, 1), vec2d_swizzle2(AB2, AB1, 2)));
    315     iB2 = psub(iB2, pmul(vec2d_swizzle2(D2, D2, 1), vec2d_swizzle2(AB2, AB1, 2)));
    316     dB = vec2d_duplane(dB, 0);
    317     iB1 = psub(pmul(C1, dB), iB1);
    318     iB2 = psub(pmul(C2, dB), iB2);
    319 
    320     // iC = B*|C| - A * (D#C)# = B*|C| - A*C#*D
    321     iC1 = pmul(A1, vec2d_swizzle2(DC2, DC1, 1));
    322     iC2 = pmul(A2, vec2d_swizzle2(DC2, DC1, 1));
    323     iC1 = psub(iC1, pmul(vec2d_swizzle2(A1, A1, 1), vec2d_swizzle2(DC2, DC1, 2)));
    324     iC2 = psub(iC2, pmul(vec2d_swizzle2(A2, A2, 1), vec2d_swizzle2(DC2, DC1, 2)));
    325     dC = vec2d_duplane(dC, 0);
    326     iC1 = psub(pmul(B1, dC), iC1);
    327     iC2 = psub(pmul(B2, dC), iC2);
    328 
    329     const double sign_mask1[2] = {0.0, numext::bit_cast<double>(0x8000000000000000ull)};
    330     const double sign_mask2[2] = {numext::bit_cast<double>(0x8000000000000000ull), 0.0};
    331     const Packet2d sign_PN = ploadu<Packet2d>(sign_mask1);
    332     const Packet2d sign_NP = ploadu<Packet2d>(sign_mask2);
    333     d1 = pxor(rd, sign_PN);
    334     d2 = pxor(rd, sign_NP);
    335 
    336     Index res_stride = result.outerStride();
    337     double *res = result.data();
    338     pstoret<double, Packet2d, ResultAlignment>(res + 0, pmul(vec2d_swizzle2(iA2, iA1, 3), d1));
    339     pstoret<double, Packet2d, ResultAlignment>(res + res_stride, pmul(vec2d_swizzle2(iA2, iA1, 0), d2));
    340     pstoret<double, Packet2d, ResultAlignment>(res + 2, pmul(vec2d_swizzle2(iB2, iB1, 3), d1));
    341     pstoret<double, Packet2d, ResultAlignment>(res + res_stride + 2, pmul(vec2d_swizzle2(iB2, iB1, 0), d2));
    342     pstoret<double, Packet2d, ResultAlignment>(res + 2 * res_stride, pmul(vec2d_swizzle2(iC2, iC1, 3), d1));
    343     pstoret<double, Packet2d, ResultAlignment>(res + 3 * res_stride, pmul(vec2d_swizzle2(iC2, iC1, 0), d2));
    344     pstoret<double, Packet2d, ResultAlignment>(res + 2 * res_stride + 2, pmul(vec2d_swizzle2(iD2, iD1, 3), d1));
    345     pstoret<double, Packet2d, ResultAlignment>(res + 3 * res_stride + 2, pmul(vec2d_swizzle2(iD2, iD1, 0), d2));
    346   }
    347 };
    348 #endif
    349 } // namespace internal
    350 } // namespace Eigen
    351 #endif