btMatrix3x3.h

Go to the documentation of this file.
00001 /*
00002 Copyright (c) 2003-2006 Gino van den Bergen / Erwin Coumans  http://continuousphysics.com/Bullet/
00003 
00004 This software is provided 'as-is', without any express or implied warranty.
00005 In no event will the authors be held liable for any damages arising from the use of this software.
00006 Permission is granted to anyone to use this software for any purpose, 
00007 including commercial applications, and to alter it and redistribute it freely, 
00008 subject to the following restrictions:
00009 
00010 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
00011 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
00012 3. This notice may not be removed or altered from any source distribution.
00013 */
00014 
00015 
00016 #ifndef BT_MATRIX3x3_H
00017 #define BT_MATRIX3x3_H
00018 
00019 #include "btVector3.h"
00020 #include "btQuaternion.h"
00021 #include <stdio.h>
00022 
00023 #ifdef BT_USE_SSE
00024 //const __m128 ATTRIBUTE_ALIGNED16(v2220) = {2.0f, 2.0f, 2.0f, 0.0f};
00025 const __m128 ATTRIBUTE_ALIGNED16(vMPPP) = {-0.0f, +0.0f, +0.0f, +0.0f};
00026 #endif
00027 
00028 #if defined(BT_USE_SSE) || defined(BT_USE_NEON)
00029 const btSimdFloat4 ATTRIBUTE_ALIGNED16(v1000) = {1.0f, 0.0f, 0.0f, 0.0f};
00030 const btSimdFloat4 ATTRIBUTE_ALIGNED16(v0100) = {0.0f, 1.0f, 0.0f, 0.0f};
00031 const btSimdFloat4 ATTRIBUTE_ALIGNED16(v0010) = {0.0f, 0.0f, 1.0f, 0.0f};
00032 #endif
00033 
00034 #ifdef BT_USE_DOUBLE_PRECISION
00035 #define btMatrix3x3Data btMatrix3x3DoubleData 
00036 #else
00037 #define btMatrix3x3Data btMatrix3x3FloatData
00038 #endif //BT_USE_DOUBLE_PRECISION
00039 
00040 
00043 ATTRIBUTE_ALIGNED16(class) btMatrix3x3 {
00044 
00046         btVector3 m_el[3];
00047 
00048 public:
00050         btMatrix3x3 () {}
00051 
00052         //              explicit btMatrix3x3(const btScalar *m) { setFromOpenGLSubMatrix(m); }
00053 
00055         explicit btMatrix3x3(const btQuaternion& q) { setRotation(q); }
00056         /*
00057         template <typename btScalar>
00058         Matrix3x3(const btScalar& yaw, const btScalar& pitch, const btScalar& roll)
00059         { 
00060         setEulerYPR(yaw, pitch, roll);
00061         }
00062         */
00064         btMatrix3x3(const btScalar& xx, const btScalar& xy, const btScalar& xz,
00065                 const btScalar& yx, const btScalar& yy, const btScalar& yz,
00066                 const btScalar& zx, const btScalar& zy, const btScalar& zz)
00067         { 
00068                 setValue(xx, xy, xz, 
00069                         yx, yy, yz, 
00070                         zx, zy, zz);
00071         }
00072 
00073 #if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))|| defined (BT_USE_NEON)
00074         SIMD_FORCE_INLINE btMatrix3x3 (const btSimdFloat4 v0, const btSimdFloat4 v1, const btSimdFloat4 v2 ) 
00075         {
00076         m_el[0].mVec128 = v0;
00077         m_el[1].mVec128 = v1;
00078         m_el[2].mVec128 = v2;
00079         }
00080 
00081         SIMD_FORCE_INLINE btMatrix3x3 (const btVector3& v0, const btVector3& v1, const btVector3& v2 ) 
00082         {
00083         m_el[0] = v0;
00084         m_el[1] = v1;
00085         m_el[2] = v2;
00086         }
00087 
00088         // Copy constructor
00089         SIMD_FORCE_INLINE btMatrix3x3(const btMatrix3x3& rhs)
00090         {
00091                 m_el[0].mVec128 = rhs.m_el[0].mVec128;
00092                 m_el[1].mVec128 = rhs.m_el[1].mVec128;
00093                 m_el[2].mVec128 = rhs.m_el[2].mVec128;
00094         }
00095 
00096         // Assignment Operator
00097         SIMD_FORCE_INLINE btMatrix3x3& operator=(const btMatrix3x3& m) 
00098         {
00099                 m_el[0].mVec128 = m.m_el[0].mVec128;
00100                 m_el[1].mVec128 = m.m_el[1].mVec128;
00101                 m_el[2].mVec128 = m.m_el[2].mVec128;
00102                 
00103                 return *this;
00104         }
00105 
00106 #else
00107 
00109         SIMD_FORCE_INLINE btMatrix3x3 (const btMatrix3x3& other)
00110         {
00111                 m_el[0] = other.m_el[0];
00112                 m_el[1] = other.m_el[1];
00113                 m_el[2] = other.m_el[2];
00114         }
00115     
00117         SIMD_FORCE_INLINE btMatrix3x3& operator=(const btMatrix3x3& other)
00118         {
00119                 m_el[0] = other.m_el[0];
00120                 m_el[1] = other.m_el[1];
00121                 m_el[2] = other.m_el[2];
00122                 return *this;
00123         }
00124 
00125 #endif
00126 
00129         SIMD_FORCE_INLINE btVector3 getColumn(int i) const
00130         {
00131                 return btVector3(m_el[0][i],m_el[1][i],m_el[2][i]);
00132         }
00133 
00134 
00137         SIMD_FORCE_INLINE const btVector3& getRow(int i) const
00138         {
00139                 btFullAssert(0 <= i && i < 3);
00140                 return m_el[i];
00141         }
00142 
00145         SIMD_FORCE_INLINE btVector3&  operator[](int i)
00146         { 
00147                 btFullAssert(0 <= i && i < 3);
00148                 return m_el[i]; 
00149         }
00150 
00153         SIMD_FORCE_INLINE const btVector3& operator[](int i) const
00154         {
00155                 btFullAssert(0 <= i && i < 3);
00156                 return m_el[i]; 
00157         }
00158 
00162         btMatrix3x3& operator*=(const btMatrix3x3& m); 
00163 
00167         btMatrix3x3& operator+=(const btMatrix3x3& m); 
00168 
00172         btMatrix3x3& operator-=(const btMatrix3x3& m); 
00173 
00176         void setFromOpenGLSubMatrix(const btScalar *m)
00177         {
00178                 m_el[0].setValue(m[0],m[4],m[8]);
00179                 m_el[1].setValue(m[1],m[5],m[9]);
00180                 m_el[2].setValue(m[2],m[6],m[10]);
00181 
00182         }
00193         void setValue(const btScalar& xx, const btScalar& xy, const btScalar& xz, 
00194                 const btScalar& yx, const btScalar& yy, const btScalar& yz, 
00195                 const btScalar& zx, const btScalar& zy, const btScalar& zz)
00196         {
00197                 m_el[0].setValue(xx,xy,xz);
00198                 m_el[1].setValue(yx,yy,yz);
00199                 m_el[2].setValue(zx,zy,zz);
00200         }
00201 
00204         void setRotation(const btQuaternion& q) 
00205         {
00206                 btScalar d = q.length2();
00207                 btFullAssert(d != btScalar(0.0));
00208                 btScalar s = btScalar(2.0) / d;
00209     
00210     #if defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
00211         __m128  vs, Q = q.get128();
00212                 __m128i Qi = btCastfTo128i(Q);
00213         __m128  Y, Z;
00214         __m128  V1, V2, V3;
00215         __m128  V11, V21, V31;
00216         __m128  NQ = _mm_xor_ps(Q, btvMzeroMask);
00217                 __m128i NQi = btCastfTo128i(NQ);
00218         
00219         V1 = btCastiTo128f(_mm_shuffle_epi32 (Qi, BT_SHUFFLE(1,0,2,3)));        // Y X Z W
00220                 V2 = _mm_shuffle_ps(NQ, Q, BT_SHUFFLE(0,0,1,3));     // -X -X  Y  W
00221         V3 = btCastiTo128f(_mm_shuffle_epi32 (Qi, BT_SHUFFLE(2,1,0,3)));        // Z Y X W
00222         V1 = _mm_xor_ps(V1, vMPPP);     //      change the sign of the first element
00223                         
00224         V11     = btCastiTo128f(_mm_shuffle_epi32 (Qi, BT_SHUFFLE(1,1,0,3)));   // Y Y X W
00225                 V21 = _mm_unpackhi_ps(Q, Q);                    //  Z  Z  W  W
00226                 V31 = _mm_shuffle_ps(Q, NQ, BT_SHUFFLE(0,2,0,3));       //  X  Z -X -W
00227 
00228                 V2 = V2 * V1;   //
00229                 V1 = V1 * V11;  //
00230                 V3 = V3 * V31;  //
00231 
00232         V11 = _mm_shuffle_ps(NQ, Q, BT_SHUFFLE(2,3,1,3));       //      -Z -W  Y  W
00233                 V11 = V11 * V21;        //
00234         V21 = _mm_xor_ps(V21, vMPPP);   //      change the sign of the first element
00235                 V31 = _mm_shuffle_ps(Q, NQ, BT_SHUFFLE(3,3,1,3));       //       W  W -Y -W
00236         V31 = _mm_xor_ps(V31, vMPPP);   //      change the sign of the first element
00237                 Y = btCastiTo128f(_mm_shuffle_epi32 (NQi, BT_SHUFFLE(3,2,0,3)));        // -W -Z -X -W
00238                 Z = btCastiTo128f(_mm_shuffle_epi32 (Qi, BT_SHUFFLE(1,0,1,3))); //  Y  X  Y  W
00239 
00240                 vs = _mm_load_ss(&s);
00241                 V21 = V21 * Y;
00242                 V31 = V31 * Z;
00243 
00244                 V1 = V1 + V11;
00245         V2 = V2 + V21;
00246         V3 = V3 + V31;
00247 
00248         vs = bt_splat3_ps(vs, 0);
00249             //  s ready
00250         V1 = V1 * vs;
00251         V2 = V2 * vs;
00252         V3 = V3 * vs;
00253         
00254         V1 = V1 + v1000;
00255         V2 = V2 + v0100;
00256         V3 = V3 + v0010;
00257         
00258         m_el[0] = V1; 
00259         m_el[1] = V2;
00260         m_el[2] = V3;
00261     #else    
00262                 btScalar xs = q.x() * s,   ys = q.y() * s,   zs = q.z() * s;
00263                 btScalar wx = q.w() * xs,  wy = q.w() * ys,  wz = q.w() * zs;
00264                 btScalar xx = q.x() * xs,  xy = q.x() * ys,  xz = q.x() * zs;
00265                 btScalar yy = q.y() * ys,  yz = q.y() * zs,  zz = q.z() * zs;
00266                 setValue(
00267             btScalar(1.0) - (yy + zz), xy - wz, xz + wy,
00268                         xy + wz, btScalar(1.0) - (xx + zz), yz - wx,
00269                         xz - wy, yz + wx, btScalar(1.0) - (xx + yy));
00270         #endif
00271     }
00272 
00273 
00279         void setEulerYPR(const btScalar& yaw, const btScalar& pitch, const btScalar& roll) 
00280         {
00281                 setEulerZYX(roll, pitch, yaw);
00282         }
00283 
00293         void setEulerZYX(btScalar eulerX,btScalar eulerY,btScalar eulerZ) { 
00295                 btScalar ci ( btCos(eulerX)); 
00296                 btScalar cj ( btCos(eulerY)); 
00297                 btScalar ch ( btCos(eulerZ)); 
00298                 btScalar si ( btSin(eulerX)); 
00299                 btScalar sj ( btSin(eulerY)); 
00300                 btScalar sh ( btSin(eulerZ)); 
00301                 btScalar cc = ci * ch; 
00302                 btScalar cs = ci * sh; 
00303                 btScalar sc = si * ch; 
00304                 btScalar ss = si * sh;
00305 
00306                 setValue(cj * ch, sj * sc - cs, sj * cc + ss,
00307                         cj * sh, sj * ss + cc, sj * cs - sc, 
00308                         -sj,      cj * si,      cj * ci);
00309         }
00310 
00312         void setIdentity()
00313         { 
00314 #if (defined(BT_USE_SSE_IN_API)&& defined (BT_USE_SSE)) || defined(BT_USE_NEON)
00315                         m_el[0] = v1000; 
00316                         m_el[1] = v0100;
00317                         m_el[2] = v0010;
00318 #else
00319                 setValue(btScalar(1.0), btScalar(0.0), btScalar(0.0), 
00320                         btScalar(0.0), btScalar(1.0), btScalar(0.0), 
00321                         btScalar(0.0), btScalar(0.0), btScalar(1.0)); 
00322 #endif
00323         }
00324 
00325         static const btMatrix3x3&       getIdentity()
00326         {
00327 #if (defined(BT_USE_SSE_IN_API)&& defined (BT_USE_SSE)) || defined(BT_USE_NEON)
00328         static const btMatrix3x3 
00329         identityMatrix(v1000, v0100, v0010);
00330 #else
00331                 static const btMatrix3x3 
00332         identityMatrix(
00333             btScalar(1.0), btScalar(0.0), btScalar(0.0), 
00334                         btScalar(0.0), btScalar(1.0), btScalar(0.0), 
00335                         btScalar(0.0), btScalar(0.0), btScalar(1.0));
00336 #endif
00337                 return identityMatrix;
00338         }
00339 
00342         void getOpenGLSubMatrix(btScalar *m) const 
00343         {
00344 #if defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
00345         __m128 v0 = m_el[0].mVec128;
00346         __m128 v1 = m_el[1].mVec128;
00347         __m128 v2 = m_el[2].mVec128;    //  x2 y2 z2 w2
00348         __m128 *vm = (__m128 *)m;
00349         __m128 vT;
00350         
00351         v2 = _mm_and_ps(v2, btvFFF0fMask);  //  x2 y2 z2 0
00352         
00353         vT = _mm_unpackhi_ps(v0, v1);   //      z0 z1 * *
00354         v0 = _mm_unpacklo_ps(v0, v1);   //      x0 x1 y0 y1
00355 
00356         v1 = _mm_shuffle_ps(v0, v2, BT_SHUFFLE(2, 3, 1, 3) );   // y0 y1 y2 0
00357         v0 = _mm_shuffle_ps(v0, v2, BT_SHUFFLE(0, 1, 0, 3) );   // x0 x1 x2 0
00358         v2 = btCastdTo128f(_mm_move_sd(btCastfTo128d(v2), btCastfTo128d(vT)));  // z0 z1 z2 0
00359 
00360         vm[0] = v0;
00361         vm[1] = v1;
00362         vm[2] = v2;
00363 #elif defined(BT_USE_NEON)
00364         // note: zeros the w channel. We can preserve it at the cost of two more vtrn instructions.
00365         static const uint32x2_t zMask = (const uint32x2_t) {-1, 0 };
00366         float32x4_t *vm = (float32x4_t *)m;
00367         float32x4x2_t top = vtrnq_f32( m_el[0].mVec128, m_el[1].mVec128 );  // {x0 x1 z0 z1}, {y0 y1 w0 w1}
00368         float32x2x2_t bl = vtrn_f32( vget_low_f32(m_el[2].mVec128), vdup_n_f32(0.0f) );       // {x2  0 }, {y2 0}
00369         float32x4_t v0 = vcombine_f32( vget_low_f32(top.val[0]), bl.val[0] );
00370         float32x4_t v1 = vcombine_f32( vget_low_f32(top.val[1]), bl.val[1] );
00371         float32x2_t q = (float32x2_t) vand_u32( (uint32x2_t) vget_high_f32( m_el[2].mVec128), zMask );
00372         float32x4_t v2 = vcombine_f32( vget_high_f32(top.val[0]), q );       // z0 z1 z2  0
00373 
00374         vm[0] = v0;
00375         vm[1] = v1;
00376         vm[2] = v2;
00377 #else
00378                 m[0]  = btScalar(m_el[0].x()); 
00379                 m[1]  = btScalar(m_el[1].x());
00380                 m[2]  = btScalar(m_el[2].x());
00381                 m[3]  = btScalar(0.0); 
00382                 m[4]  = btScalar(m_el[0].y());
00383                 m[5]  = btScalar(m_el[1].y());
00384                 m[6]  = btScalar(m_el[2].y());
00385                 m[7]  = btScalar(0.0); 
00386                 m[8]  = btScalar(m_el[0].z()); 
00387                 m[9]  = btScalar(m_el[1].z());
00388                 m[10] = btScalar(m_el[2].z());
00389                 m[11] = btScalar(0.0); 
00390 #endif
00391         }
00392 
00395         void getRotation(btQuaternion& q) const
00396         {
00397 #if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))|| defined (BT_USE_NEON)
00398         btScalar trace = m_el[0].x() + m_el[1].y() + m_el[2].z();
00399         btScalar s, x;
00400         
00401         union {
00402             btSimdFloat4 vec;
00403             btScalar f[4];
00404         } temp;
00405         
00406         if (trace > btScalar(0.0)) 
00407         {
00408             x = trace + btScalar(1.0);
00409 
00410             temp.f[0]=m_el[2].y() - m_el[1].z();
00411             temp.f[1]=m_el[0].z() - m_el[2].x();
00412             temp.f[2]=m_el[1].x() - m_el[0].y();
00413             temp.f[3]=x;
00414             //temp.f[3]= s * btScalar(0.5);
00415         } 
00416         else 
00417         {
00418             int i, j, k;
00419             if(m_el[0].x() < m_el[1].y()) 
00420             { 
00421                 if( m_el[1].y() < m_el[2].z() )
00422                     { i = 2; j = 0; k = 1; }
00423                 else
00424                     { i = 1; j = 2; k = 0; }
00425             }
00426             else
00427             {
00428                 if( m_el[0].x() < m_el[2].z())
00429                     { i = 2; j = 0; k = 1; }
00430                 else
00431                     { i = 0; j = 1; k = 2; }
00432             }
00433 
00434             x = m_el[i][i] - m_el[j][j] - m_el[k][k] + btScalar(1.0);
00435 
00436             temp.f[3] = (m_el[k][j] - m_el[j][k]);
00437             temp.f[j] = (m_el[j][i] + m_el[i][j]);
00438             temp.f[k] = (m_el[k][i] + m_el[i][k]);
00439             temp.f[i] = x;
00440             //temp.f[i] = s * btScalar(0.5);
00441         }
00442 
00443         s = btSqrt(x);
00444         q.set128(temp.vec);
00445         s = btScalar(0.5) / s;
00446 
00447         q *= s;
00448 #else    
00449                 btScalar trace = m_el[0].x() + m_el[1].y() + m_el[2].z();
00450 
00451                 btScalar temp[4];
00452 
00453                 if (trace > btScalar(0.0)) 
00454                 {
00455                         btScalar s = btSqrt(trace + btScalar(1.0));
00456                         temp[3]=(s * btScalar(0.5));
00457                         s = btScalar(0.5) / s;
00458 
00459                         temp[0]=((m_el[2].y() - m_el[1].z()) * s);
00460                         temp[1]=((m_el[0].z() - m_el[2].x()) * s);
00461                         temp[2]=((m_el[1].x() - m_el[0].y()) * s);
00462                 } 
00463                 else 
00464                 {
00465                         int i = m_el[0].x() < m_el[1].y() ? 
00466                                 (m_el[1].y() < m_el[2].z() ? 2 : 1) :
00467                                 (m_el[0].x() < m_el[2].z() ? 2 : 0); 
00468                         int j = (i + 1) % 3;  
00469                         int k = (i + 2) % 3;
00470 
00471                         btScalar s = btSqrt(m_el[i][i] - m_el[j][j] - m_el[k][k] + btScalar(1.0));
00472                         temp[i] = s * btScalar(0.5);
00473                         s = btScalar(0.5) / s;
00474 
00475                         temp[3] = (m_el[k][j] - m_el[j][k]) * s;
00476                         temp[j] = (m_el[j][i] + m_el[i][j]) * s;
00477                         temp[k] = (m_el[k][i] + m_el[i][k]) * s;
00478                 }
00479                 q.setValue(temp[0],temp[1],temp[2],temp[3]);
00480 #endif
00481         }
00482 
00487         void getEulerYPR(btScalar& yaw, btScalar& pitch, btScalar& roll) const
00488         {
00489 
00490                 // first use the normal calculus
00491                 yaw = btScalar(btAtan2(m_el[1].x(), m_el[0].x()));
00492                 pitch = btScalar(btAsin(-m_el[2].x()));
00493                 roll = btScalar(btAtan2(m_el[2].y(), m_el[2].z()));
00494 
00495                 // on pitch = +/-HalfPI
00496                 if (btFabs(pitch)==SIMD_HALF_PI)
00497                 {
00498                         if (yaw>0)
00499                                 yaw-=SIMD_PI;
00500                         else
00501                                 yaw+=SIMD_PI;
00502 
00503                         if (roll>0)
00504                                 roll-=SIMD_PI;
00505                         else
00506                                 roll+=SIMD_PI;
00507                 }
00508         };
00509 
00510 
00516         void getEulerZYX(btScalar& yaw, btScalar& pitch, btScalar& roll, unsigned int solution_number = 1) const
00517         {
00518                 struct Euler
00519                 {
00520                         btScalar yaw;
00521                         btScalar pitch;
00522                         btScalar roll;
00523                 };
00524 
00525                 Euler euler_out;
00526                 Euler euler_out2; //second solution
00527                 //get the pointer to the raw data
00528 
00529                 // Check that pitch is not at a singularity
00530                 if (btFabs(m_el[2].x()) >= 1)
00531                 {
00532                         euler_out.yaw = 0;
00533                         euler_out2.yaw = 0;
00534 
00535                         // From difference of angles formula
00536                         btScalar delta = btAtan2(m_el[0].x(),m_el[0].z());
00537                         if (m_el[2].x() > 0)  //gimbal locked up
00538                         {
00539                                 euler_out.pitch = SIMD_PI / btScalar(2.0);
00540                                 euler_out2.pitch = SIMD_PI / btScalar(2.0);
00541                                 euler_out.roll = euler_out.pitch + delta;
00542                                 euler_out2.roll = euler_out.pitch + delta;
00543                         }
00544                         else // gimbal locked down
00545                         {
00546                                 euler_out.pitch = -SIMD_PI / btScalar(2.0);
00547                                 euler_out2.pitch = -SIMD_PI / btScalar(2.0);
00548                                 euler_out.roll = -euler_out.pitch + delta;
00549                                 euler_out2.roll = -euler_out.pitch + delta;
00550                         }
00551                 }
00552                 else
00553                 {
00554                         euler_out.pitch = - btAsin(m_el[2].x());
00555                         euler_out2.pitch = SIMD_PI - euler_out.pitch;
00556 
00557                         euler_out.roll = btAtan2(m_el[2].y()/btCos(euler_out.pitch), 
00558                                 m_el[2].z()/btCos(euler_out.pitch));
00559                         euler_out2.roll = btAtan2(m_el[2].y()/btCos(euler_out2.pitch), 
00560                                 m_el[2].z()/btCos(euler_out2.pitch));
00561 
00562                         euler_out.yaw = btAtan2(m_el[1].x()/btCos(euler_out.pitch), 
00563                                 m_el[0].x()/btCos(euler_out.pitch));
00564                         euler_out2.yaw = btAtan2(m_el[1].x()/btCos(euler_out2.pitch), 
00565                                 m_el[0].x()/btCos(euler_out2.pitch));
00566                 }
00567 
00568                 if (solution_number == 1)
00569                 { 
00570                         yaw = euler_out.yaw; 
00571                         pitch = euler_out.pitch;
00572                         roll = euler_out.roll;
00573                 }
00574                 else
00575                 { 
00576                         yaw = euler_out2.yaw; 
00577                         pitch = euler_out2.pitch;
00578                         roll = euler_out2.roll;
00579                 }
00580         }
00581 
00585         btMatrix3x3 scaled(const btVector3& s) const
00586         {
00587 #if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))|| defined (BT_USE_NEON)
00588                 return btMatrix3x3(m_el[0] * s, m_el[1] * s, m_el[2] * s);
00589 #else           
00590                 return btMatrix3x3(
00591             m_el[0].x() * s.x(), m_el[0].y() * s.y(), m_el[0].z() * s.z(),
00592                         m_el[1].x() * s.x(), m_el[1].y() * s.y(), m_el[1].z() * s.z(),
00593                         m_el[2].x() * s.x(), m_el[2].y() * s.y(), m_el[2].z() * s.z());
00594 #endif
00595         }
00596 
00598         btScalar            determinant() const;
00600         btMatrix3x3 adjoint() const;
00602         btMatrix3x3 absolute() const;
00604         btMatrix3x3 transpose() const;
00606         btMatrix3x3 inverse() const; 
00607 
00608         btMatrix3x3 transposeTimes(const btMatrix3x3& m) const;
00609         btMatrix3x3 timesTranspose(const btMatrix3x3& m) const;
00610 
00611         SIMD_FORCE_INLINE btScalar tdotx(const btVector3& v) const 
00612         {
00613                 return m_el[0].x() * v.x() + m_el[1].x() * v.y() + m_el[2].x() * v.z();
00614         }
00615         SIMD_FORCE_INLINE btScalar tdoty(const btVector3& v) const 
00616         {
00617                 return m_el[0].y() * v.x() + m_el[1].y() * v.y() + m_el[2].y() * v.z();
00618         }
00619         SIMD_FORCE_INLINE btScalar tdotz(const btVector3& v) const 
00620         {
00621                 return m_el[0].z() * v.x() + m_el[1].z() * v.y() + m_el[2].z() * v.z();
00622         }
00623 
00624 
00634         void diagonalize(btMatrix3x3& rot, btScalar threshold, int maxSteps)
00635         {
00636                 rot.setIdentity();
00637                 for (int step = maxSteps; step > 0; step--)
00638                 {
00639                         // find off-diagonal element [p][q] with largest magnitude
00640                         int p = 0;
00641                         int q = 1;
00642                         int r = 2;
00643                         btScalar max = btFabs(m_el[0][1]);
00644                         btScalar v = btFabs(m_el[0][2]);
00645                         if (v > max)
00646                         {
00647                                 q = 2;
00648                                 r = 1;
00649                                 max = v;
00650                         }
00651                         v = btFabs(m_el[1][2]);
00652                         if (v > max)
00653                         {
00654                                 p = 1;
00655                                 q = 2;
00656                                 r = 0;
00657                                 max = v;
00658                         }
00659 
00660                         btScalar t = threshold * (btFabs(m_el[0][0]) + btFabs(m_el[1][1]) + btFabs(m_el[2][2]));
00661                         if (max <= t)
00662                         {
00663                                 if (max <= SIMD_EPSILON * t)
00664                                 {
00665                                         return;
00666                                 }
00667                                 step = 1;
00668                         }
00669 
00670                         // compute Jacobi rotation J which leads to a zero for element [p][q] 
00671                         btScalar mpq = m_el[p][q];
00672                         btScalar theta = (m_el[q][q] - m_el[p][p]) / (2 * mpq);
00673                         btScalar theta2 = theta * theta;
00674                         btScalar cos;
00675                         btScalar sin;
00676                         if (theta2 * theta2 < btScalar(10 / SIMD_EPSILON))
00677                         {
00678                                 t = (theta >= 0) ? 1 / (theta + btSqrt(1 + theta2))
00679                                         : 1 / (theta - btSqrt(1 + theta2));
00680                                 cos = 1 / btSqrt(1 + t * t);
00681                                 sin = cos * t;
00682                         }
00683                         else
00684                         {
00685                                 // approximation for large theta-value, i.e., a nearly diagonal matrix
00686                                 t = 1 / (theta * (2 + btScalar(0.5) / theta2));
00687                                 cos = 1 - btScalar(0.5) * t * t;
00688                                 sin = cos * t;
00689                         }
00690 
00691                         // apply rotation to matrix (this = J^T * this * J)
00692                         m_el[p][q] = m_el[q][p] = 0;
00693                         m_el[p][p] -= t * mpq;
00694                         m_el[q][q] += t * mpq;
00695                         btScalar mrp = m_el[r][p];
00696                         btScalar mrq = m_el[r][q];
00697                         m_el[r][p] = m_el[p][r] = cos * mrp - sin * mrq;
00698                         m_el[r][q] = m_el[q][r] = cos * mrq + sin * mrp;
00699 
00700                         // apply rotation to rot (rot = rot * J)
00701                         for (int i = 0; i < 3; i++)
00702                         {
00703                                 btVector3& row = rot[i];
00704                                 mrp = row[p];
00705                                 mrq = row[q];
00706                                 row[p] = cos * mrp - sin * mrq;
00707                                 row[q] = cos * mrq + sin * mrp;
00708                         }
00709                 }
00710         }
00711 
00712 
00713 
00714 
00722         btScalar cofac(int r1, int c1, int r2, int c2) const 
00723         {
00724                 return m_el[r1][c1] * m_el[r2][c2] - m_el[r1][c2] * m_el[r2][c1];
00725         }
00726 
00727         void    serialize(struct        btMatrix3x3Data& dataOut) const;
00728 
00729         void    serializeFloat(struct   btMatrix3x3FloatData& dataOut) const;
00730 
00731         void    deSerialize(const struct        btMatrix3x3Data& dataIn);
00732 
00733         void    deSerializeFloat(const struct   btMatrix3x3FloatData& dataIn);
00734 
00735         void    deSerializeDouble(const struct  btMatrix3x3DoubleData& dataIn);
00736 
00737 };
00738 
00739 
00740 SIMD_FORCE_INLINE btMatrix3x3& 
00741 btMatrix3x3::operator*=(const btMatrix3x3& m)
00742 {
00743 #if defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
00744     __m128 rv00, rv01, rv02;
00745     __m128 rv10, rv11, rv12;
00746     __m128 rv20, rv21, rv22;
00747     __m128 mv0, mv1, mv2;
00748 
00749     rv02 = m_el[0].mVec128;
00750     rv12 = m_el[1].mVec128;
00751     rv22 = m_el[2].mVec128;
00752 
00753     mv0 = _mm_and_ps(m[0].mVec128, btvFFF0fMask); 
00754     mv1 = _mm_and_ps(m[1].mVec128, btvFFF0fMask); 
00755     mv2 = _mm_and_ps(m[2].mVec128, btvFFF0fMask); 
00756     
00757     // rv0
00758     rv00 = bt_splat_ps(rv02, 0);
00759     rv01 = bt_splat_ps(rv02, 1);
00760     rv02 = bt_splat_ps(rv02, 2);
00761     
00762     rv00 = _mm_mul_ps(rv00, mv0);
00763     rv01 = _mm_mul_ps(rv01, mv1);
00764     rv02 = _mm_mul_ps(rv02, mv2);
00765     
00766     // rv1
00767     rv10 = bt_splat_ps(rv12, 0);
00768     rv11 = bt_splat_ps(rv12, 1);
00769     rv12 = bt_splat_ps(rv12, 2);
00770     
00771     rv10 = _mm_mul_ps(rv10, mv0);
00772     rv11 = _mm_mul_ps(rv11, mv1);
00773     rv12 = _mm_mul_ps(rv12, mv2);
00774     
00775     // rv2
00776     rv20 = bt_splat_ps(rv22, 0);
00777     rv21 = bt_splat_ps(rv22, 1);
00778     rv22 = bt_splat_ps(rv22, 2);
00779     
00780     rv20 = _mm_mul_ps(rv20, mv0);
00781     rv21 = _mm_mul_ps(rv21, mv1);
00782     rv22 = _mm_mul_ps(rv22, mv2);
00783 
00784     rv00 = _mm_add_ps(rv00, rv01);
00785     rv10 = _mm_add_ps(rv10, rv11);
00786     rv20 = _mm_add_ps(rv20, rv21);
00787 
00788     m_el[0].mVec128 = _mm_add_ps(rv00, rv02);
00789     m_el[1].mVec128 = _mm_add_ps(rv10, rv12);
00790     m_el[2].mVec128 = _mm_add_ps(rv20, rv22);
00791 
00792 #elif defined(BT_USE_NEON)
00793 
00794     float32x4_t rv0, rv1, rv2;
00795     float32x4_t v0, v1, v2;
00796     float32x4_t mv0, mv1, mv2;
00797 
00798     v0 = m_el[0].mVec128;
00799     v1 = m_el[1].mVec128;
00800     v2 = m_el[2].mVec128;
00801 
00802     mv0 = (float32x4_t) vandq_s32((int32x4_t)m[0].mVec128, btvFFF0Mask); 
00803     mv1 = (float32x4_t) vandq_s32((int32x4_t)m[1].mVec128, btvFFF0Mask); 
00804     mv2 = (float32x4_t) vandq_s32((int32x4_t)m[2].mVec128, btvFFF0Mask); 
00805     
00806     rv0 = vmulq_lane_f32(mv0, vget_low_f32(v0), 0);
00807     rv1 = vmulq_lane_f32(mv0, vget_low_f32(v1), 0);
00808     rv2 = vmulq_lane_f32(mv0, vget_low_f32(v2), 0);
00809     
00810     rv0 = vmlaq_lane_f32(rv0, mv1, vget_low_f32(v0), 1);
00811     rv1 = vmlaq_lane_f32(rv1, mv1, vget_low_f32(v1), 1);
00812     rv2 = vmlaq_lane_f32(rv2, mv1, vget_low_f32(v2), 1);
00813     
00814     rv0 = vmlaq_lane_f32(rv0, mv2, vget_high_f32(v0), 0);
00815     rv1 = vmlaq_lane_f32(rv1, mv2, vget_high_f32(v1), 0);
00816     rv2 = vmlaq_lane_f32(rv2, mv2, vget_high_f32(v2), 0);
00817 
00818     m_el[0].mVec128 = rv0;
00819     m_el[1].mVec128 = rv1;
00820     m_el[2].mVec128 = rv2;
00821 #else    
00822         setValue(
00823         m.tdotx(m_el[0]), m.tdoty(m_el[0]), m.tdotz(m_el[0]),
00824                 m.tdotx(m_el[1]), m.tdoty(m_el[1]), m.tdotz(m_el[1]),
00825                 m.tdotx(m_el[2]), m.tdoty(m_el[2]), m.tdotz(m_el[2]));
00826 #endif
00827         return *this;
00828 }
00829 
00830 SIMD_FORCE_INLINE btMatrix3x3& 
00831 btMatrix3x3::operator+=(const btMatrix3x3& m)
00832 {
00833 #if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))|| defined (BT_USE_NEON)
00834     m_el[0].mVec128 = m_el[0].mVec128 + m.m_el[0].mVec128;
00835     m_el[1].mVec128 = m_el[1].mVec128 + m.m_el[1].mVec128;
00836     m_el[2].mVec128 = m_el[2].mVec128 + m.m_el[2].mVec128;
00837 #else
00838         setValue(
00839                 m_el[0][0]+m.m_el[0][0], 
00840                 m_el[0][1]+m.m_el[0][1],
00841                 m_el[0][2]+m.m_el[0][2],
00842                 m_el[1][0]+m.m_el[1][0], 
00843                 m_el[1][1]+m.m_el[1][1],
00844                 m_el[1][2]+m.m_el[1][2],
00845                 m_el[2][0]+m.m_el[2][0], 
00846                 m_el[2][1]+m.m_el[2][1],
00847                 m_el[2][2]+m.m_el[2][2]);
00848 #endif
00849         return *this;
00850 }
00851 
00852 SIMD_FORCE_INLINE btMatrix3x3
00853 operator*(const btMatrix3x3& m, const btScalar & k)
00854 {
00855 #if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))
00856     __m128 vk = bt_splat_ps(_mm_load_ss((float *)&k), 0x80);
00857     return btMatrix3x3(
00858                 _mm_mul_ps(m[0].mVec128, vk), 
00859                 _mm_mul_ps(m[1].mVec128, vk), 
00860                 _mm_mul_ps(m[2].mVec128, vk)); 
00861 #elif defined(BT_USE_NEON)
00862     return btMatrix3x3(
00863                 vmulq_n_f32(m[0].mVec128, k),
00864                 vmulq_n_f32(m[1].mVec128, k),
00865                 vmulq_n_f32(m[2].mVec128, k)); 
00866 #else
00867         return btMatrix3x3(
00868                 m[0].x()*k,m[0].y()*k,m[0].z()*k,
00869                 m[1].x()*k,m[1].y()*k,m[1].z()*k,
00870                 m[2].x()*k,m[2].y()*k,m[2].z()*k);
00871 #endif
00872 }
00873 
00874 SIMD_FORCE_INLINE btMatrix3x3 
00875 operator+(const btMatrix3x3& m1, const btMatrix3x3& m2)
00876 {
00877 #if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))|| defined (BT_USE_NEON)
00878         return btMatrix3x3(
00879         m1[0].mVec128 + m2[0].mVec128,
00880         m1[1].mVec128 + m2[1].mVec128,
00881         m1[2].mVec128 + m2[2].mVec128);
00882 #else
00883         return btMatrix3x3(
00884         m1[0][0]+m2[0][0], 
00885         m1[0][1]+m2[0][1],
00886         m1[0][2]+m2[0][2],
00887         
00888         m1[1][0]+m2[1][0], 
00889         m1[1][1]+m2[1][1],
00890         m1[1][2]+m2[1][2],
00891         
00892         m1[2][0]+m2[2][0], 
00893         m1[2][1]+m2[2][1],
00894         m1[2][2]+m2[2][2]);
00895 #endif    
00896 }
00897 
00898 SIMD_FORCE_INLINE btMatrix3x3 
00899 operator-(const btMatrix3x3& m1, const btMatrix3x3& m2)
00900 {
00901 #if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))|| defined (BT_USE_NEON)
00902         return btMatrix3x3(
00903         m1[0].mVec128 - m2[0].mVec128,
00904         m1[1].mVec128 - m2[1].mVec128,
00905         m1[2].mVec128 - m2[2].mVec128);
00906 #else
00907         return btMatrix3x3(
00908         m1[0][0]-m2[0][0], 
00909         m1[0][1]-m2[0][1],
00910         m1[0][2]-m2[0][2],
00911         
00912         m1[1][0]-m2[1][0], 
00913         m1[1][1]-m2[1][1],
00914         m1[1][2]-m2[1][2],
00915         
00916         m1[2][0]-m2[2][0], 
00917         m1[2][1]-m2[2][1],
00918         m1[2][2]-m2[2][2]);
00919 #endif
00920 }
00921 
00922 
00923 SIMD_FORCE_INLINE btMatrix3x3& 
00924 btMatrix3x3::operator-=(const btMatrix3x3& m)
00925 {
00926 #if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))|| defined (BT_USE_NEON)
00927     m_el[0].mVec128 = m_el[0].mVec128 - m.m_el[0].mVec128;
00928     m_el[1].mVec128 = m_el[1].mVec128 - m.m_el[1].mVec128;
00929     m_el[2].mVec128 = m_el[2].mVec128 - m.m_el[2].mVec128;
00930 #else
00931         setValue(
00932         m_el[0][0]-m.m_el[0][0], 
00933         m_el[0][1]-m.m_el[0][1],
00934         m_el[0][2]-m.m_el[0][2],
00935         m_el[1][0]-m.m_el[1][0], 
00936         m_el[1][1]-m.m_el[1][1],
00937         m_el[1][2]-m.m_el[1][2],
00938         m_el[2][0]-m.m_el[2][0], 
00939         m_el[2][1]-m.m_el[2][1],
00940         m_el[2][2]-m.m_el[2][2]);
00941 #endif
00942         return *this;
00943 }
00944 
00945 
00946 SIMD_FORCE_INLINE btScalar 
00947 btMatrix3x3::determinant() const
00948 { 
00949         return btTriple((*this)[0], (*this)[1], (*this)[2]);
00950 }
00951 
00952 
00953 SIMD_FORCE_INLINE btMatrix3x3 
00954 btMatrix3x3::absolute() const
00955 {
00956 #if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))
00957     return btMatrix3x3(
00958             _mm_and_ps(m_el[0].mVec128, btvAbsfMask),
00959             _mm_and_ps(m_el[1].mVec128, btvAbsfMask),
00960             _mm_and_ps(m_el[2].mVec128, btvAbsfMask));
00961 #elif defined(BT_USE_NEON)
00962     return btMatrix3x3(
00963             (float32x4_t)vandq_s32((int32x4_t)m_el[0].mVec128, btv3AbsMask),
00964             (float32x4_t)vandq_s32((int32x4_t)m_el[1].mVec128, btv3AbsMask),
00965             (float32x4_t)vandq_s32((int32x4_t)m_el[2].mVec128, btv3AbsMask));
00966 #else   
00967         return btMatrix3x3(
00968             btFabs(m_el[0].x()), btFabs(m_el[0].y()), btFabs(m_el[0].z()),
00969             btFabs(m_el[1].x()), btFabs(m_el[1].y()), btFabs(m_el[1].z()),
00970             btFabs(m_el[2].x()), btFabs(m_el[2].y()), btFabs(m_el[2].z()));
00971 #endif
00972 }
00973 
00974 SIMD_FORCE_INLINE btMatrix3x3 
00975 btMatrix3x3::transpose() const 
00976 {
00977 #if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))
00978     __m128 v0 = m_el[0].mVec128;
00979     __m128 v1 = m_el[1].mVec128;
00980     __m128 v2 = m_el[2].mVec128;    //  x2 y2 z2 w2
00981     __m128 vT;
00982     
00983     v2 = _mm_and_ps(v2, btvFFF0fMask);  //  x2 y2 z2 0
00984     
00985     vT = _mm_unpackhi_ps(v0, v1);       //      z0 z1 * *
00986     v0 = _mm_unpacklo_ps(v0, v1);       //      x0 x1 y0 y1
00987 
00988     v1 = _mm_shuffle_ps(v0, v2, BT_SHUFFLE(2, 3, 1, 3) );       // y0 y1 y2 0
00989     v0 = _mm_shuffle_ps(v0, v2, BT_SHUFFLE(0, 1, 0, 3) );       // x0 x1 x2 0
00990     v2 = btCastdTo128f(_mm_move_sd(btCastfTo128d(v2), btCastfTo128d(vT)));      // z0 z1 z2 0
00991 
00992 
00993     return btMatrix3x3( v0, v1, v2 );
00994 #elif defined(BT_USE_NEON)
00995     // note: zeros the w channel. We can preserve it at the cost of two more vtrn instructions.
00996     static const uint32x2_t zMask = (const uint32x2_t) {-1, 0 };
00997     float32x4x2_t top = vtrnq_f32( m_el[0].mVec128, m_el[1].mVec128 );  // {x0 x1 z0 z1}, {y0 y1 w0 w1}
00998     float32x2x2_t bl = vtrn_f32( vget_low_f32(m_el[2].mVec128), vdup_n_f32(0.0f) );       // {x2  0 }, {y2 0}
00999     float32x4_t v0 = vcombine_f32( vget_low_f32(top.val[0]), bl.val[0] );
01000     float32x4_t v1 = vcombine_f32( vget_low_f32(top.val[1]), bl.val[1] );
01001     float32x2_t q = (float32x2_t) vand_u32( (uint32x2_t) vget_high_f32( m_el[2].mVec128), zMask );
01002     float32x4_t v2 = vcombine_f32( vget_high_f32(top.val[0]), q );       // z0 z1 z2  0
01003     return btMatrix3x3( v0, v1, v2 ); 
01004 #else
01005         return btMatrix3x3( m_el[0].x(), m_el[1].x(), m_el[2].x(),
01006                         m_el[0].y(), m_el[1].y(), m_el[2].y(),
01007                         m_el[0].z(), m_el[1].z(), m_el[2].z());
01008 #endif
01009 }
01010 
01011 SIMD_FORCE_INLINE btMatrix3x3 
01012 btMatrix3x3::adjoint() const 
01013 {
01014         return btMatrix3x3(cofac(1, 1, 2, 2), cofac(0, 2, 2, 1), cofac(0, 1, 1, 2),
01015                 cofac(1, 2, 2, 0), cofac(0, 0, 2, 2), cofac(0, 2, 1, 0),
01016                 cofac(1, 0, 2, 1), cofac(0, 1, 2, 0), cofac(0, 0, 1, 1));
01017 }
01018 
01019 SIMD_FORCE_INLINE btMatrix3x3 
01020 btMatrix3x3::inverse() const
01021 {
01022         btVector3 co(cofac(1, 1, 2, 2), cofac(1, 2, 2, 0), cofac(1, 0, 2, 1));
01023         btScalar det = (*this)[0].dot(co);
01024         btFullAssert(det != btScalar(0.0));
01025         btScalar s = btScalar(1.0) / det;
01026         return btMatrix3x3(co.x() * s, cofac(0, 2, 2, 1) * s, cofac(0, 1, 1, 2) * s,
01027                 co.y() * s, cofac(0, 0, 2, 2) * s, cofac(0, 2, 1, 0) * s,
01028                 co.z() * s, cofac(0, 1, 2, 0) * s, cofac(0, 0, 1, 1) * s);
01029 }
01030 
01031 SIMD_FORCE_INLINE btMatrix3x3 
01032 btMatrix3x3::transposeTimes(const btMatrix3x3& m) const
01033 {
01034 #if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))
01035     // zeros w
01036 //    static const __m128i xyzMask = (const __m128i){ -1ULL, 0xffffffffULL };
01037     __m128 row = m_el[0].mVec128;
01038     __m128 m0 = _mm_and_ps( m.getRow(0).mVec128, btvFFF0fMask );
01039     __m128 m1 = _mm_and_ps( m.getRow(1).mVec128, btvFFF0fMask);
01040     __m128 m2 = _mm_and_ps( m.getRow(2).mVec128, btvFFF0fMask );
01041     __m128 r0 = _mm_mul_ps(m0, _mm_shuffle_ps(row, row, 0));
01042     __m128 r1 = _mm_mul_ps(m0, _mm_shuffle_ps(row, row, 0x55));
01043     __m128 r2 = _mm_mul_ps(m0, _mm_shuffle_ps(row, row, 0xaa));
01044     row = m_el[1].mVec128;
01045     r0 = _mm_add_ps( r0, _mm_mul_ps(m1, _mm_shuffle_ps(row, row, 0)));
01046     r1 = _mm_add_ps( r1, _mm_mul_ps(m1, _mm_shuffle_ps(row, row, 0x55)));
01047     r2 = _mm_add_ps( r2, _mm_mul_ps(m1, _mm_shuffle_ps(row, row, 0xaa)));
01048     row = m_el[2].mVec128;
01049     r0 = _mm_add_ps( r0, _mm_mul_ps(m2, _mm_shuffle_ps(row, row, 0)));
01050     r1 = _mm_add_ps( r1, _mm_mul_ps(m2, _mm_shuffle_ps(row, row, 0x55)));
01051     r2 = _mm_add_ps( r2, _mm_mul_ps(m2, _mm_shuffle_ps(row, row, 0xaa)));
01052     return btMatrix3x3( r0, r1, r2 );
01053 
01054 #elif defined BT_USE_NEON
01055     // zeros w
01056     static const uint32x4_t xyzMask = (const uint32x4_t){ -1, -1, -1, 0 };
01057     float32x4_t m0 = (float32x4_t) vandq_u32( (uint32x4_t) m.getRow(0).mVec128, xyzMask );
01058     float32x4_t m1 = (float32x4_t) vandq_u32( (uint32x4_t) m.getRow(1).mVec128, xyzMask );
01059     float32x4_t m2 = (float32x4_t) vandq_u32( (uint32x4_t) m.getRow(2).mVec128, xyzMask );
01060     float32x4_t row = m_el[0].mVec128;
01061     float32x4_t r0 = vmulq_lane_f32( m0, vget_low_f32(row), 0);
01062     float32x4_t r1 = vmulq_lane_f32( m0, vget_low_f32(row), 1);
01063     float32x4_t r2 = vmulq_lane_f32( m0, vget_high_f32(row), 0);
01064     row = m_el[1].mVec128;
01065     r0 = vmlaq_lane_f32( r0, m1, vget_low_f32(row), 0);
01066     r1 = vmlaq_lane_f32( r1, m1, vget_low_f32(row), 1);
01067     r2 = vmlaq_lane_f32( r2, m1, vget_high_f32(row), 0);
01068     row = m_el[2].mVec128;
01069     r0 = vmlaq_lane_f32( r0, m2, vget_low_f32(row), 0);
01070     r1 = vmlaq_lane_f32( r1, m2, vget_low_f32(row), 1);
01071     r2 = vmlaq_lane_f32( r2, m2, vget_high_f32(row), 0);
01072     return btMatrix3x3( r0, r1, r2 );
01073 #else
01074     return btMatrix3x3(
01075                 m_el[0].x() * m[0].x() + m_el[1].x() * m[1].x() + m_el[2].x() * m[2].x(),
01076                 m_el[0].x() * m[0].y() + m_el[1].x() * m[1].y() + m_el[2].x() * m[2].y(),
01077                 m_el[0].x() * m[0].z() + m_el[1].x() * m[1].z() + m_el[2].x() * m[2].z(),
01078                 m_el[0].y() * m[0].x() + m_el[1].y() * m[1].x() + m_el[2].y() * m[2].x(),
01079                 m_el[0].y() * m[0].y() + m_el[1].y() * m[1].y() + m_el[2].y() * m[2].y(),
01080                 m_el[0].y() * m[0].z() + m_el[1].y() * m[1].z() + m_el[2].y() * m[2].z(),
01081                 m_el[0].z() * m[0].x() + m_el[1].z() * m[1].x() + m_el[2].z() * m[2].x(),
01082                 m_el[0].z() * m[0].y() + m_el[1].z() * m[1].y() + m_el[2].z() * m[2].y(),
01083                 m_el[0].z() * m[0].z() + m_el[1].z() * m[1].z() + m_el[2].z() * m[2].z());
01084 #endif
01085 }
01086 
01087 SIMD_FORCE_INLINE btMatrix3x3 
01088 btMatrix3x3::timesTranspose(const btMatrix3x3& m) const
01089 {
01090 #if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))
01091     __m128 a0 = m_el[0].mVec128;
01092     __m128 a1 = m_el[1].mVec128;
01093     __m128 a2 = m_el[2].mVec128;
01094     
01095     btMatrix3x3 mT = m.transpose(); // we rely on transpose() zeroing w channel so that we don't have to do it here
01096     __m128 mx = mT[0].mVec128;
01097     __m128 my = mT[1].mVec128;
01098     __m128 mz = mT[2].mVec128;
01099     
01100     __m128 r0 = _mm_mul_ps(mx, _mm_shuffle_ps(a0, a0, 0x00));
01101     __m128 r1 = _mm_mul_ps(mx, _mm_shuffle_ps(a1, a1, 0x00));
01102     __m128 r2 = _mm_mul_ps(mx, _mm_shuffle_ps(a2, a2, 0x00));
01103     r0 = _mm_add_ps(r0, _mm_mul_ps(my, _mm_shuffle_ps(a0, a0, 0x55)));
01104     r1 = _mm_add_ps(r1, _mm_mul_ps(my, _mm_shuffle_ps(a1, a1, 0x55)));
01105     r2 = _mm_add_ps(r2, _mm_mul_ps(my, _mm_shuffle_ps(a2, a2, 0x55)));
01106     r0 = _mm_add_ps(r0, _mm_mul_ps(mz, _mm_shuffle_ps(a0, a0, 0xaa)));
01107     r1 = _mm_add_ps(r1, _mm_mul_ps(mz, _mm_shuffle_ps(a1, a1, 0xaa)));
01108     r2 = _mm_add_ps(r2, _mm_mul_ps(mz, _mm_shuffle_ps(a2, a2, 0xaa)));
01109     return btMatrix3x3( r0, r1, r2);
01110             
01111 #elif defined BT_USE_NEON
01112     float32x4_t a0 = m_el[0].mVec128;
01113     float32x4_t a1 = m_el[1].mVec128;
01114     float32x4_t a2 = m_el[2].mVec128;
01115     
01116     btMatrix3x3 mT = m.transpose(); // we rely on transpose() zeroing w channel so that we don't have to do it here
01117     float32x4_t mx = mT[0].mVec128;
01118     float32x4_t my = mT[1].mVec128;
01119     float32x4_t mz = mT[2].mVec128;
01120     
01121     float32x4_t r0 = vmulq_lane_f32( mx, vget_low_f32(a0), 0);
01122     float32x4_t r1 = vmulq_lane_f32( mx, vget_low_f32(a1), 0);
01123     float32x4_t r2 = vmulq_lane_f32( mx, vget_low_f32(a2), 0);
01124     r0 = vmlaq_lane_f32( r0, my, vget_low_f32(a0), 1);
01125     r1 = vmlaq_lane_f32( r1, my, vget_low_f32(a1), 1);
01126     r2 = vmlaq_lane_f32( r2, my, vget_low_f32(a2), 1);
01127     r0 = vmlaq_lane_f32( r0, mz, vget_high_f32(a0), 0);
01128     r1 = vmlaq_lane_f32( r1, mz, vget_high_f32(a1), 0);
01129     r2 = vmlaq_lane_f32( r2, mz, vget_high_f32(a2), 0);
01130     return btMatrix3x3( r0, r1, r2 );
01131     
01132 #else
01133         return btMatrix3x3(
01134                 m_el[0].dot(m[0]), m_el[0].dot(m[1]), m_el[0].dot(m[2]),
01135                 m_el[1].dot(m[0]), m_el[1].dot(m[1]), m_el[1].dot(m[2]),
01136                 m_el[2].dot(m[0]), m_el[2].dot(m[1]), m_el[2].dot(m[2]));
01137 #endif
01138 }
01139 
01140 SIMD_FORCE_INLINE btVector3 
01141 operator*(const btMatrix3x3& m, const btVector3& v) 
01142 {
01143 #if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))|| defined (BT_USE_NEON)
01144     return v.dot3(m[0], m[1], m[2]);
01145 #else
01146         return btVector3(m[0].dot(v), m[1].dot(v), m[2].dot(v));
01147 #endif
01148 }
01149 
01150 
01151 SIMD_FORCE_INLINE btVector3
01152 operator*(const btVector3& v, const btMatrix3x3& m)
01153 {
01154 #if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))
01155 
01156     const __m128 vv = v.mVec128;
01157 
01158     __m128 c0 = bt_splat_ps( vv, 0);
01159     __m128 c1 = bt_splat_ps( vv, 1);
01160     __m128 c2 = bt_splat_ps( vv, 2);
01161 
01162     c0 = _mm_mul_ps(c0, _mm_and_ps(m[0].mVec128, btvFFF0fMask) );
01163     c1 = _mm_mul_ps(c1, _mm_and_ps(m[1].mVec128, btvFFF0fMask) );
01164     c0 = _mm_add_ps(c0, c1);
01165     c2 = _mm_mul_ps(c2, _mm_and_ps(m[2].mVec128, btvFFF0fMask) );
01166     
01167     return btVector3(_mm_add_ps(c0, c2));
01168 #elif defined(BT_USE_NEON)
01169     const float32x4_t vv = v.mVec128;
01170     const float32x2_t vlo = vget_low_f32(vv);
01171     const float32x2_t vhi = vget_high_f32(vv);
01172 
01173     float32x4_t c0, c1, c2;
01174 
01175     c0 = (float32x4_t) vandq_s32((int32x4_t)m[0].mVec128, btvFFF0Mask);
01176     c1 = (float32x4_t) vandq_s32((int32x4_t)m[1].mVec128, btvFFF0Mask);
01177     c2 = (float32x4_t) vandq_s32((int32x4_t)m[2].mVec128, btvFFF0Mask);
01178 
01179     c0 = vmulq_lane_f32(c0, vlo, 0);
01180     c1 = vmulq_lane_f32(c1, vlo, 1);
01181     c2 = vmulq_lane_f32(c2, vhi, 0);
01182     c0 = vaddq_f32(c0, c1);
01183     c0 = vaddq_f32(c0, c2);
01184     
01185     return btVector3(c0);
01186 #else
01187         return btVector3(m.tdotx(v), m.tdoty(v), m.tdotz(v));
01188 #endif
01189 }
01190 
01191 SIMD_FORCE_INLINE btMatrix3x3 
01192 operator*(const btMatrix3x3& m1, const btMatrix3x3& m2)
01193 {
01194 #if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))
01195 
01196     __m128 m10 = m1[0].mVec128;  
01197     __m128 m11 = m1[1].mVec128;
01198     __m128 m12 = m1[2].mVec128;
01199     
01200     __m128 m2v = _mm_and_ps(m2[0].mVec128, btvFFF0fMask);
01201     
01202     __m128 c0 = bt_splat_ps( m10, 0);
01203     __m128 c1 = bt_splat_ps( m11, 0);
01204     __m128 c2 = bt_splat_ps( m12, 0);
01205     
01206     c0 = _mm_mul_ps(c0, m2v);
01207     c1 = _mm_mul_ps(c1, m2v);
01208     c2 = _mm_mul_ps(c2, m2v);
01209     
01210     m2v = _mm_and_ps(m2[1].mVec128, btvFFF0fMask);
01211     
01212     __m128 c0_1 = bt_splat_ps( m10, 1);
01213     __m128 c1_1 = bt_splat_ps( m11, 1);
01214     __m128 c2_1 = bt_splat_ps( m12, 1);
01215     
01216     c0_1 = _mm_mul_ps(c0_1, m2v);
01217     c1_1 = _mm_mul_ps(c1_1, m2v);
01218     c2_1 = _mm_mul_ps(c2_1, m2v);
01219     
01220     m2v = _mm_and_ps(m2[2].mVec128, btvFFF0fMask);
01221     
01222     c0 = _mm_add_ps(c0, c0_1);
01223     c1 = _mm_add_ps(c1, c1_1);
01224     c2 = _mm_add_ps(c2, c2_1);
01225     
01226     m10 = bt_splat_ps( m10, 2);
01227     m11 = bt_splat_ps( m11, 2);
01228     m12 = bt_splat_ps( m12, 2);
01229     
01230     m10 = _mm_mul_ps(m10, m2v);
01231     m11 = _mm_mul_ps(m11, m2v);
01232     m12 = _mm_mul_ps(m12, m2v);
01233     
01234     c0 = _mm_add_ps(c0, m10);
01235     c1 = _mm_add_ps(c1, m11);
01236     c2 = _mm_add_ps(c2, m12);
01237     
01238     return btMatrix3x3(c0, c1, c2);
01239 
01240 #elif defined(BT_USE_NEON)
01241 
01242     float32x4_t rv0, rv1, rv2;
01243     float32x4_t v0, v1, v2;
01244     float32x4_t mv0, mv1, mv2;
01245 
01246     v0 = m1[0].mVec128;
01247     v1 = m1[1].mVec128;
01248     v2 = m1[2].mVec128;
01249 
01250     mv0 = (float32x4_t) vandq_s32((int32x4_t)m2[0].mVec128, btvFFF0Mask); 
01251     mv1 = (float32x4_t) vandq_s32((int32x4_t)m2[1].mVec128, btvFFF0Mask); 
01252     mv2 = (float32x4_t) vandq_s32((int32x4_t)m2[2].mVec128, btvFFF0Mask); 
01253     
01254     rv0 = vmulq_lane_f32(mv0, vget_low_f32(v0), 0);
01255     rv1 = vmulq_lane_f32(mv0, vget_low_f32(v1), 0);
01256     rv2 = vmulq_lane_f32(mv0, vget_low_f32(v2), 0);
01257     
01258     rv0 = vmlaq_lane_f32(rv0, mv1, vget_low_f32(v0), 1);
01259     rv1 = vmlaq_lane_f32(rv1, mv1, vget_low_f32(v1), 1);
01260     rv2 = vmlaq_lane_f32(rv2, mv1, vget_low_f32(v2), 1);
01261     
01262     rv0 = vmlaq_lane_f32(rv0, mv2, vget_high_f32(v0), 0);
01263     rv1 = vmlaq_lane_f32(rv1, mv2, vget_high_f32(v1), 0);
01264     rv2 = vmlaq_lane_f32(rv2, mv2, vget_high_f32(v2), 0);
01265 
01266         return btMatrix3x3(rv0, rv1, rv2);
01267         
01268 #else   
01269         return btMatrix3x3(
01270                 m2.tdotx( m1[0]), m2.tdoty( m1[0]), m2.tdotz( m1[0]),
01271                 m2.tdotx( m1[1]), m2.tdoty( m1[1]), m2.tdotz( m1[1]),
01272                 m2.tdotx( m1[2]), m2.tdoty( m1[2]), m2.tdotz( m1[2]));
01273 #endif
01274 }
01275 
01276 /*
01277 SIMD_FORCE_INLINE btMatrix3x3 btMultTransposeLeft(const btMatrix3x3& m1, const btMatrix3x3& m2) {
01278 return btMatrix3x3(
01279 m1[0][0] * m2[0][0] + m1[1][0] * m2[1][0] + m1[2][0] * m2[2][0],
01280 m1[0][0] * m2[0][1] + m1[1][0] * m2[1][1] + m1[2][0] * m2[2][1],
01281 m1[0][0] * m2[0][2] + m1[1][0] * m2[1][2] + m1[2][0] * m2[2][2],
01282 m1[0][1] * m2[0][0] + m1[1][1] * m2[1][0] + m1[2][1] * m2[2][0],
01283 m1[0][1] * m2[0][1] + m1[1][1] * m2[1][1] + m1[2][1] * m2[2][1],
01284 m1[0][1] * m2[0][2] + m1[1][1] * m2[1][2] + m1[2][1] * m2[2][2],
01285 m1[0][2] * m2[0][0] + m1[1][2] * m2[1][0] + m1[2][2] * m2[2][0],
01286 m1[0][2] * m2[0][1] + m1[1][2] * m2[1][1] + m1[2][2] * m2[2][1],
01287 m1[0][2] * m2[0][2] + m1[1][2] * m2[1][2] + m1[2][2] * m2[2][2]);
01288 }
01289 */
01290 
01293 SIMD_FORCE_INLINE bool operator==(const btMatrix3x3& m1, const btMatrix3x3& m2)
01294 {
01295 #if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))
01296 
01297     __m128 c0, c1, c2;
01298 
01299     c0 = _mm_cmpeq_ps(m1[0].mVec128, m2[0].mVec128);
01300     c1 = _mm_cmpeq_ps(m1[1].mVec128, m2[1].mVec128);
01301     c2 = _mm_cmpeq_ps(m1[2].mVec128, m2[2].mVec128);
01302     
01303     c0 = _mm_and_ps(c0, c1);
01304     c0 = _mm_and_ps(c0, c2);
01305 
01306     return (0x7 == _mm_movemask_ps((__m128)c0));
01307 #else 
01308         return 
01309     (   m1[0][0] == m2[0][0] && m1[1][0] == m2[1][0] && m1[2][0] == m2[2][0] &&
01310                 m1[0][1] == m2[0][1] && m1[1][1] == m2[1][1] && m1[2][1] == m2[2][1] &&
01311                 m1[0][2] == m2[0][2] && m1[1][2] == m2[1][2] && m1[2][2] == m2[2][2] );
01312 #endif
01313 }
01314 
01316 struct  btMatrix3x3FloatData
01317 {
01318         btVector3FloatData m_el[3];
01319 };
01320 
01322 struct  btMatrix3x3DoubleData
01323 {
01324         btVector3DoubleData m_el[3];
01325 };
01326 
01327 
01328         
01329 
01330 SIMD_FORCE_INLINE       void    btMatrix3x3::serialize(struct   btMatrix3x3Data& dataOut) const
01331 {
01332         for (int i=0;i<3;i++)
01333                 m_el[i].serialize(dataOut.m_el[i]);
01334 }
01335 
01336 SIMD_FORCE_INLINE       void    btMatrix3x3::serializeFloat(struct      btMatrix3x3FloatData& dataOut) const
01337 {
01338         for (int i=0;i<3;i++)
01339                 m_el[i].serializeFloat(dataOut.m_el[i]);
01340 }
01341 
01342 
01343 SIMD_FORCE_INLINE       void    btMatrix3x3::deSerialize(const struct   btMatrix3x3Data& dataIn)
01344 {
01345         for (int i=0;i<3;i++)
01346                 m_el[i].deSerialize(dataIn.m_el[i]);
01347 }
01348 
01349 SIMD_FORCE_INLINE       void    btMatrix3x3::deSerializeFloat(const struct      btMatrix3x3FloatData& dataIn)
01350 {
01351         for (int i=0;i<3;i++)
01352                 m_el[i].deSerializeFloat(dataIn.m_el[i]);
01353 }
01354 
01355 SIMD_FORCE_INLINE       void    btMatrix3x3::deSerializeDouble(const struct     btMatrix3x3DoubleData& dataIn)
01356 {
01357         for (int i=0;i<3;i++)
01358                 m_el[i].deSerializeDouble(dataIn.m_el[i]);
01359 }
01360 
01361 #endif //BT_MATRIX3x3_H
01362